In [3]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [4]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [17]:
import pickle

In [5]:
df_1 = pd.read_excel('data/final_target.xlsx', index_col = 0)
df_2 = pd.read_excel('data/final_norm.xlsx', index_col = 0)
df = pd.concat([df_1, df_2])

In [6]:
df.head(5)

Unnamed: 0,Post,Type
0,I am tired,Suicidal
1,discouraged,Suicidal
2,I mourn,Suicidal
3,tired,Suicidal
4,tired of everything,Suicidal


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7596 entries, 0 to 11032
Data columns (total 2 columns):
Post    7596 non-null object
Type    7596 non-null object
dtypes: object(2)
memory usage: 178.0+ KB


In [8]:
df['Type'] = df['Type'].map({'Suicidal':1, 'Normal': 0})

In [9]:
df = df.sample(frac=1).reset_index(drop=True)

In [10]:
df.tail(5)

Unnamed: 0,Post,Type
7591,to relax from all,0
7592,100500,0
7593,famous archival photographs in color the nerd,0
7594,builtfromsketch,0
7595,very warm feeling for me it's just doroga home,0


In [11]:
import nltk
#nltk.download('punkt')

In [25]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\katal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [35]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\katal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [12]:
# Remove nulls if any
df['Post'].dropna(inplace=True)

# Change all the text to lower case 
df['Post'] = [i.lower() for i in df['Post']]

# Tokenization
df['Post'] = [word_tokenize(i) for i in df['Post']]

#
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

for index, entry in enumerate(df['Post']):
    final_words = []
    word_lemmatized = WordNetLemmatizer()
    
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_final = word_lemmatized.lemmatize(word, tag_map[tag[0]])
            final_words.append(word_final)
    df.loc[index, 'text_final'] = str(final_words)        

In [13]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\katal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [13]:
train_X, test_X, train_y, test_y = model_selection.train_test_split(df['text_final'], df['Type'], test_size = 0.3)

In [14]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['text_final'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=5000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [16]:
pickle.dump(Tfidf_vect.vocabulary_,open("feature.pkl","wb"))

In [None]:
#Load it later
transformer = TfidfTransformer()
loaded_vec = CountVectorizer(decode_error="replace",vocabulary=pickle.load(open("feature.pkl", "rb")))
tfidf = transformer.fit_transform(loaded_vec.fit_transform(np.array(["aaa ccc eee"])))

In [37]:
train_X_Tfidf = Tfidf_vect.transform(train_X)
test_X_Tfidf = Tfidf_vect.transform(test_X)

In [39]:
#print(Tfidf_vect.vocabulary_)

In [42]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(train_X_Tfidf, train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [43]:
predictions_NB = Naive.predict(test_X_Tfidf)

In [44]:
accuracy_score(predictions_NB, test_y)

0.8771390960947784

In [53]:
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score

In [50]:
recall_score(predictions_NB, test_y)

0.9281767955801105

In [52]:
precision_score(predictions_NB, test_y)

0.38620689655172413

In [55]:
roc_auc_score(predictions_NB, test_y)

0.900456367284812

## SVM 

In [303]:
SVM = svm.SVC(C=1.0, kernel='linear', gamma = 0.001, probability=True)
SVM.fit(train_X_Tfidf, train_y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [304]:
predictions_svm = SVM.predict(test_X_Tfidf)

In [305]:
accuracy_score(predictions_svm, test_y)

0.9148749451513822

In [306]:
recall_score(predictions_svm, test_y)

0.8413597733711048

In [307]:
precision_score(predictions_svm, test_y)

0.6827586206896552

In [308]:
roc_auc_score(predictions_svm, test_y)

0.8848543415142127

In [276]:
from sklearn.model_selection import GridSearchCV

In [314]:
Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas}
grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=5, scoring='roc_auc', n_jobs=-1)

In [315]:
%%time
grid_search.fit(train_X_Tfidf, train_y)

Wall time: 38.8 s


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10],
                         'gamma': [0.001, 0.01, 0.1, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=0)

In [316]:
grid_search.best_params_

{'C': 1, 'gamma': 1}

In [317]:
grid_search.best_score_

0.9611166172946215

In [318]:
svm_grid_preds = grid_search.best_estimator_.predict(test_X_Tfidf)

In [319]:
roc_auc_score(svm_grid_preds, test_y)

0.8968121443437256

In [320]:
precision_score(svm_grid_preds, test_y)

0.5908045977011495

In [321]:
recall_score(svm_grid_preds, test_y)

0.8831615120274914

In [322]:
f1_score(svm_grid_preds, test_y)

0.7079889807162536

In [323]:
accuracy_score(svm_grid_preds, test_y)

0.9069767441860465

## Testing with new input

In [61]:
from googletrans import Translator

In [62]:
translator = Translator()

In [351]:
a = "this is only my stomach could have an outward like a person without the kind of anxieties nothing and nobody understands what's happening in my stomach éxx here 13 years ago, the world is left to die körmï"
#a = translator.translate(a)

In [340]:
a = "discouraged"

In [352]:
words = word_tokenize(a)

In [353]:
final_words = []
word_lemmatized = WordNetLemmatizer()
    
for word, tag in pos_tag(words):
    if word not in stopwords.words('english') and word.isalpha():
        word_final = word_lemmatized.lemmatize(word, tag_map[tag[0]])
        final_words.append(word_final)
final = str(final_words)        

In [354]:
final

"['stomach', 'could', 'outward', 'like', 'person', 'without', 'kind', 'anxiety', 'nothing', 'nobody', 'understand', 'happen', 'stomach', 'éxx', 'year', 'ago', 'world', 'leave', 'die', 'körmï']"

In [355]:
final = Tfidf_vect.transform([final])

In [356]:
pred_n = Naive.predict_proba(final)

In [357]:
pred_svm = grid_search.best_estimator_.predict(final)

In [358]:
pred_n

array([[0.68206581, 0.31793419]])

In [359]:
pred_svm

array([1], dtype=int64)

In [360]:
import pickle

In [362]:
with open('ml_svm.pkl', 'wb') as handle:
    pickle.dump(SVM, handle, pickle.HIGHEST_PROTOCOL)