# Suicidial Ideation Detection
#### Ian Steenstra
#### Social Computing
#### April 27th, 2020

In [92]:
import pandas as pd
import numpy as np
from textblob import TextBlob as tb
import nltk
from nltk.stem.porter import PorterStemmer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, r2_score
from nltk.corpus import wordnet as wn
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
import pickle
from sklearn.externals import joblib
from gensim.models import FastText



### Data Preprocessing

In [3]:
data = pd.DataFrame(columns=('text', 'suicidal_ideation'))
for i in range(10):
    data = pd.concat([data, pd.read_csv('annotated_data'+str(i)+'.csv')], sort=False)

In [131]:
data = data.dropna()
data.head()

Unnamed: 0,text,suicidal_ideation
0,I got a surprise gift left outside from @_toky...,0.0
1,never gonna live to be a teenager,1.0
2,RT @GeoffreySupran: Hi @Harvard: Even as you r...,0.0
3,Google cares about me :'),0.0
4,Does anyone else feel like they have to commit...,1.0


In [6]:
X = data['text']
y = data['suicidal_ideation']

In [7]:
counts = y.value_counts()
size = y.shape[0]
print('Percent of yeses: ', counts[1]/size)
print('Percent of noes: ', counts[0]/size)

Percent of yeses:  0.3998871968415116
Percent of noes:  0.6001128031584885


In [8]:
X = X.apply(lambda text: re.sub(r"http\S+", "", text).lower())

### Feature Extraction

#### Sentiment Feature

In [9]:
sentiment_features = []

for text in X:
    sentiment = tb(text).sentiment
    sentiment_features.append([sentiment.polarity, sentiment.subjectivity])
    
sentiment_features = pd.DataFrame(sentiment_features)
sentiment_features.shape

(1773, 2)

#### TF-IDF Feature

In [10]:
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2)) 
tfidf_features = pd.DataFrame(tfidf.fit_transform(X).toarray())

tfidf_features.shape

(1773, 23187)

#### Absolute Words Feature

In [5]:
absol_words = [word.lower() for word in [
    'Absolutely',
    'All',
    'Always',
    'Complete',
    'Completely',
    'Constant',
    'Constantly',
    'Definitely',
    'Entire',
    'Ever',
    'Every',
    'Everyone',
    'Everything',
    'Full',
    'Must',
    'Never',
    'Nothing',
    'Totally',
    'Whole']]

absol_words_syn = []
for word in absol_words:
    for ss in wn.synsets(word):
        absol_words_syn.extend(ss.lemma_names())
absol_words = list(dict.fromkeys(absol_words_syn))

In [11]:
def countWord(lst, w): 
    count = 0
    for ele in lst: 
        if (ele == w): 
            count = count + 1
    return count

In [12]:
absol_word_counts = []
for text in X:
    count = 0
    tokens = nltk.word_tokenize(text)
    for tok in tokens:
        count += countWord(absol_words, tok)
    absol_word_counts.append([count])
    
absol_word_counts = pd.DataFrame(absol_word_counts)
absol_word_counts.shape

(1773, 1)

#### Below is where I use a pretrained humor detection model to test whether it would generate valuable features

In [121]:
text=[]
for sentence in X:
    sent_word_list = nltk.word_tokenize(sentence)
    text.append(sent_word_list)

w2v = FastText(text, min_count=1)

for index, row in data.iterrows():
    sent = row['text']
    if len(sent)!=0:
        sent_vect = sum([w2v[w.lower()] for w in nltk.word_tokenize(sent)])/(len(sent.split())+0.001)
    else:  
        sent_vect = np.zeros((100,))
    vect_record.append(sent_vect) 



In [129]:
humor_feature = pd.DataFrame(vect_record, columns=range(100))
humor_feature.to_csv('humor_features.csv')
print(humor_feature.head())
print(humor_feature.shape)

         0         1         2         3         4         5         6   \
0 -0.712867  0.475950  0.458042 -0.055522  0.262554 -0.765900 -0.476390   
1 -0.715446  0.474678  0.460877 -0.055684  0.263628 -0.766737 -0.477077   
2 -0.732085  0.487082  0.470120 -0.056042  0.271327 -0.787066 -0.489229   
3 -0.656790  0.442275  0.423596 -0.050908  0.245487 -0.710136 -0.436591   
4 -0.724267  0.480995  0.467234 -0.057034  0.269219 -0.779788 -0.482606   

         7         8         9   ...        90        91        92        93  \
0 -0.296177 -0.731238  0.447303  ...  0.040290  0.102925  0.421974 -0.184266   
1 -0.297413 -0.732013  0.447009  ...  0.040072  0.102386  0.423000 -0.183198   
2 -0.305295 -0.749801  0.458566  ...  0.043675  0.103281  0.432028 -0.189249   
3 -0.272788 -0.674683  0.413985  ...  0.037736  0.091230  0.388590 -0.169146   
4 -0.301063 -0.742236  0.453675  ...  0.041673  0.104214  0.428427 -0.184741   

         94        95        96        97        98        99  
0  0

In [93]:
lf_from_joblib = joblib.load('humor_detection_linreg.pkl') 

#### Decided not to use the humor feature because all text were classified as 0 or not humorous

In [130]:
humor_pred = lf_from_joblib.predict(humor_feature)
sum(humor_pred)

0

#### Combined feature vectors

In [13]:
features = pd.concat([tfidf_features, sentiment_features, absol_word_counts], axis=1)
features.shape

(1773, 23190)

#### Scaled feature vector

In [14]:
features = features.values
min_max_scaler = preprocessing.MinMaxScaler()
features_scaled = min_max_scaler.fit_transform(features)
features = pd.DataFrame(features_scaled)

### Training w/ GridSearchCV

In [74]:
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001],'kernel': ['linear', 'rbf', 'poly']}

In [15]:
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2)

In [77]:
grid = GridSearchCV(SVC(), param_grid, verbose=2, cv=3, n_jobs=-1)
grid.fit(X_train,y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 23.7min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001],
                         'kernel': ['linear', 'rbf', 'poly']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

### Results

In [78]:
print(grid.best_estimator_)

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [79]:
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test,grid_predictions))
print(classification_report(y_test,grid_predictions,target_names=['no', 'yes']))

[[183  28]
 [ 29 115]]
              precision    recall  f1-score   support

          no       0.86      0.87      0.87       211
         yes       0.80      0.80      0.80       144

    accuracy                           0.84       355
   macro avg       0.83      0.83      0.83       355
weighted avg       0.84      0.84      0.84       355



### Feature Importance

In [84]:
features = pd.concat([tfidf_features], axis=1) # changed the features in the list
features.shape

features = features.values
min_max_scaler = preprocessing.MinMaxScaler()
features_scaled = min_max_scaler.fit_transform(features)
features = pd.DataFrame(features_scaled)

In [85]:
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=1234) 

In [86]:
model = SVC(C=0.1, gamma=1, kernel='linear').fit(X_train, y_train)

In [87]:
y_pred = model.predict(X_test)

In [88]:
print(classification_report(y_test,y_pred,target_names=['no', 'yes']))

              precision    recall  f1-score   support

          no       0.87      0.85      0.86       225
         yes       0.75      0.78      0.77       130

    accuracy                           0.83       355
   macro avg       0.81      0.82      0.81       355
weighted avg       0.83      0.83      0.83       355



#### Accuracy & Precision of Features:
- All: 83%: precision - 76% yes, 87% no
- TF-IDF: 83%: precision - 75% yes, 87% no
- Sentiment: 63%: precision - 0% yes, 63% no
- Absol_Word-Count: 63%: precision - 0% yes, 63% no