#### Setup
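The setup is the same as before: imports, NLTK resources, data loading, the train/test split, and text preprocessing.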

In [36]:
import nltk
import time
import eli5
import string
import warnings
import numpy as np
import pandas as pd 
import seaborn as sns 
import plotly.express as px
import matplotlib.pyplot as plt

from eli5.lime import TextExplainer

from wordcloud import WordCloud, STOPWORDS

from nltk.corpus import stopwords 
from nltk.tokenize import TweetTokenizer
from nltk import word_tokenize, sent_tokenize 
from nltk.stem import PorterStemmer, WordNetLemmatizer 

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV 
from sklearn.metrics import mean_squared_error, accuracy_score, recall_score,f1_score, classification_report

# Plot styling: white-grid axes with the "rocket" colour palette.
sns.set_style(style="whitegrid")
sns.set_palette(palette="rocket")

# Suppress all warnings raised from here on.
warnings.simplefilter("ignore")

In [24]:
# NLTK resources fetched for this notebook; the stop-word lists are read by
# stopwords.words('english') in the preprocessing step.
download_status = [nltk.download(package) for package in ('wordnet', 'stopwords', 'omw-1.4')]

# Display the status returned by the last download, as the cell did before.
download_status[-1]

[nltk_data] Downloading package wordnet to /Users/lois/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/lois/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/lois/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [37]:
# Dataset location, sample size and sampling seed are named here so they can be
# changed in one place.
DATA_PATH = "/Users/Lois/Downloads/ML ipynb/Capstone/SuicideDetection.csv"
N_SAMPLES = 20000
SAMPLE_SEED = 42

suicide = pd.read_csv(DATA_PATH)

# Work on a reproducible random subset. min() guards against a file with fewer
# than N_SAMPLES rows, for which DataFrame.sample would raise a ValueError.
suicide = suicide.sample(min(N_SAMPLES, len(suicide)), random_state = SAMPLE_SEED)

In [38]:
#only keeping the text and class columns
#(errors = "ignore" keeps the cell re-runnable once the column is already gone)
suicide = suicide.drop(columns = "Unnamed: 0", errors = "ignore")

#renaming "class" column as "suicide"
suicide = suicide.rename(columns = {"class" : "suicide"})

#numerically representing the contents of "suicide" column
#the nested dict restricts the replacement to the label column; a frame-wide
#replace would also rewrite any post whose entire text is "suicide" or "non-suicide"
suicide = suicide.replace({"suicide" : {"suicide" : 1, "non-suicide" : 0}})
suicide.head()

Unnamed: 0,text,suicide
74414,I Don't know?7? Months self harm free and the ...,1
149516,I HAVE TO START BECOMING RICH I HAVE TO START ...,0
12484,"A poem (haiku) for u/Me-Game-Dev hi, hello hel...",0
14043,I've honestly got no idea what to do anymore.I...,1
30673,Do you ever just cry? Like you just think abou...,0


In [39]:
#train test split: the post text is the feature, the "suicide" column is the target
X, y = suicide['text'], suicide['suicide']

#hold out a quarter of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.25,
    random_state = 42,
)

In [40]:
#tokenization, stop word removal, punctuation removal, lower casing, and stemming
def preprocess_text(text):
    """Tokenize, filter and stem every document in ``text``.

    Each document is tokenized with ``TweetTokenizer(preserve_case = False)``.
    Tokens that are English stop words or single punctuation characters are
    dropped, and the remaining tokens are stemmed with ``PorterStemmer``.

    Parameters
    ----------
    text : iterable of str, or str
        The documents to process. A single string is treated as one document
        rather than being iterated character by character.

    Returns
    -------
    list of list of str
        One list of stemmed tokens per input document, in input order.

    Notes
    -----
    The stop-word test runs on the token before it is stemmed. Only tokens that
    are exactly one ``string.punctuation`` character are removed, so a
    multi-character token such as ``"..."`` is kept.
    """
    # A bare string would otherwise be looped over one character at a time.
    if isinstance(text, str):
        text = [text]

    tokenizer = TweetTokenizer(preserve_case = False)
    stemmer = PorterStemmer()

    # One set gives constant-time membership tests; the original scanned two
    # lists for every token. "not in A and not in B" equals "not in A | B".
    excluded = set(stopwords.words('english')) | set(string.punctuation)

    preprocessed_text = []
    for entry in text:
        tokens = tokenizer.tokenize(entry)
        preprocessed_text.append([stemmer.stem(word) for word in tokens if word not in excluded])
    return preprocessed_text

In [41]:
#preprocessing: clean each split, then join every post's tokens back into one
#space-separated string so it can be fed to a text vectorizer
pX_train = [' '.join(tokens) for tokens in preprocess_text(X_train)]
pX_test = [' '.join(tokens) for tokens in preprocess_text(X_test)]

#### Model Interpretation 
We will be using the eli5 library to interpret the SVC model. 

In [42]:
#SVC model: TF-IDF features feeding a linear-kernel support vector classifier
svc = Pipeline(steps = [
    ('tfidf', TfidfVectorizer()),
    ('svc', SVC(kernel = 'linear', decision_function_shape = 'ovo')),
])

#fit on the preprocessed training posts (the trailing semicolon hides the repr)
svc.fit(pX_train, y_train);

##### Model Stats

In [75]:
#score the held-out posts, then print the per-class report followed by the overall accuracy
y_pred = svc.predict(pX_test)
print(
    classification_report(y_test, y_pred),
    "accuracy: {:0.3f}".format(accuracy_score(y_test, y_pred)),
    sep = "\n",
)

              precision    recall  f1-score   support

           0       0.92      0.93      0.92      2487
           1       0.93      0.92      0.92      2513

    accuracy                           0.92      5000
   macro avg       0.92      0.92      0.92      5000
weighted avg       0.92      0.92      0.92      5000

accuracy: 0.924


**Example prediction of a non-suicidal post**

Recall that 0 = not suicidal and 1 = suicidal

In [52]:
#example prediction of the first datapoint
#predict only that one post instead of re-scoring the whole test set to read element 0
y_pred0 = svc.predict(pX_test[:1])[0]
print(f'Predicted class: {y_pred0} \nActual class: {y_test.iloc[0]} \nPost: {X_test.iloc[0]}')

Predicted class: 0 
Actual class: 0 
Post: A teenage love story. So it all started in 2nd grade. I really liked that one girl in my class (let's call her Sara for sake of privacy) and never knew how to tell her. 

Fast forward to 3rd grade and our teacher was pissed at our class so she decided to rearrange our seats, now... take a hot guess, who was chosen to seat with me. If you guessed Sara, you were right.

We started getting along, having fun on lessons and lunch breaks. We were best friends.

Fast forward again now to 4th grade. All the classes were mixed and we sadly weren't in the same one. We still hung out and stuff but didn't have as much time for each other as before. Year 2016, February 14th, Valentines day. "It's my chance!" so since we didn't finish lessons at the same time I gave my close friend a letter from me to give to her after school. He gave it to her. Same day, just after class, she messages me and tells me she had a big crush on me since 1st grade "The fuck!?" w

**Example prediction of a suicidal post**

In [54]:
#example prediction of the second datapoint
#predict only that one post instead of re-scoring the whole test set to read element 1
y_pred1 = svc.predict(pX_test[1:2])[0]
print(f'Predicted class: {y_pred1} \nActual class: {y_test.iloc[1]} \nPost: {X_test.iloc[1]}')

Predicted class: 1 
Actual class: 1 
Post: I don't want to feel this way.I just want to get my meds straightened out and be happy. I don't want to feel sad, worthless, or like life is pointless. I want to enjoy life again and feel like others love me. 

I just need someone to tell me that it'll all get straightened out and I'll be happy again. I really need some one to reassure me because it all feels so bleak. I'm sorry. 


#### eli5

In [80]:
#the most important features
#the pipeline was fitted on stemmed text, so the feature names are stems rather than full words
#top=10 limits the table to ten features (presumably ranked by absolute weight; confirm in the eli5 docs)
eli5.show_weights(svc, top=10)

Weight?,Feature
+7.969,suicid
+5.339,kill
+3.849,end
+3.259,pill
+3.131,life
+3.015,die
+2.878,live
+2.768,hang
+2.659,anymor
+2.657,overdos
