In [5]:
import pandas as pd
from nltk import re
import numpy as np
import keras.layers
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, Conv1D, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
import tensorflow as tf
import html
import string
import time
import nltk
from keras.callbacks import EarlyStopping,TensorBoard,ModelCheckpoint
import seaborn as sns

In [3]:
import csv

# Location of the input CSV, relative to the notebook's working directory.
DATA_PATH = 'Cleaned_Depression_Vs_Suicide.csv'

# Records are split on '\n' only, as requested via ``lineterminator``.
df = pd.read_csv(DATA_PATH, lineterminator='\n')

In [3]:
# Display the frame as loaded from the CSV, before any cleaning
# (pandas elides the middle rows of a large frame in the rendered output).
df

Unnamed: 0,text,class
0,Feeling a bit depressedI've been in a big low ...,SuicideWatch
1,Was going to hang myself but didn't have guts ...,SuicideWatch
2,Have you ever maintained a poor friendship jus...,depression
3,I haven't felt positive feelings in a long tim...,depression
4,Partners of those who suffer from depression; ...,depression
...,...,...
609767,They all hate meI feel like everything I say a...,SuicideWatch
609768,Just need to vent.I'm not going to lie I have ...,depression
609769,I promised myself I wouldn’t go through with i...,SuicideWatch
609770,Getting off of anti-deprssantsI've been taking...,depression


In [4]:
# Discard every row that has a missing value in any column.
# The surviving rows keep their original index labels (no reset is performed).
df = df.dropna(axis=0)

In [5]:
# Sanity check: number of missing values remaining in each column.
df.isnull().sum()

text     0
class    0
dtype: int64

In [6]:
# Summary of the columns: row count, number of distinct values, the most
# frequent value and how often it occurs.
df.describe() 

Unnamed: 0,text,class
count,609772,609772
unique,609772,2
top,"Need some advice. I'm not suicidal, but someo...",SuicideWatch
freq,1,304886


In [7]:
def convert_lower(text):
    """Return ``text`` converted to lower case.

    Parameters
    ----------
    text : str
        The string to normalise.

    Returns
    -------
    str
        The result of ``text.lower()``.
    """
    return text.lower()

# Lower-case every post; the function is passed directly, no wrapper lambda needed.
df["text"] = df["text"].apply(convert_lower)

# removing punctuation

# Translation table that deletes every character in ``string.punctuation``.
# Built once so each call is a single pass over the text instead of one
# ``str.replace`` pass per punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuations(text):
    """Return ``text`` with all ``string.punctuation`` characters removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    other characters (letters, digits, whitespace, non-ASCII symbols such as
    the typographic apostrophe) are left untouched.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without ASCII punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every (already lower-cased) post, overwriting the column.
df["text"] = df['text'].apply(remove_punctuations)

# removing numbers

# ``regex=True`` must be explicit: since pandas 2.0 ``Series.str.replace``
# treats the pattern as a literal string by default, which would leave every
# digit in place. The raw string avoids the invalid ``\d`` escape warning.
df['text'] = df['text'].str.replace(r'\d+', '', regex=True)

In [8]:
# Fetch the NLTK 'punkt' package (a no-op when it is already up to date);
# presumably required by nltk.word_tokenize in the tokenization step.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead —
# confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Karthik Ram
[nltk_data]     Srinivas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
#tokenization

# Tokenize the cleaned text column element by element. Applying over the
# single column avoids materialising a Series for every row, which the
# previous row-wise ``df.apply(..., axis=1)`` did for each of the rows.
df['tokenized_text'] = df['text'].apply(nltk.word_tokenize)

In [10]:
# Fetch the NLTK 'stopwords' corpus (a no-op when it is already up to date);
# it is read by nltk.corpus.stopwords.words("english") in the next step.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Karthik Ram
[nltk_data]     Srinivas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# removing stopwords

stopwords = nltk.corpus.stopwords.words("english")

# Frozen copy used for membership tests: a set lookup is constant-time, whereas
# scanning the list for every token of every post is not. ``stopwords`` itself
# is kept as the original list for any other code that reads it.
_stopword_set = frozenset(stopwords)


def stopwords_remove(text):
    """Return the tokens of ``text`` that are not English stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single document.

    Returns
    -------
    list of str
        The tokens, in their original order, with stop words dropped.
    """
    return [word for word in text if word not in _stopword_set]


# NOTE(review): punctuation was stripped before this step, so tokens such as
# 'dont' / 'didnt' / 'havent' survive (they are visible in the frame preview).
# Presumably the stop-word list spells these with an apostrophe — confirm
# whether keeping them is intended.
df["tokenized_text"] = df["tokenized_text"].apply(stopwords_remove)

In [12]:
from nltk.stem.snowball import SnowballStemmer

# One shared English Snowball stemmer, reused for every token.
stemmer = SnowballStemmer("english")


def _stem_tokens(tokens):
    """Return the stem of each token in ``tokens``, preserving order."""
    return [stemmer.stem(token) for token in tokens]


# New column holding the stemmed version of each token list.
df['stemmed'] = df['tokenized_text'].apply(_stem_tokens)

In [13]:
# Inspect the frame after cleaning: it now carries the 'tokenized_text' and
# 'stemmed' list columns alongside 'text' and 'class'.
df

Unnamed: 0,text,class,tokenized_text,stemmed
0,feeling a bit depressedive been in a big low a...,SuicideWatch,"[feeling, bit, depressedive, big, low, weekend...","[feel, bit, depressed, big, low, weekend, dont..."
1,was going to hang myself but didnt have guts e...,SuicideWatch,"[going, hang, didnt, guts, enough, kick, away,...","[go, hang, didnt, gut, enough, kick, away, cha..."
2,have you ever maintained a poor friendship jus...,depression,"[ever, maintained, poor, friendship, keep, las...","[ever, maintain, poor, friendship, keep, last,..."
3,i havent felt positive feelings in a long time...,depression,"[havent, felt, positive, feelings, long, time,...","[havent, felt, posit, feel, long, time, dont, ..."
4,partners of those who suffer from depression h...,depression,"[partners, suffer, depression, work, itmy, lon...","[partner, suffer, depress, work, itmi, long, t..."
...,...,...,...,...
609767,they all hate mei feel like everything i say a...,SuicideWatch,"[hate, mei, feel, like, everything, say, pushe...","[hate, mei, feel, like, everyth, say, push, pe..."
609768,just need to ventim not going to lie i have ne...,depression,"[need, ventim, going, lie, never, felt, low, l...","[need, ventim, go, lie, never, felt, low, life..."
609769,i promised myself i wouldn’t go through with i...,SuicideWatch,"[promised, ’, go, mom, dies, fucking, readyi, ...","[promis, ’, go, mom, die, fuck, readyi, want, ..."
609770,getting off of antideprssantsive been taking c...,depression,"[getting, antideprssantsive, taking, celexa, y...","[get, antideprssants, take, celexa, year, well..."


In [14]:
# Re-join each stemmed token list into one space-separated string per document.
input2_corrected = list(map(" ".join, df['stemmed']))

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(use_idf=True)

# Learn the vocabulary and IDF weights, then vectorize the corpus in one call.
# NOTE(review): the vectorizer is fitted on every document before the
# train/test split, so IDF statistics include the held-out rows — consider
# fitting on the training portion only.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(input2_corrected)

In [15]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
tfidf_vectorizer_vectors

<609772x395472 sparse matrix of type '<class 'numpy.float64'>'
	with 45764601 stored elements in Compressed Sparse Row format>

In [16]:
def dummies(x):
    """Translate a class label into its integer code.

    Parameters
    ----------
    x : object
        A label value; ``'SuicideWatch'`` and ``'depression'`` are recognised.

    Returns
    -------
    int or None
        ``1`` for ``'SuicideWatch'``, ``2`` for ``'depression'`` and ``None``
        for any other value.
    """
    if x == 'SuicideWatch':
        return 1
    return 2 if x == 'depression' else None

# Encode the string labels as integers. The guard keeps the cell idempotent:
# ``dummies`` returns None for anything but the two string labels, so re-running
# the cell on an already-encoded column would otherwise wipe every label.
if df['class'].isin(['SuicideWatch', 'depression']).any():
    df['class'] = df['class'].apply(dummies)

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_vectorizer_vectors,
    df['class'],
    test_size=0.3,
    random_state=101,
)

In [18]:
# Re-display the frame: 'class' now holds the integer codes produced by dummies
# (1 for SuicideWatch, 2 for depression).
df

Unnamed: 0,text,class,tokenized_text,stemmed
0,feeling a bit depressedive been in a big low a...,1,"[feeling, bit, depressedive, big, low, weekend...","[feel, bit, depressed, big, low, weekend, dont..."
1,was going to hang myself but didnt have guts e...,1,"[going, hang, didnt, guts, enough, kick, away,...","[go, hang, didnt, gut, enough, kick, away, cha..."
2,have you ever maintained a poor friendship jus...,2,"[ever, maintained, poor, friendship, keep, las...","[ever, maintain, poor, friendship, keep, last,..."
3,i havent felt positive feelings in a long time...,2,"[havent, felt, positive, feelings, long, time,...","[havent, felt, posit, feel, long, time, dont, ..."
4,partners of those who suffer from depression h...,2,"[partners, suffer, depression, work, itmy, lon...","[partner, suffer, depress, work, itmi, long, t..."
...,...,...,...,...
609767,they all hate mei feel like everything i say a...,1,"[hate, mei, feel, like, everything, say, pushe...","[hate, mei, feel, like, everyth, say, push, pe..."
609768,just need to ventim not going to lie i have ne...,2,"[need, ventim, going, lie, never, felt, low, l...","[need, ventim, go, lie, never, felt, low, life..."
609769,i promised myself i wouldn’t go through with i...,1,"[promised, ’, go, mom, dies, fucking, readyi, ...","[promis, ’, go, mom, die, fuck, readyi, want, ..."
609770,getting off of antideprssantsive been taking c...,2,"[getting, antideprssantsive, taking, celexa, y...","[get, antideprssants, take, celexa, year, well..."


In [19]:
from sklearn.linear_model import LogisticRegression

# The 'sag' solver shuffles the training data, so an explicit random_state is
# needed for the fitted coefficients (and the metrics below) to be repeatable.
# The seed matches the one used for the train/test split.
lr = LogisticRegression(solver='sag', random_state=101)

# Fit on the training portion, then predict labels for the held-out rows.
lr.fit(X_train, y_train)
logistic_predictions = lr.predict(X_test)

In [20]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Print, in order: the confusion matrix (rows are true labels), the per-class
# precision/recall/F1 report, and the overall accuracy of the logistic model.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, logistic_predictions))

[[72343 19209]
 [18424 72956]]
              precision    recall  f1-score   support

           1       0.80      0.79      0.79     91552
           2       0.79      0.80      0.79     91380

    accuracy                           0.79    182932
   macro avg       0.79      0.79      0.79    182932
weighted avg       0.79      0.79      0.79    182932

0.7942787483873789


In [21]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the same TF-IDF features; ``fit`` returns the
# estimator itself, so construction and training are chained.
MNB = MultinomialNB().fit(X_train, y_train)

# Labels predicted for the held-out rows.
predicted = MNB.predict(X_test)

In [22]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Print, in order: the confusion matrix (rows are true labels), the per-class
# precision/recall/F1 report, and the overall accuracy of the naive Bayes model.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, predicted))

[[66065 25487]
 [18658 72722]]
              precision    recall  f1-score   support

           1       0.78      0.72      0.75     91552
           2       0.74      0.80      0.77     91380

    accuracy                           0.76    182932
   macro avg       0.76      0.76      0.76    182932
weighted avg       0.76      0.76      0.76    182932

0.7586808212887849
