In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import warnings
warnings.filterwarnings('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Visualisation libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning Libraries
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest,chi2,f_classif
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,AdaBoostClassifier,GradientBoostingClassifier,BaggingClassifier
from sklearn.metrics import classification_report , confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

import pickle
import string

# Text Processing libraries
import nltk
from nltk.stem import PorterStemmer

import os
for dirname, _, filenames in os.walk('/Users/muema'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

ModuleNotFoundError: No module named 'xgboost'

In [None]:
data = pd.read_csv('/Users/muema/Suicide_Detection.csv')
data.head()

# Data Preprocessing

In [None]:
data.shape

- The given dataset contain more than **2 lakh** rows.
- It takes take significant time and resources, as it involves optimizing the parameters of the model and processing the data iteratively.
- So I will take 10000 data points which will help to reduce complexity.

In [None]:
df = data.sample(n=10000, random_state=42)

In [None]:
df.info()

In [None]:
df['Unnamed: 0'].is_unique

In [None]:
df.drop(columns = 'Unnamed: 0',inplace=True)

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
df.duplicated().sum()

# Data Visualisation

In [None]:
classCnt = df['class'].value_counts()
print(classCnt)

plt.figure(figsize=((20,5)))

plt.subplot(1,2,1)
sns.countplot(df,x='class')

plt.subplot(1,2,2)
plt.pie(classCnt,labels = classCnt.index,autopct='%.0f%%')

plt.show()

# Text Preprocessing

## Lowering the Text
- Changing the case involves converting all text to lowercase or uppercase so that all word strings follow a consistent format. 
- Lowercasing is the more frequent choice in NLP software.

In [None]:
df['text']= df['text'].str.lower()

## Remove Punctuations
- Text preprocessing involves various techniques to clean and transform raw text data into a more suitable format for analysis.
- Removing punctuation is one of the most commonly used preprocessing techniques, as punctuation marks do not usually add much semantic value to the text and can interfere with downstream NLP tasks.
-  String library of Python contains some pre-defined list of punctuations such as **‘!”#$%&'()*+,-./:;?@[\]^_`{|}~’**

In [None]:
df['text'] = df['text'].str.replace(r'[^\w\s]+', '',regex = True)

## Stop word removal

- Stopwords are the most commonly occurring words in a language, such as "the", "and", "a", "an", "in", "to", etc. 
- These words have very little semantic value and are often used to connect meaningful words in a sentence. 
- In many natural language processing (NLP) tasks, stopwords can be removed without affecting the meaning of the text, and doing so can actually improve the performance of the NLP models.

In [None]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

## Tokenization   
- The tokenization stage involves converting a sentence into a stream of words, also called “tokens.”
- Tokens are usually words, but they can also be phrases, symbols, or other meaningful units of text.

In [None]:
df['text'] = df['text'].apply(lambda x:nltk.word_tokenize(x))

## Stemming
- The term word stem is borrowed from linguistics and used to refer to the base or root form of a word.
- Stemming is the process of converting all words to their base form, or stem. 

In [None]:
ps = PorterStemmer()
df['text'] = df['text'].apply(lambda x : [ps.stem(i) for i in x])

In [None]:
df['text']=df['text'].apply(lambda x : ' '.join(x))

In [None]:
df.head()

In [None]:
# Saved the cleaned dataset.
df.to_csv('file1.csv')

In [None]:
dfnew = pd.read_csv('file1.csv')
dfnew.head()

In [None]:
dfnew.info()

In [None]:
ind = dfnew[dfnew['text'].isnull()].index

In [None]:
df.iloc[ind]

- **'text'** column contains None values.
- It may contain puctuations or emojies.
- So I am going to drop that rows

In [None]:
dfnew.dropna(inplace=True)

# Machine Learning - Model Selection

In [None]:
x,y = dfnew['text'],dfnew['class']

## TF-IDF Vectorizer

In [None]:
vectorizer = TfidfVectorizer(min_df=50,max_features=5000)
x =  vectorizer.fit_transform(x).toarray()

In [None]:
# Save the model
with open('tfidf.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)    

In [None]:
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=5)

In [None]:
X_train.shape,X_test.shape

## Naive Bayes (Voting Classifier)

In [None]:
nb = GaussianNB()
nb2 = BernoulliNB()
nb3 = MultinomialNB()
VotingClassifiers = VotingClassifier(estimators=[('GaussianNB', nb),('BernoulliNB',nb2), ('MultinomialNB', nb3)], voting = 'soft')
VotingClassifiers.fit(X_train, y_train)
print('Training score:',VotingClassifiers.score(X_train, y_train))
print('Testing score:',VotingClassifiers.score(X_test,y_test))

In [None]:
y_act=y_test
y_pred=VotingClassifiers.predict(X_test)
sns.heatmap(confusion_matrix(y_act,y_pred),annot=True,cmap='summer')
print(classification_report(y_act,y_pred))

## Random Forest

In [None]:
classifiers = RandomizedSearchCV(RandomForestClassifier(),{'n_estimators':[4,5],'criterion':['entropy'],
                                                      'max_depth':range(1,4),'min_samples_split':range(2,5)},random_state=12)
classifiers.fit(X_train, y_train)
print('Training score:',classifiers.score(X_train, y_train))
print('Testing score:',classifiers.score(X_test,y_test))
print(classifiers.best_estimator_)

In [None]:
y_act=y_test
y_pred=classifiers.predict(X_test)
sns.heatmap(confusion_matrix(y_act,y_pred),annot=True,cmap='Spectral')
print(classification_report(y_act,y_pred))

## Decision Tree

In [None]:
model2 = DecisionTreeClassifier(criterion='gini',splitter='random',min_samples_leaf=70,max_depth=4,random_state=0)
model2.fit(X_train, y_train)
print(model2.score(X_train, y_train))
print(model2.score(X_test,y_test))

In [None]:
y_act=y_test
y_pred=model2.predict(X_test)
sns.heatmap(confusion_matrix(y_act,y_pred),annot=True,cmap='PiYG')
print(classification_report(y_act,y_pred))

## Gradient Boosting

In [None]:
model3 = RandomizedSearchCV(GradientBoostingClassifier(),{"learning_rate": range(3,5),
                "max_depth":[200],"max_features":range(6,10,2),
                 "n_estimators":[10]},random_state=8,n_jobs=-1)
model3.fit(X_train,y_train)
print('Training score:',model3.score(X_train,y_train))
print('Testing score:',model3.score(X_test,y_test))
model3.best_params_

In [None]:
#confusion matrix and classification report
y_act=y_test
y_pred=model3.predict(X_test)
sns.heatmap(confusion_matrix(y_act,y_pred),annot=True,cmap='PRGn')
print(classification_report(y_act,y_pred))

## XG Boost

In [None]:
model = XGBClassifier( eval_metric='map',max_depth=200,n_estimators=70,learning_rate=1.99)
model.fit(X_train,y_train.replace({"non-suicide":0,'suicide':1}))
print('Training score:',model.score(X_train,y_train.replace({"non-suicide":0,'suicide':1})))
print('Testing score:',model.score(X_test,y_test.replace({"non-suicide":0,'suicide':1})))

In [None]:
#confusion matrix and classification report
y_act = y_test.replace({"non-suicide":0,'suicide':1})
y_pred = model.predict(X_test)
sns.heatmap(confusion_matrix(y_act,y_pred),annot=True,cmap='Spectral')
print(classification_report(y_act,y_pred))

## K-Nearest Neighbour

In [None]:
model = RandomizedSearchCV(KNeighborsClassifier(),{'n_neighbors':[8],'metric':['manhattan','minkowski','cosine','tanimoto'],
                                                   'p':[1,2]},random_state=42,n_jobs=-1)
model.fit(X_train, y_train)
print('Training score:',model.score(X_train, y_train))
print('Testing score:',model.score(X_test,y_test))
print(model.best_estimator_)

In [None]:
y_act = y_test
y_pred = model.predict(X_test)
sns.heatmap(confusion_matrix(y_act,y_pred),annot=True,cmap='summer')
print(classification_report(y_act,y_pred))

# Conclusion

- From the above we can say that out of all models, **Naive Bayes (Voting Classifier)**  is best fit model for the dataset.
+ Training score: 0.899271324474925
* Testing score: 0.8753333333333333

In [None]:
# save the Model
with open('best_model.pkl', 'wb') as f:
    pickle.dump(VotingClassifiers, f)    

In [None]:
def preprocess(inp):
    inp = inp.lower() #convert to lower case 
    inp = inp.replace(r'[^\w\s]+', '') #remove punctuations
    inp = [word for word in inp.split() if word not in (stop_words)] #tokenize the sentence
    inp = ' '.join([ps.stem(i) for i in inp]) #stremming
    inputToModel = vectorizer.transform([inp]).toarray() #transform to vector form
    return inputToModel

In [None]:
def app(input_text):
    # Define the input text box
    print('Input : ',input_text) #take input from user
    processed_array = preprocess(input_text) #preprocess the text 
    predict = VotingClassifiers.predict(processed_array) #Model prediction
    print('Output : ', predict[0])

In [None]:
app('i am tired of my life i want to end my life')

In [None]:
app('Have a nice day! Happy Coding😊')

Repository on Github: https://github.com/RutujaPotdar/Mental-Health-App

## Thank you