In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

### Introduction to the dataset: The dataset is a collection of posts from the "SuicideWatch" and "depression" subreddits of the Reddit platform, each labelled as either suicide or non-suicide.
### Our aim is to build a web app that classifies user input as suicide (text indicating self-harm) or non-suicide (text that does not).

### Reading the CSV file and inspecting the head and info of the dataset.

In [None]:
# Load the labelled posts into a DataFrame.
# NOTE(review): absolute "/content/..." path — presumably a Google Colab session; adjust when running elsewhere.
df=pd.read_csv("/content/Suicide_Nonsuicide.csv")

In [None]:
# Preview the first rows of the raw data (text and class columns).
df.head()

Unnamed: 0,text,class
0,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,Am I weird I don't get affected by compliments...,non-suicide
2,Finally 2020 is almost over... So I can never ...,non-suicide
3,i need helpjust help me im crying so hard,suicide
4,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


In [None]:
# Summarise column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20060 entries, 0 to 20059
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    20060 non-null  object
 1   class   20060 non-null  object
dtypes: object(2)
memory usage: 313.6+ KB


### Checking the value counts of 'class' to see whether the dataset is balanced.

In [None]:
# Count how many posts carry each label.
df['class'].value_counts()

non-suicide    10159
suicide         9901
Name: class, dtype: int64

### The dataset is well balanced. Now checking whether any null values are present in the dataset.

In [None]:
# Number of null entries per column.
df.isnull().sum()

text     0
class    0
dtype: int64

In [None]:
# Collect the index labels of rows whose text is a string that is empty or
# contains only whitespace. Iterating over the "text" column alone (instead of
# unpacking whole rows) keeps this cell working after more columns are added to
# df, and `not text.strip()` also catches "" which `str.isspace()` does not.
empty_idx=[indx for indx,text in df["text"].items() if isinstance(text,str) and not text.strip()]
print(empty_idx)


[]


### No null or whitespace-only text values are present in the dataset.

In [None]:
# Re-display the first rows before preprocessing.
df.head()

Unnamed: 0,text,class
0,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,Am I weird I don't get affected by compliments...,non-suicide
2,Finally 2020 is almost over... So I can never ...,non-suicide
3,i need helpjust help me im crying so hard,suicide
4,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


### Importing nltk and its libraries and dependencies.

In [None]:
import nltk

# Tokenizer function plus the "punkt" data package it is downloaded alongside.
from nltk.tokenize import word_tokenize 
nltk.download("punkt")

# Stop-word corpus used to drop common English words during cleaning.
from nltk.corpus import stopwords 
nltk.download("stopwords")

# WordNet data for the lemmatizer.
# NOTE(review): 'omw-1.4' is presumably required next to 'wordnet' by the installed NLTK version — confirm.
from nltk.stem import PorterStemmer,WordNetLemmatizer 
nltk.download("wordnet")
nltk.download('omw-1.4')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Now preprocessing and cleaning the text.

In [None]:
# Holds the stop-word set and the lemmatizer so they are built once, not per call.
_CLEAN_TEXT_CACHE={}


def _clean_text_resources():
  """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
  if not _CLEAN_TEXT_CACHE:
    # A frozenset turns each stop-word check into a hash lookup instead of a list scan.
    _CLEAN_TEXT_CACHE["stopwords"]=frozenset(stopwords.words("english"))
    _CLEAN_TEXT_CACHE["lemmatizer"]=WordNetLemmatizer()
  return _CLEAN_TEXT_CACHE["stopwords"],_CLEAN_TEXT_CACHE["lemmatizer"]


def clean_text(text):
  """Normalise one post into a space-separated string of lemmatized tokens.

  Steps, in order: lower-case the text, tokenize it with `word_tokenize`,
  keep only purely alphabetic tokens, drop English stop words, and lemmatize
  what remains with `WordNetLemmatizer`.

  Parameters
  ----------
  text : str
      Raw post text. Must be a string, because `.lower()` is called on it.

  Returns
  -------
  str
      The cleaned tokens joined by single spaces ("" if nothing survives).
  """
  stpwd,lemma=_clean_text_resources()

  token=word_tokenize(text.lower()) #case conversion + tokenization.

  #non alpha removal.
  ftoken=[i for i in token if i.isalpha()]

  #stop words removal
  stoken=[i for i in ftoken if i not in stpwd]

  #lemma.
  ltoken=[lemma.lemmatize(i) for i in stoken]

  #joining list of msgs
  return " ".join(ltoken)


In [None]:
# Force every entry to str so clean_text can call .lower() on it.
df["text"]=df["text"].astype(str)


In [None]:
# Run the cleaning pipeline on every post and store the result in a new column.
df["clean_text"]=df["text"].apply(clean_text) 

In [None]:
# Compare the raw text with the new clean_text column.
df.head()

Unnamed: 0,text,class,clean_text
0,Ex Wife Threatening SuicideRecently I left my ...,suicide,ex wife threatening suiciderecently left wife ...
1,Am I weird I don't get affected by compliments...,non-suicide,weird get affected compliment coming someone k...
2,Finally 2020 is almost over... So I can never ...,non-suicide,finally almost never hear bad year ever swear ...
3,i need helpjust help me im crying so hard,suicide,need helpjust help im cry hard
4,"I’m so lostHello, my name is Adam (16) and I’v...",suicide,losthello name adam struggling year afraid pas...


### The text has been cleaned. Separating x and y.

In [None]:
# Features: the cleaned text of each post.
x=df['clean_text']
x

0        ex wife threatening suiciderecently left wife ...
1        weird get affected compliment coming someone k...
2        finally almost never hear bad year ever swear ...
3                           need helpjust help im cry hard
4        losthello name adam struggling year afraid pas...
                               ...                        
20055    trans right human right nothing else say wante...
20056    want play cod mobile boiz complete codm soldie...
20057            posting longer lonely day yay almost week
20058    desperate ex gamblercrosspost used play online...
20059    let settle debate fingering girl considered gi...
Name: clean_text, Length: 20060, dtype: object

In [None]:
# Target: the "suicide" / "non-suicide" label of each post.
y=df['class']
y

0            suicide
1        non-suicide
2        non-suicide
3            suicide
4            suicide
            ...     
20055    non-suicide
20056    non-suicide
20057    non-suicide
20058        suicide
20059    non-suicide
Name: class, Length: 20060, dtype: object

### Transforming x into an array.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vec=TfidfVectorizer()
# Read from df["clean_text"] (the same data `x` held before this cell) rather
# than from `x` itself, so the cell can be re-run: once `x` has been overwritten
# with the dense array, feeding it back into the vectorizer fails.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so test-set vocabulary and IDF weights influence the training features.
# NOTE(review): .toarray() materialises a dense rows-by-vocabulary matrix.
x=vec.fit_transform(df["clean_text"]).toarray()
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Splitting the data and building the baseline model, Logistic Regression, because this is a binary classification problem and logistic regression is a strong, standard baseline for binary classification.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
xtrain,xtest,ytrain,ytest=train_test_split(
  x,
  y,
  test_size=0.20,
  random_state=1,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline model: fit on the training split (fit returns the estimator itself),
# then predict labels for the held-out split.
lr=LogisticRegression().fit(xtrain,ytrain)
ypred=lr.predict(xtest)

In [None]:
 # Per-class precision/recall/F1 of the logistic-regression predictions on the test split.
 # NOTE(review): this cell's lines start with one stray space; it ran in the notebook, but plain Python would reject the indent — confirm before exporting to a script.
 from sklearn.metrics import classification_report
 print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

 non-suicide       0.89      0.93      0.91      2016
     suicide       0.93      0.89      0.91      1996

    accuracy                           0.91      4012
   macro avg       0.91      0.91      0.91      4012
weighted avg       0.91      0.91      0.91      4012



### Logistic regression gave an accuracy of 91%. Now checking accuracy with Gaussian naive Bayes, multinomial naive Bayes, and random forest.

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [None]:
def mymodel(model): 
  """Fit `model` on the training split and print its test-split classification report.

  Uses the notebook-level `xtrain`, `ytrain`, `xtest` and `ytest`; the model is
  fitted in place and nothing is returned.
  """
  model.fit(xtrain,ytrain)
  report=classification_report(ytest,model.predict(xtest))
  print(report)

In [None]:
# Seed the forest (same value as the train/test split) so the reported metrics
# are repeatable across runs; without a seed every run grows different trees.
rf=RandomForestClassifier(random_state=1)
mymodel(rf)

              precision    recall  f1-score   support

 non-suicide       0.87      0.90      0.89      2016
     suicide       0.89      0.87      0.88      1996

    accuracy                           0.88      4012
   macro avg       0.88      0.88      0.88      4012
weighted avg       0.88      0.88      0.88      4012



In [None]:
# Gaussian naive Bayes on the dense TF-IDF features.
gb=GaussianNB()
mymodel(gb)

              precision    recall  f1-score   support

 non-suicide       0.72      0.82      0.77      2016
     suicide       0.79      0.67      0.73      1996

    accuracy                           0.75      4012
   macro avg       0.75      0.75      0.75      4012
weighted avg       0.75      0.75      0.75      4012



In [None]:
# Multinomial naive Bayes on the TF-IDF features.
mnb=MultinomialNB()
mymodel(mnb)

              precision    recall  f1-score   support

 non-suicide       0.96      0.74      0.84      2016
     suicide       0.79      0.97      0.87      1996

    accuracy                           0.85      4012
   macro avg       0.88      0.86      0.85      4012
weighted avg       0.88      0.85      0.85      4012



### Random forest, Gaussian naive Bayes, and multinomial naive Bayes gave accuracies of 88%, 75%, and 85% respectively, so logistic regression will be used for building the API.

### Importing pickle and pickling the fitted TF-IDF vectorizer and logistic regression objects.

In [None]:
import pickle

In [None]:
# Persist the fitted TF-IDF vectorizer; the context manager flushes and closes
# the file handle, which the bare open() call left dangling.
with open("tfidfvect.pkl","wb") as fh:
  pickle.dump(vec,fh)

In [None]:
# Persist the fitted logistic-regression model; the context manager flushes and
# closes the file handle, which the bare open() call left dangling.
with open("lreg.pkl","wb") as fh:
  pickle.dump(lr,fh)

### I have made a web app that classifies user input: non-harmful text as non-suicide and harmful text as suicide.