In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
#printing stopwords
nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#data preprocessing
mh_data=pd.read_csv('Sdata3.csv',encoding='unicode_escape')
#printing first five lines
print(mh_data.head())

       id                                               text        class  \
0  302034  I made a grave mistake I donât remember the ...  non-suicide   
1  302035  What series you like. I have watched all my fa...  non-suicide   
2  302036  Guys I did it! I lost my virginity but it wasn...  non-suicide   
3  302037  This guy like me or no? So, basically I have t...  non-suicide   
4  302040  I have no hopeMy ex boyfriend cheated on me an...      suicide   

   Unnamed: 3  Unnamed: 4  Unnamed: 5  Unnamed: 6  Unnamed: 7  
0         NaN         NaN         NaN         NaN         NaN  
1         NaN         NaN         NaN         NaN         NaN  
2         NaN         NaN         NaN         NaN         NaN  
3         NaN         NaN         NaN         NaN         NaN  
4         NaN         NaN         NaN         NaN         NaN  


In [5]:
#checking for missing values
print(mh_data.isnull().sum())

id               8
text            10
class           16
Unnamed: 3    4999
Unnamed: 4    4999
Unnamed: 5    4999
Unnamed: 6    4999
Unnamed: 7    4999
dtype: int64


In [6]:
#replacing missing values with null string
mh_data=mh_data.fillna('')
print(mh_data.isnull().sum())

id            0
text          0
class         0
Unnamed: 3    0
Unnamed: 4    0
Unnamed: 5    0
Unnamed: 6    0
Unnamed: 7    0
dtype: int64


In [7]:
#separating feature and target
X=mh_data.drop(columns='class',axis=1)
Y=mh_data['class']
print(Y)

0       non-suicide
1       non-suicide
2       non-suicide
3       non-suicide
4           suicide
           ...     
4994    non-suicide
4995        suicide
4996    non-suicide
4997        suicide
4998        suicide
Name: class, Length: 4999, dtype: object


In [8]:
print(mh_data['text'])

0       I made a grave mistake I donât remember the ...
1       What series you like. I have watched all my fa...
2       Guys I did it! I lost my virginity but it wasn...
3       This guy like me or no? So, basically I have t...
4       I have no hopeMy ex boyfriend cheated on me an...
                              ...                        
4994    I found the news archive of when I first met m...
4995    I wanted love, I wanted sex. I resuse to live ...
4996    You ever just feel so god damn bored with peop...
4997    Im an adoptee and I have a psychological abyss...
4998    ConcernsI'm concerned that as a result of my s...
Name: text, Length: 4999, dtype: object


In [9]:
#stemming
port_stem=PorterStemmer()
def stemming(content):
    stemmed_content=re.sub('[^a-zA-Z]',' ',content)
    stemmed_content=stemmed_content.lower()
    stemmed_content=stemmed_content.split()
    stemmed_content=[port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content=' '.join(stemmed_content)
    return stemmed_content
mh_data['text']=mh_data['text'].apply(stemming)
print(mh_data['text'])

0       made grave mistak rememb post know someth lgbt...
1       seri like watch favorit seri multipl time wond...
2       guy lost virgin cool thought pp went soft minu...
3       guy like basic friend keep give littl hint lik...
4       hopemi ex boyfriend cheat gave genit herp hard...
                              ...                        
4994    found news archiv first met dad happi surviv t...
4995    want love want sex resus live miss life greate...
4996    ever feel god damn bore peopl talk like talk f...
4997    im adopte psycholog abyss keep fill drugsand f...
4998    concernsi concern result suicid wife get life ...
Name: text, Length: 4999, dtype: object


In [10]:
#X has features and Y has labes
X=mh_data['text'].values
Y=mh_data['class'].values
print(X)
print(Y)

['made grave mistak rememb post know someth lgbt commun made comment tri say someth along line straight like thought gay person whatev hell want came across homophob lost know care whether peopl get exactli word well'
 'seri like watch favorit seri multipl time wonder guy favorit netflix mabi calm mabi watch brand new netflix seri'
 'guy lost virgin cool thought pp went soft minut fuck' ...
 'ever feel god damn bore peopl talk like talk famili peopl xbox parti everi singl day get fuck bore depress never feel lone ya know honestli wait person school abl go outsid talk peopl without health risk everywher look wish peopl talk'
 'im adopte psycholog abyss keep fill drugsand feel disconnect feel one understand feel like belong peopl know wrong guy throw away reject like piec trash'
 'concernsi concern result suicid wife get life insur pay mortgag cover set account die year worth money continu live decid next step']
['non-suicide' 'non-suicide' 'non-suicide' ... 'non-suicide' 'suicide'
 'sui

In [11]:
print(X.shape)

(4999,)


In [12]:
print(Y.shape)

(4999,)


In [13]:
#converting textual data to numberical data
vectorizer=TfidfVectorizer()
vectorizer.fit(X)
X=vectorizer.transform(X)
print(X)

  (0, 12844)	0.1713405031480313
  (0, 12701)	0.19698814321810518
  (0, 12680)	0.16447898999780097
  (0, 12652)	0.12964274317463229
  (0, 12544)	0.07480519461328355
  (0, 11811)	0.0958454781529689
  (0, 11497)	0.11159907399719074
  (0, 10900)	0.19017143341871512
  (0, 10541)	0.2255195649519707
  (0, 9880)	0.10461945155746458
  (0, 9457)	0.16024980937085592
  (0, 8773)	0.12186842035404871
  (0, 8510)	0.11831795105396829
  (0, 8464)	0.09289403662880018
  (0, 7332)	0.1929422113172612
  (0, 6907)	0.25483521131337517
  (0, 6813)	0.14135133523790713
  (0, 6665)	0.19246591963031698
  (0, 6647)	0.07232615034064688
  (0, 6590)	0.26974259428276237
  (0, 6382)	0.16026095446051006
  (0, 5403)	0.2521699144440911
  (0, 5222)	0.15830205691678254
  (0, 4896)	0.26617909993750505
  (0, 4683)	0.08055508182679312
  :	:
  (4997, 830)	0.13759204208117484
  (4997, 152)	0.33788787154827343
  (4997, 57)	0.29852134261888685
  (4998, 12979)	0.11071541917275768
  (4998, 12877)	0.1913837575211378
  (4998, 12758)	0.

In [14]:
#converting the labels values into numberics
#from sklearn import preprocessing
#label_encoder=preprocessing.LabelEncoder()
#mh_data['class']=label_encoder.fit_transform(mh_data['class'])
#mh_data['class'].unique()


In [15]:
#splitting dataset into training and testing
X_train,X_test,Y_train,Y_test= train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)


In [16]:
#importing libraries to fit the model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [17]:
#training logistic regression model
model=LogisticRegression()
model.fit(X_train,Y_train)

LogisticRegression()

In [20]:
#evaluating accuracy score on training data
X_train_prediction=model.predict(X_train)
training_data_accuracy=accuracy_score(X_train_prediction,Y_train)

In [21]:
print("Accuracy score of the training data ",training_data_accuracy)

Accuracy score of the training data  0.9549887471867967


In [24]:
# evaluating accuarcy score on the testing data
X_test_prediction=model.predict(X_test)
testing_data_accuracy=accuracy_score(X_test_prediction,Y_test)

In [25]:
print("Accuracy score of the testing data ",testing_data_accuracy)

Accuracy score of the testing data  0.918


In [31]:
#Making a predictive system
X_new=X_test[55]
prediction=model.predict(X_new)
print(prediction)
print(Y_test[55])

['suicide']
suicide
