In [99]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
#printing stopwords
nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [100]:
#data preprocessing
mh_data=pd.read_csv('stemmed_data.csv',encoding='latin1')
#printing first five lines
print(mh_data.head())

                                                text        class
0  ex wife threaten suiciderec left wife good che...      suicide
1  weird get affect compliment come someon know i...  non-suicide
2  final almost never hear bad year ever swear fu...  non-suicide
3                     need helpjust help im cri hard      suicide
4  losthello name adam struggl year afraid past y...      suicide


In [101]:
#checking for missing values
print(mh_data.isnull().sum())

text     0
class    0
dtype: int64


In [102]:
#separating feature and target
X=mh_data.drop(columns='class',axis=1)
Y=mh_data['class']
print(X)
print(Y)

                                                     text
0       ex wife threaten suiciderec left wife good che...
1       weird get affect compliment come someon know i...
2       final almost never hear bad year ever swear fu...
3                          need helpjust help im cri hard
4       losthello name adam struggl year afraid past y...
...                                                   ...
231933  like rock go get anyth go http musictast space...
231934  tell mani friend lone everyth depriv pre bough...
231935  pee probabl tast like salti tea someon drank p...
231936  usual stuff find herei post sympathi piti know...
231937  still beaten first boss hollow knight fought t...

[231938 rows x 1 columns]
0             suicide
1         non-suicide
2         non-suicide
3             suicide
4             suicide
             ...     
231933    non-suicide
231934    non-suicide
231935    non-suicide
231936        suicide
231937    non-suicide
Name: class, Length: 231938, dtype:

In [103]:
#stemming
port_stem=PorterStemmer()
def stemming(text):
    text=text.lower()
    text=re.sub('\[.*?\]', '',text)
    text=re.sub("\\W"," ",text)
    text=re.sub('https?://\S+|www\.\S+','',text)
    text=re.sub('<.*?>+','',text)
    text=re.sub('[%s]'% re.escape(string.punctuation),'',text)
    text=re.sub('\n','',text)
    text=re.sub('\w*\d\w*','',text)
    text=text.split()
    text=[port_stem.stem(word) for word in text if not word in stopwords.words('english')]
    text=' '.join(text)
    return text

In [104]:
#X has features and Y has labes
X=mh_data['text'].values
Y=mh_data['class'].values
print(X)
print(Y)

['ex wife threaten suiciderec left wife good cheat twice lie much decid refus go back day ago began threaten suicid tirelessli spent paat day talk keep hesit want believ come back know lot peopl threaten order get way happen realli suppos handl death hand still love wife cannot deal get cheat constantli feel insecur worri today may day hope much happen'
 'weird get affect compliment come someon know irl feel realli good internet stranger'
 'final almost never hear bad year ever swear fuck god annoy' ...
 'pee probabl tast like salti tea someon drank pee confirm'
 'usual stuff find herei post sympathi piti know far wors situat mine want get stuff seem life point everyth done life ruin quit isol everyon even famili even like tell famili would help consid psychot probabl right know sens fuck univers want think seem like univers fuck made know made fuck think know want get peopl tri help went rough patch got tough done tough life tough look around famili sinc youngest seen ridicul shit hap

In [105]:
print(X.shape)

(231938,)


In [106]:
print(Y.shape)

(231938,)


In [107]:
np.where(pd.isnull(mh_data))

(array([], dtype=int64), array([], dtype=int64))

In [108]:
#converting textual data to numberical data
vectorizer=TfidfVectorizer()
vectorizer.fit(X)
X=vectorizer.transform(X)
print(X)

  (0, 112398)	0.09895357395150674
  (0, 111269)	0.3716087958578045
  (0, 109793)	0.06552396841808322
  (0, 109432)	0.045256200696607865
  (0, 103926)	0.12396649741038204
  (0, 101436)	0.08153377262540994
  (0, 101061)	0.22625045269728053
  (0, 100284)	0.39245214292095365
  (0, 97607)	0.06395914856574722
  (0, 96568)	0.10202240543022045
  (0, 96100)	0.28479456905430106
  (0, 95859)	0.06344725210565381
  (0, 94335)	0.06894862099427926
  (0, 93082)	0.10903799861083817
  (0, 83040)	0.12362481441177676
  (0, 82275)	0.057880141832593934
  (0, 75355)	0.05621625270909069
  (0, 73809)	0.28479456905430106
  (0, 72499)	0.12194171017770329
  (0, 66241)	0.12404239882177273
  (0, 62130)	0.09880886360792734
  (0, 60187)	0.067922179848044
  (0, 60107)	0.07861329404309986
  (0, 58105)	0.10183020707510082
  (0, 57509)	0.08349171034752481
  :	:
  (231936, 18375)	0.05229032881931739
  (231936, 18243)	0.06651116273373657
  (231936, 14405)	0.05652634625392117
  (231936, 12204)	0.08825059415073287
  (231936,

In [109]:
#converting the labels values into numberics
from sklearn import preprocessing
label_encoder=preprocessing.LabelEncoder()
mh_data['class']=label_encoder.fit_transform(mh_data['class'])
mh_data['class'].unique()


array([1, 0])

In [110]:
#splitting dataset into training and testing
X_train,X_test,Y_train,Y_test= train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)


In [111]:
#importing libraries to fit the model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [112]:
#training logistic regression model
model=LogisticRegression(solver='lbfgs', max_iter=100000)
model.fit(X_train,Y_train)

LogisticRegression(max_iter=100000)

In [113]:
#evaluating accuracy score on training data
X_train_prediction=model.predict(X_train)
training_data_accuracy=accuracy_score(X_train_prediction,Y_train)

In [114]:
print("Accuracy score of the training data ",training_data_accuracy)

Accuracy score of the training data  0.9408137968202641


In [115]:
# evaluating accuarcy score on the testing data
X_test_prediction=model.predict(X_test)
testing_data_accuracy=accuracy_score(X_test_prediction,Y_test)

In [116]:
print("Accuracy score of the testing data ",testing_data_accuracy)

Accuracy score of the testing data  0.932849012675692


In [117]:
print(type(X))
print(type(Y))

<class 'scipy.sparse.csr.csr_matrix'>
<class 'numpy.ndarray'>


In [120]:
#Making a predictive system
new=X_test[1800]
prediction=model.predict(new)
print(type(X_test))
print(prediction)
print(Y_test[1800])

<class 'scipy.sparse.csr.csr_matrix'>
['suicide']
suicide


# Analysing single input data

In [128]:
import string

In [142]:
X_new=pd.Series(input())
print(type(X_new))
X_new=X_new.apply(stemming)
print(type(X_new))

This is the only place I can talk about thisUsing a throwaway account because I need to get this off of my chest but Iâ€™m scared of speaking out fully as people I know irl know my reddit account.    For the last five years Iâ€™ve been slowly growing more and more sad and upset with life due to my own stupid mistakes.   I have a loving family and sure we donâ€™t always get along but I love them all to pieces.  The stupid mistakes I made haunt me to this day, no one knows about them besides me and the people involved, and that kills me. I have so many passions I want to pursue, like my singing or my art however Iâ€™m terrified that if my face gets out there one day Iâ€™ll wake up to hear everyone I love knows what I did.   I was so so fucking stupid and if I could go back I would have stopped myself. It was only five years ago I found out what I did shouldnâ€™t have happened, that I should have never done it and since then itâ€™s been creeping up on me and eating away at me.   I canâ€™t

In [143]:
print(X_new)

0    place talk thisus throwaway account need get c...
dtype: object


In [144]:
X_new=X_new.values
print(X_new)
type(X_new)

['place talk thisus throwaway account need get chest iâ scare speak fulli peopl know irl know reddit account last five year iâ slowli grow sad upset life due stupid mistak love famili sure donâ alway get along love piec stupid mistak made haunt day one know besid peopl involv kill mani passion want pursu like sing art howev iâ terrifi face get one day iâ wake hear everyon love know fuck stupid could go back would stop five year ago found shouldnâ happen never done sinc itâ creep eat away canâ tell anyon care went see doctor look help iâ scare someon would find iâ scare iâ break iâ scare iâ lose place colleg iâ thought kill mani time even tire take crohn coliti medic hope ill would kill slowli kill none less mother soon found full pill box inject feel like huge weight drop head wasnâ forc stupid child didnâ know get bad naiv itâ haunt rest life howev long thatâ anyway thank read stranger reddit first time iâ ever get say word']


numpy.ndarray

In [145]:
print(X_new.shape)
print(type(X_new))

(1,)
<class 'numpy.ndarray'>


In [146]:
#converting textual data to numberical data
vect=TfidfVectorizer()
vect.fit(X_new)
X_new=vect.transform(X_new)
print(X_new)

  (0, 124)	0.09901475429766744
  (0, 123)	0.14852213144650114
  (0, 122)	0.04950737714883372
  (0, 121)	0.04950737714883372
  (0, 120)	0.04950737714883372
  (0, 119)	0.04950737714883372
  (0, 118)	0.04950737714883372
  (0, 117)	0.04950737714883372
  (0, 116)	0.04950737714883372
  (0, 115)	0.04950737714883372
  (0, 114)	0.09901475429766744
  (0, 113)	0.04950737714883372
  (0, 112)	0.04950737714883372
  (0, 111)	0.04950737714883372
  (0, 110)	0.04950737714883372
  (0, 109)	0.04950737714883372
  (0, 108)	0.04950737714883372
  (0, 107)	0.04950737714883372
  (0, 106)	0.04950737714883372
  (0, 105)	0.04950737714883372
  (0, 104)	0.04950737714883372
  (0, 103)	0.19802950859533489
  (0, 102)	0.04950737714883372
  (0, 101)	0.04950737714883372
  (0, 100)	0.04950737714883372
  :	:
  (0, 24)	0.04950737714883372
  (0, 23)	0.04950737714883372
  (0, 22)	0.09901475429766744
  (0, 21)	0.04950737714883372
  (0, 20)	0.04950737714883372
  (0, 19)	0.04950737714883372
  (0, 18)	0.04950737714883372
  (0, 17)

In [147]:
from scipy.sparse import csr_matrix
X_new=csr_matrix(X_new)

In [148]:
type(X_new)

scipy.sparse.csr.csr_matrix

In [149]:
X_new.shape

(1, 125)

In [150]:
X_test.shape

(46388, 116181)

In [140]:
prediction=model.predict(X_new)
print(prediction)

ValueError: X has 33 features per sample; expecting 116181

In [151]:
csr_matrix.resize(X_new,(1,X.shape[1]))
pred=model.predict(X_new)
print(pred)
print(X_new.shape,X.shape)

['non-suicide']
(1, 116181) (231938, 116181)


In [91]:
X_new="hello its sad"

from scipy.sparse import csr_matrix

X_new=stemming(X_new)
print(X_new)
print(type(X_new))

X_new=np.array(X_new).reshape(-1, 1)
print(type(X_new))

prediction=model.predict(X_new)
prediction

hello sad
<class 'str'>
<class 'numpy.ndarray'>


  return f(**kwargs)


ValueError: X has 1 features per sample; expecting 116181