In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
submission = pd.read_csv('SampleSubmission(1).csv')

In [3]:
train.head()

Unnamed: 0,ID,text,label
0,SUAVK39Z,I feel that it was better I dieAm happy,Depression
1,9JDAGUV3,Why do I get hallucinations?,Drugs
2,419WR1LQ,I am stresseed due to lack of financial suppor...,Depression
3,6UY7DX6Q,Why is life important?,Suicide
4,FYC0FTFB,How could I be helped to go through the depres...,Depression


In [4]:
train.shape

(616, 3)

In [5]:
#Total number of observations from each category
train.label.value_counts()

Depression    352
Alcohol       140
Suicide        66
Drugs          58
Name: label, dtype: int64

In [6]:
#Percentage number of observations from each category 
train.label.value_counts(normalize = True)
#This revaels that the data is imbalance

Depression    0.571429
Alcohol       0.227273
Suicide       0.107143
Drugs         0.094156
Name: label, dtype: float64

In [7]:
train.head()

Unnamed: 0,ID,text,label
0,SUAVK39Z,I feel that it was better I dieAm happy,Depression
1,9JDAGUV3,Why do I get hallucinations?,Drugs
2,419WR1LQ,I am stresseed due to lack of financial suppor...,Depression
3,6UY7DX6Q,Why is life important?,Suicide
4,FYC0FTFB,How could I be helped to go through the depres...,Depression


In [8]:
#Collecting 58 samples from each label (undersampling)
depression = train[train.label == "Depression"].sample(frac=1)[:58]
alcohol = train[train.label == "Alcohol"].sample(frac=1)[:58]
suicide = train[train.label == "Suicide"].sample(frac=1)[:58]
drugs = train[train.label == "Drugs"].sample(frac=1)[:58]

In [9]:
#concatenating and randomising the samples from each label 
data = pd.concat([depression,alcohol,suicide,drugs])
data = data.sample(frac = 1).reset_index(drop = True)
data.head()


Unnamed: 0,ID,text,label
0,W1OOD0X1,How better would I quit?,Drugs
1,6SZ3EXJ3,I did not ask for any assistance at first but ...,Suicide
2,KWNGKGHQ,I feel like life does not make sense,Depression
3,24NNBTL7,How to deal with hallucinations?,Drugs
4,WLEWY07N,Effects of alcohol in my life?,Alcohol


In [10]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [11]:
text_clf = Pipeline([('vect', TfidfVectorizer()),
                    ('clf', MultinomialNB())])

In [12]:
text_clf.fit(data.text, data.label)

Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [13]:
text_clf.score(data.text, data.label)

0.9741379310344828

In [14]:
y_pred = text_clf.predict(data.text)

In [15]:
from sklearn.metrics import classification_report
print(classification_report(data.label, y_pred))

              precision    recall  f1-score   support

     Alcohol       0.95      0.98      0.97        58
  Depression       0.98      0.97      0.97        58
       Drugs       0.98      0.95      0.96        58
     Suicide       0.98      1.00      0.99        58

    accuracy                           0.97       232
   macro avg       0.97      0.97      0.97       232
weighted avg       0.97      0.97      0.97       232



In [16]:
#Predicting the probability of occurence for each observation
text_clf.predict_proba(data.text)[:5]

array([[0.25614116, 0.18520514, 0.40248537, 0.15616833],
       [0.1087802 , 0.2081896 , 0.1611055 , 0.5219247 ],
       [0.09699382, 0.51224129, 0.17716149, 0.2136034 ],
       [0.19070157, 0.17556284, 0.40783832, 0.22589727],
       [0.49556307, 0.16164521, 0.13102559, 0.21176613]])

In [17]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(data.label,y_pred)
print(cm)

[[57  1  0  0]
 [ 0 56  1  1]
 [ 3  0 55  0]
 [ 0  0  0 58]]


In [18]:
#test data importation
test.head()

Unnamed: 0,ID,text
0,02V56KMO,How to overcome bad feelings and emotions
1,03BMGTOK,I feel like giving up in life
2,03LZVFM6,I was so depressed feel like got no strength t...
3,0EPULUM5,I feel so low especially since I had no one to...
4,0GM4C5GD,can i be successful when I am a drug addict?


In [19]:
predicted = text_clf.predict_proba(test.text)
predicted

array([[0.34416743, 0.22991213, 0.16403094, 0.2618895 ],
       [0.09260478, 0.58089145, 0.10194802, 0.22455576],
       [0.08419933, 0.66600402, 0.0868229 , 0.16297375],
       ...,
       [0.24182542, 0.16690285, 0.18612722, 0.40514451],
       [0.20997628, 0.11868915, 0.5136894 , 0.15764517],
       [0.51836926, 0.10689542, 0.15930974, 0.21542557]])

In [20]:
submission.head()

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0,0,0,0
1,03BMGTOK,0,0,0,0
2,03LZVFM6,0,0,0,0
3,0EPULUM5,0,0,0,0
4,0GM4C5GD,0,0,0,0


In [21]:
submission['Depression'] = predicted[:,0]
submission['Alcohol'] = predicted[:,1]
submission['Suicide'] = predicted[:,2]
submission['Drugs'] = predicted[:,3]

In [22]:
submission.head(2)

Unnamed: 0,ID,Depression,Alcohol,Suicide,Drugs
0,02V56KMO,0.344167,0.229912,0.164031,0.261889
1,03BMGTOK,0.092605,0.580891,0.101948,0.224556


In [23]:
submission.to_csv('submission.csv', index=False)