# Classification using word embeddings 
In this notebook I use the same Mental Health dataset to explore how word embeddings can help with classification tasks on text data.

In [37]:
import pandas as pd 
import numpy as np 
import spacy 

In [38]:
# Read the combined mental-health statements dataset (path is relative to the notebook).
raw_csv_path = "../data/mental_health/Combined Data.csv"
df_mh = pd.read_csv(filepath_or_buffer=raw_csv_path)

# Bare last expression so the notebook renders the loaded frame.
df_mh

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety
...,...,...,...
53038,53038,Nobody takes me seriously I’ve (24M) dealt wit...,Anxiety
53039,53039,"selfishness ""I don't feel very good, it's lik...",Anxiety
53040,53040,Is there any way to sleep better? I can't slee...,Anxiety
53041,53041,"Public speaking tips? Hi, all. I have to give ...",Anxiety


In [39]:
# # Install the large English model used to build the word embeddings with spaCy
# !python -m spacy download en_core_web_lg

In [40]:
# Load the large English spaCy pipeline; it is used later to turn each
# statement into a vector.
nlp = spacy.load("en_core_web_lg")

# Clean the frame without in-place mutation so this cell can be re-run safely:
#   * drop rows containing any missing value,
#   * drop the leftover 'Unnamed: 0' column (errors='ignore' makes a second
#     run a no-op instead of a KeyError once the column is already gone),
#   * lower-case every statement.
df_mh = (
    df_mh
    .dropna()
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .assign(statement=lambda frame: frame['statement'].str.lower())
)

In [41]:
# Display the frame to check its current columns and the lower-cased statements.
df_mh

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"all wrong, back off dear, forward doubt. stay ...",Anxiety
3,i've shifted my focus to something else but i'...,Anxiety
4,"i'm restless and restless, it's been a month n...",Anxiety
...,...,...
53038,nobody takes me seriously i’ve (24m) dealt wit...,Anxiety
53039,"selfishness ""i don't feel very good, it's lik...",Anxiety
53040,is there any way to sleep better? i can't slee...,Anxiety
53041,"public speaking tips? hi, all. i have to give ...",Anxiety


In [42]:
# Embed every statement with spaCy. `nlp.pipe` streams the texts through the
# pipeline in batches, which is typically far more efficient than calling
# `nlp(text)` once per row; the resulting `Doc.vector` values are the same.
statement_docs = nlp.pipe(df_mh['statement'], batch_size=256)

# Build an object Series aligned on df_mh's index (the index may have gaps
# after rows were dropped) so that each cell holds one embedding array.
df_mh['vector'] = pd.Series(
    [doc.vector for doc in statement_docs],
    index=df_mh.index,
    dtype=object,
)
df_mh

Unnamed: 0,statement,status,vector
0,oh my gosh,Anxiety,"[-0.25945, 0.13983373, -0.49716333, -0.0200033..."
1,"trouble sleeping, confused mind, restless hear...",Anxiety,"[0.071175426, 0.28115746, -0.18763639, -0.1502..."
2,"all wrong, back off dear, forward doubt. stay ...",Anxiety,"[0.08828204, 0.11978752, -0.236601, -0.0531898..."
3,i've shifted my focus to something else but i'...,Anxiety,"[0.055449076, 0.2068899, -0.2928056, -0.088440..."
4,"i'm restless and restless, it's been a month n...",Anxiety,"[-0.017599745, 0.23834197, -0.2164838, -0.1466..."
...,...,...,...
53038,nobody takes me seriously i’ve (24m) dealt wit...,Anxiety,"[-0.036094554, 0.2158964, -0.2152108, -0.05165..."
53039,"selfishness ""i don't feel very good, it's lik...",Anxiety,"[-0.05780601, 0.24437937, -0.26088294, -0.1082..."
53040,is there any way to sleep better? i can't slee...,Anxiety,"[-0.0100243185, 0.23560496, -0.23800392, -0.12..."
53041,"public speaking tips? hi, all. i have to give ...",Anxiety,"[-0.034472004, 0.21387953, -0.18153135, -0.046..."


In [43]:
# Persist the frame with its embeddings next to the raw data.
# NOTE(review): CSV stores each `vector` cell as its text representation and
# also writes the index as an extra column by default - confirm that whatever
# reads this file back can parse both.
embeddings_csv_path = "../data/mental_health/embeddings_mental_health_data.csv"
df_mh.to_csv(path_or_buf=embeddings_csv_path)

# Continue on an independent copy so later column additions leave df_mh untouched.
df_mh_emb = df_mh.copy(deep=True)

In [84]:
# Distinct status labels, in order of first appearance in the data.
emotions = df_mh_emb['status'].unique().tolist()

# Integer code for each label: its position in `emotions`.
target = dict(zip(emotions, range(len(emotions))))
target

{'Anxiety': 0,
 'Normal': 1,
 'Depression': 2,
 'Suicidal': 3,
 'Stress': 4,
 'Bipolar': 5,
 'Personality disorder': 6}

In [85]:
# Class distribution: number of statements per status label (most frequent first).
df_mh_emb.status.value_counts()

status
Normal                  16343
Depression              15404
Suicidal                10652
Anxiety                  3841
Bipolar                  2777
Stress                   2587
Personality disorder     1077
Name: count, dtype: int64

In [None]:
# Translate each textual status into its integer code from `target`
# and store it as the numeric label column.
status_codes = df_mh_emb['status'].map(target)
df_mh_emb['cat_num'] = status_codes
df_mh_emb

Unnamed: 0,statement,status,vector,cat_num
0,oh my gosh,Anxiety,"[-0.25945, 0.13983373, -0.49716333, -0.0200033...",0
1,"trouble sleeping, confused mind, restless hear...",Anxiety,"[0.071175426, 0.28115746, -0.18763639, -0.1502...",0
2,"all wrong, back off dear, forward doubt. stay ...",Anxiety,"[0.08828204, 0.11978752, -0.236601, -0.0531898...",0
3,i've shifted my focus to something else but i'...,Anxiety,"[0.055449076, 0.2068899, -0.2928056, -0.088440...",0
4,"i'm restless and restless, it's been a month n...",Anxiety,"[-0.017599745, 0.23834197, -0.2164838, -0.1466...",0
...,...,...,...,...
53038,nobody takes me seriously i’ve (24m) dealt wit...,Anxiety,"[-0.036094554, 0.2158964, -0.2152108, -0.05165...",0
53039,"selfishness ""i don't feel very good, it's lik...",Anxiety,"[-0.05780601, 0.24437937, -0.26088294, -0.1082...",0
53040,is there any way to sleep better? i can't slee...,Anxiety,"[-0.0100243185, 0.23560496, -0.23800392, -0.12...",0
53041,"public speaking tips? hi, all. i have to give ...",Anxiety,"[-0.034472004, 0.21387953, -0.18153135, -0.046...",0


In [88]:
# Balance the classes by random undersampling: every class is reduced to the
# size of the smallest one. The size is derived from the data (1077 for the
# current dataset) instead of being hard-coded, so the cell keeps working if
# the class counts change.
samples_per_class = int(df_mh_emb['status'].value_counts().min())

# One fixed-seed random sample per label, in the order given by `emotions`.
undersamp = [
    df_mh_emb[df_mh_emb['status'] == cat].sample(n=samples_per_class, random_state=101)
    for cat in emotions
]

balnced_df = pd.concat(undersamp, axis=0)
print(balnced_df.status.value_counts())
print(balnced_df)

status
Anxiety                 1077
Normal                  1077
Depression              1077
Suicidal                1077
Stress                  1077
Bipolar                 1077
Personality disorder    1077
Name: count, dtype: int64
                                               statement  \
35643  seeking post-cancer anxiety advice i had cance...   
52942  is it normal for an ssri to make you feel like...   
52241  today i was calm and collected hi fellas, so i...   
52384  what actually helped your sudden panic attacks...   
35005  used tap water to rinse sinuses, extremely anx...   
...                                                  ...   
51288  hikikomori condition have any of you become hi...   
51739  anyone here in nyc i'm a black male an have oc...   
50737  i don't know if i truly deserve to get better....   
51320  podcasts for avoidant people? i'm trying to fi...   
51615  who else here is limerant? just wanting to get...   

                     status  \
35643       

In [94]:
from sklearn.model_selection import train_test_split

# Features are the per-statement embedding vectors; labels are the integer class codes.
features = balnced_df['vector']
labels = balnced_df['cat_num']

# Hold out 20% for testing, stratified on the label so both splits keep the
# balanced class proportions; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=101,
    stratify=labels,
)

In [95]:
# Each split is a collection of equal-length vectors; stack them so every
# split becomes a single array with one row per statement.
X_train, X_test = (np.stack(split) for split in (X_train, X_test))

In [96]:
# Sanity check: shapes of the stacked train and test matrices.
print(X_train.shape, X_test.shape)

(6031, 300) (1508, 300)


In [97]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

# The embeddings contain negative components, which MultinomialNB does not
# accept, so the features are rescaled first. The scaler is fitted on the
# training split only and then applied to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train the multinomial Naive Bayes classifier on the rescaled training data.
clf_nb = MultinomialNB().fit(X_train, y_train)
clf_nb

In [98]:
# Import from the public `sklearn.metrics` namespace; the underscore-prefixed
# `sklearn.metrics._classification` module is private and may move or change
# between scikit-learn releases.
from sklearn.metrics import classification_report

# Predict the class code of every held-out statement.
y_pred = clf_nb.predict(X_test)

# Per-class precision / recall / F1; the row labels are the integer class codes.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.52      0.57       215
           1       0.76      0.53      0.62       216
           2       0.31      0.20      0.25       215
           3       0.37      0.62      0.46       216
           4       0.31      0.40      0.35       215
           5       0.51      0.40      0.45       215
           6       0.41      0.44      0.42       216

    accuracy                           0.44      1508
   macro avg       0.47      0.44      0.45      1508
weighted avg       0.47      0.44      0.45      1508

