In [1]:
import pandas as pd
d  = pd.read_csv("new_DF_withreddit.csv")

In [2]:
d.groupby(["source","label",'illness']).size()

source  label  illness
m       1      ptsd         24
        2      ocd         322
        3      anx        1865
        4      bd         2333
        5      dep        3135
r       1      ptsd        420
        2      ocd         736
        3      anx        1251
        4      bd         2832
        5      dep         471
dtype: int64

In [3]:
type(d.iloc[0,3])

numpy.int64

In [4]:
d.iloc[0,3]

1

In [5]:
d.columns

Index(['source', 'postID', 'illness', 'label', 'wpp', 'Ipp', 'PRON', 'AUX',
       'VERB', 'ADP', 'CCONJ', 'NOUN', 'ADJ', 'DET', 'ADV', 'PART', 'SCONJ',
       'PROPN', 'NUM', 'INTJ', 'X', 'PUNCT', 'SYM', 'compound', 'neg', 'neu',
       'pos'],
      dtype='object')

# Collapse Classes

We are collapsing classes because they are imbalanced. Classes 1, 2, 3 will be combined because they are smaller and the prediction accuracy for them has been consistently low 

In [16]:
d['label'] = d["label"].map({1:3,2:3,3:3,4:4,5:5})

In [17]:
d.groupby(["source","label",'illness']).size()

source  label  illness
m       3      anx        1865
               ocd         322
               ptsd         24
        4      bd         2333
        5      dep        3135
r       3      anx        1251
               ocd         736
               ptsd        420
        4      bd         2832
        5      dep         471
dtype: int64

In [19]:
d.groupby(["label"]).size()

label
3    4618
4    5165
5    3606
dtype: int64

# Random Forest

In [8]:
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


### scramble

In [9]:
d = d.sample(frac=1).reset_index(drop=True)


## Random forest; 70/30 split; 

In [10]:
d_word0_data = d[d.columns[5:]].as_matrix()

  """Entry point for launching an IPython kernel.


In [11]:
features = d_word0_data
labels = np.array(d["label"])

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.3)

In [12]:
rf = RandomForestClassifier(n_estimators=1000,random_state=0, 
                n_jobs=-1,class_weight = "balanced",max_depth = 16)
rf.fit(train_features, train_labels)
predictions = rf.predict(test_features)
# predictions
errors = []
for x,pre in enumerate(predictions):
    if pre != test_labels[x]:
        errors.append((pre,test_labels[x],x))
# print(len(errors))
# print(len(test_labels))
print("Accuracy:",(len(test_labels) - len(errors))/len(test_labels)*100,"%")


Accuracy: 46.2285287528006 %


In [13]:
from sklearn import metrics
print(metrics.classification_report(predictions, test_labels))

              precision    recall  f1-score   support

           1       0.04      0.15      0.06        34
           2       0.02      0.38      0.03        13
           3       0.17      0.38      0.23       397
           4       0.60      0.53      0.56      1798
           5       0.69      0.42      0.53      1775

   micro avg       0.46      0.46      0.46      4017
   macro avg       0.30      0.37      0.28      4017
weighted avg       0.59      0.46      0.51      4017



### cross validate

In [14]:
#perform four-fold cross-validation of the method
def crossvalidate(classifier,X,T,CV=4):
    from sklearn.model_selection import cross_val_score
    scores = cross_val_score(classifier, X, T, cv=CV)
    print('--- 4-fold cross-validation accuracy: %%%.1f (+/-%.1f)' % (scores.mean()*100,scores.std()*100))

In [15]:
rf = RandomForestClassifier(n_estimators=1000,random_state=0, n_jobs=-1)
crossvalidate(rf,features,labels,CV=5)

--- 4-fold cross-validation accuracy: %47.6 (+/-0.5)


## Random forest; 70/30 split; min 20 words

In [16]:
mask_word20 = (d["wpp"] >= 20)
# mask_word20

In [17]:
d_word20 = d[mask_word20]

In [18]:
#isolate data
d_word20_data = d_word20[d.columns[5:]].as_matrix()

  


In [19]:
features = d_word20_data
labels = np.array(d_word20["label"])

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.3)

In [20]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000,random_state=0, n_jobs=-1,class_weight = "balanced",max_depth = 16)
rf.fit(train_features, train_labels)
predictions = rf.predict(test_features)
# predictions
errors = []
for x,pre in enumerate(predictions):
    if pre != test_labels[x]:
        errors.append((pre,test_labels[x],x))
# print(len(errors))
# print(len(test_labels))
print("Accuracy:",(len(test_labels) - len(errors))/len(test_labels)*100,"%")


Accuracy: 49.59432048681541 %


In [21]:
from sklearn import metrics
print(metrics.classification_report(predictions, test_labels))
# from sklearn.metrics import confusion_matrix
# import seaborn as sns; sns.set()
# import matplotlib.pyplot as plt
# mat = confusion_matrix(test_labels, predictions)
# sns.heatmap(mat.T, square = True, annot = True, fmt ='d', cbar = True)
# plt.xlabel('true label')
# plt.ylabel('predicted label')

              precision    recall  f1-score   support

           3       0.36      0.53      0.43       945
           4       0.58      0.52      0.55      1695
           5       0.55      0.44      0.49      1304

   micro avg       0.50      0.50      0.50      3944
   macro avg       0.50      0.50      0.49      3944
weighted avg       0.52      0.50      0.50      3944



### cross validate

In [22]:
crossvalidate(rf,features,labels,CV=5)

--- 4-fold cross-validation accuracy: %50.1 (+/-0.9)


## Random Forest; 70/30 split; min 30 words

In [23]:
mask_word30 = (d["wpp"] >= 30)
# mask_word20
d_word30 = d[mask_word30]
d_word30_data = d_word30[d.columns[5:]].as_matrix()

features = d_word30_data
labels = np.array(d_word30["label"])

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.3)

rf = RandomForestClassifier(n_estimators=1000,random_state=0, n_jobs=-1,class_weight = "balanced",max_depth = 16)
rf.fit(train_features, train_labels)
predictions = rf.predict(test_features)
# predictions
errors = []
for x,pre in enumerate(predictions):
    if pre != test_labels[x]:
        errors.append((pre,test_labels[x],x))
# print(len(errors))
# print(len(test_labels))
print("Accuracy:",(len(test_labels) - len(errors))/len(test_labels)*100,"%")

print(metrics.classification_report(predictions, test_labels))

  after removing the cwd from sys.path.


Accuracy: 50.609914352452634 %
              precision    recall  f1-score   support

           3       0.38      0.53      0.44       947
           4       0.57      0.55      0.56      1596
           5       0.57      0.44      0.49      1310

   micro avg       0.51      0.51      0.51      3853
   macro avg       0.51      0.51      0.50      3853
weighted avg       0.52      0.51      0.51      3853



### cross validate

In [24]:
crossvalidate(rf,features,labels,CV=5)

--- 4-fold cross-validation accuracy: %50.2 (+/-0.7)


# SVM  
https://medium.com/all-things-ai/in-depth-parameter-tuning-for-svc-758215394769

In [25]:
from sklearn import svm
from sklearn.svm import SVC 


## SVM; 70/30 split; min 20 words

In [26]:
mask_word20 = (d["wpp"] >= 20)
d_word20 = d[mask_word20]
d_word20_data = d_word20[d.columns[5:]].as_matrix()

features = d_word20_data
labels = np.array(d_word20["label"])

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.3,random_state = 0)

  This is separate from the ipykernel package so we can avoid doing imports until


### kernel = linear

In [27]:
from sklearn.metrics import confusion_matrix
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(train_features, train_labels) 
svm_predictions = svm_model_linear.predict(test_features) 
  
# model accuracy for X_test   
accuracy = svm_model_linear.score(test_features, test_labels) 
  
# creating a confusion matrix 
cm = confusion_matrix(test_labels, svm_predictions) 

In [28]:
print(accuracy)
print(cm)

0.47439148073022314
[[440 376 561]
 [321 618 570]
 [134 111 813]]


#### cross validate

In [29]:
crossvalidate(svm_model_linear,features,labels,CV=5)

--- 4-fold cross-validation accuracy: %46.9 (+/-0.2)


### kernel = rbf

In [30]:
svm_model_rbf = SVC(kernel = 'rbf', C = 1).fit(train_features, train_labels) 
svm_predictions = svm_model_rbf.predict(test_features) 
  
# model accuracy for X_test   
accuracy = svm_model_rbf.score(test_features, test_labels) 
  
# creating a confusion matrix 
cm = confusion_matrix(test_labels, svm_predictions) 
print(accuracy)
print(cm)



0.4287525354969574
[[762 582  33]
 [586 868  55]
 [487 510  61]]


#### cross validate

In [31]:
crossvalidate(svm_model_rbf,features,labels,CV=5)



--- 4-fold cross-validation accuracy: %43.9 (+/-1.0)


### kernel = poly

In [32]:
svm_model_poly = SVC(kernel = 'poly', C = 1,gamma="scale").fit(train_features, train_labels) 
svm_predictions = svm_model_poly.predict(test_features) 
  
# model accuracy for X_test   
accuracy = svm_model_poly.score(test_features, test_labels) 
  
# creating a confusion matrix 
cm = confusion_matrix(test_labels, svm_predictions) 
print(accuracy)
print(cm)

0.47718052738336714
[[431 529 417]
 [262 827 420]
 [172 262 624]]


#### cross validate

In [33]:
crossvalidate(svm_model_poly,features,labels,CV=5)

--- 4-fold cross-validation accuracy: %48.3 (+/-0.9)
