In [32]:
import pandas as pd
d  = pd.read_csv("new_DF_withreddit.csv")

In [33]:
d.groupby(["source","label",'illness']).size()

source  label  illness
m       1      ptsd         24
        2      ocd         322
        3      anx        1865
        4      bd         2333
        5      dep        3135
r       1      ptsd        420
        2      ocd         736
        3      anx        1251
        4      bd         2832
        5      dep         471
dtype: int64

In [34]:
d.groupby(["label"]).size()

label
1     444
2    1058
3    3116
4    5165
5    3606
dtype: int64

In [35]:
d.columns

Index(['source', 'postID', 'illness', 'label', 'wpp', 'Ipp', 'PRON', 'AUX',
       'VERB', 'ADP', 'CCONJ', 'NOUN', 'ADJ', 'DET', 'ADV', 'PART', 'SCONJ',
       'PROPN', 'NUM', 'INTJ', 'X', 'PUNCT', 'SYM', 'compound', 'neg', 'neu',
       'pos'],
      dtype='object')

In [5]:
#scramble
d = d.sample(frac=1).reset_index(drop=True)


# upsample

In [36]:
from sklearn.utils import resample

In [37]:
# upsample class 1
d_minority = d[d.label==1]
d_minority_upsampled = resample(d_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=576,    # to match majority class
                                 random_state=123)

d = pd.concat([d, d_minority_upsampled])
d.groupby(["label"]).size()

label
1    1020
2    1058
3    3116
4    5165
5    3606
dtype: int64

In [38]:
# upsample class 2
d_minority = d[d.label==2]
d_minority_upsampled = resample(d_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=576,    # to match majority class
                                 random_state=123)

d = pd.concat([d, d_minority_upsampled])
d.groupby(["label"]).size()

label
1    1020
2    1634
3    3116
4    5165
5    3606
dtype: int64

# down and up sample

In [9]:
from collections import defaultdict
dfdict = defaultdict()
for i in range(1,6):
    d_minority = d[d.label==i]
    d_minority_upsampled = resample(d_minority, 
                                     replace=True,     # sample with replacement
                                     n_samples=1500,    # to match majority class
                                     random_state=123)
    dfdict[i] =  d_minority_upsampled
d = pd.concat([dfdict[1],dfdict[2],dfdict[3],dfdict[4],dfdict[5]])
d.groupby(["label"]).size()

label
1    1500
2    1500
3    1500
4    1500
5    1500
dtype: int64

# Random Forest

In [10]:
def crossvalidate(classifier,X,T,CV=4):
    from sklearn.model_selection import cross_val_score
    scores = cross_val_score(classifier, X, T, cv=CV)
    print('--- 4-fold cross-validation accuracy: %%%.1f (+/-%.1f)' % (scores.mean()*100,scores.std()*100))

In [11]:
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


In [12]:
#scramble
d = d.sample(frac=1).reset_index(drop=True)

## Random forest; 70/30 split; 

In [13]:
d_word0_data = d[d.columns[5:]].as_matrix()

features = d_word0_data
labels = np.array(d["label"])

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.3)

rf = RandomForestClassifier(n_estimators=1000,random_state=0, n_jobs=-1)
rf.fit(train_features, train_labels)
predictions = rf.predict(test_features)
# predictions
errors = []
for x,pre in enumerate(predictions):
    if pre != test_labels[x]:
        errors.append((pre,test_labels[x],x))
# print(len(errors))
# print(len(test_labels))
print("Accuracy:",(len(test_labels) - len(errors))/len(test_labels)*100,"%")


  """Entry point for launching an IPython kernel.


Accuracy: 65.95555555555556 %


In [14]:
from sklearn import metrics
print(metrics.classification_report(predictions, test_labels))

              precision    recall  f1-score   support

           1       0.94      0.79      0.86       555
           2       0.75      0.72      0.73       465
           3       0.41      0.59      0.48       300
           4       0.46      0.62      0.53       331
           5       0.73      0.54      0.62       599

   micro avg       0.66      0.66      0.66      2250
   macro avg       0.66      0.65      0.65      2250
weighted avg       0.70      0.66      0.67      2250



## Random forest; 70/30 split; min 20 words

In [15]:
mask_word20 = (d["wpp"] >= 20)
# mask_word20

d_word20 = d[mask_word20]

#isolate data
d_word20_data = d_word20[d.columns[5:]].as_matrix()

features = d_word20_data
labels = np.array(d_word20["label"])

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.3)

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000,random_state=0, n_jobs=-1)
rf.fit(train_features, train_labels)
predictions = rf.predict(test_features)
# predictions
errors = []
for x,pre in enumerate(predictions):
    if pre != test_labels[x]:
        errors.append((pre,test_labels[x],x))
# print(len(errors))
# print(len(test_labels))
print("Accuracy:",(len(test_labels) - len(errors))/len(test_labels)*100,"%")


  import sys


Accuracy: 63.4927797833935 %


In [16]:
from sklearn import metrics
print(metrics.classification_report(predictions, test_labels))
# from sklearn.metrics import confusion_matrix
# import seaborn as sns; sns.set()
# import matplotlib.pyplot as plt
# mat = confusion_matrix(test_labels, predictions)
# sns.heatmap(mat.T, square = True, annot = True, fmt ='d', cbar = True)
# plt.xlabel('true label')
# plt.ylabel('predicted label')

              precision    recall  f1-score   support

           1       0.92      0.75      0.83       511
           2       0.73      0.75      0.74       452
           3       0.35      0.62      0.45       277
           4       0.45      0.54      0.50       344
           5       0.75      0.52      0.61       632

   micro avg       0.63      0.63      0.63      2216
   macro avg       0.64      0.64      0.62      2216
weighted avg       0.69      0.63      0.65      2216



## Random Forest; 70/30 split; min 30 words

In [17]:
mask_word30 = (d["wpp"] >= 30)
# mask_word20
d_word30 = d[mask_word30]
d_word30_data = d_word30[d.columns[5:]].as_matrix()

features = d_word30_data
labels = np.array(d_word30["label"])

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.3)

rf = RandomForestClassifier(n_estimators=1000,random_state=0, n_jobs=-1,class_weight = "balanced")
rf.fit(train_features, train_labels)
predictions = rf.predict(test_features)
# predictions
errors = []
for x,pre in enumerate(predictions):
    if pre != test_labels[x]:
        errors.append((pre,test_labels[x],x))
# print(len(errors))
# print(len(test_labels))
print("Accuracy:",(len(test_labels) - len(errors))/len(test_labels)*100,"%")

print(metrics.classification_report(predictions, test_labels))

  after removing the cwd from sys.path.


Accuracy: 66.4819944598338 %
              precision    recall  f1-score   support

           1       0.96      0.76      0.85       544
           2       0.70      0.73      0.71       411
           3       0.47      0.66      0.55       311
           4       0.43      0.69      0.53       291
           5       0.78      0.53      0.63       609

   micro avg       0.66      0.66      0.66      2166
   macro avg       0.67      0.67      0.65      2166
weighted avg       0.72      0.66      0.68      2166



In [18]:
crossvalidate(rf,features,labels,CV=5)

--- 4-fold cross-validation accuracy: %67.8 (+/-0.6)


In [23]:
sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), d.columns), 
             reverse=True)


[(0.0533, 'illness'),
 (0.0531, 'DET'),
 (0.0529, 'ADJ'),
 (0.0509, 'ADP'),
 (0.0505, 'NUM'),
 (0.0504, 'label'),
 (0.0501, 'PRON'),
 (0.0498, 'AUX'),
 (0.0494, 'postID'),
 (0.0494, 'CCONJ'),
 (0.0493, 'VERB'),
 (0.0488, 'Ipp'),
 (0.0485, 'SCONJ'),
 (0.0478, 'INTJ'),
 (0.0477, 'wpp'),
 (0.0462, 'source'),
 (0.0458, 'X'),
 (0.0448, 'PUNCT'),
 (0.0391, 'NOUN'),
 (0.0329, 'ADV'),
 (0.0206, 'PROPN'),
 (0.0184, 'PART')]

In [21]:
features.columns

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [19]:
# from sklearn.model_selection import cross_val_predict
# from sklearn.metrics import confusion_matrix
# y_pred = cross_val_predict(rf, test_features, test_labels, cv=4)
# conf_mat = confusion_matrix(test_labels, y_pred)
# conf_mat

## equal classes

# SVM  
https://medium.com/all-things-ai/in-depth-parameter-tuning-for-svc-758215394769

In [26]:
from sklearn import svm
from sklearn.svm import SVC 


## SVM; 70/30 split; min 20 words

In [26]:
mask_word20 = (d["wpp"] >= 20)
d_word20 = d[mask_word20]
d_word20_data = d_word20[d.columns[5:]].as_matrix()

features = d_word20_data
labels = np.array(d_word20["label"])

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.3,random_state = 0)

  This is separate from the ipykernel package so we can avoid doing imports until


### kernel = linear

In [29]:
from sklearn.metrics import confusion_matrix
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(train_features, train_labels) 
svm_predictions = svm_model_linear.predict(test_features) 
  
# model accuracy for X_test   
accuracy = svm_model_linear.score(test_features, test_labels) 
  
# creating a confusion matrix 
cm = confusion_matrix(test_labels, svm_predictions) 

In [30]:
print(accuracy)
print(cm)

0.376269621421976
[[396   7   0   4  25]
 [245  25  12  13 132]
 [146  14  20   8 243]
 [212  19   4  24 206]
 [ 44   4  13   0 350]]


### kernel = rbf

In [29]:
svm_model_linear = SVC(kernel = 'rbf', C = 1).fit(train_features, train_labels) 
svm_predictions = svm_model_linear.predict(test_features) 
  
# model accuracy for X_test   
accuracy = svm_model_linear.score(test_features, test_labels) 
  
# creating a confusion matrix 
cm = confusion_matrix(test_labels, svm_predictions) 
print(accuracy)
print(cm)



0.3785228377065112
[[   0    0    0  265    2]
 [   0    0    0  314    6]
 [   0    0    6  878   33]
 [   0    0    2 1466   45]
 [   0    0    5 1008   86]]


### kernel = poly

In [31]:
from sklearn.metrics import confusion_matrix

svm_model_linear = SVC(kernel = 'poly', C = 1,gamma="scale").fit(train_features, train_labels) 
svm_predictions = svm_model_linear.predict(test_features) 
  
# model accuracy for X_test   
accuracy = svm_model_linear.score(test_features, test_labels) 
  
# creating a confusion matrix 
cm = confusion_matrix(svm_predictions,test_labels) 
print(accuracy)
print(cm)

0.3850415512465374
[[360 199 130 178  37]
 [ 27  45  18  28   3]
 [  0  10  37  10  20]
 [ 20  45  26  60  19]
 [ 25 128 220 189 332]]
