### 2018 Shared Tasks
#### The results shown here represent top 2000 words selection, in SVC, we choose 200
#### If you would like to have top 200 words in this draft, just change 2000 to 200 or any other numbers you want to test

In [1]:
import pickle
import re
import collections
import scipy
import numpy as np

### Load data

In [2]:
with open('data_modified.p', 'rb') as fp:
    data = pickle.load(fp)

In [3]:
anx_score = []
for k in data:
    anx_score.append(data[k]['anxiety'])

### Data depression distribution

In [4]:
dict((x,anx_score.count(x)) for x in set(anx_score))

{0.0: 5594,
 1.0: 1945,
 2.0: 876,
 3.0: 404,
 4.0: 213,
 5.0: 144,
 6.0: 96,
 7.0: 72,
 8.0: 56,
 9.0: 26,
 10.0: 19,
 11.0: 5,
 12.0: 5}

### Data Group

In [5]:
data_essay = []
data_anxie = []
data_depre = []
for k in data:
    data_essay.append(data[k]['essay'])
    data_anxie.append(data[k]['anxiety'])
    data_depre.append(data[k]['depression'])

In [6]:
from sklearn.model_selection import train_test_split

### regular expression of the words

In [7]:
words_all = []
for k in data_essay:
    words = re.compile('\w+').findall(k)
    words = [x.lower() for x in words]
    words_all += words
counts = collections.Counter(words_all)

### Get top N words

In [9]:
#counts.most_common()[:-21:-1]
### Here we just showed the top 20 words, and in this workflow we used 2000
### You can use whatever you want
counts.most_common(20)

[('go', 21261),
 ('would', 18337),
 ('like', 13796),
 ('work', 12507),
 ('get', 12163),
 ('children', 8541),
 ('home', 8273),
 ('one', 7107),
 ('house', 6748),
 ('time', 6110),
 ('got', 5950),
 ('years', 5681),
 ('name', 5569),
 ('job', 5566),
 ('live', 5550),
 ('old', 5386),
 ('two', 5226),
 ('married', 4740),
 ('going', 4740),
 ('school', 4608)]

In [9]:
top_n_count = [k[0] for k in counts.most_common(2000)]

### Feature the data

In [52]:
def feature(datum):
    wor = re.compile('\w+').findall(datum)
    wor = [x.lower() for x in wor]
    cou = collections.Counter(wor)
    # for 20 words
    # feat = [1, cou['go'], cou['would'], cou['like'], cou['work'], cou['get'], cou['children'], cou['home'], cou['one'], cou['house'], cou['time'], cou['got'], cou['years'], cou['name'], cou['job'], cou['live'], cou['old'], cou['two'], cou['married'], cou['going'], cou['school']]
    feat = [1] + [cou[k] for k in top_n_count]
    return feat

In [53]:
new_essay = [feature(datum) for datum in data_essay]

In [54]:
new_data_depre = []
for k in data_depre:
    if k == 0:
        new_data_depre.append(0)
    else:
        new_data_depre.append(1)

### Split data to training and test

In [55]:
X_train, X_test, y_train, y_test = train_test_split(new_essay, new_data_depre, test_size=0.33, random_state=42)

### Logistic Regression

In [21]:
def inner(x,y):
    return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))

In [22]:
# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
    loglikelihood = 0
    for i in range(len(X)):
        logit = inner(X[i], theta)
        loglikelihood -= np.log(1 + np.exp(-logit))
        if not y[i]:
            loglikelihood -= logit
    for k in range(len(theta)):
        loglikelihood -= lam * theta[k]*theta[k]
    # for debugging
    # print("ll =" + str(loglikelihood))
    return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
    dl = [0]*len(theta)
    for i in range(len(X)):
        logit = inner(X[i], theta)
        for k in range(len(theta)):
            dl[k] += X[i][k] * (1 - sigmoid(logit))
            if not y[i]:
                dl[k] -= X[i][k]
    for k in range(len(theta)):
        dl[k] -= lam*2*theta[k]
    return np.array([-x for x in dl])

In [23]:
def train(lam):
    theta,_,_ = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X_train[0]), fprime, pgtol = 10, args = (X_train, y_train, lam))
    return theta

In [24]:
lam = 1.0

In [56]:
theta = train(lam)

In [80]:
import datetime

currentDT = datetime.datetime.now()
print (str(currentDT))

2018-06-03 23:30:38.249417


In [40]:
def performance(theta, dataset_x, dataset_y):
    scores = [inner(theta,k) for k in dataset_x]
    predictions = [1 if s > 0 else 0 for s in scores]
    #correct = [(a==b) for (a,b) in zip(predictions,dataset_y)]
    correct = [(a==b) for (a,b) in zip(predictions,dataset_y)]
    acc = sum(correct) * 1.0 / len(correct)
    return acc

### Get accuracy

In [57]:
print('accuracy of validation set is', performance(theta, X_train, y_train))

accuracy of validation set is 0.5887275023681717


In [58]:
print('accuracy of validation set is', performance(theta, X_test, y_test))

accuracy of validation set is 0.5825056071771868


In [30]:
predictions = [1 if s > 0 else 0 for s in scores]

In [37]:
correct = [(a==b) for (a,b) in zip(predictions,y_test)]
acc = sum(correct) * 1.0 / len(correct)

In [39]:
acc

0.5709708426786286

In [81]:
y_train_mod = [s > 0 for s in y_train]

### Train and Test accuracy

In [82]:
performance(theta, X_train, y_train_mod)

0.7813388064414272

In [83]:
y_test_mod = [s > 0 for s in y_test]

In [85]:
performance(theta, X_test, y_test_mod)

0.5709708426786286