In [4]:
import re
import string

import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer

# Locations of the train / test CSV files read later in the notebook.
train_file = "data/train_small.csv"
test_file = "data/test_small.csv"

# Regex-based tokenizer built from the pattern \w+ (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')

# Tokenize the verbs and extract the vocabulary list

In [15]:
# Train split: read the CSV (first row is the header), then replace the
# 'verbs' column with the list of tokens produced by the tokenizer.
train = pd.read_csv(train_file, header=0)
train['verbs'] = train['verbs'].apply(tokenizer.tokenize)

# Test split: same treatment.
test = pd.read_csv(test_file, header=0)
test['verbs'] = test['verbs'].apply(tokenizer.tokenize)

In [26]:
# Vocabulary: every distinct token seen in the training 'verbs' lists, sorted.
words = sorted({token for tokens in train['verbs'] for token in tokens})

# Construct matrix

In [55]:
def word_count(data, words):
    """
    Return the word-count matrix for a series of token lists.

    data: pandas Series whose values are lists of words,
    words: list of sorted, unique words extracted from the train dataset;
        it defines the columns (and their order) of the result.

    Returns a DataFrame of int64 counts with one row per entry of ``data``
    (carrying the same index) and one column per entry of ``words``.
    Tokens that do not appear in ``words`` are ignored.
    """
    vocabulary = list(words)
    # Dict lookup replaces the linear `verb in words` scan done per token.
    column_of = {word: col for col, word in enumerate(vocabulary)}

    # Counts are accumulated in a plain integer array addressed by position,
    # so rows that share an index label are still counted separately.
    matrix = np.zeros((len(data), len(vocabulary)), dtype=np.int64)
    # Series.items() is used because Series.iteritems() no longer exists in
    # pandas 2.x.
    for row, (_, verbs) in enumerate(data.items()):
        for verb in verbs:
            col = column_of.get(verb)
            if col is not None:
                matrix[row, col] += 1

    return pd.DataFrame(matrix, index=data.index, columns=vocabulary)

In [56]:
# Count matrices for both splits, built over the same training vocabulary so
# that their columns line up.
train_count, test_count = (
    word_count(split['verbs'], words) for split in (train, test)
)

# PCA

In [57]:
from sklearn import decomposition

In [134]:
# Reduce the count matrices to 50 principal components. The projection is
# fitted on the training counts only and then applied to both splits.
# random_state pins the randomized SVD solver that scikit-learn may select
# automatically for larger inputs, so reruns yield the same components; 42
# matches the seed used for the classifier below.
pca = decomposition.PCA(n_components=50, random_state=42)
pca.fit(train_count)
X_train = pca.transform(train_count)
X_test = pca.transform(test_count)

# Logistic regression: fit and predict

In [135]:
from sklearn.linear_model import LogisticRegression

In [136]:
# Target column for each split.
y_train, y_test = train['duration_label'], test['duration_label']

In [137]:
# Build the classifier, fit it on the PCA-projected training data, then
# predict labels for the projected test data.
model = LogisticRegression(random_state=42, max_iter=2000)
model.fit(X_train, y_train)
y_hat = model.predict(X_test)

In [138]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [139]:
# Class labels present in the true or predicted test labels, in sorted order.
# They are passed explicitly to every metric and reused as the table labels,
# so each row/column is guaranteed to describe the class it is labelled with
# (instead of assuming the classes are exactly 1, 2, 3).
labels = sorted(set(y_test) | set(y_hat))

print(accuracy_score(y_test, y_hat))

# average=None returns one value per class; zero_division=0 reports 0 for a
# class whose metric is undefined.
precision = precision_score(y_test, y_hat, labels=labels, average=None, zero_division=0)
recall = recall_score(y_test, y_hat, labels=labels, average=None, zero_division=0)
f1 = f1_score(y_test, y_hat, labels=labels, average=None, zero_division=0)

score = pd.DataFrame({'Precision':precision, "Recall":recall, "F_score":f1}, index=labels)
print(score)

# Rows are true labels, columns are predicted labels.
matrix = confusion_matrix(y_test, y_hat, labels=labels)
matrix = pd.DataFrame(matrix, index=labels, columns=labels)
print("\nConfusion matrix:")
print(matrix, end='\n\n')

0.7025
   Precision    Recall   F_score
1   0.694301  0.732240  0.712766
2   0.725806  0.703125  0.714286
3   0.571429  0.480000  0.521739

Confusion matrix:
     1    2   3
1  134   45   4
2   52  135   5
3    7    6  12

