In [1]:
import string
import pandas as pd
import re
from nltk.tokenize import RegexpTokenizer

# Input CSV locations for the two splits.
train_file = "data/train_small.csv"
test_file = "data/test_small.csv"

# Tokenizer that keeps every run of word characters as one token.
tokenizer = RegexpTokenizer(pattern=r'\w+')

# Tokenize verbs and extract the vocabulary

In [2]:
# Read each split (first CSV row is the header) and replace the raw 'verbs'
# text by the list of tokens produced by the tokenizer.
train = pd.read_csv(train_file, header=0).assign(
    verbs=lambda frame: frame['verbs'].apply(tokenizer.tokenize)
)
test = pd.read_csv(test_file, header=0).assign(
    verbs=lambda frame: frame['verbs'].apply(tokenizer.tokenize)
)

In [3]:
# Sorted vocabulary: every distinct token that occurs in the training rows.
words = sorted({verb for verbs in train['verbs'] for verb in verbs})

# Construct the word-count matrix

In [4]:
def word_count(data, words):
    """
    return word count matrix,
    data: series of list of words,
    words: list of sorted, unique words extracted from train dataset

    The result is an integer DataFrame with one row per entry of ``data``
    (same index as ``data``) and one column per entry of ``words``. Cell
    (r, w) holds how many times word ``w`` occurs in the list stored in
    row ``r``. Words that are not listed in ``words`` are ignored.
    """
    columns = list(words)
    # Dict lookup instead of scanning the `words` list for every token.
    position = {word: pos for pos, word in enumerate(columns)}

    # Counts are accumulated in plain Python lists and turned into a frame
    # once; this avoids both `Series.iteritems()` (removed in pandas 2.0)
    # and one `.loc` write per token.
    rows = []
    for verbs in data:
        row = [0] * len(columns)
        for verb in verbs:
            pos = position.get(verb)
            if pos is not None:
                row[pos] += 1
        rows.append(row)

    return pd.DataFrame(rows, index=data.index, columns=columns, dtype="int64")

In [5]:
# Count matrices for both splits, built over the same vocabulary so that
# their columns line up.
train_count, test_count = (
    word_count(split['verbs'], words) for split in (train, test)
)

In [16]:
# Total occurrences of every vocabulary word over the training rows.
train_totals = train_count.sum()

# Words seen fewer than 10 times in training are removed from BOTH matrices.
# The list is derived from the training totals only and computed once, so
# `a` and `b` keep identical columns. (Boolean selection replaces the
# `Series.iteritems()` loop, which no longer exists in pandas 2.0.)
rare_words = train_totals[train_totals < 10].index
a = train_count.drop(columns=rare_words)
b = test_count.drop(columns=rare_words)
print(a.columns)

Index(['absorb', 'accommodate', 'accompany', 'accord', 'accumulate', 'achieve',
       'add', 'adhere', 'adjust', 'advance',
       ...
       'wire', 'wish', 'work', 'worry', 'wrap', 'wrinkle', 'write', 'yellow',
       'yield', 'zest'],
      dtype='object', length=978)


In [59]:
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

# Both classifiers are seeded with the same value so that repeated runs of
# the notebook give the same fitted models.
clf = svm.LinearSVC(C=1, max_iter=1000, random_state=42)
clf2 = LogisticRegression(random_state=42, max_iter=1000)

# Target column of each split.
y = train['duration_label']
y_t = test['duration_label']

In [36]:
def report_metrics(title, y_true, y_pred, end='\n', labels=(1, 2, 3)):
    """
    Print accuracy, per-class precision / recall / F-score and the confusion
    matrix for one set of predictions.

    title: text appended to the dashed banner line (e.g. "SVM"),
    y_true: true labels,
    y_pred: predicted labels,
    end: terminator used when printing the confusion matrix,
    labels: row / column captions of the printed tables; they are captions
            only and are not passed to the sklearn metric functions
    """
    labels = list(labels)
    print(f"----------------------{title}")
    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy : {(accuracy*100):.2f}%")

    # average=None keeps one value per class instead of a single average.
    precision = precision_score(y_true, y_pred, average=None, zero_division=0)
    recall = recall_score(y_true, y_pred, average=None, zero_division=0)
    f1 = f1_score(y_true, y_pred, average=None, zero_division=0)

    scores = pd.DataFrame({'Precision':precision, "Recall":recall, "F_score":f1}, index=labels)
    print(scores)

    matrix = pd.DataFrame(confusion_matrix(y_true, y_pred), index=labels, columns=labels)
    print("\nConfusion matrix:")
    print(matrix, end=end)


# Loop-invariant inputs, computed once instead of on every iteration.
y = train['duration_label']
y_t = test['duration_label']
train_totals = train_count.sum()

# Sweep the minimum training frequency a word needs to stay a feature.
# NOTE(review): every threshold is scored on the test split; if one of them
# is later picked as "best", a separate validation split would be needed to
# keep the test score unbiased.
for i in range(80, 200, 5):
    print(f"--------{i}--------")
    # Rare words are chosen from the training totals only and dropped from
    # both matrices so that their columns stay aligned.
    rare_words = train_totals[train_totals < i].index
    a = train_count.drop(columns=rare_words)
    b = test_count.drop(columns=rare_words)
    print(a.columns)

    clf.fit(a, y)
    clf2.fit(a, y)

    pre1 = clf.predict(b)
    pre2 = clf2.predict(b)

    report_metrics("SVM", y_t, pre1, end='\n')
    report_metrics("Log", y_t, pre2, end='\n\n')

--------80--------
Index(['absorb', 'accord', 'achieve', 'add', 'adjust', 'air', 'allow',
       'alternate', 'amount', 'appear',
       ...
       'wine', 'wipe', 'wire', 'wish', 'work', 'worry', 'wrap', 'yellow',
       'yield', 'zest'],
      dtype='object', length=530)




----------------------SVM
Accuracy : 73.06%
   Precision    Recall   F_score
1   0.697815  0.752270  0.724020
2   0.758162  0.741769  0.749876
3   0.799087  0.431034  0.560000

Confusion matrix:
      1     2    3
1  2651   851   22
2  1029  3019   22
3   119   112  175
----------------------Log
Accuracy : 72.95%
   Precision    Recall   F_score
1   0.706663  0.740352  0.723115
2   0.753404  0.747666  0.750524
3   0.684015  0.453202  0.545185

Confusion matrix:
      1     2    3
1  2609   881   34
2   976  3043   51
3   107   115  184

--------85--------
Index(['absorb', 'accord', 'achieve', 'add', 'adjust', 'air', 'allow',
       'alternate', 'amount', 'appear',
       ...
       'wine', 'wipe', 'wire', 'wish', 'work', 'worry', 'wrap', 'yellow',
       'yield', 'zest'],
      dtype='object', length=519)




----------------------SVM
Accuracy : 73.20%
   Precision    Recall   F_score
1   0.698947  0.753689  0.725287
2   0.760050  0.743243  0.751553
3   0.795455  0.431034  0.559105

Confusion matrix:
      1     2    3
1  2656   846   22
2  1022  3025   23
3   122   109  175
----------------------Log
Accuracy : 72.99%
   Precision    Recall   F_score
1   0.706519  0.741203  0.723446
2   0.754090  0.747420  0.750740
3   0.687732  0.455665  0.548148

Confusion matrix:
      1     2    3
1  2612   878   34
2   978  3042   50
3   107   114  185

--------90--------
Index(['absorb', 'accord', 'achieve', 'add', 'adjust', 'air', 'allow',
       'alternate', 'amount', 'appear',
       ...
       'wine', 'wipe', 'wire', 'wish', 'work', 'worry', 'wrap', 'yellow',
       'yield', 'zest'],
      dtype='object', length=507)




----------------------SVM
Accuracy : 73.02%
   Precision    Recall   F_score
1   0.697002  0.751986  0.723451
2   0.758421  0.741278  0.749751
3   0.795455  0.431034  0.559105

Confusion matrix:
      1     2    3
1  2650   852   22
2  1030  3017   23
3   122   109  175
----------------------Log
Accuracy : 73.02%
   Precision    Recall   F_score
1   0.705914  0.741771  0.723398
2   0.754900  0.747666  0.751265
3   0.695489  0.455665  0.550595

Confusion matrix:
      1     2    3
1  2614   877   33
2   979  3043   48
3   110   111  185

--------95--------
Index(['absorb', 'accord', 'achieve', 'add', 'adjust', 'air', 'allow',
       'alternate', 'amount', 'appear',
       ...
       'wine', 'wipe', 'wire', 'wish', 'work', 'worry', 'wrap', 'yellow',
       'yield', 'zest'],
      dtype='object', length=498)




----------------------SVM
Accuracy : 72.97%
   Precision    Recall   F_score
1   0.696372  0.751703  0.722980
2   0.757858  0.740541  0.749099
3   0.799087  0.431034  0.560000

Confusion matrix:
      1     2    3
1  2649   853   22
2  1034  3014   22
3   121   110  175
----------------------Log
Accuracy : 72.99%
   Precision    Recall   F_score
1   0.704772  0.741771  0.722798
2   0.754656  0.746683  0.750648
3   0.704545  0.458128  0.555224

Confusion matrix:
      1     2    3
1  2614   877   33
2   986  3039   45
3   109   111  186

--------100--------
Index(['absorb', 'accord', 'achieve', 'add', 'adjust', 'air', 'allow',
       'alternate', 'amount', 'appear',
       ...
       'wine', 'wipe', 'wire', 'wish', 'work', 'worry', 'wrap', 'yellow',
       'yield', 'zest'],
      dtype='object', length=490)




----------------------SVM
Accuracy : 73.04%
   Precision    Recall   F_score
1   0.696954  0.753121  0.723950
2   0.758812  0.740541  0.749565
3   0.795455  0.431034  0.559105

Confusion matrix:
      1     2    3
1  2654   849   21
2  1032  3014   24
3   122   109  175
----------------------Log
Accuracy : 73.02%
   Precision    Recall   F_score
1   0.705644  0.741487  0.723122
2   0.755026  0.747420  0.751204
3   0.697761  0.460591  0.554896

Confusion matrix:
      1     2    3
1  2613   878   33
2   980  3042   48
3   110   109  187

--------105--------
Index(['absorb', 'accord', 'achieve', 'add', 'adjust', 'air', 'allow',
       'alternate', 'amount', 'appear',
       ...
       'wine', 'wipe', 'wire', 'wish', 'work', 'worry', 'wrap', 'yellow',
       'yield', 'zest'],
      dtype='object', length=477)




----------------------SVM
Accuracy : 73.04%
   Precision    Recall   F_score
1   0.696954  0.753121  0.723950
2   0.758812  0.740541  0.749565
3   0.795455  0.431034  0.559105

Confusion matrix:
      1     2    3
1  2654   848   22
2  1033  3014   23
3   121   110  175
----------------------Log
Accuracy : 72.96%
   Precision    Recall   F_score
1   0.705246  0.740068  0.722238
2   0.753778  0.747666  0.750709
3   0.701887  0.458128  0.554396

Confusion matrix:
      1     2    3
1  2608   882   34
2   982  3043   45
3   108   112  186

--------110--------
Index(['absorb', 'accord', 'add', 'adjust', 'air', 'allow', 'alternate',
       'amount', 'appear', 'arrange',
       ...
       'win', 'wine', 'wipe', 'wire', 'wish', 'work', 'wrap', 'yellow',
       'yield', 'zest'],
      dtype='object', length=469)




----------------------SVM
Accuracy : 73.06%
   Precision    Recall   F_score
1   0.697448  0.752270  0.723823
2   0.758673  0.741523  0.750000
3   0.796380  0.433498  0.561404

Confusion matrix:
      1     2    3
1  2651   851   22
2  1029  3018   23
3   121   109  176
----------------------Log
Accuracy : 72.97%
   Precision    Recall   F_score
1   0.706042  0.739501  0.722384
2   0.753526  0.748157  0.750832
3   0.697761  0.460591  0.554896

Confusion matrix:
      1     2    3
1  2606   885   33
2   977  3045   48
3   108   111  187

--------115--------
Index(['absorb', 'accord', 'add', 'adjust', 'air', 'allow', 'alternate',
       'amount', 'appear', 'arrange',
       ...
       'win', 'wine', 'wipe', 'wire', 'wish', 'work', 'wrap', 'yellow',
       'yield', 'zest'],
      dtype='object', length=463)




----------------------SVM
Accuracy : 72.99%
   Precision    Recall   F_score
1   0.696555  0.751703  0.723079
2   0.757858  0.740541  0.749099
3   0.800000  0.433498  0.562300

Confusion matrix:
      1     2    3
1  2649   853   22
2  1034  3014   22
3   120   110  176
----------------------Log
Accuracy : 72.96%
   Precision    Recall   F_score
1   0.705357  0.739784  0.722161
2   0.753591  0.747666  0.750617
3   0.703008  0.460591  0.556548

Confusion matrix:
      1     2    3
1  2607   884   33
2   981  3043   46
3   108   111  187

--------120--------
Index(['absorb', 'accord', 'add', 'adjust', 'air', 'allow', 'alternate',
       'amount', 'appear', 'arrange',
       ...
       'wilt', 'win', 'wipe', 'wire', 'wish', 'work', 'wrap', 'yellow',
       'yield', 'zest'],
      dtype='object', length=455)


KeyboardInterrupt: 

# PCA

In [40]:
from sklearn import decomposition

In [49]:
# NOTE(review): `a` and `b` are the filtered count matrices left in the kernel
# by whichever earlier cell assigned them last, so the features used here
# depend on execution order.
# The projection is learned from the training matrix only and then applied
# unchanged to both splits.
pca = decomposition.PCA(n_components=100).fit(a)
X_train, X_test = pca.transform(a), pca.transform(b)

# Logistic regression prediction

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

In [51]:
# Target column of the train and test splits.
y_train, y_test = train['duration_label'], test['duration_label']

In [52]:
# Fit a seeded logistic regression on the PCA-projected training features,
# then predict labels for the projected test features.
model = LogisticRegression(random_state=42, max_iter=2000)
model.fit(X_train, y_train)
y_hat = model.predict(X_test)

In [54]:
# Overall accuracy, printed as a raw fraction.
print(accuracy_score(y_test, y_hat))

# Per-class metrics: average=None returns one value per class, and
# zero_division=0 reports 0 where a metric's denominator is zero.
precision, recall, f1 = (
    metric(y_test, y_hat, average=None, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)

score = pd.DataFrame(
    {'Precision': precision, "Recall": recall, "F_score": f1},
    index=[1, 2, 3],
)
print(score)

# Confusion matrix wrapped in a frame so rows and columns carry captions.
matrix = pd.DataFrame(
    confusion_matrix(y_test, y_hat),
    index=[1, 2, 3],
    columns=[1, 2, 3],
)
print("\nConfusion matrix:")
print(matrix, end='\n\n')

0.714125
   Precision    Recall   F_score
1   0.684765  0.737230  0.710030
2   0.744129  0.739558  0.741836
3   0.652174  0.258621  0.370370

Confusion matrix:
      1     2    3
1  2598   903   23
2  1027  3010   33
3   169   132  105

