In [1]:
#### SETUP ####

import csv
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import numpy as np
np.random.seed(42)
import random
random.seed(42)

# READ IN THE TRAINING DATA
#
# train.tsv is parsed as tab-separated rows. Column index 1 of every row is
# collected into X_txt (later fed to CountVectorizer as raw text) and column
# index 2 into y (later used as the class label). Column 0 is ignored.

X_txt = []
y = []
# newline='' hands line-ending handling to the csv module, as the csv docs
# require for file objects passed to csv.reader. The explicit encoding makes
# decoding independent of the platform default.
# NOTE(review): assumes train.tsv is UTF-8 encoded -- confirm against the data source.
with open('./train.tsv', newline='', encoding='utf-8') as in_file:
    for row in csv.reader(in_file, delimiter='\t'):
        X_txt.append(row[1])
        y.append(row[2])

# SPLIT THE TRAINING DATA INTO TRAINING (80%) AND VALIDATION (20%) SUBSETS
# The fixed random_state keeps the shuffle, and therefore the split, repeatable.

X_txt_train, X_txt_val, y_train, y_val = train_test_split(
    X_txt,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity check: texts and labels must have matching sizes within each subset.
for subset in (X_txt_train, X_txt_val, y_train, y_val):
    print(len(subset))

8473
2119
8473
2119


In [2]:
# How are the labels distributed across the classes?
# The counts are taken over the full label list `y`, i.e. before the
# train/validation split.

import pandas

class_counts = pandas.Series(y).value_counts()
class_counts

NOT    7069
TIN    3102
UNT     421
dtype: int64

In [3]:
#### TRAIN SOME BASELINE MODELS ####

# LINEAR SVC
# Token counts from CountVectorizer are fed to a linear SVM. Vectorizer and
# classifier settings are tuned together by a 3-fold grid search that is
# scored on micro-averaged F1.

pipeline = Pipeline(steps=[
    ('vec', CountVectorizer()),
    ('clf', LinearSVC(random_state=42)),
])

# 2 n-gram ranges x 2 stop-word settings x 3 min_df values x 3 C values
# = 36 candidate configurations.
params = {
    'vec__ngram_range': [(1, 1), (1, 2)],
    'vec__stop_words': ['english', None],
    #'vec__lowercase': [False, True],
    'vec__min_df': [1, 5, 10],
    'clf__C': [0.01, 0.1, 1.0],
}
clf = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring="f1_micro",
    cv=3,
    verbose=1,
)
clf.fit(X_txt_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed:  1.9min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vec', CountVectorizer()),
                                       ('clf', LinearSVC(random_state=42))]),
             param_grid={'clf__C': [0.01, 0.1, 1.0], 'vec__min_df': [1, 5, 10],
                         'vec__ngram_range': [(1, 1), (1, 2)],
                         'vec__stop_words': ['english', None]},
             scoring='f1_micro', verbose=1)

In [4]:
# Cross-validated score and settings of the winning grid-search candidate.
best_cv_f1 = clf.best_score_
best_settings = clf.best_params_
print("Best LinearSVC Micro F1: {:.4f}".format(best_cv_f1))
print("Best LinearSVC Parameters:", best_settings)

# Score the held-out validation split with the search's best estimator
# (GridSearchCV.predict delegates to it).
preds = clf.predict(X_txt_val)
val_micro_f1 = f1_score(y_val, preds, average='micro')
val_macro_f1 = f1_score(y_val, preds, average='macro')
print("Validation LinearSVC Micro F1: {:.4f}".format(val_micro_f1))
print("Validation LinearSVC Macro F1: {:.4f}".format(val_macro_f1))

Best LinearSVC Micro F1: 0.7324
Best LinearSVC Parameters: {'clf__C': 0.1, 'vec__min_df': 1, 'vec__ngram_range': (1, 1), 'vec__stop_words': 'english'}
Validation LinearSVC Micro F1: 0.7268
Validation LinearSVC Macro F1: 0.4487


In [5]:
# RANDOM FOREST
# Same bag-of-words front end as the LinearSVC baseline, with a random forest
# as the classifier. The grid varies the number of trees instead of C.

from sklearn.ensemble import RandomForestClassifier

pipeline = Pipeline(steps=[
    ('vec', CountVectorizer()),
    ('clf', RandomForestClassifier(random_state=42)),
])

# 2 n-gram ranges x 2 stop-word settings x 3 min_df values x 3 forest sizes
# = 36 candidate configurations, each evaluated with 3-fold CV.
params = {
    'vec__ngram_range': [(1, 1), (1, 2)],
    'vec__stop_words': ['english', None],
    #'vec__lowercase': [False, True],
    'vec__min_df': [1, 5, 10],
    'clf__n_estimators': [100, 200, 300],
}
clf = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring="f1_micro",
    cv=3,
    verbose=1,
)
clf.fit(X_txt_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 108 out of 108 | elapsed: 14.7min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('vec', CountVectorizer()),
                                       ('clf',
                                        RandomForestClassifier(random_state=42))]),
             param_grid={'clf__n_estimators': [100, 200, 300],
                         'vec__min_df': [1, 5, 10],
                         'vec__ngram_range': [(1, 1), (1, 2)],
                         'vec__stop_words': ['english', None]},
             scoring='f1_micro', verbose=1)

In [6]:
# Cross-validated score and settings of the winning grid-search candidate.
best_cv_f1 = clf.best_score_
best_settings = clf.best_params_
print("Best RandomForest Micro F1: {:.4f}".format(best_cv_f1))
print("Best RandomForest Parameters:", best_settings)

# Score the held-out validation split with the search's best estimator
# (GridSearchCV.predict delegates to it).
preds = clf.predict(X_txt_val)
val_micro_f1 = f1_score(y_val, preds, average='micro')
val_macro_f1 = f1_score(y_val, preds, average='macro')
print("Validation RandomForest Micro F1: {:.4f}".format(val_micro_f1))
print("Validation RandomForest Macro F1: {:.4f}".format(val_macro_f1))

Best RandomForest Micro F1: 0.7355
Best RandomForest Parameters: {'clf__n_estimators': 300, 'vec__min_df': 5, 'vec__ngram_range': (1, 1), 'vec__stop_words': None}
Validation RandomForest Micro F1: 0.7348
Validation RandomForest Macro F1: 0.4699


In [10]:
# Disabled debugging aid: uncomment to dump the raw cross-validation results
# of whichever grid search `clf` currently refers to.
#print(clf.cv_results_)

In [9]:
# Disabled experiment: bar chart of the grid-search results.
# NOTE(review): `plot` is not imported in any visible cell -- presumably
# sklearn_evaluation.plot; confirm and add the import before re-enabling.
#plot.grid_search(clf.cv_results_, change='n_estimators', kind='bar')