In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import model_selection, svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from joblib import dump, load

In [2]:
# Randomly partition the data and convert it to Count, Boolean, and TF-IDF Formats.
# PCA has been removed due to its rather intense computational constraints. :(
# Data has been stratified here due to severe class imbalance.

#data = pd.read_csv('filtered.csv')
data = pd.read_csv('clean.csv')
labeled_data = data[data['injury_report'] != 'x']

Xtr, Xte, Ytr, Yte =  model_selection.train_test_split(labeled_data['tweet'],
                                                       labeled_data['injury_report'].astype(int),
                                                       stratify=labeled_data['injury_report'],
                                                       test_size=0.3)

v = TfidfVectorizer()
v.fit(data['tweet'].dropna())

Xtr_tfidf = v.transform(Xtr)
Xte_tfidf = v.transform(Xte)

dump(v, 'tfidf.joblib')

v = CountVectorizer(binary=True)

v.fit(data['tweet'].dropna())
Xtr_bool = v.transform(Xtr)
Xte_bool = v.transform(Xte)

dump(v, 'bool.joblib')

v = CountVectorizer()

v.fit(data['tweet'].dropna())
Xtr_count = v.transform(Xtr)
Xte_count = v.transform(Xte)

dump(v, 'count.joblib')

['count.joblib']

In [3]:
# K Nearest Neighbors over TF-IDF representation

kNN = KNeighborsClassifier()
param_search = [{'weights': ['distance'], 'n_neighbors': [i for i in range(1,20)]}]

grid_search = GridSearchCV(kNN, param_search, cv=8, scoring='recall', n_jobs=2, verbose=2)
grid_search.fit(Xtr_tfidf, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_tfidf)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)
dump(grid_search.best_estimator_, 'kNN.joblib')

Fitting 8 folds for each of 19 candidates, totalling 152 fits
[[5678   14]
 [ 420  118]]
KNeighborsClassifier(n_neighbors=1, weights='distance')


['kNN.joblib']

In [4]:
# Bernoulli Naive Bayes; must be done over Boolean Representaion

BNB = BernoulliNB()

param_search = [{'alpha': list(np.linspace(0, 0.1, 10000))}]

grid_search = GridSearchCV(BNB, param_search, cv=8, scoring='recall', n_jobs=2, verbose=2)
grid_search.fit(Xtr_bool, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_bool)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)
dump(grid_search.best_estimator_, 'bernoulliNB.joblib')

Fitting 8 folds for each of 10000 candidates, totalling 80000 fits
[[5446  246]
 [  71  467]]
BernoulliNB(alpha=0.05394539453945395)


['bernoulliNB.joblib']

In [5]:
# Multinomial Naive Bayes; must be done over Count Representation

MNB = MultinomialNB()

param_search = [{'alpha': list(np.linspace(0, 0.1, 10000))}]

grid_search = GridSearchCV(MNB, param_search, cv=8, scoring='recall', n_jobs=2, verbose=2)
grid_search.fit(Xtr_count, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_count)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)
dump(grid_search.best_estimator_, 'multinomialNB.joblib')

Fitting 8 folds for each of 10000 candidates, totalling 80000 fits
[[5356  336]
 [  78  460]]
MultinomialNB(alpha=0.0626062606260626)


['multinomialNB.joblib']

In [6]:
# Logistic Regression over TF-IDF Representation

LReg = LogisticRegression()

param_search = [{'penalty': ['l1', 'l2'],
                'C': [2 ** i for i in range(-5, 16)],
                'class_weight': ['balanced'],
                'solver': ['liblinear']}]

grid_search = GridSearchCV(LReg, param_search, cv=8, scoring='recall', n_jobs=2, verbose=2)
grid_search.fit(Xtr_tfidf, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_tfidf)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)
dump(grid_search.best_estimator_, 'logistic_regression.joblib')

Fitting 8 folds for each of 42 candidates, totalling 336 fits
[[5576  116]
 [  54  484]]
LogisticRegression(C=1, class_weight='balanced', penalty='l1',
                   solver='liblinear')


['logistic_regression.joblib']

In [7]:
# Random Forest over Boolean Representation

Forest = RandomForestClassifier()


param_search = [{'criterion': ['entropy', 'gini'],
                 'max_depth': [i for i in range(20, 41)],
                'n_estimators': [100],
                'class_weight': ['balanced', 'balanced_subsample']}]

grid_search = GridSearchCV(Forest, param_search, cv=8, scoring='recall', n_jobs=2, verbose=2)
grid_search.fit(Xtr_bool, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_bool)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)
dump(grid_search.best_estimator_, 'random_forest_bool.joblib')

Fitting 8 folds for each of 84 candidates, totalling 672 fits
[[5384   93]
 [ 181  351]]
RandomForestClassifier(class_weight='balanced', max_depth=21)


['random_forest_bool.joblib']

In [8]:
# Random Forest over TF-IDF Representation

Forest = RandomForestClassifier()


param_search = [{'criterion': ['entropy', 'gini'], 'min_samples_split': [5], 
                 'max_depth': [i for i in range(20, 41)],
                'min_samples_leaf': [5],
                'n_estimators': [100],
                'class_weight': ['balanced', 'balanced_subsample']}]

grid_search = GridSearchCV(Forest, param_search, cv=8, scoring='recall', n_jobs=2, verbose=2)
grid_search.fit(Xtr_tfidf, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_tfidf)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)
dump(grid_search.best_estimator_, 'random_forest_tfidf.joblib')

Fitting 8 folds for each of 84 candidates, totalling 672 fits
[[5282  195]
 [ 112  420]]
RandomForestClassifier(class_weight='balanced_subsample', criterion='entropy',
                       max_depth=38, min_samples_leaf=5, min_samples_split=5)


['random_forest_tfidf.joblib']

In [7]:
# SVM; must be done over TF-IDF Representation

SVM = svm.SVC()

search_area = [
  {'C': [2 ** i for i in list(range(-5,16))], 'kernel': ['linear']},
  #{'C': [2 ** i for i in list(range(-5,16))], 'gamma': [2 ** i for i in list(range(-15,4))], 'kernel': ['rbf']},
  # Removed because it was computationally expensive and seemed to provide no advantage.
  # Perhaps that would change with more data / more word lemmatization.
 ]

grid_search = GridSearchCV(SVM, search_area, cv=8, scoring='recall', n_jobs=2, verbose=2)
grid_search.fit(Xtr_tfidf, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_tfidf)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)
dump(grid_search.best_estimator_, 'svm.joblib')

Fitting 8 folds for each of 21 candidates, totalling 168 fits
[[5661   31]
 [ 104  434]]
SVC(C=4, kernel='linear')


['svm.joblib']