In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import model_selection, svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

In [3]:
# Randomly partition the data and convert it to Count, Boolean, and TF-IDF Formats.
# PCA has been removed due to its rather intense computational constraints. :(
# Data has been stratified here due to severe class imbalance.

data = pd.read_csv('dft_comb_new.csv')
labeled_data = data[data['injury_report'] != 'x']

Xtr, Xte, Ytr, Yte =  model_selection.train_test_split(labeled_data['tweet'],
                                                       labeled_data['injury_report'].astype(int),
                                                       stratify=labeled_data['injury_report'],
                                                       test_size=0.3)

v = TfidfVectorizer()
v.fit(data['tweet'].dropna())

Xtr_tfidf = v.transform(Xtr)
Xte_tfidf = v.transform(Xte)

v = CountVectorizer(binary=True)

v.fit(data['tweet'].dropna())
Xtr_bool = v.transform(Xtr)
Xte_bool = v.transform(Xte)

v = CountVectorizer()

v.fit(data['tweet'].dropna())
Xtr_count = v.transform(Xtr)
Xte_count = v.transform(Xte)

In [4]:
# K Nearest Neighbors over TF-IDF representation

kNN = KNeighborsClassifier()
param_search = [{'weights': ['distance'], 'n_neighbors': [i for i in range(1,20)]}]

grid_search = GridSearchCV(kNN, param_search, cv=8, scoring='recall', n_jobs=3, verbose=2)
grid_search.fit(Xtr_tfidf, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_tfidf)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)

Fitting 8 folds for each of 19 candidates, totalling 152 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:    5.3s
[Parallel(n_jobs=3)]: Done 152 out of 152 | elapsed:   18.6s finished


[[4060   92]
 [ 103  329]]
KNeighborsClassifier(n_neighbors=1, weights='distance')


In [5]:
# Bernoulli Naive Bayes; must be done over Boolean Representaion

BNB = BernoulliNB()

param_search = [{'alpha': list(np.linspace(0, 0.5, 10000))}]

grid_search = GridSearchCV(BNB, param_search, cv=8, scoring='recall', n_jobs=3, verbose=2)
grid_search.fit(Xtr_bool, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_bool)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)

Fitting 8 folds for each of 10000 candidates, totalling 80000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done 266 tasks      | elapsed:    0.7s
[Parallel(n_jobs=3)]: Done 3258 tasks      | elapsed:    7.1s
[Parallel(n_jobs=3)]: Done 9754 tasks      | elapsed:   21.5s
[Parallel(n_jobs=3)]: Done 18810 tasks      | elapsed:   41.8s
[Parallel(n_jobs=3)]: Done 30490 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done 44730 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done 61594 tasks      | elapsed:  2.2min


[[3972  180]
 [  43  389]]
BernoulliNB(alpha=0.05300530053005301)


[Parallel(n_jobs=3)]: Done 79995 tasks      | elapsed:  2.9min
[Parallel(n_jobs=3)]: Done 80000 out of 80000 | elapsed:  2.9min finished


In [6]:
# Multinomial Naive Bayes; must be done over Count Representation

MNB = MultinomialNB()

param_search = [{'alpha': list(np.linspace(0, 0.5, 10000))}]

grid_search = GridSearchCV(MNB, param_search, cv=8, scoring='recall', n_jobs=3, verbose=2)
grid_search.fit(Xtr_count, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_count)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)

Fitting 8 folds for each of 10000 candidates, totalling 80000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done 346 tasks      | elapsed:    0.6s
[Parallel(n_jobs=3)]: Done 4218 tasks      | elapsed:    6.6s
[Parallel(n_jobs=3)]: Done 10714 tasks      | elapsed:   17.0s
[Parallel(n_jobs=3)]: Done 19770 tasks      | elapsed:   32.0s
[Parallel(n_jobs=3)]: Done 31450 tasks      | elapsed:   50.3s
[Parallel(n_jobs=3)]: Done 45690 tasks      | elapsed:  1.2min
[Parallel(n_jobs=3)]: Done 62554 tasks      | elapsed:  1.7min


[[3896  256]
 [  34  398]]
MultinomialNB(alpha=0.13551355135513551)


[Parallel(n_jobs=3)]: Done 80000 out of 80000 | elapsed:  2.2min finished


In [7]:
# Logistic Regression over TF-IDF Representation

LReg = LogisticRegression()

param_search = [{'penalty': ['l1', 'l2'],
                'C': [2 ** i for i in range(-5, 16)],
                'class_weight': ['balanced'],
                'solver': ['liblinear']}]

grid_search = GridSearchCV(LReg, param_search, cv=8, scoring='recall', n_jobs=3, verbose=2)
grid_search.fit(Xtr_tfidf, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_tfidf)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 8 folds for each of 42 candidates, totalling 336 fits


[Parallel(n_jobs=3)]: Done 110 tasks      | elapsed:    1.5s
[Parallel(n_jobs=3)]: Done 252 tasks      | elapsed:   48.8s


[[3997  155]
 [  47  385]]
LogisticRegression(C=1, class_weight='balanced', solver='liblinear')


[Parallel(n_jobs=3)]: Done 336 out of 336 | elapsed:  1.0min finished


In [8]:
# Random Forest over Boolean Representation

Forest = RandomForestClassifier()


param_search = [{'criterion': ['entropy', 'gini'], 'min_samples_split': [5], 
                 'max_depth': [i for i in range(20, 41)],
                'min_samples_leaf': [5],
                'n_estimators': [1000],
                'class_weight': ['balanced', 'balanced_subsample']}]

grid_search = GridSearchCV(Forest, param_search, cv=8, scoring='recall', n_jobs=3, verbose=2)
grid_search.fit(Xtr_bool, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_bool)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 8 folds for each of 84 candidates, totalling 672 fits


[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  3.5min
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed: 19.2min
[Parallel(n_jobs=3)]: Done 359 tasks      | elapsed: 44.1min
[Parallel(n_jobs=3)]: Done 642 tasks      | elapsed: 82.7min
[Parallel(n_jobs=3)]: Done 672 out of 672 | elapsed: 87.6min finished


[[3929  223]
 [  68  364]]
RandomForestClassifier(class_weight='balanced_subsample', criterion='entropy',
                       max_depth=39, min_samples_leaf=5, min_samples_split=5,
                       n_estimators=1000)


In [9]:
# Random Forest over TF-IDF Representation

Forest = RandomForestClassifier()


param_search = [{'criterion': ['entropy', 'gini'], 'min_samples_split': [5], 
                 'max_depth': [i for i in range(20, 41)],
                'min_samples_leaf': [5],
                'n_estimators': [1000],
                'class_weight': ['balanced', 'balanced_subsample']}]

grid_search = GridSearchCV(Forest, param_search, cv=8, scoring='recall', n_jobs=3, verbose=2)
grid_search.fit(Xtr_tfidf, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_tfidf)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)

Fitting 8 folds for each of 84 candidates, totalling 672 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  3.6min
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed: 19.6min
[Parallel(n_jobs=3)]: Done 359 tasks      | elapsed: 45.6min
[Parallel(n_jobs=3)]: Done 642 tasks      | elapsed: 84.9min
[Parallel(n_jobs=3)]: Done 672 out of 672 | elapsed: 90.2min finished


[[3939  213]
 [  72  360]]
RandomForestClassifier(class_weight='balanced', max_depth=40,
                       min_samples_leaf=5, min_samples_split=5,
                       n_estimators=1000)


In [10]:
# SVM; must be done over TF-IDF Representation

SVM = svm.SVC()

search_area = [
  {'C': [2 ** i for i in list(range(-5,16))], 'kernel': ['linear']},
  {'C': [2 ** i for i in list(range(-5,16))], 'gamma': [2 ** i for i in list(range(-15,4))], 'kernel': ['rbf']},
 ]

grid_search = GridSearchCV(SVM, search_area, cv=8, scoring='recall', n_jobs=3)
grid_search.fit(Xtr_tfidf, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_tfidf)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)

[[4092   60]
 [  87  345]]
SVC(C=8, kernel='linear')
