In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import model_selection, svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.decomposition import IncrementalPCA

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

In [2]:
# Fit IPCA first to avoid running into memory constraints.
# IPCA had to be done over the labeled data only; the full dataset would take 24 hours to compute.
# This results in a much less accurate lower dim representation of TF-IDF data, which is done over
# the entire dataset.

chunksize = 1000
data = pd.read_csv('filtered.csv')
v = TfidfVectorizer()
v.fit(data['tweet'].dropna())

labeled_data = data[data['injury_report'] != 'x']

IPCA = IncrementalPCA(n_components=1000, batch_size=chunksize, copy=False)
count = 0
for i in range(0, labeled_data.shape[0] // chunksize):
    IPCA.partial_fit(v.transform(labeled_data['tweet'][count*chunksize:(count+1)*chunksize]).toarray())
    print((count+1)*chunksize)
    count += 1
    
IPCA.batch_size_ = 1000

  explained_variance[self.n_components_:].mean()
  ret = ret.dtype.type(ret / rcount)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000


In [4]:
# Randomly partition the data and convert it to Count, Boolean, and TF-IDF Formats.

Xtr, Xte, Ytr, Yte =  model_selection.train_test_split(labeled_data['tweet'],
                                                       labeled_data['injury_report'].astype(int),
                                                       test_size=0.3)

Xtr_tfidf = v.transform(Xtr)
Xte_tfidf = v.transform(Xte)
Xtr_pca = IPCA.transform(Xtr_tfidf)
Xte_pca = IPCA.transform(Xte_tfidf)

v = CountVectorizer(binary=True)

v.fit(data['tweet'].dropna())
Xtr_bool = v.transform(Xtr)
Xte_bool = v.transform(Xte)

v = CountVectorizer()

v.fit(data['tweet'].dropna())
Xtr_count = v.transform(Xtr)
Xte_count = v.transform(Xte)

In [5]:
# K Nearest Neighbors over TF-IDF representation

kNN = KNeighborsClassifier()
param_search = [{'weights': ['distance'], 'n_neighbors': [i for i in range(1,51)]}]

grid_search = GridSearchCV(kNN, param_search, cv=8, scoring='recall', n_jobs=3)
grid_search.fit(Xtr_tfidf, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_tfidf)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)

[[4212    4]
 [ 127  186]]
KNeighborsClassifier(n_neighbors=1, weights='distance')


In [13]:
# Bernoulli Naive Bayes; must be done over Boolean Representaion

BNB = BernoulliNB()

param_search = [{'alpha': list(np.linspace(0, 1, 10000))}]

grid_search = GridSearchCV(BNB, param_search, cv=8, scoring='recall', n_jobs=3, verbose=2)
grid_search.fit(Xtr_bool, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_bool)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)

Fitting 8 folds for each of 10000 candidates, totalling 80000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  43 tasks      | elapsed:    2.5s
[Parallel(n_jobs=3)]: Done 285 tasks      | elapsed:    8.2s
[Parallel(n_jobs=3)]: Done 691 tasks      | elapsed:   17.9s
[Parallel(n_jobs=3)]: Done 1257 tasks      | elapsed:   31.3s
[Parallel(n_jobs=3)]: Done 1987 tasks      | elapsed:   47.6s
[Parallel(n_jobs=3)]: Done 2877 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done 3931 tasks      | elapsed:  1.5min
[Parallel(n_jobs=3)]: Done 5145 tasks      | elapsed:  2.0min
[Parallel(n_jobs=3)]: Done 6523 tasks      | elapsed:  2.5min
[Parallel(n_jobs=3)]: Done 8061 tasks      | elapsed:  3.1min
[Parallel(n_jobs=3)]: Done 9763 tasks      | elapsed:  3.7min
[Parallel(n_jobs=3)]: Done 11625 tasks      | elapsed:  4.4min
[Parallel(n_jobs=3)]: Done 13651 tasks      | elapsed:  5.2min
[Parallel(n_jobs=3)]: Done 15837 tasks      | elapsed:  6.0min
[Parallel(n_jobs=3)]: Done 18187 tasks      | elapsed:  6

[[4149   67]
 [  53  260]]
BernoulliNB(alpha=0.0013001300130013002)


In [14]:
# Multinomial Naive Bayes; must be done over Count Representation

MNB = MultinomialNB()

param_search = [{'alpha': list(np.linspace(0, 1, 10000))}]

grid_search = GridSearchCV(MNB, param_search, cv=8, scoring='recall', n_jobs=3, verbose=2)
grid_search.fit(Xtr_count, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_count)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)

Fitting 8 folds for each of 10000 candidates, totalling 80000 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done 178 tasks      | elapsed:    2.1s
[Parallel(n_jobs=3)]: Done 1146 tasks      | elapsed:   13.7s
[Parallel(n_jobs=3)]: Done 2770 tasks      | elapsed:   32.8s
[Parallel(n_jobs=3)]: Done 5034 tasks      | elapsed:   59.3s
[Parallel(n_jobs=3)]: Done 7954 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done 11514 tasks      | elapsed:  2.3min
[Parallel(n_jobs=3)]: Done 15730 tasks      | elapsed:  3.2min
[Parallel(n_jobs=3)]: Done 20586 tasks      | elapsed:  4.2min
[Parallel(n_jobs=3)]: Done 26098 tasks      | elapsed:  5.3min
[Parallel(n_jobs=3)]: Done 32250 tasks      | elapsed:  6.6min
[Parallel(n_jobs=3)]: Done 39058 tasks      | elapsed:  8.1min
[Parallel(n_jobs=3)]: Done 46506 tasks      | elapsed:  9.6min
[Parallel(n_jobs=3)]: Done 54610 tasks      | elapsed: 11.3min
[Parallel(n_jobs=3)]: Done 63354 tasks      | elapsed: 13.1min
[Parallel(n_jobs=3)]: Done 72754 tasks      | ela

[[4103  113]
 [  45  268]]
MultinomialNB(alpha=0.002000200020002)


[Parallel(n_jobs=3)]: Done 80000 out of 80000 | elapsed: 16.6min finished


In [15]:
# Logistic Regression over TF-IDF Representation

LReg = LogisticRegression()

param_search = [{'penalty': ['l1', 'l2'],
                'C': [2 ** i for i in range(-5, 16)],
                'class_weight': ['balanced'],
                'solver': ['liblinear']}]

grid_search = GridSearchCV(LReg, param_search, cv=8, scoring='recall', n_jobs=3, verbose=2)
grid_search.fit(Xtr_tfidf, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_tfidf)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)

Fitting 8 folds for each of 42 candidates, totalling 336 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:    8.4s
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:   47.6s
[Parallel(n_jobs=3)]: Done 336 out of 336 | elapsed:  2.4min finished


[[4098  118]
 [  36  277]]
LogisticRegression(C=1, class_weight='balanced', solver='liblinear')


In [18]:
# Random Forest over Boolean Representation

Forest = RandomForestClassifier()


param_search = [{'criterion': ['entropy', 'gini'], 'min_samples_split': [5], 
                 'max_depth': [i for i in range(20, 41)],
                'min_samples_leaf': [5],
                'n_estimators': [1000],
                'class_weight': ['balanced', 'balanced_subsample']}]

grid_search = GridSearchCV(Forest, param_search, cv=8, scoring='recall', n_jobs=3, verbose=2)
grid_search.fit(Xtr_bool, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_bool)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)

Fitting 8 folds for each of 84 candidates, totalling 672 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  3.6min
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed: 17.4min
[Parallel(n_jobs=3)]: Done 359 tasks      | elapsed: 39.9min
[Parallel(n_jobs=3)]: Done 642 tasks      | elapsed: 73.9min
[Parallel(n_jobs=3)]: Done 672 out of 672 | elapsed: 77.7min finished


[[4052  164]
 [  70  243]]
RandomForestClassifier(class_weight='balanced_subsample', max_depth=39,
                       min_samples_leaf=5, min_samples_split=5,
                       n_estimators=1000)


In [19]:
# Random Forest over TF-IDF Representation

Forest = RandomForestClassifier()


param_search = [{'criterion': ['entropy', 'gini'], 'min_samples_split': [5], 
                 'max_depth': [i for i in range(20, 41)],
                'min_samples_leaf': [5],
                'n_estimators': [1000],
                'class_weight': ['balanced', 'balanced_subsample']}]

grid_search = GridSearchCV(Forest, param_search, cv=8, scoring='recall', n_jobs=3, verbose=2)
grid_search.fit(Xtr_tfidf, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_tfidf)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)

Fitting 8 folds for each of 84 candidates, totalling 672 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:  3.5min
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed: 17.1min
[Parallel(n_jobs=3)]: Done 359 tasks      | elapsed: 39.1min
[Parallel(n_jobs=3)]: Done 642 tasks      | elapsed: 73.1min
[Parallel(n_jobs=3)]: Done 672 out of 672 | elapsed: 77.0min finished


[[4099  117]
 [  74  239]]
RandomForestClassifier(class_weight='balanced', max_depth=40,
                       min_samples_leaf=5, min_samples_split=5,
                       n_estimators=1000)


In [20]:
# SVM; must be done over TF-IDF Representation

SVM = svm.SVC()

search_area = [
  {'C': [2 ** i for i in list(range(-5,16))], 'kernel': ['linear']},
  {'C': [2 ** i for i in list(range(-5,16))], 'gamma': [2 ** i for i in list(range(-15,4))], 'kernel': ['rbf']},
 ]

grid_search = GridSearchCV(SVM, search_area, cv=8, scoring='recall', n_jobs=3)
grid_search.fit(Xtr_tfidf, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_tfidf)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)

[[4190   26]
 [  57  256]]
SVC(C=256, kernel='linear')


In [21]:
# K Nearest Neighbors over TF-IDF Representaion with IPCA

kNN = KNeighborsClassifier()
param_search = [{'weights': ['distance'], 'n_neighbors': [i for i in range(1,51)]}]

grid_search = GridSearchCV(kNN, param_search, cv=8, scoring='recall', n_jobs=3)
grid_search.fit(Xtr_pca, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_pca)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)

[[4180   36]
 [  93  220]]
KNeighborsClassifier(n_neighbors=1, weights='distance')


In [None]:
# Logistic Regression over TF-IDF Representation with IPCA

LReg = LogisticRegression()

param_search = [{'penalty': ['l1', 'l2'],
                'C': [2 ** i for i in range(-5, 16)],
                'class_weight': ['balanced'],
                'solver': ['liblinear']}]

grid_search = GridSearchCV(LReg, param_search, cv=8, scoring='recall', n_jobs=3, verbose=2)
grid_search.fit(Xtr_pca, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_pca)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)

Fitting 8 folds for each of 42 candidates, totalling 336 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:    8.4s
[Parallel(n_jobs=3)]: Done 156 tasks      | elapsed:  1.3min


In [None]:
# Random Forest over TF-IDF Representation with IPCA

Forest = RandomForestClassifier()


param_search = [{'criterion': ['entropy', 'gini'], 'min_samples_split': [5], 
                 'max_depth': [i for i in range(5, 31)],
                'min_samples_leaf': [5],
                'n_estimators': [1000],
                'class_weight': ['balanced', 'balanced_subsample']}]

grid_search = GridSearchCV(Forest, param_search, cv=8, scoring='recall', n_jobs=3, verbose=2)
grid_search.fit(Xtr_pca, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_pca)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)

In [None]:
# SVM over TF-IDF Representation with IPCA

SVM = svm.SVC()

search_area = [
  {'C': [2 ** i for i in list(range(-5,16))], 'kernel': ['linear']},
  {'C': [2 ** i for i in list(range(-5,16))], 'gamma': [2 ** i for i in list(range(-15,4))], 'kernel': ['rbf']},
 ]

grid_search = GridSearchCV(SVM, search_area, cv=8, scoring='recall', verbose=2, n_jobs=3)
grid_search.fit(Xtr_pca, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_pca)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)