In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import model_selection, svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from joblib import dump, load

In [16]:
# Randomly partition the data and convert it to Count, Boolean, and TF-IDF Formats.
# PCA has been removed due to its rather intense computational constraints. :(
# Data has been stratified here due to severe class imbalance.

#data = pd.read_csv('filtered.csv')
data = pd.read_csv('clean.csv')
#labeled_data = data[data['injury_report'] != 'x']

#Xtr, Xte, Ytr, Yte =  model_selection.train_test_split(labeled_data['tweet'],
#                                                       labeled_data['injury_report'].astype(int),
#                                                       stratify=labeled_data['injury_report'],
#                                                       test_size=0.3)

url = 'https://raw.githubusercontent.com/Jhagrut/Twitter-Project/main/Data/09-09/train%2009-09.csv'
train = pd.read_csv(url)

url = 'https://raw.githubusercontent.com/Jhagrut/Twitter-Project/main/Data/09-09/test%2009-09.csv'
test = pd.read_csv(url)


v = TfidfVectorizer()
v.fit(data['clean'].dropna())

Xtr_tfidf = v.transform(Xtr)
Xte_tfidf = v.transform(Xte)

#dump(v, 'tfidf.joblib')

v = CountVectorizer(binary=True)

v.fit(data['clean'].dropna())
Xtr_bool = v.transform(Xtr)
Xte_bool = v.transform(Xte)

#dump(v, 'bool.joblib')

v = CountVectorizer()

v.fit(data['clean'].dropna())
Xtr_count = v.transform(Xtr)
Xte_count = v.transform(Xte)

#dump(v, 'count.joblib')

In [19]:
# K Nearest Neighbors over Boolean representation

kNN = KNeighborsClassifier()
param_search = [{'weights': ['distance'], 'n_neighbors': [i for i in range(1,20)]}]

grid_search = GridSearchCV(kNN, param_search, cv=8, scoring='recall', n_jobs=2, verbose=2)
grid_search.fit(Xtr_bool, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_bool)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)
dump(grid_search.best_estimator_, 'kNN.joblib')

Fitting 8 folds for each of 19 candidates, totalling 152 fits
[[7422   27]
 [ 776  292]]
KNeighborsClassifier(n_neighbors=1, weights='distance')


['kNN.joblib']

In [20]:
# K Nearest Neighbors over TF-IDF representation

kNN = KNeighborsClassifier()
param_search = [{'weights': ['distance'], 'n_neighbors': [i for i in range(1,20)]}]

grid_search = GridSearchCV(kNN, param_search, cv=8, scoring='recall', n_jobs=2, verbose=2)
grid_search.fit(Xtr_tfidf, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_tfidf)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)
dump(grid_search.best_estimator_, 'kNN.joblib')

Fitting 8 folds for each of 19 candidates, totalling 152 fits
[[7406   43]
 [ 859  209]]
KNeighborsClassifier(n_neighbors=1, weights='distance')


['kNN.joblib']

In [21]:
# Bernoulli Naive Bayes; must be done over Boolean Representaion

BNB = BernoulliNB()

param_search = [{'alpha': list(np.linspace(0, 0.1, 1000))}]

grid_search = GridSearchCV(BNB, param_search, cv=8, scoring='recall', n_jobs=2, verbose=2)
grid_search.fit(Xtr_bool, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_bool)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)
dump(grid_search.best_estimator_, 'bernoulliNB.joblib')

Fitting 8 folds for each of 1000 candidates, totalling 8000 fits
[[7066  383]
 [ 148  920]]
BernoulliNB(alpha=0.07707707707707707)


['bernoulliNB.joblib']

In [22]:
# Multinomial Naive Bayes; must be done over Count Representation

MNB = MultinomialNB()

param_search = [{'alpha': list(np.linspace(0, 0.1, 1000))}]

grid_search = GridSearchCV(MNB, param_search, cv=8, scoring='recall', n_jobs=2, verbose=2)
grid_search.fit(Xtr_count, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_count)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)
dump(grid_search.best_estimator_, 'multinomialNB.joblib')

Fitting 8 folds for each of 1000 candidates, totalling 8000 fits
[[6959  490]
 [ 148  920]]
MultinomialNB(alpha=0.095995995995996)


['multinomialNB.joblib']

In [23]:
# Logistic Regression over TF-IDF Representation

LReg = LogisticRegression()

param_search = [{'penalty': ['l1', 'l2'],
                'C': [2 ** i for i in range(-5, 16)],
                'class_weight': ['balanced'],
                'solver': ['liblinear']}]

grid_search = GridSearchCV(LReg, param_search, cv=8, scoring='recall', n_jobs=2, verbose=2)
grid_search.fit(Xtr_tfidf, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_tfidf)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)
dump(grid_search.best_estimator_, 'logistic_regression.joblib')

Fitting 8 folds for each of 42 candidates, totalling 336 fits
[[7296  153]
 [  67 1001]]
LogisticRegression(C=2, class_weight='balanced', penalty='l1',
                   solver='liblinear')


['logistic_regression.joblib']

In [27]:
# Random Forest over Boolean Representation

Forest = RandomForestClassifier()

param_search = [{'criterion': ['entropy', 'gini'],
                 'max_depth': [i for i in range(20, 41)],
                'n_estimators': [100],
                'class_weight': ['balanced', 'balanced_subsample']}]

grid_search = GridSearchCV(Forest, param_search, cv=8, scoring='recall', n_jobs=2, verbose=2)
grid_search.fit(Xtr_bool, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_bool)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)
dump(grid_search.best_estimator_, 'random_forest_bool.joblib')

Fitting 8 folds for each of 84 candidates, totalling 672 fits
[[7240  209]
 [ 253  815]]
RandomForestClassifier(class_weight='balanced', max_depth=21)


['random_forest_bool.joblib']

In [28]:
# Random Forest over TF-IDF Representation

Forest = RandomForestClassifier()


param_search = [{'criterion': ['entropy', 'gini'], 'min_samples_split': [5], 
                 'max_depth': [i for i in range(20, 41)],
                'min_samples_leaf': [5],
                'n_estimators': [100],
                'class_weight': ['balanced', 'balanced_subsample']}]

grid_search = GridSearchCV(Forest, param_search, cv=8, scoring='recall', n_jobs=2, verbose=2)
grid_search.fit(Xtr_tfidf, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_tfidf)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)
dump(grid_search.best_estimator_, 'random_forest_tfidf.joblib')

Fitting 8 folds for each of 84 candidates, totalling 672 fits
[[7091  358]
 [ 160  908]]
RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=37, min_samples_leaf=5, min_samples_split=5)


['random_forest_tfidf.joblib']

In [29]:
# SVM; must be done over TF-IDF Representation

SVM = svm.SVC()

search_area = [
  {'C': [2 ** i for i in list(range(-5,16))], 'kernel': ['linear']},
  #{'C': [2 ** i for i in list(range(-5,16))], 'gamma': [2 ** i for i in list(range(-15,4))], 'kernel': ['rbf']},
  # Removed the rbf kernel because it was computationally expensive and didn't perform better than linear.
  # Perhaps that would change with more data.
 ]

grid_search = GridSearchCV(SVM, search_area, cv=8, scoring='recall', n_jobs=2, verbose=2)
grid_search.fit(Xtr_tfidf, Ytr)

y_hat = grid_search.best_estimator_.predict(Xte_tfidf)

print(confusion_matrix(Yte, y_hat))
print(grid_search.best_estimator_)
dump(grid_search.best_estimator_, 'svm.joblib')

Fitting 8 folds for each of 21 candidates, totalling 168 fits
[[7381   68]
 [ 143  925]]
SVC(C=4, kernel='linear')


['svm.joblib']