In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Subsampling and Splitting Data

In [2]:
# import data
df = pd.read_csv('data/clean_data.csv', lineterminator='\n')
print(f'There are currently {len(df)} points in our data set.')
df.head()

There are currently 120542 points in our data set.


Unnamed: 0,label,tweet,clean_tweet
0,0,Alex Brosas another idiot #ALDUBKSGoesToUS ht...,alex brosas idiot aldubksgoestous
1,0,"RT @ItIzBiz: as Nancy Reagan would say, 'just ...",nancy reagan fucking like
2,0,RT @MailOnline: The Nazi death gas so horrific...,nazi death gas horrific hitler fear
3,1,I hate er chase because if the Bitch that work...,hate er chase bitch work literally evil
4,0,RT @chevleia: don't hmu when u get tired of ur...,hmu tired ur bore hoe ur bore


In [3]:
from sklearn.model_selection import train_test_split

# Split data into features and target label 
X = df.clean_tweet
y = df.label

# Get smaller subset of the data
# NOTE: Since the classes are imbalanced, we use a stratified random split 
X_small, X_big, y_small, y_big = train_test_split(X, y, stratify=y, test_size=0.75)
print(f'Our subsample has {len(X_small)} data points.')

# Split subsample into training and test sets. 
# Once again, we use a stratified split. 
X_train, X_test, y_train, y_test = train_test_split(X_small, y_small, stratify=y_small, test_size=0.2)
print(f'The training split of the subsample has {len(X_train)} data points.')
print(f'The test split of the subsample has {len(X_test)} data points.')

Our subsample has 30135 data points.
The training split of the subsample has 24108 data points.
The test split of the subsample has 6027 data points.


## Model Selection

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [5]:
# List of potential models to try out with different parameters
model_list = [
    (LogisticRegression(solver='saga'), {
        'model__penalty' : ['l1', 'l2', 'none'],
        'model__C' : [0.01, 0.1, 1],
        'model__class_weight' : ['balanced', None],      
    }),
    (MultinomialNB(), {
        'model__fit_prior': [True, False],
        'model__alpha' : [0, 0.01, 0.1, 0.5, 0.8, 1]
    }), 
    (RandomForestClassifier(), {
        'model__class_weight' : ['balanced', None],
        'model__n_estimators' : [10, 100, 1000],
        'model__ccp_alpha' : [0, 0.01, 0.1]
    }), 
    (XGBClassifier(eval_metric='logloss', use_label_encoder=False), {
        'model__scale_pos_weight' : [1, 10],
        'model__max_depth' : [2, 6, 10],
        'model__eta' : [0.01, 0.3, 0.6]
    })
]

# Do a grid search cross validation over the parameter grid for each model, and print the results 
for model in model_list:
    
    pipe = Pipeline([('bow', CountVectorizer(min_df=2)),
                     ('tfidf', TfidfTransformer()),
                     ('model', model[0])])
    param_grid = {
        'bow__ngram_range': [(1, 1), (1, 2), (2, 2)],
        'tfidf__use_idf': [True, False],
        'tfidf__norm': ['l1', 'l2'],
    }
    param_grid.update(model[1])
    
    clf = GridSearchCV(pipe, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    print(str(model[0]).split('(')[0])
    print(clf.best_params_)
    print(classification_report(y_test, y_pred, zero_division=0))
    print()

  "Setting penalty='none' will ignore the C and l1_ratio "


LogisticRegression
{'bow__ngram_range': (1, 2), 'model__C': 0.01, 'model__class_weight': None, 'model__penalty': 'none', 'tfidf__norm': 'l1', 'tfidf__use_idf': False}
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      5613
           1       0.50      0.32      0.39       414

    accuracy                           0.93      6027
   macro avg       0.73      0.65      0.68      6027
weighted avg       0.92      0.93      0.92      6027


MultinomialNB
{'bow__ngram_range': (1, 2), 'model__alpha': 0.5, 'model__fit_prior': False, 'tfidf__norm': 'l2', 'tfidf__use_idf': False}
              precision    recall  f1-score   support

           0       0.95      0.96      0.96      5613
           1       0.42      0.38      0.40       414

    accuracy                           0.92      6027
   macro avg       0.69      0.67      0.68      6027
weighted avg       0.92      0.92      0.92      6027


RandomForestClassifier
{'bow__ngram_rang