In [49]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Subsampling and Splitting Data

In [2]:
# import data
df = pd.read_csv('data/clean_data.csv', lineterminator='\n')
print(f'There are currently {len(df)} points in our data set.')
df.head()

There are currently 120542 points in our data set.


Unnamed: 0,label,tweet,clean_tweet
0,0,Alex Brosas another idiot #ALDUBKSGoesToUS ht...,alex brosas idiot aldubksgoestous
1,0,"RT @ItIzBiz: as Nancy Reagan would say, 'just ...",nancy reagan fucking like
2,0,RT @MailOnline: The Nazi death gas so horrific...,nazi death gas horrific hitler fear
3,1,I hate er chase because if the Bitch that work...,hate er chase bitch work literally evil
4,0,RT @chevleia: don't hmu when u get tired of ur...,hmu tired ur bore hoe ur bore


In [3]:
from sklearn.model_selection import train_test_split

# Split data into features and target label 
X = df.clean_tweet
y = df.label

# Get smaller subset of the data
# NOTE: Since the classes are imbalanced, we use a stratified random split 
X_small, X_big, y_small, y_big = train_test_split(X, y, stratify=y, test_size=0.75)
print(f'Our subsample has {len(X_small)} data points.')

# Split subsample into training and test sets. 
# Once again, we use a stratified split. 
X_train, X_test, y_train, y_test = train_test_split(X_small, y_small, stratify=y_small, test_size=0.2)
print(f'The training split of the subsample has {len(X_train)} data points.')
print(f'The test split of the subsample has {len(X_test)} data points.')

Our subsample has 30135 data points.
The training split of the subsample has 24108 data points.
The test split of the subsample has 6027 data points.


## Model Selection

In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [5]:
# List of potential models to try out with different parameters
model_list = [
    (LogisticRegression(solver='saga'), {
        'model__penalty' : ['l1', 'l2', 'none'],
        'model__C' : [0.01, 0.1, 1],
        'model__class_weight' : ['balanced', None],      
    }),
    (MultinomialNB(), {
        'model__fit_prior': [True, False],
        'model__alpha' : [0, 0.01, 0.1, 0.5, 0.8, 1]
    }), 
    (RandomForestClassifier(), {
        'model__class_weight' : ['balanced', None],
        'model__n_estimators' : [10, 100, 1000],
        'model__ccp_alpha' : [0, 0.01, 0.1]
    }), 
    (XGBClassifier(eval_metric='logloss', use_label_encoder=False), {
        'model__scale_pos_weight' : [1, 10],
        'model__max_depth' : [2, 6, 10],
        'model__eta' : [0.01, 0.3, 0.6]
    })
]

# Do a grid search cross validation over the parameter grid for each model, and print the results 
for model in model_list:
    
    pipe = Pipeline([('bow', CountVectorizer(min_df=2)),
                     ('tfidf', TfidfTransformer()),
                     ('model', model[0])])
    param_grid = {
        'bow__ngram_range': [(1, 1), (1, 2), (2, 2)],
        'tfidf__use_idf': [True, False],
        'tfidf__norm': ['l1', 'l2'],
    }
    param_grid.update(model[1])
    
    clf = GridSearchCV(pipe, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    print(str(model[0]).split('(')[0])
    print(clf.best_params_)
    print(classification_report(y_test, y_pred, zero_division=0))
    print()

  "Setting penalty='none' will ignore the C and l1_ratio "


LogisticRegression
{'bow__ngram_range': (1, 2), 'model__C': 0.01, 'model__class_weight': None, 'model__penalty': 'none', 'tfidf__norm': 'l1', 'tfidf__use_idf': False}
              precision    recall  f1-score   support

           0       0.95      0.98      0.96      5613
           1       0.50      0.32      0.39       414

    accuracy                           0.93      6027
   macro avg       0.73      0.65      0.68      6027
weighted avg       0.92      0.93      0.92      6027


MultinomialNB
{'bow__ngram_range': (1, 2), 'model__alpha': 0.5, 'model__fit_prior': False, 'tfidf__norm': 'l2', 'tfidf__use_idf': False}
              precision    recall  f1-score   support

           0       0.95      0.96      0.96      5613
           1       0.42      0.38      0.40       414

    accuracy                           0.92      6027
   macro avg       0.69      0.67      0.68      6027
weighted avg       0.92      0.92      0.92      6027


RandomForestClassifier
{'bow__ngram_rang

## Model Prototype

All of the models have fairly similar performance. Some have slightly higher precision or recall or accuracy, so it depends on what we're going for. In this case, I would like to use the Logistic Regression model, as it requires far less computational overhead to scale to the rest of the data. 

Now, let's go ahead and use 50% of the data to train a Logistic Regression prototype model, and let's perform hyperparameter optimization over a larger space with randomized search cross validation.

In [45]:
# Calculate how many unseen rows to pull out of X_big to scale X_small up to 50% of total data 
sample_size = ((len(X_small) + len(X_big)) // 2) - len(X_small)
X_new = X_big.sample(sample_size).sort_index()
y_new = y_big[y_big.index.isin(X_new.index)].sort_index()

# Append new rows to X_small and y_small
X = X_small.append(X_new)
y = y_small.append(y_new)

# Get remaining unseen rows
X_unseen = X_big[~X_big.index.isin(X_new.index)].sort_index()
y_unseen = y_big[~y_big.index.isin(y_new.index)].sort_index()

In [127]:
# Split subsample into training and test sets. 
# Once again, we use a stratified split. 
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)
print(f'The training split of the subsample has {len(X_train)} data points.')
print(f'The test split of the subsample has {len(X_test)} data points.')

The training split of the subsample has 48216 data points.
The test split of the subsample has 12055 data points.


In [128]:
# Train using RandomizedSearchCV 
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV

pipe = Pipeline([('bow', CountVectorizer(min_df=2)),
                 ('tfidf', TfidfTransformer()),
                 ('model', LogisticRegression())])
                 
distributions = {
    'bow__ngram_range' : [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf' : [True, False],
    'tfidf__norm' : ['l1', 'l2'],
    'model__solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'model__penalty' : ['l1', 'l2', 'none'],
    'model__C' : uniform(loc=0, scale=10),
    'model__max_iter': range(100, 1000),
    'model__class_weight' : ['balanced', None]
}

clf = RandomizedSearchCV(pipe, distributions, cv=5, scoring='f1_macro', n_jobs=-1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Score model on test set and print best parameters 
print('Logistic Regression')
print(clf.best_params_)
print(classification_report(y_test, y_pred, zero_division=0))
print()

 0.70866823 0.67846125 0.69348996 0.70051624]


Logistic Regression
{'bow__ngram_range': (1, 2), 'model__C': 5.974598940642741, 'model__class_weight': None, 'model__max_iter': 885, 'model__penalty': 'l1', 'model__solver': 'liblinear', 'tfidf__norm': 'l2', 'tfidf__use_idf': True}
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     11233
           1       0.60      0.41      0.48       822

    accuracy                           0.94     12055
   macro avg       0.78      0.69      0.73     12055
weighted avg       0.93      0.94      0.94     12055




Next, we'll try both oversampling and undersampling to see if it improves model performance. First, let's start with oversampling. 

In [136]:
from imblearn.over_sampling import RandomOverSampler

# Reformat training data
X_train = X_train.values.reshape(-1,1)
y_train = y_train.values.reshape(-1,1)

# Oversample training data 
oversampler = RandomOverSampler()
X_train_over, y_train_over = oversampler.fit_resample(X_train, y_train)

# Reformat oversampled data
X_train_over = pd.Series(X_train_over.squeeze())
y_train_over = pd.Series(y_train_over.squeeze())

clf_over = RandomizedSearchCV(pipe, distributions, cv=5, scoring='f1_macro', n_jobs=-1)
clf_over.fit(X_train_over, y_train_over)
y_pred = clf_over.predict(X_test)

# Score model on test set and print best parameters 
print('Logistic Regression (Oversampling)')
print(clf_over.best_params_)
print(classification_report(y_test, y_pred, zero_division=0))
print()

        nan 0.98998323 0.97053069 0.9743018 ]
  "Setting penalty='none' will ignore the C and l1_ratio "


Logistic Regression (Oversampling)
{'bow__ngram_range': (1, 2), 'model__C': 4.088818197618195, 'model__class_weight': None, 'model__max_iter': 751, 'model__penalty': 'none', 'model__solver': 'newton-cg', 'tfidf__norm': 'l1', 'tfidf__use_idf': True}
              precision    recall  f1-score   support

           0       0.95      0.98      0.97     11233
           1       0.60      0.32      0.41       822

    accuracy                           0.94     12055
   macro avg       0.78      0.65      0.69     12055
weighted avg       0.93      0.94      0.93     12055




Now, let's try undersampling.

In [137]:
from imblearn.under_sampling import RandomUnderSampler

# Oversample training data 
undersampler = RandomUnderSampler()
X_train_under, y_train_under = undersampler.fit_resample(X_train, y_train)

# Reformat undersampled
X_train_under = pd.Series(X_train_under.squeeze())
y_train_under = pd.Series(y_train_under.squeeze())

clf_under = RandomizedSearchCV(pipe, distributions, cv=5, scoring='f1_macro', n_jobs=-1)
clf_under.fit(X_train_under, y_train_under)
y_pred = clf_under.predict(X_test)

# Score model on test set and print best parameters 
print('Logistic Regression (Undersampling)')
print(clf_under.best_params_)
print(classification_report(y_test, y_pred, zero_division=0))
print()

Logistic Regression (Undersampling)
{'bow__ngram_range': (1, 1), 'model__C': 1.8213657528887306, 'model__class_weight': 'balanced', 'model__max_iter': 833, 'model__penalty': 'l2', 'model__solver': 'saga', 'tfidf__norm': 'l2', 'tfidf__use_idf': False}
              precision    recall  f1-score   support

           0       0.98      0.81      0.89     11233
           1       0.23      0.78      0.36       822

    accuracy                           0.81     12055
   macro avg       0.61      0.80      0.62     12055
weighted avg       0.93      0.81      0.85     12055




 0.70952062 0.70690382 0.55424465        nan]


Looks like the best method is to not use any resampling at all! Let's explore what happens when we change the decision threshold of our prototype.

In [147]:
y_pred = clf.predict_proba(X_test)[:,1] > 0.5
print('Threshold 0.5')
print(classification_report(y_test, y_pred, zero_division=0))
print()

y_pred = clf.predict_proba(X_test)[:,1] > 0.9
print('Threshold 0.9')
print(classification_report(y_test, y_pred, zero_division=0))
print()

y_pred = clf.predict_proba(X_test)[:,1] > 0.1
print('Threshold 0.1')
print(classification_report(y_test, y_pred, zero_division=0))
print()

Threshold 0.5
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     11233
           1       0.60      0.41      0.48       822

    accuracy                           0.94     12055
   macro avg       0.78      0.69      0.73     12055
weighted avg       0.93      0.94      0.94     12055


Threshold 0.9
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     11233
           1       0.81      0.25      0.38       822

    accuracy                           0.94     12055
   macro avg       0.88      0.62      0.68     12055
weighted avg       0.94      0.94      0.93     12055


Threshold 0.1
              precision    recall  f1-score   support

           0       0.97      0.91      0.94     11233
           1       0.34      0.63      0.44       822

    accuracy                           0.89     12055
   macro avg       0.65      0.77      0.69     12055
weighted avg       0.93      0

Based on these metrics, may want to set a high threshold (like 0.9) to get greater precision, even if the recall decreases. This is because we really want to avoid a false positive (i.e. accusing someone of hate speech when there isn't any). With a threshold of 0.9, our model identified a fourth of all hate tweets in the test set with 81% precision! In the final application, we may just return the probabilistic predictions anyway. 

## Scaling the Model

Now, we'll train the model on 75% of the entire dataset, and evaluate it on the remaining 25%. Afterwards, we'll train the model on the entire test set so it's ready for deployment. 

In [151]:
# Pull from unseen data to create test set that uses 25% of all data 
test_size = len(X_unseen) // 2
X_test = X_unseen.sample(test_size).sort_index()
y_test = y_unseen[y_unseen.index.isin(X_test.index)].sort_index()

# Get remaining unseen data 
X_unseen = X_unseen[~X_unseen.index.isin(X_test.index)].sort_index()
y_unseen = y_unseen[~y_unseen.index.isin(y_test.index)].sort_index()

# Append unseen rows to X & y to create training sets
X_train = X.append(X_unseen)
y_train = y.append(y_unseen)

In [152]:
print(f'The training split of the subsample has {len(X_train)} data points.')
print(f'The test split of the subsample has {len(X_test)} data points.')

The training split of the subsample has 90407 data points.
The test split of the subsample has 30135 data points.


In [157]:
# Scale the model by training on 75% of all data 
clf = RandomizedSearchCV(pipe, distributions, cv=5, scoring='f1_macro', n_jobs=-1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Score model on test set and print best parameters 
print('Logistic Regression (Scaled)')
print(clf.best_params_)
print(classification_report(y_test, y_pred, zero_division=0))
print()

 0.56118615 0.62405842 0.55578837 0.62492922]


Logistic Regression (Scaled)
{'bow__ngram_range': (1, 2), 'model__C': 3.594267903050815, 'model__class_weight': None, 'model__max_iter': 988, 'model__penalty': 'l2', 'model__solver': 'saga', 'tfidf__norm': 'l2', 'tfidf__use_idf': True}
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     28065
           1       0.77      0.34      0.47      2070

    accuracy                           0.95     30135
   macro avg       0.86      0.67      0.72     30135
weighted avg       0.94      0.95      0.94     30135




Now, to prepare the model for deployment, we train it using all of the data. 

In [166]:
# Combine train and test sets
X = X_train.append(X_test)
y = y_train.append(y_test)

# Train model on all the data
clf = RandomizedSearchCV(pipe, distributions, cv=5, scoring='f1_macro', n_jobs=-1)
clf.fit(X, y);

# Print final parameters
print('Logistic Regression (All Data)')
print(clf.best_params_)

 0.54365524        nan        nan 0.72357878]


Logistic Regression (All Data)
{'bow__ngram_range': (1, 1), 'model__C': 9.076851334818326, 'model__class_weight': None, 'model__max_iter': 297, 'model__penalty': 'l1', 'model__solver': 'saga', 'tfidf__norm': 'l2', 'tfidf__use_idf': False}




## Model Persistence

In [171]:
# Pickle the model
import pickle
pickle.dump(clf, open('model.pickle', 'wb'))