# Binary Classifier

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import matplotlib as mpl
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

## Importing and Processing Data

### Davidson

In [54]:
# Dataset tag; interpolated into the report filenames written by the model cells below.
dataset = 'davidson'

In [14]:
# Load the Davidson dataset (for binary classification) and, in the same
# expression, discard the columns that are not used further down.
davidson_df = (
    pd.read_csv('Data/davidson_labeled_data.csv')
    .drop(columns=['count', 'hate_speech', 'offensive_language', 'neither'])
)

# Collapse classes 0,1,2 (hate, offensive, nonhate) into 0,1 (nonhate, hate):
# class 2 maps to 0, every other class maps to 1.
def relabel(row):
    """Return 0 when ``row['class']`` equals 2, otherwise 1."""
    if row['class'] == 2:
        return 0
    return 1

# Derive the binary target row by row, then drop the original multi-class column.
davidson_df['is_hate'] = davidson_df.apply(relabel, axis=1)
davidson_df = davidson_df.drop(columns=['class'])

print(davidson_df)

       Unnamed: 0                                              tweet  is_hate
0               0  !!! RT @mayasolovely: As a woman you shouldn't...        0
1               1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...        1
2               2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...        1
3               3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...        1
4               4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...        1
...           ...                                                ...      ...
24778       25291  you's a muthaf***in lie &#8220;@LifeAsKing: @2...        1
24779       25292  you've gone and broke the wrong heart baby, an...        0
24780       25294  young buck wanna eat!!.. dat nigguh like I ain...        1
24781       25295              youu got wild bitches tellin you lies        1
24782       25296  ~~Ruffled | Ntac Eileen Dahlia - Beautiful col...        0

[24783 rows x 3 columns]


In [18]:
# Tweet text column; this is what the TF-IDF vectorizer is fitted on.
tweets = davidson_df['tweet']

In [19]:
# Binary 0/1 target column produced by relabel; used as y in the train/test split.
labels = davidson_df['is_hate']

In [22]:
# clean up tweets
# code from Davidson: https://github.com/t-davidson/hate-speech-and-offensive-language/blob/master/classifier/final_classifier.ipynb
import re

def preprocess(text_string):
    """
    Clean a tweet's text before feature extraction.

    Accepts a text string and, in this order:
    1) collapses every run of whitespace into a single space
    2) removes URLs
    3) removes @-mentions

    URLs and mentions are deleted outright (replaced with the empty string),
    not swapped for placeholder tokens. Because whitespace is collapsed
    before the removals, the spaces on either side of a removed item are
    kept and can end up adjacent in the result.

    Parameters
    ----------
    text_string : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Raw strings keep the regex escapes (\s, \w, \(, \-) away from Python's own
    # string-escape processing, which warns on escapes it does not recognise.
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, '', parsed_text)
    parsed_text = re.sub(mention_regex, '', parsed_text)
    return parsed_text

def basic_tokenize(tweet):
    """Lower-case a tweet and split it into tokens, without stemming.

    Any run of characters other than ASCII letters and the punctuation
    marks ``. , ! ?`` acts as a separator, so those punctuation marks stay
    attached to the neighbouring letters.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    list of str
        The tokens, in order; an empty list for empty input.
    """
    # `+` (one or more), not `*`: a pattern that can match the empty string makes
    # re.split on Python 3.7+ split between every character, which would turn
    # each letter into its own token.
    tweet = " ".join(re.split(r"[^a-zA-Z.,!?]+", tweet.lower())).strip()
    return tweet.split()

## Train Models

In [52]:
# Word n-gram TF-IDF features with n from 1 to 3.
# The custom `basic_tokenize` / `preprocess` hooks defined above are not passed,
# so the vectorizer falls back to its default tokenizer and preprocessor.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    max_df=0.75,
    min_df=5,
    max_features=10000,
)

# TF-IDF feature matrix
# NOTE(review): this is fitted on every tweet before the train/test split below,
# so test-set text contributes to the vocabulary and IDF weights. Consider
# fitting on the training split only.
features = tfidf_vectorizer.fit_transform(tweets)

In [53]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
seed = 42
X, y = features, labels
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)

### Logistic Regression

In [51]:
# Fit a logistic regression with default settings and predict the held-out rows.
LR_model = LogisticRegression()
LR_model.fit(X_train, y_train)
y_preds = LR_model.predict(X_test)

# Accuracy on both splits, plus the per-class precision/recall/F1 table.
mod_train = LR_model.score(X_train, y_train)
acc = accuracy_score(y_test, y_preds)
report = classification_report(y_test, y_preds)

print(report)
print('Logistic Regression Train accuracy {:.3f}%'.format(mod_train * 100))
print('Logistic Regression Test accuracy {:.3f}%'.format(acc * 100))

# Write the same report and summary lines to a per-dataset text file.
with open(f'Models/BinaryClassifier/{dataset}_LR_model_report.txt', 'w+') as f:
    f.writelines([
        report,
        'Logistic Regression Train accuracy {:.3f}%\n'.format(mod_train * 100),
        'Logistic Regression Test accuracy {:.3f}%\n'.format(acc * 100),
    ])

              precision    recall  f1-score   support

           0       0.88      0.67      0.76       835
           1       0.94      0.98      0.96      4122

    accuracy                           0.93      4957
   macro avg       0.91      0.83      0.86      4957
weighted avg       0.93      0.93      0.93      4957

Logistic Regression Train accuracy 95.778%
Logistic Regression Test accuracy 92.899%


In [35]:
# Serialise the fitted logistic-regression model to disk with pickle.
# NOTE(review): unlike the report files, this path has no `{dataset}` prefix, so
# a model trained on another dataset would overwrite this file. Confirm intended.
with open('Models/BinaryClassifier/LR_model', 'wb+') as f:  
    pickle.dump(LR_model, f)

### Random Forest

In [48]:
# Random forest training is randomised, so pin its RNG to the notebook-wide
# `seed` (also used for the train/test split); otherwise every run produces
# slightly different metrics and the saved report cannot be reproduced.
RF_model = RandomForestClassifier(random_state=seed)
RF_model.fit(X_train, y_train)
y_preds = RF_model.predict(X_test)

# Per-class precision/recall/F1 table plus accuracy on both splits.
report = classification_report(y_test, y_preds)
acc = accuracy_score(y_test, y_preds)
mod_train = RF_model.score(X_train, y_train)

print(report)
print('Random Forest Train accuracy {:.3f}%'.format(mod_train * 100))
print('Random Forest Test accuracy {:.3f}%'.format(acc * 100))

# Write the same report and summary lines to a per-dataset text file.
with open(f'Models/BinaryClassifier/{dataset}_RF_model_report.txt', 'w+') as f:
    f.write(report)
    f.write('Random Forest Train accuracy {:.3f}%\n'.format(mod_train * 100))
    # Trailing newline added so the file ends the same way as the LR report.
    f.write('Random Forest Test accuracy {:.3f}%\n'.format(acc * 100))

              precision    recall  f1-score   support

           0       0.86      0.69      0.77       835
           1       0.94      0.98      0.96      4122

    accuracy                           0.93      4957
   macro avg       0.90      0.83      0.86      4957
weighted avg       0.93      0.93      0.93      4957

Random Forest Train accuracy 99.950%
Random Forest Test accuracy 92.919%
