In [1]:
import pandas as pd
import numpy as np
import logging
from pprint import pprint
from time import time
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
# Relative locations of the train and test CSV files.
trainPath, testPath = (
    '../data/hateval2019_en_train_clean.csv',
    '../data/hateval2019_en_test_clean.csv',
)

# Read each split into its own DataFrame.
trainSet = pd.read_csv(trainPath)
testSet = pd.read_csv(testPath)

In [3]:
# Pull the `text` column out of each split.
trainText, testText = trainSet['text'], testSet['text']

In [4]:
def classify(x, y, x_test, y_test, random_state=42):
    """Fit an SGDClassifier on (x, y) and print a report for (x_test, y_test).

    Parameters
    ----------
    x : feature matrix passed to ``SGDClassifier.fit``.
    y : training labels passed to ``SGDClassifier.fit``.
    x_test : feature matrix passed to ``SGDClassifier.predict``.
    y_test : true labels the predictions are compared against.
    random_state : int or None, default 42
        Seed forwarded to ``SGDClassifier``. A fixed seed makes the printed
        metrics repeatable from run to run; pass ``None`` to get the
        previous unseeded behaviour.

    Returns
    -------
    None. The ``classification_report`` for labels 0 and 1 is printed.
    """
    # SGD shuffles the training data each epoch, so without a seed the
    # reported scores drift between otherwise identical runs.
    clf = SGDClassifier(random_state=random_state)
    clf.fit(x, y)

    y_pred = clf.predict(x_test)

    print(classification_report(y_test, y_pred, labels=[0, 1]))

In [22]:
# Experiment 1: unigram term counts, English stop words removed,
# min_df=2 and max_df=0.25.
vect = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    min_df=2,
    max_df=0.25,
)

# Learn the vocabulary on the training text only, then reuse it for the test text.
x_train_dtm = vect.fit_transform(trainSet['text'])
x_test_dtm = vect.transform(testSet['text'])

classify(x_train_dtm, trainSet['HS'], x_test_dtm, testSet['HS'])

              precision    recall  f1-score   support

           0       0.74      0.23      0.35      1740
           1       0.45      0.89      0.60      1260

    accuracy                           0.51      3000
   macro avg       0.60      0.56      0.47      3000
weighted avg       0.62      0.51      0.45      3000



In [23]:
# Experiment 2: same setup as the count-based unigram run, with max_df=0.5.
vect = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    min_df=2,
    max_df=0.5,
)

# Learn the vocabulary on the training text only, then reuse it for the test text.
x_train_dtm = vect.fit_transform(trainSet['text'])
x_test_dtm = vect.transform(testSet['text'])

classify(x_train_dtm, trainSet['HS'], x_test_dtm, testSet['HS'])

              precision    recall  f1-score   support

           0       0.75      0.21      0.32      1740
           1       0.45      0.91      0.60      1260

    accuracy                           0.50      3000
   macro avg       0.60      0.56      0.46      3000
weighted avg       0.63      0.50      0.44      3000



In [24]:
# Experiment 3: count-based unigram features with max_df=0.75.
vect = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    min_df=2,
    max_df=0.75,
)

# Learn the vocabulary on the training text only, then reuse it for the test text.
x_train_dtm = vect.fit_transform(trainSet['text'])
x_test_dtm = vect.transform(testSet['text'])

classify(x_train_dtm, trainSet['HS'], x_test_dtm, testSet['HS'])

              precision    recall  f1-score   support

           0       0.76      0.19      0.31      1740
           1       0.45      0.91      0.60      1260

    accuracy                           0.50      3000
   macro avg       0.60      0.55      0.46      3000
weighted avg       0.63      0.50      0.43      3000



In [21]:
# Experiment 4: TF-IDF weighted unigram features (instead of raw counts),
# English stop words removed, min_df=2 and max_df=0.75.
vect = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    min_df=2,
    max_df=0.75,
)

# Learn vocabulary and IDF weights on the training text only, then reuse them for the test text.
x_train_dtm = vect.fit_transform(trainSet['text'])
x_test_dtm = vect.transform(testSet['text'])

classify(x_train_dtm, trainSet['HS'], x_test_dtm, testSet['HS'])

              precision    recall  f1-score   support

           0       0.79      0.16      0.27      1740
           1       0.45      0.94      0.61      1260

    accuracy                           0.49      3000
   macro avg       0.62      0.55      0.44      3000
weighted avg       0.65      0.49      0.41      3000

