# Picking Out the Best Classification Model for the Task

In [1]:
import pandas as pd
import numpy as np
import logging
from pprint import pprint
from time import time
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
trainPath = '../data/hateval2019_en_train_clean.csv'
testPath = '../data/hateval2019_en_test_clean.csv'

trainSet = pd.read_csv(trainPath)
testSet = pd.read_csv(testPath)
hateSet = trainSet[trainSet['HS']==1]

classifiers = ['Multinomial NB', 'Bernoulli NB', 'Gaussian NB', 'Logistic Regression ', 'Stochastic Gradient Descent', 'Support Vector Machine', 'Random Forest', 'Gradient Boosting']

In [3]:
def classifying(x, x_test, y, y_test):
    lr = LogisticRegression(max_iter=10000)
    sgd = SGDClassifier()
    multi_nb = MultinomialNB()
    gaussian_nb = GaussianNB()
    bernoulli_nb = BernoulliNB()
    svm = SVC()
    rf = RandomForestClassifier(n_estimators = 100)
    gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0)

    %time multi_nb.fit(x, y)
    %time bernoulli_nb.fit(x, y)
    %time gaussian_nb.fit(x.toarray(), y)
    %time lr.fit(x, y)
    %time sgd.fit(x, y)
    %time svm.fit(x, y)
    %time rf.fit(x, y)
    %time gb.fit(x, y)
    
    accuracy_array = predicting(x_test, y_test, multi_nb, bernoulli_nb, gaussian_nb, lr, sgd, svm, rf, gb)
    f1_array = metric_f1(x_test, y_test, multi_nb, bernoulli_nb, gaussian_nb, lr, sgd, svm, rf, gb)
    
    return accuracy_array, f1_array

def predicting(x, y, multi_nb, bernoulli_nb, gaussian_nb, lr, sgd, svm, rf, gb):
    #Naive Bayes
    y_pred_class_multi_nb = multi_nb.predict(x)
    y_pred_class_bernoulli_nb = bernoulli_nb.predict(x)
    y_pred_class_gaussian_nb = gaussian_nb.predict(x.toarray())

    multi_nb_acc = metrics.accuracy_score(y, y_pred_class_multi_nb)
    bernoulli_nb_acc = metrics.accuracy_score(y, y_pred_class_bernoulli_nb)
    gaussian_nb_acc = metrics.accuracy_score(y, y_pred_class_gaussian_nb)

    #Linear Models
    y_pred_class_lr = lr.predict(x)
    y_pred_class_sgd = sgd.predict(x)

    lr_acc = metrics.accuracy_score(y, y_pred_class_lr)
    sgd_acc = metrics.accuracy_score(y, y_pred_class_sgd)


    #Support Vector Machine
    y_pred_class_svm = svm.predict(x)

    svm_acc = metrics.accuracy_score(y, y_pred_class_svm)

    #Ensemble
    y_pred_class_rf = rf.predict(x)
    y_pred_class_gb = gb.predict(x)

    rf_acc = metrics.accuracy_score(y, y_pred_class_rf)
    gb_acc = metrics.accuracy_score(y, y_pred_class_gb)
    
    return [multi_nb_acc, bernoulli_nb_acc, gaussian_nb_acc, lr_acc, sgd_acc, svm_acc, rf_acc, gb_acc]

def metric_f1(x,y, multi_nb, bernoulli_nb, gaussian_nb, lr, sgd, svm, rf, gb):
    #Naive Bayes
    y_pred_class_multi_nb = multi_nb.predict(x)
    y_pred_class_bernoulli_nb = bernoulli_nb.predict(x)
    y_pred_class_gaussian_nb = gaussian_nb.predict(x.toarray())

    multi_nb_acc = f1_score(y, y_pred_class_multi_nb)
    bernoulli_nb_acc = f1_score(y, y_pred_class_bernoulli_nb)
    gaussian_nb_acc = f1_score(y, y_pred_class_gaussian_nb)

    #Linear Models
    y_pred_class_lr = lr.predict(x)
    y_pred_class_sgd = sgd.predict(x)

    lr_acc = f1_score(y, y_pred_class_lr)
    sgd_acc = f1_score(y, y_pred_class_sgd)


    #Support Vector Machine
    y_pred_class_svm = svm.predict(x)

    svm_acc = f1_score(y, y_pred_class_svm)

    #Ensemble
    y_pred_class_rf = rf.predict(x)
    y_pred_class_gb = gb.predict(x)

    rf_acc = f1_score(y, y_pred_class_rf)
    gb_acc = f1_score(y, y_pred_class_gb)
    
    return [multi_nb_acc, bernoulli_nb_acc, gaussian_nb_acc, lr_acc, sgd_acc, svm_acc, rf_acc, gb_acc]

In [4]:
#vect = CountVectorizer(stop_words='english', ngram_range=(1, 1), min_df=4, max_df=0.5)
vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), min_df=4, max_df=0.5)

x_train_dtm = vect.fit_transform(trainSet.text)
x_test_dtm = vect.transform(testSet.text)

In [5]:
#Hate Score
accuracy_array, f1_array = classifying(x_train_dtm, x_test_dtm, trainSet.HS, testSet.HS)

data = pd.DataFrame(accuracy_array, columns=['Accuracy'], index=classifiers)
f1 = pd.DataFrame(f1_array, columns=['f1'], index=classifiers)

Wall time: 3 ms
Wall time: 5 ms
Wall time: 667 ms
Wall time: 55 ms
Wall time: 18 ms
Wall time: 7.24 s
Wall time: 4.51 s
Wall time: 773 ms


In [6]:
print(data)
print("\n")
print(f1)

                             Accuracy
Multinomial NB               0.507000
Bernoulli NB                 0.473667
Gaussian NB                  0.461667
Logistic Regression          0.488000
Stochastic Gradient Descent  0.491333
Support Vector Machine       0.479667
Random Forest                0.454667
Gradient Boosting            0.467667


                                   f1
Multinomial NB               0.605705
Bernoulli NB                 0.605349
Gaussian NB                  0.589581
Logistic Regression          0.606960
Stochastic Gradient Descent  0.607106
Support Vector Machine       0.603908
Random Forest                0.597837
Gradient Boosting            0.592914


In [7]:
#print(hateText.shape)
#print(trainTarget.shape)
#print(trainAggressive.shape)


#print(x_test_dtm.shape)
#print(testHate.shape)
#print(testAggressive.shape)

#nb = MultinomialNB()

#print(hate_train_dtm.shape)
#print(trainTarget.shape)
#print(test.shape)

#nb.fit(hate_train_dtm, trainTarget)
#data_predict = nb.predict(test)

#f1 = f1_score(testTarget, data_predict)
#print(f1)


(3000, 3908)


NameError: name 'testHate' is not defined

In [9]:
vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), min_df=4, max_df=0.5)
x_train_dtm = vect.fit_transform(hateSet.text)
x_test_dtm = vect.transform(testSet.text)

In [10]:
#Target Score
#accuracy_array, f1_array = classifying(x_train_dtm, x_test_dtm, trainSet.TR, testSet.TR)
accuracy_array, f1_array = classifying(x_train_dtm, x_test_dtm, hateSet.TR, testSet.TR)

data = pd.DataFrame(accuracy_array, columns=['Accuracy'], index=classifiers)
f1 = pd.DataFrame(f1_array, columns=['f1'], index=classifiers)

Wall time: 3 ms
Wall time: 3 ms
Wall time: 666 ms
Wall time: 79 ms
Wall time: 11 ms
Wall time: 3.99 s
Wall time: 3.05 s
Wall time: 778 ms


In [11]:
print(data)
print("\n")
print(f1)

                             Accuracy
Multinomial NB               0.832000
Bernoulli NB                 0.700000
Gaussian NB                  0.650667
Logistic Regression          0.800667
Stochastic Gradient Descent  0.768333
Support Vector Machine       0.775333
Random Forest                0.740667
Gradient Boosting            0.821000


                                   f1
Multinomial NB               0.408451
Bernoulli NB                 0.524815
Gaussian NB                  0.424176
Logistic Regression          0.544901
Stochastic Gradient Descent  0.541254
Support Vector Machine       0.546433
Random Forest                0.524450
Gradient Boosting            0.584043


In [12]:
#Aggressive Score
#accuracy_array, f1_array = classifying(x_train_dtm, x_test_dtm, trainSet.AG, testSet.AG)
accuracy_array, f1_array = classifying(x_train_dtm, x_test_dtm, hateSet.AG, testSet.TR)

data = pd.DataFrame(accuracy_array, columns=['Accuracy'], index=classifiers)
f1 = pd.DataFrame(f1_array, columns=['f1'], index=classifiers)

Wall time: 3 ms
Wall time: 5 ms
Wall time: 668 ms
Wall time: 75 ms
Wall time: 12 ms
Wall time: 6.02 s
Wall time: 3.7 s
Wall time: 768 ms


In [13]:
print(data)
print("\n")
print(f1)

                             Accuracy
Multinomial NB               0.744333
Bernoulli NB                 0.639667
Gaussian NB                  0.406000
Logistic Regression          0.746000
Stochastic Gradient Descent  0.694000
Support Vector Machine       0.708667
Random Forest                0.662000
Gradient Boosting            0.639333


                                   f1
Multinomial NB               0.270219
Bernoulli NB                 0.354627
Gaussian NB                  0.332084
Logistic Regression          0.331579
Stochastic Gradient Descent  0.337662
Support Vector Machine       0.311811
Random Forest                0.335518
Gradient Boosting            0.281541


# Results


## Hate Score
```
                             Accuracy
Multinomial NB               0.507000
Bernoulli NB                 0.473667
Gaussian NB                  0.461667
Logistic Regression          0.488000
Stochastic Gradient Descent  0.491333
Support Vector Machine       0.479667
Random Forest                0.454667
Gradient Boosting            0.467667


                                   f1
Multinomial NB               0.605705
Bernoulli NB                 0.605349
Gaussian NB                  0.589581
Logistic Regression          0.606960
Stochastic Gradient Descent  0.607106
Support Vector Machine       0.603908
Random Forest                0.597837
Gradient Boosting            0.592914

```

## Target Score

### No Filter
```
Accuracy
                             Accuracy
Multinomial NB               0.832000
Bernoulli NB                 0.700000
Gaussian NB                  0.650667
Logistic Regression          0.800667
Stochastic Gradient Descent  0.768333
Support Vector Machine       0.775333
Random Forest                0.740667
Gradient Boosting            0.821000


                                   f1
Multinomial NB               0.408451
Bernoulli NB                 0.524815
Gaussian NB                  0.424176
Logistic Regression          0.544901
Stochastic Gradient Descent  0.541254
Support Vector Machine       0.546433
Random Forest                0.524450
Gradient Boosting            0.584043

```

### Filter HS=1

## Aggressive Score

### Filter HS=1
```
                             Accuracy
Multinomial NB               0.744333
Bernoulli NB                 0.639667
Gaussian NB                  0.406000
Logistic Regression          0.746000
Stochastic Gradient Descent  0.694000
Support Vector Machine       0.708667
Random Forest                0.662000
Gradient Boosting            0.639333


                                   f1
Multinomial NB               0.270219
Bernoulli NB                 0.354627
Gaussian NB                  0.332084
Logistic Regression          0.331579
Stochastic Gradient Descent  0.337662
Support Vector Machine       0.311811
Random Forest                0.335518
Gradient Boosting            0.281541
```

### Filter HS=1