# Picking Out the Best Classification Model for the Task

In [1]:
import pandas as pd
import numpy as np
import logging
from pprint import pprint
from time import time
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [2]:
trainPath = '../data/hateval2019_en_train_clean.csv'
testPath = '../data/hateval2019_en_test_clean.csv'

trainSet = pd.read_csv(trainPath)
testSet = pd.read_csv(testPath)

In [3]:
trainText = trainSet.text
trainHate = trainSet.HS
trainTarget = trainSet.TR
trainAggressive = trainSet.AG

testText = testSet.text
testHate = testSet.HS
testTarget = testSet.TR
testAggressive = testSet.AG

In [4]:
#From Previous Experiments these are the settings that are currently the most suitable for a general experiment
#vect = CountVectorizer(stop_words='english', ngram_range=(1, 1), min_df=4, max_df=0.5)
vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), min_df=4, max_df=0.5)

x_train_dtm = vect.fit_transform(trainText)
x_test_dtm = vect.transform(testText)

In [5]:
def classifying(x, y):
    lr = LogisticRegression(max_iter=10000)
    sgd = SGDClassifier()
    multi_nb = MultinomialNB()
    gaussian_nb = GaussianNB()
    bernoulli_nb = BernoulliNB()
    svm = SVC()
    rf = RandomForestClassifier(n_estimators = 100)
    gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0)

    %time multi_nb.fit(x, y)
    %time bernoulli_nb.fit(x, y)
    %time gaussian_nb.fit(x.toarray(), y)
    %time lr.fit(x, y)
    %time sgd.fit(x, y)
    %time svm.fit(x, y)
    %time rf.fit(x, y)
    %time gb.fit(x, y)

In [6]:
def predicting(x, y):
    #Naive Bayes
    y_pred_class_multi_nb = multi_nb.predict(x)
    y_pred_class_bernoulli_nb = bernoulli_nb.predict(x)
    y_pred_class_gaussian_nb = gaussian_nb.predict(x.toarray())

    multi_nb_acc = metrics.accuracy_score(y, y_pred_class_multi_nb)
    bernoulli_nb_acc = metrics.accuracy_score(y, y_pred_class_bernoulli_nb)
    gaussian_nb_acc = metrics.accuracy_score(y, y_pred_class_gaussian_nb)

    #Linear Models
    y_pred_class_lr = lr.predict(x)
    y_pred_class_sgd = sgd.predict(x)

    lr_acc = metrics.accuracy_score(y, y_pred_class_lr)
    sgd_acc = metrics.accuracy_score(y, y_pred_class_sgd)


    #Support Vector Machine
    y_pred_class_svm = svm.predict(x)

    svm_acc = metrics.accuracy_score(y, y_pred_class_svm)

    #Ensemble
    y_pred_class_rf = rf.predict(x)
    y_pred_class_gb = gb.predict(x)

    rf_acc = metrics.accuracy_score(y, y_pred_class_rf)
    gb_acc = metrics.accuracy_score(y, y_pred_class_gb)
    
    return [multi_nb_acc, bernoulli_nb_acc, gaussian_nb_acc, lr_acc, sgd_acc, svm_acc, rf_acc, gb_acc]

In [7]:
classifiers = ['Multinomial NB', 'Bernoulli NB', 'Gaussian NB', 'Logistic Regression ', 'Stochastic Gradient Descent', 'Support Vector Machine', 'Random Forest', 'Gradient Boosting']

#Hate Score
classifying(x_train_dtm, trainHate)
accuracy_array = predicting(x_test_dtm, testHate)

data = pd.DataFrame(accuracy_array, columns=['Accuracy'], index=classifiers)
data

Wall time: 1.97 ms
Wall time: 4 ms
Wall time: 660 ms
Wall time: 83 ms
Wall time: 18 ms
Wall time: 6.87 s
Wall time: 4.44 s
Wall time: 776 ms


NameError: name 'multi_nb' is not defined

In [None]:
#Target Score
classifying(x_train_dtm, trainTarget)
accuracy_array = predicting(x_test_dtm, testTarget)

data = pd.DataFrame(accuracy_array, columns=['Accuracy'], index=classifiers)
data

In [None]:
#Aggressive Score
classifying(x_train_dtm, trainAggressive)
accuracy_array = predicting(x_test_dtm, testAggressive)

data = pd.DataFrame(accuracy_array, columns=['Accuracy'], index=classifiers)
data

# Results

## Hate Score
```
Accuracy
Multinomial NB 	         0.461333
Bernoulli NB 	           0.490667
Gaussian NB 	            0.459667
Logistic Regression 	    0.493000
Stochastic Gradient Descent 0.494667
Support Vector Machine      0.459000
Random Forest 	          0.453667
Gradient Boosting           0.452667
```

## Target Score
```
Multinomial NB 	             0.722333
Bernoulli NB 	               0.795667
Gaussian NB 	                0.736667
Logistic Regression 	        0.785667
Stochastic Gradient Descent 	0.766000
Support Vector Machine 	     0.722333
Random Forest 	              0.739667
Gradient Boosting 	          0.832000
```

## Aggressive Score
```
Multinomial NB 	             0.640333
Bernoulli NB 	               0.702667
Gaussian NB 	                0.580000
Logistic Regression 	        0.685000
Stochastic Gradient Descent 	0.665000
Support Vector Machine 	     0.693000
Random Forest 	              0.668333
Gradient Boosting 	          0.655667
```