In [80]:
%matplotlib notebook
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from sklearn.metrics import classification_report, f1_score, confusion_matrix, roc_curve, roc_auc_score
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Relative locations of the cleaned train / test CSV files.
trainPath = '../data/hateval2019_en_train_clean.csv'
testPath = '../data/hateval2019_en_test_clean.csv'

# Read both splits into DataFrames.
trainSet, testSet = (pd.read_csv(csv_path) for csv_path in (trainPath, testPath))

# Training rows whose HS column equals 1; used by the later cells that fit on hateSet.
hateSet = trainSet.loc[trainSet['HS'] == 1]

# Sanity check: one (rows, columns) tuple per line, in the order train, hate-only, test.
print(trainSet.shape, hateSet.shape, testSet.shape, sep='\n')

(9000, 6)
(3783, 6)
(3000, 6)


In [92]:
def fitting(x, y):
    """Fit a Bernoulli naive Bayes classifier and print how long fitting took.

    Parameters
    ----------
    x : feature matrix, passed unchanged to ``BernoulliNB.fit``.
    y : target labels, passed unchanged to ``BernoulliNB.fit``.

    Returns
    -------
    BernoulliNB
        The fitted classifier (created with default hyper-parameters).
    """
    # Timed with plain Python instead of the ``%time`` line magic so that the
    # function is valid Python outside IPython (e.g. if moved into a module).
    import time

    nb = BernoulliNB()

    started = time.perf_counter()
    nb.fit(x, y)
    elapsed_ms = (time.perf_counter() - started) * 1000.0

    print('Wall time: {:.3g} ms'.format(elapsed_ms))

    return nb

def classify(x, y, nb):
    """Evaluate a fitted classifier on ``x`` / ``y`` and display the results.

    Prints the classification report for labels 0 and 1, prints the F1 score
    returned by ``f1_score`` with its default settings, and draws the
    confusion matrix as an annotated colour grid.

    Parameters
    ----------
    x : feature matrix to predict on.
    y : true labels aligned with the rows of ``x``.
    nb : fitted classifier exposing ``predict``.

    Returns
    -------
    None
        Results are printed / plotted only.
    """
    # Same label set for the report and the matrix, so the matrix is always
    # 2x2 even when one class is missing from both ``y`` and the predictions.
    class_labels = [0, 1]

    y_pred = nb.predict(x)

    print(classification_report(y, y_pred, labels=class_labels))

    print(f1_score(y, y_pred))

    conf_matrix = confusion_matrix(y_true=y, y_pred=y_pred, labels=class_labels)

    fig, ax = plt.subplots(figsize=(7.5, 7.5))
    ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
    # Write each count in the centre of its cell (row = actual, column = predicted).
    for (row, col), count in np.ndenumerate(conf_matrix):
        ax.text(x=col, y=row, s=str(count), va='center', ha='center', size='xx-large')

    ax.set_xlabel('Predictions', fontsize=18)
    ax.set_ylabel('Actuals', fontsize=18)
    ax.set_title('Confusion Matrix', fontsize=18)
    plt.show()
    
    
def ROC(x, y, nb):
    """Plot one ROC curve per class of a fitted probabilistic classifier.

    For every class in ``nb.classes_`` the matching column of
    ``nb.predict_proba(x)`` is used as the score and that class is treated as
    the positive label, so each curve describes how well the classifier
    separates that class from the rest.

    Parameters
    ----------
    x : feature matrix to score.
    y : true labels aligned with the rows of ``x`` (Series, array or list).
    nb : fitted classifier exposing ``predict_proba`` and ``classes_``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    scores = nb.predict_proba(x)
    y_true = np.asarray(y)

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i, label in enumerate(nb.classes_):
        # Column i of predict_proba belongs to nb.classes_[i]; without
        # pos_label the scores of class 0 would be ranked against label 1,
        # producing a mirrored curve (AUC = 1 - true AUC).
        fpr[i], tpr[i], _ = roc_curve(y_true, scores[:, i], pos_label=label)
        # ``auc`` is not imported by name in this notebook; reach it through
        # the already-imported ``metrics`` module.
        roc_auc[i] = metrics.auc(fpr[i], tpr[i])

    fig, ax = plt.subplots()
    lw = 2

    # Only two colours are defined; zip() stops after them, so at most two
    # curves are drawn.
    colors = ['aqua', 'darkorange']
    for (i, label), color in zip(enumerate(nb.classes_), colors):
        ax.plot(fpr[i], tpr[i], color=color, lw=lw,
                label='ROC curve of class {0} (area = {1:0.2f})'.format(label, roc_auc[i]))

    # Diagonal reference line for a classifier that guesses at random.
    ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    ax.legend(loc="lower right")
    plt.show()

In [93]:
# Hate Score: fit on every training row and predict the HS column.
# NOTE(review): BernoulliNB binarizes its input by default, so the TF-IDF
# weights presumably only act as present/absent flags here - confirm.
vect = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    min_df=3,
    max_df=0.75,
)

# Vocabulary is learned from the training text only, then reused for the test text.
x_train_dtm = vect.fit_transform(trainSet['text'])
x_test_dtm = vect.transform(testSet['text'])

nb = fitting(x_train_dtm, trainSet['HS'])
classify(x_test_dtm, testSet['HS'], nb)
ROC(x_test_dtm, testSet['HS'], nb)

Wall time: 3 ms
              precision    recall  f1-score   support

           0       0.81      0.13      0.22      1740
           1       0.44      0.96      0.61      1260

    accuracy                           0.48      3000
   macro avg       0.63      0.54      0.41      3000
weighted avg       0.66      0.48      0.38      3000

0.606015037593985


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [20]:
# Target Score: predict the TR column.
# NOTE(review): the model is fitted on hateSet (training rows with HS == 1)
# but evaluated on the complete test set - confirm this is intended.
vect = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    min_df=4,
    max_df=0.5,
)

# Vocabulary is learned from the hateSet text only, then reused for the test text.
x_train_dtm = vect.fit_transform(hateSet['text'])
x_test_dtm = vect.transform(testSet['text'])

nb = fitting(x_train_dtm, hateSet['TR'])
classify(x_test_dtm, testSet['TR'], nb)

Wall time: 1.99 ms
              precision    recall  f1-score   support

           0       0.99      0.59      0.74      2471
           1       0.33      0.96      0.49       529

    accuracy                           0.65      3000
   macro avg       0.66      0.77      0.62      3000
weighted avg       0.87      0.65      0.69      3000

0.4948855333658061


<IPython.core.display.Javascript object>

In [21]:
# Aggressive Score: predict the AG column.
# NOTE(review): the model is fitted on hateSet (training rows with HS == 1)
# but evaluated on the complete test set - confirm this is intended.
vect = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    min_df=4,
    max_df=0.75,
)

# Vocabulary is learned from the hateSet text only, then reused for the test text.
x_train_dtm = vect.fit_transform(hateSet['text'])
x_test_dtm = vect.transform(testSet['text'])

nb = fitting(x_train_dtm, hateSet['AG'])
classify(x_test_dtm, testSet['AG'], nb)

Wall time: 1e+03 µs
              precision    recall  f1-score   support

           0       0.85      0.65      0.74      2406
           1       0.27      0.53      0.36       594

    accuracy                           0.63      3000
   macro avg       0.56      0.59      0.55      3000
weighted avg       0.73      0.63      0.66      3000

0.3586521987435751


<IPython.core.display.Javascript object>

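F1 on the test set when fitting on trainSet (presumably Target first, then Aggressive, mirroring the hateSet figures below):

0.5248152059134108

0.3546268656716418

F1 on the test set when fitting on hateSet (Target first, then Aggressive; these match the outputs of the two cells above):

0.4948855333658061

0.3586521987435751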