In [1]:
%matplotlib notebook
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, f1_score, confusion_matrix, roc_curve, roc_auc_score, precision_recall_curve, auc
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

trainPath = '../data/hateval2019_en_train_clean.csv'
testPath = '../data/hateval2019_en_test_clean.csv'

trainSet = pd.read_csv(trainPath)
testSet = pd.read_csv(testPath)

hateSet = trainSet[trainSet['HS']==1]

print(trainSet.shape)
print(hateSet.shape)

print(testSet.shape)

(9000, 6)
(3783, 6)
(3000, 6)


In [2]:
def fitting(x,y):
    lr = LogisticRegression(random_state=-0, max_iter=10000, C=3.727593720314938, penalty='l2')

    %time lr.fit(x, y)
    
    return lr

def classify(x,y, lr):
    y_pred_class_lr = lr.predict(x)

    lr_acc = metrics.accuracy_score(y, y_pred_class_lr)

    print(classification_report(y, y_pred_class_lr, labels=[0,1]))
    
    f1 = f1_score(y, y_pred_class_lr)
    
    print(f1)
    
    conf_matrix = confusion_matrix(y_true=y, y_pred=y_pred_class_lr)

    fig, ax = plt.subplots(figsize=(7.5, 7.5))
    ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            ax.text(x=j, y=i,s=conf_matrix[i, j], va='center', ha='center', size='xx-large')

    plt.xlabel('Predictions', fontsize=18)
    plt.ylabel('Actuals', fontsize=18)
    plt.title('Confusion Matrix', fontsize=18)
    plt.show()
    
    return y_pred_class_lr, f1
    
    
def Metrics(x,y, lr):
    
    scores = lr.predict_proba(x)
    
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(2):
        fpr[i], tpr[i], _ = roc_curve(y.values, scores[:,i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    
    plt.figure()
    lw = 2
    
    colors = ['aqua', 'darkorange']
    for i, color in zip(range(2), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))
    
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()

In [20]:
x, x_test, y, y_test = train_test_split(trainSet.text,trainSet.HS, random_state=1)

#Hate Score
vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), min_df=3, max_df=0.5)

#x_train_dtm = vect.fit_transform(x)
#x_test_dtm = vect.transform(x_test)

x_train_dtm = vect.fit_transform(x)
x_test_dtm = vect.transform(x_test)

lr = fitting(x_train_dtm, y)
hs_pred, hs_F1 = classify(x_test_dtm, y_test, lr)
Metrics(x_test_dtm, y_test, lr)

Wall time: 93 ms
              precision    recall  f1-score   support

           0       0.80      0.82      0.81      1324
           1       0.74      0.71      0.73       926

    accuracy                           0.78      2250
   macro avg       0.77      0.77      0.77      2250
weighted avg       0.78      0.78      0.78      2250

0.7263736263736265


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
from sklearn.model_selection import GridSearchCV
c_space = np.logspace(-5,8,15)
param_grid = {'C': c_space,
             'max_iter' : [10000, 100000],
             'penalty': ['l1', 'l2']}


if __name__ == "__main__":
    logreg = LogisticRegression()
    logreg_cv = GridSearchCV(logreg, param_grid,n_jobs=4,verbose=1, scoring='f1')
    logreg_cv.fit(x_train_dtm, trainSet.HS)
    
    print("LR Parameter: {}".format(logreg_cv.best_params_))
    print("LR Accuracy: {}".format(logreg_cv.best_score_))

NameError: name 'x_train_dtm' is not defined

In [10]:
#Hate Score
vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), min_df=3, max_df=0.5)

#x_train_dtm = vect.fit_transform(x)
#x_test_dtm = vect.transform(x_test)

x_train_dtm = vect.fit_transform(trainSet.text)
x_test_dtm = vect.transform(testSet.text)

lr = fitting(x_train_dtm, trainSet.HS)
hs_pred, hs_F1 = classify(x_test_dtm, testSet.HS, lr)
Metrics(x_test_dtm, testSet.HS, lr)

Wall time: 85 ms
              precision    recall  f1-score   support

           0       0.77      0.18      0.29      1740
           1       0.45      0.93      0.61      1260

    accuracy                           0.49      3000
   macro avg       0.61      0.55      0.45      3000
weighted avg       0.63      0.49      0.42      3000

0.605133523463832


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:
#Target Score
vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), min_df=3, max_df=0.5)

#x_train_dtm = vect.fit_transform(hateSet.text)
x_train_dtm = vect.fit_transform(trainSet.text)
x_test_dtm = vect.transform(testSet.text)

lr = fitting(x_train_dtm, trainSet.TR)
tr_pred, tr_F1 = classify(x_test_dtm, testSet.TR, lr)
Metrics(x_test_dtm, testSet.TR, lr)

Wall time: 133 ms
              precision    recall  f1-score   support

           0       0.92      0.82      0.87      2471
           1       0.45      0.69      0.54       529

    accuracy                           0.79      3000
   macro avg       0.68      0.75      0.70      3000
weighted avg       0.84      0.79      0.81      3000

0.5411415863602669


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
#Aggressive Score
vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), min_df=3, max_df=0.5)

#x_train_dtm = vect.fit_transform(hateSet.text)
x_train_dtm = vect.fit_transform(trainSet.text)
x_test_dtm = vect.transform(testSet.text)

lr = fitting(x_train_dtm, trainSet.AG)
ag_pred, ag_F1 = classify(x_test_dtm, testSet.AG, lr)
Metrics(x_test_dtm, testSet.AG, lr)

Wall time: 153 ms
              precision    recall  f1-score   support

           0       0.85      0.79      0.82      2406
           1       0.34      0.43      0.38       594

    accuracy                           0.72      3000
   macro avg       0.59      0.61      0.60      3000
weighted avg       0.75      0.72      0.73      3000

0.37685459940652816


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [19]:
y_true = np.empty((0,3), int)
y_pred = np.empty((0,3), int)
for i in range(2999):
    y_true = np.append(y_true, [[testSet.HS[i], testSet.TR[i], testSet.HS[i]]], axis = 0)
    y_pred = np.append(y_pred, [[hs_pred[i], tr_pred[i], ag_pred[i]]], axis = 0)

#Overall F1-Score
overall_F1 = (hs_F1 + tr_F1 + ag_F1) / 3
#Exact Match Ratio
EMR = np.all((y_pred == y_true), axis=1).mean()

print("Overall F1 Score: %f" % overall_F1)
print("Exact Match Ratio: %f" % EMR)

Overall F1 Score: 0.507710
Exact Match Ratio: 0.225742
