In [1]:
!pip install vaderSentiment



In [2]:
%matplotlib notebook
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
import pandas as pd

# Locations of the train/test CSV files, relative to the notebook's working directory.
trainPath = '../data/hateval2019_en_train_clean.csv'
testPath = '../data/hateval2019_en_test_clean.csv'

# Load both splits as DataFrames; later cells read their `text` and `HS` columns.
trainSet, testSet = (pd.read_csv(csv_path) for csv_path in (trainPath, testPath))

In [3]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Single VADER analyzer instance, reused by the scoring cell that follows.
vader = SentimentIntensityAnalyzer()

# (rows, columns) of the training split, then of the test split.
print(trainSet.shape, testSet.shape, sep='\n')

(9000, 6)
(3000, 6)


In [4]:
%time testSet['vader'] = testSet['text'].apply(lambda x: vader.polarity_scores(x))
testSet = pd.concat([testSet, testSet['vader'].apply(pd.Series)], axis='columns')

%time trainSet['vader'] = trainSet['text'].apply(lambda x: vader.polarity_scores(x))
trainSet = pd.concat([trainSet, trainSet['vader'].apply(pd.Series)], axis='columns')


Wall time: 548 ms
Wall time: 1.44 s


In [5]:
#Different Vectorizers utilizing different range of n-grams
vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), min_df=4, max_df=0.5)

x_train_dtm = vect.fit_transform(trainSet.text)
x_test_dtm = vect.transform(testSet.text)

print(x_train_dtm.shape)
print(x_test_dtm.shape)

type(x_train_dtm)

(9000, 3908)
(3000, 3908)


scipy.sparse.csr.csr_matrix

In [6]:
# Dense TF-IDF features for the training split, one column per vocabulary term.
# scikit-learn 1.0 introduced `get_feature_names_out()` and 1.2 removed
# `get_feature_names()`, so prefer the new accessor and fall back on old releases.
feature_names = (
    vect.get_feature_names_out()
    if hasattr(vect, 'get_feature_names_out')
    else vect.get_feature_names()
)
vect_df = pd.DataFrame(
    x_train_dtm.toarray(),
    columns=feature_names,
    index=trainSet.index
)

# Append the VADER scores (neg, neu, pos, compound) as extra feature columns.
# Building the frame from the list of score dicts avoids the slow row-wise
# `.apply(pd.Series)` expansion while yielding the same columns.
train_vader_df = pd.DataFrame(trainSet['vader'].tolist(), index=trainSet.index)
compound_train_df = pd.concat([vect_df, train_vader_df], axis='columns')
compound_train_df.head(10)

Unnamed: 0,000,10,100,11,12,13,135,14,15,150,...,²ðÿ,ºðÿ,â_x0081_,â_x009d_,âž_x009d_,ðÿ,neg,neu,pos,compound
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.791,0.209,0.5719
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.155,0.783,0.062,-0.7291
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.305,0.547,0.149,-0.8786
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.162,0.838,0.0,-0.4019
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.197,0.803,0.0,-0.4445
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.437,0.563,0.6318
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.303,0.697,0.0,-0.9042
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.159,0.38,0.461,0.6085
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.838,0.162,0.508
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [7]:
# Dense TF-IDF features for the test split, one column per vocabulary term.
# scikit-learn 1.0 introduced `get_feature_names_out()` and 1.2 removed
# `get_feature_names()`, so prefer the new accessor and fall back on old releases.
feature_names = (
    vect.get_feature_names_out()
    if hasattr(vect, 'get_feature_names_out')
    else vect.get_feature_names()
)
vect_df = pd.DataFrame(
    x_test_dtm.toarray(),
    columns=feature_names,
    index=testSet.index
)

# Append the VADER scores (neg, neu, pos, compound) as extra feature columns.
# Building the frame from the list of score dicts avoids the slow row-wise
# `.apply(pd.Series)` expansion while yielding the same columns.
test_vader_df = pd.DataFrame(testSet['vader'].tolist(), index=testSet.index)
compound_test_df = pd.concat([vect_df, test_vader_df], axis='columns')
compound_test_df.head(10)

Unnamed: 0,000,10,100,11,12,13,135,14,15,150,...,²ðÿ,ºðÿ,â_x0081_,â_x009d_,âž_x009d_,ðÿ,neg,neu,pos,compound
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.927,0.073,0.5719
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.091,0.783,0.126,0.1027
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.171,0.829,0.0,-0.6988
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.451,0.549,0.0,-0.5093
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.276,0.653,0.071,-0.802
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.203,0.688,0.109,-0.7546
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.281,0.719,0.0,-0.889
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.285,0.673,0.042,-0.9468
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.786,0.214,0.7405


In [8]:
# Import the submodule explicitly: on older SciPy releases a bare `import scipy`
# exposes `scipy.sparse` only if some other package has already imported it.
import scipy.sparse

# Convert the dense TF-IDF + VADER frames back to CSR matrices for the classifiers.
train_dtm = scipy.sparse.csr_matrix(compound_train_df.values)
print(train_dtm.shape)

test_dtm = scipy.sparse.csr_matrix(compound_test_df.values)
print(test_dtm.shape)

(9000, 3912)
(3000, 3912)


In [9]:
def fitting(x,y):
    nb = BernoulliNB()
    sgd = SGDClassifier()

    %time nb.fit(x, y)
    %time sgd.fit(x, y)
    
    return nb, sgd

def classify(x, y, nb, sgd):
    """Print accuracy and a per-class report for two fitted classifiers.

    Output order: accuracy of ``nb``, accuracy of ``sgd``, classification
    report of ``nb``, classification report of ``sgd``. Nothing is returned.

    Parameters
    ----------
    x : feature matrix to predict on.
    y : true labels, one per row of ``x``.
    nb, sgd : fitted estimators exposing ``predict``.
    """
    predictions = [clf.predict(x) for clf in (nb, sgd)]
    accuracies = [metrics.accuracy_score(y, predicted) for predicted in predictions]

    print(*accuracies, sep='\n')

    # Reports are restricted to the two labels 0 and 1.
    for predicted in predictions:
        print(classification_report(y, predicted, labels=[0,1]))

In [12]:
# Train and evaluate on TF-IDF features combined with the VADER score columns.
nb, sgd = fitting(x=train_dtm, y=trainSet['HS'])
classify(test_dtm, testSet['HS'], nb=nb, sgd=sgd)

Wall time: 5 ms
Wall time: 29 ms
0.46366666666666667
0.5066666666666667
              precision    recall  f1-score   support

           0       0.77      0.11      0.19      1740
           1       0.44      0.95      0.60      1260

    accuracy                           0.46      3000
   macro avg       0.60      0.53      0.39      3000
weighted avg       0.63      0.46      0.36      3000

              precision    recall  f1-score   support

           0       0.77      0.21      0.33      1740
           1       0.46      0.91      0.61      1260

    accuracy                           0.51      3000
   macro avg       0.61      0.56      0.47      3000
weighted avg       0.64      0.51      0.45      3000



In [11]:
# Baseline: train and evaluate on the TF-IDF features alone (no VADER columns).
nb, sgd = fitting(x=x_train_dtm, y=trainSet['HS'])
classify(x_test_dtm, testSet['HS'], nb=nb, sgd=sgd)

Wall time: 4 ms
Wall time: 20 ms
0.4736666666666667
0.48733333333333334
              precision    recall  f1-score   support

           0       0.81      0.12      0.21      1740
           1       0.44      0.96      0.61      1260

    accuracy                           0.47      3000
   macro avg       0.63      0.54      0.41      3000
weighted avg       0.66      0.47      0.38      3000

              precision    recall  f1-score   support

           0       0.78      0.16      0.27      1740
           1       0.45      0.93      0.61      1260

    accuracy                           0.49      3000
   macro avg       0.61      0.55      0.44      3000
weighted avg       0.64      0.49      0.41      3000



# VADER (TF-IDF + VADER features; first report: BernoulliNB, second: SGDClassifier)

              precision    recall  f1-score   support

           0       0.77      0.11      0.19      1740
           1       0.44      0.95      0.60      1260

    accuracy                           0.46      3000
   macro avg       0.60      0.53      0.39      3000
weighted avg       0.63      0.46      0.36      3000

              precision    recall  f1-score   support

           0       0.78      0.22      0.34      1740
           1       0.46      0.91      0.61      1260

    accuracy                           0.51      3000
   macro avg       0.62      0.57      0.48      3000
weighted avg       0.64      0.51      0.46      3000

# WITHOUT VADER (TF-IDF features only; first report: BernoulliNB, second: SGDClassifier)

              precision    recall  f1-score   support

           0       0.81      0.12      0.21      1740
           1       0.44      0.96      0.61      1260

    accuracy                           0.47      3000
   macro avg       0.63      0.54      0.41      3000
weighted avg       0.66      0.47      0.38      3000

              precision    recall  f1-score   support

           0       0.78      0.16      0.27      1740
           1       0.45      0.94      0.61      1260

    accuracy                           0.49      3000
   macro avg       0.61      0.55      0.44      3000
weighted avg       0.64      0.49      0.41      3000