In [1]:
import pandas as pd
import re
import spacy
import pickle
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from utils import read_file, preprocess, clean_twitter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.multiclass import OneVsOneClassifier
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Notebook display settings: show full cell text (no column-width truncation)
# and render up to 100 rows of a frame before pandas elides the middle.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 100

In [3]:
# Load spaCy's small English pipeline into `nlp`, then register tqdm's
# progress-bar variants of apply (`progress_apply`) on pandas objects.
# The two steps are independent of each other.
nlp = spacy.load('en_core_web_sm')
tqdm.pandas()

In [4]:
# Load the corpus with the project helper `read_file` (imported from utils).
# The result is unpacked into two values: `data`, whose items are later passed
# one by one to `preprocess`, and `y`, which is later compared against
# integer class ids.
# NOTE(review): the effect of with_evaluation=True is not visible from this
# notebook — confirm in utils.read_file.
data, y = read_file('../hatespeech/', with_evaluation=True)

In [30]:
# Print up to 100 randomly chosen row indices for each of the label ids 0-3,
# so that examples of every class can be looked up and inspected by hand.
#
# A seeded generator keeps the printed sample identical after a kernel
# restart. The sample size is capped at the class size because drawing
# without replacement raises ValueError when more items are requested than
# the class contains.
sample_rng = np.random.default_rng(42)
for class_id in range(4):
    class_indices = np.where(y == class_id)[0]
    n_to_draw = min(100, class_indices.size)
    print(sample_rng.choice(class_indices, n_to_draw, replace=False).tolist())

[48058, 15984, 27078, 9262, 10476, 59194, 11050, 8435, 71826, 42311, 74295, 80900, 50234, 34184, 11280, 3825, 8975, 20993, 64318, 78886, 58129, 58212, 70035, 37040, 28668, 60542, 72269, 50377, 11597, 78921, 47837, 68116, 57939, 14923, 45110, 31936, 64267, 7332, 14810, 27164, 4257, 22539, 17607, 64409, 34879, 7803, 14844, 39743, 4227, 17651, 22747, 62115, 34778, 19213, 75582, 35104, 66251, 55730, 49229, 64050, 62430, 30138, 54800, 52014, 424, 34602, 60449, 72086, 75604, 1138, 29506, 41801, 18458, 45909, 33471, 7796, 51094, 70716, 19936, 34304, 8337, 70614, 5661, 37509, 22731, 81222, 65062, 9793, 61653, 68646, 22308, 10509, 49897, 46742, 46728, 1136, 31604, 61232, 51818, 38044]
[5787, 71208, 42154, 6477, 74586, 75785, 72997, 16029, 34300, 1462, 31607, 29264, 35590, 26044, 38740, 54612, 67290, 43532, 39088, 29207, 32810, 32104, 55244, 13683, 35070, 38862, 20115, 13236, 50451, 77892, 59420, 70823, 64874, 1382, 33864, 14408, 34493, 48738, 57317, 35534, 23131, 61594, 39044, 32028, 47879, 484

In [None]:
# Run the project's `preprocess` helper over every text, preserving order.
# NOTE: this rebinds `data`, so re-running the cell preprocesses the already
# preprocessed texts again; reload the corpus first if a rerun is needed.
data = list(map(preprocess, data))

In [None]:
# Hold out 20% of the samples as a test set.
# random_state pins the shuffle so every report computed from this split is
# reproducible after a kernel restart; stratify=y keeps the class proportions
# the same in the train and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    data, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
vectorizer = CountVectorizer(input='content',
                             analyzer='word',
                             strip_accents='ascii',
                             ngram_range=(1,1),
                             stop_words='english',
                             min_df=2)
x = vectorizer.fit_transform(x_train)
classifier = OneVsOneClassifier(svm.LinearSVC(class_weight='balanced', C=0.1, max_iter=2000), n_jobs=-1)
classifier.fit(x, y_train)
y_pred = classifier.predict(x)
print(classification_report(y_train, y_pred))
print("---------------Test metrics------------------------")
x = vectorizer.transform(x_test)
y_pred = classifier.predict(x)
print(classification_report(y_test, y_pred))

In [None]:
# features = np.array(vectorizer.get_feature_names())
# features[np.argsort(classifier.estimators_[0].coef_[0])[::-1][:50]]

In [None]:
vectorizer = TfidfVectorizer(input='content',
                             analyzer='word',
                             strip_accents='ascii',
                             ngram_range=(1,1),
                             stop_words='english',
                             min_df=2)
x = vectorizer.fit_transform(x_train)
classifier = OneVsOneClassifier(svm.LinearSVC(class_weight='balanced', C=0.1, max_iter=2000), n_jobs=-1)
classifier.fit(x, y_train)
y_pred = classifier.predict(x)
print(classification_report(y_train, y_pred))
print("---------------Test metrics------------------------")
x = vectorizer.transform(x_test)
y_pred = classifier.predict(x)
print(classification_report(y_test, y_pred))

In [None]:
vectorizer = CountVectorizer(input='content',
                             analyzer='word',
                             strip_accents='ascii',
                             ngram_range=(1,1),
                             stop_words='english',
                             min_df=2)
x = vectorizer.fit_transform(x_train)
classifier = OneVsOneClassifier(LogisticRegression(class_weight='balanced', C=0.1, max_iter=2000), n_jobs=-1)
classifier.fit(x, y_train)
y_pred = classifier.predict(x)
print(classification_report(y_train, y_pred))
print("---------------Test metrics------------------------")
x = vectorizer.transform(x_test)
y_pred = classifier.predict(x)
print(classification_report(y_test, y_pred))

In [None]:
vectorizer = TfidfVectorizer(input='content',
                             analyzer='word',
                             strip_accents='ascii',
                             ngram_range=(1,1),
                             stop_words='english',
                             min_df=2)
x = vectorizer.fit_transform(x_train)
classifier = OneVsOneClassifier(LogisticRegression(class_weight='balanced', C=0.1, max_iter=2000), n_jobs=-1)
classifier.fit(x, y_train)
y_pred = classifier.predict(x)
print(classification_report(y_train, y_pred))
print("---------------Test metrics------------------------")
x = vectorizer.transform(x_test)
y_pred = classifier.predict(x)
print(classification_report(y_test, y_pred))

In [None]:
vectorizer = CountVectorizer(input='content',
                             analyzer='word',
                             strip_accents='ascii',
                             ngram_range=(1,1),
                             stop_words='english',
                             min_df=2)
x = vectorizer.fit_transform(x_train)
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'gini', random_state = 42)
classifier.fit(x, y_train)
y_pred = classifier.predict(x)
print(classification_report(y_train, y_pred))
print("---------------Test metrics------------------------")
x = vectorizer.transform(x_test)
y_pred = classifier.predict(x)
print(classification_report(y_test, y_pred))

In [None]:
# Show the 50 vocabulary terms with the highest random-forest importance.
# Relies on `vectorizer` and `classifier` left behind by the random-forest cell.
#
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use the new method when it exists and fall back to
# the old one on older installations.
get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
features = np.array(get_names())
# Slice the sorted indices first so only 50 terms are gathered.
features[np.argsort(classifier.feature_importances_)[::-1][:50]]

In [None]:
# Reload the corpus from disk, run `clean_twitter` over every text, and put
# the cleaned texts and their labels side by side in a two-column frame.
# NOTE: `data` and `y` are rebound here, replacing the values used for the
# experiments above.
data, y = read_file('../hatespeech/', with_evaluation=True)
data = list(map(clean_twitter, data))
df = pd.DataFrame({'text': data, 'label': y}, columns=['text', 'label'])

In [None]:
# Write the frame to a pickle file named df.pkl under the home directory (~).
df.to_pickle(path='~/df.pkl')

In [None]:
# Keep only the rows labelled 'abusive'.
# .copy() makes the result an independent frame, so adding columns to it later
# does not write into a slice of `df` and does not trigger pandas'
# SettingWithCopyWarning.
# NOTE(review): an earlier cell compares the labels with the integers 0-3;
# confirm that the labels are strings at this point, otherwise this filter
# selects nothing.
df_abusive = df[df['label'] == 'abusive'].copy()

In [None]:
# Add a 'cleaned' column derived from 'text', then a 'lemmatized' column
# derived from 'cleaned'.
# .assign returns a new frame instead of setting columns on a filtered slice,
# which avoids SettingWithCopyWarning and makes the cell safe to re-run.
# The helper functions are passed to .apply directly; wrapping them in a
# lambda added nothing.
# NOTE(review): `clean` and `lemmatize` are not among the names imported from
# `utils` in the import cell — confirm where they are defined before running
# on a fresh kernel.
df_abusive = df_abusive.assign(cleaned=lambda frame: frame['text'].apply(clean))
df_abusive = df_abusive.assign(lemmatized=lambda frame: frame['cleaned'].apply(lemmatize))

In [None]:
# Count word n-grams of length 2 to 5 in the lemmatized abusive texts.
# Accents are folded to ASCII, English stop words are removed, and n-grams
# found in fewer than 10 documents are discarded.
bow_options = {
    'input': 'content',
    'analyzer': 'word',
    'strip_accents': 'ascii',
    'ngram_range': (2, 5),
    'stop_words': 'english',
    'min_df': 10,
}
bow_vectorizer = CountVectorizer(**bow_options)
X = bow_vectorizer.fit_transform(df_abusive['lemmatized'])

In [None]:
# List the 50 n-grams with the highest total count across all documents.
#
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use the new method when it exists and fall back to
# the old one on older installations.
get_names = getattr(bow_vectorizer, 'get_feature_names_out', None) or bow_vectorizer.get_feature_names
feature_names = np.array(get_names())
# Summing over axis 0 gives a 1 x n_features result; flatten it to 1-D.
ngram_totals = np.asarray(X.sum(axis=0)).ravel()
feature_names[np.argsort(ngram_totals)[::-1][:50]].tolist()

In [None]:
# TF-IDF over word 1- to 3-grams: accents folded to ASCII, English stop words
# removed, terms found in fewer than 5 documents discarded, IDF weighting on.
tfidf_options = {
    'input': 'content',
    'analyzer': 'word',
    'strip_accents': 'ascii',
    'ngram_range': (1, 3),
    'min_df': 5,
    'stop_words': 'english',
    'use_idf': True,
}
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)

In [None]:
# Fit the TF-IDF vocabulary on the training texts only, then reuse it to
# transform the test texts.
# NOTE(review): `train_df` and `test_df` are not created in any visible cell —
# confirm where they come from before running on a fresh kernel.
train_lemmas, test_lemmas = train_df['lemmatized'], test_df['lemmatized']
trainX = tfidf_vectorizer.fit_transform(train_lemmas)
testX = tfidf_vectorizer.transform(test_lemmas)

In [None]:
# Collect the fitted TF-IDF vocabulary and show how many terms it contains.
#
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use the new method when it exists and fall back to
# the old one on older installations. The result is converted to a list so
# `features` keeps the type it had before.
get_names = getattr(tfidf_vectorizer, 'get_feature_names_out', None) or tfidf_vectorizer.get_feature_names
features = list(get_names())
len(features)

In [None]:
# Random forest of 100 trees split on Gini impurity ('entropy' is the
# alternative criterion); the fixed seed makes the forest reproducible.
# NOTE(review): `trainY` is not created in any visible cell — confirm its origin.
rf_params = {'n_estimators': 100, 'criterion': 'gini', 'random_state': 42}
rf_classifier = RandomForestClassifier(**rf_params)
rf_classifier.fit(X=trainX, y=trainY)

In [None]:
# Random forest: metrics on the data it was trained on.
predY = rf_classifier.predict(trainX)
rf_train_report = classification_report(y_true=trainY, y_pred=predY)
print(rf_train_report)

In [None]:
# Random forest: metrics on the held-out test data.
predY = rf_classifier.predict(testX)
rf_test_report = classification_report(y_true=testY, y_pred=predY)
print(rf_test_report)

In [None]:
# Rank the TF-IDF terms by random-forest importance and keep the top 100.
rf_imps = rf_classifier.feature_importances_
sort_ind = np.argsort(rf_imps)          # ascending order of importance
top_n = sort_ind[::-1][:100]            # indices of the 100 largest, descending
sorted_imps, top_feats = rf_imps[top_n], np.array(features)[top_n]

# Horizontal bar chart, one bar per term; the figure is tall so that all 100
# labels remain readable.
fig, ax = plt.subplots(figsize=(6, 30))
ax.set_xlabel('Relative Importance', fontsize=20)
ax.set_ylabel('Feature', fontsize=20)
sns.barplot(x=sorted_imps, y=top_feats, ax=ax)
ax.tick_params(labelsize=20)
plt.show()

In [None]:
# Display the terms selected by `top_n`, most important first.
np.take(features, top_n)

In [None]:
# One-vs-one logistic regression with balanced class weights, trained on the
# TF-IDF matrix; the iteration cap is raised to 7000.
lr_base = LogisticRegression(class_weight='balanced', C=1, max_iter=7000)
lr_classifier = OneVsOneClassifier(lr_base, n_jobs=-1)
lr_classifier.fit(X=trainX, y=trainY)

In [None]:
# Logistic regression: metrics on the data it was trained on.
predY = lr_classifier.predict(trainX)
lr_train_report = classification_report(y_true=trainY, y_pred=predY)
print(lr_train_report)

In [None]:
# Logistic regression: metrics on the held-out test data.
predY = lr_classifier.predict(testX)
lr_test_report = classification_report(y_true=testY, y_pred=predY)
print(lr_test_report)