In [103]:
import pandas as pd
from utils import read_file, clean_twitter, stem
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import spacy
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from scipy import stats
from sklearn.preprocessing import MinMaxScaler

In [104]:
# Notebook-wide display settings: never truncate cell text (tweets are shown
# in full) and render up to 100 rows before pandas abbreviates a frame.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 100

In [105]:
# Load the corpus from the hatespeech directory. `data` is later used as the
# text column and `y` as the true-label column / ground truth for the metrics.
# NOTE(review): the meaning of the positional `True` flag is defined in
# utils.read_file and is not visible here — confirm against that helper.
data, y = read_file('../hatespeech/', True)

In [106]:
# Read the saved predictions as raw text lines (trailing newlines are kept;
# they are parsed into integers in a later step).
with open('../hatespeech/out.txt', 'r') as pred_file:
    y_pred = list(pred_file)

In [107]:
# Convert the predicted labels to integers. int() already ignores surrounding
# whitespace and newlines, so no explicit .strip() is needed; unlike .strip(),
# it also accepts values that are already ints, which makes this cell safe to
# re-run (it rebinds y_pred from strings to ints).
y_pred = [int(label) for label in y_pred]

In [108]:
# Per-class precision / recall / F1 of the predictions against the true
# labels; the report is a text table, so it is printed rather than displayed.
report = classification_report(y, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.81      0.57      0.67     50287
           1       0.36      0.78      0.49     13608
           2       0.76      0.64      0.69     17089
           3       0.14      0.20      0.16      2545

    accuracy                           0.61     83529
   macro avg       0.52      0.55      0.51     83529
weighted avg       0.71      0.61      0.63     83529



In [109]:
# Rows are the true labels and columns the predicted labels, so each row sums
# to the number of documents that truly belong to that class.
confusion_matrix(y_true=y, y_pred=y_pred)

array([[28840, 18129,  1607,  1711],
       [ 2125, 10675,   695,   113],
       [ 3678,   976, 10922,  1513],
       [  746,   151,  1127,   521]])

In [130]:
# One row per document: the raw text, its true label and the predicted label.
df = pd.DataFrame({'text': data}).assign(y=y, y_pred=y_pred)

In [140]:
# Uncomment to inspect the documents whose true label is 2 but were predicted as 3:
# df[(df.y == 2) & (df.y_pred == 3)]

In [132]:
# Keep only the documents whose true label is 2 but which were predicted as 3,
# and renumber the rows 0..n-1 so index labels equal row positions.
# NOTE: this rebinds `df`; the full frame is no longer available afterwards.
df = df.loc[df['y'].eq(2) & df['y_pred'].eq(3)].reset_index(drop=True)

In [133]:
# Run the project's Twitter cleaning helper over every document's raw text.
df['processed'] = df['text'].apply(clean_twitter)

In [134]:
# Load the custom stop-word list: each non-blank line is treated as one stop
# word (blank lines would only contribute empty strings, so they are dropped).
with open('stopwords.txt', 'r') as f:
    lines = f.readlines()
stopwords = [w.strip() for w in lines if w.strip()]

# Unigram bag-of-words over the cleaned documents, with accents folded to
# ASCII and the custom stop words removed.
count_vectorizer = CountVectorizer(input='content',
                                   analyzer='word',
                                   strip_accents='ascii',
                                   ngram_range=(1,1),
                                   stop_words=stopwords)
count = count_vectorizer.fit_transform(df['processed'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and only fall back to the old accessor on
# releases that predate it. dtype=str keeps `features` a unicode string array
# regardless of which accessor produced the vocabulary.
get_names = (getattr(count_vectorizer, 'get_feature_names_out', None)
             or count_vectorizer.get_feature_names)
features = np.array(get_names(), dtype=str)

# `freq` keeps the raw term counts per document; `count` is then binarised in
# place so that it records only whether a term occurs in a document.
freq = count.copy()
count[count > 0] = 1

In [135]:
label = 2

# Rows of the documents whose true label equals `label`. The index labels are
# used as row positions into the sparse matrices, so `df` must carry a default
# 0..n-1 index that lines up with the rows of `count` / `freq`.
class_rows = df.index[df['y'] == label].to_numpy()
class_docs = count[class_rows]
n_class_docs = class_docs.shape[0]

# Fraction of the class documents that contain each term.
rel_doc_freq = np.array(class_docs.sum(axis=0) / n_class_docs)[0]
# Mean number of occurrences of each term per class document.
avg_freq = np.array(freq[class_rows].sum(axis=0) / n_class_docs)[0]

In [136]:
# Inverse document frequency over all rows of `count`:
# log(number of documents / column sum of `count`) for every term.
n_docs = count.shape[0]
doc_freq = count.sum(axis=0)
idf = np.log(np.array(n_docs / doc_freq)[0])

# One row per vocabulary term with its three ranking signals.
rankingdf = pd.DataFrame({
    'word': features,
    'rel_doc_freq': rel_doc_freq,
    'avg_freq': avg_freq,
    'idf': idf,
})

In [137]:
# Rescale the three signals to [0, 1] so they are comparable, then combine
# them with a geometric mean (cube root of the product).
score_cols = ['rel_doc_freq', 'idf', 'avg_freq']
scaler = MinMaxScaler()
rankingdf[score_cols] = scaler.fit_transform(rankingdf[score_cols])

product = rankingdf['rel_doc_freq'] * rankingdf['idf'] * rankingdf['avg_freq']
rankingdf['comb'] = np.cbrt(product)

In [138]:
# Show the 100 terms with the highest combined score, best first.
rankingdf.sort_values('comb', ascending=False).iloc[:100]

Unnamed: 0,word,rel_doc_freq,avg_freq,idf,comb
1677,idiot,0.643312,0.665109,0.068317,0.308045
3490,trump,0.179936,0.188474,0.265037,0.207918
1679,idiots,0.164013,0.165109,0.279284,0.19629
1361,fuck,0.159236,0.163551,0.283826,0.194797
3253,stupid,0.14172,0.14486,0.30172,0.183651
1041,don,0.10828,0.113707,0.342952,0.16163
2500,people,0.105096,0.105919,0.347516,0.15698
1363,fucked,0.085987,0.087227,0.378142,0.141551
3231,stop,0.066879,0.070093,0.416337,0.12497
2047,look,0.068471,0.068536,0.41277,0.124655


In [90]:
# Summary statistics of the scaled idf column.
# NOTE(review): the recorded output of this cell carries an earlier execution
# count than the cells above it and reports far more rows than the ranking
# table's indices suggest — presumably it is stale; re-run after Restart & Run All.
rankingdf.loc[:, 'idf'].describe()

count    41198.000000
mean         0.911427
std          0.132469
min          0.000000
25%          0.866731
50%          1.000000
75%          1.000000
max          1.000000
Name: idf, dtype: float64