In [None]:
import json, os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from nltk.tokenize import TweetTokenizer
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import pickle 

# Three-way source labels and their collapse onto a binary hate / non-hate task.
label_map_original = {'none': 0, 'racism': 1, 'sexism': 2}
label_map_modified = {
    0: 0,  # none   -> NON HATE
    1: 1,  # racism -> HATE
    2: 1,  # sexism -> HATE
}
hate_map = {0: "NON HATE", 1: "HATE"}

model = LogisticRegression()
# Kept as a module-level name in case other cells use it; it is intentionally
# NOT passed to the vectorizer below.
nltk_tweet_tokenizer = TweetTokenizer().tokenize
# Character 1- to 3-gram TF-IDF features. scikit-learn only applies a custom
# `tokenizer` when analyzer == 'word'; with analyzer='char' the argument was
# ignored (and triggers an "unused parameter" warning at fit time), so it is
# dropped here. The extracted features are unchanged.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char')

In [None]:
def load_data(folder="ext_eval", filename="waseem_data_norm.json", label_map=None):
    """Load the tweet corpus and return lower-cased texts with binary labels.

    The JSON file must hold an object whose values are records with a
    ``'text'`` string and a ``'label_map'`` entry convertible to ``int``.

    Args:
        folder: Directory containing the JSON file.
        filename: Name of the JSON file inside ``folder``.
        label_map: Mapping from the integer stored in ``'label_map'`` to the
            label to return. Defaults to the module-level
            ``label_map_modified``.

    Returns:
        ``(texts, labels)``: two lists of equal length, in the order the
        records appear in the file.

    Raises:
        KeyError: If a record lacks ``'text'`` / ``'label_map'`` or its label
            is not a key of ``label_map``.
    """
    if label_map is None:
        label_map = label_map_modified
    path = os.path.join(folder, filename)
    # Decode explicitly as UTF-8 (the JSON interchange encoding) instead of
    # relying on the platform default, which breaks on non-ASCII tweets on
    # systems whose locale encoding is not UTF-8.
    with open(path, 'r', encoding='utf-8') as f:
        records = json.load(f)

    texts = []
    labels = []
    for record in records.values():
        texts.append(record['text'].lower())
        labels.append(label_map[int(record['label_map'])])
    return texts, labels

texts, labels = load_data()
X = vectorizer.fit_transform(texts)
Y = labels

# Class balance of the binary labels, with a title and axis labels so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(labels)
ax.set_title("Label distribution")
ax.set_xlabel("label (0 = NON HATE, 1 = HATE)")
ax.set_ylabel("number of tweets")
plt.show()

# Fixed seed so the row order seen by the classifier is reproducible.
X, Y = shuffle(X, Y, random_state=42)
model.fit(X, Y)
# NOTE: predictions are made on the very rows the model was fitted on, so the
# report below is an in-sample sanity check, not a held-out evaluation.
y_preds = model.predict(X)
report = classification_report(Y, y_preds)
print(report)

## Results

In [None]:
def ext_eval_probs(yg, yp, threshold=0.5):
    """Mean of ``g - p`` over the pairs where both scores reach ``threshold``.

    The callers in this notebook pass column 1 of ``predict_proba`` for the
    ground-truth and predicted texts, i.e. the probability of label 1, which
    ``hate_map`` names "HATE". A pair is therefore kept only when both sides
    are scored as hate, and the result is the average drop in that
    probability from ground truth to prediction.

    Args:
        yg: Iterable of scores for the ground-truth texts.
        yp: Iterable of scores for the predicted texts, aligned with ``yg``.
            If the lengths differ, the extra items of the longer one are
            ignored.
        threshold: Minimum score both sides must reach for a pair to count.

    Returns:
        The mean difference, or NaN when no pair qualifies.
    """
    diff = [g - p for g, p in zip(yg, yp) if g >= threshold and p >= threshold]
    if not diff:
        # np.mean([]) also evaluates to NaN but emits a RuntimeWarning;
        # return the same value explicitly and quietly.
        return float("nan")
    return np.mean(diff)


def _load_ext_eval_pickle(model_name, folder="ext_eval"):
    """Return the unpickled contents of ``<folder>/<model_name>_for_ext_eval.pkl``."""
    path = os.path.join(folder, model_name + "_for_ext_eval.pkl")
    # SECURITY: pickle.load can execute arbitrary code while deserialising.
    # Only point this at files you produced yourself or otherwise trust.
    with open(path, "rb") as f:
        return pickle.load(f)


def _score_ground_vs_pred(pair):
    """Score one ``{"ground": ..., "pred": ...}`` entry with the fitted model.

    NOTE(review): ``return_feature_set`` is not defined in the visible part of
    this notebook; it must exist before this runs. Presumably it turns the raw
    texts into features accepted by ``model`` -- confirm.
    """
    ground, pred = pair["ground"], pair["pred"]
    xg = return_feature_set(ground)
    xp = return_feature_set(pred)
    # Column 1 of predict_proba is the probability of label 1.
    yg = model.predict_proba(xg)[:, 1]
    yp = model.predict_proba(xp)[:, 1]
    return ext_eval_probs(yg, yp)


def run_for_test(model_name, folder="ext_eval"):
    """Print the evaluation score for a pickle holding one ground/pred pair.

    Args:
        model_name: Prefix of the pickle file (``<model_name>_for_ext_eval.pkl``).
        folder: Directory containing the pickle file.
    """
    print(model_name)
    data = _load_ext_eval_pickle(model_name, folder)
    print(_score_ground_vs_pred(data))


def run_for_test_dict(model_name, folder="ext_eval"):
    """Print one evaluation score per key for a pickle of ground/pred pairs.

    The pickle is expected to map each key to a ``{"ground", "pred"}`` entry.

    Args:
        model_name: Prefix of the pickle file (``<model_name>_for_ext_eval.pkl``).
        folder: Directory containing the pickle file.
    """
    print(model_name)
    data = _load_ext_eval_pickle(model_name, folder)
    for k in data:
        print("------k-----", k)
        print(_score_ground_vs_pred(data[k]))


def execute_():
    """Run the external evaluation for every saved system output, in order."""
    # (pickle-name prefix, whether the pickle holds a dict of ground/pred pairs)
    plan = (
        ("neutral", False),
        ("drgpreds", False),
        ("ntpcares", False),
        ("fgst", True),
        ("style", True),
        ("nacl", False),
    )
    for system_name, is_keyed in plan:
        if is_keyed:
            run_for_test_dict(system_name)
        else:
            run_for_test(system_name)

In [None]:
# Run the full external-evaluation sweep; each run prints its name and score.
execute_()