# Error analysis

## Load pretrained models

In [1]:
import tensorflow as tf
import numpy as np

In [2]:
# Checkpoint directories of the three trained models, in the order char, word, hybrid.
saver_paths = [
    "./logs/rf2/char/ckpt",
    "./logs/rf2/word/ckpt",
    "./logs/rf2/hybrid/ckpt",
]
# Look up the checkpoint state recorded in each directory.
checkpoint_files = [tf.train.get_checkpoint_state(path) for path in saver_paths]

In [3]:
# Show the checkpoint state found for each entry of `saver_paths`.
checkpoint_files

[model_checkpoint_path: "/home/homes/jhpark/hate-speech/logs/rf2/char/ckpt/model-final.ckpt"
 all_model_checkpoint_paths: "/home/homes/jhpark/hate-speech/logs/rf2/char/ckpt/model-final.ckpt",
 model_checkpoint_path: "/home/homes/jhpark/hate-speech/logs/rf2/word/ckpt/model-final.ckpt"
 all_model_checkpoint_paths: "/home/homes/jhpark/hate-speech/logs/rf2/word/ckpt/model-final.ckpt",
 model_checkpoint_path: "/home/homes/jhpark/hate-speech/logs/rf2/hybrid/ckpt/model-final.ckpt"
 all_model_checkpoint_paths: "/home/homes/jhpark/hate-speech/logs/rf2/hybrid/ckpt/model-final.ckpt"]

In [4]:
# Session configuration shared by the evaluation sessions opened later on.
# No session is created here; only the config object is built.
gpu_options = tf.GPUOptions(
    per_process_gpu_memory_fraction=0.9,
)
session_conf = tf.ConfigProto(
    gpu_options=gpu_options,
    allow_soft_placement=True,
)


In [5]:
from data.hybrid import load_data_from_file

# Load the train/test split together with `initW` and the vocabulary object.
x_train, y_train, x_test, y_test, initW, vocab = load_data_from_file("racism_final2_binary")

# Input dimensions, read off the first training example.
char_text_len, char_vocab_size = x_train[0]["char"].shape[:2]
word_text_len = x_train[0]["word"].shape[0]
word_vocab_size = len(vocab.vocabulary_)



Data Summary:
Train: Total Positive Labels=1750 (0.1421)
Test: Total Positive Labels=309 (0.1421)

dataset passed the assertion test


In [6]:
from data.hybrid import extract_from_batch

batchW, batchC = extract_from_batch(x_test)

# One feed dict per checkpoint, in the same order as `saver_paths`
# (char, word, hybrid). Keys are the tensor names inside each saved graph.
feed_dicts = [
    {
        "input/Placeholder_1:0": y_test,
        "input/Placeholder:0": batchC,
        "nn-layers/fully-connected-layer-0/dropout_1/keras_learning_phase:0": 0,
    },
    {
        "input/labels:0": y_test,
        "input/X:0": batchW,
        "dropout_keep_prob:0": 1,
    },
    {
        "input/labels:0": y_test,
        "input/X_word:0": batchW,
        "input/X_char:0": batchC,
        "dropout_keep_prob:0": 1,
    },
]

# Name of the prediction op to fetch from each graph (":0" is appended at run time).
output_names = [
    "prediction/prediction",
    "output/prediction",
    "output/prediction",
]

In [7]:
# Run every pretrained model on the test set and collect its predictions.
# The three lists are parallel (one entry per model), so fail loudly on a mismatch.
assert len(checkpoint_files) == len(output_names) == len(feed_dicts)

preds = []
for ckpt, output_name, feed_dict in zip(checkpoint_files, output_names, feed_dicts):
    # Use one and the same checkpoint for both the graph definition (.meta)
    # and the weights, instead of mixing `all_model_checkpoint_paths[0]`
    # with `model_checkpoint_path`.
    ckpt_path = ckpt.model_checkpoint_path
    print(ckpt_path)

    # Start from an empty default graph before importing the next model.
    tf.reset_default_graph()
    saver = tf.train.import_meta_graph("{}.meta".format(ckpt_path))

    with tf.Session(config=session_conf) as sess:
        saver.restore(sess, ckpt_path)
        pred = sess.run(output_name + ":0", feed_dict)
        print(pred.shape)
        preds.append(pred)

/home/homes/jhpark/hate-speech/logs/rf2/char/ckpt/model-final.ckpt
(2175,)
/home/homes/jhpark/hate-speech/logs/rf2/word/ckpt/model-final.ckpt
(2175,)
/home/homes/jhpark/hate-speech/logs/rf2/hybrid/ckpt/model-final.ckpt
(2175,)


In [8]:
from model.helper import calculate_metrics

# Precision / recall / F1 on the test set, one line per model in `preds`.
for model_pred in preds:
    p, r, f = calculate_metrics(y_test, model_pred)
    print("precision=%.4f recall=%.4f f1=%.4f" % (p, r, f))

precision=0.6667 recall=0.7832 f1=0.7202
precision=0.7110 recall=0.7961 f1=0.7511
precision=0.7086 recall=0.8026 f1=0.7527


Using TensorFlow backend.


Since these metrics are the same as the final output, we can confirm that the pre-trained models have been loaded successfully.

## Load original test data

In [16]:
from data.preprocess import load_from_file
# Load the same dataset name as above through the preprocess loader; the result
# is indexed by string keys such as "x_test" in the cells below.
data = load_from_file("racism_final2_binary")

In [17]:
# Peek at the first 10 raw test texts.
data["x_test"][:10]

array([ 'there are no ukr nazis. get an education. bandera did not hold a nazi ideaology.',
       'yep - karlie kloss et al are shaking in their boots. mkr',
       'miles of nothing but farmland in either direction, here stands a lone palm tree. california.',
       'anonymous hacker group claims to have taken down more than 1,000 isis sites and emails. opisis',
       'and now mods of ghazi are under fire by a mob that appears to have been led by you. :(',
       "it's official: the fcc will motion to have the internet filed under title ii as a utility.",
       'so i can sympathize and understand where this is all coming from.',
       'thank you',
       "how's colin and these rounds working for you.. pretty crap i'd say from the tweets.. needarethinkinformat mkr",
       'it was sunny out today! :p i went to the dog park for an hour.'], 
      dtype='<U157')

In [18]:
from data.char import one_hot_to_chars

# Decode the first 10 character-encoded test inputs back to strings so they can
# be compared by eye with the raw texts from `data["x_test"]`.
["".join(one_hot_to_chars(encoded)) for encoded in batchC[:10]]

['therearenoukrnazis.getaneducation.banderadidnotholdanaziideaology.',
 'yep-karlieklossetalareshakingintheirboots.mkr',
 'milesofnothingbutfarmlandineitherdirection,herestandsalonepalmtree.california.',
 'anonymoushackergroupclaimstohavetakendownmorethan1,000isissitesandemails.opisis',
 'andnowmodsofghaziareunderfirebyamobthatappearstohavebeenledbyyou.:(',
 'itsofficial:thefccwillmotiontohavetheinternetfiledundertitleiiasautility.',
 'soicansympathizeandunderstandwherethisisallcomingfrom.',
 'thankyou',
 'howscolinandtheseroundsworkingforyou..prettycrapidsayfromthetweets..needarethinkinformatmkr',
 'itwassunnyouttoday!:piwenttothedogparkforanhour.']

## Loading Baseline

In [19]:
import numpy as np
from sklearn.linear_model import LinearRegression
from data.preprocess import load_from_file
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [20]:
def evaluate(pred_scores, target, threshold=0.5, label="Training"):
    """Binarize scores, print precision/recall/F1 against `target`, and return F1.

    Args:
        pred_scores: iterable of numeric scores, one per example.
        target: ground-truth labels, passed unchanged to `calculate_metrics`.
        threshold: scores greater than or equal to this value become 1,
            all others become 0. Defaults to 0.5.
        label: prefix of the printed line. Defaults to "Training" so existing
            calls print exactly what they printed before; pass e.g. "Test"
            when scoring held-out data so the output is not mislabeled.

    Returns:
        The F1 value returned by `calculate_metrics`.
    """
    pred = [1 if score >= threshold else 0 for score in pred_scores]
    precision, recall, f1 = calculate_metrics(target, pred)
    print("%s: Precision=%.2f Recall=%.2f, F1=%.2f" % (label, precision, recall, f1))
    return f1

In [21]:
def lr_with_freq(data):
    """Fit a linear-regression baseline on TF-IDF weighted character n-grams.

    Character 2- to 5-grams are counted on `data["x_train"]`, re-weighted with
    TF-IDF, and used to fit a `LinearRegression` against `data["y_train"]`.
    `evaluate` is called first on the training split and then on the test
    split, so the second printed metrics line describes the test data.

    Args:
        data: mapping with the keys "x_train", "y_train", "x_test", "y_test".

    Returns:
        A tuple `(model, test_scores)`: the fitted regressor and its raw
        (un-thresholded) predictions for `data["x_test"]`.
    """
    ngram_counter = CountVectorizer(ngram_range=(2,5), analyzer="char")
    train_counts = ngram_counter.fit_transform(data["x_train"])
    print(train_counts.shape)

    tfidf = TfidfTransformer()
    train_features = tfidf.fit_transform(train_counts)

    model = LinearRegression()
    model.fit(train_features, data["y_train"])
    train_scores = model.predict(train_features)
    evaluate(train_scores, data["y_train"])

    # Reuse the vocabulary and IDF weights fitted on the training split.
    test_features = tfidf.transform(ngram_counter.transform(data["x_test"]))
    test_scores = model.predict(test_features)
    evaluate(test_scores, data["y_test"])
    return model, test_scores

In [22]:
# Fit the baseline; keep only its raw test-set scores (the fitted model is discarded).
_, baseline_pred = lr_with_freq(data)

(12312, 195567)
Training: Precision=1.00 Recall=1.00, F1=1.00
Training: Precision=0.73 Recall=0.64, F1=0.68


In [23]:
# Turn the raw baseline scores into 0/1 labels using the same 0.5 cut-off as `evaluate`.
baseline_pred = [1 if score >= 0.5 else 0 for score in baseline_pred]

In [25]:
# Sanity check: number of baseline predictions (compare with the prediction shapes printed for the CNN models).
len(baseline_pred)

2175

## Baseline vs CNN

### Char vs Word

In [41]:
# `preds` follows the order of `saver_paths`: char, word, hybrid.
char_pred, word_pred, hybrid_pred = preds[0], preds[1], preds[2]


In [49]:
# Compare the char and word models on test examples whose true label is 1.
#   word_right: char model predicted 0, word model predicted 1
#   word_wrong: char model predicted 1, word model predicted 0
# Iterate over the arrays that are actually compared instead of sizing the loop
# by the unrelated `baseline_pred`.
word_right = []
word_wrong = []
for text, char_p, word_p, truth in zip(data["x_test"], char_pred, word_pred, y_test):
    if char_p == 0 and word_p == 1 and truth == 1:
        word_right.append(text)
    if char_p == 1 and word_p == 0 and truth == 1:
        word_wrong.append(text)
    
                             

In [50]:
# Number of label-1 test examples that the word model predicted as 1 and the char model as 0.
len(word_right)

21

In [51]:
# Number of label-1 test examples that the char model predicted as 1 and the word model as 0.
len(word_wrong)

17

In [52]:
# Texts where only the word model (not the char model) predicted the true label 1.
word_right

['our journalists watch isis burn a man in a cage, and they respond with the same tired cliche about "hijacking the religion" bs! it\'s islam',
 'if someone is going to die getting isis out of tikrit, it may as well be the shia militia. that is all good. islam daesh',
 'the holocaust was by nazis. and the people who fought, killed and stopped them where christian westerners.',
 "it's not about satire or free speech. pretending it is will solve nothing. the problem is that the quran is full of incitement for murder.",
 "the reason these politically correct idiots will never solve problem is because they won't acknowledge that it's islam",
 'baby you can drive my car asian juicyasian sex nsfw adult xxx',
 'the question is not about people doing bad things, it is about the religion supporting and demanding those things.',
 'sambo coon',
 'have they told the tribe yet that their daughters would become whores for sexual jihad?',
 'wrong. apostacy is the equivalent of leaving america and bec

In [53]:
# Texts where only the char model (not the word model) predicted the true label 1.
word_wrong

["women did't lead squat. a woman can never be caliph. her testimony is half that of a man. she get's nothing for jihad.",
 'islam is a believe system based mainly on illusion. overlooking gross barbarity is necessary.',
 'socialists and moslems - working together to destroy australia.',
 'wrong, microbrain, she was 6, and you, like your prophet, are a liar. here is the proof',
 'the kurds clean up the isis islamofascist trash in kobane. kobane is the graveyard of isis. via',
 "at least i don't follow pedophile prophets or believe in mythical creatures of fire called jinn.",
 'yes, if we just sit on our butts and let them murder and enslave ezidis and christians, ...',
 'yes, they tried to murder their way to heaven. a total idiot would believe in such an allah.',
 'arab man arrested trying to enter same nuclear facility warned: islamic_terror_bomb_waiting_to_go_off.php',
 'looks like a slum, as people are forced to go to mosque at gun point. daesh are creating hundreds of thousands of

### Word vs Hybrid

In [55]:
# Compare the word and hybrid models on test examples whose true label is 1.
#   hybrid_right: word model predicted 0, hybrid model predicted 1
#   hybrid_wrong: word model predicted 1, hybrid model predicted 0
# Iterate over the arrays that are actually compared instead of sizing the loop
# by the unrelated `baseline_pred`.
hybrid_right = []
hybrid_wrong = []
for text, word_p, hybrid_p, truth in zip(data["x_test"], word_pred, hybrid_pred, y_test):
    if word_p == 0 and hybrid_p == 1 and truth == 1:
        hybrid_right.append(text)
    if word_p == 1 and hybrid_p == 0 and truth == 1:
        hybrid_wrong.append(text)
    
                             

In [56]:
# Texts where only the hybrid model (not the word model) predicted the true label 1.
hybrid_right

['wheras the arabs really are behaving people and you scum are ignoring it.',
 'wrong, microbrain, she was 6, and you, like your prophet, are a liar. here is the proof',
 "at least i don't follow pedophile prophets or believe in mythical creatures of fire called jinn.",
 'yes, they tried to murder their way to heaven. a total idiot would believe in such an allah.',
 'looks like a slum, as people are forced to go to mosque at gun point. daesh are creating hundreds of thousands of atheists.',
 'you only have to count the corpses to see that islamic extremism is widespread.jewish not',
 'watching shia militia beat a peshmerga to death, i thought, "praying to allah 5 times a day certainly peaks the level of fanatical hatred"',
 'men and women are divided by propaganda, pushed by homosexuals, on behalf of zionist jews &gt;&gt; &lt;&lt; mgtow feminazi',
 "you don't guarantee them protection, you guarantee them third class citizenship and extortion."]

In [57]:
# Texts where only the word model (not the hybrid model) predicted the true label 1.
hybrid_wrong

['the holocaust was by nazis. and the people who fought, killed and stopped them where christian westerners.',
 'muslim found guilty of plotting to behead british soldiers-cases like this need the death penalty',
 'in the uk like idiots we give jihadis rehabilitation, jordan execute them. we are a joke',
 'where is your evidence that they are furious goat fucker?',
 "i don't care what they retracted. the no go zones are there. here is an example.",
 'allah the terrorist can stick what he wants up his giant ass.',
 'why do you think israel is most technological country in that part of world']