# Active Learning Test

## Load Pretrained model

In [1]:
import tensorflow as tf

In [2]:
# Directory that holds the checkpoints of the pretrained hybrid model.
saver_path = "./logs/rf3/hybrid/ckpt"

In [3]:
# List the checkpoint files available in the checkpoint directory.
%ls logs/rf3/hybrid/ckpt/

checkpoint                             model-380000.ckpt.index
model-360000.ckpt.data-00000-of-00001  model-380000.ckpt.meta
model-360000.ckpt.index                model-390000.ckpt.data-00000-of-00001
model-360000.ckpt.meta                 model-390000.ckpt.index
model-370000.ckpt.data-00000-of-00001  model-390000.ckpt.meta
model-370000.ckpt.index                model-final.ckpt.data-00000-of-00001
model-370000.ckpt.meta                 model-final.ckpt.index
model-380000.ckpt.data-00000-of-00001  model-final.ckpt.meta


In [4]:
# Read the checkpoint state stored under saver_path; it lists every checkpoint
# path that is still tracked there.
checkpoint_file = tf.train.get_checkpoint_state(saver_path)

# Show the first tracked checkpoint path.
first_checkpoint_path = checkpoint_file.all_model_checkpoint_paths[0]
print(first_checkpoint_path)

/home/homes/jhpark/hate-speech/logs/rf3/hybrid/ckpt/model-360000.ckpt


In [5]:
# Build the graph from the meta file of the SAME checkpoint whose weights are
# restored below. Previously the graph came from all_model_checkpoint_paths[0]
# (the first entry in the list) while the weights came from
# model_checkpoint_path, so graph definition and weights could come from
# different checkpoints.
restore_path = checkpoint_file.model_checkpoint_path
saver = tf.train.import_meta_graph("{}.meta".format(restore_path))

# Create the session used for evaluation: soft placement lets ops fall back to
# another device, and the process may claim up to 90% of the GPU memory.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
session_conf = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
sess = tf.Session(config=session_conf)

# Load the trained variable values into the imported graph.
saver.restore(sess, restore_path)

In [6]:
# Inspect the restored graph: collect the name of every node in its GraphDef.
graph = tf.get_default_graph()
graph_def = graph.as_graph_def()
[node.name for node in graph_def.node]

['input/X_word',
 'input/X_char',
 'input/labels',
 'input/one_hot/on_value',
 'input/one_hot/off_value',
 'input/one_hot/depth',
 'input/one_hot',
 'input/Reshape/shape',
 'input/Reshape',
 'dropout_keep_prob',
 'Const',
 'embedding/random_uniform/shape',
 'embedding/random_uniform/min',
 'embedding/random_uniform/max',
 'embedding/random_uniform/RandomUniform',
 'embedding/random_uniform/sub',
 'embedding/random_uniform/mul',
 'embedding/random_uniform',
 'embedding/W',
 'embedding/W/Assign',
 'embedding/W/read',
 'embedding/embedding_lookup',
 'embedding/ExpandDims/dim',
 'embedding/ExpandDims',
 'ExpandDims/dim',
 'ExpandDims',
 'channel0-conv-maxpool-1/truncated_normal/shape',
 'channel0-conv-maxpool-1/truncated_normal/mean',
 'channel0-conv-maxpool-1/truncated_normal/stddev',
 'channel0-conv-maxpool-1/truncated_normal/TruncatedNormal',
 'channel0-conv-maxpool-1/truncated_normal/mul',
 'channel0-conv-maxpool-1/truncated_normal',
 'channel0-conv-maxpool-1/W',
 'channel0-conv-maxpoo

## Load metadata & test set

Check whether the loaded graph computes correctly on the test set.

In [7]:
from data.hybrid import load_data_from_file

# Load the train/test splits, the initial embedding matrix and the vocabulary.
x_train, y_train, x_test, y_test, initW, vocab = load_data_from_file("racism_final2_binary")

# Derive the input dimensions from the first training example.
first_example = x_train[0]
word_text_len = first_example["word"].shape[0]
word_vocab_size = len(vocab.vocabulary_)
char_shape = first_example["char"].shape
char_text_len = char_shape[0]
char_vocab_size = char_shape[1]


Data Summary:
Train: Total Positive Labels=1750 (0.1421)
Test: Total Positive Labels=309 (0.1421)

dataset passed the assertion test


In [8]:
from data.hybrid import extract_from_batch

# Split the test examples into their word-level and char-level inputs.
batchW, batchC = extract_from_batch(x_test)

# Feed the whole test set; a keep probability of 1 is fed for dropout.
feed_dict = {
    "input/labels:0": y_test,
    "input/X_word:0": batchW,
    "input/X_char:0": batchC,
    "dropout_keep_prob:0": 1,
}


In [9]:
# Evaluate the restored model's prediction tensor on the test feed.
pred = sess.run(fetches="output/prediction:0", feed_dict=feed_dict)

In [10]:
from model.helper import calculate_metrics

# Score the restored model's predictions against the test labels.
metrics = calculate_metrics(y_test, pred)
precision, recall, f1 = metrics
report = "precision=%.4f recall=%.4f f1=%.4f" % (precision, recall, f1)
print(report)

precision=0.7006 recall=0.8026 f1=0.7481


Using TensorFlow backend.


Since the metrics are the same as the final training output, we can confirm that the pre-trained model has been loaded successfully.

## Extend the graph to compute softmax prob & entropy
Use the entropy of the softmax distribution to measure the model's uncertainty on each sample, and pick the most uncertain samples.

In [11]:
# Extend the restored graph with softmax probabilities and their entropy.
logits = graph.get_tensor_by_name("output/logits:0")
softmax_prob = tf.nn.softmax(logits, name="softmax")

# Take the log-probabilities from log_softmax rather than tf.log(softmax_prob):
# if a probability underflows to 0, tf.log gives -inf and 0 * -inf is NaN,
# which would corrupt the entropy ranking. log_softmax stays finite.
log_softmax_prob = tf.nn.log_softmax(logits, name="log_softmax")

# Entropy per example: H(p) = -sum_c p_c * log(p_c), summed over axis 1.
entropy = tf.negative(
    tf.reduce_sum(tf.multiply(softmax_prob, log_softmax_prob), axis=1),
    name="entropy",
)

# Number of candidates to return is fed at run time.
n_candidates = tf.placeholder(tf.int32, name="n_candidates")
# top_k returns the n_candidates largest entropies and their indices.
get_candidates = tf.nn.top_k(entropy, n_candidates, name="candidates")

In [12]:
# Request the 20 highest-entropy test examples along with all softmax outputs.
feed_dict[n_candidates] = 20
prob, candidates = sess.run(fetches=[softmax_prob, get_candidates], feed_dict=feed_dict)

In [13]:
# Positions of the highest-entropy examples returned by top_k.
idx = candidates.indices

In [14]:
# Softmax probabilities of the selected high-entropy examples.
prob[idx]

array([[ 0.50059927,  0.49940071],
       [ 0.49695459,  0.50304538],
       [ 0.49438867,  0.50561136],
       [ 0.49303412,  0.50696588],
       [ 0.49082008,  0.50917995],
       [ 0.48928291,  0.51071709],
       [ 0.51123738,  0.48876265],
       [ 0.48819947,  0.51180053],
       [ 0.48716697,  0.51283306],
       [ 0.51528043,  0.4847196 ],
       [ 0.48431545,  0.5156846 ],
       [ 0.48359284,  0.51640719],
       [ 0.48174015,  0.51825988],
       [ 0.48026633,  0.51973373],
       [ 0.48009303,  0.51990694],
       [ 0.47959855,  0.52040136],
       [ 0.47808233,  0.5219177 ],
       [ 0.52640969,  0.47359025],
       [ 0.47341239,  0.52658761],
       [ 0.52788436,  0.47211567]], dtype=float32)

As you can see, the most uncertain samples (probabilities near 0.5) are chosen.

## Load new unlabelled samples

In [1]:
import pandas as pd

# Read the crawled tweets (tab separated, first row skipped, malformed rows
# dropped) and keep one row per distinct tweet text.
# NOTE(review): `error_bad_lines` is deprecated in newer pandas releases in
# favour of `on_bad_lines="skip"`; switch when the pandas version is upgraded.
unlabelled = pd.read_csv(
    './data/crawled/unlabelled/racism_tweets.tsv',
    sep="\t",
    header=None,
    skiprows=[0],
    names=["Tweet_ID", "Text", "Previous"],
    error_bad_lines=False,
).drop_duplicates(subset=["Text"])

In [2]:
# Summary statistics for every column (numeric and non-numeric alike).
unlabelled.describe(include="all")

Unnamed: 0,Tweet_ID,Text,Previous
count,1340,1339,1003.0
unique,1304,1339,
top,#IslamIsTheProblem,@BreitbartLondon Sadiq Khan is a fifth columni...,
freq,19,1,
mean,,,2.742637e+17
std,,,3.794854e+17
min,,,3297241.0
25%,,,428086800.0
50%,,,2843121000.0
75%,,,7.747524e+17


In [3]:
# Pull the tweet texts out of the frame as a plain Python list and peek at them.
texts = unlabelled["Text"].tolist()
preview = texts[:10]
print(preview)

['#blackLivesMatter is just a way to HATE white people. Shouldnt #WhiteLivesMatter cuz ALL lives do? /', '#blacklivesmatter ???? Uh.... to who??? Not ME...... #WHITElivesmatter ^', '#blacklivesmatter ???? Uh.... to who??? Not ME...... #WHITElivesmatter ;;', 'BLM, Crooked Hillary, Obama are the REAL RACISTS!!!! #WhiteLivesMatter #BlueLivesMatter #FuckIdiots/', '.@scott_usmc @SouthFlaVet @CNNPolitics Is his life a "#WhiteLivesMatter a  #BluelivesMatter or a: #BlackLivesMatter or an: #AllLivesMatter?:', '@JackPosobiec @HURRICANEPAUL #WhiteGenocide Wake Up, Fight Back!', 'BLM, Crooked Hillary, Obama are the REAL RACISTS!!!! #WhiteLivesMatter #BlueLivesMatter #FuckIdiots|', '#blacklivesmatter ???? Uh.... to who??? Not ME...... #WHITElivesmatter *', '@TacoSalad85 @lporiginalg @YouTube Smash the Matriarchy 😂', '845396808943702016']


Remove tags that were too frequent in the previous dataset.

remove digits

In [5]:
# Step 1: drop entries that consist only of digits (e.g. stray tweet IDs that
# ended up in the text column).
without_digits = [tweet for tweet in texts if not str(tweet).isdigit()]

# Step 2: keep only entries with more than three space-separated tokens.
# This must consume the output of step 1; it previously re-read `texts`, which
# silently threw away the digit filter.
final_filtered = [tweet for tweet in without_digits if len(str(tweet).split(" ")) > 3]

In [6]:
# Number of tweets that survived the filters.
len(final_filtered)

1140

In [9]:
# Write the first 800 filtered tweets to the labelling file, one per line.
with open("./data/crawled/racism_to_be_labelled.tsv", "w") as out_file:
    out_file.writelines("%s\n" % tweet for tweet in final_filtered[:800])

## Split dataset into two

In [24]:
import os
import random

# Shuffle in place so the two halves taken afterwards are random subsets.
# NOTE(review): no seed is set and the list is mutated in place, so the order
# differs on every run of this cell.
random.shuffle(final_filtered)

In [25]:
# Midpoint of the shuffled list (rounded down).
half_index = len(final_filtered) // 2

In [26]:
# First half feeds random sampling, second half feeds uncertainty sampling.
pool_random_sampling, pool_uncertainty_sampling = (
    final_filtered[:half_index],
    final_filtered[half_index:],
)
for pool in (pool_random_sampling, pool_uncertainty_sampling):
    print(len(pool))

604
604


In [28]:
# Persist the random-sampling pool on first run; on later runs reload the saved
# pool instead, so the split stays the same.
random_pool_path = "./data/crawled/unlabelled/racism_random.tsv"

if os.path.isfile(random_pool_path):
    print("load from file")
    with open(random_pool_path, "r") as f:
        pool_random_sampling = [line.rstrip() for line in f]
    print(len(pool_random_sampling))
else:
    with open(random_pool_path, "w") as f:
        f.writelines(str(line) + "\n" for line in pool_random_sampling)
    print("Saved file")

Saved file


In [29]:
# Persist the uncertainty-sampling pool on first run; on later runs reload the
# saved pool instead, so the split stays the same.
# One path variable is used for the existence check, the write and the read:
# the load branch previously opened "sexism_uncertain.tsv", which would have
# silently replaced the racism pool with tweets from a different dataset.
uncertain_pool_path = "./data/crawled/unlabelled/racism_uncertain.tsv"

if not os.path.isfile(uncertain_pool_path):
    with open(uncertain_pool_path, "w") as f:
        for line in pool_uncertainty_sampling:
            f.write(str(line) + "\n")
    print("Saved file")
else:
    print("load from file")
    pool_uncertainty_sampling = []
    with open(uncertain_pool_path, "r") as f:
        for line in f:
            # Strip the trailing newline (and other trailing whitespace).
            pool_uncertainty_sampling.append(line.rstrip())
    print(len(pool_uncertainty_sampling))

Saved file


In [30]:
# List the unlabelled-data directory to confirm the pool files exist.
%ls ./data/crawled/unlabelled/

democrat_tweets.tsv   republican_tweets.tsv  unlabelled_data_analysis.ipynb
racism_random.tsv     sexism_random.tsv      youtube1.csv
racism_tweets.tsv     sexism_tweets.tsv      youtube2.csv
racism_uncertain.tsv  sexism_uncertain.tsv   youtube3.csv


## Prepare tsv for labelling

Randomly select N samples from pool_random_sampling.

In [77]:
# Number of tweets to draw at random from pool_random_sampling.
N = 200

In [79]:
# Draw N distinct tweets from the random-sampling pool (without replacement).
random_samples = random.sample(pool_random_sampling, k=N)

Prepare to feed pool_uncertainty_sampling into the classifier

In [33]:
from data.preprocess import preprocess_tweet

In [34]:
# Preprocess every tweet in the uncertainty pool.
preprocessed = [preprocess_tweet(str(raw)) for raw in pool_uncertainty_sampling]

# Keep only tweets whose preprocessed form is non-empty, keeping the raw and
# preprocessed lists aligned index by index.
surviving = [
    (raw, cleaned)
    for raw, cleaned in zip(pool_uncertainty_sampling, preprocessed)
    if cleaned
]
valid_tweets = [pair[0] for pair in surviving]
valid_tweets_preprocessed = [pair[1] for pair in surviving]

print(len(valid_tweets))
print(len(valid_tweets_preprocessed))
print(valid_tweets_preprocessed[:10])

569
569
['to ignore sweden is to invite the same here. bansharia &amp; endislamnow', 'terror islam needs a reformation. islamistheproblem terrorism notlonewolf', 'i understand your situation. you wanted to defend your cult but your peaceful brothers always fail you', 'listenin to strip it down makes me wanna throw more country that sounds like the white version of r&amp;b on my phone whitelivesmatter', "frog avi, whitelivesmatter in the bio. dude, it's not worth it. save your blood pressure, mute &amp; let them scream into the ether", 'so you are defending a jew hating +murdering sand nazi, wtg jezebel, you whorish retards stopislam', 'try islam, his religion, + you win the prize, lol stopislam', 'saying that 3 times makes you thrice the idiot 😝', "silly point for india is a deadset cheat, moving on to the pitch as the delivery is in the air... and we're the cheats! racecard indvaus", 'what a absolute surprise! london islamistheproblem rockvillerape isis isisisgay']


### char features

In [35]:
from data.char import text_to_1hot_matrix
import numpy as np

In [36]:
# Encode every preprocessed tweet as a one-hot character matrix and stack them.
char_matrices = [text_to_1hot_matrix(str(text)) for text in valid_tweets_preprocessed]
pool_char = np.array(char_matrices)
print(pool_char.shape)

(569, 140, 70)


### word features

In [45]:
from data.word import load_data_from_file as load_vocabulary
from data.tokenizer import tokenize_with_dictionary
import numpy as np

In [46]:
# Reload the data through the word-level loader, keeping only x_test and vocab.
# NOTE: this rebinds `x_test` and `vocab`, which an earlier cell had set from
# the data.hybrid loader.
_, _, x_test, _, _, vocab = load_vocabulary("racism_final2_binary")

In [47]:
# Tokenize each preprocessed tweet against the words known to the vocabulary.
known_words = vocab.vocabulary_._mapping.keys()
tokenized = [tokenize_with_dictionary(text, known_words) for text in valid_tweets_preprocessed]

In [48]:
# Peek at the first ten tokenized tweets.
print(tokenized[:10])

[['to', 'ignore', 'sweden', 'is', 'to', 'invite', 'the', 'same', 'here', 'ban', 'sharia', 'end', 'islam', 'now'], ['terror', 'islam', 'needs', 'a', 'reformation', 'islam', 'is', 'the', 'problem', 'terrorism', 'not', 'lonewolf'], ['i', 'understand', 'your', 'situation', 'you', 'wanted', 'to', 'defend', 'your', 'cult', 'but', 'your', 'peaceful', 'brothers', 'always', 'fail', 'you'], ['listen', 'in', 'to', 'strip', 'it', 'down', 'makes', 'me', 'wanna', 'throw', 'more', 'country', 'that', 'sounds', 'like', 'the', 'white', 'version', 'of', 'r', 'b', 'on', 'my', 'phone', 'white', 'lives', 'matter'], ['frog', 'avi', 'white', 'lives', 'matter', 'in', 'the', 'bio', 'dude', "it's", 'not', 'worth', 'it', 'save', 'your', 'blood', 'pressure', 'mute', 'let', 'them', 'scream', 'into', 'the', 'ether'], ['so', 'you', 'are', 'defending', 'a', 'jew', 'hating', 'murdering', 'sand', 'nazi', 'wtg', 'jezebel', 'you', 'whorish', 'retards', 'stop', 'islam'], ['try', 'islam', 'his', 'religion', 'you', 'win', 't

In [49]:
# No tokenized tweet may be longer than the second dimension of x_test.
longest_tweet = max(map(len, tokenized))
assert longest_tweet <= x_test.shape[1]

In [50]:
# Re-join the tokens so the vocabulary processor can map them to word ids.
joined = [" ".join(tokens) for tokens in tokenized]

# Use transform(), not fit_transform(): the ids must come from the vocabulary
# the model was trained with. fit_transform() would also (re)fit the processor
# on the unlabelled pool, which risks changing the word-to-id mapping that the
# restored embedding matrix relies on.
# NOTE(review): assumes `vocab` exposes transform() like tf.contrib.learn's
# VocabularyProcessor — confirm against data.word.
pool_word = np.array(list(vocab.transform(joined)))
print(pool_word.shape)

(569, 35)


## Plug into the model

In [69]:
# Build the feed for the unlabelled pool. An all-zero (n, 1) array is fed for
# the labels placeholder, and a keep probability of 1 for dropout.
n = len(valid_tweets)
unlabelled_feed_dict = {
    "input/labels:0": np.zeros((n, 1)),
    "input/X_word:0": pool_word,
    "input/X_char:0": pool_char,
    "dropout_keep_prob:0": 1,
}

In [70]:
# Ask for the 100 highest-entropy tweets in the pool, plus every softmax output.
unlabelled_feed_dict[n_candidates] = 100
_prob, _candidates = sess.run(
    fetches=[softmax_prob, get_candidates],
    feed_dict=unlabelled_feed_dict,
)
idx = _candidates.indices

In [71]:
# Convert to a numpy array so the tweets can be indexed with index arrays.
# NOTE: this rebinds `valid_tweets` from a list to an ndarray.
valid_tweets = np.array(valid_tweets)


### find the best predictions

In [72]:
# Column 1 of the softmax output is used as the positive-class probability.
positive_prob = _prob[:, 1]

# Indices of the 100 largest positive probabilities (argpartition does not sort
# them among themselves).
top_pos_idx = np.argpartition(positive_prob, -100)[-100:]

for tweet, score in zip(valid_tweets[top_pos_idx], positive_prob[top_pos_idx]):
    print("%s %s" % (tweet, score))


@IraqiSecurity @Franjota I hope they get the bastard #stopIslam 0.728043
Close your borders England, France, Germany and Italy! Stop being politically correct! It's killing you. #LondonTerrorAttack #NoRefugees 0.732504
@Tmchakra1 What another SATANIC LIE. #NOMUSLIMUSA #NOSHARIA #BANALLMUSLIMS #ALLAHSUX 0.733567
Don't #PrayForLondon fight back against the worst religion in the world, #StopIslam 0.883715
Don't be in the herd of fools believing the "Radical Islam" lie. Stop and THINK... It's not that hard to do.  0.882549
@AsadAli2047 Your country executes blasphemers, Islamofascist savage #StopIslam #NotAllMuslims 0.863361
@DeplorableKatie What an OXYMORON. NO MUSLIMS ARE AMERICANS. #NOMUSLIMUSA #NOSHARIA #BANALLMUSLIMS #ALLAHSUX 0.762895
@fitzscrounger @tfowkes02 @Channel4News deflect,deflect,deflect, thats all you have huh, he who denies #IslamIsTheProblem is the loon #Facts 0.833966
@BateZen @TRobinsonNewEra Sort of blows your 'one singer' analogy right out of the water, doesn't it? #

It seems like the model is biased towards certain hashtags like #notallmen #gamergate

### find the most uncertain predictions

In [73]:
# Raw tweets at the highest-entropy positions returned by top_k.
candidate_tweets = valid_tweets[idx]
print(candidate_tweets)

[ "Oh but aren't these the poor women and children they want us to welcome with open arms?? \U0001f644 #NoRefugees"
 '#Muslim controlled area of #Luton celebrates #londonattack #NoGoZone #Bury #IslamIsTheProblem #BanIslam #Deport #Pedophiles Promises more...'
 "Is saying that Arabs used white women as sex slaves for 100's of years xenophobic? Cuz it is true #slavery #slaves #racism #STOPIslam"
 '@RealGauharAli  the main reason behind terrorism is the existence of people like you #stopislam @Trufflesmum'
 'Khalid Masood aka Adrian Elms a dindu, converted to Islam after being jailed for stabbing a man in the face. #StopIslam #BlackLiesMatter'
 '@FloEshalomi @SadiqKhan @cllrbray Go back to Middle East and spread your Islam spawn there! #STOPIslam'
 '@ryyma12 Sorry you liar. ISIS is 100% Islam #PrayForLondonِ #StopIslam'
 "@UNRefugeeAgency @TheCut The refugee crisis is nothing more than another Islamic invasion of the West. That's the truth! #NoRefugees"
 '@zakSkybrush2 your stats are as w

### combine both and make into file

In [74]:
# Uncertain candidates first, then the top positive predictions.
# NOTE(review): an index present in both arrays appears twice in the result.
combined_idx = np.concatenate([idx, top_pos_idx])

In [89]:
# Tweets chosen by the model (uncertain + top positive).
active_learning_samples = valid_tweets[combined_idx]

# Tag each tweet with its origin: 1 = active-learning selection, 2 = random.
total_tweets = []
total_tweets.extend((1, tweet) for tweet in active_learning_samples)
total_tweets.extend((2, tweet) for tweet in random_samples)

Shuffle the tweets before writing them out for labelling.

In [90]:
# Shuffle in place so the two sources are interleaved in the output file.
random.shuffle(total_tweets)

In [95]:
# Write one "<source tag>\t<tweet>" row per sample to the labelling file.
with open("./data/crawled/racism_to_be_labelled.tsv", "w") as out_file:
    out_file.writelines(
        "%s\t%s\n" % (source_tag, tweet) for source_tag, tweet in total_tweets
    )