In [49]:
import pandas as pd
import spacy
import joblib
import plotly as py
import cufflinks as cf
import plotly.graph_objs as go
from modules.utils import file_ops
from modules.utils import EmotionDetection
from modules.utils import settings
from modules.pattern_classifier import SimpleClassifier, PatternVectorizer

## Load pipelines

In [None]:
# Instantiate EmotionDetection from modules.utils; the object is bound to
# `emotion_api` (it is not used by the cells visible in this part of the notebook).
emotion_api = EmotionDetection.EmotionDetection()
# Switch plotly to offline notebook mode so figures render inside the notebook.
# NOTE(review): per the plotly docs, connected=True loads plotly.js from the CDN
# instead of embedding it, so rendering needs network access -- confirm this is intended.
py.offline.init_notebook_mode(connected=True)
# Put cufflinks (the pandas `.iplot` bridge to plotly) into offline mode as well.
cf.go_offline()
# Load the English spaCy pipeline through the 'en' shortcut name.
# NOTE(review): shortcut names such as 'en' were removed in spaCy v3; if the
# environment is upgraded, load the full package name (e.g. 'en_core_web_sm') instead.
nlp = spacy.load('en')

## Load persisted models

In [11]:
# Locations of the two persisted artefacts: the trained classifier and the
# pattern vectorizer that produces its input features.
cls_persistence, pv_persistence = (
    'data/persistence/simple_classifier_model.pkl.compressed',
    'data/persistence/pattern_vectorizer.pkl.compressed',
)

# joblib.load unpickles the files, so only load artefacts from a trusted source.
# The classifier is loaded first, then the vectorizer.
cls, pv = map(joblib.load, (cls_persistence, pv_persistence))

## Load CrowdFlower dataset

In [15]:
# Read the full CrowdFlower export.
df = pd.read_csv('data/datasets/twitter-hate-speech-classifier.csv', encoding='utf-8')

# Rows whose label is exactly the "contains hate speech" answer.
is_hate_speech = df['does_this_tweet_contain_hate_speech'] == 'The tweet contains hate speech'

# Columns kept for the rest of the analysis.
kept_columns = [
    '_unit_id',
    '_unit_state',
    '_trusted_judgments',
    'does_this_tweet_contain_hate_speech:confidence',
    'tweet_id',
    'tweet_text',
]

# Filter rows and columns in a single selection, then show (rows, columns).
contains_hatespeech = df.loc[is_hate_speech, kept_columns]
contains_hatespeech.shape

(2399, 6)

In [12]:
# Plot trusted judgments against confidence (disabled: the plotting code below is commented out)
# data = [
#     go.Scatter(
#         x=contains_hatespeech['_trusted_judgments'], # x axis: the '_trusted_judgments' column
#         y=contains_hatespeech['does_this_tweet_contain_hate_speech:confidence']
#     )
# ]

# layout = go.Layout(
#     title='Confidence vs Trusted Judgments',
#     yaxis=dict(title='confidence level'),
#     xaxis=dict(title='# trusted judgments')
# )
# fig = go.Figure(data=data, layout=layout)
# py.offline.iplot(fig)
# test.iplot(kind='histogram')
# result["ambiguous"]

## Vectorize and classify tweets

In [36]:
# Work on the first 20 hate-speech tweets only; keep a one-column DataFrame.
hatespeech_subsample = contains_hatespeech[['tweet_text']].head(20)

# Turn the raw tweet text into feature vectors with the persisted vectorizer.
subsample_texts = hatespeech_subsample['tweet_text']
hatespeech_vec = pv.transform(subsample_texts)

# Preview the first five vectors.
hatespeech_vec[:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 2, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [90]:
# The classifier can return up to 8 emotions per tweet; n=3 is requested here.
HS_GUESS = cls.get_top_classes(hatespeech_vec, ascending=True, n=3)

# Pair every tweet in the subsample with the classifier's guess for the same row.
tweet_texts = hatespeech_subsample['tweet_text']
result = [
    (tweet_texts.iloc[row], HS_GUESS[row])
    for row in range(len(hatespeech_subsample))
]

# Tabulate the (text, emotions) pairs and display them.
result_frame = pd.DataFrame(result, columns=('text', 'emotions'))
result_frame

Unnamed: 0,text,emotions
0,Fuck dykes,"[disgust, anticipation, joy]"
1,@sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...,"[fear, sadness, surprise]"
2,"""@jayswaggkillah: ""@JacklynAnnn: @jayswaggkill...","[anticipation, surprise, joy]"
3,@elaynay your a dirty terrorist and your relig...,"[anger, surprise, fear]"
4,RT @ivanrabago_: @_WhitePonyJr_ looking like f...,"[fear, joy, surprise]"
5,Well I thought you knew actually RT @KingHorse...,"[surprise, anger, trust]"
6,#VoteBlue2014 Yeah. CUZ 8 million people in fa...,"[surprise, anger, joy]"
7,"@AndreBerto word is you use roids, stupid hypo...","[trust, anger, surprise]"
8,I hate faggots like you,"[anger, surprise, sadness]"
9,@MoriTaheripour shut up nigger whore! Hope u g...,"[anger, trust, sadness]"


In [98]:
# Inspect how spaCy tags a single classified tweet.
# Row 12 of result_frame is the hand-picked example.
hs_text = result_frame['text'].iloc[12]

# Run the tweet through the spaCy pipeline loaded as `nlp`.
doc = nlp(hs_text)

# Two leftover debugging expressions were removed from this cell: a vocab
# lookup on doc[16] and a list of entity labels. Both values were discarded,
# and the hard-coded token index 16 raised IndexError for any tweet with
# fewer than 17 tokens.

# Print each token's surface form next to its fine-grained tag (Token.tag_).
for d in doc:
    print(d.orth_, d.tag_)

Good JJ
night NN
fags NNS
and CC
fagettes NNS
( -LRB-
that DT
's VBZ
the DT
female JJ
version NN
of IN
fags NNS
like IN
fag NN
- HYPH
ettes NNS
to TO
make VB
it PRP
female JJ
) -RRB-
I PRP
hate VBP
you PRP
all DT
. .


In [99]:
# Same tweet as above, this time showing the coarse part-of-speech label
# (Token.pos_) next to each token's surface form.
for token in doc:
    print(token.orth_, token.pos_)

Good ADJ
night NOUN
fags NOUN
and CONJ
fagettes NOUN
( PUNCT
that DET
's VERB
the DET
female ADJ
version NOUN
of ADP
fags NOUN
like ADP
fag NOUN
- PUNCT
ettes NOUN
to PART
make VERB
it PRON
female ADJ
) PUNCT
I PRON
hate VERB
you PRON
all DET
. PUNCT
