In [1]:
import pandas as pd
from tqdm import tqdm
import spacy
from itertools import chain
import joblib
import plotly as py
import cufflinks as cf
import plotly.graph_objs as go
from modules.utils import file_ops
from modules.utils import EmotionDetection
from modules.utils.CustomTwokenizer import CustomTwokenizer
from modules.utils import settings
from modules.pattern_classifier import SimpleClassifier, PatternVectorizer

## Load Pipelines

In [2]:
# Instantiate the project's emotion-detection helper from modules.utils.EmotionDetection.
emotion_api = EmotionDetection.EmotionDetection()
# Switch plotly to offline notebook mode so figures render inside the notebook.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead of
# embedding it in the notebook — confirm against the installed plotly version.
py.offline.init_notebook_mode(connected=True)
# cufflinks offline mode is currently disabled.
# cf.go_offline()

In [3]:
# Name of the spaCy model package to load.
spacy_model = "en_core_web_md"
# Load the pipeline, handing the project's CustomTwokenizer to the `create_make_doc` hook
# (presumably so tweets are tokenized by the custom tokenizer — TODO confirm).
nlp = spacy.load(spacy_model, create_make_doc=CustomTwokenizer)
# Print the metadata of the installed model.
spacy.info(spacy_model)


    [93mInfo about model en_core_web_md[0m

    lang               en             
    url                https://explosion.ai
    description        General-purpose English model, with tagging, parsing, entities and word vectors
    source             /home/jherez/Dev/thesis-preprocessing/venv/lib/python3.5/site-packages/en_core_web_md/en_core_web_md-1.2.1
    spacy_version      >=1.7.0,<2.0.0 
    name               core_web_md    
    version            1.2.1          
    license            CC BY-SA 3.0   
    email              contact@explosion.ai
    author             Explosion AI   
    link               /home/jherez/Dev/thesis-preprocessing/venv/lib/python3.5/site-packages/spacy/data/en_core_web_md



## Load Persisted models

In [3]:
# Load the persisted classifier and pattern vectorizer. The cells below call
# `pv.transform(...)` and `cls.get_top_classes(...)`, so both names must be
# defined for the notebook to run on a fresh kernel.
# NOTE(review): joblib.load unpickles arbitrary Python objects — only load
# persistence files that come from a trusted source.
cls_persistence = 'data/persistence/simple_classifier_model.pkl.compressed'
pv_persistence = 'data/persistence/pattern_vectorizer.pkl.compressed'
cls = joblib.load(cls_persistence)
pv = joblib.load(pv_persistence)

## Load CrowdFlower dataset

In [3]:
# Read the CrowdFlower judgments, then keep only the rows labelled as containing
# hate speech, restricted to the columns used further down.
df = pd.read_csv('data/datasets/twitter-hate-speech-classifier.csv', encoding='utf-8')
contains_hatespeech = df.loc[
    df['does_this_tweet_contain_hate_speech'] == 'The tweet contains hate speech',
    [
        '_unit_id',
        '_unit_state',
        '_trusted_judgments',
        'does_this_tweet_contain_hate_speech:confidence',
        'tweet_id',
        'tweet_text',
    ]
]
contains_hatespeech.shape

(2399, 6)

## Vectorize and classify tweets

In [5]:
# First 20 hate-speech tweets, kept as a one-column DataFrame, then vectorized
# with the persisted pattern vectorizer.
hatespeech_subsample = contains_hatespeech.iloc[:20][['tweet_text']]
hatespeech_vec = pv.transform(hatespeech_subsample['tweet_text'])

In [6]:
# Ask the classifier for 3 emotion classes per vectorized tweet
# (it can return up to 8 emotions; n=3 is what is requested here).
HS_GUESS = cls.get_top_classes(hatespeech_vec, ascending=True, n=3)
# Pair every tweet text with its predicted emotions. A comprehension keeps the
# row index local, so it no longer overwrites the notebook-level `doc` that the
# spaCy cells read.
result = [
    (hatespeech_subsample['tweet_text'].iloc[row], HS_GUESS[row])
    for row in range(len(hatespeech_subsample))
]
result_frame = pd.DataFrame(result, columns=('text', 'emotions'))
result_frame.head()

Unnamed: 0,text,emotions
0,Fuck dykes,"[disgust, anticipation, joy]"
1,@sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...,"[fear, sadness, surprise]"
2,"""@jayswaggkillah: ""@JacklynAnnn: @jayswaggkill...","[anticipation, surprise, joy]"
3,@elaynay your a dirty terrorist and your relig...,"[anger, surprise, fear]"
4,RT @ivanrabago_: @_WhitePonyJr_ looking like f...,"[fear, joy, surprise]"


## spaCy Experimentation

In [7]:
# Pick one classified tweet (hard-coded row 12) and run it through the spaCy pipeline.
hs_text = result_frame['text'].iloc[12]
doc = nlp(hs_text)
# NOTE(review): the value of this lookup is discarded (only the cell's last
# expression is displayed), and doc[16] assumes the tweet has at least 17
# tokens — confirm the line is still needed.
nlp.vocab.strings[doc[16].orth_]
# Entity labels found in the tweet.
[ent.label_ for ent in doc.ents]

[]

In [31]:
# Parse a short sample phrase and list, for every non-punctuation token,
# its lowercase form together with its `pos_` and `tag_` labels.
test_string = "these fucking skypes"
doc = nlp(test_string)
[
    [tok.lower_, tok.pos_, tok.tag_]
    for tok in doc
    if not tok.is_punct
]

[['these', 'DET', 'DT'], ['fucking', 'VERB', 'VBG'], ['skypes', 'NOUN', 'NNS']]

In [9]:
from numpy import dot
from numpy.linalg import norm

# How many neighbours to list. The header is built from the same value so the
# label and the slice cannot drift apart (it used to say "Top 10" while
# printing 5 words).
TOP_N = 5

nasa = nlp.vocab[u'nasa']


def cosine(v1, v2):
    """Cosine similarity: dot product divided by the product of the two norms."""
    return dot(v1, v2) / (norm(v1) * norm(v2))


# gather all known words that have a vector, keeping only lowercase spellings
# and skipping "nasa" itself
allWords = list({w for w in nlp.vocab if w.has_vector and w.orth_.islower() and w.lower_ != "nasa"})

# sort ascending by similarity to NASA, then flip so the closest words come first
allWords.sort(key=lambda w: cosine(w.vector, nasa.vector))
allWords.reverse()
print("Top {} most similar words to NASA:".format(TOP_N))
for word in allWords[:TOP_N]:
    print(word.orth_)

Top 10 most similar words to NASA:
ufo
hubble
apollo
yung
ang


In [32]:
# Per-token annotations for the current `doc`. Each token's dict sits in its own
# single-element list. Note that the "pre" key is filled from `suffix_`.
res = [
    [
        {
            "text": tok.lower_,
            "lemma": tok.lemma_,
            "pos": tok.tag_,
            "dependency": tok.dep_,
            "root": tok.head.lower_,
            "pre": tok.suffix_,
        }
    ]
    for tok in doc
]
# Lowercased noun chunks paired with the lowercased text of the head of each chunk's root.
nounphrases = [
    {"text": chunk.text.lower(), "root": chunk.root.head.text.lower()}
    for chunk in doc.noun_chunks
]
# Only the final expression of a cell is displayed, so `res` is what this cell shows.
res

[[{'dependency': 'det',
   'lemma': 'these',
   'pos': 'DT',
   'pre': 'ese',
   'root': 'skypes',
   'text': 'these'}],
 [{'dependency': 'amod',
   'lemma': 'fuck',
   'pos': 'VBG',
   'pre': 'ing',
   'root': 'skypes',
   'text': 'fucking'}],
 [{'dependency': 'ROOT',
   'lemma': 'skypes',
   'pos': 'NNS',
   'pre': 'pes',
   'root': 'skypes',
   'text': 'skypes'}]]

In [11]:
# `cluster` values of the tokens in `doc`, skipping tokens whose cluster is 0.
cluster_ids = [t.cluster for t in doc if t.cluster != 0]
# Tokens of `doc` that are not stop words, punctuation, the retweet marker "rt" or digits.
tokens = [
    t
    for t in doc
    if not t.is_stop and not t.is_punct and t.lower_ != "rt" and not t.is_digit
]

In [21]:
# https://github.com/explosion/spaCy/issues/276
from nltk.corpus import words as nltk_words
def most_similar(word, n=5):
    """Return the `n` entries of `word.vocab` that are most similar to `word`.

    A candidate must have a vector, differ from `word` in its lowercase form,
    share its `is_lower` flag and have `prob >= -15`. When `word` itself is
    out-of-vocabulary, punctuation, number-like, a stop word or the retweet
    marker "rt", nothing qualifies and an empty list is returned.

    Args:
        word: object exposing `vocab`, `similarity` and the flags named above.
        n: maximum number of neighbours to return; defaults to 5, the size
            that used to be hard-coded.

    Returns:
        A list of at most `n` vocabulary entries, most similar first.
    """
    import heapq  # function-scope import keeps this cell self-contained

    # This test only looks at the query word, so evaluate it once up front
    # instead of once per vocabulary entry.
    if word.is_oov or word.is_punct or word.like_num or word.is_stop or word.lower_ == "rt":
        return []

    queries = (
        w
        for w in word.vocab
        if w.has_vector
        and w.lower_ != word.lower_
        and w.is_lower == word.is_lower
        and w.prob >= -15
    )
    # heapq.nlargest is documented as equivalent to
    # sorted(queries, key=..., reverse=True)[:n] without sorting every candidate.
    return heapq.nlargest(n, queries, key=word.similarity)

def get_keywords(doc):
    """Collect the lowercased keyword tokens found inside the noun chunks of `doc`.

    Every token of every noun chunk is screened the same way: tokens that are
    out-of-vocabulary, punctuation, number-like, stop words or the retweet
    marker "rt" are dropped, and the remaining ones are added in lowercase.
    Multi-word chunks used to be split on spaces and added unfiltered and
    case-sensitive, which let stop words such as "the" through.

    Args:
        doc: parsed document exposing `noun_chunks` (spans with `start` and
            `end` token offsets) and token access by index.

    Returns:
        A set of lowercase keyword strings; empty when nothing qualifies.
    """
    result = set()
    for chunk in doc.noun_chunks:
        # Walk the chunk token by token so single- and multi-word chunks
        # go through the same filter.
        for position in range(chunk.start, chunk.end):
            candidate = doc[position]
            if (
                candidate.is_oov
                or candidate.is_punct
                or candidate.like_num
                or candidate.is_stop
                or candidate.lower_ == "rt"
            ):
                continue
            result.add(candidate.lower_)
    return result
            
# Extract keywords from the notebook-level `doc` (whichever text was parsed last).
keywords = get_keywords(doc)
# Disabled experiment: nearest neighbours for every keyword except 'the'.
# [[w.lower_ for w in most_similar(nlp.vocab[token])] for token in keywords if token != 'the']
def count_upper_case_tokens(doc):
    """Count the tokens of `doc` whose text is all-uppercase and whose length is not 1.

    Args:
        doc: iterable of tokens; each token exposes `text` and supports `len()`.

    Returns:
        The number of matching tokens as an int (0 for an empty `doc`).
    """
    return sum(1 for tok in doc if tok.text.isupper() and len(tok) != 1)
# Evaluated but not displayed: only the last expression of a cell is rendered.
count_upper_case_tokens(doc)
result = most_similar(nlp.vocab[u'nasa'])
# Use a dedicated loop variable so the notebook-level `res` (the per-token
# annotation list built from `doc`) is not overwritten by a vocabulary entry.
for neighbour in result:
    print(neighbour.lower_)

ufo
apollo
yung
ang
ako


In [13]:
import numpy as np
# Scratch values: a small list of ints and a Series over it with an explicit 0..n-1 index.
test = [1, 2, 3, 4, 6]
s = pd.Series(data=test, index=range(len(test)))
# First entry of the first tweet's predicted emotions.
HS_GUESS[0][0]

'disgust'

In [4]:
# Hand-written dependency annotations for one sample tweet, one dict per token,
# using the keys that create_dep_ngrams reads ('text', 'root', 'dependency', 'pos')
# plus 'lemma'.
tweet = [
    {'lemma': 'warning', 'root': 'warning', 'pos': 'NN', 'dependency': 'ROOT', 'text': 'warning'},
    {'lemma': 'penny', 'root': 'boards', 'pos': 'NN', 'dependency': 'compound', 'text': 'penny'},
    {'lemma': 'board', 'root': 'make', 'pos': 'NNS', 'dependency': 'nsubj', 'text': 'boards'},
    {'lemma': 'will', 'root': 'make', 'pos': 'MD', 'dependency': 'aux', 'text': 'will'},
    {'lemma': 'make', 'root': 'warning', 'pos': 'VB', 'dependency': 'acl', 'text': 'make'},
    {'lemma': 'you', 'root': 'faggot', 'pos': 'PRP', 'dependency': 'nsubj', 'text': 'you'},
    {'lemma': 'a', 'root': 'faggot', 'pos': 'DT', 'dependency': 'det', 'text': 'a'},
    {'lemma': 'faggot', 'root': 'make', 'pos': 'NN', 'dependency': 'ccomp', 'text': 'faggot'},
]

In [14]:
def create_dep_ngrams(dependency_list, length):
    """Build n-grams over flattened dependency annotations.

    Each entry of `dependency_list` is flattened to
    "text_root_dependency_pos"; every run of `length` consecutive entries is
    then joined with " | ".

    Args:
        dependency_list: sequence of dicts with the keys "text", "root",
            "dependency" and "pos" (string values).
        length: number of consecutive entries per n-gram.

    Returns:
        A list of n-gram strings; empty when `length` is not positive or
        exceeds the number of entries.
    """
    flattened = [
        "_".join((entry["text"], entry["root"], entry["dependency"], entry["pos"]))
        for entry in dependency_list
    ]
    # Zipping the list against its own shifted copies yields each sliding window.
    shifted_views = [flattened[offset:] for offset in range(length)]
    return [" | ".join(window) for window in zip(*shifted_views)]
# Build 3-entry dependency n-grams for the sample tweet and display them.
# NOTE(review): this rebinds `test`, which an earlier cell used for a list of ints.
test = create_dep_ngrams(tweet, 3)
test

 'penny_boards_compound_NN | boards_make_nsubj_NNS | will_make_aux_MD',
 'you_faggot_nsubj_PRP | a_faggot_det_DT | faggot_make_ccomp_NN']