## Retrieve parsed collection

In [6]:
import pandas as pd
import numpy as np
import spacy
import joblib
from modules.db import mongo_base
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction import FeatureHasher, DictVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from modules.utils.CustomTwokenizer import CustomTwokenizer

# connection_params = ["twitter", "crowdflower_features"]
# client = mongo_base.connect()
# db_name = connection_params[0]
# connection_params.insert(0, client)

# query = {}
# query["filter"] = {}
# query["projection"] = {}
# query["limit"] = 0
# query["skip"] = 0
# query["no_cursor_timeout"] = True
# cursor = mongo_base.finder(connection_params, query, False)
# df = pd.DataFrame(list(cursor))
# df.head()

In [7]:
# Path of the joblib file that holds the persisted feature DataFrame (`df`).
crowdflower_persistence = 'data/persistence/crowdflower_features.pkl.compressed'
# Load spaCy's 'en' model, handing the project's CustomTwokenizer to the
# `create_make_doc` argument.
# NOTE(review): presumably this replaces the default tokenizer with a
# Twitter-aware one - confirm against modules.utils.CustomTwokenizer.
nlp = spacy.load('en', create_make_doc=CustomTwokenizer)
# One-off step that wrote `df` to the path above; kept disabled.
# joblib.dump(df, crowdflower_persistence, compress=True)

In [8]:
# Restore the feature DataFrame from the persisted joblib file.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code - only load files that come from a trusted source.
df = joblib.load(crowdflower_persistence)

In [9]:
# Scratch cell: try DictVectorizer on a few hand-written bigram dicts.
# Only the last expression of the cell (df[0:1]) is rendered; the other bare
# expressions below are evaluated and their values discarded.
df[0:1]['dep_bigrams'][0][0]
# `hasher` is only referenced by the commented-out lines in this cell.
hasher = FeatureHasher(input_type='string', non_negative=True)
# sparse=False makes fit_transform return a dense array.
v = DictVectorizer(sparse=False)
# raw_X = hasher.transform(df[0:1]['brown_cluster_ids'])
df[0:1]['bigrams'][0][0]
# Four single-key samples; 'boards will' is deliberately (or accidentally)
# present twice, so there are three distinct values for the 'bigram' key.
D = [{'bigram': 'Warning penny'}, {'bigram': 'penny boards'}, {'bigram': 'boards will'}, {'bigram': 'boards will'}]
trans = v.fit_transform(D)
# Round-trip check: map the encoded rows back to feature dicts, then list the
# learned feature names (both results are discarded, see note at the top).
v.inverse_transform(trans)
v.get_feature_names()
# raw_X.toarray()
# print(df[0:1]['bigrams'][0])
# Show the first row of the feature DataFrame.
df[0:1]

Unnamed: 0,_id,annotation_label,avg_token_length,bigrams,brown_cluster_ids,char_pentagrams,char_quadgrams,char_trigrams,comment_length,dep_bigrams,...,noun_chunks,pos_dep_rootPos,text,tokens,trigrams,unknown_words,unknown_words_count,uppercase_token_count,word_dep_root,word_root_preRoot
0,58c659be6541913eb7f119dd,not_offensive,4.0,"[Warning penny, penny boards, boards will, wil...","[966, 228, 989, 333, 442, 4618, 602, 19]","[penny, warni, arnin, aggot, board, rning, fag...","[aggo, warn, arni, penn, oard, ards, enny, nin...","[pen, war, nin, mak, enn, you, agg, ggo, fag, ...",9,[warning ROOT NN warning | boards compound NN ...,...,"[{'root': 'warning', 'text': 'warning'}, {'roo...","[{'rootPos': 'NN', 'dep': 'ROOT', 'pos': 'NN'}...",Warning : penny boards will make you a faggot,"[penny, faggot, warning, boards]","[Warning penny boards, penny boards will, boar...",[],0,0,"[{'root': 'warning', 'dep': 'ROOT', 'word': 'w...","[{'root': 'warning', 'preRoot': 'warning', 'wo..."


## Setup data split

In [10]:
# Draw a random half of the rows (n = len(df) / 2, not 10%); the fixed
# random_state makes the same rows come back on every run.
train_test_set = df.sample(n=int(len(df) / 2), random_state=1965)

# Features are the raw tweet text; targets are the annotation labels.
X = train_test_set['text']
y = train_test_set['annotation_label']

In [11]:
# Report the size of the sampled data: number of rows and of distinct labels.
print("Predicting the labels of the test set...")
print("%d documents" % X.shape[0])
print("%d categories" % y.nunique())

Predicting the labels of the test set...
7254 documents
2 categories


### Setup generic model experiment

In [12]:
def run_experiment(X, y, pipeline, process_name, num_expts=1, random_state=None):
    """Fit and evaluate ``pipeline`` on repeated random train/test splits.

    Each round splits ``X``/``y`` with ``train_test_split``, fits the pipeline
    on the training part, predicts the held-out part and prints the
    classification report and confusion matrix. After the last round the mean
    accuracy over all rounds is printed.

    Parameters
    ----------
    X, y : inputs and labels, passed straight to ``train_test_split``.
    pipeline : estimator exposing ``fit`` (returning the fitted model) and,
        on that model, ``predict``.
    process_name : str
        Label appended to the "Classification Report: " heading.
    num_expts : int, default 1
        Number of split/fit/evaluate rounds; must be at least 1.
    random_state : int or None, default None
        Seed for the splits. ``None`` keeps the previous unseeded behaviour.
        With an int, round ``i`` uses ``random_state + i`` so that rounds get
        different but reproducible splits.

    Raises
    ------
    ValueError
        If ``num_expts`` is smaller than 1 (previously this surfaced as a
        ZeroDivisionError when averaging an empty score list).
    """
    if num_expts < 1:
        raise ValueError("num_expts must be at least 1, got %r" % (num_expts,))

    scores = []
    for i in range(num_expts):
        # A single fixed seed would repeat the identical split every round.
        seed = None if random_state is None else random_state + i
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
        model = pipeline.fit(X_train, y_train)  # train the classifier
        y_prediction = model.predict(X_test)    # apply the model to the test data
        report = classification_report(y_test, y_prediction)
        # (y_true, y_pred) order, matching the other metric calls.
        score = accuracy_score(y_test, y_prediction)
        scores.append(score)
        print("Classification Report: " + process_name)
        print(report)
        cm = confusion_matrix(y_test, y_prediction)
        print("Confusion matrix:")
        print(cm)
    # Mean accuracy across all rounds.
    print(sum(scores) / num_expts)

### Setup naive baseline classification (CountVectorizer)

In [13]:
from sklearn.feature_selection import SelectKBest, chi2
# our three ingredients: the ngram counter, a chi-squared feature selector
# and the classifier
# nm is the number of top-scoring features SelectKBest keeps.
nm = 5000
# Character n-grams of length 3 to 5.
vect = CountVectorizer(ngram_range=(3,5), analyzer='char')
clf = LinearSVC()
ch2 = SelectKBest(chi2, k=nm)

# There are three steps to our process: extracting the ngrams, keeping the
# nm best ones and putting them through the classifier. So our Pipeline
# looks like this:

count_pipeline = Pipeline([
    ('vect', vect),  # extract ngrams from tweet text
    ('kBest', ch2),  # keep only the nm highest-scoring ngram features
    ('clf' , clf),   # feed the output through a classifier
])

# Run the classification
run_experiment(X, y, count_pipeline, "CountVectorizer")

Classification Report: CountVectorizer
               precision    recall  f1-score   support

   hatespeech       0.41      0.37      0.39       285
not_offensive       0.89      0.90      0.89      1529

  avg / total       0.81      0.82      0.81      1814

Confusion matrix:
[[ 106  179]
 [ 150 1379]]
0.818632855568


### Setup naive baseline classification (HashingVectorizer)

In [14]:
# Baseline on hashed character 3- to 5-grams, classified by a linear SVM.
vect = HashingVectorizer(analyzer='char', ngram_range=(3, 5))
clf = LinearSVC()

hashing_pipeline = Pipeline(steps=[
    ('vect', vect),  # hashed character n-gram features from the tweet text
    ('clf', clf),    # linear SVM on top of those features
])
run_experiment(X, y, hashing_pipeline, "HashingVectorizer")

Classification Report: HashingVectorizer
               precision    recall  f1-score   support

   hatespeech       0.53      0.36      0.43       302
not_offensive       0.88      0.94      0.91      1512

  avg / total       0.82      0.84      0.83      1814

Confusion matrix:
[[ 108  194]
 [  94 1418]]
0.841234840132


### Setup tf-idf baseline classification

In [15]:
# Baseline on TF-IDF weighted character 3- to 5-grams, classified by a linear SVM.
vect = TfidfVectorizer(analyzer='char', ngram_range=(3, 5))
clf = LinearSVC()

tfidf_pipeline = Pipeline(steps=[
    ('vect', vect),  # character n-gram TF-IDF features from the tweet text
    ('clf', clf),    # linear SVM on top of those features
])
run_experiment(X, y, tfidf_pipeline, "TfidfVectorizer")

Classification Report: TfidfVectorizer
               precision    recall  f1-score   support

   hatespeech       0.55      0.28      0.37       311
not_offensive       0.86      0.95      0.91      1503

  avg / total       0.81      0.84      0.81      1814

Confusion matrix:
[[  87  224]
 [  72 1431]]
0.836824696803


### Investigate this

In [16]:
# As much as I want to ignore this, I shouldn't. The fact that the precision score is
# close to the other experiments is troubling

# Sanity check: train on three precomputed columns only, with no text features.
# NOTE(review): presumably all three columns are numeric counts - confirm.
# This rebinds the notebook-level X (previously the tweet text).
X = train_test_set[['hs_keyword_count', 'comment_length', 'unknown_words_count']]
y = train_test_set['annotation_label']
clf = LinearSVC()

# X = X.values.reshape(-1,1)
# No random_state is passed, so the split (and the numbers below) differ
# from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)
model = clf.fit(X_train, y_train)  # train the classifier
y_prediction = model.predict(X_test)          # apply the model to the test data
report = classification_report(y_test, y_prediction)
print("Classification Report : Random Experiment")
print(report)
cm = confusion_matrix(y_test, y_prediction)
print("Confusion matrix:")
print(cm)
# Display the first row of the full DataFrame for reference.
df[0:1]

Classification Report : Random Experiment
               precision    recall  f1-score   support

   hatespeech       0.62      0.11      0.18       281
not_offensive       0.86      0.99      0.92      1533

  avg / total       0.82      0.85      0.80      1814

Confusion matrix:
[[  30  251]
 [  18 1515]]


Unnamed: 0,_id,annotation_label,avg_token_length,bigrams,brown_cluster_ids,char_pentagrams,char_quadgrams,char_trigrams,comment_length,dep_bigrams,...,noun_chunks,pos_dep_rootPos,text,tokens,trigrams,unknown_words,unknown_words_count,uppercase_token_count,word_dep_root,word_root_preRoot
0,58c659be6541913eb7f119dd,not_offensive,4.0,"[Warning penny, penny boards, boards will, wil...","[966, 228, 989, 333, 442, 4618, 602, 19]","[penny, warni, arnin, aggot, board, rning, fag...","[aggo, warn, arni, penn, oard, ards, enny, nin...","[pen, war, nin, mak, enn, you, agg, ggo, fag, ...",9,[warning ROOT NN warning | boards compound NN ...,...,"[{'root': 'warning', 'text': 'warning'}, {'roo...","[{'rootPos': 'NN', 'dep': 'ROOT', 'pos': 'NN'}...",Warning : penny boards will make you a faggot,"[penny, faggot, warning, boards]","[Warning penny boards, penny boards will, boar...",[],0,0,"[{'root': 'warning', 'dep': 'ROOT', 'word': 'w...","[{'root': 'warning', 'preRoot': 'warning', 'wo..."


In [17]:
# `chain` is imported here but not used by the active code in this cell.
from itertools import chain
# Each row of 'word_root_preRoot' is a list of dicts (one dict per token).
X = train_test_set['word_root_preRoot']
y = train_test_set['annotation_label']
clf = LinearSVC()

# http://www.markhneedham.com/blog/2015/03/02/python-scikit-learn-training-a-classifier-with-non-numeric-features/
# `empty_analyzer` and `transformer` are only referenced by commented-out lines.
empty_analyzer = lambda x: x
vect = DictVectorizer()
transformer = TfidfTransformer(smooth_idf=False)
# vect = TfidfVectorizer(analyzer=empty_analyzer)
# hasher = FeatureHasher(input_type='string', non_negative=True)
# X = [[str(res) for res in tmp] for tmp in X]# 
# X = hasher.transform(X)

X = X.tolist()
# NOTE(review): item[0] keeps only the FIRST dict of every row, so the other
# tokens' dicts are ignored, and a row with an empty list would raise
# IndexError - confirm this is intended.
# .toarray() converts the vectorizer's sparse output into a dense array.
X = vect.fit_transform([item[0] for item in X]).toarray()
# X = transformer.fit_transform(X)
# vect.vocabulary_
# No random_state is passed, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)
model = clf.fit(X_train, y_train)  # train the classifier
y_prediction = model.predict(X_test)          # apply the model to the test data
report = classification_report(y_test, y_prediction)
print("Classification Report : Random Experiment")
print(report)
cm = confusion_matrix(y_test, y_prediction)
print("Confusion matrix:")
print(cm)

Classification Report : Random Experiment
               precision    recall  f1-score   support

   hatespeech       0.41      0.17      0.24       309
not_offensive       0.85      0.95      0.90      1505

  avg / total       0.77      0.82      0.78      1814

Confusion matrix:
[[  51  258]
 [  73 1432]]


In [18]:
# Look at the first sampled tweet: print its text, then display the
# per-token dependency records stored for it.
print(train_test_set.iloc[0:1]['text'].tolist())
train_test_set.iloc[0:1]['word_dep_root'].tolist()

['user_mention great news for people who want to see women be sick and die Utah be very proud of your Byzantine ideas ']


[[{'dep': 'nummod', 'root': 'news', 'word': 'user_mention'},
  {'dep': 'amod', 'root': 'news', 'word': 'great'},
  {'dep': 'nsubj', 'root': 'be', 'word': 'news'},
  {'dep': 'prep', 'root': 'news', 'word': 'for'},
  {'dep': 'pobj', 'root': 'for', 'word': 'people'},
  {'dep': 'nsubj', 'root': 'want', 'word': 'who'},
  {'dep': 'relcl', 'root': 'people', 'word': 'want'},
  {'dep': 'aux', 'root': 'see', 'word': 'to'},
  {'dep': 'xcomp', 'root': 'want', 'word': 'see'},
  {'dep': 'nsubj', 'root': 'be', 'word': 'women'},
  {'dep': 'ccomp', 'root': 'see', 'word': 'be'},
  {'dep': 'acomp', 'root': 'be', 'word': 'sick'},
  {'dep': 'cc', 'root': 'be', 'word': 'and'},
  {'dep': 'conj', 'root': 'be', 'word': 'die'},
  {'dep': 'nsubj', 'root': 'be', 'word': 'utah'},
  {'dep': 'ROOT', 'root': 'be', 'word': 'be'},
  {'dep': 'advmod', 'root': 'proud', 'word': 'very'},
  {'dep': 'acomp', 'root': 'be', 'word': 'proud'},
  {'dep': 'prep', 'root': 'proud', 'word': 'of'},
  {'dep': 'poss', 'root': 'ideas', '

In [64]:
# Parse a toy sentence to explore spaCy's dependency labels.
test_string = "Austrailian scientists discovers star with telescope"
doc = nlp(test_string)

# Earlier DictVectorizer sketch, kept disabled:
# elements = []
# dict1 = {"austrailian": "scientists amod"}
# dict2 = {"scientists": "austrailian amod"}
# dict3 = {"scientists": "discovers nsubj"}

# elements.append(dict1)
# elements.append(dict2)
# elements.append(dict3)

# vect = DictVectorizer()
# X = vect.fit_transform(elements)
# X
# vect.inverse_transform(X)
context = {}
# One single-entry dict per token: lower-cased token text -> dependency label.
dependency_contexts = [{token.lower_: token.dep_} for token in doc]
dependency_contexts

[{'austrailian': 'amod'},
 {'scientists': 'nsubj'},
 {'discovers': 'ROOT'},
 {'star': 'dobj'},
 {'with': 'prep'},
 {'telescope': 'pobj'}]

In [87]:
# https://github.com/explosion/spaCy/issues/533#issuecomment-254774296
def extract_conll_format(doc):
    """Render a parsed document as CoNLL-style lines, one string per token.

    Each line holds ten space-separated fields: token id (1-based within its
    sentence), lower-cased text, lemma, coarse POS, fine-grained tag, "_",
    head id, dependency label, "<head id>:<dependency label>" and "_".

    The head id uses the same sentence-relative numbering as the token id and
    is 0 for a sentence root. Previously it was computed from the head's
    document-level index, so it was wrong in every sentence after the first.

    Parameters
    ----------
    doc : parsed document exposing ``sents``; each sentence must expose
        ``start`` (document index of its first token) and iterate over tokens
        with ``i``, ``head``, ``lower_``, ``lemma_``, ``pos_``, ``tag_`` and
        ``dep_`` attributes.

    Returns
    -------
    list of str
        One formatted line per token, in document order.
    """
    result = []
    for sent in doc.sents:
        for i, word in enumerate(sent):
            # A root token is its own head. Compare indices rather than
            # object identity so this does not rely on token objects being
            # reused between attribute accesses.
            if word.head.i == word.i:
                head_idx = 0
            else:
                # Convert the document-level head index to the 1-based
                # position inside the current sentence.
                head_idx = word.head.i - sent.start + 1
            fields = (
                i + 1, word.lower_, word.lemma_, word.pos_, word.tag_, "_",
                head_idx, word.dep_, str(head_idx) + ":" + word.dep_, "_",
            )
            result.append(" ".join(str(x) for x in fields))
    return result

# Run the CoNLL formatter on the parsed sample `doc` and display the lines.
conll_test = extract_conll_format(doc)
conll_test

['1 austrailian austrailian ADJ JJ _ 2 amod 2:amod _',
 '2 scientists scientist NOUN NNS _ 3 nsubj 3:nsubj _',
 '3 discovers discover VERB VBZ _ 0 ROOT 0:ROOT _',
 '4 star star NOUN NN _ 3 dobj 3:dobj _',
 '5 with with ADP IN _ 4 prep 4:prep _',
 '6 telescope telescope NOUN NN _ 5 pobj 5:pobj _']

In [22]:
# Build one single-element list per token, each wrapping a dict of token
# attributes. Note the key naming: 'pos' holds the fine-grained tag
# (word.tag_) and 'pre' holds word.suffix_.
res = [
    [dict(
        text=word.lower_,
        lemma=word.lemma_,
        pos=word.tag_,
        dependency=word.dep_,
        root=word.head.lower_,
        pre=word.suffix_,
    )]
    for word in doc
]
res

[[{'dependency': 'amod',
   'lemma': 'austrailian',
   'pos': 'JJ',
   'pre': 'ian',
   'root': 'scientists',
   'text': 'austrailian'}],
 [{'dependency': 'nsubj',
   'lemma': 'scientist',
   'pos': 'NNS',
   'pre': 'sts',
   'root': 'discovers',
   'text': 'scientists'}],
 [{'dependency': 'ROOT',
   'lemma': 'discover',
   'pos': 'VBZ',
   'pre': 'ers',
   'root': 'discovers',
   'text': 'discovers'}],
 [{'dependency': 'dobj',
   'lemma': 'star',
   'pos': 'NN',
   'pre': 'tar',
   'root': 'discovers',
   'text': 'star'}],
 [{'dependency': 'prep',
   'lemma': 'with',
   'pos': 'IN',
   'pre': 'ith',
   'root': 'star',
   'text': 'with'}],
 [{'dependency': 'pobj',
   'lemma': 'telescope',
   'pos': 'NN',
   'pre': 'ope',
   'root': 'with',
   'text': 'telescope'}]]