## Retrieve parsed collection

In [1]:
import pandas as pd
import numpy as np
import spacy
import joblib
from modules.db import mongo_base
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction import FeatureHasher, DictVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from modules.utils.CustomTwokenizer import CustomTwokenizer

# connection_params = ["twitter", "crowdflower_features"]
# client = mongo_base.connect()
# db_name = connection_params[0]
# connection_params.insert(0, client)

# query = {}
# query["filter"] = {}
# query["projection"] = {}
# query["limit"] = 0
# query["skip"] = 0
# query["no_cursor_timeout"] = True
# cursor = mongo_base.finder(connection_params, query, False)
# df = pd.DataFrame(list(cursor))
# df.head()

In [3]:
# Location of the joblib-compressed dump of the feature DataFrame.
crowdflower_persistence = 'data/persistence/crowdflower_features.pkl.compressed'
# Load spaCy's English model, passing the project's CustomTwokenizer as the
# `create_make_doc` hook.
# NOTE(review): presumably this replaces spaCy's default tokenizer with a
# Twitter-aware one -- confirm against modules.utils.CustomTwokenizer.
nlp = spacy.load('en', create_make_doc=CustomTwokenizer)
# One-off step: persist the DataFrame built by the (commented-out) MongoDB query
# so later runs can load it from disk instead.
# joblib.dump(df, crowdflower_persistence, compress=True)

In [4]:
# Restore the cached feature DataFrame from disk. joblib.load is pickle-based,
# so only load dump files that come from a trusted source.
df = joblib.load(crowdflower_persistence)

In [5]:
# Scratch cell: try DictVectorizer on a hand-written list of bigram dicts.
# Only the last expression (`df[0:1]`) is displayed by the notebook; the other
# bare expressions below are evaluated and their values discarded.
df[0:1]['dep_bigrams'][0][0]
# `hasher` is only referenced by the commented-out lines in this cell.
# NOTE(review): `non_negative` (here) and `get_feature_names()` (below) are not
# available in recent scikit-learn releases -- confirm the pinned version.
hasher = FeatureHasher(input_type='string', non_negative=True)
v = DictVectorizer(sparse=False)
# raw_X = hasher.transform(df[0:1]['brown_cluster_ids'])
df[0:1]['bigrams'][0][0]
# Four toy samples; the last two are identical on purpose or by accident
# (NOTE(review): unclear which).
D = [{'bigram': 'Warning penny'}, {'bigram': 'penny boards'}, {'bigram': 'boards will'}, {'bigram': 'boards will'}]
trans = v.fit_transform(D)
v.inverse_transform(trans)
v.get_feature_names()
# raw_X.toarray()
# print(df[0:1]['bigrams'][0])
# Show the first row of the feature DataFrame.
df[0:1]

Unnamed: 0,_id,annotation_label,avg_token_length,bigrams,brown_cluster_ids,char_pentagrams,char_quadgrams,char_trigrams,comment_length,conllFormat,...,noun_chunks,pos_dep_rootPos,text,tokens,trigrams,unknown_words,unknown_words_count,uppercase_token_count,word_dep_root,word_root_preRoot
0,58c659be6541913eb7f119dd,not_offensive,4.0,"[Warning penny, penny boards, boards will, wil...","[966, 228, 989, 333, 442, 4618, 602, 19]","[penny, warni, arnin, aggot, board, rning, fag...","[aggo, warn, arni, penn, oard, ards, enny, nin...","[pen, war, nin, mak, enn, you, agg, ggo, fag, ...",9,"[1 warning warning NOUN NN _ 0 ROOT 0:ROOT _, ...",...,"[{'text': 'warning', 'root': 'warning'}, {'tex...","[{'pos': 'NN', 'dep': 'ROOT', 'rootPos': 'NN'}...",Warning : penny boards will make you a faggot,"[warning, penny, boards, faggot]","[Warning penny boards, penny boards will, boar...",[],0,0,"[{'word': 'warning', 'dep': 'ROOT', 'root': 'w...","[{'word': 'warning', 'preRoot': 'warning', 'ro..."


### Setup data split

In [6]:
# Draw a fixed random half of the rows (n = len(df) / 2); the explicit
# random_state keeps the sample identical across runs.
train_test_set = df.sample(n=int(len(df) / 2), random_state=1965)

# The raw text column is the model input; the annotation label is the target.
X = train_test_set['text']
y = train_test_set['annotation_label']

In [7]:
# Summarise the sampled data: number of documents and number of distinct labels.
n_documents = len(X)
n_categories = y.nunique()
print("Predicting the labels of the test set...")
print("%d documents" % n_documents)
print("%d categories" % n_categories)

Predicting the labels of the test set...
7254 documents
2 categories


### Setup generic model experiment

In [8]:
def run_experiment(X, y, pipeline, process_name, num_expts=1, random_state=None):
    """Fit and evaluate ``pipeline`` on ``num_expts`` random train/test splits.

    For every split the classification report and the confusion matrix are
    printed. After the last split the mean accuracy over all splits is printed.
    Nothing is returned.

    Parameters
    ----------
    X : indexable
        Samples accepted by ``pipeline.fit`` / ``pipeline.predict``.
    y : indexable
        Gold labels aligned with ``X``.
    pipeline : estimator
        Any object exposing ``fit(X, y)`` (returning the fitted model) and
        ``predict(X)``. It is re-fitted on every split.
    process_name : str
        Label appended to the "Classification Report: " heading.
    num_expts : int, default 1
        Number of independent splits to run. Must be at least 1.
    random_state : int or None, default None
        Seed for the splits. ``None`` keeps the original unseeded behaviour;
        with an integer, split ``i`` uses ``random_state + i`` so repeated
        experiments differ from each other but are reproducible.

    Raises
    ------
    ValueError
        If ``num_expts`` is smaller than 1 (the mean would be undefined).
    """
    if num_expts < 1:
        raise ValueError("num_expts must be at least 1, got %r" % (num_expts,))

    scores = []
    for i in range(num_expts):
        seed = None if random_state is None else random_state + i
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)
        model = pipeline.fit(X_train, y_train)   # train the classifier
        y_prediction = model.predict(X_test)     # apply the model to the held-out data
        # scikit-learn metrics take (y_true, y_pred) in that order.
        scores.append(accuracy_score(y_test, y_prediction))
        print("Classification Report: " + process_name)
        print(classification_report(y_test, y_prediction))
        print("Confusion matrix:")
        print(confusion_matrix(y_test, y_prediction))
    # Mean accuracy over all splits.
    print(sum(scores) / num_expts)

### Setup naive baseline classification (countVectorizer)

In [9]:
# NOTE(review): every other import lives in the first cell; this one is local
# to this cell.
from sklearn.feature_selection import SelectKBest, chi2
# Our three ingredients: the character n-gram counter, a chi-squared feature
# selector and the classifier.
nm = 5000  # number of features kept by the selector
vect = CountVectorizer(ngram_range=(3,5), analyzer='char')
clf = LinearSVC()
ch2 = SelectKBest(chi2, k=nm)

# There are three steps to our process: extracting the n-grams, keeping the
# `nm` best-scoring features, and putting them through the classifier.
# So our Pipeline looks like this:

count_pipeline = Pipeline([
    ('vect', vect),  # extract ngrams from tweet text
    ('kBest', ch2),  # keep the `nm` highest-scoring features (chi2)
    ('clf' , clf),   # feed the output through a classifier
])

# Run the classification
run_experiment(X, y, count_pipeline, "CountVectorizer")

Classification Report: CountVectorizer
               precision    recall  f1-score   support

   hatespeech       0.46      0.34      0.39       314
not_offensive       0.87      0.92      0.89      1500

  avg / total       0.80      0.82      0.81      1814

Confusion matrix:
[[ 107  207]
 [ 124 1376]]
0.817530319735


### Setup naive baseline classification (hashingVectorizer)

In [10]:
# Same character 3- to 5-gram setup, but using HashingVectorizer. Unlike the
# CountVectorizer pipeline there is no feature-selection step here.
# Note: `vect` and `clf` are rebound, replacing the objects from the previous cell.
vect = HashingVectorizer(ngram_range=(3,5), analyzer='char')
clf = LinearSVC()

hashing_pipeline = Pipeline([
    ('vect', vect),  # extract ngrams from tweet text
    ('clf' , clf),   # feed the output through a classifier
])
run_experiment(X, y, hashing_pipeline, "HashingVectorizer")

Classification Report: HashingVectorizer
               precision    recall  f1-score   support

   hatespeech       0.58      0.36      0.44       322
not_offensive       0.87      0.95      0.91      1492

  avg / total       0.82      0.84      0.82      1814

Confusion matrix:
[[ 115  207]
 [  82 1410]]
0.840683572216


### Setup tf-idf baseline classification

In [11]:
# Same character 3- to 5-gram setup, this time with tf-idf weighting via
# TfidfVectorizer; again no feature-selection step.
# Note: `vect` and `clf` are rebound, replacing the objects from the previous cell.
vect = TfidfVectorizer(ngram_range=(3,5), analyzer='char')
clf = LinearSVC()

tfidf_pipeline = Pipeline([
    ('vect', vect),  # extract ngrams from tweet text
    ('clf' , clf),   # feed the output through a classifier
])
run_experiment(X, y, tfidf_pipeline, "TfidfVectorizer")

Classification Report: TfidfVectorizer
               precision    recall  f1-score   support

   hatespeech       0.53      0.35      0.42       292
not_offensive       0.88      0.94      0.91      1522

  avg / total       0.83      0.85      0.83      1814

Confusion matrix:
[[ 101  191]
 [  89 1433]]
0.845644983462


### Investigate this

In [12]:
# As much as I want to ignore this, I shouldn't. The fact that the precision score is
# close to the other experiments is troubling.
#
# Experiment: train the same classifier on three numeric columns only, with no
# text features at all.
# Note: this rebinds `X` (and `y`), so re-running the earlier pipeline cells
# after this one would use these numeric features instead of the text.

X = train_test_set[['hs_keyword_count', 'comment_length', 'unknown_words_count']]
y = train_test_set['annotation_label']
clf = LinearSVC()

# X = X.values.reshape(-1,1)
# The split is unseeded, so the numbers below change from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)
model = clf.fit(X_train, y_train)  # train the classifier
y_prediction = model.predict(X_test)          # apply the model to the test data
report = classification_report(y_test, y_prediction)
print("Classification Report : Random Experiment")
print(report)
cm = confusion_matrix(y_test, y_prediction)
print("Confusion matrix:")
print(cm)
# Display the first row of the full DataFrame for reference.
df[0:1]

Classification Report : Random Experiment
               precision    recall  f1-score   support

   hatespeech       0.51      0.07      0.12       289
not_offensive       0.85      0.99      0.91      1525

  avg / total       0.79      0.84      0.79      1814

Confusion matrix:
[[  19  270]
 [  18 1507]]


Unnamed: 0,_id,annotation_label,avg_token_length,bigrams,brown_cluster_ids,char_pentagrams,char_quadgrams,char_trigrams,comment_length,conllFormat,...,noun_chunks,pos_dep_rootPos,text,tokens,trigrams,unknown_words,unknown_words_count,uppercase_token_count,word_dep_root,word_root_preRoot
0,58c659be6541913eb7f119dd,not_offensive,4.0,"[Warning penny, penny boards, boards will, wil...","[966, 228, 989, 333, 442, 4618, 602, 19]","[penny, warni, arnin, aggot, board, rning, fag...","[aggo, warn, arni, penn, oard, ards, enny, nin...","[pen, war, nin, mak, enn, you, agg, ggo, fag, ...",9,"[1 warning warning NOUN NN _ 0 ROOT 0:ROOT _, ...",...,"[{'text': 'warning', 'root': 'warning'}, {'tex...","[{'pos': 'NN', 'dep': 'ROOT', 'rootPos': 'NN'}...",Warning : penny boards will make you a faggot,"[warning, penny, boards, faggot]","[Warning penny boards, penny boards will, boar...",[],0,0,"[{'word': 'warning', 'dep': 'ROOT', 'root': 'w...","[{'word': 'warning', 'preRoot': 'warning', 'ro..."


In [13]:
# NOTE(review): `chain` is imported but never used in this cell.
from itertools import chain
# Experiment: classify using only the `word_root_preRoot` column (a list of
# dicts per row) as features.
X = train_test_set['word_root_preRoot']
y = train_test_set['annotation_label']
clf = LinearSVC()

# http://www.markhneedham.com/blog/2015/03/02/python-scikit-learn-training-a-classifier-with-non-numeric-features/
# NOTE(review): `empty_analyzer` and `transformer` are only referenced by the
# commented-out alternatives below.
empty_analyzer = lambda x: x
vect = DictVectorizer()
transformer = TfidfTransformer(smooth_idf=False)
# vect = TfidfVectorizer(analyzer=empty_analyzer)
# hasher = FeatureHasher(input_type='string', non_negative=True)
# X = [[str(res) for res in tmp] for tmp in X]
# X = hasher.transform(X)

X = X.tolist()
# Only the first dict of each row's list is vectorized (`item[0]`); any further
# dicts in the row are dropped, and an empty list would raise IndexError.
# NOTE(review): confirm that ignoring the remaining dicts is intended.
X = vect.fit_transform([item[0] for item in X]).toarray()
# X = transformer.fit_transform(X)
# vect.vocabulary_
# The split is unseeded, so the numbers below change from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y)
model = clf.fit(X_train, y_train)  # train the classifier
y_prediction = model.predict(X_test)          # apply the model to the test data
report = classification_report(y_test, y_prediction)
print("Classification Report : Random Experiment")
print(report)
cm = confusion_matrix(y_test, y_prediction)
print("Confusion matrix:")
print(cm)

Classification Report : Random Experiment
               precision    recall  f1-score   support

   hatespeech       0.38      0.14      0.20       303
not_offensive       0.85      0.95      0.90      1511

  avg / total       0.77      0.82      0.78      1814

Confusion matrix:
[[  41  262]
 [  68 1443]]


In [78]:
# Sample sentence, parsed once with the `nlp` pipeline; the resulting `doc` is
# what the extraction helpers in the following cells are tried on.
test_string = "Austrailian scientists .... discovers ten stars with telescope"
doc = nlp(test_string)

# elements = []
# dict1 = {"austrailian": "scientists amod"}
# dict2 = {"scientists": "austrailian amod"}
# dict3 = {"scientists": "discovers nsubj"}

# elements.append(dict1)
# elements.append(dict2)
# elements.append(dict3)

# vect = DictVectorizer()
# X = vect.fit_transform(elements)
# X
# vect.inverse_transform(X)

def extract_dep_context(doc):
    """Build dependency-context pairs for the usable tokens of ``doc``.

    Every entry of the result is a one-item dict
    ``{word: "<other word> <dependency label>"}``:

    * a root token (its own head, labelled ``"ROOT"``) contributes
      ``{root: "<root> ROOT"}``;
    * any other token contributes two entries, in this order: the relation
      seen from the token, ``{token: "<head> <dep>-INV"}``, and the relation
      seen from its head, ``{head: "<token> <dep>"}``.

    A token is skipped entirely when it or its head does not start with a
    letter, when either contains a ".", or when the token is flagged as
    punctuation, digit or number-like.

    Parameters
    ----------
    doc : iterable of tokens
        Tokens must expose ``head``, ``i``, ``text``, ``lower_``, ``prefix_``,
        ``dep_``, ``is_punct``, ``is_digit`` and ``like_num`` (as spaCy tokens
        do); ``str(token)`` is used as the head's surface form.

    Returns
    -------
    list of dict
        The context pairs in document order.
    """
    dependency_contexts = []
    for word in doc:
        head = word.head
        head_text = str(head)

        # Both ends of the arc must start with a letter (this also rejects
        # digits and empty prefixes) and must not contain a ".".
        if not (head.prefix_.isalpha() and word.prefix_.isalpha()):
            continue
        if "." in word.text or "." in head_text:
            continue
        if word.is_punct or word.is_digit or word.like_num:
            continue

        relation = str(word.dep_)
        # Compare token positions instead of object identity: `word.head` is
        # not guaranteed to be the very same Python object as `word`.
        is_root = head.i == word.i and word.dep_ == "ROOT"
        if is_root:
            dependency_contexts.append({word.lower_: head_text + " " + relation})
        else:
            dependency_contexts.append({word.lower_: head_text + " " + relation + "-INV"})
            dependency_contexts.append({head.lower_: str(word.lower_) + " " + relation})
    return dependency_contexts
# Run the extractor on the sample doc and display the resulting pairs.
dependency_contexts = extract_dep_context(doc)
dependency_contexts

[{'austrailian': 'scientists amod-INV'},
 {'scientists': 'austrailian amod'},
 {'scientists': 'scientists ROOT'},
 {'discovers': 'discovers ROOT'},
 {'ten': 'stars nummod-INV'},
 {'stars': 'ten nummod'},
 {'stars': 'discovers dobj-INV'},
 {'discovers': 'stars dobj'},
 {'with': 'stars prep-INV'},
 {'stars': 'with prep'},
 {'telescope': 'with pobj-INV'},
 {'with': 'telescope pobj'}]

In [16]:
# https://github.com/explosion/spaCy/issues/533#issuecomment-254774296
def extract_conll_format(doc):
    """Render every token of ``doc`` as one space-separated CoNLL-style line.

    The ten fields of each line are, in order: the token's 1-based position
    within its sentence, ``lower_``, ``lemma_``, ``pos_``, ``tag_``, ``"_"``,
    the 1-based position of the head within the same sentence (``0`` for a
    token that is its own head), ``dep_``, ``"<head>:<dep_>"`` and ``"_"``.

    Parameters
    ----------
    doc : spaCy-style document
        Must expose ``sents`` (an iterable of sentences, each an iterable of
        tokens). Tokens must expose ``i`` (document-level index), ``head``,
        ``lower_``, ``lemma_``, ``pos_``, ``tag_`` and ``dep_``.

    Returns
    -------
    list of str
        One line per token, sentences concatenated in document order.
    """
    result = []
    for sent in doc.sents:
        sent_start = None
        for i, word in enumerate(sent):
            if sent_start is None:
                # Document-level index of the sentence's first token.
                sent_start = word.i
            # Compare positions instead of identity: `word.head` is not
            # guaranteed to be the very same Python object as `word`.
            if word.head.i == word.i:
                head_idx = 0
            else:
                # `head.i` is a document-level index; shift it so it lines up
                # with the sentence-relative token ids in the first column.
                head_idx = word.head.i - sent_start + 1
            fields = (
                i + 1, word.lower_, word.lemma_, word.pos_, word.tag_, "_",
                head_idx, word.dep_, str(head_idx) + ":" + word.dep_, "_",
            )
            result.append(" ".join(str(x) for x in fields))
    return result

# Render the sample doc in CoNLL-style lines and display them.
# NOTE(review): the saved output below does not line up token-for-token with
# `test_string`, so it looks like it came from an earlier run -- re-execute to confirm.
conll_test = extract_conll_format(doc)
conll_test

['1 austrailian austrailian ADJ JJ _ 2 amod 2:amod _',
 '2 scientists scientist NOUN NNS _ 3 nsubj 3:nsubj _',
 '3 discovers discover VERB VBZ _ 0 ROOT 0:ROOT _',
 '4 star star NOUN NN _ 3 dobj 3:dobj _',
 '5 with with ADP IN _ 4 prep 4:prep _',
 '6 telescope telescope NOUN NN _ 5 pobj 5:pobj _']