## Retrieve parsed collection

In [1]:
from itertools import chain

import joblib
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction import FeatureHasher, DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVC

from modules.db import mongo_base

# connection_params = ["twitter", "crowdflower_features"]
# client = mongo_base.connect()
# db_name = connection_params[0]
# connection_params.insert(0, client)

# query = {}
# query["filter"] = {}
# query["projection"] = {}
# query["limit"] = 0
# query["skip"] = 0
# query["no_cursor_timeout"] = True
# cursor = mongo_base.finder(connection_params, query, False)
# df = pd.DataFrame(list(cursor))
# df.head()

In [2]:
# Path of the compressed joblib dump that caches the feature DataFrame on disk.
crowdflower_persistence = 'data/persistence/crowdflower_features.pkl.compressed'
# NOTE(review): CustomTwokenizer is neither defined nor imported in the visible cells, so this
# line raises NameError on a fresh kernel unless it is provided elsewhere - confirm its origin.
# NOTE(review): the 'en' shortcut and `create_make_doc` look like the spaCy 1.x API - confirm the pinned version.
nlp = spacy.load('en', create_make_doc=CustomTwokenizer)
# One-off step that writes the DataFrame to the cache file; kept disabled.
# joblib.dump(df, crowdflower_persistence, compress=True)

In [3]:
# Load the cached feature DataFrame from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code - only load dumps you trust.
df = joblib.load(crowdflower_persistence)

In [4]:
# Scratch cell: peek at stored n-gram features and try DictVectorizer on a toy input.
# Only the last expression of the cell is displayed; the earlier bare expressions are evaluated and discarded.
# First entry of the first row's 'dep_bigrams' value (the trailing [0] on the Series is a label lookup).
df[0:1]['dep_bigrams'][0][0]
# NOTE(review): `non_negative` was deprecated and later removed from FeatureHasher in newer
# scikit-learn releases - confirm the pinned version. `hasher` is only used by the disabled line below.
hasher = FeatureHasher(input_type='string', non_negative=True)
# sparse=False makes fit_transform return a dense array.
v = DictVectorizer(sparse=False)
# raw_X = hasher.transform(df[0:1]['brown_cluster_ids'])
# First entry of the first row's 'bigrams' value (result discarded).
df[0:1]['bigrams'][0][0]
# Toy input: four single-key dicts, the last two identical.
D = [{'bigram': 'Warning penny'}, {'bigram': 'penny boards'}, {'bigram': 'boards will'}, {'bigram': 'boards will'}]
trans = v.fit_transform(D)
# Round-trip back to dicts (result discarded).
v.inverse_transform(trans)
# NOTE(review): get_feature_names() was superseded by get_feature_names_out() in newer
# scikit-learn releases - confirm the pinned version. Result discarded.
v.get_feature_names()
# raw_X.toarray()
# print(df[0:1]['bigrams'][0])
# Display the first row of the DataFrame.
df[0:1]

Unnamed: 0,_id,annotation_label,avg_token_length,bigrams,brown_cluster_ids,char_pentagrams,char_quadgrams,char_trigrams,comment_length,dep_bigrams,...,noun_chunks,pos_dep_rootPos,text,tokens,trigrams,unknown_words,unknown_words_count,uppercase_token_count,word_dep_root,word_root_preRoot
0,58c659be6541913eb7f119dd,not_offensive,4.0,"[Warning penny, penny boards, boards will, wil...","[966, 228, 989, 333, 442, 4618, 602, 19]","[penny, warni, arnin, aggot, board, rning, fag...","[aggo, warn, arni, penn, oard, ards, enny, nin...","[pen, war, nin, mak, enn, you, agg, ggo, fag, ...",9,[warning ROOT NN warning | boards compound NN ...,...,"[{'root': 'warning', 'text': 'warning'}, {'roo...","[{'rootPos': 'NN', 'dep': 'ROOT', 'pos': 'NN'}...",Warning : penny boards will make you a faggot,"[penny, faggot, warning, boards]","[Warning penny boards, penny boards will, boar...",[],0,0,"[{'root': 'warning', 'word': 'warning', 'dep':...","[{'root': 'warning', 'preRoot': 'warning', 'wo..."


## Set up data split

In [5]:
# Draw a random sample of half of the rows; the fixed random_state makes the same
# rows come back on every run. This subset is what the experiments split into
# training and test portions.
train_test_set = df.sample(n=int(len(df) / 2), random_state=1965)

# Features: the raw tweet text. Target: the annotation label.
X = train_test_set['text']
y = train_test_set['annotation_label']

In [6]:
# Report how many documents and how many distinct labels the sample contains.
n_documents = len(X)
n_categories = y.nunique()

print("Predicting the labels of the test set...")
print("{:d} documents".format(n_documents))
print("{:d} categories".format(n_categories))

Predicting the labels of the test set...
7254 documents
2 categories


### Set up generic model experiment

In [7]:
def run_experiment(X, y, pipeline, process_name, num_expts=1, random_state=None):
    """Fit and evaluate ``pipeline`` on repeated random train/test splits.

    Each round splits ``X``/``y`` with ``train_test_split`` (default split
    proportions), fits ``pipeline`` on the training part, predicts the held-out
    part, and prints a classification report and a confusion matrix. After the
    last round the mean accuracy over all rounds is printed.

    Parameters
    ----------
    X : indexable
        Samples, in whatever form ``pipeline.fit`` accepts.
    y : indexable
        Gold-standard labels aligned with ``X``.
    pipeline : estimator
        Object whose ``fit(X, y)`` returns a fitted model exposing ``predict``.
        It is re-fitted in place on every round.
    process_name : str
        Label appended to the "Classification Report: " heading.
    num_expts : int, default 1
        Number of split/fit/evaluate rounds.
    random_state : int or None, default None
        Base seed for the splits. ``None`` keeps the original behaviour
        (a different random split on every call). With an integer, round ``i``
        uses ``random_state + i`` so that rounds differ from each other but the
        whole experiment is reproducible.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If ``num_expts`` is smaller than 1 (no rounds to average over).
    """
    if num_expts < 1:
        raise ValueError("num_expts must be at least 1, got %r" % (num_expts,))

    scores = []
    for i in range(num_expts):
        # Offset the seed per round: re-using one seed would repeat the same split.
        split_seed = None if random_state is None else random_state + i
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=split_seed)

        model = pipeline.fit(X_train, y_train)   # train the classifier
        y_prediction = model.predict(X_test)     # apply the model to the test data

        report = classification_report(y_test, y_prediction)
        # scikit-learn metrics take (y_true, y_pred); keep the order consistent
        # with the report and confusion matrix.
        score = accuracy_score(y_test, y_prediction)
        scores.append(score)

        print("Classification Report: " + process_name)
        print(report)
        cm = confusion_matrix(y_test, y_prediction)
        print("Confusion matrix:")
        print(cm)

    # Mean accuracy across all rounds.
    print(sum(scores) / num_expts)

### Set up naive baseline classification (CountVectorizer)

In [8]:
# Number of top-scoring n-gram features (chi-squared test) passed on to the classifier.
nm = 5000

# Three ingredients: the character n-gram counter, the feature selector
# and the classifier.
vect = CountVectorizer(ngram_range=(3,5), analyzer='char')
clf = LinearSVC()
ch2 = SelectKBest(chi2, k=nm)

# The process has three steps: extract the n-grams, keep the `nm` best-scoring
# ones, and put them through the classifier. So our Pipeline looks like this:

count_pipeline = Pipeline([
    ('vect', vect),  # extract character n-grams from tweet text
    ('kBest', ch2),  # keep the nm features with the highest chi-squared score
    ('clf' , clf),   # feed the output through a classifier
])

# Run the classification
run_experiment(X, y, count_pipeline, "CountVectorizer")

Classification Report: CountVectorizer
               precision    recall  f1-score   support

   hatespeech       0.40      0.34      0.37       290
not_offensive       0.88      0.91      0.89      1524

  avg / total       0.80      0.81      0.81      1814

Confusion matrix:
[[  98  192]
 [ 144 1380]]
0.814773980154


### Set up naive baseline classification (HashingVectorizer)

In [9]:
# Character 3- to 5-grams again, this time hashed into a feature space
# rather than counted against a learned vocabulary.
vect = HashingVectorizer(analyzer='char', ngram_range=(3, 5))
clf = LinearSVC()

# Two stages: n-gram extraction from the tweet text, then the linear SVM.
hashing_steps = [('vect', vect), ('clf', clf)]
hashing_pipeline = Pipeline(hashing_steps)

run_experiment(X, y, hashing_pipeline, "HashingVectorizer")

Classification Report: HashingVectorizer
               precision    recall  f1-score   support

   hatespeech       0.51      0.31      0.39       303
not_offensive       0.87      0.94      0.90      1511

  avg / total       0.81      0.83      0.82      1814

Confusion matrix:
[[  95  208]
 [  92 1419]]
0.834619625138


### Set up tf-idf baseline classification

In [10]:
# Character 3- to 5-grams weighted by tf-idf.
vect = TfidfVectorizer(analyzer='char', ngram_range=(3, 5))
clf = LinearSVC()

# Two stages: n-gram extraction from the tweet text, then the linear SVM.
tfidf_steps = [('vect', vect), ('clf', clf)]
tfidf_pipeline = Pipeline(tfidf_steps)

run_experiment(X, y, tfidf_pipeline, "TfidfVectorizer")

Classification Report: TfidfVectorizer
               precision    recall  f1-score   support

   hatespeech       0.59      0.32      0.42       312
not_offensive       0.87      0.95      0.91      1502

  avg / total       0.82      0.85      0.83      1814

Confusion matrix:
[[ 101  211]
 [  70 1432]]
0.845093715546


### Investigate: classifying on count-based metadata features alone

In [11]:
# This result should not be ignored: the precision obtained from these three
# numeric columns alone is close to that of the text-based experiments, which
# is troubling.

feature_columns = ['hs_keyword_count', 'comment_length', 'unknown_words_count']
X = train_test_set[feature_columns]
y = train_test_set['annotation_label']
clf = LinearSVC()

# Single random split; the classifier is fitted directly, without a Pipeline.
X_train, X_test, y_train, y_test = train_test_split(X, y)
model = clf.fit(X_train, y_train)
y_prediction = model.predict(X_test)

# Compute both summaries first, then print them.
report = classification_report(y_test, y_prediction)
cm = confusion_matrix(y_test, y_prediction)

print("Classification Report : Random Experiment")
print(report)
print("Confusion matrix:")
print(cm)

# Show the first row of the full DataFrame for reference.
df.head(1)

Classification Report : Random Experiment
               precision    recall  f1-score   support

   hatespeech       0.56      0.07      0.13       294
not_offensive       0.85      0.99      0.91      1520

  avg / total       0.80      0.84      0.79      1814

Confusion matrix:
[[  22  272]
 [  17 1503]]


Unnamed: 0,_id,annotation_label,avg_token_length,bigrams,brown_cluster_ids,char_pentagrams,char_quadgrams,char_trigrams,comment_length,dep_bigrams,...,noun_chunks,pos_dep_rootPos,text,tokens,trigrams,unknown_words,unknown_words_count,uppercase_token_count,word_dep_root,word_root_preRoot
0,58c659be6541913eb7f119dd,not_offensive,4.0,"[Warning penny, penny boards, boards will, wil...","[966, 228, 989, 333, 442, 4618, 602, 19]","[penny, warni, arnin, aggot, board, rning, fag...","[aggo, warn, arni, penn, oard, ards, enny, nin...","[pen, war, nin, mak, enn, you, agg, ggo, fag, ...",9,[warning ROOT NN warning | boards compound NN ...,...,"[{'root': 'warning', 'text': 'warning'}, {'roo...","[{'rootPos': 'NN', 'dep': 'ROOT', 'pos': 'NN'}...",Warning : penny boards will make you a faggot,"[penny, faggot, warning, boards]","[Warning penny boards, penny boards will, boar...",[],0,0,"[{'root': 'warning', 'word': 'warning', 'dep':...","[{'root': 'warning', 'preRoot': 'warning', 'wo..."


In [12]:
X = train_test_set['word_root_preRoot']
y = train_test_set['annotation_label']
clf = LinearSVC()

# Approach for training on non-numeric features, adapted from:
# http://www.markhneedham.com/blog/2015/03/02/python-scikit-learn-training-a-classifier-with-non-numeric-features/
vect = DictVectorizer()

# Left over from alternative vectorisation attempts; only the disabled lines
# below reference them, the active path does not.
empty_analyzer = lambda x: x
transformer = TfidfTransformer(smooth_idf=False)
# vect = TfidfVectorizer(analyzer=empty_analyzer)
# hasher = FeatureHasher(input_type='string', non_negative=True)
# X = [[str(res) for res in tmp] for tmp in X]
# X = hasher.transform(X)

# Each row appears to hold a list of dicts; only the FIRST dict of every row is
# vectorised. A row with an empty list is mapped to an empty dict, which
# DictVectorizer encodes as an all-zero row, instead of raising IndexError.
first_relations = [
    relations[0] if len(relations) else {}
    for relations in X.tolist()
]

# Keep the one-hot matrix sparse: train_test_split and LinearSVC both accept
# scipy sparse input, and densifying a rows x vocabulary matrix wastes memory.
X = vect.fit_transform(first_relations)
# X = transformer.fit_transform(X)
# vect.vocabulary_

X_train, X_test, y_train, y_test = train_test_split(X, y)
model = clf.fit(X_train, y_train)  # train the classifier
y_prediction = model.predict(X_test)          # apply the model to the test data
report = classification_report(y_test, y_prediction)
print("Classification Report : Random Experiment")
print(report)
cm = confusion_matrix(y_test, y_prediction)
print("Confusion matrix:")
print(cm)

Classification Report : Random Experiment
               precision    recall  f1-score   support

   hatespeech       0.40      0.12      0.19       334
not_offensive       0.83      0.96      0.89      1480

  avg / total       0.75      0.80      0.76      1814

Confusion matrix:
[[  41  293]
 [  61 1419]]


In [13]:
# Look at the first sampled row: print its raw text, then display its
# word/dependency/root records as the cell output.
first_row = train_test_set[0:1]
print(list(first_row['text']))
list(first_row['word_dep_root'])

['user_mention great news for people who want to see women be sick and die Utah be very proud of your Byzantine ideas ']


[[{'dep': 'nummod', 'root': 'news', 'word': 'user_mention'},
  {'dep': 'amod', 'root': 'news', 'word': 'great'},
  {'dep': 'nsubj', 'root': 'be', 'word': 'news'},
  {'dep': 'prep', 'root': 'news', 'word': 'for'},
  {'dep': 'pobj', 'root': 'for', 'word': 'people'},
  {'dep': 'nsubj', 'root': 'want', 'word': 'who'},
  {'dep': 'relcl', 'root': 'people', 'word': 'want'},
  {'dep': 'aux', 'root': 'see', 'word': 'to'},
  {'dep': 'xcomp', 'root': 'want', 'word': 'see'},
  {'dep': 'nsubj', 'root': 'be', 'word': 'women'},
  {'dep': 'ccomp', 'root': 'see', 'word': 'be'},
  {'dep': 'acomp', 'root': 'be', 'word': 'sick'},
  {'dep': 'cc', 'root': 'be', 'word': 'and'},
  {'dep': 'conj', 'root': 'be', 'word': 'die'},
  {'dep': 'nsubj', 'root': 'be', 'word': 'utah'},
  {'dep': 'ROOT', 'root': 'be', 'word': 'be'},
  {'dep': 'advmod', 'root': 'proud', 'word': 'very'},
  {'dep': 'acomp', 'root': 'be', 'word': 'proud'},
  {'dep': 'prep', 'root': 'proud', 'word': 'of'},
  {'dep': 'poss', 'root': 'ideas', '

In [14]:
# Parse a sample sentence (the parsed doc is not used further in this cell).
test_string = "Austrailian scientists discovers star with telescope"
doc = nlp(test_string)

# Hand-written word -> "head relation" records for the sentence above.
dict1 = {"austrailian": "scientists amod"}
dict2 = {"scientists": "austrailian amod"}
dict3 = {"scientists": "discovers nsubj"}
elements = [dict1, dict2, dict3]

# String values are one-hot encoded as "key=value" feature names; mapping the
# matrix back to dicts makes that encoding visible in the cell output.
vect = DictVectorizer()
X = vect.fit_transform(elements)
vect.inverse_transform(X)

[{'austrailian=scientists amod': 1.0},
 {'scientists=austrailian amod': 1.0},
 {'scientists=discovers nsubj': 1.0}]