# Model Building

This is a staging notebook for experiments related to the classifier model.

In [80]:
from itertools import chain
import pandas as pd
import numpy as np
from gensim import models, similarities
from tqdm import tqdm
import spacy
import joblib
from modules.db import mongo_base
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction import FeatureHasher, DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from modules.utils.CustomTwokenizer import CustomTwokenizer

### Store collection as dataframe

In [71]:
def fetch_as_df(connection_params, projection):
    """Fetch a MongoDB collection through ``mongo_base`` and return it as a DataFrame.

    Parameters
    ----------
    connection_params : list
        Connection details for ``mongo_base.finder``; the cells in this
        notebook pass ``[database_name, collection_name]``. The list is left
        untouched, so it can be reused for further calls.
    projection : dict
        Stored under the ``"projection"`` key of the query handed to
        ``mongo_base.finder``.

    Returns
    -------
    pandas.DataFrame
        One row per item yielded by the cursor that ``mongo_base.finder`` returns.
    """
    client = mongo_base.connect()
    # Build a fresh list with the client in front. The previous implementation
    # inserted the client into the caller's list, so a second call with the
    # same list passed [client, client, db, collection].
    finder_params = [client] + list(connection_params)
    query = {
        "filter": {},
        "projection": projection,
        # NOTE(review): presumably forwarded to pymongo, where limit=0 means
        # "no limit" -- confirm against mongo_base.finder.
        "limit": 0,
        "skip": 0,
        "no_cursor_timeout": True,
    }
    cursor = mongo_base.finder(finder_params, query, False)
    try:
        return pd.DataFrame(list(cursor))
    finally:
        # The query asks for no_cursor_timeout, so release the cursor
        # explicitly when the returned object supports it.
        close = getattr(cursor, "close", None)
        if callable(close):
            close()

### Merge dataframes [CrowdFlower Dataset]

In [72]:
# Each parameter list is [database name, collection name] for one of the two
# CrowdFlower feature collections.
connection_params_1, connection_params_2 = (
    ["twitter", collection_name]
    for collection_name in ("crowdflower_features", "crowdflower_features_emo")
)
# df = fetch_as_df(connection_params_1, {})
# df_emo = fetch_as_df(connection_params_2, {"emotions":1})
# df = pd.DataFrame.merge(df, df_emo, on="_id")
# df.head()

#### Pickle the raw feature collection

In [74]:
# Names of the spaCy models; only spacy_en_model is loaded in this cell.
spacy_en_model = "en_core_web_md"
spacy_glove_model = "en_vectors_glove_md"
# Relative paths of the joblib-compressed DataFrame caches used by later cells.
crowdflower_persistence_raw = 'data/persistence/df/crowdflower_features_raw.pkl.compressed'
crowdflower_persistence = 'data/persistence/df/crowdflower_features.pkl.compressed'
naacl_2016_persistence = 'data/persistence/df/naacl_2016.pkl.compressed'
nlp_2016_persistence = 'data/persistence/df/nlp_2016.pkl.compressed'
# NOTE(review): create_make_doc presumably swaps in the project's custom
# tweet tokenizer when spaCy builds documents -- confirm against the
# installed spaCy version.
nlp = spacy.load(spacy_en_model, create_make_doc=CustomTwokenizer)
# joblib.dump(df, crowdflower_persistence_raw, compress=True)

In [9]:
# df = joblib.load(crowdflower_persistence_raw)

#### Create a dataframe with classifier features

In [10]:
# feat_df = df[['_id', 'text', 'annotation_label', 'hs_keyword_matches', 'hs_keyword_count', 'unknown_words', 'unknown_words_count', 'comment_length', 'brown_cluster_ids', 'feat_dependency_contexts', 'feat_word_dep_root', 'feat_pos_dep_rootPos', 'feat_word_root_rootparent', 'feat_dep_unigrams', 'feat_dep_bigrams', 'feat_dep_trigrams']]

In [81]:
# joblib.dump(feat_df, crowdflower_persistence, compress=True)
# Load the cached classifier-feature DataFrame from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load persistence files from a trusted source.
feat_df = joblib.load(crowdflower_persistence)
feat_df.head()

Unnamed: 0,_id,text,annotation_label,hs_keyword_matches,hs_keyword_count,unknown_words,unknown_words_count,comment_length,brown_cluster_ids,feat_dependency_contexts,feat_word_dep_root,feat_pos_dep_rootPos,feat_word_root_rootparent,feat_dep_unigrams,feat_dep_bigrams,feat_dep_trigrams
0,58c659be6541913eb7f119dd,Warning : penny boards will make you a faggot,not_offensive,[faggot],1,[],0,9,"[966, 228, 442, 4618, 602, 19]","[warning_:_punct, warning_make_acl, penny boar...","[warning_ROOT_warning, penny boards_nsubj_make...","[NN_ROOT_NN, NNS_nsubj_VB, MD_aux_VB, VB_acl_N...","[warning_warning_warning, penny boards_make_wa...","[warning_warning_ROOT_NN, penny boards_make_ns...",[warning_warning_ROOT_NN|penny boards_make_nsu...,[warning_warning_ROOT_NN|penny boards_make_nsu...
1,58c659be6541913eb7f119de,Fuck dykes,hatespeech,[],0,[],0,2,[],"[fuck_dykes_compoundINV, dykes_fuck_compound]","[fuck_compound_dykes, dykes_ROOT_dykes]","[NNP_compound_VBZ, VBZ_ROOT_VBZ]","[fuck_dykes_dykes, dykes_dykes_dykes]","[fuck_dykes_compound_NNP, dykes_dykes_ROOT_VBZ]",[fuck_dykes_compound_NNP|dykes_dykes_ROOT_VBZ],[]
2,58c659be6541913eb7f119df,user_mention user_mention user_mention user_me...,hatespeech,[faggot],1,[jefree],1,14,"[124, 3690, 966, 2442, 1684]",[user_mention user_mention user_mention user_m...,[user_mention user_mention user_mention user_m...,"[NN_ROOT_NN, IN_advmod_JJS, JJS_advmod_VBP, PR...",[user_mention user_mention user_mention user_m...,[user_mention user_mention user_mention user_m...,[user_mention user_mention user_mention user_m...,[user_mention user_mention user_mention user_m...
3,58c659be6541913eb7f119e0,""" user_mention : "" user_mention : user_mention...",hatespeech,[fag],1,[neeeee],1,15,"[228, 228, 1214, 981]","[user_mention_is_nsubjINV, is_user_mention_nsu...","["" user_mention_ROOT_"" user_mention, user_ment...","[NN_ROOT_NN, NN_appos_NN, NN_nsubj_VBZ, VBZ_ac...","["" user_mention_"" user_mention_"" user_mention,...","["" user_mention_"" user_mention_ROOT_NN, user_m...","["" user_mention_"" user_mention_ROOT_NN|user_me...","["" user_mention_"" user_mention_ROOT_NN|user_me..."
4,58c659be6541913eb7f119e1,user_mention You heard me bitch but any way I'...,not_offensive,[bitch],1,[nigga],1,20,"[858, 26282, 1898, 2485, 148, 12266, 1349, 753...","[user_mention_heard_relcl, user_mention_bitch_...","[user_mention_ROOT_user_mention, you_nsubj_hea...","[NN_ROOT_NN, PRP_nsubj_VBD, VBD_relcl_NN, PRP_...","[user_mention_user_mention_user_mention, you_h...","[user_mention_user_mention_ROOT_NN, you_heard_...",[user_mention_user_mention_ROOT_NN|you_heard_n...,[user_mention_user_mention_ROOT_NN|you_heard_n...


### Fetch NAACL_SRW_2016 and NLP+CSS_2016 datasets

In [87]:
# connection_params_3 = ["twitter", "NAACL_SRW_2016"]
# connection_params_4 = ["twitter", "NLP_CSS_2016_expert"]
# df_naacl = fetch_as_df(connection_params_3, {})
# df_nlp = fetch_as_df(connection_params_4, {})
# joblib.dump(df_naacl, naacl_2016_persistence, compress=True)
# joblib.dump(df_nlp, nlp_2016_persistence, compress=True)

Connected to DB at mongodb://140.114.79.146:27017 successfully
Connected to DB at mongodb://140.114.79.146:27017 successfully


['data/persistence/df/nlp_2016.pkl.compressed']

In [88]:
# Load the cached NAACL_SRW_2016 and NLP+CSS_2016 DataFrames from disk.
# NOTE: joblib.load unpickles the files, which can execute arbitrary code --
# only load persistence files from a trusted source.
df_naacl = joblib.load(naacl_2016_persistence)
df_nlp = joblib.load(nlp_2016_persistence)
df_nlp.head()

Unnamed: 0,_id,annotation,id_str,text
0,591c29f465419158a43b735d,neither,597576902212063232,Cisco had to deal with a fat cash payout to th...
1,591c29f465419158a43b735e,neither,565586175864610817,"@MadamPlumpette I'm decent at editing, no worr..."
2,591c29f465419158a43b735f,neither,563881580209246209,@girlziplocked will read. gotta go afk for a b...
3,591c29f465419158a43b7360,neither,595380689534656512,guys. show me the data. show me your github. t...
4,591c29f465419158a43b7361,neither,563757610327748608,@tpw_rules nothings broken. I was just driving...


### Set up data split

In [83]:
# Pick the same random half of the data on every run: n is len(feat_df) / 2
# and the fixed random_state makes the sample repeatable.
train_test_set = feat_df.sample(n=int(len(feat_df) / 2), random_state=1965)

# Model input is the raw tweet text; the target is the annotation label.
X = train_test_set['text']
y = train_test_set['annotation_label']

In [59]:
# Report how many documents are in the working sample and how many distinct
# labels they carry.
print("Predicting the labels of the test set...")
print("{:d} documents".format(len(X)))
print("{:d} categories".format(len(y.value_counts())))

Predicting the labels of the test set...
14508 documents
2 categories


### Set up generic model experiment

In [84]:
def run_experiment(X, y, pipeline, process_name, num_expts=1, random_state=None):
    """Train and evaluate ``pipeline`` on repeated random 80/20 splits.

    For every repetition the data is split, the pipeline is fitted on the
    training part, and the classification report and confusion matrix for
    the held-out part are printed. The mean accuracy over all repetitions is
    printed at the end.

    Parameters
    ----------
    X, y : array-like
        Samples and labels, passed to ``train_test_split``.
    pipeline : estimator
        Object exposing ``fit(X, y)`` (returning the fitted model) and ``predict``.
    process_name : str
        Label printed in the heading of every classification report.
    num_expts : int, default 1
        Number of split/fit/evaluate repetitions; must be at least 1.
    random_state : int or None, default None
        Seed for the splits. ``None`` keeps the previous behaviour of a
        different split on every call. With an integer, repetition ``i`` uses
        ``random_state + i``, so runs are reproducible while each repetition
        still sees a different split.

    Raises
    ------
    ValueError
        If ``num_expts`` is smaller than 1 (the mean would be undefined).
    """
    if num_expts < 1:
        raise ValueError("num_expts must be at least 1, got %r" % (num_expts,))

    scores = []
    for i in tqdm(range(num_expts)):
        split_seed = None if random_state is None else random_state + i
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.20, train_size=0.80, random_state=split_seed)
        model = pipeline.fit(X_train, y_train)  # train the classifier
        y_prediction = model.predict(X_test)    # apply the model to the test data
        report = classification_report(y_test, y_prediction)
        # scikit-learn metrics take (y_true, y_pred); keep the order consistent
        # with the report and confusion matrix.
        score = accuracy_score(y_test, y_prediction)
        scores.append(score)
        print("Classification Report: " + process_name)
        print(report)
        cm = confusion_matrix(y_test, y_prediction)
        print("Confusion matrix:")
        print(cm)
    print(sum(scores) / num_expts)

### Set up naive baseline classification (countVectorizer)

In [85]:
# our two ingredients: the ngram counter and the classifier
nm = 5000
vect = CountVectorizer(ngram_range=(3,5), analyzer='char')
clf = LinearSVC()
ch2 = SelectKBest(chi2, k=nm)

# There are just two steps to our process: extracting the ngrams and
# putting them through the classifier. So our Pipeline looks like this:

count_pipeline = Pipeline([
    ('vect', vect),  # extract ngrams from tweet text
    ('kBest', ch2),
    ('clf' , clf),   # feed the output through a classifier
])

# Run the classifcation
run_experiment(X, y, count_pipeline, "CountVectorizer")

100%|██████████| 1/1 [00:03<00:00,  3.51s/it]

Classification Report: CountVectorizer
               precision    recall  f1-score   support

   hatespeech       0.45      0.33      0.38       245
not_offensive       0.87      0.92      0.89      1206

  avg / total       0.80      0.82      0.81      1451

Confusion matrix:
[[  80  165]
 [  96 1110]]
0.820124052378





#### Compare against NAACL 2016 dataset

In [89]:
# Keep the NAACL sample under its own name. Later cells read CrowdFlower
# feature columns (e.g. 'feat_word_root_rootparent') from `train_test_set`,
# so rebinding that name here breaks them when the notebook is run top to
# bottom.
# The sample size matches the CrowdFlower experiment (half of feat_df) but is
# capped at the number of NAACL rows so that sample() cannot fail.
naacl_sample_size = min(len(df_naacl), int(len(feat_df) / 2))
naacl_train_test_set = df_naacl.sample(n=naacl_sample_size, random_state=1965)

X_naacl = naacl_train_test_set['text']
y_naacl = naacl_train_test_set['annotation']

nm = 5000
vect = CountVectorizer(ngram_range=(3,5), analyzer='char')
clf = LinearSVC()
ch2 = SelectKBest(chi2, k=nm)

# Same three steps as the CrowdFlower baseline: extract character n-grams,
# keep the `nm` best features by chi-squared score, and classify.

count_pipeline = Pipeline([
    ('vect', vect),  # extract ngrams from tweet text
    ('kBest', ch2),  # keep the nm highest-scoring features
    ('clf' , clf),   # feed the output through a classifier
])

# Run the classification
run_experiment(X_naacl, y_naacl, count_pipeline, "CountVectorizer")

100%|██████████| 1/1 [00:04<00:00,  4.49s/it]

Classification Report: CountVectorizer
             precision    recall  f1-score   support

       none       0.85      0.89      0.87      1013
     racism       0.67      0.60      0.64       149
     sexism       0.70      0.63      0.66       289

avg / total       0.80      0.81      0.80      1451

Confusion matrix:
[[897  42  74]
 [ 55  90   4]
 [105   2 182]]
0.805651274983





### Set up naive baseline classification (hashingVectorizer)

In [62]:
# Hashing baseline: character 3- to 5-grams hashed into a feature vector
# and fed to a linear SVM.
vect = HashingVectorizer(analyzer='char', ngram_range=(3, 5))
clf = LinearSVC()

hashing_pipeline = Pipeline(steps=[('vect', vect), ('clf', clf)])
run_experiment(X, y, pipeline=hashing_pipeline, process_name="HashingVectorizer")

100%|██████████| 1/1 [00:03<00:00,  3.86s/it]

Classification Report: HashingVectorizer
               precision    recall  f1-score   support

   hatespeech       0.60      0.39      0.47       522
not_offensive       0.87      0.94      0.91      2380

  avg / total       0.82      0.84      0.83      2902

Confusion matrix:
[[ 201  321]
 [ 136 2244]]
0.842522398346





### Set up tf-idf baseline classification

In [63]:
# tf-idf baseline: character 3- to 5-grams weighted by tf-idf and fed to a
# linear SVM.
vect = TfidfVectorizer(analyzer='char', ngram_range=(3, 5))
clf = LinearSVC()

tfidf_pipeline = Pipeline(steps=[('vect', vect), ('clf', clf)])
run_experiment(X, y, pipeline=tfidf_pipeline, process_name="TfidfVectorizer")

100%|██████████| 1/1 [00:04<00:00,  4.78s/it]

Classification Report: TfidfVectorizer
               precision    recall  f1-score   support

   hatespeech       0.57      0.40      0.47       477
not_offensive       0.89      0.94      0.91      2425

  avg / total       0.84      0.85      0.84      2902

Confusion matrix:
[[ 190  287]
 [ 146 2279]]
0.850792556857





### Dependency tuple experiment

In [44]:
# This cell needs `train_test_set` to be the CrowdFlower sample, since it reads
# the 'feat_word_root_rootparent' and 'annotation_label' columns; re-run the
# data-split cell first if the name has been rebound.
# Each sample is the list of dependency-tuple strings stored for one tweet.
X = train_test_set['feat_word_root_rootparent']
y = train_test_set['annotation_label']

# NOTE(review): `non_negative` was deprecated and later removed from
# FeatureHasher in newer scikit-learn releases -- confirm against the
# installed version before upgrading.
hasher = FeatureHasher(input_type='string', non_negative=True)
clf = LinearSVC()

featurer_hasher_pipeline = Pipeline([
    ('hasher', hasher),  # hash each sample's feature strings into a feature vector
    ('clf' , clf),   # feed the output through a classifier
])
run_experiment(X, y, featurer_hasher_pipeline, "FeatureHasher")

100%|██████████| 1/1 [00:01<00:00,  1.06s/it]

Classification Report: FeatureHasher
               precision    recall  f1-score   support

   hatespeech       0.42      0.13      0.20       253
not_offensive       0.84      0.96      0.90      1198

  avg / total       0.77      0.82      0.77      1451

Confusion matrix:
[[  33  220]
 [  46 1152]]
0.816678152998





### Dependency context experiment

In [14]:
from sklearn.base import TransformerMixin, BaseEstimator
class TextExtractor(BaseEstimator, TransformerMixin):
    """Pipeline step that pulls a single column out of a DataFrame as a list.

       Adapted from code by @zacstewart
       https://github.com/zacstewart/kaggle_seeclickfix/blob/master/estimator.py
       Also see Zac Stewart's excellent blogpost on pipelines:
       http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
       """

    def __init__(self, column_name):
        # label of the column that transform() will select
        self.column_name = column_name

    def fit(self, *_):
        # nothing to learn; accepts and ignores whatever the pipeline passes
        return self

    def transform(self, df):
        # select the configured column and hand it on as a plain Python list
        selected = df[self.column_name]
        return selected.tolist()


class Apply(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies ``fn`` element-wise.

    The input is reshaped to a single column of shape ``(n_samples, 1)``
    before ``fn`` is applied to every element.

    Parameters
    ----------
    fn : callable
        Function of one element; vectorised with ``np.vectorize`` inside
        ``transform``.
    """

    def __init__(self, fn):
        # Store the callable exactly as passed. scikit-learn's get_params()
        # and clone() require constructor arguments to be kept unmodified;
        # wrapping fn here made clone() reject the estimator.
        self.fn = fn

    def transform(self, data):
        # note: reshaping is necessary because otherwise sklearn
        # interprets 1-d array as a single sample
        values = np.asarray(data)
        column = values.reshape(values.size, 1)
        if column.size == 0:
            # np.vectorize cannot infer an output type from zero elements
            return column
        return np.vectorize(self.fn)(column)

    def fit(self, *_):
        # nothing to learn
        return self
    
class BooleanExtractor(BaseEstimator, TransformerMixin):
    """Pipeline step that selects one column and returns it as an integer array.

    Parameters
    ----------
    column_name : str
        Label of the column that ``transform`` selects.
    """

    def __init__(self, column_name):
        self.column_name = column_name

    def transform(self, df):
        # Select the relevant column and cast it to integers. The builtin
        # `int` is what the removed `np.int` alias pointed to, so the
        # resulting dtype is unchanged.
        return np.asarray(df[self.column_name]).astype(int)

    def fit(self, *_):
        # nothing to learn
        return self

In [45]:
from itertools import chain
# This cell needs `train_test_set` to be the CrowdFlower sample, since it reads
# the dependency-context, keyword-count and annotation_label columns; re-run
# the data-split cell first if the name has been rebound.
X = train_test_set[['feat_dependency_contexts', 'hs_keyword_count']]
y = train_test_set['annotation_label']
clf = LinearSVC()
# Identity analyzer: each sample is already a list of feature strings, so the
# vectorizer uses the list items as its tokens.
empty_analyzer = lambda x: x
# NOTE(review): `transformer` is not used by any pipeline in this cell.
transformer = TfidfTransformer(smooth_idf=False)
vect = TfidfVectorizer(analyzer=empty_analyzer)

dependency_context_pipeline = Pipeline([
    ('dep_extractor', TextExtractor('feat_dependency_contexts')), # pull the dependency-context lists out of the df
    ('vect', vect)  # tf-idf weight the dependency-context strings
])

hs_keyword_count_pipeline = Pipeline([
    ('count_extractor', BooleanExtractor('hs_keyword_count')),  # keyword counts as an int array
    ('identity', Apply(lambda x: x))  # identity function; Apply reshapes the values into one column
])

pipeline = Pipeline([
    ('all_features', FeatureUnion([
        ('dependency_context_pipeline', dependency_context_pipeline), # tf-idf dependency-context features
        ('hs_keyword_count_pipeline', hs_keyword_count_pipeline),  # keyword-count column
    ])),
    ('clf' , clf),   # feed the output through a classifier
])

# NOTE(review): the report label below is the same as the tf-idf baseline's,
# so the two printed reports cannot be told apart by heading alone.
run_experiment(X, y, pipeline, "TfidfVectorizer")

# https://github.com/michelleful/SingaporeRoadnameOrigins/blob/master/notebooks/04%20Adding%20features%20with%20Pipelines.ipynb
# https://github.com/amueller/kaggle_insults/blob/e4abac805be1d1e2b3201a978172bafd36cc01e3/features.py
# http://www.markhneedham.com/blog/2015/03/02/python-scikit-learn-training-a-classifier-with-non-numeric-features/

100%|██████████| 1/1 [00:00<00:00,  1.58it/s]

Classification Report: TfidfVectorizer
               precision    recall  f1-score   support

   hatespeech       0.51      0.27      0.35       235
not_offensive       0.87      0.95      0.91      1216

  avg / total       0.81      0.84      0.82      1451

Confusion matrix:
[[  64  171]
 [  62 1154]]
0.839421088904



