# Model Building

This is a staging notebook for experiments related to the classifier model.

In [1]:
from itertools import chain
import pandas as pd
import numpy as np
from gensim import models, similarities
from tqdm import tqdm
import spacy
import joblib
from modules.db import mongo_base
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction import FeatureHasher, DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from modules.utils.CustomTwokenizer import CustomTwokenizer

### Store collection as dataframe

In [2]:
def fetch_as_df(connection_params, projection):
    client = mongo_base.connect()
    db_name = connection_params[0]
    connection_params.insert(0, client)
    query = {}
    query["filter"] = {}
    query["projection"] = projection
    query["limit"] = 0
    query["skip"] = 0
    query["no_cursor_timeout"] = True
    cursor = mongo_base.finder(connection_params, query, False)
    df = pd.DataFrame(list(cursor))
    return df

### Merge dataframes [CrowdFlower Dataset]

In [3]:
connection_params_1 = ["twitter", "crowdflower_features"]
connection_params_2 = ["twitter", "crowdflower_features_emo"]
# df = fetch_as_df(connection_params_1, {})
# df_emo = fetch_as_df(connection_params_2, {"emotions":1})
# df = pd.DataFrame.merge(df, df_emo, on="_id")
# df.head()

#### Pickle the raw feature collection

In [4]:
spacy_en_model = "en_core_web_md"
spacy_glove_model = "en_vectors_glove_md"
crowdflower_persistence_raw = 'data/persistence/df/crowdflower_features_raw.pkl.compressed'
crowdflower_persistence = 'data/persistence/df/crowdflower_features.pkl.compressed'
naacl_2016_persistence = 'data/persistence/df/naacl_2016.pkl.compressed'
nlp_2016_persistence = 'data/persistence/df/nlp_2016.pkl.compressed'
nlp = spacy.load(spacy_en_model, create_make_doc=CustomTwokenizer)
# joblib.dump(df, crowdflower_persistence_raw, compress=True)

In [5]:
# df = joblib.load(crowdflower_persistence_raw)

#### Create a dataframe with classifier features

In [6]:
# feat_df = df[['_id', 'text', 'annotation_label', 'hs_keyword_matches', 'hs_keyword_count', 'unknown_words', 'unknown_words_count', 'comment_length', 'brown_cluster_ids', 'feat_dependency_contexts', 'feat_word_dep_root', 'feat_pos_dep_rootPos', 'feat_word_root_rootparent', 'feat_dep_unigrams', 'feat_dep_bigrams', 'feat_dep_trigrams']]

In [7]:
# joblib.dump(feat_df, crowdflower_persistence, compress=True)
feat_df = joblib.load(crowdflower_persistence)
feat_df.head(3)

Unnamed: 0,_id,text,annotation_label,hs_keyword_matches,hs_keyword_count,unknown_words,unknown_words_count,comment_length,brown_cluster_ids,feat_dependency_contexts,feat_word_dep_root,feat_pos_dep_rootPos,feat_word_root_rootparent,feat_dep_unigrams,feat_dep_bigrams,feat_dep_trigrams
0,58c659be6541913eb7f119dd,Warning : penny boards will make you a faggot,not_offensive,[faggot],1,[],0,9,"[966, 228, 442, 4618, 602, 19]","[warning_:_punct, warning_make_acl, penny boar...","[warning_ROOT_warning, penny boards_nsubj_make...","[NN_ROOT_NN, NNS_nsubj_VB, MD_aux_VB, VB_acl_N...","[warning_warning_warning, penny boards_make_wa...","[warning_warning_ROOT_NN, penny boards_make_ns...",[warning_warning_ROOT_NN|penny boards_make_nsu...,[warning_warning_ROOT_NN|penny boards_make_nsu...
1,58c659be6541913eb7f119de,Fuck dykes,hatespeech,[],0,[],0,2,[],"[fuck_dykes_compoundINV, dykes_fuck_compound]","[fuck_compound_dykes, dykes_ROOT_dykes]","[NNP_compound_VBZ, VBZ_ROOT_VBZ]","[fuck_dykes_dykes, dykes_dykes_dykes]","[fuck_dykes_compound_NNP, dykes_dykes_ROOT_VBZ]",[fuck_dykes_compound_NNP|dykes_dykes_ROOT_VBZ],[]
2,58c659be6541913eb7f119df,user_mention user_mention user_mention user_me...,hatespeech,[faggot],1,[jefree],1,14,"[124, 3690, 966, 2442, 1684]",[user_mention user_mention user_mention user_m...,[user_mention user_mention user_mention user_m...,"[NN_ROOT_NN, IN_advmod_JJS, JJS_advmod_VBP, PR...",[user_mention user_mention user_mention user_m...,[user_mention user_mention user_mention user_m...,[user_mention user_mention user_mention user_m...,[user_mention user_mention user_mention user_m...


### Fetch NAACL_SRW_2016 and NLP+CSS_2016 datasets

In [8]:
# connection_params_3 = ["twitter", "NAACL_SRW_2016"]
# connection_params_4 = ["twitter", "NLP_CSS_2016_expert"]
# df_naacl = fetch_as_df(connection_params_3, {})
# df_nlp = fetch_as_df(connection_params_4, {})
# joblib.dump(df_naacl, naacl_2016_persistence, compress=True)
# joblib.dump(df_nlp, nlp_2016_persistence, compress=True)

In [9]:
df_naacl = joblib.load(naacl_2016_persistence)
df_nlp = joblib.load(nlp_2016_persistence)
df_nlp.head(3)

Unnamed: 0,_id,annotation,id_str,text
0,591c29f465419158a43b735d,neither,597576902212063232,Cisco had to deal with a fat cash payout to th...
1,591c29f465419158a43b735e,neither,565586175864610817,"@MadamPlumpette I'm decent at editing, no worr..."
2,591c29f465419158a43b735f,neither,563881580209246209,@girlziplocked will read. gotta go afk for a b...


### Setup generic model experiment

In [10]:
def run_experiment(X, y, pipeline, process_name, num_expts=1):
    scores = list()
    for i in tqdm(range(num_expts)):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, train_size=0.80)
        model = pipeline.fit(X_train, y_train)  # train the classifier
        y_prediction = model.predict(X_test)          # apply the model to the test data
        report = classification_report(y_test, y_prediction)
        score = accuracy_score(y_prediction, y_test)  # compare the results to the gold standard
        scores.append(score)
        print("Classification Report: " + process_name)
        print(report)
        cm = confusion_matrix(y_test, y_prediction)
#         print("Confusion matrix:")
#         print(cm)
#     print(sum(scores) / num_expts)

### Naive baseline classification (countVectorizer: character)

#### CrowdFlower dataset

In [11]:
train_test_set = feat_df.sample(n=int(len(feat_df)), random_state=1965)
X = train_test_set['text']
y = train_test_set['annotation_label']

print("Predicting the labels of the test set...")
print("%d documents" % len(X))
print("%d categories" % len(y.value_counts()))

# our two ingredients: the ngram counter and the classifier
nm = 5000
vect = CountVectorizer(ngram_range=(3,5), analyzer='char')
clf = LinearSVC()
ch2 = SelectKBest(chi2, k=nm)

# There are just two steps to our process: extracting the ngrams and
# putting them through the classifier. So our Pipeline looks like this:

count_pipeline = Pipeline([
    ('vect', vect),  # extract ngrams from tweet text
    ('kBest', ch2),
    ('clf' , clf),   # feed the output through a classifier
])

# Run the classifcation
run_experiment(X, y, count_pipeline, "CrowdFlower: CountVectorizer[character]")

  0%|          | 0/1 [00:00<?, ?it/s]

Predicting the labels of the test set...
14508 documents
2 categories


100%|██████████| 1/1 [00:06<00:00,  6.81s/it]

Classification Report: CrowdFlower: CountVectorizer[character]
               precision    recall  f1-score   support

   hatespeech       0.47      0.43      0.45       460
not_offensive       0.89      0.91      0.90      2442

  avg / total       0.83      0.83      0.83      2902






#### NAACL 2016 dataset

In [21]:
train_test_set = df_naacl.sample(n=int(len(feat_df)), random_state=1965)
X = train_test_set['text']
y = train_test_set['annotation']

print("Predicting the labels of the test set...")
print("%d documents" % len(X))
print("%d categories" % len(y.value_counts()))

nm = 5000
vect = CountVectorizer(ngram_range=(3,5), analyzer='char')
clf = LinearSVC()
ch2 = SelectKBest(chi2, k=nm)

# There are just two steps to our process: extracting the ngrams and
# putting them through the classifier. So our Pipeline looks like this:

count_pipeline = Pipeline([
    ('vect', vect),
    ('kBest', ch2),
    ('clf' , clf),
])

# Run the classifcation
run_experiment(X, y, count_pipeline, "NAACL2016: CountVectorizer [character]")

  0%|          | 0/1 [00:00<?, ?it/s]

Predicting the labels of the test set...
14508 documents
3 categories


100%|██████████| 1/1 [00:09<00:00,  9.66s/it]

Classification Report: NAACL2016: CountVectorizer [character]
             precision    recall  f1-score   support

       none       0.87      0.88      0.87      2048
     racism       0.67      0.66      0.66       332
     sexism       0.71      0.67      0.69       522

avg / total       0.82      0.82      0.82      2902






### Naive baseline classification (hashingVectorizer: character)

#### CrowdFlower dataset

In [13]:
train_test_set = feat_df.sample(n=int(len(feat_df)), random_state=1965)
X = train_test_set['text']
y = train_test_set['annotation_label']

print("Predicting the labels of the test set...")
print("%d documents" % len(X))
print("%d categories" % len(y.value_counts()))

vect = HashingVectorizer(ngram_range=(3,5), analyzer='char')
clf = LinearSVC()

hashing_pipeline = Pipeline([
    ('vect', vect),
    ('clf' , clf), 
])
run_experiment(X, y, hashing_pipeline, "CrowdFlower: HashingVectorizer[character]")

  0%|          | 0/1 [00:00<?, ?it/s]

Predicting the labels of the test set...
14508 documents
2 categories


100%|██████████| 1/1 [00:04<00:00,  4.02s/it]

Classification Report: CrowdFlower: HashingVectorizer[character]
               precision    recall  f1-score   support

   hatespeech       0.55      0.36      0.43       478
not_offensive       0.88      0.94      0.91      2424

  avg / total       0.83      0.85      0.83      2902






#### NAACL 2016 dataset

In [14]:
train_test_set = df_naacl.sample(n=int(len(feat_df)), random_state=1965)
X = train_test_set['text']
y = train_test_set['annotation']

print("Predicting the labels of the test set...")
print("%d documents" % len(X))
print("%d categories" % len(y.value_counts()))

vect = HashingVectorizer(ngram_range=(3,5), analyzer='char')
clf = LinearSVC()

hashing_pipeline = Pipeline([
    ('vect', vect),
    ('clf' , clf), 
])
run_experiment(X, y, hashing_pipeline, "NAACL2016: HashingVectorizer[character]")

  0%|          | 0/1 [00:00<?, ?it/s]

Predicting the labels of the test set...
14508 documents
3 categories


100%|██████████| 1/1 [00:05<00:00,  5.70s/it]

Classification Report: NAACL2016: HashingVectorizer[character]
             precision    recall  f1-score   support

       none       0.86      0.94      0.89      2000
     racism       0.81      0.69      0.74       350
     sexism       0.83      0.64      0.72       552

avg / total       0.85      0.85      0.84      2902






### Naive baseline classification (TfidfVectorizer: character)

#### CrowdFlower dataset

In [15]:
train_test_set = feat_df.sample(n=int(len(feat_df)), random_state=1965)
X = train_test_set['text']
y = train_test_set['annotation_label']

print("Predicting the labels of the test set...")
print("%d documents" % len(X))
print("%d categories" % len(y.value_counts()))

vect = TfidfVectorizer(ngram_range=(3,5), analyzer='char')
clf = LinearSVC()

tfidf_pipeline = Pipeline([
    ('vect', vect),
    ('clf' , clf),
])
run_experiment(X, y, tfidf_pipeline, "CrowdFlower: TfidfVectorizer[character]")

  0%|          | 0/1 [00:00<?, ?it/s]

Predicting the labels of the test set...
14508 documents
2 categories


100%|██████████| 1/1 [00:04<00:00,  4.75s/it]

Classification Report: CrowdFlower: TfidfVectorizer[character]
               precision    recall  f1-score   support

   hatespeech       0.59      0.37      0.46       468
not_offensive       0.89      0.95      0.92      2434

  avg / total       0.84      0.86      0.84      2902






#### NAACL 2016 dataset

In [16]:
train_test_set = df_naacl.sample(n=int(len(feat_df)), random_state=1965)
X = train_test_set['text']
y = train_test_set['annotation']

print("Predicting the labels of the test set...")
print("%d documents" % len(X))
print("%d categories" % len(y.value_counts()))

vect = TfidfVectorizer(ngram_range=(3,5), analyzer='char')
clf = LinearSVC()

tfidf_pipeline = Pipeline([
    ('vect', vect),
    ('clf' , clf),
])
run_experiment(X, y, tfidf_pipeline, "NAACL2016: TfidfVectorizer[character]")

  0%|          | 0/1 [00:00<?, ?it/s]

Predicting the labels of the test set...
14508 documents
3 categories


100%|██████████| 1/1 [00:07<00:00,  7.22s/it]

Classification Report: NAACL2016: TfidfVectorizer[character]
             precision    recall  f1-score   support

       none       0.87      0.93      0.90      1994
     racism       0.77      0.66      0.71       347
     sexism       0.86      0.70      0.77       561

avg / total       0.85      0.85      0.85      2902






### Dependency tuple experiment

In [17]:
train_test_set = feat_df.sample(n=int(len(feat_df)), random_state=1965)
X = train_test_set['feat_word_root_rootparent']
y = train_test_set['annotation_label']

hasher = FeatureHasher(input_type='string', non_negative=True)
clf = LinearSVC()

featurer_hasher_pipeline = Pipeline([
    ('hasher', hasher),
    ('clf' , clf),
])
run_experiment(X, y, featurer_hasher_pipeline, "CrowdFlower: FeatureHasher")

100%|██████████| 1/1 [00:01<00:00,  1.96s/it]

Classification Report: CrowdFlower: FeatureHasher
               precision    recall  f1-score   support

   hatespeech       0.54      0.19      0.28       497
not_offensive       0.85      0.97      0.91      2405

  avg / total       0.80      0.83      0.80      2902






### Dependency context experiment

In [18]:
from sklearn.base import TransformerMixin, BaseEstimator
class TextExtractor(BaseEstimator, TransformerMixin):
    """Adapted from code by @zacstewart 
       https://github.com/zacstewart/kaggle_seeclickfix/blob/master/estimator.py
       Also see Zac Stewart's excellent blogpost on pipelines:
       http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
       """
    
    def __init__(self, column_name):
        self.column_name = column_name

    def transform(self, df):
        # select the relevant column and return it as a numpy array
        # set the array type to be string
        return df[self.column_name].tolist()
#         return np.asarray(df[self.column_name]).astype(str)
        
    def fit(self, *_):
        return self


class Apply(BaseEstimator, TransformerMixin):
    """Applies a function f element-wise to the numpy array
    """
    
    def __init__(self, fn):
        self.fn = np.vectorize(fn)
        
    def transform(self, data):
        # note: reshaping is necessary because otherwise sklearn
        # interprets 1-d array as a single sample
        return self.fn(data.reshape(data.size, 1))

    def fit(self, *_):
        return self
    
class BooleanExtractor(BaseEstimator, TransformerMixin):
    
    def __init__(self, column_name):
        self.column_name = column_name

    def transform(self, df):
        # select the relevant column and return it as a numpy array
        # set the array type to be string
        return np.asarray(df[self.column_name]).astype(np.int)
                                                       
    def fit(self, *_):
        return self

In [19]:
from itertools import chain
X = train_test_set[['feat_dependency_contexts', 'hs_keyword_count']]
y = train_test_set['annotation_label']
clf = LinearSVC()
empty_analyzer = lambda x: x
transformer = TfidfTransformer(smooth_idf=False)
vect = TfidfVectorizer(analyzer=empty_analyzer)

dependency_context_pipeline = Pipeline([
    ('dep_extractor', TextExtractor('feat_dependency_contexts')), # extract names from df
    ('vect', vect)
])

hs_keyword_count_pipeline = Pipeline([
    ('count_extractor', BooleanExtractor('hs_keyword_count')),
    ('identity', Apply(lambda x: x))
])

pipeline = Pipeline([
    ('all_features', FeatureUnion([
        ('dependency_context_pipeline', dependency_context_pipeline), # all text features
        ('hs_keyword_count_pipeline', hs_keyword_count_pipeline),
    ])),
    ('clf' , clf),   # feed the output through a classifier
])

run_experiment(X, y, pipeline, "TfidfVectorizer")

# https://github.com/michelleful/SingaporeRoadnameOrigins/blob/master/notebooks/04%20Adding%20features%20with%20Pipelines.ipynb
# https://github.com/amueller/kaggle_insults/blob/e4abac805be1d1e2b3201a978172bafd36cc01e3/features.py
# http://www.markhneedham.com/blog/2015/03/02/python-scikit-learn-training-a-classifier-with-non-numeric-features/

100%|██████████| 1/1 [00:01<00:00,  1.42s/it]

Classification Report: TfidfVectorizer
               precision    recall  f1-score   support

   hatespeech       0.57      0.30      0.39       475
not_offensive       0.87      0.95      0.91      2427

  avg / total       0.82      0.85      0.83      2902




