# Model Building

This is a staging notebook for experiments related to the classifier model.

In [1]:
import numpy as np
from gensim import models, similarities
from tqdm import tqdm
import spacy
import joblib
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction import FeatureHasher, DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from modules.utils.CustomTwokenizer import CustomTwokenizer
from modules.utils import file_ops
from modules.utils import settings
from modules.utils import model_helpers

## Initialize params and objects 

In [2]:
# Set up plotly through the project helper (its behaviour is defined in modules.utils.model_helpers).
model_helpers.init_plotly()
# spaCy model loading is currently disabled, so no `nlp` object is created by this cell.
# nlp = spacy.load(settings.SPACY_EN_MODEL, create_make_doc=CustomTwokenizer)

## Load dataframes

#### Load, merge, and pickle [CrowdFlower Dataset]

In [3]:
# Arguments for model_helpers.fetch_as_df in the disabled rebuild steps below.
# NOTE(review): these look like [database, collection] names — confirm against fetch_as_df.
connection_params_1 = ["twitter", "crowdflower_features"]
connection_params_2 = ["twitter", "crowdflower_features_emo"]
# One-off rebuild of the raw CrowdFlower frame, left commented out so the notebook does
# not refetch on every run: fetch both sources, merge them on "_id", pickle the result to
# settings.CRWDFLR_DATA_RAW, and (last line) reload it from that pickle.
# df = model_helpers.fetch_as_df(connection_params_1, {})
# df_emo = model_helpers.fetch_as_df(connection_params_2, {"emotions":1})
# df = model_helpers.pd.DataFrame.merge(df, df_emo, on="_id")
# joblib.dump(df, settings.CRWDFLR_DATA_RAW, compress=True)
# df.head()
# df = joblib.load(settings.CRWDFLR_DATA_RAW)

#### Create a dataframe with classifier features [CrowdFlower dataset]

In [4]:
# One-off step (disabled): select the classifier feature columns from the raw frame `df`
# and pickle them to settings.CRWDFLR_DATA.
# feat_df = df[['_id', 'text', 'annotation_label', 'hs_keyword_matches', 'hs_keyword_count', 'unknown_words', 'unknown_words_count', 'comment_length', 'brown_cluster_ids', 'feat_dependency_contexts', 'feat_word_dep_root', 'feat_pos_dep_rootPos', 'feat_word_root_rootparent', 'feat_dep_unigrams', 'feat_dep_bigrams', 'feat_dep_trigrams']]
# joblib.dump(feat_df, settings.CRWDFLR_DATA, compress=True)
# Load the cached feature frame. joblib.load unpickles the file, so only load files you trust.
feat_df = joblib.load(settings.CRWDFLR_DATA)
# Show the first three rows as the cell output.
feat_df.head(3)

Unnamed: 0,_id,text,annotation_label,hs_keyword_matches,hs_keyword_count,unknown_words,unknown_words_count,comment_length,brown_cluster_ids,feat_dependency_contexts,feat_word_dep_root,feat_pos_dep_rootPos,feat_word_root_rootparent,feat_dep_unigrams,feat_dep_bigrams,feat_dep_trigrams
0,58c659be6541913eb7f119dd,Warning : penny boards will make you a faggot,not_offensive,[faggot],1,[],0,9,"[966, 228, 442, 4618, 602, 19]","[warning_:_punct, warning_make_acl, penny boar...","[warning_ROOT_warning, penny boards_nsubj_make...","[NN_ROOT_NN, NNS_nsubj_VB, MD_aux_VB, VB_acl_N...","[warning_warning_warning, penny boards_make_wa...","[warning_warning_ROOT_NN, penny boards_make_ns...",[warning_warning_ROOT_NN|penny boards_make_nsu...,[warning_warning_ROOT_NN|penny boards_make_nsu...
1,58c659be6541913eb7f119de,Fuck dykes,hatespeech,[],0,[],0,2,[],"[fuck_dykes_compoundINV, dykes_fuck_compound]","[fuck_compound_dykes, dykes_ROOT_dykes]","[NNP_compound_VBZ, VBZ_ROOT_VBZ]","[fuck_dykes_dykes, dykes_dykes_dykes]","[fuck_dykes_compound_NNP, dykes_dykes_ROOT_VBZ]",[fuck_dykes_compound_NNP|dykes_dykes_ROOT_VBZ],[]
2,58c659be6541913eb7f119df,user_mention user_mention user_mention user_me...,hatespeech,[faggot],1,[jefree],1,14,"[124, 3690, 966, 2442, 1684]",[user_mention user_mention user_mention user_m...,[user_mention user_mention user_mention user_m...,"[NN_ROOT_NN, IN_advmod_JJS, JJS_advmod_VBP, PR...",[user_mention user_mention user_mention user_m...,[user_mention user_mention user_mention user_m...,[user_mention user_mention user_mention user_m...,[user_mention user_mention user_mention user_m...


#### Load, merge, and pickle [NAACL_SRW_2016 and NLP+CSS_2016 datasets]

In [5]:
# Arguments for model_helpers.fetch_as_df in the disabled rebuild steps below.
# NOTE(review): these look like [database, collection] names — confirm against fetch_as_df.
connection_params_3 = ["twitter", "NAACL_SRW_2016_features"]
connection_params_4 = ["twitter", "NLP_CSS_2016_expert_features"]
# One-off rebuild (disabled): fetch both frames and pickle them to the paths in settings.
# df_naacl = model_helpers.fetch_as_df(connection_params_3, {})
# df_nlp = model_helpers.fetch_as_df(connection_params_4, {})
# joblib.dump(df_naacl, settings.NAACL_2016_DATA, compress=True)
# joblib.dump(df_nlp, settings.NLP_2016_DATA, compress=True)

# Load the cached frames. joblib.load unpickles the files, so only load files you trust.
df_naacl = joblib.load(settings.NAACL_2016_DATA)
df_nlp = joblib.load(settings.NLP_2016_DATA)
# Show the first three rows of the NLP+CSS frame as the cell output.
df_nlp.head(3)

Unnamed: 0,_id,annotation,avg_token_length,brown_cluster_ids,comment_length,feat_dep_bigrams,feat_dep_trigrams,feat_dep_unigrams,feat_dependency_contexts,feat_pos_dep_rootPos,feat_word_dep_root,feat_word_root_rootparent,has_hs_keywords,hs_keyword_count,hs_keyword_matches,text,unknown_words,unknown_words_count,uppercase_token_count
0,591c29f465419158a43b735d,neither,4.0,"[166, 1914, 12, 5829, 1020, 12, 5098, 20, 5098...",29,"[cisco_had_nsubj_NNP|had_allow_aux_VBD, had_al...",[cisco_had_nsubj_NNP|had_allow_aux_VBD|to_deal...,"[cisco_had_nsubj_NNP, had_allow_aux_VBD, to_de...","[cisco_had_nsubjINV, had_cisco_nsubj, had_deal...","[NNP_nsubj_VBD, VBD_aux_VB, TO_aux_VB, VB_xcom...","[cisco_nsubj_had, had_aux_allow, to_aux_deal, ...","[cisco_had_allow, had_allow_allow, to_deal_had...",False,0,[],Cisco had to deal with a fat cash payout to th...,"[fsf, compliancy]",2,2
1,591c29f465419158a43b735e,neither,5.0,"[551, 124, 3050, 4]",9,"[user_mention_i'm_nsubj_NN|i'm_i'm_ROOT_VBZ, i...",[user_mention_i'm_nsubj_NN|i'm_i'm_ROOT_VBZ|de...,"[user_mention_i'm_nsubj_NN, i'm_i'm_ROOT_VBZ, ...","[user_mention_i'm_nsubjINV, i'm_user_mention_n...","[NN_nsubj_VBZ, VBZ_ROOT_VBZ, JJ_acomp_VBZ, IN_...","[user_mention_nsubj_i'm, i'm_ROOT_i'm, decent_...","[user_mention_i'm_i'm, i'm_i'm_i'm, decent_i'm...",False,0,[],"user_mention I'm decent at editing , no worrie...",[^.^],1,0
2,591c29f465419158a43b735f,neither,4.0,"[442, 6314, 8, 5114, 3466, 508, 19, 853, 36, 1...",23,"[user_mention_read_nsubj_NN|will_read_aux_MD, ...",[user_mention_read_nsubj_NN|will_read_aux_MD|r...,"[user_mention_read_nsubj_NN, will_read_aux_MD,...","[user_mention_read_nsubjINV, will_read_auxINV,...","[NN_nsubj_VB, MD_aux_VB, VB_ROOT_VB, NN_aux_VB...","[user_mention_nsubj_read, will_aux_read, read_...","[user_mention_read_read, will_read_read, read_...",False,0,[],user_mention will read . gotta go afk for a bi...,[afk],1,0


In [6]:
def _print_dataset_summary(frame, label_column):
    """Print the total space-separated token count of `text` and the label frequencies.

    Tokens are counted by splitting each text on a single space character, so
    consecutive spaces contribute empty tokens to the total.
    """
    tokens_per_text = frame['text'].apply(lambda text: len(text.split(' ')))
    print('Number of words: {0}'.format(tokens_per_text.sum()))
    print(frame[label_column].value_counts())


_print_dataset_summary(df_naacl, 'annotation')
print()
_print_dataset_summary(feat_df, 'annotation_label')

Number of words: 296439
none      11070
sexism     3169
racism     1948
Name: annotation, dtype: int64

Number of words: 242568
not_offensive    12109
hatespeech        2399
Name: annotation_label, dtype: int64


### Naive baseline classification (CountVectorizer: character)

#### NAACL 2016 dataset

In [7]:
# Shuffle the whole NAACL frame with a fixed seed (n equals the row count, so every row is kept).
train_test_set = df_naacl.sample(n=len(df_naacl), random_state=1965)
X = train_test_set['text']
y = train_test_set['annotation']

# Grid-search settings; they are not consumed by the run_experiment call in this cell.
N_FEATURES_OPTIONS = [100000, 130000]
N_COMPONENTS = [120]
n_jobs = 2

param_grid = [
    {
        'skb__k': N_FEATURES_OPTIONS,
        'svd__n_components': N_COMPONENTS
    }
]

# Fixed settings used for the single run below.
k_features = 130000
n_components = 120

# `stop_words` only applies when analyzer='word', so it is omitted for character n-grams.
vect = CountVectorizer(ngram_range=(3, 5), analyzer='char')
# Seed the stochastic estimators so they do not add run-to-run variation.
clf = LinearSVC(random_state=1965)
svd = TruncatedSVD(n_components=n_components, random_state=1965)
skb = SelectKBest(chi2, k=k_features)

# Character 3-5-gram counts -> chi2 top-k selection -> truncated SVD -> linear SVM.
count_pipeline = Pipeline([
    ('vect', vect),
    ('skb', skb),
    ('svd', svd),
    ('clf', clf),
])

# NOTE(review): the meaning of the [False, True] flags is defined by model_helpers.run_experiment.
model_helpers.run_experiment(X, y, count_pipeline, "NAACL2016: CountVectorizer [character]", [False, True])

  0%|          | 0/1 [00:00<?, ?it/s]

Predicting the labels of the test set...
16187 documents
3 categories
Classification Report: NAACL2016: CountVectorizer [character]
             precision    recall  f1-score   support

       none       0.83      0.92      0.87      2230
     racism       0.69      0.71      0.70       390
     sexism       0.85      0.49      0.62       618

avg / total       0.82      0.81      0.80      3238



100%|██████████| 1/1 [00:40<00:00, 40.83s/it]


### Naive baseline classification (HashingVectorizer: character)

#### NAACL 2016 dataset

In [8]:
# Shuffle the whole NAACL frame with a fixed seed (n equals the row count, so every row is kept).
train_test_set = df_naacl.sample(n=len(df_naacl), random_state=1965)
X = train_test_set['text']
y = train_test_set['annotation']

vect = HashingVectorizer(ngram_range=(3, 5), analyzer='char')
# Seed the classifier so it does not add run-to-run variation.
clf = LinearSVC(random_state=1965)

# Hashed character 3-5-grams fed straight into a linear SVM.
hashing_pipeline = Pipeline([
    ('vect', vect),
    ('clf', clf),
])
# NOTE(review): the meaning of the [False, True] flags is defined by model_helpers.run_experiment.
model_helpers.run_experiment(X, y, hashing_pipeline, "NAACL2016: HashingVectorizer[character]", [False, True])

  0%|          | 0/1 [00:00<?, ?it/s]

Predicting the labels of the test set...
16187 documents
3 categories
Classification Report: NAACL2016: HashingVectorizer[character]
             precision    recall  f1-score   support

       none       0.85      0.93      0.89      2210
     racism       0.76      0.65      0.71       382
     sexism       0.82      0.61      0.70       646

avg / total       0.83      0.83      0.83      3238



100%|██████████| 1/1 [00:06<00:00,  6.67s/it]


### Naive baseline classification (TfidfVectorizer: character)

#### CrowdFlower dataset

In [9]:
# Shuffle the whole CrowdFlower frame with a fixed seed (n equals the row count, so every row is kept).
train_test_set = feat_df.sample(n=len(feat_df), random_state=1965)
X = train_test_set['text']
y = train_test_set['annotation_label']

# Grid-search settings; only the disabled run_gridsearch_cv call below would consume them.
N_FEATURES_OPTIONS = [100000, 200000]
N_COMPONENTS = [120]
n_jobs = 2

param_grid = [
    {
        'skb__k': N_FEATURES_OPTIONS,
        'svd__n_components': N_COMPONENTS
    }
]

# Fixed settings used for the single run below.
k_features = 150000
n_components = 120

# Seed the stochastic estimators so they do not add run-to-run variation.
svd = TruncatedSVD(n_components=n_components, random_state=1965)
skb = SelectKBest(chi2, k=k_features)
# `stop_words` only applies when analyzer='word', so it is omitted for character n-grams.
vect = TfidfVectorizer(ngram_range=(3, 5), analyzer='char')
clf = LinearSVC(random_state=1965)

# Character 3-5-gram tf-idf -> chi2 top-k selection -> truncated SVD -> linear SVM.
tfidf_pipeline = Pipeline([
    ('vect', vect),
    ('skb', skb),
    ('svd', svd),
    ('clf', clf),
])

# print(model_helpers.get_num_features(vect, X))
# model_helpers.run_gridsearch_cv(tfidf_pipeline, X, y, param_grid, n_jobs)
# NOTE(review): the meaning of the [False, True] flags is defined by model_helpers.run_experiment.
model_helpers.run_experiment(X, y, tfidf_pipeline, "CrowdFlower: LSA - TfidfVectorizer[character]", [False, True])

  0%|          | 0/1 [00:00<?, ?it/s]

Predicting the labels of the test set...
14508 documents
2 categories
Classification Report: CrowdFlower: LSA - TfidfVectorizer[character]
               precision    recall  f1-score   support

   hatespeech       0.56      0.27      0.37       477
not_offensive       0.87      0.96      0.91      2425

  avg / total       0.82      0.85      0.82      2902



100%|██████████| 1/1 [00:23<00:00, 23.75s/it]


#### NAACL 2016 dataset

In [10]:
# Shuffle the whole NAACL frame with a fixed seed (n equals the row count, so every row is kept).
train_test_set = df_naacl.sample(n=len(df_naacl), random_state=1965)
X = train_test_set['text']
y = train_test_set['annotation']

# Grid-search settings; only the disabled run_gridsearch_cv call below would consume them.
N_FEATURES_OPTIONS = [100000, 200000]
N_COMPONENTS = [120]
n_jobs = 2

param_grid = [
    {
        'skb__k': N_FEATURES_OPTIONS,
        'svd__n_components': N_COMPONENTS
    }
]

# Fixed settings used for the single run below.
k_features = 150000
n_components = 120

# Seed the stochastic estimators so they do not add run-to-run variation.
svd = TruncatedSVD(n_components=n_components, random_state=1965)
skb = SelectKBest(chi2, k=k_features)
# `stop_words` only applies when analyzer='word', so it is omitted for character n-grams.
vect = TfidfVectorizer(ngram_range=(3, 5), analyzer='char')

clf = LinearSVC(random_state=1965)
# Character 3-5-gram tf-idf -> chi2 top-k selection -> truncated SVD -> linear SVM.
tfidf_pipeline = Pipeline([
    ('vect', vect),
    ('skb', skb),
    ('svd', svd),
    ('clf', clf),
])

# print(model_helpers.get_num_features(vect, X))
# model_helpers.run_gridsearch_cv(tfidf_pipeline, X, y, param_grid, n_jobs)
# NOTE(review): the meaning of the [False, True] flags is defined by model_helpers.run_experiment.
model_helpers.run_experiment(X, y, tfidf_pipeline, "NAACL2016: TfidfVectorizer[character]", [False, True])

  0%|          | 0/1 [00:00<?, ?it/s]

Predicting the labels of the test set...
16187 documents
3 categories
Classification Report: NAACL2016: TfidfVectorizer[character]
             precision    recall  f1-score   support

       none       0.81      0.94      0.87      2205
     racism       0.77      0.58      0.66       377
     sexism       0.84      0.48      0.61       656

avg / total       0.81      0.81      0.79      3238



100%|██████████| 1/1 [00:26<00:00, 26.81s/it]


### Dependency tuple experiment

In [12]:
# Shuffle the whole CrowdFlower frame with a fixed seed (n equals the row count, so every row is kept).
train_test_set = feat_df.sample(n=len(feat_df), random_state=1965)
# Each row of this column is hashed as a collection of string tokens (input_type='string').
X = train_test_set['feat_word_root_rootparent']
y = train_test_set['annotation_label']

# NOTE(review): `non_negative` was deprecated in scikit-learn 0.19 and removed in 0.21.
# On newer versions the closest replacement is `alternate_sign=False`; confirm the
# installed version before switching, since older releases do not accept that argument.
hasher = FeatureHasher(input_type='string', non_negative=True)
# Seed the classifier so it does not add run-to-run variation.
clf = LinearSVC(random_state=1965)

# Hashed dependency tuples fed straight into a linear SVM.
featurer_hasher_pipeline = Pipeline([
    ('hasher', hasher),
    ('clf', clf),
])

# NOTE(review): the meaning of the [False, True] flags is defined by model_helpers.run_experiment.
model_helpers.run_experiment(X, y, featurer_hasher_pipeline, "CrowdFlower: FeatureHasher", [False, True])

  0%|          | 0/1 [00:00<?, ?it/s]

Predicting the labels of the test set...
14508 documents
2 categories
Classification Report: CrowdFlower: FeatureHasher
               precision    recall  f1-score   support

   hatespeech       0.59      0.21      0.31       482
not_offensive       0.86      0.97      0.91      2420

  avg / total       0.82      0.84      0.81      2902



100%|██████████| 1/1 [00:02<00:00,  2.79s/it]


## Model Feature Combinations

#### Update features with the top-k similar words from the Dependency2vec model

In [13]:
# Relative path to the word-vector file; it resolves against the notebook's working directory.
hs_candidates_exp6_word = "data/persistence/word_embeddings/dim200vecs_hs_candidates_exp6"
# Load the vectors from the text (binary=False) word2vec format.
hs_candidates_exp6_model = models.KeyedVectors.load_word2vec_format(hs_candidates_exp6_word, binary=False)

In [14]:
def _similar_hs_keywords(row):
    """Call fetch_top_k_similar for one row, using its 'hs_keyword_matches' field and k=5."""
    return model_helpers.fetch_top_k_similar(hs_candidates_exp6_model, row, 'hs_keyword_matches', 5)


# Add the derived column to both frames in place; later cells read it by name.
df_naacl['similar_hs_keywords'] = df_naacl.apply(_similar_hs_keywords, axis=1)
feat_df['similar_hs_keywords'] = feat_df.apply(_similar_hs_keywords, axis=1)

#### Train model

In [15]:
# Shuffle the whole NAACL frame with a fixed seed (n equals the row count, so every row is kept).
train_test_set = df_naacl.sample(n=len(df_naacl), random_state=1965)
# 'similar_hs_keywords' must already exist on df_naacl (it is added by an earlier cell).
X = train_test_set[['text', 'feat_dep_unigrams', 'hs_keyword_count', 'similar_hs_keywords']]
y = train_test_set['annotation']
# Seed the classifier so it does not add run-to-run variation.
clf = LinearSVC(random_state=1965)

# NOTE(review): presumably a pass-through analyzer for pre-tokenised lists — confirm in model_helpers.
empty_analyzer = model_helpers.empty_analyzer()
# Setup char ngram pipeline
# `stop_words` only applies when analyzer='word', so it is omitted for character n-grams.
char_vect = TfidfVectorizer(ngram_range=(3, 5), analyzer='char')
char_tfidf_pipeline = Pipeline([
    ('text_extractor', model_helpers.TextExtractor('text')),
    ('char_vect', char_vect)
])

# Setup feature tf-idf vectorizer
dep_context_vect = TfidfVectorizer(analyzer=empty_analyzer, max_df=0.3)
dependency_context_pipeline = Pipeline([
    ('dep_extractor', model_helpers.TextListExtractor('feat_dep_unigrams')), # extract names from df
    ('dep_vect', dep_context_vect)
])

# Keyword-count branch; it is built here but currently disabled in the FeatureUnion below.
hs_keyword_count_pipeline = Pipeline([
    ('count_extractor', model_helpers.BooleanExtractor('hs_keyword_count')),
    ('identity', model_helpers.Apply(lambda x: x))
])

# Setup similar hs_keywords vectorizer
dep2vec_vect = TfidfVectorizer(analyzer=empty_analyzer)
dep2vec_similarity_pipeline = Pipeline([
    ('dep2vec_extractor', model_helpers.TextListExtractor('similar_hs_keywords')),
    ('dep2vec_vect', dep2vec_vect)
])

k_features = 200000
n_components = 120
skb = SelectKBest(chi2, k=k_features)
# `svd` is constructed (seeded, for reproducibility if re-enabled) but is not a step of the pipeline below.
svd = TruncatedSVD(n_components=n_components, random_state=1965)

# Concatenate the feature branches, keep the chi2 top-k features, then fit the linear SVM.
pipeline = Pipeline([
    ('all_features', FeatureUnion([
        ('char_tfidf_pipeline', char_tfidf_pipeline),
        ('dependency_context_pipeline', dependency_context_pipeline),
#         ('hs_keyword_count_pipeline', hs_keyword_count_pipeline),
        ('dep2vec_similarity_pipeline', dep2vec_similarity_pipeline)
    ])),
    ('skb', skb),
    ('clf', clf)
])

# NOTE(review): the meaning of the [False, True] flags is defined by model_helpers.run_experiment.
model_helpers.run_experiment(X, y, pipeline, "TfidfVectorizer", [False, True])

# Background reading on combining heterogeneous features with Pipeline / FeatureUnion:
# https://github.com/michelleful/SingaporeRoadnameOrigins/blob/master/notebooks/04%20Adding%20features%20with%20Pipelines.ipynb
# https://github.com/amueller/kaggle_insults/blob/e4abac805be1d1e2b3201a978172bafd36cc01e3/features.py
# http://www.markhneedham.com/blog/2015/03/02/python-scikit-learn-training-a-classifier-with-non-numeric-features/

  0%|          | 0/1 [00:00<?, ?it/s]

Predicting the labels of the test set...
16187 documents
3 categories
Classification Report: TfidfVectorizer
             precision    recall  f1-score   support

       none       0.85      0.93      0.89      2228
     racism       0.78      0.70      0.74       386
     sexism       0.83      0.62      0.71       624

avg / total       0.84      0.84      0.84      3238



100%|██████████| 1/1 [00:09<00:00,  9.89s/it]
