# Model Building

This is a staging notebook for experiments related to the classifier model.

In [1]:
from itertools import chain
import pandas as pd
import numpy as np
from gensim import models, similarities
from tqdm import tqdm
import spacy
import joblib
from gensim import models, similarities
from modules.db import mongo_base
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction import FeatureHasher, DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from modules.utils.CustomTwokenizer import CustomTwokenizer

### Store collection as dataframe

In [2]:
def fetch_as_df(connection_params, projection):
    """Load every document of a MongoDB collection into a DataFrame.

    Args:
        connection_params: Sequence of connection settings; the cells in
            this notebook pass ``[database_name, collection_name]``.
            The sequence is left unmodified.
        projection: Projection document passed through to the query
            (``{}`` is used in this notebook to request all fields).

    Returns:
        pandas.DataFrame with one row per document yielded by the cursor.
    """
    client = mongo_base.connect()
    # Build a new list instead of inserting into the caller's list: the
    # original in-place insert corrupted `connection_params` for any
    # subsequent call that reused the same list.
    finder_params = [client] + list(connection_params)
    query = {
        "filter": {},
        "projection": projection,
        "limit": 0,
        "skip": 0,
        "no_cursor_timeout": True,
    }
    # NOTE(review): the meaning of the trailing `False` flag is defined by
    # modules.db.mongo_base.finder and is not visible here.
    cursor = mongo_base.finder(finder_params, query, False)
    try:
        return pd.DataFrame(list(cursor))
    finally:
        # The query asks for no cursor timeout, so release the cursor
        # explicitly when the returned object offers a close() method.
        close = getattr(cursor, "close", None)
        if callable(close):
            close()

### Merge dataframes [CrowdFlower Dataset]

In [3]:
# Connection settings for the two CrowdFlower feature collections.
# fetch_as_df treats the first element as the database name; the second is
# presumably the collection name (confirm against modules.db.mongo_base).
connection_params_1 = ["twitter", "crowdflower_features"]
connection_params_2 = ["twitter", "crowdflower_features_emo"]
# One-off MongoDB export kept for reference: fetch both collections and
# join them on "_id".
# df = fetch_as_df(connection_params_1, {})
# df_emo = fetch_as_df(connection_params_2, {"emotions":1})
# df = pd.DataFrame.merge(df, df_emo, on="_id")
# df.head()

#### Pickle the raw feature collection

In [4]:
# spaCy model names. spacy_en_model is only referenced by the commented-out
# spacy.load call below; spacy_glove_model is not used in the visible cells.
spacy_en_model = "en_core_web_md"
spacy_glove_model = "en_vectors_glove_md"
# Locations of the joblib-persisted DataFrames used throughout the notebook.
crowdflower_persistence_raw = 'data/persistence/df/crowdflower_features_raw.pkl.compressed'
crowdflower_persistence = 'data/persistence/df/crowdflower_features.pkl.compressed'
naacl_2016_persistence = 'data/persistence/df/naacl_2016.pkl.compressed'
nlp_2016_persistence = 'data/persistence/df/nlp_2016.pkl.compressed'
# One-off steps kept for reference: load spaCy with the custom tokenizer and
# persist the raw CrowdFlower frame.
# nlp = spacy.load(spacy_en_model, create_make_doc=CustomTwokenizer)
# joblib.dump(df, crowdflower_persistence_raw, compress=True)

In [5]:
# df = joblib.load(crowdflower_persistence_raw)

#### Create a dataframe with classifier features

In [6]:
# feat_df = df[['_id', 'text', 'annotation_label', 'hs_keyword_matches', 'hs_keyword_count', 'unknown_words', 'unknown_words_count', 'comment_length', 'brown_cluster_ids', 'feat_dependency_contexts', 'feat_word_dep_root', 'feat_pos_dep_rootPos', 'feat_word_root_rootparent', 'feat_dep_unigrams', 'feat_dep_bigrams', 'feat_dep_trigrams']]

In [5]:
# One-off step kept for reference: persist the classifier-feature frame.
# joblib.dump(feat_df, crowdflower_persistence, compress=True)
# Load the persisted CrowdFlower feature frame and preview three rows.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files
# from a trusted source.
feat_df = joblib.load(crowdflower_persistence)
feat_df.head(3)

Unnamed: 0,_id,text,annotation_label,hs_keyword_matches,hs_keyword_count,unknown_words,unknown_words_count,comment_length,brown_cluster_ids,feat_dependency_contexts,feat_word_dep_root,feat_pos_dep_rootPos,feat_word_root_rootparent,feat_dep_unigrams,feat_dep_bigrams,feat_dep_trigrams
0,58c659be6541913eb7f119dd,Warning : penny boards will make you a faggot,not_offensive,[faggot],1,[],0,9,"[966, 228, 442, 4618, 602, 19]","[warning_:_punct, warning_make_acl, penny boar...","[warning_ROOT_warning, penny boards_nsubj_make...","[NN_ROOT_NN, NNS_nsubj_VB, MD_aux_VB, VB_acl_N...","[warning_warning_warning, penny boards_make_wa...","[warning_warning_ROOT_NN, penny boards_make_ns...",[warning_warning_ROOT_NN|penny boards_make_nsu...,[warning_warning_ROOT_NN|penny boards_make_nsu...
1,58c659be6541913eb7f119de,Fuck dykes,hatespeech,[],0,[],0,2,[],"[fuck_dykes_compoundINV, dykes_fuck_compound]","[fuck_compound_dykes, dykes_ROOT_dykes]","[NNP_compound_VBZ, VBZ_ROOT_VBZ]","[fuck_dykes_dykes, dykes_dykes_dykes]","[fuck_dykes_compound_NNP, dykes_dykes_ROOT_VBZ]",[fuck_dykes_compound_NNP|dykes_dykes_ROOT_VBZ],[]
2,58c659be6541913eb7f119df,user_mention user_mention user_mention user_me...,hatespeech,[faggot],1,[jefree],1,14,"[124, 3690, 966, 2442, 1684]",[user_mention user_mention user_mention user_m...,[user_mention user_mention user_mention user_m...,"[NN_ROOT_NN, IN_advmod_JJS, JJS_advmod_VBP, PR...",[user_mention user_mention user_mention user_m...,[user_mention user_mention user_mention user_m...,[user_mention user_mention user_mention user_m...,[user_mention user_mention user_mention user_m...


### Fetch NAACL_SRW_2016 and NLP+CSS_2016 datasets

In [8]:
# connection_params_3 = ["twitter", "NAACL_SRW_2016_features"]
# connection_params_4 = ["twitter", "NLP_CSS_2016_expert_features"]
# df_naacl = fetch_as_df(connection_params_3, {})
# df_nlp = fetch_as_df(connection_params_4, {})
# joblib.dump(df_naacl, naacl_2016_persistence, compress=True)
# joblib.dump(df_nlp, nlp_2016_persistence, compress=True)

In [6]:
# Load the persisted NAACL 2016 and NLP+CSS 2016 frames, then preview the latter.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files
# from a trusted source.
df_naacl = joblib.load(naacl_2016_persistence)
df_nlp = joblib.load(nlp_2016_persistence)
df_nlp.head(3)

Unnamed: 0,_id,annotation,avg_token_length,brown_cluster_ids,comment_length,feat_dep_bigrams,feat_dep_trigrams,feat_dep_unigrams,feat_dependency_contexts,feat_pos_dep_rootPos,feat_word_dep_root,feat_word_root_rootparent,has_hs_keywords,hs_keyword_count,hs_keyword_matches,text,unknown_words,unknown_words_count,uppercase_token_count
0,591c29f465419158a43b735d,neither,4.0,"[166, 1914, 12, 5829, 1020, 12, 5098, 20, 5098...",29,"[cisco_had_nsubj_NNP|had_allow_aux_VBD, had_al...",[cisco_had_nsubj_NNP|had_allow_aux_VBD|to_deal...,"[cisco_had_nsubj_NNP, had_allow_aux_VBD, to_de...","[cisco_had_nsubjINV, had_cisco_nsubj, had_deal...","[NNP_nsubj_VBD, VBD_aux_VB, TO_aux_VB, VB_xcom...","[cisco_nsubj_had, had_aux_allow, to_aux_deal, ...","[cisco_had_allow, had_allow_allow, to_deal_had...",False,0,[],Cisco had to deal with a fat cash payout to th...,"[fsf, compliancy]",2,2
1,591c29f465419158a43b735e,neither,5.0,"[551, 124, 3050, 4]",9,"[user_mention_i'm_nsubj_NN|i'm_i'm_ROOT_VBZ, i...",[user_mention_i'm_nsubj_NN|i'm_i'm_ROOT_VBZ|de...,"[user_mention_i'm_nsubj_NN, i'm_i'm_ROOT_VBZ, ...","[user_mention_i'm_nsubjINV, i'm_user_mention_n...","[NN_nsubj_VBZ, VBZ_ROOT_VBZ, JJ_acomp_VBZ, IN_...","[user_mention_nsubj_i'm, i'm_ROOT_i'm, decent_...","[user_mention_i'm_i'm, i'm_i'm_i'm, decent_i'm...",False,0,[],"user_mention I'm decent at editing , no worrie...",[^.^],1,0
2,591c29f465419158a43b735f,neither,4.0,"[442, 6314, 8, 5114, 3466, 508, 19, 853, 36, 1...",23,"[user_mention_read_nsubj_NN|will_read_aux_MD, ...",[user_mention_read_nsubj_NN|will_read_aux_MD|r...,"[user_mention_read_nsubj_NN, will_read_aux_MD,...","[user_mention_read_nsubjINV, will_read_auxINV,...","[NN_nsubj_VB, MD_aux_VB, VB_ROOT_VB, NN_aux_VB...","[user_mention_nsubj_read, will_aux_read, read_...","[user_mention_read_read, will_read_read, read_...",False,0,[],user_mention will read . gotta go afk for a bi...,[afk],1,0


### Setup generic model experiment

In [10]:
def run_experiment(X, y, pipeline, process_name, num_expts=1,
                   test_size=0.20, random_state=None):
    """Fit and evaluate ``pipeline`` on repeated random train/test splits.

    For each of ``num_expts`` rounds the data is split, the pipeline is
    fitted on the training part, and a classification report for the
    held-out part is printed.

    Args:
        X: Feature data accepted by ``pipeline.fit``.
        y: Target labels aligned with ``X``.
        pipeline: Estimator exposing ``fit`` and ``predict``; it is refitted
            in place on every round.
        process_name: Label appended to the printed report heading.
        num_expts: Number of split/fit/evaluate rounds.
        test_size: Fraction of the data held out for evaluation.
        random_state: Optional integer seed. Round ``i`` uses
            ``random_state + i`` so rounds differ but the whole run is
            repeatable. ``None`` keeps the splits unseeded.

    Returns:
        list of float: the accuracy of each round, in order.
    """
    scores = []
    for i in tqdm(range(num_expts)):
        seed = None if random_state is None else random_state + i
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=seed)
        model = pipeline.fit(X_train, y_train)   # train the classifier
        y_prediction = model.predict(X_test)     # apply the model to the test data

        print("Classification Report: " + process_name)
        print(classification_report(y_test, y_prediction))

        # scikit-learn metrics take (y_true, y_pred); accuracy is symmetric,
        # but the conventional order keeps this in line with the report above.
        scores.append(accuracy_score(y_test, y_prediction))
    # Previously the scores were collected and then discarded.
    return scores

### Setup helpers [GridSearch, numFeatures]

In [7]:
def run_gridsearch_cv(pipeline, X, y, param_grid, n_jobs):
    """Run a cross-validated grid search and print the best configuration.

    Args:
        pipeline: Estimator or Pipeline to tune.
        X: Feature data passed to ``GridSearchCV.fit``.
        y: Target labels aligned with ``X``.
        param_grid: Parameter grid in the format GridSearchCV expects.
        n_jobs: Number of parallel jobs for the search.

    Returns:
        The fitted ``GridSearchCV`` instance, so callers can inspect more
        than the printed best parameters and score.
    """
    # Honour the caller's n_jobs; it used to be silently replaced by 4.
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, n_jobs=n_jobs)
    grid_search.fit(X, y)
    print(grid_search.best_params_)
    print(grid_search.best_score_)
    return grid_search
    
def get_num_features(vectorizer, X):
    """Fit ``vectorizer`` on ``X`` and return how many features it produced.

    Args:
        vectorizer: Object with ``fit`` and either ``get_feature_names_out``
            or ``get_feature_names``. It is fitted in place.
        X: Data passed to ``vectorizer.fit``.

    Returns:
        int: number of feature names reported by the fitted vectorizer.
    """
    # Use the argument, not the notebook-global `vect` the body used to
    # reference, which made the function depend on hidden kernel state.
    vectorizer.fit(X)
    # Use get_feature_names_out when the vectorizer provides it, and fall
    # back to get_feature_names otherwise.
    names_getter = getattr(vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    return len(names_getter())

### Naive baseline classification (CountVectorizer: character n-grams)

#### CrowdFlower dataset

In [12]:
# Shuffle the full CrowdFlower frame with a fixed seed so the row order is repeatable.
train_test_set = feat_df.sample(n=len(feat_df), random_state=1965)
X = train_test_set['text']
y = train_test_set['annotation_label']

print("Predicting the labels of the test set...")
print("%d documents" % len(X))
print("%d categories" % len(y.value_counts()))

# Candidate values for the (currently disabled) grid search below.
N_FEATURES_OPTIONS = [100000, 150000]
# http://www.kdnuggets.com/2016/08/approaching-almost-any-machine-learning-problem.html/2
N_COMPONENTS = [120]
n_jobs = 2

param_grid = [
    {
        'skb__k': N_FEATURES_OPTIONS,
        'svd__n_components': N_COMPONENTS
    }
]

# Params learned through GridSearch
k_features = 100000
n_components = 120

# `stop_words` only takes effect with analyzer='word', so it is not passed
# for character n-grams (it was silently ignored before).
vect = CountVectorizer(ngram_range=(3,5), analyzer='char')
# print(get_num_features(vect, X))

clf = LinearSVC()
svd = TruncatedSVD(n_components=n_components)
skb = SelectKBest(chi2, k=k_features)

count_pipeline = Pipeline([
    ('vect', vect),  # extract character n-grams from tweet text
    ('skb', skb),    # keep the k_features best features by chi2 score
    ('svd', svd),    # reduce to n_components dimensions
    ('clf' , clf),   # feed the output through a classifier
])

# run_gridsearch_cv(count_pipeline, X, y, param_grid, n_jobs)

# Run the classification
run_experiment(X, y, count_pipeline, "CrowdFlower: CountVectorizer[character] LSA")

  0%|          | 0/1 [00:00<?, ?it/s]

Predicting the labels of the test set...
14508 documents
2 categories


100%|██████████| 1/1 [00:22<00:00, 22.23s/it]

Classification Report: CrowdFlower: CountVectorizer[character] LSA
               precision    recall  f1-score   support

   hatespeech       0.59      0.34      0.43       470
not_offensive       0.88      0.95      0.92      2432

  avg / total       0.83      0.85      0.84      2902






#### NAACL 2016 dataset

In [13]:
# Shuffle the full NAACL 2016 frame with a fixed seed so the row order is repeatable.
train_test_set = df_naacl.sample(n=len(df_naacl), random_state=1965)
X = train_test_set['text']
y = train_test_set['annotation']

print("Predicting the labels of the test set...")
print("%d documents" % len(X))
print("%d categories" % len(y.value_counts()))

# Candidate values for the (currently disabled) grid search below.
N_FEATURES_OPTIONS = [100000,130000]
N_COMPONENTS = [120]
n_jobs = 2

param_grid = [
    {
        'skb__k': N_FEATURES_OPTIONS,
        'svd__n_components': N_COMPONENTS
    }
]

# Params learned through GridSearch
k_features = 130000
n_components = 120

# `stop_words` only takes effect with analyzer='word', so it is not passed
# for character n-grams (it was silently ignored before).
vect = CountVectorizer(ngram_range=(3,5), analyzer='char')
# print(get_num_features(vect, X))

clf = LinearSVC()
svd = TruncatedSVD(n_components=n_components)
skb = SelectKBest(chi2, k=k_features)

count_pipeline = Pipeline([
    ('vect', vect),  # extract character n-grams from tweet text
    ('skb', skb),    # keep the k_features best features by chi2 score
    ('svd', svd),    # reduce to n_components dimensions
    ('clf' , clf),   # feed the output through a classifier
])

# run_gridsearch_cv(count_pipeline, X, y, param_grid, n_jobs)
run_experiment(X, y, count_pipeline, "NAACL2016: CountVectorizer [character]")

  0%|          | 0/1 [00:00<?, ?it/s]

Predicting the labels of the test set...
16187 documents
3 categories


100%|██████████| 1/1 [00:38<00:00, 38.57s/it]

Classification Report: NAACL2016: CountVectorizer [character]
             precision    recall  f1-score   support

       none       0.83      0.94      0.88      2214
     racism       0.77      0.69      0.73       383
     sexism       0.85      0.52      0.65       641

avg / total       0.83      0.83      0.82      3238






### Naive baseline classification (HashingVectorizer: character n-grams)

#### CrowdFlower dataset

In [14]:
# Shuffle the full CrowdFlower frame with a fixed seed so the row order is repeatable.
train_test_set = feat_df.sample(n=len(feat_df), random_state=1965)
X = train_test_set['text']
y = train_test_set['annotation_label']

print("Predicting the labels of the test set...")
print("%d documents" % len(X))
print("%d categories" % len(y.value_counts()))

# `stop_words` only takes effect with analyzer='word', so it is not passed
# for character n-grams; this also matches the NAACL hashing cell.
vect = HashingVectorizer(ngram_range=(3,5), analyzer='char')
clf = LinearSVC()

hashing_pipeline = Pipeline([
    ('vect', vect),  # hash character n-grams into a fixed-size feature space
    ('clf' , clf),   # feed the output through a classifier
])
run_experiment(X, y, hashing_pipeline, "CrowdFlower: HashingVectorizer[character]")

  0%|          | 0/1 [00:00<?, ?it/s]

Predicting the labels of the test set...
14508 documents
2 categories


100%|██████████| 1/1 [00:03<00:00,  3.73s/it]

Classification Report: CrowdFlower: HashingVectorizer[character]
               precision    recall  f1-score   support

   hatespeech       0.56      0.38      0.45       471
not_offensive       0.89      0.94      0.91      2431

  avg / total       0.83      0.85      0.84      2902






#### NAACL 2016 dataset

In [15]:
# NAACL 2016: hashed character n-grams fed straight into a linear SVM.
# frac=1 draws every row, i.e. a seeded shuffle of the whole frame.
train_test_set = df_naacl.sample(frac=1, random_state=1965)
X, y = train_test_set['text'], train_test_set['annotation']

print("Predicting the labels of the test set...")
print("%d documents" % len(X))
print("%d categories" % len(y.value_counts()))

vect = HashingVectorizer(analyzer='char', ngram_range=(3, 5))
clf = LinearSVC()

hashing_steps = [('vect', vect), ('clf', clf)]
hashing_pipeline = Pipeline(steps=hashing_steps)
run_experiment(X, y, hashing_pipeline, "NAACL2016: HashingVectorizer[character]")

  0%|          | 0/1 [00:00<?, ?it/s]

Predicting the labels of the test set...
16187 documents
3 categories


100%|██████████| 1/1 [00:06<00:00,  6.18s/it]

Classification Report: NAACL2016: HashingVectorizer[character]
             precision    recall  f1-score   support

       none       0.86      0.94      0.89      2197
     racism       0.79      0.68      0.73       377
     sexism       0.85      0.65      0.74       664

avg / total       0.85      0.85      0.84      3238






### Naive baseline classification (TfidfVectorizer: character)

#### CrowdFlower dataset

In [16]:
# Shuffle the full CrowdFlower frame with a fixed seed so the row order is repeatable.
train_test_set = feat_df.sample(n=len(feat_df), random_state=1965)
X = train_test_set['text']
y = train_test_set['annotation_label']

print("Predicting the labels of the test set...")
print("%d documents" % len(X))
print("%d categories" % len(y.value_counts()))

# Candidate values for the (currently disabled) grid search below.
N_FEATURES_OPTIONS = [100000, 200000]
N_COMPONENTS = [120]
n_jobs = 2

param_grid = [
    {
        'skb__k': N_FEATURES_OPTIONS,
        'svd__n_components': N_COMPONENTS
    }
]

# Params learned through GridSearch
# NOTE(review): 150000 is not one of N_FEATURES_OPTIONS above; confirm
# which grid produced this value.
k_features = 150000
n_components = 120

svd = TruncatedSVD(n_components=n_components)
skb = SelectKBest(chi2, k=k_features)
# `stop_words` only takes effect with analyzer='word', so it is not passed
# for character n-grams (it was silently ignored before).
vect = TfidfVectorizer(ngram_range=(3,5), analyzer='char')
# print(get_num_features(vect, X))

clf = LinearSVC()

tfidf_pipeline = Pipeline([
    ('vect', vect),  # tf-idf weighted character n-grams
    ('skb', skb),    # keep the k_features best features by chi2 score
    ('svd', svd),    # reduce to n_components dimensions
    ('clf' , clf),   # feed the output through a classifier
])

# run_gridsearch_cv(tfidf_pipeline, X, y, param_grid, n_jobs)
run_experiment(X, y, tfidf_pipeline, "CrowdFlower: LSA - TfidfVectorizer[character]")

  0%|          | 0/1 [00:00<?, ?it/s]

Predicting the labels of the test set...
14508 documents
2 categories


100%|██████████| 1/1 [00:19<00:00, 19.28s/it]

Classification Report: CrowdFlower: LSA - TfidfVectorizer[character]
               precision    recall  f1-score   support

   hatespeech       0.62      0.25      0.36       517
not_offensive       0.86      0.97      0.91      2385

  avg / total       0.81      0.84      0.81      2902






#### NAACL 2016 dataset

In [17]:
# Shuffle the full NAACL 2016 frame with a fixed seed so the row order is repeatable.
train_test_set = df_naacl.sample(n=len(df_naacl), random_state=1965)
X = train_test_set['text']
y = train_test_set['annotation']

print("Predicting the labels of the test set...")
print("%d documents" % len(X))
print("%d categories" % len(y.value_counts()))

# Candidate values for the (currently disabled) grid search below.
N_FEATURES_OPTIONS = [100000, 200000]
N_COMPONENTS = [120]
n_jobs = 2

# NOTE(review): the 'svd' step is commented out of the pipeline below, so
# 'svd__n_components' must be dropped (or the step restored) before the
# grid search is re-enabled.
param_grid = [
    {
        'skb__k': N_FEATURES_OPTIONS,
        'svd__n_components': N_COMPONENTS
    }
]

# Params learned through GridSearch
# NOTE(review): 150000 is not one of N_FEATURES_OPTIONS above; confirm
# which grid produced this value.
k_features = 150000
n_components = 120

svd = TruncatedSVD(n_components=n_components)
skb = SelectKBest(chi2, k=k_features)
# `stop_words` only takes effect with analyzer='word', so it is not passed
# for character n-grams (it was silently ignored before).
vect = TfidfVectorizer(ngram_range=(3,5), analyzer='char')
# print(get_num_features(vect, X))

clf = LinearSVC()
tfidf_pipeline = Pipeline([
    ('vect', vect),  # tf-idf weighted character n-grams
    ('skb', skb),    # keep the k_features best features by chi2 score
#     ('svd', svd),
    ('clf' , clf),   # feed the output through a classifier
])

# run_gridsearch_cv(tfidf_pipeline, X, y, param_grid, n_jobs)
run_experiment(X, y, tfidf_pipeline, "NAACL2016: TfidfVectorizer[character]")

  0%|          | 0/1 [00:00<?, ?it/s]

Predicting the labels of the test set...
16187 documents
3 categories


100%|██████████| 1/1 [00:06<00:00,  6.91s/it]

Classification Report: NAACL2016: TfidfVectorizer[character]
             precision    recall  f1-score   support

       none       0.87      0.92      0.89      2267
     racism       0.74      0.70      0.72       369
     sexism       0.80      0.67      0.73       602

avg / total       0.84      0.85      0.84      3238






### Dependency tuple experiment

In [18]:
# Shuffle the full CrowdFlower frame with a fixed seed; the feature column
# used here holds a list of strings per row (see the feat_df preview).
train_test_set = feat_df.sample(n=int(len(feat_df)), random_state=1965)
X = train_test_set['feat_word_root_rootparent']
y = train_test_set['annotation_label']

# NOTE(review): `non_negative` is deprecated in later scikit-learn releases
# in favour of `alternate_sign`; the two are not numerically identical, so
# re-validate the scores when upgrading.
hasher = FeatureHasher(input_type='string', non_negative=True)
clf = LinearSVC()

featurer_hasher_pipeline = Pipeline([
    ('hasher', hasher),  # hash each row's list of strings into a sparse vector
    ('clf' , clf),       # feed the output through a classifier
])
run_experiment(X, y, featurer_hasher_pipeline, "CrowdFlower: FeatureHasher")

100%|██████████| 1/1 [00:02<00:00,  2.08s/it]

Classification Report: CrowdFlower: FeatureHasher
               precision    recall  f1-score   support

   hatespeech       0.51      0.22      0.30       454
not_offensive       0.87      0.96      0.91      2448

  avg / total       0.81      0.84      0.82      2902






## Model Feature Combinations

#### Add a feature with the top-k most similar words from the Dependency2vec model

In [24]:
# Word vectors stored in the word2vec text format (binary=False below).
# NOTE(review): the section header describes these as Dependency2vec
# embeddings; the training setup is not visible in this notebook.
hs_candidates_exp6_word = "data/persistence/word_embeddings/dim200vecs_hs_candidates_exp6"
hs_candidates_exp6_model = models.KeyedVectors.load_word2vec_format(hs_candidates_exp6_word, binary=False)

In [92]:
def fetch_top_k_similar(model,row,field_name,k):
    """Collect the ``k`` nearest neighbours of every known word in a row field.

    Args:
        model: Word-vector model supporting ``word in model`` and
            ``similar_by_word(word, topn=..., restrict_vocab=...)``, such as
            a gensim ``KeyedVectors``.
        row: Mapping or DataFrame row; ``row[field_name]`` must be an
            iterable of words.
        field_name: Key of the word list inside ``row``.
        k: Number of neighbours requested per word.

    Returns:
        list of str: neighbour words, grouped in the order of the input
        words. Words missing from the model contribute nothing.
    """
    # `word in model` works on both old and new gensim KeyedVectors, whereas
    # the `.vocab` attribute used previously was removed in gensim 4.
    return [
        match[0]
        for word in row[field_name]
        if word in model
        for match in model.similar_by_word(word, topn=k, restrict_vocab=None)
    ]

In [199]:
# For every row, expand its hs_keyword_matches into the 5 most similar
# embedding words per keyword and store them in a new column.
# Note: both frames are modified in place by these assignments.
df_naacl['similar_hs_keywords'] = df_naacl.apply(lambda row: fetch_top_k_similar(hs_candidates_exp6_model, row, 'hs_keyword_matches', 5), axis=1)
feat_df['similar_hs_keywords'] = feat_df.apply(lambda row: fetch_top_k_similar(hs_candidates_exp6_model, row, 'hs_keyword_matches', 5), axis=1)

#### Setup column extractor

In [19]:
class TextExtractor(BaseEstimator, TransformerMixin):
    """Pipeline step that pulls one DataFrame column out as a string array.

       Adapted from code by @zacstewart
       https://github.com/zacstewart/kaggle_seeclickfix/blob/master/estimator.py
       Also see Zac Stewart's excellent blogpost on pipelines:
       http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
       """

    def __init__(self, column_name):
        self.column_name = column_name

    def fit(self, *_):
        # Stateless: there is nothing to learn from the data.
        return self

    def transform(self, df):
        # Pick the configured column, then coerce every value to str.
        column = df[self.column_name]
        as_array = np.asarray(column)
        return as_array.astype(str)

class TextListExtractor(BaseEstimator, TransformerMixin):
    """Pipeline step that returns one DataFrame column as a plain Python list."""

    def __init__(self, column_name):
        self.column_name = column_name

    def fit(self, *_):
        # Stateless: there is nothing to learn from the data.
        return self

    def transform(self, df):
        selected = df[self.column_name]
        return selected.tolist()

class Apply(BaseEstimator, TransformerMixin):
    """Applies a function f element-wise to the numpy array

    Args:
        fn: Callable applied to each element of the input. It is stored
            unchanged on ``self.fn``.
    """

    def __init__(self, fn):
        # Keep the constructor argument exactly as given. scikit-learn's
        # get_params()/clone(), used by GridSearchCV and cross-validation,
        # reject estimators whose __init__ alters its parameters, which is
        # what wrapping `fn` in np.vectorize here used to do.
        self.fn = fn

    def transform(self, data):
        """Return ``fn`` applied element-wise, shaped ``(n_samples, 1)``."""
        # note: reshaping is necessary because otherwise sklearn
        # interprets 1-d array as a single sample
        column = np.asarray(data)
        column = column.reshape(column.size, 1)
        return np.vectorize(self.fn)(column)

    def fit(self, *_):
        # Stateless: there is nothing to learn from the data.
        return self
    
class BooleanExtractor(BaseEstimator, TransformerMixin):
    """Pipeline step that returns one DataFrame column as an integer array.

    Args:
        column_name: Name of the column to extract in ``transform``.
    """

    def __init__(self, column_name):
        self.column_name = column_name

    def transform(self, df):
        # select the relevant column and return it as a numpy array
        # cast to integers; the builtin `int` replaces the `np.int` alias,
        # which was removed in NumPy 1.24 and meant the same type
        return np.asarray(df[self.column_name]).astype(int)

    def fit(self, *_):
        # Stateless: there is nothing to learn from the data.
        return self

def empty_analyzer(x):
    """Identity analyzer: return the already-tokenised input unchanged.

    Passed as ``analyzer=`` to vectorizers whose documents are lists of
    tokens. A named function (rather than a lambda) has a real ``__name__``
    and can be pickled by reference.
    """
    return x

#### Train model

In [205]:
# Shuffle the full NAACL 2016 frame with a fixed seed so the row order is repeatable.
train_test_set = df_naacl.sample(n=len(df_naacl), random_state=1965)
X = train_test_set[['text', 'feat_dep_unigrams', 'hs_keyword_count', 'similar_hs_keywords']]
y = train_test_set['annotation']
clf = LinearSVC()

# Setup char ngram pipeline
# `stop_words` only takes effect with analyzer='word', so it is not passed
# for character n-grams (it was silently ignored before).
char_vect = TfidfVectorizer(ngram_range=(3,5), analyzer='char')
char_tfidf_pipeline = Pipeline([
    ('text_extractor', TextExtractor('text')),
    ('char_vect', char_vect)
])

# Setup feature tf-idf vectorizer
# The column already holds token lists, so the identity analyzer is used.
dep_context_vect = TfidfVectorizer(analyzer=empty_analyzer, max_df=0.3)
dependency_context_pipeline = Pipeline([
    ('dep_extractor', TextListExtractor('feat_dep_unigrams')), # extract the token lists from df
    ('dep_vect', dep_context_vect)
])

# Numeric keyword-count feature (currently left out of the FeatureUnion below).
hs_keyword_count_pipeline = Pipeline([
    ('count_extractor', BooleanExtractor('hs_keyword_count')),
    ('identity', Apply(lambda x: x))
])

# Setup similar hs_keywords vectorizer
dep2vec_vect = TfidfVectorizer(analyzer=empty_analyzer)
dep2vec_similarity_pipeline = Pipeline([
    ('dep2vec_extractor', TextListExtractor('similar_hs_keywords')),
    ('dep2vec_vect', dep2vec_vect)
])

k_features = 200000
n_components = 120
skb = SelectKBest(chi2, k=k_features)
# Constructed for experimentation; not part of the pipeline below.
svd = TruncatedSVD(n_components=n_components)

pipeline = Pipeline([
    ('all_features', FeatureUnion([
        ('char_tfidf_pipeline', char_tfidf_pipeline),
        ('dependency_context_pipeline', dependency_context_pipeline),
#         ('hs_keyword_count_pipeline', hs_keyword_count_pipeline),
        ('dep2vec_similarity_pipeline', dep2vec_similarity_pipeline)
    ])),
    ('skb', skb),
    ('clf' , clf)
])

run_experiment(X, y, pipeline, "TfidfVectorizer")

# https://github.com/michelleful/SingaporeRoadnameOrigins/blob/master/notebooks/04%20Adding%20features%20with%20Pipelines.ipynb
# https://github.com/amueller/kaggle_insults/blob/e4abac805be1d1e2b3201a978172bafd36cc01e3/features.py
# http://www.markhneedham.com/blog/2015/03/02/python-scikit-learn-training-a-classifier-with-non-numeric-features/

100%|██████████| 1/1 [00:09<00:00,  9.45s/it]

Classification Report: TfidfVectorizer
             precision    recall  f1-score   support

       none       0.86      0.94      0.90      2225
     racism       0.76      0.67      0.71       363
     sexism       0.87      0.64      0.74       650

avg / total       0.85      0.85      0.84      3238






In [105]:
# Preview the first rows of df_naacl, including the added similar_hs_keywords column.
df_naacl.head()

Unnamed: 0,_id,annotation,avg_token_length,brown_cluster_ids,comment_length,feat_dep_bigrams,feat_dep_trigrams,feat_dep_unigrams,feat_dependency_contexts,feat_pos_dep_rootPos,feat_word_dep_root,feat_word_root_rootparent,has_hs_keywords,hs_keyword_count,hs_keyword_matches,text,unknown_words,unknown_words_count,uppercase_token_count,similar_hs_keywords
0,591c2a2065419158a43b8e5a,racism,4.0,"[1726, 31978, 116, 218, 250, 4586, 2218, 4906,...",20,"[so_said_advmod_RB|drasko_said_nsubj_NNP, dras...",[so_said_advmod_RB|drasko_said_nsubj_NNP|just_...,"[so_said_advmod_RB, drasko_said_nsubj_NNP, jus...","[so_said_advmodINV, drasko_said_nsubjINV, just...","[RB_advmod_VBD, NNP_nsubj_VBD, RB_advmod_VBD, ...","[so_advmod_said, drasko_nsubj_said, just_advmo...","[so_said_said, drasko_said_said, just_said_sai...",False,0,[],So Drasko just said he was impressed the girls...,[drasko],1,1,[]
1,591c2a2065419158a43b8e5b,racism,4.0,"[90, 2485, 602, 3877]",10,"[drasko_drasko_ROOT_NNP|they_cook_nsubj_PRP, t...",[drasko_drasko_ROOT_NNP|they_cook_nsubj_PRP|di...,"[drasko_drasko_ROOT_NNP, they_cook_nsubj_PRP, ...","[drasko_cook_ccomp, they_cook_nsubjINV, didn't...","[NNP_ROOT_NNP, PRP_nsubj_VB, MD_aux_VB, VB_cco...","[drasko_ROOT_drasko, they_nsubj_cook, didn't_a...","[drasko_drasko_drasko, they_cook_drasko, didn'...",True,2,"[idiot, bird]",Drasko they didn't cook half a bird you idiot ...,[drasko],1,0,"[moron, bint, buffoon, imbecile, rube, lamb, l..."
2,591c2a2065419158a43b8e5c,racism,4.0,"[1726, 30698, 77, 60, 28]",10,[hopefully_cooks_advmod_RB|someone_cooks_nsubj...,[hopefully_cooks_advmod_RB|someone_cooks_nsubj...,"[hopefully_cooks_advmod_RB, someone_cooks_nsub...","[hopefully_cooks_advmodINV, someone_cooks_nsub...","[RB_advmod_VBZ, NN_nsubj_VBZ, VBZ_ROOT_VBZ, NN...","[hopefully_advmod_cooks, someone_nsubj_cooks, ...","[hopefully_cooks_cooks, someone_cooks_cooks, c...",False,0,[],Hopefully someone cooks Drasko in the next ep ...,"[drasko, ep]",2,1,[]
3,591c2a2065419158a43b8e5d,racism,4.0,"[28, 1009, 602, 506, 1706, 60, 3494, 966, 212,...",16,"[of_born_prep_IN|course_of_pobj_NN, course_of_...",[of_born_prep_IN|course_of_pobj_NN|you_born_ns...,"[of_born_prep_IN, course_of_pobj_NN, you_born_...","[of_course_pobj, of_born_prepINV, course_of_po...","[IN_prep_VBN, NN_pobj_IN, PRP_nsubjpass_VBN, V...","[of_prep_born, course_pobj_of, you_nsubjpass_b...","[of_born_born, course_of_born, you_born_born, ...",False,0,[],of course you were born in serbia ... you're a...,[],0,1,[]
4,591c2a2065419158a43b8e5e,racism,4.0,"[1530, 28, 19, 853, 189, 6442, 8, 1726, 26282,...",21,"[these girls_are_nsubj_NNS|are_are_ROOT_VBP, a...",[these girls_are_nsubj_NNS|are_are_ROOT_VBP|th...,"[these girls_are_nsubj_NNS, are_are_ROOT_VBP, ...","[these girls_are_nsubjINV, are_these girls_nsu...","[NNS_nsubj_VBP, VBP_ROOT_VBP, NN_attr_VBP, IN_...","[these girls_nsubj_are, are_ROOT_are, the equi...","[these girls_are_are, are_are_are, the equival...",False,0,[],These girls are the equivalent of the irritati...,[],0,1,[]
