# Toxic Comment Classification Challenge - Ensemble v1

![Model v1](images/model_v1.png)

# Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
import gc

In [2]:
np.random.seed(5)

# Helper Functions

In [3]:
def read_train_data(label):
    '''Reads the train data
    0: id
    1: comment_text
    2: toxic
    3: severe_toxic
    4: obscene
    5: threat
    6: insult
    7: identity_hate
    '''
    if label == 'toxic':
        data = pd.read_csv("../data/train.csv", sep = ",", dtype = {'comment_text': str, 'toxic': int}, usecols = [1, 2])
    elif label == 'severe_toxic':
        data = pd.read_csv("../data/train.csv", sep = ",", dtype = {'comment_text': str, 'severe_toxic': int}, usecols = [1, 3])
    elif label == 'obscene':
        data = pd.read_csv("../data/train.csv", sep = ",", dtype = {'comment_text': str, 'obscene': int}, usecols = [1, 4])
    elif label == 'threat':
        data = pd.read_csv("../data/train.csv", sep = ",", dtype = {'comment_text': str, 'threat': int}, usecols = [1, 5])
    elif label == 'insult':
        data = pd.read_csv("../data/train.csv", sep = ",", dtype = {'comment_text': str, 'insult': int}, usecols = [1, 6])
    elif label == 'identity_hate':
        data = pd.read_csv("../data/train.csv", sep = ",", dtype = {'comment_text': str, 'identity_hate': int}, usecols = [1, 7])
    else:
        print('Not defined!...')
    return data

def read_test_data():
    '''Reads test data
    0: ids
    1: comment_text
    '''
    return pd.read_csv("../data/test.csv", sep = ",", dtype = {'id': str, 'comment_text': str})

def pre_processing(data):
    # to lower
    data['comment_text'].fillna('unknow', inplace = True)
    data['comment_text'] = data['comment_text'].str.lower()
    # replace don't = n't = [space] not => do not
    data.comment_text.replace("n't", value = ' not', inplace = True, regex = True)
    # 'll will
    data.comment_text.replace("'ll", value = ' will', inplace = True, regex = True)
    # remove \n, dots, commas
    data.comment_text.replace('[^a-zA-Z]+', value = ' ', inplace = True, regex = True)
    return data
    
def generate_features(data, label, num_features):
    '''Tis function create a tiff feature'''
    tiff = TfidfVectorizer(max_df = 0.95, min_df = 2, max_features = num_features, stop_words = 'english')
    feature_tiff = tiff.fit_transform(data['comment_text'])
    y = data[label].values
    del data
    gc.collect()
    return feature_tiff, y, tiff


def split_data(data, y, random_seed):
    '''Split the data set'''
    X_train, X_dev, y_train, y_dev = train_test_split(data, y, stratify = y, random_state = random_seed, test_size = .3)
    X_dev, X_test, y_dev, y_test = train_test_split(X_dev, y_dev, stratify = y_dev, random_state = random_seed, test_size = .5)
    
    del data, y # 159571 samples
    gc.collect()
    
    return X_train, X_dev, X_test, y_train, y_dev, y_test

def get_imb_data_info(label):
    '''Show the classes in the data and return the ratio to: scale_pos_weight'''
    data = read_train_data(label)
    count_classes = pd.DataFrame(data[label].value_counts().index, columns = [label])
    count_classes['count'] = list(data[label].value_counts())
    del data
    gc.collect()
    print(count_classes)
    ratio = count_classes['count'][0] / count_classes['count'][1]
    print('Neg examples {} / Pos examples {} = {}'.format(count_classes['count'][0], count_classes['count'][1], ratio))
    return ratio

def individual_predictions(test, model):
    '''Predict one model
    model['insult']
    '''
    test = model['tiff'].transform(test['comment_text']) # transform test
    
    temp = pd.DataFrame(model['BernoulliNB_1'].predict_proba(test)[:, 0], columns = ['BernoulliNB_1_0'])
    temp['BernoulliNB_1_1'] = model['BernoulliNB_1'].predict_proba(test)[:, 1]
    temp['RidgeClassifier'] = model['RidgeClassifier'].predict(test)
    temp['AdaBoostClassifier_0'] = model['AdaBoostClassifier'].predict_proba(test)[:, 0]
    temp['AdaBoostClassifier_1'] = model['AdaBoostClassifier'].predict_proba(test)[:, 1]
    temp['BernoulliNB_2_0'] = model['BernoulliNB_2'].predict_proba(test)[:, 0]
    temp['BernoulliNB_2_1'] = model['BernoulliNB_2'].predict_proba(test)[:, 1]
        
    return model['CatBoostClassifier'].predict_proba(temp)[:, 1]
    
    
def predict(models):
    '''This function predict over the test data
    models[0]: toxic
    models[1]: severe_toxic
    models[2]: obscene
    models[3]: threat
    models[4]: insult
    models[5]: identity_hate
    '''
    print('Loading test data...')
    test = read_test_data()

    print('Pre processing...')
    test = pre_processing(test)
    
    print('Predictions')
    submit = pd.DataFrame(test['id'].values, columns = ['id'])
    # [:, 1]: class 1
    submit['toxic'] = individual_predictions(test, models['toxic'])
    submit['severe_toxic'] = individual_predictions(test, models['severe_toxic'])
    submit['obscene'] = individual_predictions(test, models['obscene'])
    submit['threat'] = individual_predictions(test, models['threat'])
    submit['insult'] = individual_predictions(test, models['insult'])
    submit['identity_hate'] = individual_predictions(test, models['identity_hate'])
    
    return submit

In [None]:
def train(label, random_seed, cat_parameters):#, exit = False):
    '''Train a BernoulliNB Classifier'''
    data = read_train_data(label)
    data = pre_processing(data)
    
    feature_tiff, y, tiff = generate_features(data, label, 5000)
    ## add feature engineer
    #feature_engineer_features = get_engineer_features(data)
    # Split
    #X_feature, dev_feature, test_feture, _, _, _ = split_data(feature_engineer_features, y, random_seed)
    del data#, feature_engineer_features
    gc.collect()
    
    X_train, X_dev, X_test, y_train, y_dev, y_test = split_data(feature_tiff, y, random_seed)
    X = [(X_train, y_train), (X_dev, y_dev), (X_test, y_test)]
    
    #print('shape X_train: [{}], and y_train: [{}]'.format(X_train.shape, y_train.shape))
    #print('shape X_feature: [{}]'.format(X_feature.shape))
    
    print('BernoulliNB [1]...')
    berNB_m1 = BernoulliNB()
    berNB_m1.fit(X_train, y_train)
    get_results('BernoulliNB_1', berNB_m1, X)
    
    print('RidgeClassifier...')
    ridgeC = RidgeClassifier(normalize = True, random_state = 7)
    ridgeC.fit(X_train, y_train)
    get_results('RidgeClassifier', ridgeC, X)
    
    print('Ada Boost...')
    ada = AdaBoostClassifier(random_state = 7)
    ada.fit(X_train, y_train)
    get_results('AdaBoostClassifier', ada, X)
    
    print('BernoulliNB [2]...')
    berNB_m2 = BernoulliNB()
    berNB_m2.fit(X_train, y_train)
    get_results('BernoulliNB_2', berNB_m2, X)
    

    
    print('Creating temp train/dev/test data...')

    temp_train = pd.DataFrame(berNB_m1.predict_proba(X_train)[:, 0], columns = ['BernoulliNB_1_0'])
    temp_train['BernoulliNB_1_1'] = berNB_m1.predict_proba(X_train)[:, 1]
    temp_train['RidgeClassifier'] = ridgeC.predict(X_train)
    temp_train['AdaBoostClassifier_0'] = ada.predict_proba(X_train)[:, 0]
    temp_train['AdaBoostClassifier_1'] = ada.predict_proba(X_train)[:, 1]
    temp_train['BernoulliNB_2_0'] = berNB_m2.predict_proba(X_train)[:, 0]
    temp_train['BernoulliNB_2_1'] = berNB_m2.predict_proba(X_train)[:, 1]
    

    temp_dev = pd.DataFrame(berNB_m1.predict_proba(X_dev)[:, 0], columns = ['BernoulliNB_1_0'])
    temp_dev['BernoulliNB_1_1'] = berNB_m1.predict_proba(X_dev)[:, 1]
    temp_dev['RidgeClassifier'] = ridgeC.predict(X_dev)
    temp_dev['AdaBoostClassifier_0'] = ada.predict_proba(X_dev)[:, 0]
    temp_dev['AdaBoostClassifier_1'] = ada.predict_proba(X_dev)[:, 1]
    temp_dev['BernoulliNB_2_0'] = berNB_m2.predict_proba(X_dev)[:, 0]
    temp_dev['BernoulliNB_2_1'] = berNB_m2.predict_proba(X_dev)[:, 1]
    

    temp_test = pd.DataFrame(berNB_m1.predict_proba(X_test)[:, 0], columns = ['BernoulliNB_1_0'])
    temp_test['BernoulliNB_1_1'] = berNB_m1.predict_proba(X_test)[:, 1]
    temp_test['RidgeClassifier'] = ridgeC.predict(X_test)
    temp_test['AdaBoostClassifier_0'] = ada.predict_proba(X_test)[:, 0]
    temp_test['AdaBoostClassifier_1'] = ada.predict_proba(X_test)[:, 1]
    temp_test['BernoulliNB_2_0'] = berNB_m2.predict_proba(X_test)[:, 0]
    temp_test['BernoulliNB_2_1'] = berNB_m2.predict_proba(X_test)[:, 1]
    
    print('CatBoost...')

    X_temp = [(temp_train, y_train), (temp_dev, y_dev), (temp_test, y_test)]   
        
    cat = CatBoostClassifier(iterations = cat_parameters['iterations'], learning_rate = cat_parameters['learning_rate'], 
                             depth = cat_parameters['depth'], logging_level = 'Verbose', loss_function='Logloss', 
                             scale_pos_weight = cat_parameters['scale_pos_weight'], random_seed = 7)
    
    cat.fit(temp_train, y_train)
    get_results('CatBoostClassifier', cat, X_temp)
    
    del feature_tiff, y, X_train, X_dev, X_test, y_train, y_dev, y_test, temp_train, temp_dev, temp_test
    gc.collect()
    
    results = {'BernoulliNB_1': berNB_m1, 'RidgeClassifier': ridgeC, 
               'AdaBoostClassifier': ada, 'BernoulliNB_2': berNB_m2, 
               'CatBoostClassifier': cat, 'tiff': tiff}
    return results

In [None]:
def get_results(model_name, model, X):
    optimum_aprox = .98
    train = roc_auc_score(X[0][1], model.predict(X[0][0]))
    dev = roc_auc_score(X[1][1], model.predict(X[1][0]))
    test = roc_auc_score(X[2][1], model.predict(X[2][0]))
    print('Results: ', model_name)
    print('Optimum Bayes: ', optimum_bayes)
    print('Auc Roc - Train: [{}], and the difference with opt_aprox is: {}'. format(train, optimum_bayes - train))
    print('Auc Roc - Dev: [{}], and the difference with opt_aprox is: {}'. format(train, optimum_bayes - dev))
    print('Auc Roc - Test: [{}], and the difference with opt_aprox is: {}'. format(train, optimum_bayes - test))

# Basic EDA

In [112]:
data = read_train_data('toxic')

In [5]:
data.head(5)

Unnamed: 0,comment_text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


In [6]:
data = pre_processing(data)

In [8]:
data.head(7)

Unnamed: 0,comment_text,toxic
0,explanation why the edits made under my userna...,0
1,d aww he matches this background colour i m se...,0
2,hey man i m really not trying to edit war it s...,0
3,more i can t make any real suggestions on imp...,0
4,you sir are my hero any chance you remember wh...,0
5,congratulations from me as well use the tools...,0
6,cocksucker before you piss around on my work,1


In [9]:
data.comment_text.isnull().values.any()

False

In [None]:
# Idea: train a single classifer to recognize each category, this make it a binary classification problem
# toxic_classifier

In [25]:
test = pd.read_csv("../data/test.csv", sep = ",")

In [26]:
test.head(3)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."


In [None]:
#add an ensamble for each label, for example

|     Label    | Predictors |
|--------------|------------|
|    toxic     |      3     |
| severe_toxic |      3     |
|    obscene   |      3     |
|    threat    |      3     |
|    insult    |      3     |
| identity_hate|      3     |

In order to low the training time I can use predictors with low hyperparameters; also I think that I could use only one for __all the classifiers TfidfVectorizer__, also I can join all the predictions into one single multiclassifer...

# Train Toxic

In [5]:
ratio = get_imb_data_info('toxic')

   toxic   count
0      0  144277
1      1   15294
Neg examples 144277 / Pos examples 15294 = 9.433568719759382


In [6]:
# With Cat
cat_parameters = {
    'iterations': 500,
    'depth': 7,
    'learning_rate': .7,
    'scale_pos_weight': ratio
}
models = {}
#X_temp, X_features = train('toxic', random_seed = 1, cat_parameters = cat_parameters, exit = True)
#models['toxic'] = toxic

toxic = train('toxic', random_seed = 1, cat_parameters = cat_parameters)
models['toxic'] = toxic

BernoulliNB [1]...
Results:  BernoulliNB_1
Optimum Bayes:  0.98
Auc Roc - Train: [0.783624593680512], and the difference with opt_bayes is: 0.19637540631948802
Auc Roc - Dev: [0.783624593680512], and the difference with opt_bayes is: 0.20484157471905318
Auc Roc - Test: [0.783624593680512], and the difference with opt_bayes is: 0.20020999240473902
RidgeClassifier...
Results:  RidgeClassifier
Optimum Bayes:  0.98
Auc Roc - Train: [0.8998701469043446], and the difference with opt_bayes is: 0.08012985309565535
Auc Roc - Dev: [0.8998701469043446], and the difference with opt_bayes is: 0.08799057775143693
Auc Roc - Test: [0.8998701469043446], and the difference with opt_bayes is: 0.08549847091696727
Ada Boost...
Results:  AdaBoostClassifier
Optimum Bayes:  0.98
Auc Roc - Train: [0.758534551363211], and the difference with opt_bayes is: 0.22146544863678896
Auc Roc - Dev: [0.758534551363211], and the difference with opt_bayes is: 0.21990485741382293
Auc Roc - Test: [0.758534551363211], and the

138:	learn: 0.2557761	total: 5.6s	remaining: 14.5s
139:	learn: 0.2557689	total: 5.64s	remaining: 14.5s
140:	learn: 0.2557639	total: 5.67s	remaining: 14.4s
141:	learn: 0.2557490	total: 5.72s	remaining: 14.4s
142:	learn: 0.2556610	total: 5.75s	remaining: 14.4s
143:	learn: 0.2556592	total: 5.79s	remaining: 14.3s
144:	learn: 0.2554386	total: 5.83s	remaining: 14.3s
145:	learn: 0.2554364	total: 5.87s	remaining: 14.2s
146:	learn: 0.2554291	total: 5.91s	remaining: 14.2s
147:	learn: 0.2554231	total: 5.95s	remaining: 14.2s
148:	learn: 0.2554173	total: 5.99s	remaining: 14.1s
149:	learn: 0.2554081	total: 6.03s	remaining: 14.1s
150:	learn: 0.2554038	total: 6.08s	remaining: 14s
151:	learn: 0.2553991	total: 6.12s	remaining: 14s
152:	learn: 0.2553793	total: 6.15s	remaining: 14s
153:	learn: 0.2553644	total: 6.19s	remaining: 13.9s
154:	learn: 0.2553073	total: 6.23s	remaining: 13.9s
155:	learn: 0.2553009	total: 6.27s	remaining: 13.8s
156:	learn: 0.2552969	total: 6.31s	remaining: 13.8s
157:	learn: 0.25529

300:	learn: 0.2536247	total: 12s	remaining: 7.92s
301:	learn: 0.2536202	total: 12s	remaining: 7.88s
302:	learn: 0.2536129	total: 12.1s	remaining: 7.83s
303:	learn: 0.2536099	total: 12.1s	remaining: 7.79s
304:	learn: 0.2536025	total: 12.1s	remaining: 7.75s
305:	learn: 0.2535923	total: 12.2s	remaining: 7.71s
306:	learn: 0.2535865	total: 12.2s	remaining: 7.68s
307:	learn: 0.2535855	total: 12.3s	remaining: 7.64s
308:	learn: 0.2535831	total: 12.3s	remaining: 7.6s
309:	learn: 0.2535669	total: 12.3s	remaining: 7.55s
310:	learn: 0.2535511	total: 12.4s	remaining: 7.51s
311:	learn: 0.2535483	total: 12.4s	remaining: 7.47s
312:	learn: 0.2535440	total: 12.4s	remaining: 7.44s
313:	learn: 0.2535377	total: 12.5s	remaining: 7.4s
314:	learn: 0.2535288	total: 12.5s	remaining: 7.36s
315:	learn: 0.2535246	total: 12.6s	remaining: 7.32s
316:	learn: 0.2535229	total: 12.6s	remaining: 7.28s
317:	learn: 0.2535220	total: 12.6s	remaining: 7.23s
318:	learn: 0.2535179	total: 12.7s	remaining: 7.2s
319:	learn: 0.25351

460:	learn: 0.2523965	total: 18.3s	remaining: 1.55s
461:	learn: 0.2523951	total: 18.4s	remaining: 1.51s
462:	learn: 0.2523927	total: 18.4s	remaining: 1.47s
463:	learn: 0.2523923	total: 18.4s	remaining: 1.43s
464:	learn: 0.2523906	total: 18.5s	remaining: 1.39s
465:	learn: 0.2523831	total: 18.5s	remaining: 1.35s
466:	learn: 0.2523821	total: 18.6s	remaining: 1.31s
467:	learn: 0.2523778	total: 18.6s	remaining: 1.27s
468:	learn: 0.2523758	total: 18.6s	remaining: 1.23s
469:	learn: 0.2523735	total: 18.7s	remaining: 1.19s
470:	learn: 0.2523705	total: 18.7s	remaining: 1.15s
471:	learn: 0.2523684	total: 18.8s	remaining: 1.11s
472:	learn: 0.2523656	total: 18.8s	remaining: 1.07s
473:	learn: 0.2523637	total: 18.8s	remaining: 1.03s
474:	learn: 0.2523580	total: 18.9s	remaining: 993ms
475:	learn: 0.2523561	total: 18.9s	remaining: 954ms
476:	learn: 0.2523539	total: 18.9s	remaining: 914ms
477:	learn: 0.2523530	total: 19s	remaining: 874ms
478:	learn: 0.2523524	total: 19s	remaining: 834ms
479:	learn: 0.25

# Train Severe Toxic

In [7]:
# https://tech.yandex.com/catboost/doc/dg/concepts/python-reference_parameters-list-docpage/#python-reference_parameters-list
ratio = get_imb_data_info('severe_toxic')

   severe_toxic   count
0             0  157976
1             1    1595
Neg examples 157976 / Pos examples 1595 = 99.04451410658307


In [8]:
# Cat
cat_parameters = {
    'iterations': 150,
    'depth': 10,
    'learning_rate': 1.7, 
    'scale_pos_weight': ratio
}
severe_toxic = train('severe_toxic', random_seed = 2, cat_parameters = cat_parameters)
models['severe_toxic'] = severe_toxic

BernoulliNB [1]...
Results:  BernoulliNB_1
Optimum Bayes:  0.98
Auc Roc - Train: [0.9217749301137986], and the difference with opt_bayes is: 0.05822506988620135
Auc Roc - Dev: [0.9217749301137986], and the difference with opt_bayes is: 0.06328550528921906
Auc Roc - Test: [0.9217749301137986], and the difference with opt_bayes is: 0.06386731509011156
RidgeClassifier...
Results:  RidgeClassifier
Optimum Bayes:  0.98
Auc Roc - Train: [0.9336104423680593], and the difference with opt_bayes is: 0.04638955763194064
Auc Roc - Dev: [0.9336104423680593], and the difference with opt_bayes is: 0.051252532072923684
Auc Roc - Test: [0.9336104423680593], and the difference with opt_bayes is: 0.0747108040969825
Ada Boost...
Results:  AdaBoostClassifier
Optimum Bayes:  0.98
Auc Roc - Train: [0.6453256683857083], and the difference with opt_bayes is: 0.3346743316142917
Auc Roc - Dev: [0.6453256683857083], and the difference with opt_bayes is: 0.3418304073823992
Auc Roc - Test: [0.6453256683857083], and

# Train Obscene

In [9]:
ratio = get_imb_data_info('obscene')

   obscene   count
0        0  151122
1        1    8449
Neg examples 151122 / Pos examples 8449 = 17.886377086045687


In [10]:
# Cat
cat_parameters = {
    'iterations': 500,
    'depth': 7,
    'learning_rate': .7,
    'scale_pos_weight': ratio
}
obscene = train('obscene', random_seed = 3, cat_parameters = cat_parameters)
models['obscene'] = obscene

BernoulliNB [1]...
Results:  BernoulliNB_1
Optimum Bayes:  0.98
Auc Roc - Train: [0.8198153045186166], and the difference with opt_bayes is: 0.16018469548138337
Auc Roc - Dev: [0.8198153045186166], and the difference with opt_bayes is: 0.16932531246538285
Auc Roc - Test: [0.8198153045186166], and the difference with opt_bayes is: 0.16701625949202092
RidgeClassifier...
Results:  RidgeClassifier
Optimum Bayes:  0.98
Auc Roc - Train: [0.944500391927917], and the difference with opt_bayes is: 0.03549960807208297
Auc Roc - Dev: [0.944500391927917], and the difference with opt_bayes is: 0.04127218903619889
Auc Roc - Test: [0.944500391927917], and the difference with opt_bayes is: 0.036747889908589015
Ada Boost...
Results:  AdaBoostClassifier
Optimum Bayes:  0.98
Auc Roc - Train: [0.7812237771979266], and the difference with opt_bayes is: 0.1987762228020734
Auc Roc - Dev: [0.7812237771979266], and the difference with opt_bayes is: 0.20161968761533233
Auc Roc - Test: [0.7812237771979266], and 

137:	learn: 0.1561565	total: 6.16s	remaining: 16.2s
138:	learn: 0.1561543	total: 6.21s	remaining: 16.1s
139:	learn: 0.1561502	total: 6.28s	remaining: 16.1s
140:	learn: 0.1561473	total: 6.33s	remaining: 16.1s
141:	learn: 0.1561461	total: 6.38s	remaining: 16.1s
142:	learn: 0.1560225	total: 6.42s	remaining: 16s
143:	learn: 0.1560198	total: 6.48s	remaining: 16s
144:	learn: 0.1560177	total: 6.51s	remaining: 15.9s
145:	learn: 0.1560114	total: 6.56s	remaining: 15.9s
146:	learn: 0.1560061	total: 6.61s	remaining: 15.9s
147:	learn: 0.1560022	total: 6.65s	remaining: 15.8s
148:	learn: 0.1559957	total: 6.7s	remaining: 15.8s
149:	learn: 0.1559901	total: 6.73s	remaining: 15.7s
150:	learn: 0.1559882	total: 6.77s	remaining: 15.7s
151:	learn: 0.1559869	total: 6.81s	remaining: 15.6s
152:	learn: 0.1559779	total: 6.85s	remaining: 15.5s
153:	learn: 0.1559753	total: 6.89s	remaining: 15.5s
154:	learn: 0.1559734	total: 6.93s	remaining: 15.4s
155:	learn: 0.1559720	total: 6.97s	remaining: 15.4s
156:	learn: 0.155

296:	learn: 0.1550156	total: 13.5s	remaining: 9.2s
297:	learn: 0.1550116	total: 13.5s	remaining: 9.15s
298:	learn: 0.1550107	total: 13.6s	remaining: 9.12s
299:	learn: 0.1550104	total: 13.6s	remaining: 9.08s
300:	learn: 0.1550094	total: 13.7s	remaining: 9.05s
301:	learn: 0.1550090	total: 13.7s	remaining: 9.01s
302:	learn: 0.1550084	total: 13.8s	remaining: 8.97s
303:	learn: 0.1550076	total: 13.8s	remaining: 8.93s
304:	learn: 0.1550073	total: 13.9s	remaining: 8.88s
305:	learn: 0.1550059	total: 14s	remaining: 8.85s
306:	learn: 0.1550055	total: 14s	remaining: 8.82s
307:	learn: 0.1550030	total: 14.1s	remaining: 8.8s
308:	learn: 0.1549968	total: 14.2s	remaining: 8.76s
309:	learn: 0.1549940	total: 14.2s	remaining: 8.73s
310:	learn: 0.1549933	total: 14.3s	remaining: 8.69s
311:	learn: 0.1549920	total: 14.4s	remaining: 8.65s
312:	learn: 0.1549804	total: 14.4s	remaining: 8.62s
313:	learn: 0.1549757	total: 14.5s	remaining: 8.6s
314:	learn: 0.1549753	total: 14.6s	remaining: 8.56s
315:	learn: 0.15497

457:	learn: 0.1540885	total: 22.4s	remaining: 2.05s
458:	learn: 0.1540881	total: 22.4s	remaining: 2s
459:	learn: 0.1540770	total: 22.5s	remaining: 1.96s
460:	learn: 0.1540752	total: 22.5s	remaining: 1.91s
461:	learn: 0.1540743	total: 22.6s	remaining: 1.86s
462:	learn: 0.1540735	total: 22.6s	remaining: 1.81s
463:	learn: 0.1540728	total: 22.7s	remaining: 1.76s
464:	learn: 0.1540727	total: 22.8s	remaining: 1.71s
465:	learn: 0.1540719	total: 22.8s	remaining: 1.66s
466:	learn: 0.1540710	total: 22.8s	remaining: 1.61s
467:	learn: 0.1540708	total: 22.9s	remaining: 1.57s
468:	learn: 0.1540706	total: 23s	remaining: 1.52s
469:	learn: 0.1540705	total: 23s	remaining: 1.47s
470:	learn: 0.1540680	total: 23.1s	remaining: 1.42s
471:	learn: 0.1540663	total: 23.1s	remaining: 1.37s
472:	learn: 0.1540654	total: 23.2s	remaining: 1.32s
473:	learn: 0.1540652	total: 23.3s	remaining: 1.28s
474:	learn: 0.1540649	total: 23.4s	remaining: 1.23s
475:	learn: 0.1540645	total: 23.5s	remaining: 1.18s
476:	learn: 0.15406

# Train Threat

In [11]:
ratio = get_imb_data_info('threat')

   threat   count
0       0  159093
1       1     478
Neg examples 159093 / Pos examples 478 = 332.8305439330544


In [12]:
# Cat
cat_parameters = {
    'iterations': 500,
    'depth': 7,
    'learning_rate': .7,
    'scale_pos_weight': ratio
}
threat = train('threat', random_seed = 4, cat_parameters = cat_parameters)
models['threat'] = threat

BernoulliNB [1]...
Results:  BernoulliNB_1
Optimum Bayes:  0.98
Auc Roc - Train: [0.7184614310366918], and the difference with opt_bayes is: 0.2615385689633082
Auc Roc - Dev: [0.7184614310366918], and the difference with opt_bayes is: 0.2836815100383655
Auc Roc - Test: [0.7184614310366918], and the difference with opt_bayes is: 0.2741168485878607
RidgeClassifier...
Results:  RidgeClassifier
Optimum Bayes:  0.98
Auc Roc - Train: [0.7818490339867059], and the difference with opt_bayes is: 0.19815096601329407
Auc Roc - Dev: [0.7818490339867059], and the difference with opt_bayes is: 0.22450124781167358
Auc Roc - Test: [0.7818490339867059], and the difference with opt_bayes is: 0.17908965631206042
Ada Boost...
Results:  AdaBoostClassifier
Optimum Bayes:  0.98
Auc Roc - Train: [0.616076687072164], and the difference with opt_bayes is: 0.363923312927836
Auc Roc - Dev: [0.616076687072164], and the difference with opt_bayes is: 0.3970228517152754
Auc Roc - Test: [0.616076687072164], and the di

134:	learn: 0.0571302	total: 8.33s	remaining: 22.5s
135:	learn: 0.0571277	total: 8.37s	remaining: 22.4s
136:	learn: 0.0571266	total: 8.42s	remaining: 22.3s
137:	learn: 0.0571264	total: 8.46s	remaining: 22.2s
138:	learn: 0.0571228	total: 8.51s	remaining: 22.1s
139:	learn: 0.0570371	total: 8.55s	remaining: 22s
140:	learn: 0.0570095	total: 8.6s	remaining: 21.9s
141:	learn: 0.0569681	total: 8.64s	remaining: 21.8s
142:	learn: 0.0569585	total: 8.69s	remaining: 21.7s
143:	learn: 0.0569533	total: 8.73s	remaining: 21.6s
144:	learn: 0.0569445	total: 8.78s	remaining: 21.5s
145:	learn: 0.0569338	total: 8.82s	remaining: 21.4s
146:	learn: 0.0569269	total: 8.87s	remaining: 21.3s
147:	learn: 0.0568734	total: 8.91s	remaining: 21.2s
148:	learn: 0.0568632	total: 8.95s	remaining: 21.1s
149:	learn: 0.0568325	total: 8.99s	remaining: 21s
150:	learn: 0.0568295	total: 9.04s	remaining: 20.9s
151:	learn: 0.0568256	total: 9.08s	remaining: 20.8s
152:	learn: 0.0568193	total: 9.13s	remaining: 20.7s
153:	learn: 0.056

295:	learn: 0.0539785	total: 17.1s	remaining: 11.8s
296:	learn: 0.0539772	total: 17.2s	remaining: 11.7s
297:	learn: 0.0539735	total: 17.2s	remaining: 11.7s
298:	learn: 0.0539702	total: 17.3s	remaining: 11.6s
299:	learn: 0.0539676	total: 17.3s	remaining: 11.5s
300:	learn: 0.0539598	total: 17.4s	remaining: 11.5s
301:	learn: 0.0539443	total: 17.4s	remaining: 11.4s
302:	learn: 0.0539432	total: 17.5s	remaining: 11.3s
303:	learn: 0.0539430	total: 17.5s	remaining: 11.3s
304:	learn: 0.0539417	total: 17.5s	remaining: 11.2s
305:	learn: 0.0539370	total: 17.6s	remaining: 11.2s
306:	learn: 0.0539364	total: 17.7s	remaining: 11.1s
307:	learn: 0.0539301	total: 17.7s	remaining: 11s
308:	learn: 0.0539279	total: 17.7s	remaining: 11s
309:	learn: 0.0539277	total: 17.8s	remaining: 10.9s
310:	learn: 0.0539163	total: 17.8s	remaining: 10.8s
311:	learn: 0.0539163	total: 17.9s	remaining: 10.8s
312:	learn: 0.0539157	total: 17.9s	remaining: 10.7s
313:	learn: 0.0539148	total: 18s	remaining: 10.6s
314:	learn: 0.0539

455:	learn: 0.0508365	total: 25.2s	remaining: 2.44s
456:	learn: 0.0508351	total: 25.3s	remaining: 2.38s
457:	learn: 0.0508289	total: 25.4s	remaining: 2.33s
458:	learn: 0.0508214	total: 25.4s	remaining: 2.27s
459:	learn: 0.0508199	total: 25.5s	remaining: 2.22s
460:	learn: 0.0508185	total: 25.6s	remaining: 2.16s
461:	learn: 0.0508028	total: 25.7s	remaining: 2.11s
462:	learn: 0.0507860	total: 25.7s	remaining: 2.05s
463:	learn: 0.0507859	total: 25.8s	remaining: 2s
464:	learn: 0.0507858	total: 25.8s	remaining: 1.94s
465:	learn: 0.0507858	total: 25.8s	remaining: 1.89s
466:	learn: 0.0507858	total: 25.9s	remaining: 1.83s
467:	learn: 0.0507858	total: 25.9s	remaining: 1.77s
468:	learn: 0.0507857	total: 26s	remaining: 1.72s
469:	learn: 0.0507857	total: 26s	remaining: 1.66s
470:	learn: 0.0507857	total: 26.1s	remaining: 1.6s
471:	learn: 0.0507727	total: 26.1s	remaining: 1.55s
472:	learn: 0.0507723	total: 26.2s	remaining: 1.49s
473:	learn: 0.0507722	total: 26.2s	remaining: 1.44s
474:	learn: 0.050771

# Train Insult

In [13]:
ratio = get_imb_data_info('insult')

   insult   count
0       0  151694
1       1    7877
Neg examples 151694 / Pos examples 7877 = 19.25783927891329


In [14]:
# Cat
cat_parameters = {
    'iterations': 500,
    'depth': 7,
    'learning_rate': .7,
    'scale_pos_weight': ratio
}
insult = train('insult', random_seed = 5, cat_parameters = cat_parameters)
models['insult'] = insult

BernoulliNB [1]...
Results:  BernoulliNB_1
Optimum Bayes:  0.98
Auc Roc - Train: [0.8201858624078954], and the difference with opt_bayes is: 0.1598141375921046
Auc Roc - Dev: [0.8201858624078954], and the difference with opt_bayes is: 0.17470001382394762
Auc Roc - Test: [0.8201858624078954], and the difference with opt_bayes is: 0.1743627323032918
RidgeClassifier...
Results:  RidgeClassifier
Optimum Bayes:  0.98
Auc Roc - Train: [0.9224391131067932], and the difference with opt_bayes is: 0.05756088689320682
Auc Roc - Dev: [0.9224391131067932], and the difference with opt_bayes is: 0.06476813072133292
Auc Roc - Test: [0.9224391131067932], and the difference with opt_bayes is: 0.06339873527006346
Ada Boost...
Results:  AdaBoostClassifier
Optimum Bayes:  0.98
Auc Roc - Train: [0.6946106695855874], and the difference with opt_bayes is: 0.2853893304144126
Auc Roc - Dev: [0.6946106695855874], and the difference with opt_bayes is: 0.28666040774877855
Auc Roc - Test: [0.6946106695855874], and 

133:	learn: 0.2033570	total: 5.6s	remaining: 15.3s
134:	learn: 0.2033521	total: 5.64s	remaining: 15.3s
135:	learn: 0.2033367	total: 5.69s	remaining: 15.2s
136:	learn: 0.2033166	total: 5.73s	remaining: 15.2s
137:	learn: 0.2033065	total: 5.78s	remaining: 15.2s
138:	learn: 0.2032964	total: 5.83s	remaining: 15.1s
139:	learn: 0.2032790	total: 5.87s	remaining: 15.1s
140:	learn: 0.2032772	total: 5.92s	remaining: 15.1s
141:	learn: 0.2032438	total: 5.96s	remaining: 15s
142:	learn: 0.2032366	total: 6s	remaining: 15s
143:	learn: 0.2032307	total: 6.05s	remaining: 15s
144:	learn: 0.2032249	total: 6.09s	remaining: 14.9s
145:	learn: 0.2032162	total: 6.14s	remaining: 14.9s
146:	learn: 0.2032131	total: 6.18s	remaining: 14.8s
147:	learn: 0.2032094	total: 6.22s	remaining: 14.8s
148:	learn: 0.2032057	total: 6.28s	remaining: 14.8s
149:	learn: 0.2031767	total: 6.32s	remaining: 14.7s
150:	learn: 0.2031704	total: 6.36s	remaining: 14.7s
151:	learn: 0.2031553	total: 6.41s	remaining: 14.7s
152:	learn: 0.2031518	

296:	learn: 0.2007143	total: 12.5s	remaining: 8.56s
297:	learn: 0.2007122	total: 12.6s	remaining: 8.52s
298:	learn: 0.2007092	total: 12.6s	remaining: 8.48s
299:	learn: 0.2006700	total: 12.7s	remaining: 8.44s
300:	learn: 0.2006561	total: 12.7s	remaining: 8.4s
301:	learn: 0.2006495	total: 12.8s	remaining: 8.36s
302:	learn: 0.2006379	total: 12.8s	remaining: 8.32s
303:	learn: 0.2006294	total: 12.8s	remaining: 8.28s
304:	learn: 0.2006237	total: 12.9s	remaining: 8.24s
305:	learn: 0.2006190	total: 12.9s	remaining: 8.2s
306:	learn: 0.2006128	total: 13s	remaining: 8.16s
307:	learn: 0.2006075	total: 13s	remaining: 8.12s
308:	learn: 0.2005735	total: 13.1s	remaining: 8.08s
309:	learn: 0.2005626	total: 13.1s	remaining: 8.04s
310:	learn: 0.2005519	total: 13.2s	remaining: 8s
311:	learn: 0.2005297	total: 13.2s	remaining: 7.96s
312:	learn: 0.2005049	total: 13.3s	remaining: 7.92s
313:	learn: 0.2004837	total: 13.3s	remaining: 7.88s
314:	learn: 0.2004805	total: 13.3s	remaining: 7.84s
315:	learn: 0.2004497

458:	learn: 0.1981539	total: 19.4s	remaining: 1.74s
459:	learn: 0.1981488	total: 19.5s	remaining: 1.69s
460:	learn: 0.1981423	total: 19.5s	remaining: 1.65s
461:	learn: 0.1981366	total: 19.6s	remaining: 1.61s
462:	learn: 0.1981302	total: 19.6s	remaining: 1.57s
463:	learn: 0.1981269	total: 19.7s	remaining: 1.52s
464:	learn: 0.1981224	total: 19.7s	remaining: 1.48s
465:	learn: 0.1981205	total: 19.8s	remaining: 1.44s
466:	learn: 0.1981027	total: 19.8s	remaining: 1.4s
467:	learn: 0.1981015	total: 19.8s	remaining: 1.36s
468:	learn: 0.1980622	total: 19.9s	remaining: 1.31s
469:	learn: 0.1980584	total: 19.9s	remaining: 1.27s
470:	learn: 0.1980553	total: 20s	remaining: 1.23s
471:	learn: 0.1977742	total: 20s	remaining: 1.19s
472:	learn: 0.1977423	total: 20s	remaining: 1.14s
473:	learn: 0.1977244	total: 20.1s	remaining: 1.1s
474:	learn: 0.1977070	total: 20.1s	remaining: 1.06s
475:	learn: 0.1977036	total: 20.2s	remaining: 1.02s
476:	learn: 0.1976986	total: 20.2s	remaining: 974ms
477:	learn: 0.197690

# Train Identity Hate

In [15]:
ratio = get_imb_data_info('identity_hate')

   identity_hate   count
0              0  158166
1              1    1405
Neg examples 158166 / Pos examples 1405 = 112.57366548042705


In [16]:
# Cat
cat_parameters = {
    'iterations': 500,
    'depth': 7,
    'learning_rate': .7,
    'scale_pos_weight': ratio
}
identity_hate = train('identity_hate', random_seed = 6, cat_parameters = cat_parameters)
models['identity_hate'] = identity_hate

BernoulliNB [1]...
Results:  BernoulliNB_1
Optimum Bayes:  0.98
Auc Roc - Train: [0.9177071443264864], and the difference with opt_bayes is: 0.062292855673513614
Auc Roc - Dev: [0.9177071443264864], and the difference with opt_bayes is: 0.09336792532923155
Auc Roc - Test: [0.9177071443264864], and the difference with opt_bayes is: 0.08310948416642094
RidgeClassifier...
Results:  RidgeClassifier
Optimum Bayes:  0.98
Auc Roc - Train: [0.8609598341059914], and the difference with opt_bayes is: 0.11904016589400856
Auc Roc - Dev: [0.8609598341059914], and the difference with opt_bayes is: 0.13326544778989113
Auc Roc - Test: [0.8609598341059914], and the difference with opt_bayes is: 0.14272304596007768
Ada Boost...
Results:  AdaBoostClassifier
Optimum Bayes:  0.98
Auc Roc - Train: [0.6190787711703019], and the difference with opt_bayes is: 0.3609212288296981
Auc Roc - Dev: [0.6190787711703019], and the difference with opt_bayes is: 0.3529441317625437
Auc Roc - Test: [0.6190787711703019], an

136:	learn: 0.1527176	total: 6.1s	remaining: 16.2s
137:	learn: 0.1527043	total: 6.14s	remaining: 16.1s
138:	learn: 0.1526997	total: 6.19s	remaining: 16.1s
139:	learn: 0.1526939	total: 6.23s	remaining: 16s
140:	learn: 0.1526868	total: 6.28s	remaining: 16s
141:	learn: 0.1526553	total: 6.33s	remaining: 15.9s
142:	learn: 0.1526502	total: 6.37s	remaining: 15.9s
143:	learn: 0.1525815	total: 6.41s	remaining: 15.9s
144:	learn: 0.1525553	total: 6.46s	remaining: 15.8s
145:	learn: 0.1524906	total: 6.5s	remaining: 15.8s
146:	learn: 0.1524836	total: 6.55s	remaining: 15.7s
147:	learn: 0.1524699	total: 6.59s	remaining: 15.7s
148:	learn: 0.1524614	total: 6.64s	remaining: 15.6s
149:	learn: 0.1524362	total: 6.68s	remaining: 15.6s
150:	learn: 0.1524242	total: 6.72s	remaining: 15.5s
151:	learn: 0.1524214	total: 6.78s	remaining: 15.5s
152:	learn: 0.1524194	total: 6.83s	remaining: 15.5s
153:	learn: 0.1524115	total: 6.89s	remaining: 15.5s
154:	learn: 0.1524085	total: 6.95s	remaining: 15.5s
155:	learn: 0.1524

296:	learn: 0.1373463	total: 13.3s	remaining: 9.08s
297:	learn: 0.1373453	total: 13.3s	remaining: 9.04s
298:	learn: 0.1373446	total: 13.4s	remaining: 8.99s
299:	learn: 0.1373442	total: 13.4s	remaining: 8.94s
300:	learn: 0.1373262	total: 13.5s	remaining: 8.9s
301:	learn: 0.1373143	total: 13.5s	remaining: 8.86s
302:	learn: 0.1373107	total: 13.6s	remaining: 8.81s
303:	learn: 0.1373062	total: 13.6s	remaining: 8.77s
304:	learn: 0.1373044	total: 13.6s	remaining: 8.72s
305:	learn: 0.1373032	total: 13.7s	remaining: 8.68s
306:	learn: 0.1373027	total: 13.7s	remaining: 8.63s
307:	learn: 0.1373018	total: 13.8s	remaining: 8.59s
308:	learn: 0.1373017	total: 13.8s	remaining: 8.54s
309:	learn: 0.1372955	total: 13.9s	remaining: 8.49s
310:	learn: 0.1372942	total: 13.9s	remaining: 8.44s
311:	learn: 0.1372938	total: 13.9s	remaining: 8.39s
312:	learn: 0.1372898	total: 14s	remaining: 8.35s
313:	learn: 0.1372510	total: 14s	remaining: 8.3s
314:	learn: 0.1372471	total: 14s	remaining: 8.25s
315:	learn: 0.137244

457:	learn: 0.1340321	total: 20.7s	remaining: 1.9s
458:	learn: 0.1340315	total: 20.7s	remaining: 1.85s
459:	learn: 0.1340311	total: 20.8s	remaining: 1.81s
460:	learn: 0.1340301	total: 20.8s	remaining: 1.76s
461:	learn: 0.1340292	total: 20.9s	remaining: 1.72s
462:	learn: 0.1340287	total: 20.9s	remaining: 1.67s
463:	learn: 0.1340269	total: 21s	remaining: 1.63s
464:	learn: 0.1340261	total: 21s	remaining: 1.58s
465:	learn: 0.1340235	total: 21.1s	remaining: 1.54s
466:	learn: 0.1340233	total: 21.1s	remaining: 1.49s
467:	learn: 0.1340209	total: 21.2s	remaining: 1.45s
468:	learn: 0.1340191	total: 21.2s	remaining: 1.4s
469:	learn: 0.1339540	total: 21.2s	remaining: 1.36s
470:	learn: 0.1339540	total: 21.3s	remaining: 1.31s
471:	learn: 0.1339540	total: 21.3s	remaining: 1.26s
472:	learn: 0.1339538	total: 21.4s	remaining: 1.22s
473:	learn: 0.1339355	total: 21.4s	remaining: 1.18s
474:	learn: 0.1339335	total: 21.5s	remaining: 1.13s
475:	learn: 0.1339334	total: 21.5s	remaining: 1.08s
476:	learn: 0.1339

In [17]:
submit = predict(models)

Loading test data...
Pre processing...
Predictions


In [16]:
test = read_test_data()
test.head(5) # replace don't = n't = [space] not => do not ok :)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [155]:
submit.head(4)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.241211,0.999968,0.235349,0.002367,0.268094,0.516325
1,0000247867823ef7,0.008119,0.5,0.068863,0.001137,0.160699,0.233261
2,00013b17ad220c46,0.035466,0.5,0.166278,0.001137,0.669551,0.187077
3,00017563c3f7919a,0.008119,0.5,0.065649,0.001516,0.190665,0.217391


In [18]:
print(test['comment_text'][0])

Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,


In [18]:
submit.to_csv('../submits/results_preds_cat.csv', index = False) # 0.7543
# 0.9557

The data set is very imbalanced, also until now there is no application of cross validation in the models, possible improving ideas are:

*  Add more iterations to CatBoost [OK].
*  Add more models
*  Add models with ability to deal with imbalanced data
*  Apply cross validation to the first models
*  Tune CatBoost with cross val
*  Add more features to the CatBoost [len of word, and total number of words], it will be more easy, to added to catboost, because they are in a dataframe [OK] == This decreases the performance!!!
*  Add other types of data like bag o words, and merge them with tiff
*  Add the AUC metric to Cat training and validation
*  Add a custom stopwords to tiff

In [91]:
# https://stackoverflow.com/questions/16858652/how-to-find-the-corresponding-class-in-clf-predict-proba
# models[4][0].classes_

array([0, 1])

In [117]:
test = read_test_data()
test.head(5)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [115]:
submit.head(5)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.996423,0.631188,0.9987,0.039391,0.996115,0.803285
1,0000247867823ef7,0.02101,3.4e-05,0.000229,0.000541,0.000438,0.000215
2,00013b17ad220c46,0.02101,3.4e-05,0.006245,0.000541,0.011883,0.000215
3,00017563c3f7919a,0.001293,3.4e-05,0.000229,0.000541,0.000438,0.000215
4,00017695ad8997eb,0.02101,3.4e-05,0.006245,0.000541,0.011883,0.000215


In [116]:
submit.to_csv('../submits/results_preds_1v.csv', index = False)

In [None]:
# LB: 0.9343
# LB: 0.9570 [adding more iterations to CatBoost]