# Toxic Comment Classification Challenge

![Model](model_notebook.png)

# Libraries

In [19]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
import gc

In [20]:
# for reproducibility
np.random.seed(5)

# Helper Functions

In [26]:
def read_train_data(label):
    '''Return the train data based in the label
    col index:
        0: id
        1: comment_text
        2: toxic
        3: severe_toxic
        4: obscene
        5: threat
        6: insult
        7: identity_hate
    label:
        - toxic
        - severe_toxic
        - obscene
        - threat
        - insult
        - identity_hate
    '''
    if label == 'toxic':
        data = pd.read_csv("../data/train.csv", sep = ",", dtype = {'comment_text': str, 'toxic': int}, usecols = [1, 2])
    elif label == 'severe_toxic':
        data = pd.read_csv("../data/train.csv", sep = ",", dtype = {'comment_text': str, 'severe_toxic': int}, usecols = [1, 3])
    elif label == 'obscene':
        data = pd.read_csv("../data/train.csv", sep = ",", dtype = {'comment_text': str, 'obscene': int}, usecols = [1, 4])
    elif label == 'threat':
        data = pd.read_csv("../data/train.csv", sep = ",", dtype = {'comment_text': str, 'threat': int}, usecols = [1, 5])
    elif label == 'insult':
        data = pd.read_csv("../data/train.csv", sep = ",", dtype = {'comment_text': str, 'insult': int}, usecols = [1, 6])
    elif label == 'identity_hate':
        data = pd.read_csv("../data/train.csv", sep = ",", dtype = {'comment_text': str, 'identity_hate': int}, usecols = [1, 7])
    else:
        print('Not defined!...')
    return data

def read_test_data():
    '''Return the test data
    0: ids
    1: comment_text
    '''
    return pd.read_csv("../data/test.csv", sep = ",", dtype = {'id': str, 'comment_text': str})

def pre_processing(data):
    '''
    Pre process the data
    data: data set to pre process
    '''
    # to lower
    data['comment_text'].fillna('unknow', inplace = True)
    data['comment_text'] = data['comment_text'].str.lower()
    # replace don't = n't = [space] not => do not
    data.comment_text.replace("n't", value = ' not', inplace = True, regex = True)
    # 'll will
    data.comment_text.replace("'ll", value = ' will', inplace = True, regex = True)
    # remove \n, dots, commas
    data.comment_text.replace('[^a-zA-Z]+', value = ' ', inplace = True, regex = True)
    return data
    
def generate_features(data, label, num_features):
    '''This function creates TfidfVectorizer features
    data: data set
    label:
        - toxic
        - severe_toxic
        - obscene
        - threat
        - insult
        - identity_hate
    num_features: Number of features to generate
    '''
    tiff = TfidfVectorizer(max_df = 0.95, min_df = 2, max_features = num_features, stop_words = 'english')
    feature_tiff = tiff.fit_transform(data['comment_text'])
    y = data[label].values
    del data
    gc.collect()
    return feature_tiff, y, tiff


def split_data(data, y, random_seed):
    '''Split the data set
    data: data set
    y: labels
    random_seed: for reproducibility
    '''
    X_train, X_dev, y_train, y_dev = train_test_split(data, y, stratify = y, random_state = random_seed, test_size = .3)
    X_dev, X_test, y_dev, y_test = train_test_split(X_dev, y_dev, stratify = y_dev, random_state = random_seed, test_size = .5)
    
    del data, y # 159571 samples
    gc.collect()
    
    return X_train, X_dev, X_test, y_train, y_dev, y_test

def get_imb_data_info(label):
    '''Show the classes in the data and return the ratio to use in
    catboost hiperparameter scale_pos_weight
    label:
        - toxic
        - severe_toxic
        - obscene
        - threat
        - insult
        - identity_hate
    '''
    data = read_train_data(label)
    count_classes = pd.DataFrame(data[label].value_counts().index, columns = [label])
    count_classes['count'] = list(data[label].value_counts())
    del data
    gc.collect()
    print(count_classes)
    ratio = count_classes['count'][0] / count_classes['count'][1]
    print('Neg examples {} / Pos examples {} = {}'.format(count_classes['count'][0], count_classes['count'][1], ratio))
    return ratio

def individual_predictions(test, model):
    '''Make predictions using a model
    model: Dictionary containing the models and TfidfVectorizer transformer, ex model['insult']
    test: data set used for test
    '''
    test = model['tiff'].transform(test['comment_text']) # transform test
    
    temp = pd.DataFrame(model['BernoulliNB_1'].predict_proba(test)[:, 0], columns = ['BernoulliNB_1_0'])
    temp['BernoulliNB_1_1'] = model['BernoulliNB_1'].predict_proba(test)[:, 1]
    temp['RidgeClassifier'] = model['RidgeClassifier'].predict(test)
    temp['AdaBoostClassifier_0'] = model['AdaBoostClassifier'].predict_proba(test)[:, 0]
    temp['AdaBoostClassifier_1'] = model['AdaBoostClassifier'].predict_proba(test)[:, 1]
    temp['BernoulliNB_2_0'] = model['BernoulliNB_2'].predict_proba(test)[:, 0]
    temp['BernoulliNB_2_1'] = model['BernoulliNB_2'].predict_proba(test)[:, 1]
        
    return model['CatBoostClassifier'].predict_proba(temp)[:, 1]
    
    
def predict(models):
    '''This function predict over the test data and return the submission file
    models: A dictionary containing the models and TfidfVectorizer transformer.
        - models[0]: toxic
        - models[1]: severe_toxic
        - models[2]: obscene
        - models[3]: threat
        - models[4]: insult
        - models[5]: identity_hate
    '''
    print('Loading test data...')
    test = read_test_data()

    print('Pre processing...')
    test = pre_processing(test)
    
    print('Predictions')
    submit = pd.DataFrame(test['id'].values, columns = ['id'])

    submit['toxic'] = individual_predictions(test, models['toxic'])
    submit['severe_toxic'] = individual_predictions(test, models['severe_toxic'])
    submit['obscene'] = individual_predictions(test, models['obscene'])
    submit['threat'] = individual_predictions(test, models['threat'])
    submit['insult'] = individual_predictions(test, models['insult'])
    submit['identity_hate'] = individual_predictions(test, models['identity_hate'])
    
    return submit

In [27]:
def train(label, random_seed, cat_parameters):
    '''Train all the models for a specific class
    label:
        - toxic
        - severe_toxic
        - obscene
        - threat
        - insult
        - identity_hate
    random_seed: For reproducibility
    cat_parameters: catBoost hyperparameters
    '''
    data = read_train_data(label)
    data = pre_processing(data)
    
    feature_tiff, y, tiff = generate_features(data, label, 5000)

    del data
    gc.collect()
    
    X_train, X_dev, X_test, y_train, y_dev, y_test = split_data(feature_tiff, y, random_seed)
    X = [(X_train, y_train), (X_dev, y_dev), (X_test, y_test)]
    
    print('BernoulliNB [1]...')
    berNB_m1 = BernoulliNB(alpha = 1.0)
    berNB_m1.fit(X_train, y_train)
    get_results('BernoulliNB_1', berNB_m1, X)
    
    print('RidgeClassifier...')
    ridgeC = RidgeClassifier(normalize = True, random_state = 7)
    ridgeC.fit(X_train, y_train)
    get_results('RidgeClassifier', ridgeC, X)
    
    print('Ada Boost...')
    ada = AdaBoostClassifier(random_state = 7)
    ada.fit(X_train, y_train)
    get_results('AdaBoostClassifier', ada, X)
    
    print('BernoulliNB [2]...')
    berNB_m2 = BernoulliNB(alpha = 0.5)
    berNB_m2.fit(X_train, y_train)
    get_results('BernoulliNB_2', berNB_m2, X)
       
    print('Creating temp train/dev/test data...')

    temp_train = pd.DataFrame(berNB_m1.predict_proba(X_train)[:, 0], columns = ['BernoulliNB_1_0'])
    temp_train['BernoulliNB_1_1'] = berNB_m1.predict_proba(X_train)[:, 1]
    temp_train['RidgeClassifier'] = ridgeC.predict(X_train)
    temp_train['AdaBoostClassifier_0'] = ada.predict_proba(X_train)[:, 0]
    temp_train['AdaBoostClassifier_1'] = ada.predict_proba(X_train)[:, 1]
    temp_train['BernoulliNB_2_0'] = berNB_m2.predict_proba(X_train)[:, 0]
    temp_train['BernoulliNB_2_1'] = berNB_m2.predict_proba(X_train)[:, 1]
    

    temp_dev = pd.DataFrame(berNB_m1.predict_proba(X_dev)[:, 0], columns = ['BernoulliNB_1_0'])
    temp_dev['BernoulliNB_1_1'] = berNB_m1.predict_proba(X_dev)[:, 1]
    temp_dev['RidgeClassifier'] = ridgeC.predict(X_dev)
    temp_dev['AdaBoostClassifier_0'] = ada.predict_proba(X_dev)[:, 0]
    temp_dev['AdaBoostClassifier_1'] = ada.predict_proba(X_dev)[:, 1]
    temp_dev['BernoulliNB_2_0'] = berNB_m2.predict_proba(X_dev)[:, 0]
    temp_dev['BernoulliNB_2_1'] = berNB_m2.predict_proba(X_dev)[:, 1]
    

    temp_test = pd.DataFrame(berNB_m1.predict_proba(X_test)[:, 0], columns = ['BernoulliNB_1_0'])
    temp_test['BernoulliNB_1_1'] = berNB_m1.predict_proba(X_test)[:, 1]
    temp_test['RidgeClassifier'] = ridgeC.predict(X_test)
    temp_test['AdaBoostClassifier_0'] = ada.predict_proba(X_test)[:, 0]
    temp_test['AdaBoostClassifier_1'] = ada.predict_proba(X_test)[:, 1]
    temp_test['BernoulliNB_2_0'] = berNB_m2.predict_proba(X_test)[:, 0]
    temp_test['BernoulliNB_2_1'] = berNB_m2.predict_proba(X_test)[:, 1]
    
    print('CatBoost...')

    X_temp = [(temp_train, y_train), (temp_dev, y_dev), (temp_test, y_test)]   
        
    cat = CatBoostClassifier(iterations = cat_parameters['iterations'], learning_rate = cat_parameters['learning_rate'], 
                             depth = cat_parameters['depth'], logging_level = 'Verbose', loss_function='Logloss', 
                             scale_pos_weight = cat_parameters['scale_pos_weight'], random_seed = 7)
    
    cat.fit(temp_train, y_train)
    get_results('CatBoostClassifier', cat, X_temp)
    
    del feature_tiff, y, X_train, X_dev, X_test, y_train, y_dev, y_test, temp_train, temp_dev, temp_test
    gc.collect()
    
    results = {'BernoulliNB_1': berNB_m1, 'RidgeClassifier': ridgeC, 
               'AdaBoostClassifier': ada, 'BernoulliNB_2': berNB_m2, 
               'CatBoostClassifier': cat, 'tiff': tiff}
    return results

In [16]:
def get_results(model_name, model, X):
    '''Evaluate the performance of the models using a reference as optimum
    model_name: name of the model
    model: trained model
    X: contains the train, dev and test sets to evaluate the performance
    '''
    optimum_aprox = .98
    train = roc_auc_score(X[0][1], model.predict(X[0][0]))
    dev = roc_auc_score(X[1][1], model.predict(X[1][0]))
    test = roc_auc_score(X[2][1], model.predict(X[2][0]))
    print('Results: ', model_name)
    print('Optimum Bayes: ', optimum_aprox)
    print('Auc Roc - Train: [{}], and the difference with opt_aprox is: {}'. format(train, optimum_aprox - train))
    print('Auc Roc - Dev: [{}], and the difference with opt_aprox is: {}'. format(train, optimum_aprox - dev))
    print('Auc Roc - Test: [{}], and the difference with opt_aprox is: {}'. format(train, optimum_aprox - test))

# Basic EDA

## Loading training dataset

In [28]:
data = read_train_data('toxic')

In [29]:
data.head(5)

Unnamed: 0,comment_text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


## Check null values

In [30]:
data.comment_text.isnull().values.any()

False

## Pre processing

In [31]:
data = pre_processing(data)

In [32]:
data.head(7)

Unnamed: 0,comment_text,toxic
0,explanation why the edits made under my userna...,0
1,d aww he matches this background colour i m se...,0
2,hey man i m really not trying to edit war it s...,0
3,more i ca not make any real suggestions on im...,0
4,you sir are my hero any chance you remember wh...,0
5,congratulations from me as well use the tools...,0
6,cocksucker before you piss around on my work,1


## Loading test dataset

In [33]:
test = pd.read_csv("../data/test.csv", sep = ",")

In [34]:
test.head(3)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."


## Check null values

In [35]:
test.comment_text.isnull().values.any()

False

## Pre processing

In [36]:
test = pre_processing(test)

In [37]:
test.head(3)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule is more succesful then you wi...
1,0000247867823ef7,from rfc the title is fine as it is imo
2,00013b17ad220c46,sources zawe ashton on lapland


# Training
## Train Toxic

In [17]:
ratio = get_imb_data_info('toxic') # get the ratio to use in scale_pos_weight

   toxic   count
0      0  144277
1      1   15294
Neg examples 144277 / Pos examples 15294 = 9.433568719759382


In [None]:
# CatBoost hyperparameters
cat_parameters = {
    'iterations': 500,
    'depth': 7,
    'learning_rate': .7,
    'scale_pos_weight': ratio
}
models = {} # dictionary used for store the models

toxic = train('toxic', random_seed = 1, cat_parameters = cat_parameters)
models['toxic'] = toxic

__BernoulliNB [1]...__

__Results:  BernoulliNB_1__

Optimum Bayes:  0.98

Auc Roc - Train: [0.783624593680512], and the difference with opt_aprox is: 0.19637540631948802

Auc Roc - Dev: [0.783624593680512], and the difference with opt_aprox is: 0.20484157471905318

Auc Roc - Test: [0.783624593680512], and the difference with opt_aprox is: 0.20020999240473902

__RidgeClassifier...__

__Results:  RidgeClassifier__

Optimum Bayes:  0.98

Auc Roc - Train: [0.8998701469043446], and the difference with opt_aprox is: 0.08012985309565535

Auc Roc - Dev: [0.8998701469043446], and the difference with opt_aprox is: 0.08799057775143693

Auc Roc - Test: [0.8998701469043446], and the difference with opt_aprox is: 0.08549847091696727


__Ada Boost...__

__Results:  AdaBoostClassifier__

Optimum Bayes:  0.98

Auc Roc - Train: [0.758534551363211], and the difference with opt_aprox is: 0.22146544863678896

Auc Roc - Dev: [0.758534551363211], and the difference with opt_aprox is: 0.21990485741382293

Auc Roc - Test: [0.758534551363211], and the difference with opt_aprox is: 0.22258360285753265

__BernoulliNB [2]...__

__Results:  BernoulliNB_2__

Optimum Bayes:  0.98

Auc Roc - Train: [0.7824438206250637], and the difference with opt_aprox is: 0.1975561793749363

Auc Roc - Dev: [0.7824438206250637], and the difference with opt_aprox is: 0.20714976215561998

Auc Roc - Test: [0.7824438206250637], and the difference with opt_aprox is: 0.20092315089802049

Creating temp train/dev/test data...

__Results:  CatBoostClassifier__

Optimum Bayes:  0.98

Auc Roc - Train: [0.9035294618775186], and the difference with opt_aprox is: 0.07647053812248139

Auc Roc - Dev: [0.9035294618775186], and the difference with opt_aprox is: 0.08843866349513951

Auc Roc - Test: [0.9035294618775186], and the difference with opt_aprox is: 0.08457861207747175

## Train Severe Toxic

In [19]:
ratio = get_imb_data_info('severe_toxic') # get the ratio to use in scale_pos_weight

   severe_toxic   count
0             0  157976
1             1    1595
Neg examples 157976 / Pos examples 1595 = 99.04451410658307


In [None]:
# CatBoost hyperparameters
cat_parameters = {
    'iterations': 150,
    'depth': 10,
    'learning_rate': 1.7, 
    'scale_pos_weight': ratio
}
severe_toxic = train('severe_toxic', random_seed = 2, cat_parameters = cat_parameters)
models['severe_toxic'] = severe_toxic

__BernoulliNB [1]...__

__Results:  BernoulliNB_1__

Optimum Bayes:  0.98

Auc Roc - Train: [0.9217749301137986], and the difference with opt_aprox is: 0.05822506988620135

Auc Roc - Dev: [0.9217749301137986], and the difference with opt_aprox is: 0.06328550528921906

Auc Roc - Test: [0.9217749301137986], and the difference with opt_aprox is: 0.06386731509011156

__RidgeClassifier...__

__Results:  RidgeClassifier__

Optimum Bayes:  0.98

Auc Roc - Train: [0.9336104423680593], and the difference with opt_aprox is: 0.04638955763194064

Auc Roc - Dev: [0.9336104423680593], and the difference with opt_aprox is: 0.051252532072923684

Auc Roc - Test: [0.9336104423680593], and the difference with opt_aprox is: 0.0747108040969825

__Ada Boost...__

__Results:  AdaBoostClassifier__

Optimum Bayes:  0.98

Auc Roc - Train: [0.6453256683857083], and the difference with opt_aprox is: 0.3346743316142917

Auc Roc - Dev: [0.6453256683857083], and the difference with opt_aprox is: 0.3418304073823992

Auc Roc - Test: [0.6453256683857083], and the difference with opt_aprox is: 0.3603490652472119

__BernoulliNB [2]...__

__Results:  BernoulliNB_2__
Optimum Bayes:  0.98

Auc Roc - Train: [0.8982335338249797], and the difference with opt_aprox is: 0.08176646617502026

Auc Roc - Dev: [0.8982335338249797], and the difference with opt_aprox is: 0.0945397254107585

Auc Roc - Test: [0.8982335338249797], and the difference with opt_aprox is: 0.08929247439297694
Creating temp train/dev/test data...

__CatBoost...__

__Results:  CatBoostClassifier__

Optimum Bayes:  0.98

Auc Roc - Train: [0.9614800355768387], and the difference with opt_aprox is: 0.018519964423161284

Auc Roc - Dev: [0.9614800355768387], and the difference with opt_aprox is: 0.038912896691424614

Auc Roc - Test: [0.9614800355768387], and the difference with opt_aprox is: 0.04633459772726911

## Train Obscene

In [21]:
ratio = get_imb_data_info('obscene') # get the ratio to use in scale_pos_weight

   obscene   count
0        0  151122
1        1    8449
Neg examples 151122 / Pos examples 8449 = 17.886377086045687


In [None]:
# CatBoost hyperparameters
cat_parameters = {
    'iterations': 500,
    'depth': 7,
    'learning_rate': .7,
    'scale_pos_weight': ratio
}
obscene = train('obscene', random_seed = 3, cat_parameters = cat_parameters)
models['obscene'] = obscene

__BernoulliNB [1]...__

__Results:  BernoulliNB_1__

Optimum Bayes:  0.98

Auc Roc - Train: [0.8198153045186166], and the difference with opt_aprox is: 0.16018469548138337

Auc Roc - Dev: [0.8198153045186166], and the difference with opt_aprox is: 0.16932531246538285

Auc Roc - Test: [0.8198153045186166], and the difference with opt_aprox is: 0.16701625949202092

__RidgeClassifier...__

__Results:  RidgeClassifier__

Optimum Bayes:  0.98

Auc Roc - Train: [0.944500391927917], and the difference with opt_aprox is: 0.03549960807208297

Auc Roc - Dev: [0.944500391927917], and the difference with opt_aprox is: 0.04127218903619889

Auc Roc - Test: [0.944500391927917], and the difference with opt_aprox is: 0.036747889908589015

__Ada Boost...__

__Results:  AdaBoostClassifier__

Optimum Bayes:  0.98

Auc Roc - Train: [0.7812237771979266], and the difference with opt_aprox is: 0.1987762228020734

Auc Roc - Dev: [0.7812237771979266], and the difference with opt_aprox is: 0.20161968761533233

Auc Roc - Test: [0.7812237771979266], and the difference with opt_aprox is: 0.2071881884947796

__BernoulliNB [2]...__

__Results:  BernoulliNB_2__

Optimum Bayes:  0.98

Auc Roc - Train: [0.8166738071357879], and the difference with opt_aprox is: 0.16332619286421213

Auc Roc - Dev: [0.8166738071357879], and the difference with opt_aprox is: 0.17464152414860745

Auc Roc - Test: [0.8166738071357879], and the difference with opt_aprox is: 0.1704672134997386

Creating temp train/dev/test data...

__CatBoost...__

__Results:  CatBoostClassifier__

Optimum Bayes:  0.98

Auc Roc - Train: [0.9491419169077011], and the difference with opt_aprox is: 0.03085808309229887

Auc Roc - Dev: [0.9491419169077011], and the difference with opt_aprox is: 0.043009306188520724

Auc Roc - Test: [0.9491419169077011], and the difference with opt_aprox is: 0.038944005357914535

## Train Threat

In [23]:
ratio = get_imb_data_info('threat') # get the ratio to use in scale_pos_weight

   threat   count
0       0  159093
1       1     478
Neg examples 159093 / Pos examples 478 = 332.8305439330544


In [None]:
# CatBoost hyperparameters
cat_parameters = {
    'iterations': 500,
    'depth': 7,
    'learning_rate': .7,
    'scale_pos_weight': ratio
}
threat = train('threat', random_seed = 4, cat_parameters = cat_parameters)
models['threat'] = threat

__BernoulliNB [1]...__

__Results:  BernoulliNB_1__

Optimum Bayes:  0.98

Auc Roc - Train: [0.7184614310366918], and the difference with opt_aprox is: 0.2615385689633082

Auc Roc - Dev: [0.7184614310366918], and the difference with opt_aprox is: 0.2836815100383655

Auc Roc - Test: [0.7184614310366918], and the difference with opt_aprox is: 0.2741168485878607

__RidgeClassifier...__

__Results:  RidgeClassifier__

Optimum Bayes:  0.98

Auc Roc - Train: [0.7818490339867059], and the difference with opt_aprox is: 0.19815096601329407

Auc Roc - Dev: [0.7818490339867059], and the difference with opt_aprox is: 0.22450124781167358

Auc Roc - Test: [0.7818490339867059], and the difference with opt_aprox is: 0.17908965631206042

__Ada Boost...__

__Results:  AdaBoostClassifier__

Optimum Bayes:  0.98

Auc Roc - Train: [0.616076687072164], and the difference with opt_aprox is: 0.363923312927836

Auc Roc - Dev: [0.616076687072164], and the difference with opt_aprox is: 0.3970228517152754

Auc Roc - Test: [0.616076687072164], and the difference with opt_aprox is: 0.3889535326351572

__BernoulliNB [2]...__

__Results:  BernoulliNB_2__

Optimum Bayes:  0.98

Auc Roc - Train: [0.9030588812698119], and the difference with opt_aprox is: 0.07694111873018805

Auc Roc - Dev: [0.9030588812698119], and the difference with opt_aprox is: 0.10252998472827513

Auc Roc - Test: [0.9030588812698119], and the difference with opt_aprox is: 0.14641082615533973

Creating temp train/dev/test data...

__CatBoost...__

__Results:  CatBoostClassifier__

Optimum Bayes:  0.98

Auc Roc - Train: [0.9944820588340937], and the difference with opt_aprox is: -0.014482058834093703

Auc Roc - Dev: [0.9944820588340937], and the difference with opt_aprox is: 0.19384372555592788

Auc Roc - Test: [0.9944820588340937], and the difference with opt_aprox is: 0.17619573717182635


## Train Insult

In [25]:
ratio = get_imb_data_info('insult') # get the ratio to use in scale_pos_weight

   insult   count
0       0  151694
1       1    7877
Neg examples 151694 / Pos examples 7877 = 19.25783927891329


In [None]:
# CatBoost hyperparameters
cat_parameters = {
    'iterations': 500,
    'depth': 7,
    'learning_rate': .7,
    'scale_pos_weight': ratio
}
insult = train('insult', random_seed = 5, cat_parameters = cat_parameters)
models['insult'] = insult

__BernoulliNB [1]...__

__Results:  BernoulliNB_1__

Optimum Bayes:  0.98

Auc Roc - Train: [0.8201858624078954], and the difference with opt_aprox is: 0.1598141375921046

Auc Roc - Dev: [0.8201858624078954], and the difference with opt_aprox is: 0.17470001382394762

Auc Roc - Test: [0.8201858624078954], and the difference with opt_aprox is: 0.1743627323032918

__RidgeClassifier...__

__Results:  RidgeClassifier__

Optimum Bayes:  0.98

Auc Roc - Train: [0.9224391131067932], and the difference with opt_aprox is: 0.05756088689320682

Auc Roc - Dev: [0.9224391131067932], and the difference with opt_aprox is: 0.06476813072133292

Auc Roc - Test: [0.9224391131067932], and the difference with opt_aprox is: 0.06339873527006346

__Ada Boost...__

__Results:  AdaBoostClassifier__

Optimum Bayes:  0.98

Auc Roc - Train: [0.6946106695855874], and the difference with opt_aprox is: 0.2853893304144126

Auc Roc - Dev: [0.6946106695855874], and the difference with opt_aprox is: 0.28666040774877855

Auc Roc - Test: [0.6946106695855874], and the difference with opt_aprox is: 0.2887656293525379

__BernoulliNB [2]...__

__Results:  BernoulliNB_2__

Optimum Bayes:  0.98

Auc Roc - Train: [0.817788825352185], and the difference with opt_aprox is: 0.16221117464781498

Auc Roc - Dev: [0.817788825352185], and the difference with opt_aprox is: 0.1792456738765158

Auc Roc - Test: [0.817788825352185], and the difference with opt_aprox is: 0.1777028415598484

Creating temp train/dev/test data...

__CatBoost...__

__Results:  CatBoostClassifier__

Optimum Bayes:  0.98

Auc Roc - Train: [0.9315620903690014], and the difference with opt_aprox is: 0.04843790963099859

Auc Roc - Dev: [0.9315620903690014], and the difference with opt_aprox is: 0.061975395605845085

Auc Roc - Test: [0.9315620903690014], and the difference with opt_aprox is: 0.0663485522413122

## Train Identity Hate

In [27]:
ratio = get_imb_data_info('identity_hate') # get the ratio to use in scale_pos_weight

   identity_hate   count
0              0  158166
1              1    1405
Neg examples 158166 / Pos examples 1405 = 112.57366548042705


In [None]:
# CatBoost hyperparameters
cat_parameters = {
    'iterations': 500,
    'depth': 7,
    'learning_rate': .7,
    'scale_pos_weight': ratio
}
identity_hate = train('identity_hate', random_seed = 6, cat_parameters = cat_parameters)
models['identity_hate'] = identity_hate

__BernoulliNB [1]...__

__Results:  BernoulliNB_1__

Optimum Bayes:  0.98

Auc Roc - Train: [0.9177071443264864], and the difference with opt_aprox is: 0.062292855673513614

Auc Roc - Dev: [0.9177071443264864], and the difference with opt_aprox is: 0.09336792532923155

Auc Roc - Test: [0.9177071443264864], and the difference with opt_aprox is: 0.08310948416642094

__RidgeClassifier...__

__Results:  RidgeClassifier__

Optimum Bayes:  0.98

Auc Roc - Train: [0.8609598341059914], and the difference with opt_aprox is: 0.11904016589400856

Auc Roc - Dev: [0.8609598341059914], and the difference with opt_aprox is: 0.13326544778989113

Auc Roc - Test: [0.8609598341059914], and the difference with opt_aprox is: 0.14272304596007768

__Ada Boost...__

__Results:  AdaBoostClassifier__

Optimum Bayes:  0.98

Auc Roc - Train: [0.6190787711703019], and the difference with opt_aprox is: 0.3609212288296981

Auc Roc - Dev: [0.6190787711703019], and the difference with opt_aprox is: 0.3529441317625437

Auc Roc - Test: [0.6190787711703019], and the difference with opt_aprox is: 0.35546132371815675

__BernoulliNB [2]...__

__Results:  BernoulliNB_2__

Optimum Bayes:  0.98

Auc Roc - Train: [0.9211667120630912], and the difference with opt_aprox is: 0.058833287936908785

Auc Roc - Dev: [0.9211667120630912], and the difference with opt_aprox is: 0.08835781241416496

Auc Roc - Test: [0.9211667120630912], and the difference with opt_aprox is: 0.09327503633158363

Creating temp train/dev/test data...

__CatBoost...__

__Results:  CatBoostClassifier__

Optimum Bayes:  0.98

Auc Roc - Train: [0.9599980026430752], and the difference with opt_aprox is: 0.02000199735692476

Auc Roc - Dev: [0.9599980026430752], and the difference with opt_aprox is: 0.10260798345976563

Auc Roc - Test: [0.9599980026430752], and the difference with opt_aprox is: 0.11225525497031041

# Predictions

In [29]:
submit = predict(models)

Loading test data...
Pre processing...
Predictions


In [30]:
test = read_test_data()
test.head(5)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [31]:
submit.head(4)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.99666,0.998809,0.9959,0.9888269,0.985003,0.023413
1,0000247867823ef7,0.151661,0.000887,0.032648,1.489659e-07,0.077905,4.1e-05
2,00013b17ad220c46,0.302894,0.000887,0.016752,1.489659e-07,0.178522,0.000407
3,00017563c3f7919a,0.092115,0.00012,0.016664,1.750237e-11,0.017861,4e-06


In [32]:
submit.to_csv('../submits/results_preds_cat.csv', index = False)

Final score on private leaderboard: __0.9566__