# Baseline model on final data (Michael)

## Setup

In [1]:
# import the usual suspects / basics
import time; full_run_time_start = time.time() # start timing exec right away
import pandas as pd
import numpy as np
import pickle
from scipy import sparse
import re
import os

# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, f1_score,\
    accuracy_score, precision_score, recall_score, confusion_matrix

# display all df columns (default is 20)
pd.options.display.max_columns = None

## Utility functions for testing models and tracking results

In [2]:
# Results table: starts empty, one row is added per stored test result.
test_results = pd.DataFrame(columns=[
    # what was run, and on which data
    'model_name', 'model_params', 'data_desc', 'data_size', 'features_no',
    # scores computed on the test split
    'f1', 'acc', 'recall', 'prec', 'roc_auc', 'cf_matrix',
    # bookkeeping
    'train_time', 'notes',
])

def test_model(model, model_name, model_params, data_desc, X, y, notes='',
               test_size=0.2, random_state=42):
    '''
    test_model(model, model_name, model_params, data_desc, X, y, notes='',
               test_size=0.2, random_state=42)

    Fit `model` on a stratified train split of (X, y), evaluate it on the
    held-out test split and return all scores as a dict.

    Parameters:
    -----------
    model: instance of model to test (needs fit, predict and predict_proba)
    model_name: name of model
    model_params: dict of (hyper)parameters passed to model
    data_desc: description of dataset (preprocessing steps etc.)
    X: feature array
    y: target/label array
    notes: additional notes (default: empty string)
    test_size: share of the data held out for testing (default: 0.2)
    random_state: seed for the train/test split (default: 42)

    Returns:
    --------
    dict with the keys 'model_name', 'model_params', 'data_desc',
    'data_size', 'features_no', 'f1', 'acc', 'recall', 'prec', 'roc_auc',
    'cf_matrix', 'train_time' and 'notes'. 'data_size' and 'features_no'
    describe the full X (before splitting); all scores are computed on the
    test split only and rounded to 5 decimals.
    '''

    # Split data: by default 80% for train, 20% for test.
    # Make sure test data has same toxic/nontoxic ratio as train data by
    # using stratify parameter.
    X_train, X_test, y_train, y_test =\
        train_test_split(X, y, test_size=test_size, stratify=y,
                         random_state=random_state)

    # train model and time execution
    train_time_start = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - train_time_start

    # Round to whole seconds BEFORE splitting into minutes and seconds;
    # rounding only the remainder could produce strings like "1m 60s".
    train_secs = round(train_time)
    train_time_str = f'{train_secs // 60}m {train_secs % 60}s'

    # Make predictions on test set
    y_pred = model.predict(X_test)
    # Second probability column is used as the score for ROC AUC
    # (presumably the positive class for 0/1 labels -- confirm if labels change).
    y_pred_proba = model.predict_proba(X_test)[:,1]

    return {'model_name': model_name,
            'model_params': model_params,
            'data_desc': data_desc,
            'data_size': X.shape[0],
            'features_no': X.shape[1],
            'f1': round(f1_score(y_test, y_pred), 5),
            'acc': round(accuracy_score(y_test, y_pred), 5),
            'recall': round(recall_score(y_test, y_pred), 5),
            'prec': round(precision_score(y_test, y_pred), 5),
            'roc_auc': round(roc_auc_score(y_test, y_pred_proba), 5),
            'cf_matrix': confusion_matrix(y_test, y_pred),
            'train_time': train_time_str,
            'notes': notes}

In [3]:
def store_test_result(result):
    '''
    Add `result` as a new last row of the module-level `test_results` df.

    Parameters:
    -----------
    result: row values to store (e.g. the dict returned by test_model)
    '''
    # Number of rows so far is used as the label of the new row.
    next_row = len(test_results)
    test_results.loc[next_row] = result

## Load data (final data file)

In [4]:
# Read the final data file (path is relative to the working directory)
# and show its dimensions as (rows, columns).
data_file = 'data/data_usampl_60_40_FINAL.csv'
df = pd.read_csv(data_file)
df.shape

(360301, 6)

In [5]:
# Report missing values per column, then drop every row that has any.
print("Checking for NaN's ...")
print(df.isna().sum())

rows_before = len(df)
print("\nRows before dropping:", rows_before)

# Drop the incomplete rows and renumber the index from 0 in one chain.
df = df.dropna().reset_index(drop=True)

rows_after = len(df)
print('Rows after:', rows_after)
print('Rows dropped:', rows_before - rows_after)

Checking for NaN's ...
raw                      0
clean                  232
clean_pp               236
clean_pp_lemma         236
clean_pp_lemma_stop    280
toxic                    0
dtype: int64

Rows before dropping: 360301
Rows after: 360021
Rows dropped: 280


In [6]:
# Preview the first rows to compare the text columns side by side.
df.head()

Unnamed: 0,raw,clean,clean_pp,clean_pp_lemma,clean_pp_lemma_stop,toxic
0,"Well, what are the chances he will turn out to...","Well, what are the chances he will turn out to...",well what are the chances he will turn out to ...,well what be the chance he will turn out to ha...,chance turn active proponent slavery,0
1,The moment of critical mass is approaching whe...,The moment of critical mass is approaching whe...,the moment of critical mass is approaching whe...,the moment of critical mass be approach when t...,moment critical mass approach deed gupta co li...,0
2,"""Hey listen to me,"" he said. ""I'm not going to...","""Hey listen to me,"" he said. ""I'm not going to...",hey listen to me he said i 'm not going to put...,hey listen to i he say i be not go to put up w...,hey listen say go crap prove reporter say uh a...,1
3,We are already owed $488 M plus interest($2Bil...,We are already owed $ M plus interest($ Billio...,we are already owed $ m plus interest($ billio...,we be already owe $ m plus interest($ billion ...,owe $ m plus interest($ billion audits state c...,0
4,There is a reason there are no teeth to the la...,There is a reason there are no teeth to the la...,there is a reason there are no teeth to the la...,there be a reason there be no tooth to the law...,reason tooth law unlawful law way force free e...,0


## Optional: Create smaller sample from data to speed up experiments

In [7]:
# Total number of rows to sample; None means: keep the full data.
sample_size = None

# uncomment to create sample of desired size
#sample_size = 50_000

# Identity check against None (PEP 8) instead of `!= None`.
if sample_size is not None:
    # ratio toxic/nontoxic
    tox_perc = 0.4
    nontox_perc = 0.6

    # number of toxic/nontoxic rows
    # round() rather than int(): a float product that lands just below a
    # whole number would be truncated by int() and lose one row.
    sample_size_tox = round(sample_size * tox_perc)
    sample_size_nontox = round(sample_size * nontox_perc)

    # Draw each class separately (fixed seed) to keep the class ratio.
    sample_tox = df[df['toxic'] == 1].sample(sample_size_tox,
                                             random_state=42)
    sample_nontox = df[df['toxic'] == 0].sample(sample_size_nontox,
                                                random_state=42)

    df = pd.concat([sample_tox, sample_nontox])
    print(f'Using sample ({df.shape[0]} rows).')

else:
    print(f'Using full data ({df.shape[0]} rows).')

Using full data (360021 rows).


## Create label/target variable and check for imbalance

In [8]:
# Label column used as prediction target (0 = nontoxic, 1 = toxic).
target = df['toxic']

In [9]:
# Rows per class, looked up by label (0 = nontoxic, 1 = toxic).
value_counts = target.value_counts()
nontoxic_count, toxic_count = value_counts[0], value_counts[1]

# Share of each class in percent, rounded to one decimal.
total_count = nontoxic_count + toxic_count
nontoxic_perc = round(nontoxic_count / total_count * 100, 1)
toxic_perc = round(toxic_count / total_count * 100, 1)

print(f'Nontoxic (0): {nontoxic_count} ({nontoxic_perc} %)')
print(f'Toxic (1): {toxic_count} ({toxic_perc} %)')

Nontoxic (0): 215687 (59.9 %)
Toxic (1): 144334 (40.1 %)


## Function for bag of words

In [10]:
def bow(data):
    '''
    Bag of words: fit a CountVectorizer with default settings on `data`
    and return the transformed result.

    Parameters:
    -----------
    data: text documents to vectorize (e.g. one text column of the df)

    NOTE(review): the vocabulary is built from everything passed in. The
    calls in this notebook pass a full column before test_model splits it,
    so the vocabulary also contains words from the later test split.
    '''
    return CountVectorizer().fit_transform(data)

## Run baseline model (logistic regression) on different data cols

In [11]:
# parameters for model
params = {'max_iter': 2_000}

# load model with parameters
lr = LogisticRegression(**params)

test_result = test_model(lr, 'BASELINE (logistic regression)', params,
                    'bag of words on col "raw"', bow(df['raw']), target)
store_test_result(test_result)

test_result = test_model(lr, 'BASELINE (logistic regression)', params,
                    'bag of words on col "clean"', bow(df['clean']), target)
store_test_result(test_result)

test_result = test_model(lr, 'BASELINE (logistic regression)', params,
                    'bag of words on col "clean_pp"', bow(df['clean_pp']), target)
store_test_result(test_result)

test_result = test_model(lr, 'BASELINE (logistic regression)', params,
                    'bag of words on col "clean_pp_lemma"', bow(df['clean_pp_lemma']), target)
store_test_result(test_result)

test_result = test_model(lr, 'BASELINE (logistic regression)', params,
                    'bag of words on col "clean_pp_lemma_stop"', bow(df['clean_pp_lemma_stop']), target)
store_test_result(test_result)

## Show test results + total exec time

In [12]:
# Display all stored results (one row per tested data column).
test_results

Unnamed: 0,model_name,model_params,data_desc,data_size,features_no,f1,acc,recall,prec,roc_auc,cf_matrix,train_time,notes
0,BASELINE (logistic regression),{'max_iter': 2000},"bag of words on col ""raw""",360021,136663,0.82466,0.86619,0.78491,0.86866,0.92712,"[[39712, 3426], [6209, 22658]]",0m 45s,
1,BASELINE (logistic regression),{'max_iter': 2000},"bag of words on col ""clean""",360021,122584,0.82506,0.86654,0.78501,0.8694,0.92745,"[[39734, 3404], [6206, 22661]]",0m 40s,
2,BASELINE (logistic regression),{'max_iter': 2000},"bag of words on col ""clean_pp""",360021,122252,0.82499,0.86655,0.7846,0.86978,0.92748,"[[39747, 3391], [6218, 22649]]",0m 42s,
3,BASELINE (logistic regression),{'max_iter': 2000},"bag of words on col ""clean_pp_lemma""",360021,108950,0.82336,0.8654,0.78249,0.86874,0.92858,"[[39725, 3413], [6279, 22588]]",0m 46s,
4,BASELINE (logistic regression),{'max_iter': 2000},"bag of words on col ""clean_pp_lemma_stop""",360021,108910,0.82073,0.86359,0.77888,0.86734,0.92744,"[[39699, 3439], [6383, 22484]]",0m 11s,


In [13]:
# Time elapsed since the timer was started in the first cell.
full_run_time = time.time() - full_run_time_start

# Round to whole seconds BEFORE splitting into minutes and seconds;
# rounding only the remainder could print e.g. "1m 60s" instead of "2m 0s".
full_run_secs = round(full_run_time)
print(f'Full run time: {full_run_secs // 60}m {full_run_secs % 60}s')

Full run time: 3m 41s


## Other stuff

### Calculate average comment length on cleaned data (before preprocessing)

In [14]:
# characters
comm_len_chars = df['clean'].apply(lambda s: len(s))
avg_comm_len_chars = comm_len_chars.sum() / len(comm_len_chars)

# words (rough count)
comm_len_words = df['clean']\
    .apply(lambda s: len(re.findall(r'\S+', s)))
avg_comm_len_words = comm_len_words.sum() / len(comm_len_words)

print('Average comment length:')
print(round(avg_comm_len_chars), 'characters')
print(round(avg_comm_len_words), 'words')

Average comment length:
286 characters
50 words
