# XGBoost experiments (Michael)

## Setup

In [6]:
# import the usual suspects / basics
import time; full_run_time_start = time.time() # start timing exec right away
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from scipy import sparse
import re

# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, f1_score,\
    accuracy_score, precision_score, recall_score, confusion_matrix

# XGBoost
from xgboost import XGBClassifier

# currently not used and thus commented out
# import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# display all df columns (default is 20)
pd.options.display.max_columns = None

## Utility function for testing models and tracking results

In [None]:
# Results table: starts empty, one row is added per evaluated model.
# The column names mirror the keys of the dict returned by test_model().
result_columns = [
    'model_name',
    'model_params',
    'data_desc',
    'data_size',
    'features_no',
    'f1',
    'acc',
    'recall',
    'prec',
    'roc_auc',
    'cf_matrix',
    'train_time',
    'notes',
]
test_results = pd.DataFrame(columns=result_columns)

def test_model(model, model_name, model_params, data_desc, X, y, notes=''):
    '''
    test_model(model, model_params, data_desc, X, y, notes='')
    
    Parameters:
    -----------
    model: instance of model to test
    model_name: name of model
    model_params: dict of (hyper)parameters passed to model
    data_desc: description of dataset (preprocessing steps etc.)
    X: feature array 
    y: target/label array
    notes: additional notes (default: empty string)
    '''

    # Split data using default of 75% for train, 25% for test.
    # Make sure test data has same toxic/nontoxic ratio as train data by
    # using stratify parameter.
    X_train, X_test, y_train, y_test =\
        train_test_split(X, y, stratify=y, random_state=42)
    
    # train model and time execution
    train_time_start = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - train_time_start
    train_time_str = f'{int(train_time // 60)}m {round(train_time % 60)}s'

    # Make predictions on test set
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:,1]

    return {'model_name': model_name,
            'model_params': model_params,
            'data_desc': data_desc,
            'data_size': X.shape[0],
            'features_no': X.shape[1],
            'f1': round(f1_score(y_test, y_pred), 3),
            'acc': round(accuracy_score(y_test, y_pred), 3),
            'recall': round(recall_score(y_test, y_pred), 3),
            'prec': round(precision_score(y_test, y_pred), 3),
            'roc_auc': round(roc_auc_score(y_test, y_pred_proba), 3),
            'cf_matrix': confusion_matrix(y_test, y_pred),
            'train_time': train_time_str,
            'notes': notes}

In [None]:
def store_test_result(result):
    '''
    Add `result` as a new row to the global `test_results` table.

    The row label is the current number of rows in the table.
    '''
    next_row = len(test_results)
    test_results.loc[next_row] = result

## Load data

In [19]:
# Read the dataset from CSV into the notebook-wide frame `df`.
# (The file name suggests 60/40 undersampled data with fastText vectors --
# provenance is not visible in this notebook; confirm with the data prep step.)
df = pd.read_csv('data/undersampled_data_60_40_ft.csv')
# show (rows, columns) as the cell output
df.shape

(360301, 7)

## Optional: Create smaller sample from data to speed up experiments

In [None]:
# Number of rows to sample from `df`; None means "use the full data".
sample_size = None

# uncomment to create sample of desired size
#sample_size = 25_000

if sample_size is not None:
    # ratio toxic/nontoxic
    tox_perc = 0.4
    nontox_perc = 0.6

    # Number of toxic/nontoxic rows. The nontoxic count is the remainder
    # (i.e. the nontox_perc share) so that both parts always add up to exactly
    # sample_size, even when sample_size is not a multiple of 5.
    sample_size_tox = round(sample_size * tox_perc)
    sample_size_nontox = sample_size - sample_size_tox

    # draw each class separately (fixed seed for reproducibility)
    sample_tox = df[df['toxic'] == 1].sample(sample_size_tox,
                                             random_state=42)
    sample_nontox = df[df['toxic'] == 0].sample(sample_size_nontox,
                                                random_state=42)

    # NOTE: this replaces `df`; re-running the cell samples from the sample,
    # so reload the data first when changing sample_size.
    df = pd.concat([sample_tox, sample_nontox])
    print(f'Using sample ({df.shape[0]} rows).')

else:
    print(f'Using full data ({df.shape[0]} rows).')

## Drop rows with NaNs

In [23]:
# Remove every row that contains at least one NaN and report how many rows
# were lost.
rows_before = df.shape[0]
# This is the total row count before dropping, not the number of rows that
# contain NaNs -- label it accordingly.
print('rows before dropping NaNs:', rows_before)
# Reassign instead of mutating in place, and renumber the index 0..n-1 so
# that it lines up with positionally built feature matrices.
df = df.dropna().reset_index(drop=True)
print('rows after:', df.shape[0])
print('rows dropped:', rows_before - df.shape[0])

rows with NaN's before dropping: 360273
rows after: 360273
rows dropped: 0


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360273 entries, 0 to 360272
Data columns (total 7 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   Unnamed: 0                360273 non-null  int64 
 1   comment_text              360273 non-null  object
 2   toxic                     360273 non-null  int64 
 3   stopwords_punct_lemma     360273 non-null  object
 4   toxic_label_ft            360273 non-null  object
 5   toxic_label_comment_text  360273 non-null  object
 6   vector_fast_text          360273 non-null  object
dtypes: int64(2), object(5)
memory usage: 19.2+ MB


In [25]:
df

Unnamed: 0.1,Unnamed: 0,comment_text,toxic,stopwords_punct_lemma,toxic_label_ft,toxic_label_comment_text,vector_fast_text
0,0,"Well, what are the chances he will turn out to...",0,chance turn active proponent slavery,__label__0,__label__0 chance turn active proponent slavery,[-5.77833019e-02 4.58838157e-02 -4.87854704e-...
1,1,The moment of critical mass is approaching whe...,0,moment critical mass approach deed gupta co li...,__label__0,__label__0 moment critical mass approach deed ...,[-3.85174714e-02 2.94841994e-02 -3.53648514e-...
2,2,"""Hey listen to me,"" he said. ""I'm not going to...",1,hey listen say go crap prove reporter say u...,__label__1,__label__1 hey listen say go crap prove report...,[ 0.08621803 -0.06944817 0.08360571 0.003052...
3,3,We are already owed $488 M plus interest($2Bil...,0,owe 488 m plus interest 2billion 2006 audits s...,__label__0,__label__0 owe 488 m plus interest 2billion 20...,[-0.02172438 0.01810819 -0.02264511 -0.000863...
4,4,There is a reason there are no teeth to the la...,0,reason tooth law unlawful law way force free e...,__label__0,__label__0 reason tooth law unlawful law way f...,[-0.04083619 0.03226621 -0.03952266 -0.002016...
...,...,...,...,...,...,...,...
360268,360830,Do you still beat your wife? Simple question.,0,beat wife simple question,__label__0,__label__0 beat wife simple question,[-0.11675262 0.09984541 -0.11092692 0.011191...
360269,360831,The fascist dictator continues the insanity ag...,1,fascist dictator continue insanity human civil...,__label__1,__label__1 fascist dictator continue insanity ...,[ 0.03221355 -0.02136803 0.02484508 -0.001221...
360270,360832,Sean Hannity is a lightweight foolish commenta...,0,sean hannity lightweight foolish commentator f...,__label__0,__label__0 sean hannity lightweight foolish co...,[-0.02342204 0.01447476 -0.01676093 -0.005626...
360271,360833,There are a number of countries which make it ...,0,number country impossible national citizenship...,__label__0,__label__0 number country impossible national ...,[-0.01418185 0.01380997 -0.00814179 -0.004513...


## Create label/target variable and check for imbalance

In [None]:
# Label column used as y in all experiments below (compared against 0/1 in
# the sampling cell: 1 = toxic, 0 = nontoxic).
target = df['toxic']

In [None]:
# Absolute number of rows per class (label 0 = nontoxic, label 1 = toxic).
value_counts = target.value_counts()
nontoxic_count, toxic_count = value_counts[0], value_counts[1]

# Share of each class in percent, rounded to one decimal place.
total_count = nontoxic_count + toxic_count
nontoxic_perc = round(nontoxic_count / total_count * 100, 1)
toxic_perc = round(toxic_count / total_count * 100, 1)

print(f'Nontoxic (0): {nontoxic_count} ({nontoxic_perc} %)')
print(f'Toxic (1): {toxic_count} ({toxic_perc} %)')

## Create various corpora

### Raw corpus

In [None]:
# Raw corpus: the unprocessed comment texts, one entry per row of df.
corp_raw = df['comment_text']
corp_raw.shape

### Pre-processed corpus

In [None]:
# Pre-processed corpus: taken from the 'stopwords_punct_lemma' column
# (per the section headings: lemmatized, stopwords and punctuation removed).
corp_pp = df['stopwords_punct_lemma']
corp_pp.shape

### Corpus of fastText vectors

In [None]:
# fastText feature matrix `corp_ft` (one row per comment, one column per
# vector component).
#
# If a smaller sample is used: parse the vector strings stored in the csv
# ("[v1 v2 ...]") into a frame and cast all columns to float. This takes
# ~50 min for the full 360,000 rows.
# --> If the full data is used: load the pre-computed pickle to save time.

if sample_size is not None:
    corp_ft = (df['vector_fast_text']
               .str.strip('[]')
               .str.split(expand=True)
               .astype('float'))
    # with open('pickle/ft_vectors.pkl', mode='wb') as f:
    #     pickle.dump(corp_ft, f)

else:
    # NOTE(security): pickle.load can execute arbitrary code -- only load
    # this file if it was created by a trusted run of this project.
    with open('pickle/ft_vectors.pkl', mode='rb') as f:
        corp_ft = pickle.load(f)

# The matrix is paired with `target` by position later on; fail early with a
# clear message if a stale pickle does not match the current data.
assert len(corp_ft) == len(df), (
    f'fastText matrix has {len(corp_ft)} rows but df has {len(df)} rows')

display(corp_ft)

### Bag of words (default)

In [None]:
# Bag of words on the raw comments with CountVectorizer's default settings.
vect_bow = CountVectorizer()
corp_bow = vect_bow.fit_transform(corp_raw)
# show the resulting matrix summary as the cell output
corp_bow

### Bag of words (binary)

In [None]:
# Bag of words on the raw comments with binary=True (per the scikit-learn
# docs all non-zero counts are set to 1, i.e. presence/absence of a token).
vect_bow_bin = CountVectorizer(binary=True)
corp_bow_bin = vect_bow_bin.fit_transform(corp_raw)
corp_bow_bin

### Bag of words (mixed case)

In [None]:
# Bag of words on the raw comments with lowercase=False, so differently
# cased spellings of a word stay separate tokens.
vect_bow_mixc = CountVectorizer(lowercase=False)
corp_bow_mixc = vect_bow_mixc.fit_transform(corp_raw)
corp_bow_mixc

### Bag of words (default) on preprocessed comments (lemmatization, stopword and punctuation removal)

In [None]:
# Bag of words (default settings) on the pre-processed comments.
# Uses its own vectorizer name: rebinding `vect_bow` here would overwrite the
# vectorizer fitted for `corp_bow`, whose vocabulary would then no longer
# match that matrix.
vect_pp_bow = CountVectorizer()
corp_pp_bow = vect_pp_bow.fit_transform(corp_pp)
corp_pp_bow

### Bag of 1/2-grams (default) on preprocessed comments

In [None]:
# Counts of 1-grams and 2-grams (ngram_range=(1,2)) on the pre-processed
# comments.
vect_bo12grams = CountVectorizer(ngram_range=(1,2))
corp_pp_bo12grams = vect_bo12grams.fit_transform(corp_pp)
corp_pp_bo12grams

### Bag of 1/2/3-grams (default) on preprocessed comments

In [None]:
# Counts of 1-, 2- and 3-grams (ngram_range=(1,3)) on the pre-processed
# comments. The experiment cell that used this matrix is commented out below.
vect_bo123grams = CountVectorizer(ngram_range=(1,3))
corp_pp_bo123grams = vect_bo123grams.fit_transform(corp_pp)
corp_pp_bo123grams

### Bag of 2-grams (default) on preprocessed comments

In [None]:
# Counts of 2-grams only (ngram_range=(2,2)) on the pre-processed comments.
vect_bo2grams = CountVectorizer(ngram_range=(2,2))
corp_pp_bo2grams = vect_bo2grams.fit_transform(corp_pp)
corp_pp_bo2grams

### Tf_idf

In [None]:
# TF-IDF features on the raw comments with TfidfVectorizer's default settings.
vect_tfidf = TfidfVectorizer()
corp_tfidf = vect_tfidf.fit_transform(corp_raw)
corp_tfidf

### Tf_idf on preprocessed comments (lemmatization, stopword and punctuation removal)

In [None]:
# TF-IDF features (default settings) on the pre-processed comments.
# Uses its own vectorizer name: rebinding `vect_tfidf` here would overwrite
# the vectorizer fitted for `corp_tfidf`, whose vocabulary would then no
# longer match that matrix.
vect_pp_tfidf = TfidfVectorizer()
corp_pp_tfidf = vect_pp_tfidf.fit_transform(corp_pp)
corp_pp_tfidf

## Baseline model (logistic regression)

In [None]:
# Baseline: logistic regression on the default bag-of-words matrix.
# Only max_iter is set explicitly (2,000); everything else uses the
# LogisticRegression defaults.
params = {'max_iter': 2_000}

# instantiate the model from the parameter dict
lr = LogisticRegression(**params)

# train/evaluate via test_model and append the metrics row to test_results
test_result = test_model(lr, 'BASELINE (logistic regression)', params,
                    'bag of words', corp_bow, target)
store_test_result(test_result)

## XGBoost experiments

In [None]:
# Experiment: XGBoost on the default bag-of-words matrix (corp_bow).
# Only random_state and n_jobs are set explicitly; all other hyperparameters
# are left at the XGBClassifier defaults.
params = {'random_state': 42,
          'n_jobs': -1}

# instantiate the classifier from the parameter dict
xgb = XGBClassifier(**params)

# train/evaluate via test_model and append the metrics row to test_results
test_result = test_model(xgb, 'XGBoost', params, 'bag of words',
                         corp_bow, target)
store_test_result(test_result)

In [None]:
# Experiment: XGBoost on the binary bag-of-words matrix (corp_bow_bin).
# Only random_state and n_jobs are set explicitly; all other hyperparameters
# are left at the XGBClassifier defaults.
params = {'random_state': 42,
          'n_jobs': -1}

# instantiate the classifier from the parameter dict
xgb = XGBClassifier(**params)

# train/evaluate via test_model and append the metrics row to test_results
test_result = test_model(xgb, 'XGBoost', params, 'bag of words (binary)',
                         corp_bow_bin, target)
store_test_result(test_result)

In [None]:
# Experiment: XGBoost on the mixed-case bag-of-words matrix (corp_bow_mixc).
# Only random_state and n_jobs are set explicitly; all other hyperparameters
# are left at the XGBClassifier defaults.
params = {'random_state': 42,
          'n_jobs': -1}

# instantiate the classifier from the parameter dict
xgb = XGBClassifier(**params)

# train/evaluate via test_model and append the metrics row to test_results
test_result = test_model(xgb, 'XGBoost', params, 'bag of words (mixed case)',
                         corp_bow_mixc, target)
store_test_result(test_result)

In [None]:
# Experiment: XGBoost on bag of words built from the pre-processed comments
# (corp_pp_bow).
# Only random_state and n_jobs are set explicitly; all other hyperparameters
# are left at the XGBClassifier defaults.
params = {'random_state': 42,
          'n_jobs': -1}

# instantiate the classifier from the parameter dict
xgb = XGBClassifier(**params)

# train/evaluate via test_model and append the metrics row to test_results
test_result = test_model(xgb, 'XGBoost', params, 'bag of words (preprocessed)',
                         corp_pp_bow, target)
store_test_result(test_result)

In [None]:
# Experiment: XGBoost on 1/2-gram counts of the pre-processed comments
# (corp_pp_bo12grams).
# Only random_state and n_jobs are set explicitly; all other hyperparameters
# are left at the XGBClassifier defaults.
params = {'random_state': 42,
          'n_jobs': -1}

# instantiate the classifier from the parameter dict
xgb = XGBClassifier(**params)

# train/evaluate via test_model and append the metrics row to test_results
test_result = test_model(xgb, 'XGBoost', params,
                         'bag of 1/2-grams (preprocessed)',
                         corp_pp_bo12grams, target)
store_test_result(test_result)

In [None]:
# # parameters for model
# params = {'random_state': 42,
#           'n_jobs': -1}

# # load model with parameters
# xgb = XGBClassifier(**params)

# test_result = test_model(xgb, 'XGBoost', params,
#                          'bag of 1/2/3-grams (preprocessed)',
#                          corp_pp_bo123grams, target)
# store_test_result(test_result)

In [None]:
# Experiment: XGBoost on 2-gram counts of the pre-processed comments
# (corp_pp_bo2grams).
# Only random_state and n_jobs are set explicitly; all other hyperparameters
# are left at the XGBClassifier defaults.
params = {'random_state': 42,
          'n_jobs': -1}

# instantiate the classifier from the parameter dict
xgb = XGBClassifier(**params)

# train/evaluate via test_model and append the metrics row to test_results
test_result = test_model(xgb, 'XGBoost', params,
                         'bag of 2-grams (preprocessed)',
                         corp_pp_bo2grams, target)
store_test_result(test_result)

In [None]:
# Experiment: XGBoost on TF-IDF features of the raw comments (corp_tfidf).
# Only random_state and n_jobs are set explicitly; all other hyperparameters
# are left at the XGBClassifier defaults.
params = {'random_state': 42,
          'n_jobs': -1}

# instantiate the classifier from the parameter dict
xgb = XGBClassifier(**params)

# train/evaluate via test_model and append the metrics row to test_results
test_result = test_model(xgb, 'XGBoost', params, 'tf_idf',
                         corp_tfidf, target)
store_test_result(test_result)

In [None]:
# Experiment: XGBoost on TF-IDF features of the pre-processed comments
# (corp_pp_tfidf).
# Only random_state and n_jobs are set explicitly; all other hyperparameters
# are left at the XGBClassifier defaults.
params = {'random_state': 42,
          'n_jobs': -1}

# instantiate the classifier from the parameter dict
xgb = XGBClassifier(**params)

# train/evaluate via test_model and append the metrics row to test_results
test_result = test_model(xgb, 'XGBoost', params, 'tf_idf (preprocessed)',
                         corp_pp_tfidf, target)
store_test_result(test_result)

In [None]:
# # parameters for model
# params = {'random_state': 42,
#           'n_jobs': -1,
#           'n_estimators': 1000}

# # load model with parameters
# xgb = XGBClassifier(**params)

# test_result = test_model(xgb, 'XGBoost', params, 'tf_idf (preprocessed)',
#                          corp_pp_tfidf, target)
# store_test_result(test_result)

In [None]:
# Experiment: XGBoost on the fastText vector matrix (corp_ft).
# Only random_state and n_jobs are set explicitly; all other hyperparameters
# are left at the XGBClassifier defaults.
params = {'random_state': 42,
          'n_jobs': -1}

# instantiate the classifier from the parameter dict
xgb = XGBClassifier(**params)

# train/evaluate via test_model and append the metrics row to test_results
test_result = test_model(xgb, 'XGBoost', params, 'fastText vectors',
                         corp_ft, target)
store_test_result(test_result)

In [None]:
# Experiment: XGBoost on the fastText vector matrix (corp_ft), this time with
# n_estimators set to 1000. The row stored in test_results differs from the
# previous fastText run only in its model_params entry (plus the metrics).
params = {'random_state': 42,
          'n_jobs': -1,
          'n_estimators': 1000}

# instantiate the classifier from the parameter dict
xgb = XGBClassifier(**params)

# train/evaluate via test_model and append the metrics row to test_results
test_result = test_model(xgb, 'XGBoost', params, 'fastText vectors',
                         corp_ft, target)
store_test_result(test_result)

## Show test results + total exec time

In [None]:
test_results

In [None]:
# Wall-clock time since the first cell of the notebook started executing.
full_run_time = time.time() - full_run_time_start
# Round to whole seconds *before* splitting into minutes and seconds;
# rounding only the remainder could print e.g. '12m 60s'.
run_minutes, run_seconds = divmod(round(full_run_time), 60)
print(f'Full run time: {run_minutes}m {run_seconds}s')

## Other stuff

### Calculate average comment length

In [27]:
# characters: length of every raw comment, then the mean over all comments
comm_len_chars = df['comment_text'].str.len()
avg_comm_len_chars = comm_len_chars.mean()

# words (rough count): a "word" is any maximal run of non-whitespace
# characters, counted per comment with the vectorized string accessor
comm_len_words = df['comment_text'].str.count(r'\S+')
avg_comm_len_words = comm_len_words.mean()

print('Average comment length:')
print(round(avg_comm_len_chars), 'characters')
print(round(avg_comm_len_words), 'words')

0          17
1          45
2          38
3         106
4          49
         ... 
360268      8
360269     28
360270     25
360271     40
360272      9
Name: comment_text, Length: 360273, dtype: int64

Average comment length:
290 characters
50 words


In [26]:
df.isna().sum()

Unnamed: 0                  0
comment_text                0
toxic                       0
stopwords_punct_lemma       0
toxic_label_ft              0
toxic_label_comment_text    0
vector_fast_text            0
dtype: int64

### Calculate vocabulary size

In [None]:
# TODO: vocabulary size calculation is not implemented yet (placeholder cell).
pass