# XGBoost experiments (Michael)

## Setup

In [None]:
# import the usual suspects / basics
import time; full_run_time_start = time.time() # start timing right away
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from scipy import sparse

# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, f1_score,\
    accuracy_score, precision_score, recall_score, confusion_matrix

# XGBoost
from xgboost import XGBClassifier

import nltk
# NLTK resources requested by this notebook
for nltk_resource in ('wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# The data has more columns than pandas displays by default (20), so raise
# the display limit.
pd.options.display.max_columns = 100

## Utility function for testing models and tracking results

In [None]:
# Empty df for storing results: one row per experiment, appended by
# store_test_result(). The columns must mirror the keys of the dict returned
# by test_model(); 'features_no' was missing here, so that value could not be
# stored (depending on the pandas version, a dict key without a matching
# column is dropped or makes the row assignment fail).
test_results = pd.DataFrame(columns=['model_name',
                                     'model_params',
                                     'data_desc',
                                     'train_data_size',
                                     'features_no',
                                     'f1',
                                     'acc',
                                     'recall',
                                     'prec',
                                     'roc_auc',
                                     'cf_matrix',
                                     'train_time',
                                     'notes'])

def test_model(model, model_name, model_params, data_desc, X, y, notes=''):
    '''
    test_model(model, model_name, model_params, data_desc, X, y, notes='')

    Fit `model` on a stratified train split of X/y, evaluate it on the
    held-out test split and return the metrics as a dict.

    Parameters:
    -----------
    model: instance of model to test; must provide fit(), predict() and
        predict_proba()
    model_name: name of model
    model_params: dict of (hyper)parameters passed to model (only recorded
        in the result, not applied to the model here)
    data_desc: description of dataset (preprocessing steps etc.)
    X: feature array
    y: target/label array (column 1 of predict_proba() is used as the score
        for ROC AUC)
    notes: additional notes (default: empty string)

    Returns:
    --------
    dict with the keys model_name, model_params, data_desc,
    train_data_size, features_no, f1, acc, recall, prec, roc_auc,
    cf_matrix, train_time (formatted as '<minutes>m <seconds>s') and notes.
    The scores are rounded to 3 decimals.

    Note: the train/test split happens inside this function, so anything
    fitted on the full X beforehand (e.g. a vectorizer vocabulary) has
    already seen the test rows.
    '''

    # Split data using default of 75% for train, 25% for test.
    # Make sure test data has same toxic/nontoxic ratio as train data by
    # using stratify parameter.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=42)

    # Train model and time execution. perf_counter() is a monotonic clock,
    # so the measured duration is not affected by system clock adjustments.
    train_time_start = time.perf_counter()
    model.fit(X_train, y_train)
    train_time = time.perf_counter() - train_time_start

    # Round to whole seconds *before* splitting into minutes and seconds;
    # rounding the remainder afterwards could report e.g. 59.6 s as '0m 60s'.
    minutes, seconds = divmod(round(train_time), 60)
    train_time_str = f'{minutes}m {seconds}s'

    # Make predictions on test set
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    return {'model_name': model_name,
            'model_params': model_params,
            'data_desc': data_desc,
            'train_data_size': X_train.shape[0],
            'features_no': X_train.shape[1],
            'f1': round(f1_score(y_test, y_pred), 3),
            'acc': round(accuracy_score(y_test, y_pred), 3),
            'recall': round(recall_score(y_test, y_pred), 3),
            'prec': round(precision_score(y_test, y_pred), 3),
            'roc_auc': round(roc_auc_score(y_test, y_pred_proba), 3),
            'cf_matrix': confusion_matrix(y_test, y_pred),
            'train_time': train_time_str,
            'notes': notes}

In [None]:
def store_test_result(result):
    '''Append `result` (a dict as returned by test_model) as a new row at
    the end of the global test_results table.'''
    next_row = len(test_results)
    test_results.loc[next_row] = result

## Load data

In [None]:
# Read the dataset from csv and show its (rows, columns) shape.
data_path = 'data/undersampled_data_60_40.csv'
df = pd.read_csv(data_path)
df.shape

## Missing values

In [None]:
# Count missing (NaN) values per column.
df.isna().sum()

In [None]:
# Remove every row that contains at least one NaN and report the total row
# count before and after.
rows_before = df.shape[0]
print("# of rows with NaN's before dropping:", rows_before)
df = df.dropna()
rows_after = df.shape[0]
print("# of rows after:", rows_after)

## Optional: Create smaller sample from data to speed up experiments

In [None]:
# None means: use the full data.
sample_size = None

# Comment out the next line to run on the full data instead of a sample.
sample_size = 25_000

if sample_size is not None:
    # ratio toxic/nontoxic
    tox_perc = 0.4
    nontox_perc = 0.6

    # Number of toxic/nontoxic rows. The nontoxic count is derived from the
    # toxic count so both always add up to exactly sample_size (truncating
    # two float products independently can lose a row).
    sample_size_tox = round(sample_size * tox_perc)
    sample_size_nontox = sample_size - sample_size_tox

    # Draw each class separately (fixed seed for reproducibility) so the
    # sample has the toxic/nontoxic ratio defined above.
    sample_tox = df[df['toxic'] == 1].sample(sample_size_tox,
                                             random_state=42)
    sample_nontox = df[df['toxic'] == 0].sample(sample_size_nontox,
                                                random_state=42)

    df = pd.concat([sample_tox, sample_nontox])
    print(f'Using sample ({df.shape[0]} rows).')

else:
    print(f'Using full data ({df.shape[0]} rows).')

In [None]:
# Print a summary of the working DataFrame.
df.info()

In [None]:
# Show the first rows of the working DataFrame.
df.head()

## Create label/target variable and check for imbalance

In [None]:
# Label column used as y in all experiments (1 = toxic, 0 = nontoxic).
target = df['toxic']

In [None]:
# Absolute and relative frequency of each class in the label column.
value_counts = target.value_counts()
nontoxic_count, toxic_count = value_counts[0], value_counts[1]
total_count = nontoxic_count + toxic_count

nontoxic_perc = round(nontoxic_count / total_count * 100, 1)
toxic_perc = round(toxic_count / total_count * 100, 1)

print(f'Nontoxic (0): {nontoxic_count} ({nontoxic_perc} %)')
print(f'Toxic (1): {toxic_count} ({toxic_perc} %)')

## Create various corpora

### Spacy vectors

In [None]:
# If smaller sample: Convert vector string in csv file to df
# and cast all cols as float. This takes ~50 min for the full 360,000 rows.
# --> If full data: Load pickle file to save time.

if sample_size is not None:
    # '[v1 v2 ...]' -> one column per whitespace-separated vector component
    corpus_spacy = (df['vector_spacy']
                    .str.strip('[]')
                    .str.split(expand=True)
                    .astype('float'))
    # with open('pickle/spacy_vectors.pkl', mode='wb') as f:
    #     pickle.dump(corpus_spacy, f)

else:
    # NOTE(review): pickle.load can execute arbitrary code, so only load a
    # file created by this project. The pickled vectors are presumably
    # row-aligned with df after the NaN rows were dropped — confirm.
    with open('pickle/spacy_vectors.pkl', mode='rb') as f:
        corpus_spacy = pickle.load(f)

display(corpus_spacy)

### Bag of words (default)

In [None]:
# Bag of words with CountVectorizer defaults on the raw comment text.
# NOTE(review): the vocabulary is fitted on all rows, i.e. before
# test_model() makes its train/test split, so the test comments influence
# the feature space. Consider fitting on the train split only.
vect_bow = CountVectorizer()
corpus_bow = vect_bow.fit_transform(df['comment_text'])
corpus_bow

In [None]:
# Preview only a narrow window of feature columns; converting the whole
# sparse matrix to a dense array crashes the kernel.
n_words = 100
first_col = 10000
col_window = slice(first_col, first_col + n_words)
pd.DataFrame(data=corpus_bow[:, col_window].toarray(),
             columns=vect_bow.get_feature_names_out()[col_window])

### Bag of words (binary)

In [None]:
# Binary bag of words on the raw comment text (binary=True records term
# presence instead of term counts).
vect_bow_bin = CountVectorizer(binary=True)
corpus_bow_bin = vect_bow_bin.fit_transform(df['comment_text'])
corpus_bow_bin

### Bag of words (mixed case)

In [None]:
# Bag of words on the raw comment text without lowercasing, so tokens that
# differ only in case stay separate features.
vect_bow_mixc = CountVectorizer(lowercase=False)
corpus_bow_mixc = vect_bow_mixc.fit_transform(df['comment_text'])
corpus_bow_mixc

### Bag of words (default) on preprocessed comments (lemmatization, stopword and punctuation removal)

In [None]:
# Bag of words with CountVectorizer defaults on the preprocessed comments
# (column 'stopwords_punct_lemma').
vect_bow_pp = CountVectorizer()
corpus_bow_pp = vect_bow_pp.fit_transform(df['stopwords_punct_lemma'])
corpus_bow_pp

### Bag of 1/2-grams (default) on preprocessed comments

In [None]:
# Counts of 1-grams and 2-grams on the preprocessed comments.
vect_bo12grams = CountVectorizer(ngram_range=(1,2))
corpus_bo12grams = vect_bo12grams.fit_transform(df['stopwords_punct_lemma'])
corpus_bo12grams

### Bag of 1/2/3-grams (default) on preprocessed comments

In [None]:
# Counts of 1-, 2- and 3-grams on the preprocessed comments.
vect_bo123grams = CountVectorizer(ngram_range=(1,3))
corpus_bo123grams = vect_bo123grams.fit_transform(df['stopwords_punct_lemma'])
corpus_bo123grams

### Bag of 2-grams (default) on preprocessed comments

In [None]:
# Counts of 2-grams only (no single words) on the preprocessed comments.
vect_bo2grams = CountVectorizer(ngram_range=(2,2))
corpus_bo2grams = vect_bo2grams.fit_transform(df['stopwords_punct_lemma'])
corpus_bo2grams

### Tf_idf

In [None]:
# TF-IDF features with TfidfVectorizer defaults on the raw comment text.
# NOTE(review): vocabulary and IDF weights are fitted on all rows, i.e.
# before test_model() makes its train/test split, so the test comments
# influence the features. Consider fitting on the train split only.
vect_tfidf = TfidfVectorizer()
corpus_tfidf = vect_tfidf.fit_transform(df['comment_text'])
corpus_tfidf

In [None]:
# Preview only a narrow window of feature columns, else the kernel crashes.
n_words = 100
first_col = 10000
col_window = slice(first_col, first_col + n_words)
pd.DataFrame(data=corpus_tfidf[:, col_window].toarray(),
             columns=vect_tfidf.get_feature_names_out()[col_window])

### Tf_idf on preprocessed comments (lemmatization, stopword and punctuation removal)

In [None]:
# TF-IDF features with TfidfVectorizer defaults on the preprocessed comments
# (column 'stopwords_punct_lemma').
vect_tfidf_pp = TfidfVectorizer()
corpus_tfidf_pp = vect_tfidf_pp.fit_transform(df['stopwords_punct_lemma'])
corpus_tfidf_pp

## Baseline model (logistic regression)

In [None]:
# Baseline: logistic regression (max_iter raised to 2000) on the default
# bag-of-words corpus.
params = dict(max_iter=2_000)
lr = LogisticRegression(**params)

# Evaluate on the hold-out split and append the metrics row.
test_result = test_model(model=lr,
                         model_name='BASELINE (logistic regression)',
                         model_params=params,
                         data_desc='bag of words',
                         X=corpus_bow,
                         y=target)
store_test_result(test_result)

## XGBoost experiments

In [None]:
# XGBoost (fixed seed, n_jobs=-1, otherwise defaults) on the default
# bag-of-words corpus.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Evaluate on the hold-out split and append the metrics row.
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='bag of words',
                         X=corpus_bow,
                         y=target)
store_test_result(test_result)

In [None]:
# XGBoost (fixed seed, n_jobs=-1, otherwise defaults) on the binary
# bag-of-words corpus.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Evaluate on the hold-out split and append the metrics row.
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='bag of words (binary)',
                         X=corpus_bow_bin,
                         y=target)
store_test_result(test_result)

In [None]:
# XGBoost (fixed seed, n_jobs=-1, otherwise defaults) on the mixed-case
# bag-of-words corpus.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Evaluate on the hold-out split and append the metrics row.
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='bag of words (mixed case)',
                         X=corpus_bow_mixc,
                         y=target)
store_test_result(test_result)

In [None]:
# XGBoost (fixed seed, n_jobs=-1, otherwise defaults) on the bag-of-words
# corpus built from the preprocessed comments.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Evaluate on the hold-out split and append the metrics row.
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='bag of words (preprocessed)',
                         X=corpus_bow_pp,
                         y=target)
store_test_result(test_result)

In [None]:
# XGBoost (fixed seed, n_jobs=-1, otherwise defaults) on the 1/2-gram
# corpus built from the preprocessed comments.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Evaluate on the hold-out split and append the metrics row.
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='bag of 1/2-grams (preprocessed)',
                         X=corpus_bo12grams,
                         y=target)
store_test_result(test_result)

In [None]:
# XGBoost (fixed seed, n_jobs=-1, otherwise defaults) on the 1/2/3-gram
# corpus built from the preprocessed comments.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Evaluate on the hold-out split and append the metrics row.
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='bag of 1/2/3-grams (preprocessed)',
                         X=corpus_bo123grams,
                         y=target)
store_test_result(test_result)

In [None]:
# XGBoost (fixed seed, n_jobs=-1, otherwise defaults) on the 2-gram corpus
# built from the preprocessed comments.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Evaluate on the hold-out split and append the metrics row.
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='bag of 2-grams (preprocessed)',
                         X=corpus_bo2grams,
                         y=target)
store_test_result(test_result)

In [None]:
# XGBoost (fixed seed, n_jobs=-1, otherwise defaults) on the tf-idf corpus
# built from the raw comments.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Evaluate on the hold-out split and append the metrics row.
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='tf_idf',
                         X=corpus_tfidf,
                         y=target)
store_test_result(test_result)

In [None]:
# XGBoost (fixed seed, n_jobs=-1, otherwise defaults) on the tf-idf corpus
# built from the preprocessed comments.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Evaluate on the hold-out split and append the metrics row.
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='tf_idf (preprocessed)',
                         X=corpus_tfidf_pp,
                         y=target)
store_test_result(test_result)

In [None]:
# XGBoost with n_estimators=1000 (fixed seed, n_jobs=-1) on the tf-idf
# corpus built from the preprocessed comments.
params = dict(random_state=42, n_jobs=-1, n_estimators=1000)
xgb = XGBClassifier(**params)

# Evaluate on the hold-out split and append the metrics row.
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='tf_idf (preprocessed)',
                         X=corpus_tfidf_pp,
                         y=target)
store_test_result(test_result)

In [None]:
# XGBoost (fixed seed, n_jobs=-1, otherwise defaults) on the spaCy vectors.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# Evaluate on the hold-out split and append the metrics row.
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='spacy vectors (300-D)',
                         X=corpus_spacy,
                         y=target)
store_test_result(test_result)

In [None]:
# XGBoost with n_estimators=1000 (fixed seed, n_jobs=-1) on the spaCy
# vectors.
params = dict(random_state=42, n_jobs=-1, n_estimators=1000)
xgb = XGBClassifier(**params)

# Evaluate on the hold-out split and append the metrics row.
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='spacy vectors (300-D)',
                         X=corpus_spacy,
                         y=target)
store_test_result(test_result)

In [None]:
# Show the collected results of all experiments run so far.
test_results

In [None]:
# Wall-clock time since the timer was started in the first cell.
full_run_time = time.time() - full_run_time_start
# Round to whole seconds before splitting into minutes and seconds;
# rounding the remainder afterwards could print e.g. '3m 60s'.
run_minutes, run_seconds = divmod(round(full_run_time), 60)
print(f'Full run time: {run_minutes}m {run_seconds}s')

## Notes

- also try LightGBM?