# XGBoost experiments (Michael)

## Setup

In [40]:
# import the usual suspects / basics
import time; full_run_time_start = time.time() # start timing exec right away
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from scipy import sparse

# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, f1_score,\
    accuracy_score, precision_score, recall_score, confusion_matrix

# XGBoost
from xgboost import XGBClassifier

# currently not used and thus commented out
# import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Render every DataFrame column instead of truncating wide frames
# (pandas' default limit is 20 columns).
pd.set_option('display.max_columns', None)

## Utility function for testing models and tracking results

In [2]:
# Column layout of the results table. The keys of the dict returned by
# test_model() below mirror this list one-to-one.
result_columns = [
    'model_name',
    'model_params',
    'data_desc',
    'train_data_size',
    'features_no',
    'f1',
    'acc',
    'recall',
    'prec',
    'roc_auc',
    'cf_matrix',
    'train_time',
    'notes',
]

# Starts out empty; one row is added per evaluated model.
test_results = pd.DataFrame(columns=result_columns)

def test_model(model, model_name, model_params, data_desc, X, y, notes=''):
    '''
    test_model(model, model_params, data_desc, X, y, notes='')
    
    Parameters:
    -----------
    model: instance of model to test
    model_name: name of model
    model_params: dict of (hyper)parameters passed to model
    data_desc: description of dataset (preprocessing steps etc.)
    X: feature array 
    y: target/label array
    notes: additional notes (default: empty string)
    '''

    # Split data using default of 75% for train, 25% for test.
    # Make sure test data has same toxic/nontoxic ratio as train data by
    # using stratify parameter.
    X_train, X_test, y_train, y_test =\
        train_test_split(X, y, stratify=y, random_state=42)
    
    # train model and time execution
    train_time_start = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - train_time_start
    train_time_str = f'{int(train_time // 60)}m {round(train_time % 60)}s'

    # Make predictions on test set
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:,1]

    return {'model_name': model_name,
            'model_params': model_params,
            'data_desc': data_desc,
            'train_data_size': X_train.shape[0],
            'features_no': X_train.shape[1],
            'f1': round(f1_score(y_test, y_pred), 3),
            'acc': round(accuracy_score(y_test, y_pred), 3),
            'recall': round(recall_score(y_test, y_pred), 3),
            'prec': round(precision_score(y_test, y_pred), 3),
            'roc_auc': round(roc_auc_score(y_test, y_pred_proba), 3),
            'cf_matrix': confusion_matrix(y_test, y_pred),
            'train_time': train_time_str,
            'notes': notes}

In [3]:
def store_test_result(result):
    '''
    Add `result` (a dict keyed by the column names of test_results, as
    returned by test_model) as a new last row of the global test_results.
    '''
    # the current row count doubles as the label of the new row
    next_row = len(test_results)
    test_results.loc[next_row] = result

## Load data

In [4]:
# Read the comment data set; judging by the file name it was already
# undersampled to a 60/40 class ratio. The shape is shown as a quick check.
data_path = 'data/undersampled_data_60_40.csv'
df = pd.read_csv(data_path)
df.shape

(360835, 6)

## Missing values

In [5]:
# check for NaN's: number of missing values per column
df.isna().sum()

comment_text               0
toxic                      0
stopwords_punct_lemma    534
vector_spacy               0
pos_tags                   0
pos_tags_str               0
dtype: int64

In [6]:
# Drop every row that contains at least one NaN (500+ rows) and report the
# row count before and after. The frame is rebound rather than mutated in
# place, so the cell gives the same result when it is run again.
rows_before = df.shape[0]
df = df.dropna()
print("# of rows with NaN's before dropping:", rows_before)
print("# of rows after:", df.shape[0])

# of rows with NaN's before dropping: 360835
# of rows after: 360301


## Optional: Create smaller sample from data to speed up experiments

In [7]:
# Number of rows to work with; None means "use the full data set".
sample_size = None

# uncomment to create sample of desired size
#sample_size = 25_000

if sample_size is not None:
    # ratio toxic/nontoxic (same 40/60 ratio as the full data)
    tox_perc = 0.4
    nontox_perc = 0.6

    # Number of toxic/nontoxic rows. The nontoxic count is the remainder,
    # so both parts always add up to exactly sample_size. Truncating
    # two separate products with int() could lose a row for sizes that are
    # not a multiple of 5.
    sample_size_tox = round(sample_size * tox_perc)
    sample_size_nontox = sample_size - sample_size_tox

    # fixed seeds keep the sample reproducible across runs
    sample_tox = df[df['toxic'] == 1].sample(sample_size_tox,
                                             random_state=42)
    sample_nontox = df[df['toxic'] == 0].sample(sample_size_nontox,
                                                random_state=42)

    df = pd.concat([sample_tox, sample_nontox])
    print(f'Using sample ({df.shape[0]} rows).')

else:
    print(f'Using full data ({df.shape[0]} rows).')

Using full data (360301 rows).


In [8]:
# Column dtypes, non-null counts and memory usage of the frame used from here on.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 360301 entries, 0 to 360834
Data columns (total 6 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   comment_text           360301 non-null  object
 1   toxic                  360301 non-null  int64 
 2   stopwords_punct_lemma  360301 non-null  object
 3   vector_spacy           360301 non-null  object
 4   pos_tags               360301 non-null  object
 5   pos_tags_str           360301 non-null  object
dtypes: int64(1), object(5)
memory usage: 19.2+ MB


In [9]:
# Peek at the first rows as a visual sanity check of the columns.
df.head()

Unnamed: 0,comment_text,toxic,stopwords_punct_lemma,vector_spacy,pos_tags,pos_tags_str
0,"Well, what are the chances he will turn out to...",0,chance turn active proponent slavery,[-1.8768581e+00 1.3291241e+00 -9.5301402e-01 ...,"[('Well', 'RB'), (',', ','), ('what', 'WP'), (...","RB , WP VBP DT NNS PRP MD VB RP TO VB VBN DT J..."
1,The moment of critical mass is approaching whe...,0,moment critical mass approach deed Gupta Co li...,[ 0.17821966 0.9195551 -1.5706673 1.148774...,"[('The', 'DT'), ('moment', 'NN'), ('of', 'IN')...",DT NN IN JJ NN VBZ VBG WRB DT NNS IN NNP CC NN...
2,"""Hey listen to me,"" he said. ""I'm not going to...",1,hey listen say go crap prove reporter say \n\n...,[ 2.31782764e-01 1.39464259e+00 -4.36845958e-...,"[('``', '``'), ('Hey', 'NNP'), ('listen', 'VBP...","`` NNP VBP TO PRP , '' PRP VBD . `` PRP VBP RB..."
3,We are already owed $488 M plus interest($2Bil...,0,owe $ 488 M plus interest($2billion 2006 audit...,[-1.4960954 -1.4022146 -3.7696934 1.009247...,"[('We', 'PRP'), ('are', 'VBP'), ('already', 'R...",PRP VBP RB VBN $ CD NNP CC NN ( $ CD ) IN CD V...
4,There is a reason there are no teeth to the la...,0,reason tooth law unlawful law way force free e...,[-7.7496856e-01 1.3410413e+00 -3.7505956e+00 ...,"[('There', 'EX'), ('is', 'VBZ'), ('a', 'DT'), ...",EX VBZ DT NN EX VBP DT NN TO DT NN . PRP VBZ D...


## Create label/target variable and check for imbalance

In [10]:
# Label vector handed to test_model as y; the printout below labels
# 0 as nontoxic and 1 as toxic.
target = df['toxic']

In [11]:
# Class balance: absolute count and percentage share of each label.
value_counts = target.value_counts()
nontoxic_count, toxic_count = value_counts[0], value_counts[1]
total_count = nontoxic_count + toxic_count

nontoxic_perc = round((nontoxic_count / total_count) * 100, 1)
toxic_perc = round((toxic_count / total_count) * 100, 1)

print(f'Nontoxic (0): {nontoxic_count} ({nontoxic_perc} %)')
print(f'Toxic (1): {toxic_count} ({toxic_perc} %)')

Nontoxic (0): 215967 (59.9 %)
Toxic (1): 144334 (40.1 %)


## Create various corpora

### Spacy vectors

In [12]:
# If smaller sample: Convert vector string in csv file to df
# and cast all cols as float. This takes ~50 min for the full 360,000 rows.
# --> If full data: Load pickle file to save time.

if sample_size is not None:
    # '[ 0.1 -2.3 ... ]' -> one float column per vector component
    corpus_spacy = (df['vector_spacy']
                    .str.strip('[]')
                    .str.split(expand=True)
                    .astype('float'))
    # with open('pickle/spacy_vectors.pkl', mode='wb') as f:
    #     pickle.dump(corpus_spacy, f)

else:
    # NOTE(review): pickle.load can execute arbitrary code. Only load a
    # pickle file that was created locally by the commented-out dump above.
    with open('pickle/spacy_vectors.pkl', mode='rb') as f:
        corpus_spacy = pickle.load(f)

# Fail fast if the (possibly cached) vectors do not belong to the current
# rows of df. Otherwise the mismatch would only surface in train_test_split
# at the spaCy experiments, after all other models have been trained.
assert len(corpus_spacy) == len(df), (
    f'corpus_spacy has {len(corpus_spacy)} rows but df has {len(df)}; '
    'regenerate pickle/spacy_vectors.pkl from the current data')

display(corpus_spacy)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,...,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,-1.876858,1.329124,-0.953014,1.599344,4.758620,1.969032,1.644194,2.212614,-2.556758,-0.726840,5.897321,1.271768,-3.254700,1.844770,0.489058,3.000484,1.962958,1.899946,-1.418413,0.882638,-1.404378,-0.284728,-1.301200,-1.596746,-0.647874,-0.831314,-1.354642,-2.106514,0.563855,2.363371,1.824184,0.645076,0.678293,-2.281040,-1.977940,-0.906659,3.141446,1.403860,1.658996,0.676032,-2.995388,-0.073580,0.311376,1.060174,0.067870,1.884141,1.315900,-1.606600,1.193746,1.177772,...,0.314692,-2.303116,-3.825660,-0.124960,2.201228,-3.703620,1.025276,-0.714834,1.496154,2.200674,-0.539540,1.000658,3.464700,0.439324,0.390740,0.995470,0.128006,-0.310500,-2.886272,0.912790,2.349206,-0.210700,-0.742186,-1.256042,0.465860,-2.263122,0.827026,-3.067520,-1.027482,1.864740,0.623760,-0.029978,1.954254,0.360346,2.957352,0.075011,0.234940,1.879648,-2.826840,1.237960,1.622200,-1.318332,-0.618924,0.935286,-0.406290,-0.583162,1.153468,-1.718428,-2.940184,1.377052
1,0.178220,0.919555,-1.570667,1.148774,1.004319,-0.108878,1.460230,2.574210,-1.160865,0.816906,3.917788,0.750856,-3.211029,1.878966,0.194537,0.904348,2.237262,-0.360423,-0.846340,0.405276,0.047357,0.290036,0.559792,-0.534478,-1.326276,-1.054266,-1.235236,-1.808121,-0.181621,2.273353,0.465318,-0.180469,1.023133,-0.249721,-1.393705,-1.899000,0.929229,0.971887,0.226290,1.414555,0.119452,-1.305921,0.326800,-0.177402,-1.839571,0.850226,1.620764,-1.236989,0.039125,0.565210,...,0.872494,-0.975675,-3.495676,-0.029888,0.688481,-2.102819,0.627258,-0.368229,0.184600,0.187966,-0.521318,3.892880,1.493949,1.371477,1.892959,-1.292760,-0.081649,0.597054,-1.513949,-0.207115,0.794973,-0.814565,-0.462648,-0.723777,0.418734,-1.673088,1.770155,-1.489565,-1.083388,1.098299,0.078915,-0.036943,1.217826,0.945957,2.545912,-0.107364,-0.605812,0.184450,-2.136489,0.458357,-0.227375,-0.700926,0.312599,-0.735937,-2.684446,-0.403464,0.738699,-0.482938,-1.457351,1.375711
2,0.231783,1.394643,-0.436846,0.872901,-0.873929,1.205551,-0.190739,0.595385,-2.330690,0.565156,-0.559397,0.204755,-0.555460,0.391693,1.517020,-0.178187,1.488848,-0.963324,-0.455677,-0.383595,1.502273,-0.512070,0.104973,-1.868849,-0.447031,-0.634103,-0.268539,-0.808187,-0.323371,0.609490,0.392147,-0.538601,0.349113,0.652201,0.253693,-1.508691,-0.336859,-1.105625,2.519260,2.124392,-3.055427,1.184376,2.254857,-1.555315,1.155854,-1.801629,-0.374912,-1.255075,-2.582920,0.947117,...,-1.780528,-0.599643,-2.513770,1.512623,1.247347,-3.149179,0.413104,-0.230227,-0.695783,0.429839,-0.064115,-1.760015,-1.486536,-0.712535,0.712738,-0.099151,-0.135899,0.163236,-1.964166,-0.839410,0.145670,-0.337788,-1.589225,0.020315,-0.301505,-1.868189,3.505292,-3.075654,0.397776,1.880100,-3.265662,-0.314815,-0.643927,-2.195115,1.049930,0.909676,2.397713,0.113505,-2.101089,0.490942,0.596914,0.221833,1.069331,-1.415764,-1.438030,-2.394625,0.581588,2.777426,0.253738,0.622611
3,-1.496095,-1.402215,-3.769693,1.009247,1.611375,-0.546600,2.224479,1.654543,0.212175,-1.055296,0.852094,-1.243658,-3.344540,0.363441,-3.224690,0.932357,1.152727,0.514451,-2.762152,-0.812308,-0.470966,1.867543,-1.989977,1.861667,0.111678,0.080699,0.032056,-1.750833,-0.106862,-1.191644,0.429434,1.923006,-1.014060,-1.041545,0.854108,-1.550336,2.371579,-0.626739,1.177780,2.292537,-0.509904,-0.773231,-0.208756,2.778163,-2.384932,0.694202,2.076382,0.618881,2.649704,-0.150022,...,-0.405490,-2.088290,-0.970617,0.043718,-0.672818,-0.853684,0.722120,-0.622401,1.427861,-1.374651,-1.446672,3.124262,0.029554,1.247367,0.566725,-0.970019,0.445290,0.392493,-0.819960,1.008321,-1.087292,2.045823,0.488627,-0.799836,-0.926586,-1.708573,0.020146,-1.653279,0.062513,-1.014437,5.417111,1.748681,1.715234,-0.698697,1.234976,1.290899,-0.249626,1.084031,-2.098815,1.234153,-1.695750,1.014100,-1.215895,0.974122,-1.026496,1.872361,0.259799,-1.413542,-0.528788,1.903051
4,-0.774969,1.341041,-3.750596,-0.326901,0.790599,0.751118,2.267157,1.762259,-1.575716,-0.910226,4.636760,2.733558,-3.831653,1.735281,-1.099774,0.772782,2.655015,-1.327572,-1.497711,-0.584363,-2.570018,1.979358,1.006254,-0.781976,-1.091087,-2.018609,-0.301271,-0.397149,-1.157471,1.524223,2.141503,-0.030355,-0.036502,-1.155803,0.642407,0.841951,1.308740,-0.643085,0.929900,1.792014,-0.848953,-1.639773,0.899104,2.178003,-0.519734,3.162769,4.125629,-1.919609,0.567289,1.926541,...,1.897984,-2.660985,-4.002786,0.212389,0.860456,-5.259827,-0.733324,-0.509933,-0.484052,-0.606935,-1.522010,3.477342,-0.466279,-0.396518,2.479396,0.127565,0.988401,0.108644,-4.731673,1.573274,0.960340,-0.361042,1.321883,0.078837,0.461005,-2.887803,1.529673,0.038807,-1.027870,3.520603,-1.213863,1.160294,1.862634,0.681051,4.746203,-1.042762,1.983893,0.096426,-3.052233,-0.818427,0.889205,-0.831493,0.939214,-1.590340,-0.992558,0.573658,0.496665,1.326130,-0.797433,1.078983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360830,0.438600,0.386802,0.770474,0.689925,1.025435,0.941450,1.795690,3.834725,-1.273928,1.291466,3.898175,-0.236423,-0.964200,1.620200,0.378887,0.186205,-1.147675,-2.042633,1.380639,-0.526505,-2.410375,-0.362927,1.088223,-1.084355,-0.172103,-1.566400,-2.228939,0.498345,0.637057,-0.117860,1.981120,-0.535460,1.177733,0.384725,0.593875,-2.678800,0.025235,0.607523,0.545250,0.772920,-0.949900,-1.714015,2.104250,-0.616625,2.720370,0.061329,0.674925,-0.634892,-1.839275,2.395250,...,-1.492225,-0.813518,-3.581195,-0.204525,0.296360,-1.995195,1.134325,-1.091150,0.143307,0.712214,-0.348875,-0.163325,-0.716915,0.331823,0.537755,-0.246277,0.323977,1.604525,-3.593775,0.290093,0.737875,-1.704915,-1.783112,1.837185,0.531138,-0.590073,0.117642,-1.676665,1.377483,3.207250,-0.810250,-1.311720,-0.788007,-0.010297,3.920825,-0.066543,-0.645932,-0.310277,-2.005325,-0.821325,3.328725,-2.488193,-0.756567,-1.386500,-0.457523,-0.498116,0.511755,1.422518,-2.187025,0.642060
360831,-0.731760,1.023549,-0.880483,1.261459,3.744050,0.120479,0.479901,0.132337,-0.398597,-1.280783,2.686731,2.163455,-0.721418,0.277151,-0.374275,1.542666,1.513706,0.248503,-0.908726,1.152179,0.189838,0.060716,-2.020255,1.333881,-0.154894,-0.390738,-0.421260,-0.742025,-0.761455,0.502178,0.103339,0.295054,1.282296,0.899385,-2.664856,0.173295,0.402212,-0.024748,-1.301166,-0.326710,-1.280380,-1.018069,-1.203053,1.290722,-1.222853,0.423633,-0.817516,-2.030823,0.238415,-0.589672,...,0.149429,-2.983310,-2.262961,-1.499177,-0.508423,-0.549908,1.087115,-1.225475,-0.059428,1.364049,-0.076814,0.888174,0.657087,0.328292,0.517615,0.427862,0.380889,0.790923,-2.649072,-0.872070,-1.734374,-0.601333,-0.568825,-1.294779,1.551952,-1.306433,-0.234729,-1.263477,-1.148890,2.097442,-0.758325,-1.171131,-0.361187,-0.506694,1.198373,-2.045520,1.452608,1.685178,0.101678,1.484096,1.140523,-1.574645,0.097638,0.354453,-1.473722,-0.473758,2.365229,-1.559208,-1.270816,1.509327
360832,0.855839,0.656231,-2.644993,-0.407922,0.609606,1.602334,0.029623,1.737609,-1.006357,-1.835868,2.098256,0.433678,-1.608566,0.114230,1.440862,1.249042,2.669342,-1.594232,1.285258,2.165919,0.750281,1.341654,2.151910,-2.251080,-0.218899,-0.145945,-1.296137,-0.333146,0.977031,-0.622881,-1.321418,1.268047,1.022724,0.769765,-1.388017,-0.602917,0.049559,-0.300137,-0.225737,-0.289401,0.088407,0.739795,2.084388,0.070974,-0.644359,-0.332131,1.587586,-2.196911,-0.665204,2.079851,...,-1.379357,0.104258,-1.892241,0.300516,0.133904,-1.990719,0.097556,0.801660,-1.823543,0.706570,-0.410259,-1.210783,0.698087,-0.489438,0.273300,-1.481117,0.226795,1.460009,-1.688204,-2.042517,2.212540,0.496921,0.751415,0.005781,-0.540831,-1.800638,-0.020997,-0.732779,-1.695895,1.898199,-1.592688,-1.654946,0.380003,1.327894,2.073243,0.000292,1.112172,-0.502904,-1.768929,-0.852368,-0.033904,-2.122064,-0.589171,0.280024,-1.506298,-0.776137,2.239674,1.431978,-0.561459,0.765462
360833,0.065853,1.974891,-1.838383,0.104045,3.759098,1.344389,1.882558,2.455303,-0.902622,0.136854,5.276650,2.265550,-4.085964,2.374793,-0.323964,0.462181,0.934218,-1.049334,-2.035567,-0.148752,0.373118,-0.396589,-1.361327,0.432653,0.425022,0.277412,-1.772222,-0.968555,-0.483415,-0.614001,1.195285,-0.207309,0.106477,-2.764641,1.155790,-0.177549,-0.972880,0.024876,0.433589,-0.556832,-0.294295,0.472799,0.307101,1.397624,-1.838970,1.702319,1.413903,-0.133204,0.281461,0.830406,...,0.479879,-2.471427,-3.858460,-0.045428,2.504672,-2.984263,3.004584,-2.214887,1.008945,0.649989,-0.979673,3.334303,0.806841,0.350484,1.937235,0.414329,-0.614580,1.500401,-3.432297,1.040411,-0.365987,0.864897,0.826365,-1.572276,0.826047,-1.577271,1.185894,-2.257566,-1.985246,2.188505,2.711424,0.705469,1.421614,0.346289,1.758015,-0.950617,2.534571,3.665683,-0.732914,1.084254,3.160383,-0.128791,-2.271738,-0.084907,0.900696,-0.843260,-0.052169,-1.067732,-1.658587,2.446069


### Bag of words (default)

In [13]:
# Token counts with CountVectorizer defaults on the 'comment_text' column.
# NOTE(review): the vocabulary is fitted on all rows, i.e. before test_model
# splits off its test set.
vect_bow = CountVectorizer()
corpus_bow = vect_bow.fit_transform(df['comment_text'])
corpus_bow

<360301x136896 sparse matrix of type '<class 'numpy.int64'>'
	with 13695102 stored elements in Compressed Sparse Row format>

In [14]:
# Preview only a narrow band of vocabulary columns: converting the whole
# sparse matrix to a dense array crashes the kernel.
n_words = 100
first_col = 10000
preview_cols = slice(first_col, first_col + n_words)
pd.DataFrame(data=corpus_bow[:, preview_cols].toarray(),
             columns=vect_bow.get_feature_names_out()[preview_cols])

Unnamed: 0,alternative,alternativecheckmate,alternativefacts,alternatively,alternatives,alternator,alternave,alternet,alters,althea,althealthworks,althewhile,althletes,altho,althogh,although,althought,altima,altimetry,altin,altitude,altitudes,altleft,altletnative,altman,altmed,alto,altogether,altogther,alton,altonbakerpark,altoona,altough,altra,altrettanto,altria,altright,altrightpub,altrightpubs,altrite,altruism,altruist,altruistic,altruistically,altruists,alts,altshiler,altuve,altzheimers,alu,alum,aluminium,aluminum,alumn,alumna,alumnae,alumni,alumnists,alumnus,alums,aluta,alutiiq,alva,alvarado,alvarez,alvaro,alveda,alvey,alvin,alvord,alvq8dek8ms,alwaleed,alwasy,alwat,alway,always,always_skeptical,alwaysconstitution,alwaysnowhere,alwayspuzzled,alwaysthere,alwaze,alwyas,aly,alyeska,alyeskaresort,alyosha,alyssa,alzheimer,alzheimers,alzner,alzres,am,am0,am9ng,ama,amab,amabantu,amabhungane,amabungane
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360296,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
360297,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
360298,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
360299,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Bag of words (binary)

In [15]:
# Same as the default bag of words, but with binary=True the matrix records
# whether a token occurs in a comment rather than how often.
vect_bow_bin = CountVectorizer(binary=True)
corpus_bow_bin = vect_bow_bin.fit_transform(df['comment_text'])
corpus_bow_bin

<360301x136896 sparse matrix of type '<class 'numpy.int64'>'
	with 13695102 stored elements in Compressed Sparse Row format>

### Bag of words (mixed case)

In [16]:
# lowercase=False keeps the original casing, so differently cased spellings
# of a word become separate features (hence the larger vocabulary below).
vect_bow_mixc = CountVectorizer(lowercase=False)
corpus_bow_mixc = vect_bow_mixc.fit_transform(df['comment_text'])
corpus_bow_mixc

<360301x180483 sparse matrix of type '<class 'numpy.int64'>'
	with 14031725 stored elements in Compressed Sparse Row format>

### Bag of words (default) on preprocessed comments (lemmatization, stopword and punctuation removal)

In [17]:
# Default bag of words, but built from the 'stopwords_punct_lemma' column
# (the preprocessed comments) instead of 'comment_text'.
vect_bow_pp = CountVectorizer()
corpus_bow_pp = vect_bow_pp.fit_transform(df['stopwords_punct_lemma'])
corpus_bow_pp

<360301x123598 sparse matrix of type '<class 'numpy.int64'>'
	with 7419284 stored elements in Compressed Sparse Row format>

### Bag of 1/2-grams (default) on preprocessed comments

In [18]:
# ngram_range=(1,2): single tokens plus pairs of adjacent tokens.
# The feature count grows to several million columns (see output).
vect_bo12grams = CountVectorizer(ngram_range=(1,2))
corpus_bo12grams = vect_bo12grams.fit_transform(df['stopwords_punct_lemma'])
corpus_bo12grams

<360301x3943310 sparse matrix of type '<class 'numpy.int64'>'
	with 15332522 stored elements in Compressed Sparse Row format>

### Bag of 1/2/3-grams (default) on preprocessed comments

In [19]:
# ngram_range=(1,3): 1-, 2- and 3-grams together. This is the widest
# feature matrix in the notebook (see output).
vect_bo123grams = CountVectorizer(ngram_range=(1,3))
corpus_bo123grams = vect_bo123grams.fit_transform(df['stopwords_punct_lemma'])
corpus_bo123grams

<360301x11022914 sparse matrix of type '<class 'numpy.int64'>'
	with 22998333 stored elements in Compressed Sparse Row format>

### Bag of 2-grams (default) on preprocessed comments

In [20]:
# ngram_range=(2,2): 2-grams only, without the single tokens.
vect_bo2grams = CountVectorizer(ngram_range=(2,2))
corpus_bo2grams = vect_bo2grams.fit_transform(df['stopwords_punct_lemma'])
corpus_bo2grams

<360301x3819712 sparse matrix of type '<class 'numpy.int64'>'
	with 7913238 stored elements in Compressed Sparse Row format>

### TF-IDF

In [21]:
# TF-IDF weights (TfidfVectorizer defaults) on the 'comment_text' column.
# NOTE(review): the vectorizer is fitted on all rows, so the document
# frequencies also include the rows test_model later holds out for testing.
vect_tfidf = TfidfVectorizer()
corpus_tfidf = vect_tfidf.fit_transform(df['comment_text'])
corpus_tfidf

<360301x136896 sparse matrix of type '<class 'numpy.float64'>'
	with 13695102 stored elements in Compressed Sparse Row format>

In [22]:
# Preview only a narrow band of vocabulary columns: converting the whole
# sparse matrix to a dense array crashes the kernel.
n_words = 100
first_col = 10000
preview_cols = slice(first_col, first_col + n_words)
pd.DataFrame(data=corpus_tfidf[:, preview_cols].toarray(),
             columns=vect_tfidf.get_feature_names_out()[preview_cols])

Unnamed: 0,alternative,alternativecheckmate,alternativefacts,alternatively,alternatives,alternator,alternave,alternet,alters,althea,althealthworks,althewhile,althletes,altho,althogh,although,althought,altima,altimetry,altin,altitude,altitudes,altleft,altletnative,altman,altmed,alto,altogether,altogther,alton,altonbakerpark,altoona,altough,altra,altrettanto,altria,altright,altrightpub,altrightpubs,altrite,altruism,altruist,altruistic,altruistically,altruists,alts,altshiler,altuve,altzheimers,alu,alum,aluminium,aluminum,alumn,alumna,alumnae,alumni,alumnists,alumnus,alums,aluta,alutiiq,alva,alvarado,alvarez,alvaro,alveda,alvey,alvin,alvord,alvq8dek8ms,alwaleed,alwasy,alwat,alway,always,always_skeptical,alwaysconstitution,alwaysnowhere,alwayspuzzled,alwaysthere,alwaze,alwyas,aly,alyeska,alyeskaresort,alyosha,alyssa,alzheimer,alzheimers,alzner,alzres,am,am0,am9ng,ama,amab,amabantu,amabhungane,amabungane
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
360297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
360298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
360299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### TF-IDF on preprocessed comments (lemmatization, stopword and punctuation removal)

In [23]:
# TF-IDF weights, built from the 'stopwords_punct_lemma' column
# (the preprocessed comments) instead of 'comment_text'.
vect_tfidf_pp = TfidfVectorizer()
corpus_tfidf_pp = vect_tfidf_pp.fit_transform(df['stopwords_punct_lemma'])
corpus_tfidf_pp

<360301x123598 sparse matrix of type '<class 'numpy.float64'>'
	with 7419284 stored elements in Compressed Sparse Row format>

## Baseline model (logistic regression)

In [24]:
# Baseline to compare the XGBoost runs against: logistic regression on the
# default bag of words. The params dict is used to build the model and is
# logged alongside the scores.
params = dict(max_iter=2_000)
lr = LogisticRegression(**params)

test_result = test_model(model=lr,
                         model_name='BASELINE (logistic regression)',
                         model_params=params,
                         data_desc='bag of words',
                         X=corpus_bow,
                         y=target)
store_test_result(test_result)

## XGBoost experiments

In [25]:
# All XGBoost runs share these settings; experiment-specific settings are
# merged on top of them below.
xgb_base_params = {'random_state': 42,
                   'n_jobs': -1}

# One entry per experiment, in the order the rows are added to test_results:
# (description of the features, feature matrix, extra hyperparameters).
# To re-run a single experiment, slice this list instead of copying the cell.
xgb_experiments = [
    ('bag of words', corpus_bow, {}),
    ('bag of words (binary)', corpus_bow_bin, {}),
    ('bag of words (mixed case)', corpus_bow_mixc, {}),
    ('bag of words (preprocessed)', corpus_bow_pp, {}),
    ('bag of 1/2-grams (preprocessed)', corpus_bo12grams, {}),
    ('bag of 1/2/3-grams (preprocessed)', corpus_bo123grams, {}),
    ('bag of 2-grams (preprocessed)', corpus_bo2grams, {}),
    ('tf_idf', corpus_tfidf, {}),
    ('tf_idf (preprocessed)', corpus_tfidf_pp, {}),
    ('tf_idf (preprocessed)', corpus_tfidf_pp, {'n_estimators': 1000}),
    ('spacy vectors (300-D)', corpus_spacy, {}),
    ('spacy vectors (300-D)', corpus_spacy, {'n_estimators': 1000}),
]

for data_desc, corpus, extra_params in xgb_experiments:
    # fresh dict per run, so the params logged in test_results are never
    # shared between rows
    params = {**xgb_base_params, **extra_params}

    # load model with parameters
    xgb = XGBClassifier(**params)

    # each result is stored right away, so finished runs are kept even if
    # a later (long-running) experiment is interrupted
    test_result = test_model(xgb, 'XGBoost', params, data_desc,
                             corpus, target)
    store_test_result(test_result)

## Show test results + total exec time

In [37]:
# Results table: one row per store_test_result call, in the order the
# experiments were run.
test_results

Unnamed: 0,model_name,model_params,data_desc,train_data_size,features_no,f1,acc,recall,prec,roc_auc,cf_matrix,train_time,notes
0,BASELINE (logistic regression),{'max_iter': 2000},bag of words,270225,136896,0.827,0.868,0.785,0.872,0.928,"[[49847, 4145], [7750, 28334]]",0m 42s,
1,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of words,270225,136896,0.775,0.841,0.686,0.891,0.91,"[[50962, 3030], [11313, 24771]]",0m 6s,
2,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of words (binary),270225,136896,0.774,0.84,0.684,0.891,0.91,"[[50985, 3007], [11416, 24668]]",0m 5s,
3,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of words (mixed case),270225,180483,0.755,0.83,0.655,0.892,0.899,"[[51138, 2854], [12453, 23631]]",0m 7s,
4,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of words (preprocessed),270225,123598,0.77,0.837,0.682,0.885,0.912,"[[50802, 3190], [11485, 24599]]",0m 5s,
5,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of 1/2-grams (preprocessed),270225,3943310,0.771,0.838,0.683,0.885,0.912,"[[50802, 3190], [11428, 24656]]",2m 33s,
6,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of 1/2/3-grams (preprocessed),270225,11022914,0.77,0.837,0.681,0.886,0.912,"[[50834, 3158], [11512, 24572]]",40m 27s,
7,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of 2-grams (preprocessed),270225,3819712,0.313,0.664,0.192,0.862,0.623,"[[52884, 1108], [29172, 6912]]",17m 11s,
8,XGBoost,"{'random_state': 42, 'n_jobs': -1}",tf_idf,270225,136896,0.779,0.843,0.691,0.893,0.912,"[[51012, 2980], [11152, 24932]]",0m 47s,
9,XGBoost,"{'random_state': 42, 'n_jobs': -1}",tf_idf (preprocessed),270225,123598,0.785,0.846,0.704,0.888,0.916,"[[50779, 3213], [10671, 25413]]",0m 35s,


In [38]:
# Wall-clock time since the timer was started in the first (import) cell.
full_run_time = time.time() - full_run_time_start

# Round to whole seconds *before* splitting into minutes and seconds.
# Rounding only the remainder could print e.g. '1m 60s' instead of '2m 0s'.
full_run_minutes, full_run_seconds = divmod(round(full_run_time), 60)
print(f'Full run time: {full_run_minutes}m {full_run_seconds}s')

Full run time: 193m 26s


## Notes

- TODO: also try LightGBM and compare it with the XGBoost results above?