# XGBoost experiments (Michael)

## Setup

In [1]:
# import the usual suspects / basics
import time; full_run_time_start = time.time() # start timing exec right away
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from scipy import sparse

# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, f1_score,\
    accuracy_score, precision_score, recall_score, confusion_matrix

# XGBoost
from xgboost import XGBClassifier

# currently not used and thus commented out
# import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# display all df columns (default is 20)
pd.options.display.max_columns = None

## Utility function for testing models and tracking results

In [2]:
# Results table: starts empty, one row is added per evaluated model.
test_results = pd.DataFrame(columns=[
    # what was run
    'model_name', 'model_params',
    # on which data
    'data_desc', 'data_size', 'features_no',
    # metrics on the test split
    'f1', 'acc', 'recall', 'prec', 'roc_auc', 'cf_matrix',
    # bookkeeping
    'train_time', 'notes',
])

def test_model(model, model_name, model_params, data_desc, X, y, notes=''):
    '''
    test_model(model, model_name, model_params, data_desc, X, y, notes='')

    Train `model` on a stratified train split of X/y, evaluate it on the
    held-out test split and return the results as a dict.

    Parameters:
    -----------
    model: instance of model to test (needs fit, predict and predict_proba)
    model_name: name of model
    model_params: dict of (hyper)parameters passed to model; it is only
        recorded in the result, the model is not configured here
    data_desc: description of dataset (preprocessing steps etc.)
    X: feature array
    y: target/label array
    notes: additional notes (default: empty string)

    Returns:
    --------
    dict with the keys model_name, model_params, data_desc, data_size
    (rows in X), features_no (columns in X), f1, acc, recall, prec, roc_auc
    (each rounded to 3 decimals), cf_matrix, train_time (string of the form
    '<minutes>m <seconds>s') and notes.
    '''

    # Split data using default of 75% for train, 25% for test.
    # Make sure test data has same toxic/nontoxic ratio as train data by
    # using stratify parameter.
    X_train, X_test, y_train, y_test =\
        train_test_split(X, y, stratify=y, random_state=42)

    # train model and time execution
    train_time_start = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - train_time_start

    # Round to whole seconds *before* splitting into minutes and seconds.
    # Rounding the seconds part on its own could yield e.g. '0m 60s' for a
    # duration of 59.6 s.
    train_minutes, train_seconds = divmod(round(train_time), 60)
    train_time_str = f'{train_minutes}m {train_seconds}s'

    # Make predictions on test set
    y_pred = model.predict(X_test)
    # second column of predict_proba is used as the score for ROC AUC
    y_pred_proba = model.predict_proba(X_test)[:,1]

    return {'model_name': model_name,
            'model_params': model_params,
            'data_desc': data_desc,
            'data_size': X.shape[0],
            'features_no': X.shape[1],
            'f1': round(f1_score(y_test, y_pred), 3),
            'acc': round(accuracy_score(y_test, y_pred), 3),
            'recall': round(recall_score(y_test, y_pred), 3),
            'prec': round(precision_score(y_test, y_pred), 3),
            'roc_auc': round(roc_auc_score(y_test, y_pred_proba), 3),
            'cf_matrix': confusion_matrix(y_test, y_pred),
            'train_time': train_time_str,
            'notes': notes}

In [3]:
def store_test_result(result):
    '''
    Add `result` as a new row at the end of the global test_results df.

    The row label is the current number of rows in test_results.
    '''
    next_row_label = len(test_results)
    test_results.loc[next_row_label] = result

## Load data

In [4]:
# Read the undersampled data set from csv; the "60_40" in the file name
# presumably refers to the nontoxic/toxic class ratio -- TODO confirm.
df = pd.read_csv('data/undersampled_data_60_40.csv')
# (rows, columns) of the loaded frame
df.shape

(360835, 6)

## Optional: Create smaller sample from data to speed up experiments

In [5]:
# Number of rows to draw for quick experiments.
# Set to None to run on the full data set instead.
sample_size = 25_000

# NOTE: this cell overwrites df. Re-running it samples from the already
# reduced frame, so re-run the data loading cell first.
if sample_size is not None:
    # ratio toxic/nontoxic
    tox_perc = 0.4
    nontox_perc = 0.6

    # number of toxic/nontoxic rows
    # round() instead of int(): the float product can land just below a
    # whole number and int() would then silently drop one row.
    sample_size_tox = round(sample_size * tox_perc)
    sample_size_nontox = round(sample_size * nontox_perc)

    # draw both classes separately (fixed seed for reproducibility)
    sample_tox = df[df['toxic'] == 1].sample(sample_size_tox,
                                             random_state=42)
    sample_nontox = df[df['toxic'] == 0].sample(sample_size_nontox,
                                                random_state=42)

    # toxic rows first, then nontoxic rows; original index labels are kept
    df = pd.concat([sample_tox, sample_nontox])
    print(f'Using sample ({df.shape[0]} rows).')

else:
    print(f'Using full data ({df.shape[0]} rows).')

Using sample (25000 rows).


## Drop rows with NaNs

In [6]:
# Remove every row that has a missing value in any column and report how
# many rows were lost.
rows_before = df.shape[0]
# the first number printed is the total row count before dropping
print("rows with NaN's before dropping:", rows_before)
# rebind instead of inplace=True so the frame is not mutated in place
df = df.dropna()
print('rows after:', df.shape[0])
print('rows dropped:', rows_before - df.shape[0])

rows with NaN's before dropping: 25000
rows after: 24957
rows dropped: 43


In [7]:
# column overview (non-null counts and dtypes) of the frame after dropping
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24957 entries, 1477 to 3711
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   comment_text           24957 non-null  object
 1   toxic                  24957 non-null  int64 
 2   stopwords_punct_lemma  24957 non-null  object
 3   vector_spacy           24957 non-null  object
 4   pos_tags               24957 non-null  object
 5   pos_tags_str           24957 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.3+ MB


In [8]:
# peek at the first rows of the frame
df.head()

Unnamed: 0,comment_text,toxic,stopwords_punct_lemma,vector_spacy,pos_tags,pos_tags_str
1477,Good. Let's hope the president listens to his...,1,good let hope president listen adviser inste...,[ 1.4473382 0.9890728 -2.8375092 0.658990...,"[('Good', 'JJ'), ('.', '.'), ('Let', 'VB'), (""...","JJ . VB POS VB DT NN VBZ TO PRP$ NNS RB , RB I..."
67285,Actually I'd consider people like Lindsey cons...,1,actually consider people like Lindsey conserva...,[-6.41708255e-01 5.59695244e-01 -1.90619397e+...,"[('Actually', 'RB'), ('I', 'PRP'), (""'d"", 'MD'...","RB PRP MD VB NNS IN NNP JJ , IN DT PRP VBP , R..."
98,Once again the left proves they are the scum o...,1,left prove scum Earth \n\n win election spend ...,[-2.0883 -0.19458179 -0.6892491 1.014075...,"[('Once', 'RB'), ('again', 'RB'), ('the', 'DT'...",RB RB DT NN VBZ PRP VBP DT NN IN DT NN . PRP M...
269922,And Trump should be fired if his tweets are in...,1,Trump fire tweet insensitive childish demente,[ 4.85373288e-01 2.90858328e-01 -1.57731009e+...,"[('And', 'CC'), ('Trump', 'NNP'), ('should', '...","CC NNP MD VB VBN IN PRP$ NNS VBP JJ , JJ , CC ..."
80321,Making decisions based on factors as nebulous ...,1,make decision base factor nebulous Paris Accor...,[-0.34122247 0.5101946 -2.1777446 0.028234...,"[('Making', 'VBG'), ('decisions', 'NNS'), ('ba...",VBG NNS VBN IN NNS RB JJ IN DT NNP NNP VBZ DT ...


## Create label/target variable and check for imbalance

In [9]:
# label column used as prediction target (0 = nontoxic, 1 = toxic)
target = df['toxic']

In [10]:
# absolute number of rows per class label
value_counts = target.value_counts()
nontoxic_count, toxic_count = value_counts.loc[0], value_counts.loc[1]

# share of each class in percent, rounded to one decimal place
nontoxic_perc = round(
    (nontoxic_count / (nontoxic_count + toxic_count)) * 100, 1)
toxic_perc = round(
    (toxic_count / (nontoxic_count + toxic_count)) * 100, 1)

print(f'Nontoxic (0): {nontoxic_count} ({nontoxic_perc} %)')
print(f'Toxic (1): {toxic_count} ({toxic_perc} %)')

Nontoxic (0): 14957 (59.9 %)
Toxic (1): 10000 (40.1 %)


## Create various corpora

### Raw corpus

In [11]:
# original comment text as stored in the 'comment_text' column
corp_raw = df['comment_text']
corp_raw.shape

(24957,)

### Pre-processed corpus

In [12]:
# pre-processed text; judging by the column name presumably stopwords and
# punctuation removed plus lemmatization -- TODO confirm
corp_pp = df['stopwords_punct_lemma']
corp_pp.shape

(24957,)

### Corpus of spacy vectors

In [13]:
# If smaller sample: Convert vector string in csv file to df
# and cast all cols as float. This takes ~50 min for the full 360,000 rows.
# --> If full data: Load pickle file to save time.

if sample_size is not None:
    corp_spacy = (
        df['vector_spacy']
        .str.strip('[]')          # drop the surrounding brackets
        .str.split(expand=True)   # split on whitespace, one column per value
        .astype('float')
    )
    # Kept for reference: writes the converted vectors to the pickle file
    # that the full-data branch reads (presumably only meant to be run on a
    # conversion of the full data).
    # with open('pickle/spacy_vectors.pkl', mode='wb') as f:
    #     pickle.dump(corp_spacy, f)

else:
    # NOTE: pickle.load can execute arbitrary code -- only load a file that
    # was created locally by this notebook.
    # NOTE(review): confirm the pickled frame has the same rows/index as df
    # after the NaN rows were dropped.
    with open('pickle/spacy_vectors.pkl', mode='rb') as f:
        corp_spacy = pickle.load(f)

display(corp_spacy)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
1477,1.447338,0.989073,-2.837509,0.658991,0.672710,1.191676,0.285207,2.426576,-1.692114,-0.176049,3.934427,1.150893,-2.356241,0.828215,-0.992825,0.729421,1.706436,-1.819391,0.097194,0.431922,0.255678,-1.266474,-0.450235,-2.383436,0.009307,-0.240100,-1.485890,-0.873162,-0.050700,1.801850,0.228427,-1.292545,0.655094,-0.801870,-0.541164,0.219838,-1.053595,0.318381,1.679943,2.140065,-1.655223,0.311242,1.078556,-0.442303,-0.744830,0.782592,1.409505,-3.122552,-0.567920,1.365632,-1.327211,0.018819,0.639722,-2.035966,0.039106,-0.378086,-0.113403,1.566775,0.674308,0.935797,2.073059,-0.678601,1.153684,-1.253865,0.296068,1.032703,-1.863666,-3.160026,0.633379,0.949073,2.227246,-0.222558,-0.692850,-0.598044,0.661155,1.116506,-1.085037,1.812174,0.102856,-2.265764,-2.748927,-2.635322,1.254545,0.473484,-0.894685,0.560620,0.064153,-2.546222,1.218625,-0.985504,-0.081563,1.232141,1.038742,-2.360090,1.366507,-2.262342,0.024480,0.120594,0.356569,0.199665,1.181463,1.211473,0.240170,1.895987,-1.337040,0.415248,-2.096530,-0.994066,-0.162791,-1.933663,0.716135,-1.745517,-0.585685,-1.577185,-0.744824,2.713183,-3.170229,-1.943473,0.947423,0.017556,-0.923856,-1.776971,-1.686727,0.173288,-0.638135,-0.346419,3.243218,-0.827596,1.880518,-1.020007,-1.331072,-0.085317,1.712556,-0.242993,-1.069617,-0.945440,-1.918320,-0.973944,2.510781,-1.680616,-2.636132,-0.691855,0.231941,1.484493,-1.024885,-1.107356,-2.590645,0.679043,-0.559594,-0.519887,-1.352685,0.725248,0.410003,1.080092,-1.988682,2.303114,3.084788,0.310003,-0.553435,-2.390043,0.224718,-1.740911,-1.353859,-0.130484,-0.110978,-0.661164,-0.839274,-0.202948,-0.819933,-1.205320,0.651909,0.534960,1.434445,-0.481054,0.601297,-0.603026,-1.133857,0.355430,-0.495364,-0.707020,1.155727,0.389672,2.976284,-0.203934,0.632264,0.594518,1.606209,-2.612810,1.469768,2.093344,1.044416,1.166731,-0.741649,0.625016,-3.061623,-1.348242,-2.119584,0.506404,-0.549953,1.813935,-1.955853,-0.183054,-0.938769,-1.985775,-2.123790,-0.154973,0.126572,0.387217,-0.911598,0.7293
71,0.330887,0.848217,-0.450927,-0.131697,0.103526,2.735063,-0.264879,-1.151768,0.509372,-0.359525,-1.330164,0.532508,0.564131,0.786888,-0.064036,-0.620366,-0.116083,1.616256,1.487243,1.326860,-0.664096,-3.354927,-1.936418,-1.481669,-0.025682,-0.459806,-1.747816,-0.333415,0.532155,-0.980654,0.216045,0.187075,2.790808,-0.903923,-0.485790,1.244636,-0.749542,-0.568840,0.947629,1.091582,0.024955,-1.113088,-3.488241,-0.628379,1.385689,-2.668185,0.719956,-1.036315,-0.729682,1.257612,-1.230474,1.091475,1.178266,0.170564,2.185019,0.281890,0.478311,0.386657,-2.397473,-0.324701,0.633647,0.588723,-0.340915,-1.158466,-0.022949,-1.624240,2.472658,-2.356169,-1.306742,2.624767,-1.245684,0.180542,1.080140,0.618075,1.281191,0.733621,1.017922,0.498706,-2.452836,-0.123139,0.440162,-1.283612,-0.277114,-0.510127,-0.449223,-0.259792,-0.252739,1.534381,-2.858636,1.419309
67285,-0.641708,0.559695,-1.906194,-0.129815,0.921230,1.241203,1.081625,2.691928,-2.080152,-0.580781,4.053964,1.216598,-2.720687,1.546386,0.538366,0.377215,1.950860,-0.905412,-0.824052,-0.386185,-0.435925,0.827620,-1.012597,-1.695105,-0.283881,-0.591340,-1.036601,-0.645482,-0.407817,2.053149,0.809945,-0.319948,0.260749,-1.005433,-0.323071,-1.272172,-0.049698,0.595434,1.316050,1.764902,-1.350003,-0.762148,0.998031,0.318677,-1.303109,1.746911,1.662343,-1.911362,-0.588617,0.918016,0.045040,1.240582,0.275309,-3.219471,-1.115262,0.066075,-0.794652,0.980209,1.017157,0.530856,1.628347,-0.134305,-1.127451,-1.401372,1.361686,1.561053,-2.498920,-3.110382,0.936345,1.946923,0.351189,0.422724,-0.795529,0.250838,-0.341553,0.900209,-1.777663,1.040262,-0.595483,0.161045,-2.463884,-0.523775,0.970330,0.679265,0.675702,0.154777,-0.376811,-1.783485,0.888975,0.167873,-0.154192,0.396165,0.225207,-3.252264,0.406389,-0.969486,0.635686,-0.429083,0.103791,-0.109948,1.943618,0.598800,0.753389,2.005089,-1.372546,2.445097,-1.615847,-1.558905,-0.804705,-1.301978,0.947151,-0.612800,-1.215500,-0.867176,0.390822,2.844118,-2.260658,-1.880090,0.375071,-0.676644,-2.027436,-1.484894,-1.054893,0.322372,-1.174772,-1.256130,1.403755,-1.277549,1.964653,-0.619997,-1.336609,0.057314,2.150018,-0.239763,-0.957795,-0.371390,-1.453224,-0.682527,1.885311,-1.398921,-1.587937,-0.660967,0.609064,1.018989,-0.297128,-0.362711,-2.789269,-0.171274,-0.019145,0.581243,-0.812334,1.844387,0.689036,1.378456,-0.980862,1.327489,2.751821,0.301462,-0.792278,-1.567384,-0.441471,-1.722630,-0.637397,0.593217,-1.517309,-0.713457,-1.675487,0.601597,-0.002233,-0.302797,0.874462,0.493818,1.735568,-0.062811,0.994157,0.064485,-0.530198,0.913773,-0.560685,-1.397474,-0.337096,0.068363,2.987213,-0.625894,-0.435569,-0.372499,-0.435745,-1.485347,1.870980,1.887902,-0.701938,0.422395,-1.129990,-1.311018,-0.991696,-0.877667,-2.043435,-0.446952,-0.862154,1.358885,-1.142738,-0.557820,-0.866395,-0.949504,0.668735,0.018687,-1.938159,0.249216,-0.3749
95,-0.643682,0.702955,-0.089370,-1.202125,0.479759,-0.552893,1.273943,0.576265,-1.968608,-0.579406,0.670126,-0.771556,-0.316559,0.186584,0.670175,-0.094056,-0.119557,-0.277650,1.576381,2.120179,0.881033,0.616748,-2.036428,-1.060433,0.199089,1.000322,0.607119,-0.748930,0.833689,0.327063,0.150025,-1.566661,-0.581461,1.005687,0.693677,-0.789508,-0.092286,-1.788872,0.383759,0.854857,0.635942,-0.099746,-1.421498,-2.784177,0.361702,1.441078,-3.287505,0.712903,0.207864,-0.565443,0.884035,-0.653574,1.929151,1.087981,0.422561,1.761921,0.295425,-0.129609,1.166089,-2.366753,-0.308218,0.785025,-0.752789,-0.047040,-0.450340,0.048715,-1.384662,1.242143,-1.988607,-1.057131,2.242220,-0.285853,-0.153432,1.041336,0.005016,1.551370,0.089854,0.725906,0.724910,-1.814196,-0.241015,1.335800,-1.430822,-0.390108,-1.062682,-1.251803,-0.966164,0.740553,-0.001408,-2.858310,1.505569
98,-2.088300,-0.194582,-0.689249,1.014075,1.773172,3.204544,-0.009884,3.116564,0.388368,0.508977,2.814546,-0.886884,-0.977305,1.944135,0.709916,3.014483,0.437692,-2.490436,0.596147,0.524964,0.236191,0.595766,-1.053060,-0.657445,1.018215,0.626356,-2.000727,-1.641607,0.706537,2.513732,2.579704,-0.131070,1.465003,-2.414146,0.753464,-0.262798,-0.349287,1.205424,0.327816,2.656066,-1.401441,1.783595,1.799254,2.584335,-1.783796,0.845610,0.440616,-0.806385,-0.054345,-1.161086,0.239025,-0.186403,0.647564,-1.277864,0.006267,0.740654,-2.395252,-0.587975,2.537822,0.961738,-0.737171,0.802264,-2.263034,-1.301966,3.496786,0.893983,-2.697500,0.765166,0.045515,3.164285,2.223736,-1.319692,1.520556,1.871166,0.102075,-0.482205,2.202736,-0.379663,-1.427367,1.077260,-1.589700,-1.497924,1.775265,0.131104,0.673719,-0.762938,-0.054491,-2.693591,0.322740,0.514098,0.378534,2.005421,0.055121,-2.274734,2.496840,-1.161231,-1.136406,-1.063191,0.202662,-0.264679,0.143142,-1.880804,1.519017,2.778037,-1.243490,1.688700,-2.817066,-0.326985,-1.565758,-0.602642,0.093739,-1.396796,-2.217502,0.375348,-0.946768,2.703766,-2.814725,-0.439471,-0.618791,2.708299,-1.385497,-2.374185,0.268575,-1.606038,-0.011414,1.876164,2.326956,0.555825,0.564104,-1.010089,1.658002,-0.705465,1.463656,-0.180624,-0.768512,0.183623,-1.029494,2.434602,1.757845,-2.303630,-1.809953,1.457896,0.198293,1.886444,-0.070536,2.324221,-5.630461,-3.481980,0.082216,2.439018,-2.580012,1.230351,0.874606,2.408608,-1.466031,1.491643,4.286537,-0.650080,-1.011437,-1.087992,-0.450998,-1.500569,-1.186444,-0.458922,-2.021203,1.221632,-1.671036,0.107586,-0.973570,1.901155,0.621126,3.110109,1.041064,-0.618848,-0.400724,-0.250616,-2.863729,0.089846,-0.528863,0.288667,-0.689752,-0.595298,0.574541,1.098430,-1.949525,1.459457,0.678459,-0.643005,3.424594,2.785467,1.326371,0.088840,-3.554291,-1.832382,-0.650698,-1.648882,-0.552808,0.325951,-3.035421,2.875274,-1.525783,1.175344,-0.196075,-2.268625,-1.112288,0.438348,-1.257121,-2.170716,-0.185285,2.281588,-0.67
7255,-0.204146,-1.905604,4.441535,-0.476086,2.089861,0.528209,0.750300,-1.975911,-0.040623,-0.729591,0.397897,0.994398,-0.218786,-2.330818,0.284148,-0.120312,3.054300,1.677118,0.629371,-0.343409,-2.290502,-1.330998,2.941239,0.087452,1.052761,0.236777,-1.090527,-1.035894,-2.021767,-1.806500,-0.241798,-1.293100,1.375436,-1.116905,-0.146182,-1.162477,1.162166,1.212161,1.684926,-2.147442,-1.699450,-1.602096,1.320073,1.687590,-3.667754,0.616529,0.207682,2.663318,1.983715,-0.052491,-0.212525,1.005236,-0.384604,0.398221,-1.374366,0.330639,-1.534500,-3.808591,1.312316,0.079674,-0.639030,0.488965,0.136836,0.442858,-1.121584,0.069934,-1.073073,0.338204,2.259819,-1.646480,0.836395,-0.825496,-2.105513,2.820996,-0.063946,1.359465,0.211531,-3.132519,-1.103457,2.731525,-1.003369,-3.412262,4.049537,-1.151525,0.746232,-0.053162,0.538585,-0.274951,-0.817576
269922,0.485373,0.290858,-1.577310,1.109838,3.010221,0.237242,-1.666135,2.027884,-1.671268,0.616792,1.669402,0.434970,-2.985588,-0.274540,0.348060,0.620232,-0.233218,-1.538728,-1.774825,2.942680,2.070350,1.166346,0.929440,-1.017099,-1.147643,-1.110183,0.058933,-0.487133,-0.194907,0.069692,-0.675028,0.765750,-0.666028,2.278764,-1.671470,-2.825500,-1.911503,1.780077,1.621162,1.199935,-0.777713,-2.940166,1.112387,0.086718,-1.216492,0.198987,-0.203033,-2.096447,-1.268285,4.658534,-1.450388,1.312050,1.370017,-1.967800,-0.899040,0.400387,1.204800,-0.203901,1.696218,2.219708,0.285052,0.572715,0.047620,-1.138459,1.620785,1.495903,-2.330093,0.127453,1.343067,2.113723,2.233632,0.485215,-1.199350,2.108188,0.361000,-1.148328,-1.866379,1.704555,-1.787408,-0.182005,-2.261407,-0.780807,1.309065,0.581272,-0.952652,0.748375,-0.161217,-1.665400,2.275822,0.659862,-0.779388,0.870593,2.105128,-1.345227,2.081090,-0.622425,2.322078,0.220838,1.632067,-1.239062,-0.479842,2.682250,2.086167,0.496167,0.848242,0.817010,-0.102350,-1.716269,0.006747,-3.484163,-1.811855,-1.848178,0.959814,0.007243,1.588555,0.841333,-1.747068,-1.426223,-0.884980,2.326225,-0.467855,-3.689017,1.237145,0.774288,-0.159963,-1.642338,2.670385,1.748182,-2.140275,-0.456292,-0.138535,-0.079933,1.911350,0.229667,-2.778690,1.856335,0.385748,-0.026120,3.122566,0.118477,-0.473315,-1.272247,-0.803783,0.647193,-1.767533,-2.452450,-2.600583,2.011895,0.695562,0.307952,-0.567310,3.183060,0.892772,2.375633,1.095320,1.123835,-0.175825,-0.744677,-1.943117,-0.894328,0.535587,-2.341947,-1.129515,0.669365,-0.606657,-2.347683,-1.621855,0.734412,-0.896608,1.774467,-1.996738,0.930745,1.189523,-2.275183,0.829447,0.429328,0.848558,-0.191607,-0.904453,1.065255,-0.149575,-1.706700,-0.582550,2.071266,0.038056,0.298657,-1.434930,-0.933881,1.652857,-1.858140,1.451860,1.630790,-1.760023,1.935267,-0.614947,-2.032509,-0.986445,1.674217,-0.921390,2.125978,-3.332250,-0.111278,0.095485,-0.368485,-0.715500,1.092128,0.778465,0.131995,-1.735367,0.715217,0.5
77928,1.375294,-1.508188,-0.894548,0.788485,0.760408,2.591650,0.623192,-1.160137,0.320358,0.160559,0.031065,0.467052,0.564815,-1.258070,0.128130,-0.777728,1.244356,-0.228667,-0.443666,0.043183,-0.302460,-1.042821,0.509880,0.001570,-0.936157,-0.848847,0.401693,0.550800,0.601055,0.913905,0.965305,2.513132,-0.030068,-0.167463,1.887603,-0.199861,-1.348290,-1.045663,1.245507,1.987355,-0.682812,-2.797700,-1.365405,-0.850090,-1.391487,0.339250,0.431745,-1.446608,1.217202,-0.256433,0.750813,1.339217,1.048544,0.265984,0.490180,0.342815,3.565459,-3.463783,-0.669497,-0.195283,0.377612,1.225522,-2.894000,-1.031373,0.665877,-0.574625,-1.512212,0.140077,3.472112,-1.398305,-0.159270,-0.549358,0.977245,1.206043,-0.204225,0.122187,-2.215865,-0.232200,1.090988,0.860588,-0.945400,1.500597,0.089817,-1.819740,-0.506423,3.582935,1.170233,-1.791433,2.302200
80321,-0.341222,0.510195,-2.177745,0.028234,1.670810,1.041606,1.163935,2.346615,-1.356198,-0.287730,4.531282,1.228048,-2.545145,1.784957,0.020748,0.881745,1.591349,-0.503128,-1.518800,-0.466977,0.027097,0.054692,-1.667533,-0.594114,-0.411003,-0.517398,-1.632945,-1.071501,-0.033597,1.275333,0.711488,0.194223,0.532764,-0.843543,-0.131587,-0.451721,-0.373905,0.974440,1.191469,2.673757,-1.240756,-0.537135,0.196645,0.059694,-1.813210,0.887415,2.358977,-1.220046,-0.520774,0.647612,0.062012,1.209792,0.009356,-2.887178,-0.181850,0.528154,-0.625046,1.420263,0.659794,-0.181478,1.910737,-0.276304,-0.794814,-1.300410,1.118947,0.993526,-2.443764,-2.535781,0.381211,2.543227,0.274698,0.203838,-0.026649,-0.174528,-0.447133,1.705890,-0.372711,0.965664,-0.372593,0.084350,-2.377938,-0.157200,0.749310,0.654460,0.358833,-0.512890,-0.098796,-2.471838,1.256974,0.361488,-0.872305,0.392323,0.244845,-2.243867,0.933716,-1.015196,0.998939,-1.083738,-0.284999,-0.194853,1.636311,0.926832,0.801874,2.140235,-0.584190,2.582843,-1.334259,-1.215215,-0.325277,-0.770394,1.225500,-1.244650,-0.725100,-0.498113,-0.809644,2.297333,-1.867468,-1.846528,0.175896,-0.300852,-1.531850,-1.378138,-0.849700,0.517417,-0.558113,-1.026066,0.697798,-1.076006,1.554751,-1.284235,-1.897473,0.051787,1.923781,0.206659,-0.873046,-1.029276,-0.979418,-0.950737,1.241175,-1.234997,-0.859799,-0.593105,0.360818,1.218256,-0.243398,0.586997,-2.128582,-0.155304,0.313364,0.644542,-1.397332,0.876566,0.577048,1.534113,-0.610013,1.221732,2.521771,0.589325,0.102555,-2.040551,0.285105,-1.121815,-0.207116,0.369356,-0.931610,0.205453,-1.867797,0.318097,0.329185,-0.440655,1.179753,-0.035059,0.751111,-0.514401,1.118687,-0.160087,0.108431,0.479338,-0.490195,-1.207178,-0.239022,1.803797,1.784577,-0.912062,-0.922601,0.082303,0.382894,-0.273904,0.217897,2.136021,0.826717,-0.460666,-1.733402,-0.946171,-1.097571,-0.012502,-2.371713,-0.807026,-0.082803,1.170299,-1.063237,-1.150945,-0.665705,-1.179986,0.663102,0.257260,-1.682671,0.624991,0.438092,-0.6
31582,1.230171,-0.519547,-0.450465,0.483710,0.545905,0.961511,0.775446,-1.764574,-0.982833,1.140171,-0.858910,-0.338979,0.396900,0.206901,-0.047820,-1.602112,-0.029350,2.189462,1.538109,0.627186,0.515154,-1.566857,-0.164536,0.178978,0.733653,0.482859,-0.695337,0.080599,0.849127,-0.223598,-1.170713,-0.298158,-0.109342,0.967557,-1.122081,0.209417,-1.480079,0.020078,0.892134,0.734105,0.777269,-0.966357,-2.709950,-0.380757,1.144748,-2.436119,-0.065285,-0.839959,0.405269,0.361628,-1.012419,3.058356,0.665879,1.559467,1.906535,0.617882,0.970765,1.425065,-2.212525,0.076046,-0.290060,0.041588,1.129021,0.315695,-0.037246,-1.364837,0.795027,-2.005992,-1.165561,1.245361,1.060770,0.186754,1.583152,-0.351991,2.342611,-0.309402,0.907461,0.278403,-1.882816,-0.316340,0.508376,-1.184318,0.428536,0.335660,-1.116694,0.753416,-0.020729,0.142670,-1.600680,1.207671
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209271,0.289834,-0.040371,-0.322116,0.351184,1.877884,0.462042,0.634225,2.867700,-3.206262,0.327831,4.925367,2.062258,-3.947482,1.998897,0.428703,0.499570,2.204713,0.230369,-1.597188,0.723622,-0.232290,0.173031,-1.363264,-0.460356,-1.220776,-1.115976,-1.452259,-0.223026,-0.886094,1.212274,0.959262,0.104370,-1.277491,-0.581313,0.916429,-0.730012,0.396292,0.188708,3.113662,1.723380,-0.430541,0.451831,0.201586,-0.094822,-1.364909,1.132819,2.668163,-2.777118,-0.344981,-0.062408,0.943112,1.665920,0.020396,-4.121822,-1.972898,0.578219,-0.857702,1.405476,1.556937,0.167740,1.137452,1.404280,-2.268905,-2.102144,0.850453,2.443208,-1.761117,-2.620761,0.433854,3.123202,-0.024756,-0.083203,-0.815950,0.095502,-0.267975,1.734566,-1.004536,0.712514,-1.926916,0.025770,-2.571293,-0.544190,0.638964,1.165985,0.350051,0.618337,0.360010,-1.749083,0.699056,0.534002,-0.383060,1.759624,0.578285,-2.080869,1.038772,-1.352283,0.216697,-1.815452,-0.094986,0.523614,1.476315,1.558815,1.096685,1.412594,-0.944814,3.574575,-2.294637,-1.855954,-0.422423,-2.112584,1.546260,-1.662810,0.084961,-0.567918,0.154899,1.787246,-2.544911,-1.779416,0.354336,-1.098107,-2.545237,-1.930448,-0.528558,1.452305,0.108778,-1.665588,1.485971,-1.465891,0.967002,-1.341807,-1.845048,0.793712,1.878193,0.125987,0.064135,-0.625873,-1.494665,-1.765929,2.504301,-0.405521,-1.214347,-0.148976,0.377241,1.121717,-0.969946,-0.704148,-2.502099,-0.658608,0.140177,1.239331,-0.549474,1.322069,0.728142,1.284398,-0.883777,0.353554,2.570762,0.663212,-0.866254,-1.708422,-1.168942,-0.814279,-0.641322,1.080864,-1.900997,0.191410,-2.215010,0.162327,-0.489435,-1.028443,0.476995,0.972487,1.141407,0.039714,1.864668,0.544366,0.227396,0.815797,-1.044101,-1.097521,0.087050,0.599949,1.949470,-0.595559,-1.303917,0.808671,-0.670429,-2.708363,1.193097,2.001992,-0.643598,0.626015,-1.725012,-0.644585,-0.289121,-0.468578,-2.801064,-0.685391,-0.223885,2.194937,-2.843623,-0.532870,-1.528488,-2.109272,2.060366,0.885509,-2.105752,1.187835,0.294565,-0.417229,0.
960969,-0.079183,-0.278117,0.707905,-1.052472,1.441398,1.645565,-3.174458,-0.579522,1.148096,-2.170195,0.405431,0.039818,-0.062520,0.097497,-1.648367,-0.785594,1.492688,1.619324,0.714499,0.121226,-2.856379,-1.168672,0.393869,0.789140,1.163526,0.019301,1.619020,0.236707,0.563417,-2.067912,-1.511568,0.233620,0.756668,-0.597945,0.499239,-2.091843,-0.045241,1.157655,1.040493,0.521351,-1.211279,-3.006891,-0.971207,1.053567,-3.042447,0.252376,-0.355260,0.094916,1.280193,0.178105,2.902768,1.000295,1.179766,2.306987,-0.094957,0.724253,1.529103,-2.695862,-0.707566,-0.068303,-0.494303,0.165885,-1.507798,-0.380113,-1.822710,1.322591,-1.448293,-1.599578,1.916442,0.642716,-0.358932,0.924995,-0.547249,2.862751,-0.067404,1.883625,0.857290,-2.281671,-0.048812,2.271666,-0.019929,-0.166668,0.833213,-0.714422,-0.452875,0.459604,-1.328852,-2.414525,1.335442
139385,0.118397,1.109121,-1.677288,-1.812025,0.331650,0.448095,2.193981,1.567212,-2.150405,-0.039439,3.734215,0.660572,-2.651353,0.379149,1.660669,0.075260,3.045620,-1.544117,-0.341109,0.418609,0.448932,0.679732,-0.751107,-1.874789,-1.160761,-0.314798,-2.127038,-1.069670,0.063003,1.991600,0.702284,-0.028346,-0.262976,-0.856451,0.615329,-0.145536,-0.449365,1.528547,2.629910,1.633307,-0.545104,-0.948345,1.051321,-1.180808,-0.348253,0.591379,0.745007,-2.022793,0.416485,0.607254,0.588985,1.105839,0.221700,-2.575386,-1.373498,0.971558,-0.804942,1.512793,1.505624,0.661722,1.776984,-0.441986,-1.836420,-0.874738,-0.008586,2.131899,-4.313111,-2.366850,-0.100274,1.989803,0.141722,-0.224773,-0.885441,-1.926042,-0.045381,-0.033131,-0.252615,-0.336090,-0.618095,0.646361,-1.725629,0.362868,1.859792,-0.364100,0.500261,0.478603,0.872727,-0.942548,0.820770,-0.245989,-1.663196,2.082642,0.723229,-1.560254,0.723804,-0.991028,2.084427,-2.151832,0.652868,-0.578417,0.412028,1.119422,1.877758,1.220596,-0.665860,1.246134,-1.933756,-0.202664,0.609783,-0.180104,2.242011,-1.179028,0.174302,-0.404870,0.394508,2.201390,-3.046810,-0.930733,-0.253010,-1.063194,-2.157946,-1.464788,-1.398467,0.897966,-0.333275,-1.146628,2.169700,-0.162328,1.830575,0.415421,-0.338615,1.143577,1.308259,-0.893764,0.561969,-0.370790,-2.283133,-0.185090,3.059461,-1.641603,-0.879253,1.005649,-1.119582,1.081637,-2.047978,0.255802,-2.278579,-0.595007,0.219327,1.419133,0.144588,0.341768,1.437888,1.871484,-0.139717,0.439010,2.734780,0.745211,-0.085059,-0.331332,0.203623,-1.155149,-0.402593,-0.103585,-2.271774,0.539598,0.176537,0.024430,-1.263247,-0.604274,0.006508,0.529020,-0.304121,0.145244,0.637546,-0.054397,-0.530280,0.876450,0.451152,-1.306836,0.158468,1.102999,0.903626,0.188448,-0.947602,0.921565,-0.462821,-2.229343,1.064226,0.913553,-1.433540,0.307063,-2.123221,0.240653,0.160052,-2.128429,-1.814632,0.653145,-1.768275,2.587825,-1.469909,-0.459790,-1.130269,-2.460169,0.663942,1.651890,-0.375718,0.624926,-0.768209,-0.15415
5,1.603428,-0.972500,0.428149,-0.776997,-0.351909,0.706516,1.403749,-0.422546,-0.935176,-0.697312,-0.683820,-0.608867,-0.151644,1.298856,-1.058618,-0.578221,-0.786231,-0.333041,0.932998,0.593599,-1.508400,-2.012998,0.493276,-0.357243,-0.852029,1.894236,0.487566,2.319524,1.230454,-0.318895,-2.330665,-1.613572,-0.641000,-0.224352,-0.659948,-0.522598,-1.270705,0.177350,1.001200,1.417220,-0.198069,-1.177000,-3.358267,-0.388366,0.669764,-0.846588,1.538986,0.296863,-0.255414,1.030626,-0.284649,1.564795,0.241374,-0.517026,0.474875,-0.762768,-0.132060,0.255511,-3.017593,-1.482509,0.761307,-0.767599,-1.382510,-1.670958,0.030008,-1.760331,1.205141,-1.321691,-0.804756,0.721719,-1.620888,-0.207834,0.853436,-1.251607,1.929956,-0.077506,0.812977,0.286892,-1.704327,0.034386,0.840752,-1.278545,0.393682,-0.360267,-0.553247,-1.136830,1.348012,-0.239443,-2.162249,-0.086022
277569,-1.541235,0.175233,-0.793006,0.739173,1.539298,0.070717,1.229869,1.895385,-1.183585,0.064576,3.097919,0.551655,-2.228423,0.952214,0.567169,0.979816,0.928214,0.870011,-0.124880,0.140302,0.194633,-0.586981,-0.155493,-0.634073,0.573204,-1.046775,-1.092088,-0.561637,-0.159125,0.533953,0.616702,1.042315,0.004940,-0.865983,-1.783101,-0.797977,-0.061452,1.062492,0.558546,0.631091,-1.048765,0.103720,0.261045,0.613898,-0.887509,0.324988,1.025189,-0.969267,-0.096546,1.203371,-0.703189,1.360113,1.211514,-1.312090,-0.359717,-0.503746,-0.162644,0.170399,0.650988,-0.470669,-0.013881,-0.041557,-0.399366,-0.731630,1.985383,0.758443,-1.036676,-1.592128,0.270545,2.296951,0.565142,-1.247929,-0.590719,-0.429960,0.092568,1.544740,-1.160462,1.245018,-1.548574,0.078922,-2.306812,-0.565780,0.408645,0.717870,0.955399,0.987217,0.097151,-1.892105,-0.363007,0.357082,-0.482477,0.348432,0.276259,-1.749404,0.551083,-0.672578,0.090007,-0.565810,0.575354,0.754273,1.346739,0.598430,0.875078,1.361645,-0.283448,1.493795,0.022434,-1.032996,-0.074080,-2.525211,-0.156872,-0.845096,-1.465626,-0.136429,0.845039,1.112576,-1.007119,-1.314314,-0.918308,0.173363,-0.700935,-1.418803,0.422350,-0.562712,-0.900919,-1.270405,1.371793,-1.198026,2.699546,-0.980981,-1.475009,-0.016382,2.196066,-1.023076,-0.847837,-0.199186,-1.076104,-0.039742,2.420853,-0.829545,-1.217761,0.126630,0.024384,0.683461,0.274310,-0.040206,-3.156538,-0.012219,-0.350222,0.426913,0.300346,1.354074,0.627517,1.673161,-1.623128,1.540025,1.537750,-0.385835,-0.926583,-1.153391,-0.290234,-2.167664,-0.237076,0.238672,-1.851959,0.032868,-2.055028,-0.122794,-0.037431,-1.045172,0.811012,0.400058,1.269907,-0.861611,1.369161,0.099638,0.000579,-0.081587,-0.404590,-1.410094,-0.182317,0.277590,0.487406,0.766052,0.000748,-0.026781,0.521783,-0.854157,1.076984,2.491236,0.144612,-1.006517,-0.857115,0.404358,-0.331884,-0.694043,-0.701646,-0.163830,-0.089766,1.732681,-0.781401,-0.228810,-0.707161,-0.856083,0.951036,0.479320,-0.133731,-0.153588,0.148102,0.19
5972,-0.681871,-0.353582,-0.381592,1.321544,-0.304727,1.030466,0.755089,0.289352,-0.540071,0.106720,-1.035486,0.248403,0.191651,0.665364,-0.745423,-0.260132,-0.602532,2.005052,0.663840,0.012752,1.291116,-1.513654,-0.772496,-0.010702,0.894833,0.619875,-0.645380,0.741932,0.126711,0.194913,-0.576264,0.193363,0.553234,0.806461,0.019130,0.043808,-0.722238,-0.041715,1.030582,1.296391,-0.617875,-1.441526,-2.657742,-0.687432,0.529615,-1.989931,0.287366,-1.292363,-0.176090,0.725388,-0.333935,1.490623,0.665890,0.226178,0.653739,0.599284,0.378539,0.520465,-1.917525,-0.653579,1.262889,-0.946040,-0.532401,-0.886072,0.979066,-0.297980,0.478383,-1.537304,-0.919427,1.140136,0.828457,-0.002171,0.008924,-0.199975,1.724013,0.087073,0.714379,0.995919,-1.868982,-0.430459,0.988014,-1.223383,-0.174327,-0.045537,-1.274414,0.011453,0.789338,-0.356653,-1.113665,0.996951
80172,-1.225687,-0.098471,-1.934309,-1.651363,1.852539,1.603341,2.248695,1.467571,-0.442029,-0.010313,2.091521,0.334661,-1.031543,0.930621,0.863144,0.707086,1.416704,-1.249160,0.113143,2.389217,-0.136226,-0.069479,-0.936976,-0.267828,-0.071707,-0.758396,-1.719043,-1.467535,0.306569,1.002697,1.591058,-0.267820,1.227852,-0.099632,-0.366877,0.666469,1.077373,0.755405,0.514448,1.967334,-2.779780,-0.550636,-0.593446,0.660816,-1.338166,-0.111008,-0.411015,-2.851682,0.330085,0.773345,-0.143037,0.889715,1.402730,-2.298717,-1.392155,-0.460210,1.465152,-0.136232,0.373021,1.567817,2.025820,-0.384619,-0.085050,-1.024218,1.790975,-0.875146,-1.091480,-1.626961,-0.141084,1.071607,0.607148,0.509970,-0.179434,-1.133245,-0.115250,0.380086,0.411769,1.771670,0.510082,0.441459,-1.322677,0.420041,1.928552,0.189891,0.986676,0.214715,-0.613117,-1.036410,0.762175,-1.339390,-0.231851,0.145659,0.597368,-2.030222,0.740584,-1.713482,-0.965368,0.704080,0.538228,-1.328918,2.338451,1.049053,0.887333,0.214748,-2.157653,0.715748,-0.875553,-0.616944,-1.038556,-1.030650,-0.846408,-1.087513,-2.004720,-0.601329,0.403827,2.109346,-2.694010,-2.362698,0.933833,0.564246,-0.157636,-2.030070,-1.529550,1.404258,0.059594,0.417634,1.890822,-0.460619,0.962488,-0.639838,-1.655982,-0.625520,0.229868,-1.582422,-0.618394,0.369614,-0.946956,-0.162548,1.585780,-1.958533,-1.604966,0.362881,0.481952,1.466753,-0.337474,0.287821,-3.038438,-1.493207,-0.999354,0.230678,-0.221586,0.837258,1.101240,1.912296,-1.069929,0.812331,1.864262,0.979129,-1.639338,0.578322,0.750291,-0.243933,-1.163529,-0.052524,-2.227056,-1.146458,-2.081175,0.237102,0.005277,0.325975,0.438030,0.487707,2.165734,-0.451930,1.273238,-0.518611,-0.800589,0.649844,0.294086,-1.437873,0.163210,-0.035135,1.409285,-0.000598,-0.789683,-0.682131,0.949607,-0.903827,1.989360,1.216859,-0.221967,-0.457766,-1.646260,0.388045,0.418438,-1.258313,-1.474682,0.334412,-1.452146,2.255529,-1.146345,0.560251,-0.415926,0.263164,0.159458,-0.015485,-1.119736,-0.711285,-0.043849,0.716
682,-0.158008,-0.876208,-1.063150,0.492794,0.713633,0.440569,0.782778,0.478673,-0.106094,0.706139,-0.592736,-1.027307,1.880137,1.111603,-1.880890,0.358996,-0.320340,1.630530,1.251240,1.487716,-0.105483,-0.763297,-0.426077,0.628646,-0.120351,1.607040,-1.654252,1.608269,-1.381573,-0.583874,-1.104126,-0.534259,0.606653,1.512677,-0.535744,-1.199274,0.010330,0.009739,0.829072,1.157700,0.556370,-1.229123,-2.529040,0.468668,1.570638,-1.837408,0.922997,-1.182858,0.663490,0.202262,-0.661705,1.589341,0.219236,-0.773867,0.246830,0.248193,0.216004,0.870043,-3.134130,0.568428,0.439080,-0.176664,-0.678542,-0.428576,-0.863393,-1.456634,0.644144,-1.302753,-0.764837,2.819664,-1.136054,-0.679845,1.698389,-0.901757,1.006468,-0.337365,0.379254,0.569129,-1.266482,1.245100,2.120600,-1.240791,0.228663,1.007253,-0.455588,-0.284255,1.114375,2.130364,-2.267570,1.180186


### Bag of words (default)

In [14]:
# Default bag of words: fit a CountVectorizer (default settings) on corp_raw
# and keep the resulting document-term matrix for the experiments below.
vect_bow = CountVectorizer()
corp_bow = vect_bow.fit_transform(raw_documents=corp_raw)
corp_bow

<24957x41168 sparse matrix of type '<class 'numpy.int64'>'
	with 954037 stored elements in Compressed Sparse Row format>

In [15]:
# Densify only a narrow window of columns for display: converting the whole
# sparse matrix to a dense array crashes the kernel.
n_words = 100
feature_window = slice(10000, 10000 + n_words)  # same window for data and labels
pd.DataFrame(corp_bow[:, feature_window].toarray(),
             columns=vect_bow.get_feature_names_out()[feature_window])

Unnamed: 0,defenceman,defences,defend,defendant,defendants,defended,defender,defenders,defending,defends,defense,defenseless,defenseman,defensemen,defenses,defensive,defensively,defer,deference,deferential,defermements,deferment,deferments,deferral,deferrals,deferred,deferred_action_for_childhood_arrivals,deferring,defiance,defiant,deficiences,deficiencies,deficiency,deficient,deficit,deficits,defied,defies,defile,define,defined,defines,defining,definite,definitely,definition,definitions,definitive,definitively,deflated,deflationary,deflect,deflected,deflecting,deflection,deflections,deflects,deformed,defraud,defrauded,defrauding,defrauds,defrock,defrocked,defrocking,defrocks,defunct,defund,defunded,defunding,defuse,defy,defying,degeneracy,degenerate,degenerated,degenerates,degenerating,degenerative,degette,degettes,degollado,degradation,degrade,degraded,degrading,degrassi,degree,degreed,degrees,dehavilland,dehumanize,dehumanized,dehumanizes,dehumanizing,dehydration,dei,deified,deigned,deity
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24952,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
24953,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
24954,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
24955,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Bag of words (binary)

In [16]:
# Binary bag of words: same as the default variant but with binary=True.
vect_bow_bin = CountVectorizer(binary=True)
corp_bow_bin = vect_bow_bin.fit_transform(raw_documents=corp_raw)
corp_bow_bin

<24957x41168 sparse matrix of type '<class 'numpy.int64'>'
	with 954037 stored elements in Compressed Sparse Row format>

### Bag of words (mixed case)

In [17]:
# Mixed-case bag of words: lowercase=False keeps the original casing, which
# yields a larger vocabulary than the default variant (see output below).
vect_bow_mixc = CountVectorizer(lowercase=False)
corp_bow_mixc = vect_bow_mixc.fit_transform(raw_documents=corp_raw)
corp_bow_mixc

<24957x51449 sparse matrix of type '<class 'numpy.int64'>'
	with 977589 stored elements in Compressed Sparse Row format>

### Bag of words (default) on preprocessed comments (lemmatization, stopword and punctuation removal)

In [18]:
# Default bag of words on the preprocessed corpus (corp_pp).
# A dedicated vectorizer name is used so that `vect_bow` keeps the vocabulary
# fitted on corp_raw: the preview cell further up pairs
# vect_bow.get_feature_names_out() with columns of corp_bow, and would show
# mismatched column labels if `vect_bow` were refitted on corp_pp here.
vect_pp_bow = CountVectorizer()
corp_pp_bow = vect_pp_bow.fit_transform(corp_pp)
corp_pp_bow

<24957x33999 sparse matrix of type '<class 'numpy.int64'>'
	with 516347 stored elements in Compressed Sparse Row format>

### Bag of 1/2-grams (default) on preprocessed comments

In [19]:
# Unigrams + bigrams (ngram_range=(1, 2)) on the preprocessed corpus.
vect_bo12grams = CountVectorizer(ngram_range=(1, 2))
corp_pp_bo12grams = vect_bo12grams.fit_transform(raw_documents=corp_pp)
corp_pp_bo12grams

<24957x464367 sparse matrix of type '<class 'numpy.int64'>'
	with 1067013 stored elements in Compressed Sparse Row format>

### Bag of 1/2/3-grams (default) on preprocessed comments

In [20]:
# Unigrams, bigrams and trigrams (ngram_range=(1, 3)) on the preprocessed corpus.
vect_bo123grams = CountVectorizer(ngram_range=(1, 3))
corp_pp_bo123grams = vect_bo123grams.fit_transform(raw_documents=corp_pp)
corp_pp_bo123grams

<24957x989750 sparse matrix of type '<class 'numpy.int64'>'
	with 1600456 stored elements in Compressed Sparse Row format>

### Bag of 2-grams (default) on preprocessed comments

In [21]:
# Bigrams only (ngram_range=(2, 2)) on the preprocessed corpus.
vect_bo2grams = CountVectorizer(ngram_range=(2, 2))
corp_pp_bo2grams = vect_bo2grams.fit_transform(raw_documents=corp_pp)
corp_pp_bo2grams

<24957x430368 sparse matrix of type '<class 'numpy.int64'>'
	with 550666 stored elements in Compressed Sparse Row format>

### Tf_idf

In [22]:
# TF-IDF features: fit a TfidfVectorizer (default settings) on corp_raw and
# keep the resulting document-term matrix.
vect_tfidf = TfidfVectorizer()
corp_tfidf = vect_tfidf.fit_transform(raw_documents=corp_raw)
corp_tfidf

<24957x41168 sparse matrix of type '<class 'numpy.float64'>'
	with 954037 stored elements in Compressed Sparse Row format>

In [23]:
# Densify only a narrow window of columns for display; converting the full
# sparse matrix crashes the kernel.
n_words = 100
feature_window = slice(10000, 10000 + n_words)  # same window for data and labels
pd.DataFrame(corp_tfidf[:, feature_window].toarray(),
             columns=vect_tfidf.get_feature_names_out()[feature_window])

Unnamed: 0,defenceman,defences,defend,defendant,defendants,defended,defender,defenders,defending,defends,defense,defenseless,defenseman,defensemen,defenses,defensive,defensively,defer,deference,deferential,defermements,deferment,deferments,deferral,deferrals,deferred,deferred_action_for_childhood_arrivals,deferring,defiance,defiant,deficiences,deficiencies,deficiency,deficient,deficit,deficits,defied,defies,defile,define,defined,defines,defining,definite,definitely,definition,definitions,definitive,definitively,deflated,deflationary,deflect,deflected,deflecting,deflection,deflections,deflects,deformed,defraud,defrauded,defrauding,defrauds,defrock,defrocked,defrocking,defrocks,defunct,defund,defunded,defunding,defuse,defy,defying,degeneracy,degenerate,degenerated,degenerates,degenerating,degenerative,degette,degettes,degollado,degradation,degrade,degraded,degrading,degrassi,degree,degreed,degrees,dehavilland,dehumanize,dehumanized,dehumanizes,dehumanizing,dehydration,dei,deified,deigned,deity
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Tf_idf on preprocessed comments (lemmatization, stopword and punctuation removal)

In [24]:
# TF-IDF features on the preprocessed corpus (corp_pp).
# A dedicated vectorizer name is used so that `vect_tfidf` keeps the
# vocabulary fitted on corp_raw: the preview cell further up pairs
# vect_tfidf.get_feature_names_out() with columns of corp_tfidf, and would
# show mismatched column labels if `vect_tfidf` were refitted on corp_pp here.
vect_pp_tfidf = TfidfVectorizer()
corp_pp_tfidf = vect_pp_tfidf.fit_transform(corp_pp)
corp_pp_tfidf

<24957x33999 sparse matrix of type '<class 'numpy.float64'>'
	with 516347 stored elements in Compressed Sparse Row format>

## Baseline model (logistic regression)

In [25]:
# Baseline: logistic regression on the default bag-of-words matrix.
# max_iter is raised to 2000 (presumably so the solver converges on this
# wide sparse input — TODO confirm).
params = dict(max_iter=2_000)
lr = LogisticRegression(**params)

# evaluate and append the outcome to the shared results table
test_result = test_model(model=lr,
                         model_name='BASELINE (logistic regression)',
                         model_params=params,
                         data_desc='bag of words',
                         X=corp_bow,
                         y=target)
store_test_result(test_result)

## XGBoost experiments

In [26]:
# XGBoost (library defaults apart from seed and parallelism) on the default
# bag-of-words matrix.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# evaluate and append the outcome to the shared results table
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='bag of words',
                         X=corp_bow,
                         y=target)
store_test_result(test_result)

In [27]:
# XGBoost (library defaults apart from seed and parallelism) on the binary
# bag-of-words matrix.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# evaluate and append the outcome to the shared results table
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='bag of words (binary)',
                         X=corp_bow_bin,
                         y=target)
store_test_result(test_result)

In [28]:
# XGBoost (library defaults apart from seed and parallelism) on the
# mixed-case bag-of-words matrix.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# evaluate and append the outcome to the shared results table
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='bag of words (mixed case)',
                         X=corp_bow_mixc,
                         y=target)
store_test_result(test_result)

In [29]:
# XGBoost (library defaults apart from seed and parallelism) on the
# bag-of-words matrix built from the preprocessed comments.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# evaluate and append the outcome to the shared results table
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='bag of words (preprocessed)',
                         X=corp_pp_bow,
                         y=target)
store_test_result(test_result)

In [30]:
# XGBoost (library defaults apart from seed and parallelism) on the
# 1/2-gram counts of the preprocessed comments.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# evaluate and append the outcome to the shared results table
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='bag of 1/2-grams (preprocessed)',
                         X=corp_pp_bo12grams,
                         y=target)
store_test_result(test_result)

In [31]:
# XGBoost (library defaults apart from seed and parallelism) on the
# 1/2/3-gram counts of the preprocessed comments.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# evaluate and append the outcome to the shared results table
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='bag of 1/2/3-grams (preprocessed)',
                         X=corp_pp_bo123grams,
                         y=target)
store_test_result(test_result)

In [32]:
# XGBoost (library defaults apart from seed and parallelism) on the
# 2-gram-only counts of the preprocessed comments.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# evaluate and append the outcome to the shared results table
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='bag of 2-grams (preprocessed)',
                         X=corp_pp_bo2grams,
                         y=target)
store_test_result(test_result)

In [33]:
# XGBoost (library defaults apart from seed and parallelism) on the TF-IDF
# matrix of the raw comments.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# evaluate and append the outcome to the shared results table
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='tf_idf',
                         X=corp_tfidf,
                         y=target)
store_test_result(test_result)

In [34]:
# XGBoost (library defaults apart from seed and parallelism) on the TF-IDF
# matrix of the preprocessed comments.
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# evaluate and append the outcome to the shared results table
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='tf_idf (preprocessed)',
                         X=corp_pp_tfidf,
                         y=target)
store_test_result(test_result)

In [35]:
# Same data as the previous TF-IDF (preprocessed) run, but with
# n_estimators set to 1000; the two runs are told apart by model_params.
params = dict(random_state=42, n_jobs=-1, n_estimators=1000)
xgb = XGBClassifier(**params)

# evaluate and append the outcome to the shared results table
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='tf_idf (preprocessed)',
                         X=corp_pp_tfidf,
                         y=target)
store_test_result(test_result)

In [36]:
# XGBoost (library defaults apart from seed and parallelism) on the spaCy
# document vectors (corp_spacy).
params = dict(random_state=42, n_jobs=-1)
xgb = XGBClassifier(**params)

# evaluate and append the outcome to the shared results table
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='spacy vectors (300-D)',
                         X=corp_spacy,
                         y=target)
store_test_result(test_result)

In [37]:
# Same spaCy-vector data as the previous run, but with n_estimators set to
# 1000; the two runs are told apart by model_params.
params = dict(random_state=42, n_jobs=-1, n_estimators=1000)
xgb = XGBClassifier(**params)

# evaluate and append the outcome to the shared results table
test_result = test_model(model=xgb,
                         model_name='XGBoost',
                         model_params=params,
                         data_desc='spacy vectors (300-D)',
                         X=corp_spacy,
                         y=target)
store_test_result(test_result)

## Show test results + total exec time

In [38]:
# Display the accumulated results table (presumably one row per
# store_test_result call — verify against that helper).
# NOTE(review): the saved output below lists 10 rows although 13 runs are
# stored above, and its 'train_data_size' column differs from the 'data_size'
# column declared when test_results was created — the output may be stale;
# restart the kernel and run all cells to refresh it.
test_results

Unnamed: 0,model_name,model_params,data_desc,train_data_size,features_no,f1,acc,recall,prec,roc_auc,cf_matrix,train_time,notes
0,BASELINE (logistic regression),{'max_iter': 2000},bag of words,18717,41168,0.777,0.832,0.728,0.832,0.894,"[[3373, 367], [679, 1821]]",0m 2s,
1,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of words,18717,41168,0.756,0.83,0.655,0.893,0.897,"[[3544, 196], [862, 1638]]",0m 1s,
2,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of words (binary),18717,41168,0.747,0.825,0.646,0.885,0.898,"[[3530, 210], [884, 1616]]",0m 1s,
3,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of words (mixed case),18717,51449,0.733,0.818,0.626,0.885,0.885,"[[3536, 204], [934, 1566]]",0m 1s,
4,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of words (preprocessed),18717,33999,0.758,0.829,0.669,0.874,0.899,"[[3500, 240], [828, 1672]]",0m 1s,
5,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of 1/2-grams (preprocessed),18717,464367,0.76,0.831,0.668,0.883,0.9,"[[3518, 222], [830, 1670]]",0m 6s,
6,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of 1/2/3-grams (preprocessed),18717,989750,0.76,0.831,0.668,0.883,0.9,"[[3518, 222], [830, 1670]]",0m 12s,
7,XGBoost,"{'random_state': 42, 'n_jobs': -1}",bag of 2-grams (preprocessed),18717,430368,0.251,0.641,0.15,0.763,0.595,"[[3623, 117], [2124, 376]]",0m 3s,
8,XGBoost,"{'random_state': 42, 'n_jobs': -1}",tf_idf,18717,41168,0.754,0.827,0.659,0.88,0.892,"[[3516, 224], [853, 1647]]",0m 5s,
9,XGBoost,"{'random_state': 42, 'n_jobs': -1}",tf_idf (preprocessed),18717,33999,0.768,0.835,0.681,0.88,0.902,"[[3508, 232], [798, 1702]]",0m 4s,


In [39]:
# Total wall-clock time since the timer was started in the first cell.
full_run_time = time.time() - full_run_time_start
# Round to whole seconds *before* splitting into minutes and seconds; rounding
# only the remainder could print "60s" (e.g. 119.7 s -> "1m 60s").
run_minutes, run_seconds = divmod(round(full_run_time), 60)
print(f'Full run time: {run_minutes}m {run_seconds}s')

Full run time: 1m 56s


## Notes

- also try LightGBM?