This is submission code for the Kaggle competition

Playground Series - Season 3, Episode 4

https://www.kaggle.com/competitions/playground-series-s3e4/overview

which came in position 48 of 643, which is good for a first serious attempt at a competition.

A tidier version of this code is used for a later competion submission.

The code for finding the best XGBoost parameters and averaging the best predidctions is from

https://www.kaggle.com/code/kirkdco/xgboost-s03e03?scriptVersionId=116904858

# Packages and Data

In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
from matplotlib import pyplot as plt
from collections import defaultdict
import random

# Scikit stuff
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import *
from sklearn.metrics import *

# XGB
from xgboost import XGBRegressor, XGBClassifier

In [2]:
data = pd.read_csv('C:/Users/medion/playground-series-s3e4/train.csv')
eval_data = pd.read_csv('C:/Users/medion/playground-series-s3e4/test.csv')
sample_submit = pd.read_csv('C:/Users/medion/playground-series-s3e4/sample_submission.csv')

data = data.drop("id", axis=1)
eval_data = eval_data.drop('id', axis=1)

target_col = 'Class'

# Data Processing

We have classes for the train and test data. It contains possible functions we can use for data processing. Some are used for data exploration so are not used in the final notebook. For this set all we do is a power transformation.

In [3]:
class DataProcess():
    def __init__(self, data):
        self.data_original = data
        self.data = self.data_original.copy()
        
    def histplot(self):
        '''
        Plots a histogram for each feature
        '''
        for column in self.data.columns:
            sns.histplot(data = self.data[column])
            plt.show()
    
    def drop(self, col_list):
        '''
        Drops a list of features
        '''
        for col in col_list:
            self.data = self.data.drop(col, axis=1)
    
    def value_counts(self):
        '''
        Returns data counts for each feature
        '''
        for column in self.data.columns:
            print(f"{column}")
            print(self.data[column].value_counts())
            print("-------------------------------------")


    def show_correlation(self, n):
        '''
        Show n highest and lowest correlated variables for each variable
        '''
        corr = pd.DataFrame(self.data.corr())
        for column in self.data.columns:
            print(f'Three lowest correlation for {column}')
            print(pd.DataFrame(corr[column]).sort_values(column).iloc[0:n])
            print('----------------------------------------------------')
            print(f'Three highest correlation for {column}')
            print(pd.DataFrame(corr[column]).sort_values(column).iloc[-n-1:-1])
            print('-----------------------------------------------------')
            
    def power_transform(self):
        target_list = [target_col]
        power_data = [item for item in self.data.columns if item not in target_list]
        
        for item in power_data:
            pt = PowerTransformer()
            self.data['trans'+item] = pt.fit_transform(pd.DataFrame(self.data[item]))
            self.data = self.data.drop(item, axis=1)

    def pipeline(self):
        '''
        Pipeline of chosen data processing methods, here we have just one
        '''
        self.power_transform()  
        
# Process data and split features and target        
        
train_class = DataProcess(data)
train_class.pipeline()
X_train = train_class.data.drop(target_col, axis=1)
y_train = train_class.data[target_col]

In [4]:
class EvalProcess(DataProcess):
    def __init__(self, data):
        self.data_original = data
        self.data = self.data_original.copy()
    
    def cols(self):
        '''
        Run this to ensure train and eval data have the same columns
        '''
        eval_cols = [
            column for column in self.data.columns if column in train_class.data.columns
        ]
        self.data = self.data[eval_cols]
    
    def pipeline(self):
        self.power_transform()
        self.cols()

# Process eval data        
        
eval_class = EvalProcess(eval_data)
eval_class.pipeline()

eval_data = eval_class.data

# Model Selection

We search over a selection of parameters to find the best XGB model. This is clumisly written and is tidied further in a later notebook.

In [5]:
# A selection of parameters, these will be randomly selected from for

n_estimators_values = [10, 25, 50, 100, 150, 200, 250, 300] 
eta_values = [ v / 10 for v in range(10) ]
max_depth_values = [2, 4, 6, 8, 10]
subsample_values = [0.25, 0.50, 0.75, 0.90]
colsample_bytree_values = [0.25, 0.50, 0.75, 0.90]

# Fixed parameters

cv_folds = 5
tuning_iterations = 50 
random.seed(2201020)
skf_seed = random.randint(0, 2023)

# K-fold split

skf = StratifiedKFold(n_splits = cv_folds, random_state = skf_seed, shuffle = True)

# Initalize data frame for results storage

tuning_results = defaultdict(list)
col_names = [f'XGB_Step_{step}_Fold_{fold}' 
                 for step in range(tuning_iterations) 
                     for fold in range(cv_folds)]
eval_predictions = pd.DataFrame(0, index = eval_data.index, columns = col_names)
valid_predictions = pd.DataFrame(0, index = X_train.index, columns = col_names)

# The loop

for step in range(tuning_iterations):
    # Choose random XGB parameters
    
    n_estimators = random.choice(n_estimators_values)
    eta = random.choice(eta_values)
    max_depth = random.choice(max_depth_values)
    subsample = random.choice(subsample_values)
    colsample_bytree = random.choice(colsample_bytree_values)
    
    aucs = []
    test_probs = []

    for i, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
        X_train_split, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_split, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

        xgb_seed = random.randint(0, 2023)
        xgb = XGBClassifier(n_estimators = n_estimators,
                            eta = eta,
                            max_depth = max_depth,
                            subsample = subsample,
                            colsample_bytree = colsample_bytree,
                            random_state = xgb_seed).fit(X_train_split.values, y_train_split.values)
        
        val_probs = [probs[1] for probs in xgb.predict_proba(X_val.values)]
        valid_predictions.loc[val_index, f'XGB_Step_{step}_Fold_{i}'] = val_probs
        auc_result = roc_auc_score(y_val, val_probs)
        aucs.append(auc_result)
        
        eval_predictions[f'XGB_Step_{step}_Fold_{i}'] = \
            [probs[1] for probs in xgb.predict_proba(eval_data.values)]
    
    tuning_results['step'].append(step)
    tuning_results['auc'].append(np.mean(aucs))
    tuning_results['n_estimators'].append(n_estimators)
    tuning_results['eta'].append(eta)
    tuning_results['max_depth'].append(max_depth)
    tuning_results['subsample'].append(subsample)
    tuning_results['colsample_bytree'].append(colsample_bytree)
    tuning_results['skf_seed'].append(skf_seed)
    tuning_results['xgb_seed'].append(xgb_seed)
    
    print(f'Step: {step}  AUC: {np.mean(aucs)}')

tuning_results = pd.DataFrame(tuning_results)
tuning_results.sort_values(by = 'auc', axis = 0, inplace = True, ascending = False)
tuning_results

Step: 0  AUC: 0.8020274495704929
Step: 1  AUC: 0.7395156323086252
Step: 2  AUC: 0.6919561942907051
Step: 3  AUC: 0.7252305469894488
Step: 4  AUC: 0.7454934176873121
Step: 5  AUC: 0.6962275940498461
Step: 6  AUC: 0.740335597472369
Step: 7  AUC: 0.5
Step: 8  AUC: 0.7196894269358823
Step: 9  AUC: 0.7135025640669905
Step: 10  AUC: 0.5
Step: 11  AUC: 0.7531278245107788
Step: 12  AUC: 0.6953687426711683
Step: 13  AUC: 0.6929718651653821
Step: 14  AUC: 0.7440259830770156
Step: 15  AUC: 0.6963948756075331
Step: 16  AUC: 0.6702841581435797
Step: 17  AUC: 0.6855408691545097
Step: 18  AUC: 0.7071563776813844
Step: 19  AUC: 0.6782971994224593
Step: 20  AUC: 0.8056717411053198
Step: 21  AUC: 0.6689669624220385
Step: 22  AUC: 0.6908855089849378
Step: 23  AUC: 0.7086461580543106
Step: 24  AUC: 0.672224107714334
Step: 25  AUC: 0.5
Step: 26  AUC: 0.6943266986227108
Step: 27  AUC: 0.6872801538867078
Step: 28  AUC: 0.7105616397356138
Step: 29  AUC: 0.7316594487151342
Step: 30  AUC: 0.7127506105437075
Ste

Unnamed: 0,step,auc,n_estimators,eta,max_depth,subsample,colsample_bytree,skf_seed,xgb_seed
20,20,0.805672,100,0.1,6,0.9,0.9,1054,1391
0,0,0.802027,100,0.2,2,0.5,0.5,1054,1117
34,34,0.800657,50,0.2,4,0.5,0.25,1054,1390
42,42,0.775247,200,0.3,2,0.5,0.25,1054,1833
35,35,0.771977,100,0.2,10,0.9,0.5,1054,289
40,40,0.767341,150,0.3,2,0.5,0.75,1054,1597
48,48,0.762481,150,0.5,2,0.9,0.25,1054,986
46,46,0.760839,100,0.2,10,0.75,0.25,1054,733
36,36,0.75678,10,0.8,2,0.5,0.9,1054,1931
11,11,0.753128,250,0.2,6,0.75,0.9,1054,1570


# Results and Submission

In [6]:
# We select the 10 best predictions and take an average

num_models = [10]

submission = sample_submit.copy()

for num in num_models:
    best_cols = [f'XGB_Step_{step}_Fold_{fold}' 
                   for step in tuning_results['step'][0:num]
                       for fold in range(cv_folds)]
    cv_probs = eval_predictions[best_cols].mean(axis = 1).round(decimals = 4)

    submission[target_col] = cv_probs
submission.to_csv('submission.csv', index=False)