## Imports

In [137]:
import os

import numpy as np
import pandas as pd
import sklearn as sklearn
from sklearn.ensemble import HistGradientBoostingClassifier
# Report the scikit-learn version in use, then confirm the imports succeeded.
print('Now using sklearn version ', sklearn.__version__, sep='')
print('Packages are Ready!!!')

Now using sklearn version 1.0.2
Packages are Ready!!!


## Loading data

In [138]:
def load_augmented_data(data_dir=None): #Loads the augmented dataset
    """Read the augmented train and test CSV files into DataFrames.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Directory containing ``train_augmented.csv`` and
        ``test_augmented.csv``. When omitted, the original hard-coded project
        directory is used, so existing ``load_augmented_data()`` calls keep
        working unchanged.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_augmented, test_augmented)``, in that order.
    """
    if data_dir is None:
        # Historical default. It is specific to one machine, so pass data_dir
        # explicitly when running anywhere else.
        data_dir='/Users/juansmacbook/PycharmProjects/Santander_Transaction/Santander-Transaction-Competition'
    train_augmented=pd.read_csv(filepath_or_buffer=os.path.join(data_dir, 'train_augmented.csv'))
    test_augmented=pd.read_csv(filepath_or_buffer=os.path.join(data_dir, 'test_augmented.csv'))
    print('Augmented datasets are ready')
    return train_augmented , test_augmented

def load_original_data(data_dir=None): # Loads the original datasets of the competition
    """Read the original competition train and test CSV files into DataFrames.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Directory containing ``train.csv`` and ``test.csv``. When omitted,
        the original hard-coded project directory is used, so existing
        ``load_original_data()`` calls keep working unchanged.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``, in that order.
    """
    if data_dir is None:
        # Historical default. It is specific to one machine, so pass data_dir
        # explicitly when running anywhere else.
        data_dir='/Users/juansmacbook/PycharmProjects/Santander_Transaction/Santander-Transaction-Competition'
    # The test file is read first, as before; the return order is still (train, test).
    test=pd.read_csv(os.path.join(data_dir, 'test.csv'))
    train=pd.read_csv(os.path.join(data_dir, 'train.csv'))
    print('Original datasets are ready')
    return train, test

In [139]:
# Load both augmented frames once. Later cells read train_augmented (for
# resampling) and test_augmented (for prediction and the submission IDs).
train_augmented , test_augmented = load_augmented_data()

Augmented datasets are ready


## Resampling
We are going to oversample the underrepresented class in the training dataset; in this way we balance out the classes.

In [140]:
# Raw (not yet resampled) feature matrix and label vector.
# The previous version read from `x_train` and `train`, which no cell in this
# notebook defines, so it raised NameError on a fresh kernel. Both arrays are
# now derived from train_augmented, using the same column convention as
# resample_data: features are every column from the third onwards, and labels
# come from the 'target' column.
# NOTE(review): assumes the first two columns of train_augmented are the ID
# and the target — confirm against the augmented CSV.
training_set=train_augmented[train_augmented.columns[2:]].to_numpy()
training_labels=train_augmented['target'].to_numpy()

In [141]:

"""Since we are dealing with an imbalanced data set we are going to deal with it  by oversampling the underrepresented class"""

from imblearn.over_sampling import RandomOverSampler
# We define a function that balances the classes in a training set
def resample_data(dataset, random_state=None): #gets a dataset and returns the balanced version of them
    """Balance the classes of ``dataset`` with random oversampling.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``'target'`` column holding the class labels. Every
        column from the third one onwards is used as a feature.
        NOTE(review): this assumes the first two columns are the ID and the
        target — confirm against the augmented CSV layout.
    random_state : int, numpy.random.RandomState or None, optional
        Forwarded to ``RandomOverSampler``. Pass an integer to make the
        oversampling reproducible between runs. The default ``None`` keeps
        the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(x_resampled, y_resampled)`` exactly as returned by
        ``RandomOverSampler.fit_resample``.
    """
    features=dataset[dataset.columns[2:]].to_numpy()
    labels=dataset['target'].to_numpy()
    rov=RandomOverSampler(random_state=random_state)
    x_resampled, y_resampled = rov.fit_resample(X=features, y=labels)
    return x_resampled, y_resampled # return the dataset and their targets

""" Let's use the function that we just declared"""
# Balance the augmented training frame with resample_data.
x_resampled, y_resampled = resample_data(dataset=train_augmented)
# We can now see the number of examples for each class
pd.Series(data=y_resampled, name='Number of examples').value_counts()

0    179902
1    179902
Name: Number of examples, dtype: int64

## Boosting time

In [142]:
"""initializes an instance of a gradient boosting classifier """
def get_hgbc(random_state=None):
    """Build an unfitted HistGradientBoostingClassifier with this notebook's settings.

    Parameters
    ----------
    random_state : int, numpy.random.RandomState or None, optional
        Forwarded to the classifier. Because early stopping is enabled, the
        estimator holds out a validation split chosen at random; pass an
        integer to make that split (and therefore the fit) reproducible.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    HistGradientBoostingClassifier
        A new, unfitted estimator.
    """
    hgbc=HistGradientBoostingClassifier(
        #Hyper parameters
        max_iter=20, #Trees to be build
        early_stopping=True,
        #validation_fraction=0.02,
        learning_rate=0.1,
        # With warm_start=True a second fit() on the same instance continues
        # from the existing trees instead of starting over.
        warm_start=True,
        #Metrics and over-fitting
        # NOTE(review): newer scikit-learn releases rename this loss to
        # 'log_loss'; 'binary_crossentropy' matches the 1.0.2 version this
        # notebook reports — revisit when upgrading.
        loss='binary_crossentropy',
        l2_regularization=0.01,
        scoring='roc_auc',
        # Tree parameters
        min_samples_leaf=120,
        max_leaf_nodes=13,
        max_depth=None,
        #ETC
        verbose=1,
        random_state=random_state
    )
    return hgbc
# Notebook-level classifier instance; later cells fit it and use it to predict.
model=get_hgbc()

### cross validation scheme DONT TOUCH YET

In [143]:
#from sklearn.model_selection import cross_val_score
#val_scores=cross_val_score(model, x_resampled, y_resampled, cv=5, scoring='roc_auc')
#val_scores

### normal training DO NOT TOUCH

In [144]:
"""Creates and fits model with the resampled data"""
# Train the classifier built by get_hgbc() on the balanced data.
# sample_weight is left at its default of None.
model.fit(X=x_resampled, y=y_resampled)
print('Training done!')

Binning 1.036 GB of training data: 8.500 s
Binning 0.115 GB of validation data: 0.438 s
Fitting gradient boosted rounds:
[1/20] 1 tree, 13 leaves, max depth = 9, train score: 0.64946, val score: 0.65699, in 0.211s
[2/20] 1 tree, 13 leaves, max depth = 9, train score: 0.66996, val score: 0.67472, in 0.209s
[3/20] 1 tree, 13 leaves, max depth = 10, train score: 0.68298, val score: 0.68674, in 0.209s
[4/20] 1 tree, 13 leaves, max depth = 9, train score: 0.69463, val score: 0.69586, in 0.190s
[5/20] 1 tree, 13 leaves, max depth = 10, train score: 0.71434, val score: 0.71199, in 0.191s
[6/20] 1 tree, 13 leaves, max depth = 9, train score: 0.72721, val score: 0.72520, in 0.232s
[7/20] 1 tree, 13 leaves, max depth = 11, train score: 0.73361, val score: 0.73369, in 0.198s
[8/20] 1 tree, 13 leaves, max depth = 12, train score: 0.74316, val score: 0.74401, in 0.208s
[9/20] 1 tree, 13 leaves, max depth = 7, train score: 0.74929, val score: 0.75011, in 0.240s
[10/20] 1 tree, 13 leaves, max depth =

## Getting predictions

In [145]:
# Score the test set. columns[1:] skips the first column of test_augmented
# (presumably ID_code — confirm) so only feature columns reach the model.
# Column 1 of the result is what get_submission writes out as 'target'.
predictions_prob=model.predict_proba(test_augmented[test_augmented.columns[1:]].to_numpy())
# Bare last expression: displays the probability array as the cell output.
predictions_prob

array([[0.43771888, 0.56228112],
       [0.52591029, 0.47408971],
       [0.36161216, 0.63838784],
       ...,
       [0.71576694, 0.28423306],
       [0.4180953 , 0.5819047 ],
       [0.47697832, 0.52302168]])

In [146]:
""" We create a fucntion that uses the trained model to save a CVS file ready for sumbmission"""
def get_submission(id_codes=None, probabilities=None, output_path='final_submission.csv'):
    """Build the submission frame and save it as a CSV file.

    Parameters
    ----------
    id_codes : array-like, optional
        Identifiers for the ``ID_code`` column. Defaults to the ``ID_code``
        column of the notebook-level ``test_augmented`` frame.
    probabilities : array-like of shape (n_samples, n_classes), optional
        Class probabilities; column 1 is written as ``target``. Defaults to
        the notebook-level ``predictions_prob`` array.
    output_path : str or os.PathLike, optional
        Destination CSV file. Defaults to ``'final_submission.csv'``.

    Returns
    -------
    pandas.DataFrame
        The frame that was written, with columns ``ID_code`` and ``target``.

    Raises
    ------
    ValueError
        If ``id_codes`` and ``probabilities`` have different lengths.
    """
    # Fall back to the notebook globals so the zero-argument call still works.
    if id_codes is None:
        id_codes=test_augmented['ID_code'].to_numpy()
    if probabilities is None:
        probabilities=predictions_prob
    positive_class_prob=np.asarray(probabilities)[:,1]
    if len(id_codes) != len(positive_class_prob):
        raise ValueError(
            'id_codes and probabilities must have the same length, got %d and %d'
            % (len(id_codes), len(positive_class_prob)))
    submission=pd.DataFrame(data={"ID_code":id_codes, "target": positive_class_prob })
    # Saves the submission file
    submission.to_csv(output_path, index=False)
    return submission
# Write final_submission.csv and keep the resulting frame for inspection.
submission = get_submission()


In [148]:
# Shell escape: list the working directory (presumably to confirm that
# final_submission.csv was written).
!ls

Boosting copy.ipynb       final_submission.csv      train.csv
DataGeneration copy.ipynb test.csv                  train_augmented.csv
README.md                 test_augmented.csv
