## Imports

In [49]:
import numpy as np
import pandas as pd
import sklearn as sklearn
from sklearn.ensemble import HistGradientBoostingClassifier
# Report which scikit-learn build is active, then confirm the imports succeeded.
print(f'Now using sklearn version {sklearn.__version__}')
print('Packages are Ready!!!')

Now using sklearn version 1.0.2
Packages are Ready!!!


## Loading data

In [50]:
def load_augmented_data(data_dir='/Users/juansmacbook/PycharmProjects/Santander_Transaction/Santander-Transaction-Competition'):
    """Load the augmented train and test datasets.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Directory that contains ``train_augmented.csv`` and
        ``test_augmented.csv``. Defaults to the location that used to be
        hard-coded here, so calls without arguments behave as before; pass a
        different directory to run the notebook on another machine.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_augmented, test_augmented)``, in that order.
    """
    import os  # local import keeps this cell self-contained

    train_augmented = pd.read_csv(os.path.join(data_dir, 'train_augmented.csv'))
    test_augmented = pd.read_csv(os.path.join(data_dir, 'test_augmented.csv'))
    print('Augmented datasets are ready')
    return train_augmented, test_augmented

def load_original_data(data_dir='/Users/juansmacbook/PycharmProjects/Santander_Transaction/Santander-Transaction-Competition'):
    """Load the original competition train and test datasets.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Directory that contains ``train.csv`` and ``test.csv``. Defaults to
        the location that used to be hard-coded here, so calls without
        arguments behave as before; pass a different directory to run the
        notebook on another machine.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``, in that order.
    """
    import os  # local import keeps this cell self-contained

    test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
    train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    print('Original datasets are ready')
    return train, test

In [51]:
# Augmented feature tables (train and test).
train_augmented, test_augmented = load_augmented_data()

# The cells below read `train['target']`, `x_train` and `x_test`, but no cell in
# this notebook creates them -- they only worked because of leftover kernel
# state. Define them here so the notebook survives Restart Kernel -> Run All.
# NOTE(review): assumes `x_train` / `x_test` are the augmented frames and that
# the labels come from the original train.csv in the same row order -- confirm
# against the data-generation notebook.
train, test = load_original_data()
x_train, x_test = train_augmented, test_augmented

Augmented datasets are ready


## Resampling
We are going to oversample the underrepresented class in the training dataset so that both classes end up with the same number of examples.

In [4]:
# Feature matrix: every column of `x_train` except the first one.
training_set = x_train.iloc[:, 1:].to_numpy()
# Label vector: the `target` column of `train`.
training_labels = train.loc[:, 'target'].to_numpy()


In [5]:

"""since we are dealing with an imbalanced data set we are going to deal with it  by oversampling the underrepresented class"""

from imblearn.over_sampling import RandomOverSampler
rov=RandomOverSampler()
x_resampled, y_resampled = rov.fit_resample(X=training_set, y=training_labels)

print('_.-._.-._.-._.-._.-._.-._.-._.-._.-._.-._.-._')
print('training set shape before resampling: '+str(training_set.shape))
print('training set shape after: '+str(x_resampled.shape))


print('_.-._.-._.-._.-._.-._.-._.-._.-._.-._.-._.-._')
print('training labels shape before resampling: '+str(training_labels.shape))
print('training labels shape after :'+str(y_resampled.shape))
print('_.-._.-._.-._.-._.-._.-._.-._.-._.-._.-._.-._')

""" We can now see the number example for each class"""
pd.Series(name='Number of examples', data=y_resampled).value_counts()

_.-._.-._.-._.-._.-._.-._.-._.-._.-._.-._.-._
training set shape before resampling: (200000, 400)
training set shape after: (359804, 400)
_.-._.-._.-._.-._.-._.-._.-._.-._.-._.-._.-._
training labels shape before resampling: (200000,)
training labels shape after :(359804,)
_.-._.-._.-._.-._.-._.-._.-._.-._.-._.-._.-._


In [12]:
"""initializes an instance of a gradient boosting classifier """
def get_hgbc():
    hgbc=sklearn.ensemble.HistGradientBoostingClassifier(
        #Hyper parameters
        max_iter=20, #Trees to be build
        early_stopping=True,
        #validation_fraction=0.02,
        learning_rate=0.1,
        warm_start=True,
        #Metrics and over-fitting
        loss='binary_crossentropy',
        l2_regularization=0.01,
        scoring='roc_auc',
        # Tree parameters
        min_samples_leaf=120,
        max_leaf_nodes=13,
        max_depth=None,
        #ETC
        verbose=0
    )
    return hgbc
model=get_hgbc()

## CROSS VALIDATION SCHEME TRAINING

In [13]:

""" TEST DO NOT TOUCH"""

from sklearn.model_selection import cross_val_score
val_scores=cross_val_score(model, x_resampled, y_resampled, cv=5, scoring='roc_auc')


In [14]:
# Per-fold ROC AUC scores returned by cross_val_score (cv=5).
val_scores

array([0.80078877, 0.80131791, 0.79696717, 0.79688244, 0.79710308])

In [15]:
"""Creates and fits model with the resampled data"""
model.fit(x_resampled, y_resampled, sample_weight=None)
print('Training done!')


Training done!


## Getting predictions

In [17]:
"""Let's make predictions with the model"""
super_test_set=x_test[x_test.columns[1:]].to_numpy()
predictions=model.predict(super_test_set)
#TEST
predictions_prob=model.predict_proba(super_test_set)
predictions_prob=pd.DataFrame(predictions_prob)
######
predictions_prob
super_test_set

array([[ 8.5374, -1.3222, 12.022 , ...,  0.    ,  0.    ,  1.    ],
       [17.3035, -2.4212, 13.3989, ...,  0.    ,  1.    ,  0.    ],
       [10.6137, -2.1898,  8.909 , ...,  0.    ,  0.    ,  1.    ],
       ...,
       [ 8.2964, -2.3119, 11.2139, ...,  0.    ,  0.    ,  0.    ],
       [11.636 ,  2.2769, 11.2074, ...,  0.    ,  0.    ,  0.    ],
       [13.5745, -0.5134, 13.6584, ...,  0.    ,  0.    ,  0.    ]])

In [48]:
""" We create a fucntion that uses the trained model to save a CVS file ready for sumbmission"""
def get_sumbmission(model):

    return 0


"""Let's get ready for submission. Creating a DataFrame containing ID codes and targets"""
submission_df=pd.DataFrame()
submission_df['ID_code']=x_test['ID_code']

#TEST
#submission_df['target']=predictions THIS IS THE REAL DEAL
submission_df['target']=predictions_prob[1].values #TEST
########################

"""we are also creating a dummy columns to help us sort our data by ID code"""
submission_df['dummy_index']=submission_df['ID_code'].apply(lambda x: int(x[5:]))

submission_df

"""Let's sort our data and save it as a CSV file! We are done!"""
juan_submission=submission_df.sort_values(by='dummy_index')
juan_submission.drop(['dummy_index'],axis=1).to_csv('juan_submission.csv', index=False)
final_submission=pd.read_csv(filepath_or_buffer='juan_submission.csv')

In [52]:
!ls

Boosting copy.ipynb       juan_submission.csv       x_test.csv
DataGeneration copy.ipynb test.csv                  x_train.csv
README.md                 train.csv


In [51]:
# Display the submission as read back from 'juan_submission.csv'.
final_submission

Unnamed: 0,ID_code,target
0,test_0,0.503196
1,test_1,0.791701
2,test_2,0.769796
3,test_3,0.487299
4,test_4,0.378554
...,...,...
199995,test_199995,0.320715
199996,test_199996,0.065481
199997,test_199997,0.104253
199998,test_199998,0.472951


Boosting copy.ipynb       juan_submission.csv       x_test.csv
DataGeneration copy.ipynb test.csv                  x_train.csv
README.md                 train.csv
