In [1]:
import os
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#Amex model evaluation metric that is provided in the competition
#We won't be using this as its slower than the NP one below
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [3]:
#Faster Amex metric sourced from the following discussion post.
#https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020
def amex_metric_np(target: np.ndarray, preds: np.ndarray) -> float:
    n_pos = np.sum(target)
    n_neg = target.shape[0] - n_pos

    indices = np.argsort(preds)[::-1]
    preds, target = preds[indices], target[indices]

    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight * (1 / weight.sum())).cumsum()
    four_pct_mask = cum_norm_weight <= 0.04
    d = np.sum(target[four_pct_mask]) / n_pos

    lorentz = (target * (1 / n_pos)).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    g = gini / gini_max
    return 0.5 * (g + d)

In [4]:
#This was used to plot % NaN graph.
def plot_NaN(training: np.ndarray) -> None:
    total = []
    for f in training.columns:
        total.append(((len(training.loc[training[f] == -127]) / len(training[f])) * 100, f))
    sorted_list = sorted(total)
    values, cols = zip(*sorted_list)
    fig = plt.figure(figsize =(10, 20))
    plt.xlabel("Percentage makeup")
    plt.ylabel("Feature")
    plt.title("Top 75 features by % makeup of NaN")
    plt.grid()
    plt.barh(cols[-75:],values[-75:])
    plt.show()
    
    del total, sorted_list
    gc.collect()

In [5]:
#Change to the directory where I have the files on my computer
os.chdir("/mnt/d/Kaggle/amex")
print(os.getcwd())

/mnt/d/Kaggle/amex


In [6]:
#We will read the data into RAM and do our preprocessing.
def get_x_data(filename):
    #Read the parquet file into a data frame
    df = pd.read_parquet(filename)
    #Reducing the customer ID from a 64 byte string to a 8 byte Int64 to reduce the memory footprint
    df['customer_ID'] = df['customer_ID'].apply(lambda x: int(x[-16:], 16) ).astype('int64')
    #Convert the datetime to a time
    df.S_2 = pd.to_datetime(df.S_2)
    #Replace NaN values with -127, the lowest you can go in an 8 bit integer. As that lowest common datatype of this parquet dataset.
    df = df.fillna(-127)
    #We will drop all the categorical features
    cat_features = ["B_30", "B_38", "D_114", "D_116", "D_117", "D_120", "D_126", "D_63", "D_64", "D_66", "D_68"]
    #We will drop all features which have > 40% constitution of NaN values of the training set. This is also done on the test set.
    to_drop = ["D_77", "S_9", "D_56", "D_105", "B_17", "D_50", "D_53", "D_142", "D_42", "D_76", "D_132", "B_29", "D_134", "B_42", "D_73", "B_39", "D_110", "D_88"]
    
    #Remove categorical features, features > 40% NaN and the dates
    df.drop(to_drop + cat_features + ['S_2'], axis=1, inplace=True)
    print('shape of data:', df.shape)
    return df

df = get_x_data("train.parquet")

shape of data: (5531451, 160)


In [7]:
#We now standardise our data. It is normalised but not standardised.
from sklearn.preprocessing import StandardScaler
index = df['customer_ID']
temp_df = pd.DataFrame(StandardScaler().fit_transform(df))
temp_df.columns = df.columns
temp_df.index = df.index
temp_df['customer_ID'] = index

#Free memory as memory is the main issue with this project.
del df, index
gc.collect()

0

In [8]:
#We now create our statistical features which we will use to do machine learning upon.
from sklearn.impute import SimpleImputer

def aggregate_data(df):
    #Aggregate the time series data for each customer into the mean, standard deviation, minimum, max and last values as features
    columns = [c for c in list(df.columns) if c not in ['customer_ID', 'S_2']]
    data_agg = df.groupby("customer_ID")[columns].agg(['min', 'max', 'last'])
    data_agg.columns = ['_'.join(x) for x in data_agg.columns]

    #Impute by replacing all NaN's with -1
    imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=-1)
    itrain = pd.DataFrame(imputer.fit_transform(data_agg))
    itrain.columns = data_agg.columns
    itrain.index = data_agg.index

    #Free memory
    del df
    gc.collect()
    
    #Impute by replacing all -127's with -1
    imputer1 = SimpleImputer(missing_values=-127, strategy='constant', fill_value=-1)
    iitrain = pd.DataFrame(imputer1.fit_transform(itrain))
    iitrain.columns = itrain.columns
    iitrain.index = itrain.index

    #Garbage collect
    del data_agg, itrain
    gc.collect()
    
    print(iitrain.shape)
    
    return iitrain

#Uncomment to plot the graph of top 75 features consisting of NaN
#plot_NaN(temp_df)


training = aggregate_data(temp_df)

del temp_df
gc.collect()

(458913, 477)


0

In [9]:
#We will now append our labels to our dataset so we only have to move one object around.
raw_y = pd.read_csv("train_labels.csv")
raw_y['customer_ID'] = raw_y['customer_ID'].apply(lambda x: int(x[-16:], 16) ).astype('int64')
raw_y.set_index('customer_ID', inplace=True)

training = training.merge(raw_y, left_index=True, right_index=True, how='left')
training.target = training.target.astype('int8')

del raw_y
gc.collect()

#We sort our index and reset as sometimes it doesn't reset nicely after appending the Y column
training = training.sort_index().reset_index()

#Features are all the columns except customer_ID
features = training.columns[1:-1]

In [10]:
#We begin teaching our models off this data
from sklearn.model_selection import StratifiedKFold as KFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import lightgbm as lgbm
import xgboost as xgb

#Random Seed for training and repeatability
#Number of CV Folds we want to do
seed = 22
folds = 5

In [None]:
#Our 10,000 samples which we are using to 'fit' our hyperparameters to.
param_eval_train = training.head(10000)
x_param_train = param_eval_train.loc[:, features]
y_param_train = param_eval_train.loc[:, 'target']

print(x_param_train.shape)
print(param_eval_train.shape)

In [None]:
#Use ROC-AUC as our scorer, as the AMEX one does not work for whatever reason.
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score

amex_scorer = make_scorer(roc_auc_score, greater_is_better=True)

In [None]:
#Check for SVM optimal 'C' or inverse learning rate.
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

Kfolder = KFold(n_splits=folds, random_state=seed, shuffle=True)
opt = BayesSearchCV(
    LinearSVC(max_iter=1000, ),
    {
        'C': Real(1e-6, 1e+6, prior='log-uniform')
    },
    cv=Kfolder,
    iid=False,
    n_iter=32,
    n_points=3,
    random_state=seed,
    scoring = amex_scorer,
    optimizer_kwargs={'base_estimator': 'GP'}
)

_ = opt.fit(x_param_train, y_param_train)

In [None]:
#Print what was found as the optimal one.
#Main issue here is that the algorithm never converges, maybe as we are missing too much data when dropped earlier.
print(opt.best_params_)

In [None]:
#Logistic regression hyperparameter search using Bayes theorem.
Kfolder = KFold(n_splits=folds, random_state=seed, shuffle=True)

opt_log = BayesSearchCV(
    LogisticRegression(penalty='l2', max_iter=1000),
    {
        'C': Real(1e-6, 1e+6, prior='log-uniform'),
        'class_weight': Categorical(['None', 'balanced'])
    },
    cv=Kfolder,
    iid=False,
    n_iter=32,
    n_points=3,
    random_state=seed,
    scoring = amex_scorer,
    optimizer_kwargs={'base_estimator': 'GP'}
)

_ = opt_log.fit(x_param_train, y_param_train)

In [None]:
print(opt_log.best_params_)
print(opt_log.cv_results_)

In [None]:
#Hyperparameter searching for our LightGBM model.
#We are searching for as said, the learning rate, max_depth and L2 regularisation.
w_lgbm = lgbm.LGBMRegressor(
    num_leaves=32, 
    max_depth=32,
    objective='binary',
    learning_rate=0.05,
    n_estimators=100,
    n_jobs=8,
    random_state=seed,
    num_iterations = 300
)

params = {
    'learning_rate': Real(1e-6, 1, prior='log-uniform'),
    'max_depth': Integer(1, 256),
    'reg_lambda': Real(1e-6, 10, prior='log-uniform')
}

Kfolder = KFold(n_splits=folds, random_state=seed, shuffle=True)

opt_lgbm = BayesSearchCV(
    w_lgbm,
    params,
    cv=Kfolder,
    iid=False,
    n_iter=32,
    n_points=3,
    random_state=seed,
    scoring = amex_scorer,
    optimizer_kwargs={'base_estimator': 'GP'}
)

_ = opt_lgbm.fit(x_param_train, y_param_train)

In [None]:
print(opt_lgbm.best_params_)

In [None]:
#Hyperparameter search for XGBoost model.
#Searching for optimal learning rate (eta), max_depth and L2 regularisation hyperparameter.
w_xgb = xgb.XGBRegressor(
    objective='binary:logistic',
)

params = {
    'eta': Real(1e-6, 1, prior='log-uniform'),
    'max_depth': Integer(1, 256),
    'reg_lambda': Real(1e-6, 10, prior='log-uniform')
}

Kfolder = KFold(n_splits=folds, random_state=seed, shuffle=True)

opt_xgb = BayesSearchCV(
    w_xgb,
    params,
    cv=Kfolder,
    iid=False,
    n_iter=32,
    n_points=3,
    random_state=seed,
    scoring = amex_scorer,
    optimizer_kwargs={'base_estimator': 'GP'}
)

_ = opt_xgb.fit(x_param_train, y_param_train)

In [None]:
print(opt_xgb.best_params_)

In [None]:
#Clean this Dataframe as its not needed anymore
print("Buffer")
del y_param_train, x_param_train, param_eval_train
gc.collect()

In [None]:
#We will now fit our Logistic regression model using the 'optimal' weights found earlier.
#In fact, since it didn't converge earlier we are not using optimal weights hence why we get a worse performance than naive theorem
logistic_regressor = LogisticRegression(solver='newton-cg',
                                        penalty='l2',
                                        C=0.003974000605090588,
                                        class_weight='balanced',
                                        verbose=20,
                                        max_iter=100,
                                        n_jobs=8)

total_l_acc = []

Kfolder = KFold(n_splits=folds, random_state=seed, shuffle=True)
for fold, (train_index, test_index) in enumerate(Kfolder.split(training, training.target)):
    print(f'Fold: {fold}')
    x_train = training.loc[train_index, features]
    y_train = training.loc[train_index, 'target']
    x_test = training.loc[test_index, features]
    y_test = training.loc[test_index, 'target']
    logistic_regressor.fit(x_train, y_train)
    oof_predict = logistic_regressor.predict(x_test)
    acc = amex_metric_np(y_test.values, oof_predict)
    total_l_acc.append(acc)
    print(f"Kaggle metric: {acc}\n")

    del x_train, y_train, x_test, y_test
    gc.collect()

In [None]:
#Print the average Kaggle metric obtained and that is our final measure.
print(f"Average Logistic Regression acc = {sum(total_l_acc)/len(total_l_acc)}")

In [None]:
#We fit our default Logistic Regression model.
logistic_regressor = LogisticRegression()

total_ln_acc = []

Kfolder = KFold(n_splits=folds, random_state=seed, shuffle=True)
for fold, (train_index, test_index) in enumerate(Kfolder.split(training, training.target)):
    print(f'Fold: {fold}')
    x_train = training.loc[train_index, features]
    y_train = training.loc[train_index, 'target']
    x_test = training.loc[test_index, features]
    y_test = training.loc[test_index, 'target']
    logistic_regressor.fit(x_train, y_train)
    oof_predict = logistic_regressor.predict(x_test)
    acc = amex_metric_np(y_test.values, oof_predict)
    total_ln_acc.append(acc)
    print(f"Kaggle metric: {acc}\n")

    del x_train, y_train, x_test, y_test
    gc.collect()

In [None]:
print(f"Average Logistic Regression (naive) acc = {sum(total_ln_acc)/len(total_ln_acc)}")

In [None]:
#We fit our suboptimal SVM model due to the fact that the hyperparameter function diverged.
svm = LinearSVC(
            penalty = 'l2',
            C = 0.0002953245610713147,
            class_weight = 'balanced',
            verbose = 20,
            random_state = seed
        )

total_svm_acc = []

Kfolder = KFold(n_splits=folds, random_state=seed, shuffle=True)
for fold, (train_index, test_index) in enumerate(Kfolder.split(training, training.target)):
    print(f'Fold: {fold}')
    x_train = training.loc[train_index, features]
    y_train = training.loc[train_index, 'target']
    x_test = training.loc[test_index, features]
    y_test = training.loc[test_index, 'target']
    svm.fit(x_train, y_train)
    oof_predict = svm.predict(x_test)
    acc = amex_metric_np(y_test.values, oof_predict)
    total_svm_acc.append(acc)
    print(f"Kaggle metric: {acc}\n")
    
    del x_train, y_train, x_test, y_test
    gc.collect()

In [None]:
print(f"Average LinearSVC acc = {sum(total_svm_acc)/len(total_svm_acc)}")

In [None]:
#Our default / unparameterised linear SVM model.
svm = LinearSVC()

total_svmn_acc = []

Kfolder = KFold(n_splits=folds, random_state=seed, shuffle=True)
for fold, (train_index, test_index) in enumerate(Kfolder.split(training, training.target)):
    print(f'Fold: {fold}')
    x_train = training.loc[train_index, features]
    y_train = training.loc[train_index, 'target']
    x_test = training.loc[test_index, features]
    y_test = training.loc[test_index, 'target']
    svm.fit(x_train, y_train)
    oof_predict = svm.predict(x_test)
    acc = amex_metric_np(y_test.values, oof_predict)
    total_svmn_acc.append(acc)
    print(f"Kaggle metric: {acc}\n")
    
    del x_train, y_train, x_test, y_test
    gc.collect()

In [None]:
print(f"Average LinearSVC (naive) acc = {sum(total_svmn_acc)/len(total_svmn_acc)}")

In [None]:
#Our LightGBM model using the optimised coefficients found earlier. This is heavily overfitted though to the test data.
lgbm_model = lgbm.LGBMRegressor(
    num_leaves=32, 
    max_depth=14,
    objective='binary',
    learning_rate=0.06538186544824388,
    n_estimators=100,
    n_jobs=8,
    random_state=seed,
    num_iterations = 300,
    reg_lambda = 0.0013722567290885153
)

#Good hyperparameters to tune: num_leaves, min_data_in_leaf, max_depth, learning rate
total_lgbm_acc = []

Kfolder = KFold(n_splits=folds, random_state=seed, shuffle=True)
for fold, (train_index, test_index) in enumerate(Kfolder.split(training, training.target)):
    print(f'Fold: {fold}')
    x_train = training.loc[train_index, features]
    y_train = training.loc[train_index, 'target']
    x_test = training.loc[test_index, features]
    y_test = training.loc[test_index, 'target']
    lgbm_model.fit(x_train, y_train, 
                   eval_set=[(x_test, y_test)],
                   callbacks=[lgbm.log_evaluation(period=20)]
                  )
    oof_predict = lgbm_model.predict(x_test)
    acc = amex_metric_np(y_test.values, oof_predict)
    total_lgbm_acc.append(acc)
    print(f"Kaggle metric: {acc}\n")
    #Save model in case it crashes so we can load back.
    lgbm_model.booster_.save_model(f'LGBM_fold{fold}.lgbm')
    
    del x_train, y_train, x_test, y_test
    gc.collect()

In [None]:
#Plot the top 20 important features to the model and its average accuracy.
print(f"Average LGBM acc = {sum(total_lgbm_acc)/len(total_lgbm_acc)}")
lgbm.plot_importance(lgbm_model, max_num_features=20)

In [None]:
#No parameters LGBM model.
lgbmn_model = lgbm.LGBMRegressor(objective='binary')

#Good hyperparameters to tune: num_leaves, min_data_in_leaf, max_depth, learning rate
total_lgbmn_acc = []

Kfolder = KFold(n_splits=folds, random_state=seed, shuffle=True)
for fold, (train_index, test_index) in enumerate(Kfolder.split(training, training.target)):
    print(f'Fold: {fold}')
    x_train = training.loc[train_index, features]
    y_train = training.loc[train_index, 'target']
    x_test = training.loc[test_index, features]
    y_test = training.loc[test_index, 'target']
    lgbmn_model.fit(x_train, y_train, 
                   eval_set=[(x_test, y_test)],
                   callbacks=[lgbm.log_evaluation(period=20)]
                  )
    oof_predict = lgbmn_model.predict(x_test)
    acc = amex_metric_np(y_test.values, oof_predict)
    total_lgbmn_acc.append(acc)
    print(f"Kaggle metric: {acc}\n")
    
    del x_train, y_train, x_test, y_test
    gc.collect()

In [None]:
print(f"Average LGBM acc = {sum(total_lgbmn_acc)/len(total_lgbmn_acc)}")
lgbm.plot_importance(lgbmn_model, max_num_features=20)

In [None]:
#Our 'optimised' XGB model.
#One annoying thing is I think the XGB.fit() method is broken. Could not get it to work but xgb.train works just fine.
xgb_parms = { 
    'max_depth':4, 
    'learning_rate':0.15242435183974648, 
    'eval_metric':'logloss',
    'objective':'binary:logistic',
    'tree_method':'hist',
    'predictor':'cpu_predictor',
    'random_state':seed,
    'nthread': 15,
    'reg_lambda': 0.07285728814355859
}
#Most important parameters:
# How many subtrees, maximum tree depth, learning rate, the L1 and L2, 

total_xgb_acc = []
Kfolder = KFold(n_splits=folds, random_state=seed, shuffle=True)
for fold, (train_index, test_index) in enumerate(Kfolder.split(training, training.target)):
    print(f'Fold: {fold}')
    x_train = training.loc[train_index, features]
    y_train = training.loc[train_index, 'target']
    x_test = training.loc[test_index, features]
    y_test = training.loc[test_index, 'target']
    dtrain = xgb.DMatrix(data=x_train, label=y_train)
    dtest = xgb.DMatrix(data=x_test, label=y_test)
    xgb_model = xgb.train(xgb_parms, 
                dtrain=dtrain,
                evals=[(dtrain,'train'),(dtest,'test')],
                num_boost_round=300,
                early_stopping_rounds=25,
                verbose_eval=50)
    #Save model for future use in case it crashes
    xgb_model.save_model(f'XGB_fold{fold}.xgb')
    oof_predict = xgb_model.predict(dtest)
    acc = amex_metric_np(y_test.values, oof_predict)
    total_xgb_acc.append(acc)
    print(f"Kaggle metric: {acc}\n")
    del dtrain, dtest, x_train, y_train, x_test, y_test
    gc.collect()

In [None]:
print(f"Average XGBoost acc = {sum(total_xgb_acc)/len(total_xgb_acc)}")
xgb.plot_importance(xgb_model, max_num_features=20)

In [None]:
#Default XGB Model.
xgb_parms = { 
    'eval_metric':'logloss',
    'objective':'binary:logistic',
    'tree_method':'hist',
    'predictor':'cpu_predictor',
    'random_state':seed,
    'nthread': 15,
}
#Most important parameters:
# How many subtrees, maximum tree depth, learning rate, the L1 and L2, 

total_xgbn_acc = []

Kfolder = KFold(n_splits=folds, random_state=seed, shuffle=True)
for fold, (train_index, test_index) in enumerate(Kfolder.split(training, training.target)):
    print(f'Fold: {fold}')
    x_train = training.loc[train_index, features]
    y_train = training.loc[train_index, 'target']
    x_test = training.loc[test_index, features]
    y_test = training.loc[test_index, 'target']
    dtrain = xgb.DMatrix(data=x_train, label=y_train)
    dtest = xgb.DMatrix(data=x_test, label=y_test)
    xgbn_model = xgb.train(xgb_parms, 
                dtrain=dtrain,
                evals=[(dtrain,'train'),(dtest,'test')],
                num_boost_round=300,
                early_stopping_rounds=25,
                verbose_eval=50) 
    oof_predict = xgbn_model.predict(dtest)
    acc = amex_metric_np(y_test.values, oof_predict)
    total_xgbn_acc.append(acc)
    print(f"Kaggle metric: {acc}\n")
    del dtrain, dtest, x_train, y_train, x_test, y_test
    gc.collect()

In [None]:
print(f"Average XGBoost acc = {sum(total_xgbn_acc)/len(total_xgbn_acc)}")
xgb.plot_importance(xgbn_model, max_num_features=20)

In [None]:
#The accuracy on the training data with only the categorical features removed.
lbgm_sum = 0.7885413245585315 + 0.7948476986706113 + 0.7859216651728327 + 0.7908956765698341 + 0.7878287049822715
xgboost_sum = 0.7905795204886231 + 0.7957968257272143 + 0.7905462609705127 + 0.7921078535024547 + 0.7909747870189094
print("Lbgm avg Kaggle score on data with only dropped cat features: {:.3f}".format(lbgm_sum/5))
print("xgboost avg Kaggle score on data with only dropped cat features: {:.3f}".format(xgboost_sum/5))

In [None]:
#A plot of the data with only categories removed vs post processed with most NaN removed and imputed.
lgbm_scores = [lbgm_sum/5, sum(total_lgbm_acc)/len(total_lgbm_acc)]
xgb_scores = [xgboost_sum/5, sum(total_xgb_acc)/len(total_xgb_acc)]
lgbm_y = ["LGBM", "LGBM_modified_data"]
xgb_y = ["XGB", "XGB_modified_data"]

fig, (ax1, ax2) = plt.subplots(1, 2)
fig.suptitle("Boosted Tree Algorithms | Raw data vs Modified data model acc")
a = ax1.bar(lgbm_y, lgbm_scores)

ax1.bar_label(a)
b = ax2.bar(xgb_y, xgb_scores)

ax2.bar_label(b)

In [None]:
#Plot of optimised vs Default, not Naive
old_scores = [0.5399766712400272, 0.5631376385226396, 0.5137486641058764, 0.5407231366405525, 0.7835906474680508, 0.7808382713687521, 0.7853680973318556, 0.7789781451577884]
y_vals = ["Optimised-logistic", "Default-logistic", "Optimised-SVM", "Default-SVM", "Optimised-LGBM", "Default-LGBM", "Optimised-XGB", "Default-XGB"]

In [None]:
#Plot into a bargraph to compare optimised vs default to see if it made a difference.
plt.figure(figsize =(10, 20))
a = plt.barh(y_vals, old_scores)
plt.xlabel("Kaggle Score")
plt.ylabel("Model")
plt.title("Optimised vs Naive Model")
plt.bar_label(a)
plt.show()

In [None]:
#To run the following test boxes, you need to first run the first 9 boxes, 
# then you can start from here to do validation on test data!

In [15]:
#Training not needed anymore as we are moving on to the test data
del training
gc.collect()

NameError: name 'training' is not defined

In [16]:
#We are going to split the test data into smaller groups as its much larger than the train data and doesn't fit into RAM nicely.
groups = 4
test = get_x_data('test.parquet')[['customer_ID']]
#We want to get the unique customer ID's and flatten it to a list.
customers = test[['customer_ID']].drop_duplicates().sort_index().values.flatten()
group_pop = len(customers) // groups #How big are the groups of people we are working with?
rows = []
for i in range(groups):
    if i == groups - 1: 
        customer_group_pop = customers[i * group_pop:] #If we are on the final group of people, its the left overs so get all from end of last group to end
    else: 
        customer_group_pop = customers[i * group_pop : (i + 1) * group_pop] #Get the group pop of people 
    s = test.loc[test.customer_ID.isin(customer_group_pop)].shape[0]
    #Get Dataframes of our groups' unique ID's
    rows.append(s)

shape of data: (11363762, 160)


In [23]:
skip_rows = 0
customers_predicted = 0
#Load in our model from Memory as generally running this separate from training / after restarting to clean RAM.
test_predict = [] #Our predict values for each individual
my_model = lgbm.Booster(model_file="LGBM_v3.3.2_fold0.lgbm")
for k in range(groups):
    test = get_x_data('test.parquet')
    test = test.iloc[skip_rows:skip_rows + rows[k]] #Get the test data for our first group of people
    skip_rows += rows[k] #Increment our skip row as we will jump past this next loop
    test = aggregate_data(test) #Do our magic to the data
    if k == groups - 1: 
        test = test.loc[customers[customers_predicted:]] #If we are the final group, get from last customer to the end of data
    else: 
        test = test.loc[customers[customers_predicted : customers_predicted + group_pop]] #Get from where we left off to the start of the next group of people
    customers_predicted += group_pop #Increment our starting position.
    x_test = test[features] #our test data is consists of only the features which we are using.
    predict = my_model.predict(x_test)
    test_predict.append(predict)

    del x_test, test
    gc.collect()

shape of data: (11363762, 160)
(231155, 477)
shape of data: (11363762, 160)
(231155, 477)
shape of data: (11363762, 160)
(231155, 477)
shape of data: (11363762, 160)
(231156, 477)


In [24]:
test_predict = np.concatenate(test_predict) #We wish to join all our prediction into one list
final_to_csv = pd.DataFrame(index=customers,data={'prediction':test_predict}) #Create our pseudo CSV
temp = pd.read_csv('sample_submission.csv')[['customer_ID']] #Open our template handin
temp['customer_ID_hash'] = temp['customer_ID'].apply(lambda x: int(x[-16:], 16) ).astype('int64') #Hash those customer_ID's with how we've done to space reduce so we can map our predictions
temp = temp.set_index('customer_ID_hash') #Set this temporary hash as the index.
temp = temp.merge(final_to_csv[['prediction']], left_index=True, right_index=True, how='left') #Merge on the common Customer_IDs
temp = temp.reset_index(drop=True)#Reset our index to the original customer ID
temp.to_csv(f'submission_lgbm.csv',index=False) #Write to disk.