In [1]:
import warnings
warnings.filterwarnings("ignore")

from evaluation_metric import *
from baseline_model import *
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import gc

In [2]:
# Read the aggregated training features and index them by customer_ID, then
# reorder the labels so their rows follow the same index as the features.
train_data = pd.read_parquet('Data/train_data_aggV3.parquet').set_index('customer_ID')
train_labels = pd.read_pickle('Data/train_labels.pkl').loc[train_data.index]

(train_data.shape, train_labels.shape)

((458913, 637), (458913, 1))

In [3]:
# Remove `target`, `cid` and `S_2` so they are not passed to the model as
# features. Rebinding (instead of inplace=True) together with errors='ignore'
# keeps the cell re-runnable: a second execution finds the columns already
# gone and is a no-op rather than raising KeyError.
train_data = train_data.drop(columns=['target', 'cid', 'S_2'], errors='ignore')
train_data.dtypes.unique()

array([Float32Dtype(), dtype('int16'), dtype('float32'), dtype('int8'),
       Float64Dtype(), dtype('float64')], dtype=object)

In [4]:
# Group the columns by their current dtype. All four groups are resolved
# before any cast runs, so a cast cannot change which group a column lands in.
dtype_of_column = train_data.dtypes
Float32D_columns, Float64D_columns, int16_columns, int8_columns = (
    train_data.columns[dtype_of_column == source_dtype].values
    for source_dtype in ('Float32', 'Float64', 'int16', 'int8')
)

# Float32 / Float64 / int16 columns become float32; int8 columns become int32.
for selected_columns, target_dtype in (
    (Float32D_columns, 'float32'),
    (Float64D_columns, 'float32'),
    (int16_columns, 'float32'),
    (int8_columns, 'int32'),
):
    train_data[selected_columns] = train_data[selected_columns].astype(target_dtype)

In [5]:
# List the distinct dtypes now present in train_data (sanity check on the casts).
train_data.dtypes.unique()

array([dtype('float32'), dtype('int32'), dtype('float64')], dtype=object)

In [6]:
# Fit the baseline LightGBM models on the prepared features and labels.
# NOTE(review): the meaning of the four returned values is inferred from their
# names (fitted models, feature importances, results table, CV score) --
# confirm against base_model_lgbm's definition.
models, importances, df_results, score_cv = base_model_lgbm(train_data, train_labels)

Fold: 0 - seed: 0
[LightGBM] [Info] Number of positive: 95062, number of negative: 272068
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 111695
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 629
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258933 -> initscore=-1.051523
[LightGBM] [Info] Start training from score -1.051523
Training until validation scores don't improve for 1500 rounds
[500]	valid_0's binary_logloss: 0.23137	valid_0's AMEX: 0.775454
[1000]	valid_0's binary_logloss: 0.22334	valid_0's AMEX: 0.785643
[1500]	valid_0's binary_logloss: 0.220837	valid_0's AMEX: 0.789515
[2000]	valid_0's binary_logloss: 0.219684	valid_0's AMEX: 0.790374
[2500]	valid_0's binary_logloss: 0.21911	valid_0's AMEX: 0.791341
[3000]	valid_0's binary_logloss: 0.218776	valid_0's AMEX: 0.791821
[3500]	valid_0's binary_logloss: 0.218527	valid_0's AMEX: 0.791893
[4000]	valid_0's binary_logloss: 0.218389	valid_0's AMEX: 0

KeyboardInterrupt: 

In [None]:
# Display the results object returned by base_model_lgbm.
df_results

In [None]:
# Persist the trained models and their feature importances. The context
# managers guarantee each file is flushed and closed even if pickling raises,
# instead of leaving the handle open until garbage collection.
with open("Models/models_baseline_7.pkl", "wb") as models_file:
    pickle.dump(models, models_file)
with open("Models/importances_baseline_7.pkl", "wb") as importances_file:
    pickle.dump(importances, importances_file)

In [None]:
def plot_importance(ii, features, PLOT_TOP_N = 50, figsize=(10, 10)):
    """Box-plot the distribution of importance values for the top features.

    Features are ranked by the median of their importance values, highest
    first, and the first ``PLOT_TOP_N`` are drawn as horizontal box plots on a
    log-scaled x axis.

    Parameters
    ----------
    ii : array-like, shape (n_rows, n_features)
        Importance values, one column per entry in ``features``. Each row is
        one set of importances (presumably one per fitted model -- confirm
        against the caller).
    features : sequence
        Column labels for ``ii``, in the same order as its columns.
    PLOT_TOP_N : int, default 50
        Number of top-ranked features to draw.
    figsize : tuple, default (10, 10)
        Figure size passed to ``plt.subplots``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Build the frame from the argument, not from a notebook-level variable,
    # so the function plots whatever the caller passes in.
    importance_df = pd.DataFrame(data=ii, columns=features)
    ranked_features = importance_df.median(axis=0).sort_values(ascending=False).index
    top_importance_df = importance_df.loc[:, ranked_features[:PLOT_TOP_N]]

    _, ax = plt.subplots(figsize=figsize)
    ax.grid()
    ax.set_xscale('log')
    ax.set_ylabel('Feature')
    ax.set_xlabel('Importance')
    sns.boxplot(data=top_importance_df, orient='h', ax=ax)
    plt.show()

    
# Plot the 100 top-ranked features on a taller (10 x 20) figure.
importance_matrix = np.array(importances)
plot_importance(importance_matrix, train_data.columns, PLOT_TOP_N=100, figsize=(10, 20))

In [None]:
# Reload the saved models from disk and score the training rows with each one.
models = pd.read_pickle('Models/models_baseline_7.pkl')
prediction_list = [models[model_key].predict(train_data) for model_key in models.keys()]

# Transpose so each model's predictions form a column, then label the rows
# with train_data's index.
prediction_df = pd.DataFrame(prediction_list).T.set_axis(train_data.index, axis=0)

In [None]:
# Average the per-model predictions row-wise to get one score per training row.
prediction_train = prediction_df.mean(axis='columns')

In [None]:
from sklearn.calibration import calibration_curve

# Split the averaged training predictions into 10 bins and, per bin, compare
# the mean predicted value with the observed fraction of positives.
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_true=train_labels,
    y_prob=prediction_train,
    n_bins=10,
)

In [None]:
# Calibration plot: binned mean prediction against observed positive rate,
# with a dashed grey diagonal from (0, 0) to (1, 1) as the reference line.
fig, ax = plt.subplots(1, figsize=(12, 6))
ax.plot(mean_predicted_value, fraction_of_positives, 's-')
ax.plot([0, 1], [0, 1], '--', color='gray')

# Remove the spines and hide the tick marks on both axes.
sns.despine(left=True, bottom=True)
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticks_position('none')
ax.set_title("Calibration Curve", fontsize=20);

In [None]:
from sklearn.isotonic import IsotonicRegression

# Fit a monotone map from the averaged raw score to the observed label.
# out_of_bounds='clip' makes scores outside the range seen during fit take the
# calibrated value of the nearest boundary; the default ('nan') would return
# NaN for them at predict time.
calibr = IsotonicRegression(out_of_bounds='clip')
calibr.fit(prediction_train, train_labels.target)

In [None]:
# Calibrate the training scores with the fitted isotonic model and recompute
# the 10-bin calibration curve on the calibrated values.
prediction_cali_train = calibr.predict(prediction_train)
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_true=train_labels,
    y_prob=prediction_cali_train,
    n_bins=10,
)

In [None]:
# Same calibration plot as before, drawn from the most recently computed
# curve values; the dashed grey diagonal is the reference line.
fig, ax = plt.subplots(1, figsize=(12, 6))
ax.plot(mean_predicted_value, fraction_of_positives, 's-')
ax.plot([0, 1], [0, 1], '--', color='gray')

# Remove the spines and hide the tick marks on both axes.
sns.despine(left=True, bottom=True)
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticks_position('none')
ax.set_title("Calibration Curve", fontsize=20);

In [None]:
# Load the pickled test-set features and show their shape.
# NOTE(review): the training frame came from a different file and had columns
# dropped and dtypes cast before fitting -- confirm this frame's columns and
# dtypes match what the models were trained on.
test_data = pd.read_pickle('Data/test_agg_mo.pkl')
test_data.shape

In [None]:
# Score the test rows with every loaded model.
prediction_list = [models[model_key].predict(test_data) for model_key in models.keys()]

# Transpose so each model's predictions form a column, then label the rows
# with test_data's index.
prediction_df = pd.DataFrame(prediction_list).T.set_axis(test_data.index, axis=0)

In [None]:
# Average the per-model predictions row-wise to get one score per test row.
prediction_test = prediction_df.mean(axis='columns')

In [None]:
# Pass the averaged test scores through the fitted isotonic calibrator.
prediction_test_cali = calibr.predict(prediction_test)

In [None]:
# Wrap the calibrated scores in a single-column frame ('prediction') that
# shares test_data's index.
prediction_test_cali_df = pd.Series(
    prediction_test_cali, index=test_data.index, name='prediction'
).to_frame()


In [None]:
# Collect the index labels of rows whose calibrated prediction is missing.
missing_prediction = prediction_test_cali_df['prediction'].isnull()
indx = prediction_test_cali_df.index[missing_prediction].to_list()

In [None]:
# Show the uncalibrated ensemble mean for the rows whose calibrated value is missing.
prediction_df.loc[indx].mean(axis='columns')

In [None]:
# Fall back to the uncalibrated ensemble mean wherever the calibrated
# prediction is missing. The write must be a single .loc[rows, column] call:
# chained indexing (.loc[rows]['prediction'] = ...) assigns to a temporary
# copy and leaves the frame unchanged.
if indx:  # nothing to patch when no prediction is missing
    prediction_test_cali_df.loc[indx, 'prediction'] = prediction_df.mean(axis=1).loc[indx].values

In [None]:
# Replace any value that is still NaN with 0.999, modifying the frame in place.
# NOTE(review): the reason for choosing 0.999 is not documented here -- confirm
# it is the intended fallback score.
prediction_test_cali_df.fillna(0.999, inplace=True)

In [None]:
# Write the calibrated test predictions (index plus 'prediction' column) to CSV.
prediction_test_cali_df.to_csv('Output/b7_calibrated.csv')

In [None]:
# Write the uncalibrated ensemble mean to CSV under the column header 'prediction'.
raw_ensemble_mean = prediction_df.mean(axis='columns')
raw_ensemble_mean.to_csv('Output/b7.csv', header=['prediction'])

In [None]:
# Look up the per-model test predictions for one specific index label
# (presumably a customer_ID -- confirm against test_data's index).
prediction_df.loc['639c24b93e9cd49257a59e5b31abf955f2339d536771983c6acddc50050f1945']