In [1]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from baseline_model import base_model_lgbm

import gc

In [None]:
# Load the raw train/test frames and the training labels from the pickled
# artifacts under Data/.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that were produced by this project.
train_data = pd.read_pickle('Data/train_data.pkl')
test_data = pd.read_pickle('Data/test_data.pkl')
train_labels = pd.read_pickle('Data/train_labels.pkl')
# 'Data/train.pkl' is intentionally NOT read here: that file is written by the
# preprocessing step later in this notebook, so reading it at this point fails
# on a fresh run, and the loaded value was overwritten before any use.

# Columns treated as categorical (cast to str and one-hot encoded downstream).
categorical_features = [
    'B_30', 'B_31', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]

# Displayed as the cell output: (rows, columns) of each split.
train_data.shape, test_data.shape

In [None]:
# Cast the categorical columns to str in both splits so they are handled as
# string labels by the categorical branch of the preprocessor.
# NOTE(review): astype(str) presumably turns missing values into the literal
# string 'nan', in which case the constant-fill imputer downstream never
# fires -- confirm this is intended.
for frame in (train_data, test_data):
    frame[categorical_features] = frame[categorical_features].astype(str)

# Every column that is still stored as float16 is treated as numerical.
is_float16 = train_data.dtypes == 'float16'
numerical_features = train_data.columns[is_float16]

In [None]:
# Inspect the levels of every categorical feature in both splits.
# The unique values are computed once per split and reused by every pass.
train_levels = {feature: train_data[feature].unique() for feature in categorical_features}
test_levels = {feature: test_data[feature].unique() for feature in categorical_features}

# Levels present in the training split.
for feature in categorical_features:
    print(feature, train_levels[feature])

# Levels present in the test split.
for feature in categorical_features:
    print(feature, test_levels[feature])

# Levels that occur in train but never in test.
for feature in categorical_features:
    print(feature, set(train_levels[feature]).difference(test_levels[feature]))

# Levels that occur in test but never in train. This is the direction that
# matters for encoding: a OneHotEncoder left at its default
# handle_unknown='error' raises in transform() on a level it did not see
# during fit.
for feature in categorical_features:
    print(feature, 'unseen in train:', set(test_levels[feature]).difference(train_levels[feature]))

In [None]:
# Numerical branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ('Imputer', SimpleImputer(strategy='median')),
    ('Scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the constant 'missing', then one-hot
# encode while dropping the first level of every feature.
# NOTE(review): handle_unknown is left at its default, so transform() is
# expected to raise on levels that were not present at fit time -- confirm
# the test split has none.
categorical_steps = [
    ('Imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('OneHotEncoder', OneHotEncoder(drop='first')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its branch. No `remainder` is given, so the
# ColumnTransformer default applies to columns in neither list.
column_branches = [
    ('Numerical', numeric_transformer, numerical_features),
    ('Categorical', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [None]:
# Fit the preprocessor on the training split and wrap the transformed matrix
# in a DataFrame that keeps the original row index. No `columns` argument is
# passed, so the column labels are the default positional integers.
train_processed = pd.DataFrame(
    data=preprocessor.fit_transform(train_data),
    index=train_data.index,
)

# Displayed as the cell output: (rows, columns) after preprocessing.
train_processed.shape

In [None]:
# Keep only the final row of each customer_ID group and cache the result on
# disk so later sessions can skip the preprocessing above.
# NOTE(review): 'customer_ID' is presumably the name of the index carried over
# from train_data, and rows are presumably ordered oldest-to-newest within a
# customer -- confirm both.
train = train_processed.groupby(by='customer_ID').tail(n=1)
train.to_pickle('Data/train.pkl')

In [None]:
# Drop the large intermediate frames that are no longer needed, then ask the
# garbage collector to run (its return value is shown as the cell output).
del train_data
del train_processed
gc.collect()

In [2]:
# Reload the cached, preprocessed training frame and its labels from disk.
train = pd.read_pickle('Data/train.pkl')
train_labels = pd.read_pickle('Data/train_labels.pkl')

In [3]:
# Train the baseline LightGBM models on the cached training frame.
# base_model_lgbm is a project helper; it returns four objects that are
# unpacked below (models, scores, importances, results table).
(
    models,
    df_scores,
    importances,
    df_results,
) = base_model_lgbm(train, train_labels)

Fold: 0 - seed: 0
[LightGBM] [Info] Number of positive: 59414, number of negative: 170042
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40080
[LightGBM] [Info] Number of data points in the train set: 229456, number of used features: 210
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258934 -> initscore=-1.051516
[LightGBM] [Info] Start training from score -1.051516
Training until validation scores don't improve for 200 rounds
[100]	training's binary_logloss: 0.408692	training's AMEX: 0.74684	valid_1's binary_logloss: 0.410069	valid_1's AMEX: 0.738791
[200]	training's binary_logloss: 0.333025	training's AMEX: 0.753374	valid_1's binary_logloss: 0.335438	valid_1's AMEX: 0.745024
[300]	training's binary_logloss: 0.292166	training's AMEX: 0.758513	valid_1's binary_logloss: 0.295431	valid_1's AMEX: 0.749744
[400]	training's binary_logloss: 0.268368	training's AMEX: 0.764071	valid_1's binary_logloss: 0.272422	valid_1's AMEX: 0.754394
[500]	training'

In [4]:
# Display the results table returned by base_model_lgbm; the rendered output
# has one column per seed plus 'fold_mean' and one row per fold plus
# 'seed_mean'.
df_results

seed,0,1,fold_mean
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.771198,0.770856,0.771027
1,0.773418,0.773547,0.773483
seed_mean,0.772308,0.772201,0.772255


In [None]:
# Apply the already-fitted preprocessor to the test split (transform only, no
# refit) and keep the original row index on the resulting frame.
# NOTE(review): this needs `preprocessor` to have been fitted in the current
# kernel session -- it is not restored from disk anywhere in this notebook.
test_processed = pd.DataFrame(
    data=preprocessor.transform(test_data),
    index=test_data.index,
)

# Keep only the final row of each customer_ID group, mirroring the training
# split.
test = test_processed.groupby(by='customer_ID').tail(n=1)

In [None]:
# Score the test frame with every trained model, in the order the models are
# stored in `models`.
# NOTE(review): whether .predict() yields probabilities or hard class labels
# depends on the model type returned by base_model_lgbm -- confirm.
prediction_list = [fitted_model.predict(test) for fitted_model in models.values()]

# One column per model, one row per test row, indexed like `test`.
prediction_df = pd.DataFrame(prediction_list).T
prediction_df.index = test.index

In [None]:
# Average the per-model predictions row-wise and write them to disk.
# The averaged Series is given an explicit name so the CSV value column is
# headed 'prediction' rather than the default '0' of an unnamed Series.
prediction_df.mean(axis=1).rename('prediction').to_csv('Data/prediction.csv')