In [3]:
# Silence every Python warning for the rest of the session. Note that this
# also hides deprecation and copy warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Show which files are available in the local input directory.
print(os.listdir("./input"))

import lightgbm as lgb
from bayes_opt import BayesianOptimization

# NOTE(review): no explicit KFold import exists in this notebook, so the star
# import below is presumably what brings KFold into scope for the CV cell.
from sklearn.model_selection import *
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import mode, skew, kurtosis, entropy
from sklearn.ensemble import ExtraTreesRegressor
# Duplicate of the mean_squared_error import above (harmless).
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

import dask.dataframe as dd
from dask.multiprocessing import get

from tqdm import tqdm, tqdm_notebook
# Hook tqdm into pandas.
# NOTE(review): the tqdm_notebook class is passed positionally here; confirm
# that the installed tqdm version accepts this call form.
tqdm.pandas(tqdm_notebook)

['.DS_Store', 'baseline_sub_lag_25_all_df.csv', 'baseline_sub_lag_29_non_zero.csv', 'baseline_sub_lag_29_zerokill.csv', 'feature_report.csv', 'sample_submission.csv', 'test.csv', 'test_leak_org.csv', 'test_leak_paradox.csv', 'test_leak_paradox_and_new_1.csv', 'test_leak_paradox_and_new_and_extreme.csv', 'test_leak_paradox_and_newlast_2_6.csv', 'test_leak_paradox_and_newlast_v8_6new.csv', 'test_leak_paradox_extend.csv', 'train.csv', 'train_leak_org.csv', 'train_leak_paradox.csv', 'train_leak_paradox_and_new_1.csv', 'train_leak_paradox_and_new_and_extreme.csv', 'train_leak_paradox_and_newlast_2_6.csv', 'train_leak_paradox_and_newlast_v8_6new.csv', 'train_leak_paradox_extend.csv']


In [4]:
# Load the raw test and training tables from the local input directory.
test = pd.read_csv("./input/test.csv")
train = pd.read_csv("./input/train.csv")

# Every training column except the row identifier and the label.
transact_cols = [col for col in train.columns if col not in ("ID", "target")]

In [5]:
# Load the precomputed leak tables and turn every NaN into 0.0 in one chained
# step per table, so that a value of 0 marks "no leaked value" downstream.
train_leak = (
    pd.read_csv("./input/train_leak_paradox_and_new_and_extreme.csv")
    .replace(np.nan, 0.0)
)
test_leak = (
    pd.read_csv("./input/test_leak_paradox_and_new_and_extreme.csv")
    .replace(np.nan, 0.0)
)

In [6]:
def rewrite_compiled_leak(leak_df, lag):
    """Rebuild the ``compiled_leak`` column from the first ``lag`` leak columns.

    For every row, ``compiled_leak`` becomes the value of the first column
    among ``leaked_target_0`` ... ``leaked_target_{lag-1}`` that is non-zero,
    or ``0.0`` when all of them are zero.

    Parameters
    ----------
    leak_df : pandas.DataFrame
        Table holding the ``leaked_target_<i>`` columns. It is modified in
        place: any existing ``compiled_leak`` column is overwritten.
    lag : int
        Number of ``leaked_target_<i>`` columns to consider, starting at 0.

    Returns
    -------
    pandas.DataFrame
        The same ``leak_df`` object, returned for convenient re-assignment.

    Raises
    ------
    KeyError
        If a required ``leaked_target_<i>`` column is missing while some rows
        are still unfilled.
    """
    # Start from float zeros: the leak values written below are floats, and
    # seeding the column with an integer 0 relies on silent int -> float
    # upcasting during assignment, which newer pandas versions deprecate.
    leak_df["compiled_leak"] = 0.0
    for i in range(lag):
        col = f"leaked_target_{i}"
        # Rows that have not received a non-zero leak from an earlier column.
        unfilled = leak_df["compiled_leak"] == 0
        if not unfilled.any():
            # Every row is already filled; later columns cannot change anything.
            break
        leak_df.loc[unfilled, "compiled_leak"] = leak_df.loc[unfilled, col]
    return leak_df

In [7]:
# Names of all per-lag leak columns present in the training leak table.
leaky_cols = [name for name in train_leak.columns if 'leaked_target_' in name]

In [19]:
# Number of leaked_target_<i> columns (i = 0 .. best_lag - 1) that are folded
# into compiled_leak.
best_lag = 36

# Recompute compiled_leak for the training table first, then the test table.
train_leak, test_leak = (
    rewrite_compiled_leak(table, best_lag) for table in (train_leak, test_leak)
)
train_leak.head(5)

Unnamed: 0,ID,leaked_target_0,leaked_target_1,leaked_target_2,leaked_target_3,leaked_target_4,leaked_target_5,leaked_target_6,leaked_target_7,leaked_target_8,...,leaked_target_29,leaked_target_30,leaked_target_31,leaked_target_32,leaked_target_33,leaked_target_34,leaked_target_35,leaked_target_36,leaked_target_37,compiled_leak
0,000d6aaf2,38000000.0,38000000.0,38000000.0,0.0,38000000.0,0.0,38000000.0,0.0,0.0,...,38000000.0,0.0,38000000.0,0.0,38000000.0,38000000.0,0.0,0.0,38000000.0,38000000.0
1,000fbd867,600000.0,0.0,0.0,0.0,0.0,0.0,600000.0,0.0,600000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,600000.0
2,0027d6b71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0028cbf45,0.0,0.0,0.0,2000000.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2000000.0,0.0,0.0,0.0,0.0,0.0,0.0,2000000.0
4,002a68644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Work on copies of the raw tables so `train` / `test` stay untouched.
new_train = train.copy()
new_test = test.copy()

# Attach the compiled leak and its log1p transform to each copy. The leak
# column is assigned as a Series, so rows are matched on the DataFrame index.
for frame, leak_table in ((new_train, train_leak), (new_test, test_leak)):
    frame["leak"] = leak_table['compiled_leak']
    frame['log_leak'] = np.log1p(frame["leak"])

In [21]:
# Per-feature report; keep only the features whose reported rmse is at most
# 0.7925, together with those rmse values.
report = pd.read_csv("./input/feature_report.csv")
is_good = report['rmse'] <= 0.7925
good_features = report.loc[is_good, "feature"].values
rmses = report.loc[is_good, 'rmse'].values

In [22]:
# The model is trained on the log1p-transformed target.
target = np.log1p(new_train['target'])

folds = KFold(n_splits=5, shuffle=True, random_state=1)

# Names of the row-wise aggregate columns created below.
row_aggregates = ['log_of_mean', 'mean_of_log', 'log_of_median',
                  'nb_nans', 'the_sum', 'the_std', 'the_kur']

# Raw value columns the aggregates are computed over. Derived columns
# (including the 'predictions' column a later cell adds) are excluded, so
# re-running this cell does not feed them back into the statistics.
not_raw = set(['ID', 'leak', 'log_leak', 'target', 'predictions'] + row_aggregates)
raw_features = [f for f in new_train if f not in not_raw]


def add_row_aggregates(df, cols):
    """Add row-wise summary statistics of ``cols`` to ``df`` in place.

    Zeros are first replaced by NaN across the whole frame (so they are
    ignored by the NaN-skipping reductions and counted by ``nb_nans``), then
    these columns are added:

    * ``log_of_mean``   - log1p of the row mean
    * ``mean_of_log``   - row mean of the log1p values
    * ``log_of_median`` - log1p of the row median
    * ``nb_nans``       - number of NaN (originally zero or missing) entries
    * ``the_sum``       - log1p of the row sum
    * ``the_std``       - row standard deviation
    * ``the_kur``       - row kurtosis

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; must contain every column in ``cols``.
    cols : list of str
        Columns the statistics are computed over.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    df.replace(0, np.nan, inplace=True)
    # Slice once. Zeros are already NaN here, so no further replace is needed
    # (log1p only yields 0 for an input of 0).
    values = df[cols]
    df['log_of_mean'] = np.log1p(values.mean(axis=1))
    df['mean_of_log'] = np.log1p(values).mean(axis=1)
    df['log_of_median'] = np.log1p(values.median(axis=1))
    df['nb_nans'] = values.isnull().sum(axis=1)
    df['the_sum'] = np.log1p(values.sum(axis=1))
    df['the_std'] = values.std(axis=1)
    df['the_kur'] = values.kurtosis(axis=1)
    return df


# Same feature engineering for train and test, driven by one column list.
add_row_aggregates(new_train, raw_features)
add_row_aggregates(new_test, raw_features)

# Final model inputs: the selected raw features, the log leak and the row
# aggregates (order kept as before).
features = good_features.tolist()
features = features + ['log_leak'] + row_aggregates

In [23]:
# Wrap the model features and the log1p target in a LightGBM Dataset.
# free_raw_data=False keeps the source frame reachable as `dtrain.data`,
# which the out-of-fold prediction below reads.
dtrain = lgb.Dataset(
    data=new_train[features],
    label=target,
    free_raw_data=False,
)
dtrain.construct()

# Hyper-parameters shared by every fold.
# NOTE(review): LightGBM documents that bagging (`subsample`) only takes
# effect when a bagging frequency > 0 is set; none is set here - confirm intent.
lgb_params = {
    'objective': 'regression',
    'num_leaves': int(28.6946),
    'subsample': 0.3260,
    'colsample_bytree': 0.9960,
    'min_split_gain': 0.0486,
    'reg_alpha': 0.7855,
    'reg_lambda': 0.6919,
    'min_child_weight': 0.7665,
    'verbose': -1,
    'seed': 3,
    'boosting_type': 'gbdt',
    'max_depth': -1,
    'learning_rate': 0.0613,
    'metric': 'l2',
}

# Test predictions are accumulated as the mean over folds (log1p space);
# out-of-fold predictions are written into their validation positions.
new_test['target'] = 0
oof_preds = np.zeros(new_train.shape[0])

for trn_idx, val_idx in folds.split(new_train):
    # Each fold gets its own copy of the parameter dict.
    # NOTE(review): `early_stopping_rounds` / `verbose_eval` as train()
    # keywords depend on the installed LightGBM version - confirm.
    clf = lgb.train(
        params=dict(lgb_params),
        train_set=dtrain.subset(trn_idx),
        valid_sets=dtrain.subset(val_idx),
        num_boost_round=10000,
        early_stopping_rounds=100,
        verbose_eval=0,
    )

    oof_preds[val_idx] = clf.predict(dtrain.data.iloc[val_idx])
    fold_test_pred = clf.predict(new_test[features])
    new_test['target'] += fold_test_pred / folds.n_splits
    # Optional per-fold RMSE:
    # print(mean_squared_error(target.iloc[val_idx], oof_preds[val_idx]) ** .5)

# Rows with a non-null leak get log1p(leak) instead of the model prediction.
has_leak = new_train['leak'].notnull()
new_train['predictions'] = oof_preds
new_train.loc[has_leak, 'predictions'] = np.log1p(new_train.loc[has_leak, 'leak'])

oof_rmse = mean_squared_error(target, oof_preds) ** .5
print('OOF SCORE : %9.6f' % oof_rmse)
leak_rmse = mean_squared_error(target, new_train['predictions']) ** .5
print('OOF SCORE with LEAK : %9.6f' % leak_rmse)

OOF SCORE :  0.516708
OOF SCORE with LEAK :  0.510376


In [24]:
# Bring the fold-averaged test predictions back from log1p space to the
# original target scale. NOTE: re-running this cell applies expm1 again.
log_scale_pred = new_test['target'].values
new_test['target'] = np.expm1(log_scale_pred)
new_test.head(11)

Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,leak,log_leak,log_of_mean,mean_of_log,log_of_median,nb_nans,the_sum,the_std,the_kur,target
0,000137c73,,,,,,,,,,...,,,17.159699,14.608343,15.428193,4913,21.516408,112156200.0,63.987338,1804613.0
1,00021489f,,,,,,,,,,...,,,15.854051,13.910498,13.983117,4979,18.338957,10213670.0,1.32253,1794844.0
2,0004d7953,,,,,,,,,,...,,,15.59572,14.059439,14.508658,4892,20.19084,11525890.0,19.21089,1928073.0
3,00056a333,,,,,,,,,,...,,,16.507243,15.878736,16.198495,4871,21.294735,13984620.0,10.568926,3847431.0
4,00056d8eb,,,,,,,,,,...,,,16.626928,14.869434,15.036607,4983,18.706369,36913640.0,7.851651,1077397.0
5,0005fc190,,,,,,,,,,...,,,16.238255,15.286862,15.753069,4969,19.329298,13522890.0,0.916292,2104279.0
6,000787e86,,,,,,,,,,...,,,16.688396,13.88565,14.607098,4966,19.907272,30246050.0,4.019249,1757611.0
7,0008510a0,,,,,,,,,,...,,,16.302419,15.266272,15.332173,4984,18.248329,16637160.0,1.843609,1736364.0
8,000895faf,,,24617120.0,,,,,,,...,,,16.25726,14.830404,15.456254,4963,19.589464,15894700.0,4.704776,3822482.0
9,000986fba,,,,,,,,,12675000.0,...,,,15.701677,14.765847,15.053885,4843,20.698889,10194390.0,20.910049,1371428.0


In [None]:
#sub = test[["ID"]]
#sub["target"] = test_leak["compiled_leak"]
#sub.loc[sub["target"] > 0, "target"] = np.expm1(oof_preds[len(train):])
#sub.loc[sub["target"] == 0, "target"] = new_test['target'].values
#sub.head(12)

In [34]:
# Build the submission: use the compiled leak where one exists, otherwise the
# model prediction.
# Take an explicit copy so the column assignments below act on an independent
# frame rather than on a slice of `test` (the chained-assignment pattern that
# pandas flags with SettingWithCopyWarning).
sub = test[["ID"]].copy()
sub["target"] = test_leak["compiled_leak"]

# Rows without a leak (compiled_leak == 0) fall back to the model prediction.
no_leak = sub["target"] == 0
sub.loc[no_leak, "target"] = new_test.loc[no_leak, 'target'].values
sub.head(12)

Unnamed: 0,ID,target
0,000137c73,1804613.0
1,00021489f,1794844.0
2,0004d7953,1928073.0
3,00056a333,3847431.0
4,00056d8eb,1077397.0
5,0005fc190,2104279.0
6,000787e86,1757611.0
7,0008510a0,1736364.0
8,000895faf,3822482.0
9,000986fba,1371428.0


In [36]:
# Write the submission next to the notebook; the lag is part of the file name.
submission_path = f"lgb_org_train_{best_lag}.csv"
sub.to_csv(submission_path, index=False)
print(f"lgb_org_train_{best_lag}.csv saved")

lgb_org_train_36.csv saved


In [None]:
#train_res = train_leak[["ID"]+leaky_cols+['compiled_leak']].replace(0.0, np.nan)
#train_res.to_csv('train_leak.csv', index=False)
#print(f"train_leak.csv saved")

In [None]:
#test_res = test_leak[["ID"]+leaky_cols+['compiled_leak']].replace(0.0, np.nan)
#test_res.to_csv('test_leak.csv', index=False)
#print(f"test_leak.csv saved")