In [3]:
# Notebook setup: imports plus two session-wide side effects
# (warnings are silenced and the input directory listing is printed).
import warnings
# Installs an "ignore" filter for every warning category.
# NOTE(review): this also hides pandas/LightGBM deprecation warnings raised by later cells.
warnings.filterwarnings('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Show which files are available under ./input before anything is loaded.
print(os.listdir("./input"))

import lightgbm as lgb
from bayes_opt import BayesianOptimization

# NOTE(review): wildcard import; the `KFold` used by the training cell is only
# in scope because of this line.
from sklearn.model_selection import *
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import mode, skew, kurtosis, entropy
from sklearn.ensemble import ExtraTreesRegressor
# NOTE(review): `mean_squared_error` is already imported two lines up; this import is redundant.
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

import dask.dataframe as dd
from dask.multiprocessing import get

from tqdm import tqdm, tqdm_notebook
# NOTE(review): presumably registers tqdm's pandas progress helpers; confirm that
# passing `tqdm_notebook` positionally is what the installed tqdm version expects.
tqdm.pandas(tqdm_notebook)

['.DS_Store', 'feature_report_new_v1.csv', 'lstm_ae_df.csv', 'old', 'org_feature.csv', 'sample_submission.csv', 'test.csv', 'test_leak_081613_7854_114sets.csv', 'total.csv.zip', 'train.csv', 'train_leak_081613_114sets.csv']


In [4]:
# Raw train / test tables as shipped in ./input.
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

# Every column of `train` except the "ID" and "target" columns.
transact_cols = [col for col in train.columns if col not in ("ID", "target")]

In [5]:
# Pre-computed feature tables read from ./input.
org_feature = pd.read_csv("./input/org_feature.csv")

# The LSTM feature file carries a stray "Unnamed: 0" column; drop it on load.
lstm_feature = pd.read_csv("./input/lstm_ae_df.csv").drop(columns=["Unnamed: 0"])

In [6]:
# Each feature table is split positionally: the first len(train) rows are
# attached to `train`, the remaining rows to `test`.
# The column-wise concat does not change the number of rows in `train`
# (assuming the default RangeIndex from read_csv), so the split point is
# captured once up front.
n_train = len(train)
for extra in (org_feature, lstm_feature):
    train_part = extra.iloc[:n_train].reset_index(drop=True)
    test_part = extra.iloc[n_train:].reset_index(drop=True)
    train = pd.concat([train, train_part], axis=1)
    test = pd.concat([test, test_part], axis=1)

In [7]:
# Leak tables for train and test; missing values are turned into 0.0 so that
# "no leak" can be tested with a plain `== 0` comparison later on.
train_leak = pd.read_csv("./input/train_leak_081613_114sets.csv").replace(np.nan, 0.0)
test_leak = pd.read_csv("./input/test_leak_081613_7854_114sets.csv").replace(np.nan, 0.0)

In [8]:
def rewrite_compiled_leak(leak_df, lag):
    """Rebuild the ``compiled_leak`` column from the first ``lag`` leak columns.

    For every row, ``compiled_leak`` becomes the value of the first column
    among ``leaked_target_0`` ... ``leaked_target_{lag-1}`` that is non-zero,
    or ``0.0`` when all of them are zero.

    Parameters
    ----------
    leak_df : pandas.DataFrame
        Frame holding the ``leaked_target_<i>`` columns. It is modified in
        place: any existing ``compiled_leak`` column is overwritten.
    lag : int
        Number of ``leaked_target_<i>`` columns to scan, starting at index 0.

    Returns
    -------
    pandas.DataFrame
        The same ``leak_df`` object, with ``compiled_leak`` (float) set.

    Raises
    ------
    KeyError
        If one of the scanned ``leaked_target_<i>`` columns is missing.
    """
    # Seed with a float zero so the column dtype does not depend on pandas
    # silently upcasting an integer column when float leaks are written in.
    leak_df["compiled_leak"] = 0.0
    for i in range(lag):
        c = "leaked_target_" + str(i)
        # Only rows that are still unresolved may take a value from this lag,
        # so earlier lags always win over later ones.
        zeroleak = leak_df["compiled_leak"] == 0
        leak_df.loc[zeroleak, "compiled_leak"] = leak_df.loc[zeroleak, c]
    return leak_df

In [9]:
# Names of all per-lag leak columns present in the training leak table.
leaky_cols = [name for name in train_leak.columns if 'leaked_target_' in name]

In [347]:
# Number of leak lags used to rebuild `compiled_leak` in both leak tables.
best_lag = 38
train_leak, test_leak = (
    rewrite_compiled_leak(leak_table, best_lag)
    for leak_table in (train_leak, test_leak)
)
train_leak.head()

Unnamed: 0,ID,leaked_target_0,leaked_target_1,leaked_target_2,leaked_target_3,leaked_target_4,leaked_target_5,leaked_target_6,leaked_target_7,leaked_target_8,...,leaked_target_29,leaked_target_30,leaked_target_31,leaked_target_32,leaked_target_33,leaked_target_34,leaked_target_35,leaked_target_36,leaked_target_37,compiled_leak
0,000d6aaf2,38000000.0,38000000.0,38000000.0,0.0,38000000.0,0.0,38000000.0,0.0,0.0,...,38000000.0,0.0,38000000.0,0.0,38000000.0,38000000.0,0.0,0.0,38000000.0,38000000.0
1,000fbd867,600000.0,0.0,0.0,0.0,0.0,0.0,600000.0,0.0,600000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,600000.0
2,0027d6b71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0028cbf45,0.0,0.0,0.0,2000000.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2000000.0,0.0,0.0,0.0,0.0,0.0,0.0,2000000.0
4,002a68644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [348]:
# Test rows with a positive compiled leak are moved into the training set,
# using the leak as their target.
has_leak = test_leak["compiled_leak"] > 0
new_train = test.loc[has_leak].copy()
new_train["target"] = test_leak.loc[has_leak, "compiled_leak"]
new_train["leak"] = new_train["target"]
new_train["log_leak"] = np.log1p(new_train["leak"])

# Original training rows, with their own compiled leak attached as a feature.
_temp_train = train.assign(leak=train_leak["compiled_leak"])
_temp_train["log_leak"] = np.log1p(_temp_train["leak"])

# Original training rows come first, then the leaked test rows.
new_train = pd.concat([_temp_train, new_train]).reset_index(drop=True)

# Test rows whose compiled leak is exactly zero still need a model prediction.
new_test = (
    test.loc[test_leak["compiled_leak"] == 0]
    .copy()
    .reset_index(drop=True)
    .assign(leak=0, log_leak=0)
)

In [None]:
#new_train.to_csv("new_train_081415_111sets.csv", index=False)

In [349]:
# Turn every zero in both frames into NaN (this covers all columns,
# including the `leak` / `log_leak` columns added above).
new_train = new_train.replace(0, np.nan)
new_test = new_test.replace(0, np.nan)

In [369]:
# Keep the features whose reported RMSE is at or below this threshold.
feature_criterion = 0.62509 #0.62509 #
report = pd.read_csv("./input/feature_report_new_v1.csv")

selected = report['rmse'] <= feature_criterion
good_features = report.loc[selected, "feature"].values

# Selected features followed by the leak feature and the row-level aggregates.
features = good_features.tolist() + [
    'log_leak',
    'log_of_mean',
    'mean_of_log',
    'log_of_median',
    'nb_nans',
    'the_sum',
    'the_std',
    'the_kur',
]
len(report.loc[selected])

45

In [370]:
# 5-fold LightGBM regression on log1p(target).
# Produces out-of-fold predictions for `new_train` (`oof_preds`) and the
# fold-averaged prediction for `new_test` (still in log1p space here).
target = np.log1p(new_train['target'])

folds = KFold(n_splits=5, shuffle=True, random_state=1)

dtrain = lgb.Dataset(data=new_train[features],
                     label=target, free_raw_data=False)
# Float accumulator: every fold adds its share of the averaged prediction.
new_test['target'] = 0.0
dtrain.construct()

oof_preds = np.zeros(new_train.shape[0])

# The hyper-parameters are the same for every fold, so the dict is built once
# instead of being re-created inside the loop.
lgb_params = {
    'objective': 'regression',
    # NOTE(review): this entry had no live value (only commented-out candidates),
    # which made the cell a SyntaxError. The first candidate was restored;
    # confirm it is the value that produced the recorded scores.
    'num_leaves': int(18.2178), #int(12.024701808594909), #
    # NOTE(review): no subsample_freq / bagging_freq is set; per the LightGBM
    # parameter docs bagging is disabled in that case, so this value may have
    # no effect. Confirm intent.
    'subsample': 0.1,
    'colsample_bytree': 1.0, #0.9999998944771263, #
    'min_split_gain': 1e-05, #1e-05,
    'reg_alpha': 0, #2.926647409600368, #
    'reg_lambda': 17.7483, #0.27039842736589326, #
    'min_child_weight': 8, #0.0, #
    'verbose': -1,
    'seed': 3,
    'boosting_type': 'gbdt',
    'max_depth': -1,
    'learning_rate': 0.1786, #0.1, #
    'metric': 'l2',
}

for trn_idx, val_idx in folds.split(new_train):
    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
    # arguments of lgb.train in LightGBM < 4.0; newer releases expect the
    # equivalent callbacks instead.
    clf = lgb.train(
        params=lgb_params,
        train_set=dtrain.subset(trn_idx),
        valid_sets=dtrain.subset(val_idx),
        num_boost_round=10000,
        early_stopping_rounds=100,
        verbose_eval=0
    )

    # Out-of-fold prediction for the held-out rows of this fold.
    oof_preds[val_idx] = clf.predict(dtrain.data.iloc[val_idx])
    # Each fold contributes 1/n_splits of the final test prediction.
    new_test['target'] += clf.predict(new_test[features]) / folds.n_splits
    #print(mean_squared_error(target.iloc[val_idx],
    #                         oof_preds[val_idx]) ** .5)

new_train['predictions'] = oof_preds
# Where a leak value is present, use log1p(leak) instead of the model output.
has_leak = new_train['leak'].notnull()
new_train.loc[has_leak, 'predictions'] = np.log1p(new_train.loc[has_leak, 'leak'])

# Scores are reported on the first len(train) rows only, i.e. the rows that
# came from the original training table (they were concatenated first).
n_train = len(train)
print('OOF SCORE : %9.6f'
      % (mean_squared_error(target[:n_train], oof_preds[:n_train]) ** .5)) #[:len(train)]
print('OOF SCORE with LEAK : %9.6f'
      % (mean_squared_error(target[:n_train], new_train['predictions'][:n_train]) ** .5))

OOF SCORE :  0.493458
OOF SCORE with LEAK :  0.492165


In [371]:
# Map the averaged log1p-space predictions back to the original target scale.
# Running this cell a second time would apply the transform again.
new_test = new_test.assign(target=np.expm1(new_test['target']))
new_test.head(11)

Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,lstm_AE_col112_3,lstm_AE_col112_4,lstm_AE_col113_0,lstm_AE_col113_1,lstm_AE_col113_2,lstm_AE_col113_3,lstm_AE_col113_4,leak,log_leak,target
0,000137c73,,,,,,,,,,...,-0.706841,-0.104855,-0.020593,-0.041008,-0.082631,0.117623,0.108979,,,1855936.0
1,00021489f,,,,,,,,,,...,-0.706841,-0.104855,-0.020593,-0.041008,-0.082631,0.117623,0.108979,,,1565374.0
2,0004d7953,,,,,,,,,,...,-0.706841,-0.104855,-0.020593,-0.041008,-0.082631,0.117623,0.108979,,,1941901.0
3,00056a333,,,,,,,,,,...,-0.706841,-0.104855,-0.020593,-0.041008,-0.082631,0.117623,0.108979,,,3784817.0
4,00056d8eb,,,,,,,,,,...,-0.706841,-0.104855,-0.020593,-0.041008,-0.082631,0.117623,0.108979,,,1111175.0
5,0005fc190,,,,,,,,,,...,-0.055327,-0.038958,-0.020593,-0.041008,-0.082631,0.117623,0.108979,,,2749239.0
6,000787e86,,,,,,,,,,...,-0.706841,-0.104855,-0.020593,-0.041008,-0.082631,0.117623,0.108979,,,1807883.0
7,0008510a0,,,,,,,,,,...,-0.170009,-0.143331,-0.020593,-0.041008,-0.082631,0.117623,0.108979,,,1580224.0
8,000895faf,,,24617120.0,,,,,,,...,-0.706841,-0.104855,-0.020593,-0.041008,-0.082631,0.117623,0.108979,,,3423985.0
9,000986fba,,,,,,,,,12675000.0,...,-0.706841,-0.104855,-0.020593,-0.041008,-0.082631,0.117623,0.108979,,,2123442.0


In [254]:
#sub = test[["ID"]]
#sub["target"] = test_leak["compiled_leak"]
#sub.loc[sub["target"] > 0, "target"] = np.expm1(oof_preds[len(train):])
#sub.loc[sub["target"] == 0, "target"] = new_test['target'].values
#sub.head(12)

In [372]:
# Submission frame: the compiled leak where one exists, the model prediction
# everywhere else.
# Explicit copy so the column assignments below write to an independent frame
# rather than to a selection of `test` (avoids SettingWithCopy behaviour).
sub = test[["ID"]].copy()
sub["target"] = test_leak["compiled_leak"]

# `new_test` was built from exactly the rows whose compiled leak is zero, in
# the same order, so its predictions are written back positionally.
needs_model = sub["target"] == 0
sub.loc[needs_model, "target"] = new_test['target'].values
sub.head(12)

Unnamed: 0,ID,target
0,000137c73,1855936.0
1,00021489f,1565374.0
2,0004d7953,1941901.0
3,00056a333,3784817.0
4,00056d8eb,1111175.0
5,0005fc190,2749239.0
6,000787e86,1807883.0
7,0008510a0,1580224.0
8,000895faf,3423985.0
9,000986fba,2123442.0


In [373]:
# Write the submission; the file name records the lag used for the leak.
out_path = f"lgb_new_train_{best_lag}.csv"
sub.to_csv(out_path, index=False)
print(f"{out_path} saved")

lgb_new_train_38.csv saved


In [None]:
#train_res = train_leak[["ID"]+leaky_cols+['compiled_leak']].replace(0.0, np.nan)
#train_res.to_csv('train_leak.csv', index=False)
#print(f"train_leak.csv saved")

In [None]:
#test_res = test_leak[["ID"]+leaky_cols+['compiled_leak']].replace(0.0, np.nan)
#test_res.to_csv('test_leak.csv', index=False)
#print(f"test_leak.csv saved")