In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
print(os.listdir("./input"))

import lightgbm as lgb
from catboost import Pool, CatBoostRegressor
from bayes_opt import BayesianOptimization

from sklearn.model_selection import *
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import mode, skew, kurtosis, entropy
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

import dask.dataframe as dd
from dask.multiprocessing import get

from tqdm import tqdm, tqdm_notebook
tqdm.pandas(tqdm_notebook)

['.DS_Store', 'baseline_sub_lag_25_all_df.csv', 'baseline_sub_lag_29_non_zero.csv', 'baseline_sub_lag_29_zerokill.csv', 'feature_report.csv', 'new_train_0808.csv', 'new_train_081415_111sets.csv', 'sample_submission.csv', 'test.csv', 'test_for_train.csv', 'test_leak_081204_7880.csv', 'test_leak_081316_7856.csv', 'test_leak_081415_7854_111sets.csv', 'test_leak_org.csv', 'test_leak_paradox.csv', 'test_leak_paradox_and_new_1.csv', 'test_leak_paradox_and_new_and_extreme.csv', 'test_leak_paradox_and_newlast_2_6.csv', 'test_leak_paradox_and_newlast_v8_6new.csv', 'test_leak_paradox_extend.csv', 'total.csv', 'total.csv.zip', 'train.csv', 'train_leak_081204_3886.csv', 'train_leak_081316_all.csv', 'train_leak_081415_111sets.csv', 'train_leak_org.csv', 'train_leak_paradox.csv', 'train_leak_paradox_and_new_1.csv', 'train_leak_paradox_and_new_and_extreme.csv', 'train_leak_paradox_and_newlast_2_6.csv', 'train_leak_paradox_and_newlast_v8_6new.csv', 'train_leak_paradox_extend.csv']


  from numpy.core.umath_tests import inner1d


In [2]:
# Load the two raw tables from the local input directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("./input/train.csv", "./input/test.csv"))

# Every column of `train` except the row identifier and the label.
non_feature_cols = {"ID", "target"}
transact_cols = [name for name in train.columns if name not in non_feature_cols]

In [3]:
# Read the leak tables and zero-fill missing entries in a single chained step,
# so that a value of 0.0 uniformly stands for "nothing recorded here".
train_leak = pd.read_csv("./input/train_leak_081415_111sets.csv").fillna(0.0)
test_leak = pd.read_csv("./input/test_leak_081415_7854_111sets.csv").fillna(0.0)

In [4]:
def rewrite_compiled_leak(leak_df, lag):
    """Rebuild ``compiled_leak`` as the first non-zero ``leaked_target_<i>`` per row.

    The columns ``leaked_target_0`` .. ``leaked_target_<lag-1>`` are visited in
    order. A row takes the value of the first visited column that is non-zero
    and keeps it; rows with no non-zero value end up with ``0.0``.

    Parameters
    ----------
    leak_df : pandas.DataFrame
        Frame containing the ``leaked_target_<i>`` columns. It is modified in
        place: ``compiled_leak`` is created or overwritten.
    lag : int
        How many leading ``leaked_target_<i>`` columns to visit.

    Returns
    -------
    pandas.DataFrame
        The same ``leak_df`` object that was passed in.

    Raises
    ------
    KeyError
        If one of the visited ``leaked_target_<i>`` columns does not exist.
    """
    # Seed with a float so that writing float leak values never depends on an
    # implicit int -> float upcast of the column.
    leak_df["compiled_leak"] = 0.0
    for i in range(lag):
        column = "leaked_target_" + str(i)
        unresolved = leak_df["compiled_leak"] == 0
        # Treat NaN as "no leak": a NaN written here would never compare equal
        # to 0 again, so the row could not be filled by a later column.
        leak_df.loc[unresolved, "compiled_leak"] = leak_df.loc[unresolved, column].fillna(0.0)
    return leak_df

In [5]:
# Collect every column of train_leak whose name contains the leak marker.
leak_marker = 'leaked_target_'
leaky_cols = [name for name in train_leak.columns if leak_marker in name]

In [6]:
# Number of leading leaked_target_<i> columns handed to rewrite_compiled_leak
# (i.e. leaked_target_0 .. leaked_target_36 for the value below).
best_lag = 37

# Recompute compiled_leak for both tables with the same lag.
train_leak, test_leak = [
    rewrite_compiled_leak(frame, best_lag) for frame in (train_leak, test_leak)
]
train_leak.head(5)

Unnamed: 0,ID,leaked_target_0,leaked_target_1,leaked_target_2,leaked_target_3,leaked_target_4,leaked_target_5,leaked_target_6,leaked_target_7,leaked_target_8,...,leaked_target_29,leaked_target_30,leaked_target_31,leaked_target_32,leaked_target_33,leaked_target_34,leaked_target_35,leaked_target_36,leaked_target_37,compiled_leak
0,000d6aaf2,38000000.0,38000000.0,38000000.0,0.0,38000000.0,0.0,38000000.0,0.0,0.0,...,38000000.0,0.0,38000000.0,0.0,38000000.0,38000000.0,0.0,0.0,38000000.0,38000000.0
1,000fbd867,600000.0,0.0,0.0,0.0,0.0,0.0,600000.0,0.0,600000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,600000.0
2,0027d6b71,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0028cbf45,0.0,0.0,0.0,2000000.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2000000.0,0.0,0.0,0.0,0.0,0.0,0.0,2000000.0
4,002a68644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Test rows with a positive compiled leak are turned into extra training rows,
# using the leak value as their target.
test_has_leak = test_leak["compiled_leak"] > 0
leaked_rows = test.loc[test_has_leak].copy()
leaked_rows["target"] = test_leak.loc[test_has_leak, "compiled_leak"]
leaked_rows["leak"] = leaked_rows["target"]
leaked_rows['log_leak'] = np.log1p(leaked_rows["leak"])

# The original training rows receive their own compiled leak as a feature.
_temp_train = train.copy()
_temp_train["leak"] = train_leak['compiled_leak']
_temp_train['log_leak'] = np.log1p(_temp_train["leak"])

# Original training rows come first, followed by the rows taken from test.
new_train = pd.concat([_temp_train, leaked_rows]).reset_index(drop=True)

# Test rows whose compiled leak is exactly 0 are the ones left to predict.
new_test = test.loc[test_leak["compiled_leak"] == 0].copy().reset_index(drop=True)
new_test['leak'] = 0
new_test['log_leak'] = 0

In [None]:
#new_train.to_csv("new_train_081415_111sets.csv", index=False)

In [8]:
# The model is fit on log1p(target); this is taken before zeros are replaced by NaN below.
target = np.log1p(new_train['target'])

folds = KFold(n_splits=5, shuffle=True, random_state=1)

# Names of the row-statistic columns created by add_row_statistics.
row_stat_cols = ['log_of_mean', 'mean_of_log', 'log_of_median',
                 'nb_nans', 'the_sum', 'the_std', 'the_kur']

# Columns that must never be aggregated. The derived columns are listed too so
# that re-running this cell (or running it after the training cell has added
# 'predictions') yields the same feature list as the first run.
non_feature_cols = set(['ID', 'leak', 'log_leak', 'target', 'predictions'] + row_stat_cols)
features = [f for f in new_train if f not in non_feature_cols]


def add_row_statistics(frame, columns):
    """Add per-row summary statistics of ``columns`` to ``frame`` in place.

    Zeros in ``columns`` are treated as missing (NaN) before aggregating, so
    every statistic is computed over the non-zero entries of each row.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to extend; the seven statistic columns are added or overwritten.
    columns : list
        Names of the columns to aggregate across (axis=1).

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object.
    """
    # Slice once, before any statistic column is written back to the frame.
    values = frame[columns].replace(0, np.nan)
    frame['log_of_mean'] = np.log1p(values.mean(axis=1))
    frame['mean_of_log'] = np.log1p(values).replace(0, np.nan).mean(axis=1)
    frame['log_of_median'] = np.log1p(values.median(axis=1))
    # Zeros were mapped to NaN above, so this counts zero-or-missing cells per row.
    frame['nb_nans'] = values.isnull().sum(axis=1)
    frame['the_sum'] = np.log1p(values.sum(axis=1))
    frame['the_std'] = values.std(axis=1)
    frame['the_kur'] = values.kurtosis(axis=1)
    return frame


# Zeros become NaN across the whole frame (including 'leak' / 'log_leak'), not
# only in the aggregated columns; later cells test 'leak' with notnull().
new_train.replace(0, np.nan, inplace=True)
add_row_statistics(new_train, features)

new_test.replace(0, np.nan, inplace=True)
add_row_statistics(new_test, features)

In [13]:
# Cutoff applied to the 'rmse' column of the feature report: rows at or below it are kept.
feature_criterion = 0.7907
report = pd.read_csv("./input/feature_report.csv")

passes_cutoff = report['rmse'] <= feature_criterion
good_features = report.loc[passes_cutoff, "feature"].values

# Final model inputs: the selected report features followed by the leak and row-statistic columns.
extra_features = ['log_leak', 'log_of_mean', 'mean_of_log', 'log_of_median',
                  'nb_nans', 'the_sum', 'the_std', 'the_kur']
features = [*good_features.tolist(), *extra_features]

# Independent copy of the training matrix restricted to those columns.
cat_train = new_train[features].copy()

In [15]:
%%time
new_test['target'] = 0
oof_preds = np.zeros(new_train.shape[0])
for trn_idx, val_idx in folds.split(new_train):
    train_pool = Pool(cat_train.iloc[trn_idx], target.iloc[trn_idx])
    valid_pool = Pool(cat_train.iloc[val_idx], target.iloc[val_idx])
    
    model = CatBoostRegressor(iterations=200,
                              learning_rate=0.05,
                              depth=12,
                              reg_lambda = 0.27039842736589326,
                              bootstrap_type = "Bayesian",
                              bagging_temperature = 0.2,
                              od_type='Iter',
                              od_wait=20,
                              random_seed = 3,
                              eval_metric='RMSE',
                              verbose = 2)

    model.fit(train_pool, eval_set=valid_pool, use_best_model=True, verbose=False)
    oof_preds[val_idx] = model.predict(cat_train.iloc[val_idx])
    
    new_test['target'] += model.predict(new_test[features]) / folds.n_splits
    print(mean_squared_error(target.iloc[val_idx], 
                             oof_preds[val_idx]) ** .5)
    
new_train['predictions'] = oof_preds
new_train.loc[new_train['leak'].notnull(), 'predictions'] = np.log1p(new_train.loc[new_train['leak'].notnull(), 'leak'])
print('OOF SCORE : %9.6f' 
      % (mean_squared_error(target[:len(train)], oof_preds[:len(train)]) ** .5)) #[:len(train)]
print('OOF SCORE with LEAK : %9.6f' 
      % (mean_squared_error(target[:len(train)], new_train['predictions'][:len(train)]) ** .5))

0.5706684809794174
0.5566110400904399
0.574616006388636
0.5670554602752552
0.5650717826700655
OOF SCORE :  0.750048
OOF SCORE with LEAK :  0.632932
CPU times: user 10min 19s, sys: 1min 55s, total: 12min 14s
Wall time: 3min 52s


In [None]:
# The fold-averaged predictions were produced for log1p(target); undo the
# transform on the raw values and write them back into the same column.
# NOTE: running this cell twice applies expm1 twice.
log_scale_pred = new_test['target'].values
new_test['target'] = np.expm1(log_scale_pred)
new_test.head(11)

In [None]:
#sub = test[["ID"]]
#sub["target"] = test_leak["compiled_leak"]
#sub.loc[sub["target"] > 0, "target"] = np.expm1(oof_preds[len(train):])
#sub.loc[sub["target"] == 0, "target"] = new_test['target'].values
#sub.head(12)

In [None]:
# Work on an explicit copy of the ID column so the assignments below modify
# `sub` itself and do not go through pandas' chained-assignment path.
sub = test[["ID"]].copy()
sub["target"] = test_leak["compiled_leak"]

# Rows without a leak (compiled_leak == 0) take the model prediction instead.
needs_model = sub["target"] == 0

# new_test was built from exactly the rows where compiled_leak == 0, and the
# values are written positionally, so the two row counts must agree.
assert needs_model.sum() == len(new_test), "new_test is out of sync with test_leak['compiled_leak']"
sub.loc[needs_model, "target"] = new_test['target'].values
sub.head(12)

In [None]:
# Write the submission frame (without the index) to the current working
# directory; the file name embeds best_lag.
# NOTE(review): the "lgb_" prefix looks historical, since the predictions in this
# notebook come from CatBoostRegressor. Confirm nothing depends on the name before renaming.
sub.to_csv(f"lgb_new_train_{best_lag}.csv", index=False)
print(f"lgb_new_train_{best_lag}.csv saved")

In [None]:
#train_res = train_leak[["ID"]+leaky_cols+['compiled_leak']].replace(0.0, np.nan)
#train_res.to_csv('train_leak.csv', index=False)
#print(f"train_leak.csv saved")

In [None]:
#test_res = test_leak[["ID"]+leaky_cols+['compiled_leak']].replace(0.0, np.nan)
#test_res.to_csv('test_leak.csv', index=False)
#print(f"test_leak.csv saved")