In [2]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
print(os.listdir("./input"))

import lightgbm as lgb
from sklearn.model_selection import *
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import mode, skew, kurtosis, entropy
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

import dask.dataframe as dd
from dask.multiprocessing import get

from tqdm import tqdm, tqdm_notebook
tqdm.pandas(tqdm_notebook)

['sample_submission.csv', 'test.csv', 'train.csv']


In [3]:
# Read the two input tables from the local ./input directory.
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

# Every train column except the identifier and the label is kept as a feature column.
transact_cols = [name for name in train.columns if name not in ("ID", "target")]

# The label is used in log1p space throughout (scores below are RMSE on log1p values).
y = np.log1p(train["target"].values)

In [4]:
# Ordered list of 40 feature-column names. The ORDER is load-bearing:
# `_get_leak` slices this list positionally (cols[:-lag-2], cols[lag+2:], cols[lag]),
# and `compiled_leak_result` derives the number of lags to try from its length.
# Do not sort, dedupe or reformat without re-checking those slices.
cols = ['f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1',
       '15ace8c9f', 'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9',
       'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b', 
       '62e59a501', '2ec5b290f', '241f0f867', 'fb49e4212',  '66ace2992',
       'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', '1931ccfdd', 
       '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a',
       '6619d81fc', '1db387535', 'fc99f9426', '91f701ba2',  '0572565c2',
       '190db8488',  'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98']

In [5]:
def _get_leak(df, cols, lag=0):
    d1 = df[cols[:-lag-2]].apply(tuple, axis=1).to_frame().rename(columns={0: 'key'})
    d2 = df[cols[lag+2:]].apply(tuple, axis=1).to_frame().rename(columns={0: 'key'})
    d2['pred'] = df[cols[lag]]
    d2 = d2[d2.pred != 0] ### to make output consistent with Hasan's function
    d3 = d2[~d2.duplicated(['key'], keep='first')]
    return d1.merge(d3, how='left', on='key').pred.fillna(0)

In [6]:
# Give every test row a placeholder target equal to the mean train target.
test["target"] = train["target"].mean()

# Stack train on top of test under a fresh 0..n-1 index (train rows come first).
all_df = pd.concat([train, test], ignore_index=True)

In [7]:
def compiled_leak_result():
    """Sweep over lags, accumulating leaked targets and scoring each step on train.

    Reads the notebook-level names ``all_df``, ``train``, ``cols``,
    ``transact_cols`` and ``y`` and calls ``_get_leak``.

    For each lag ``i`` in ``range(len(cols) - 2)``:

    1. ``leaked_target_<i>`` is computed with ``_get_leak``.
    2. Rows whose ``compiled_leak`` is still 0 take the value of that column.
    3. Train-side statistics and a score are recorded and printed.

    NOTE(review): the score for lag ``i`` is computed after overwriting, with
    ``nonzero_mean``, every row whose ``compiled_leak`` was 0 *before* lag ``i``
    was applied. It therefore credits only the leaks found by lags ``< i``
    (``score[0]`` is the pure ``nonzero_mean`` baseline). This matches
    ``rewrite_compiled_leak``, which rebuilds from ``range(lag)``; change both
    together or not at all.

    Returns
    -------
    all_leak : pandas.DataFrame
        ``ID``, ``target``, ``cols``, every ``leaked_target_<i>``,
        ``compiled_leak`` and ``nonzero_mean`` for all rows of ``all_df``.
    result : dict
        Per-lag lists under ``score`` (RMSE in log1p space on train),
        ``leaky_count`` (train rows with ``compiled_leak > 0``) and
        ``leaky_correct`` (fraction of those counts whose value equals the
        target; NaN when the count is 0).
    """
    max_nlags = len(cols) - 2
    extra_cols = ["compiled_leak", "nonzero_mean"]

    # .copy(): columns are added below, so work on an independent frame
    # instead of assigning onto a selection of all_df.
    all_leak = all_df[["ID", "target"] + cols].copy()
    # Float from the start: the column is later filled with float leak values.
    all_leak["compiled_leak"] = 0.0
    # Row-wise mean of the non-zero feature values, taken in log1p space and mapped back.
    all_leak["nonzero_mean"] = all_df[transact_cols].apply(
        lambda x: np.expm1(np.log1p(x[x != 0]).mean()), axis=1
    )

    scores = []
    leaky_value_counts = []
    leaky_value_corrects = []
    leaky_cols = []

    for i in range(max_nlags):
        c = "leaked_target_" + str(i)

        print('Processing lag', i)
        all_leak[c] = _get_leak(all_leak, cols, i)

        leaky_cols.append(c)
        # Re-join onto all_df by ID and fix the column order
        # (leak columns first, then compiled_leak / nonzero_mean).
        all_leak = all_df.join(
            all_leak.set_index("ID")[leaky_cols + extra_cols],
            on="ID", how="left"
        )[["ID", "target"] + cols + leaky_cols + extra_cols].copy()

        # Mask taken BEFORE the update: rows that had no leak from earlier lags.
        zeroleak = all_leak["compiled_leak"] == 0
        all_leak.loc[zeroleak, "compiled_leak"] = all_leak.loc[zeroleak, c]

        train_leak = train.join(
            all_leak.set_index("ID")[leaky_cols + extra_cols],
            on="ID", how="left"
        )[["ID", "target"] + cols + leaky_cols + extra_cols]

        n_train_leaks = int((train_leak["compiled_leak"] > 0).sum())
        leaky_value_counts.append(n_train_leaks)
        _correct_counts = int((train_leak["compiled_leak"] == train_leak["target"]).sum())
        # Guard the ratio: with no leak found the fraction is undefined, not an error.
        leaky_value_corrects.append(
            _correct_counts / n_train_leaks if n_train_leaks else np.nan
        )
        print("Leak values found in train & test: ", n_train_leaks,
              int((all_leak["compiled_leak"] > 0).sum()) - n_train_leaks)
        print(
            "% of correct leaks values in train ",
            leaky_value_corrects[-1]
        )

        tmp = train_leak.copy()
        # `zeroleak` spans all_df's rows; .loc aligns it to the train rows by index
        # label (train rows occupy the first labels of all_df).
        tmp.loc[zeroleak, "compiled_leak"] = tmp.loc[zeroleak, "nonzero_mean"]
        # 14.49 replaces NaN log-values (e.g. a NaN nonzero_mean);
        # presumably a typical log1p(target) -- TODO confirm.
        scores.append(np.sqrt(mean_squared_error(y, np.log1p(tmp["compiled_leak"]).fillna(14.49))))
        print(
            'Score (filled with nonzero mean)',
            scores[-1]
        )

    result = dict(
        score=scores,
        leaky_count=leaky_value_counts,
        leaky_correct=leaky_value_corrects,
    )
    return all_leak, result

In [8]:
# Run the full lag sweep. `result` holds per-lag lists under the keys
# 'score', 'leaky_count' and 'leaky_correct'; progress is printed per lag.
all_leak, result = compiled_leak_result()

Processing lag 0
Leak values found in train & test:  1719 26071
% of correct leaks values in train  0.796393251890634
Score (filled with nonzero mean) 1.5138333391635188
Processing lag 1
Leak values found in train & test:  2334 27469
% of correct leaks values in train  0.8410454155955441
Score (filled with nonzero mean) 1.2851148974795277
Processing lag 2
Leak values found in train & test:  2726 28335
% of correct leaks values in train  0.8606016140865738
Score (filled with nonzero mean) 1.1731452616255769
Processing lag 3
Leak values found in train & test:  2974 28996
% of correct leaks values in train  0.8702084734364492
Score (filled with nonzero mean) 1.0846181414970784
Processing lag 4
Leak values found in train & test:  3148 29528
% of correct leaks values in train  0.875158831003812
Score (filled with nonzero mean) 1.0371119114150071
Processing lag 5
Leak values found in train & test:  3304 29957
% of correct leaks values in train  0.8768159806295399
Score (filled with nonzero m

In [9]:
# Select the sweep index with the lowest train RMSE (log1p space).
best_lag = np.argmin(result["score"])
best_score = np.min(result["score"])
print("best_score", best_score, "\nbest_lag", best_lag)

best_score 0.8468959707608168 
best_lag 22


In [None]:
def rewrite_compiled_leak(leak_df, lag):
    """Rebuild ``compiled_leak`` from the first ``lag`` ``leaked_target_<i>`` columns.

    Starting from all zeros, columns ``leaked_target_0`` ...
    ``leaked_target_<lag-1>`` are visited in order; each row keeps the first
    non-zero value it meets (rows that never meet one stay at 0).

    Parameters
    ----------
    leak_df : pandas.DataFrame
        Must contain ``leaked_target_<i>`` for every ``i`` in ``range(lag)``.
        It is modified IN PLACE: its ``compiled_leak`` column is overwritten.
    lag : int
        Number of leading lag columns to combine (``range(lag)``, so the
        column ``leaked_target_<lag>`` itself is not used).

    Returns
    -------
    pandas.DataFrame
        The same ``leak_df`` object, for call chaining.
    """
    # Initialise as float: the lag columns may hold float values, and writing
    # floats into an int column through .loc relies on silent upcasting, which
    # pandas has deprecated.
    leak_df["compiled_leak"] = 0.0
    for i in range(lag):
        c = "leaked_target_" + str(i)
        # Only rows still without a value are candidates for this lag.
        zeroleak = leak_df["compiled_leak"] == 0
        leak_df.loc[zeroleak, "compiled_leak"] = leak_df.loc[zeroleak, c]
    return leak_df

In [None]:
# Names of all per-lag leak columns present in all_leak, in column order.
leaky_cols = [name for name in all_leak.columns if "leaked_target_" in name]

In [None]:
# Recompute compiled_leak using only the lags selected by best_lag.
# Note: rewrite_compiled_leak writes into `all_leak` itself and returns that same object.
all_leak_best_lag = rewrite_compiled_leak(all_leak, best_lag)

# Attach the leak columns to the test rows by ID, then fix the column order.
test_leak = test.join(
    all_leak_best_lag.set_index("ID")[leaky_cols + ["compiled_leak", "nonzero_mean"]],
    on="ID",
    how="left",
)
test_leak = test_leak[["ID", "target"] + cols + leaky_cols + ["compiled_leak", "nonzero_mean"]]
test_leak.head()

In [None]:
# Test rows with no recovered leak (compiled_leak == 0) fall back to their nonzero_mean.
test_zero_leak_mask = test_leak["compiled_leak"] == 0
test_leak.loc[test_zero_leak_mask, "compiled_leak"] = test_leak.loc[test_zero_leak_mask, "nonzero_mean"]
test_leak.head()

In [None]:
# Build the submission on an explicit copy of the ID column so that adding
# `target` does not assign onto a selection of `test` (chained assignment).
sub = test[["ID"]].copy()
# test_leak was produced by joining onto `test`, so the two align row-for-row by index.
sub["target"] = test_leak["compiled_leak"]
sub.to_csv(f"baseline_sub_lag_{best_lag}.csv", index=False)
print(f"baseline_sub_lag_{best_lag}.csv saved")