In [2]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
print(os.listdir("./input"))

import lightgbm as lgb
from sklearn.model_selection import *
from sklearn.metrics import mean_squared_error, make_scorer
from scipy.stats import mode, skew, kurtosis, entropy
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

import dask.dataframe as dd
from dask.multiprocessing import get

from tqdm import tqdm, tqdm_notebook
tqdm.pandas(tqdm_notebook)

['.DS_Store', 'feature_report.csv', 'sample_submission.csv', 'test.csv', 'train.csv']


In [3]:
# Load the unlabelled and labelled tables from the local input folder.
test = pd.read_csv("./input/test.csv")
train = pd.read_csv("./input/train.csv")

# Every training column other than the row identifier and the label is
# treated as a transaction feature.
transact_cols = [name for name in train.columns if name not in ("ID", "target")]

# Regression label on the log1p scale, as a plain NumPy array.
y = np.log1p(train["target"].to_numpy())

In [4]:
# Give the test frame a placeholder "target" column (the mean of the training
# targets) so that test[["ID", "target"] + cols] can be selected later on.
test["target"] = train["target"].mean()

In [5]:
# Ordered list of 40 feature columns. The order matters: _get_leak() compares
# the leading slice cols[:-lag-2] of one row with the trailing slice
# cols[lag+2:] of every other row and reads the candidate value from cols[lag].
cols = ['f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1',
       '15ace8c9f', 'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9',
       'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b', 
       '62e59a501', '2ec5b290f', '241f0f867', 'fb49e4212',  '66ace2992',
       'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', '1931ccfdd', 
       '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a',
       '6619d81fc', '1db387535', 'fc99f9426', '91f701ba2',  '0572565c2',
       '190db8488',  'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98']

In [6]:
def _get_leak(df, cols, lag=0):
    d1 = df[cols[:-lag-2]].apply(tuple, axis=1).to_frame().rename(columns={0: 'key'})
    d2 = df[cols[lag+2:]].apply(tuple, axis=1).to_frame().rename(columns={0: 'key'})
    d2['pred'] = df[cols[lag]]
    #all_zero = tuple(np.repeat(0,len(cols)-2-lag))
    #d2 = d2[d2.pred != 0] ### to make output consistent with Hasan's function
    #d2 = d2[~(d2["key"] == all_zero)]
    
    d3 = d2[~d2.duplicated(['key'], keep=False)]
    return d1.merge(d3, how='left', on='key').pred.fillna(0)

In [7]:
def compiled_leak_result(max_nlags=None):
    """Search the training rows for leaked targets, lag by lag, and score each step.

    Reads the notebook-level ``train``, ``cols``, ``transact_cols`` and ``y``.

    Parameters
    ----------
    max_nlags : int, optional
        Number of lags to scan. Defaults to ``len(cols) - 2``.

    Returns
    -------
    train_leak : pandas.DataFrame
        ``["ID", "target"] + cols``, one ``leaked_target_<i>`` column per lag,
        then ``compiled_leak`` (first non-zero leak over the scanned lags) and
        ``nonzero_mean`` (expm1 of the mean log1p of each row's non-zero
        transaction values).
    result : dict
        ``score``, ``leaky_count`` and ``leaky_correct`` lists, one entry per lag.

    Notes
    -----
    ``score[i]`` is computed with the zero-mask taken *before* lag ``i`` is merged
    into ``compiled_leak``, so it reflects the leaks of lags ``0 .. i-1`` only.
    This is kept as is because ``rewrite_compiled_leak(df, best_lag)`` iterates
    ``range(best_lag)`` and therefore uses exactly those lags.
    """
    if max_nlags is None:
        max_nlags = len(cols) - 2

    # Fallback for rows whose nonzero_mean is NaN (no non-zero transaction value).
    # NOTE(review): 14.49 is presumably close to the mean log1p target - confirm.
    nan_fill_log_target = 14.49

    # .copy(): columns are added below, which must not be done on a slice of `train`.
    train_leak = train[["ID", "target"] + cols].copy()
    # Float initialisation: the column later receives float leak values.
    train_leak["compiled_leak"] = 0.0
    train_leak["nonzero_mean"] = train[transact_cols].apply(
        lambda x: np.expm1(np.log1p(x[x != 0]).mean()), axis=1
    )

    scores = []
    leaky_value_counts = []
    leaky_value_corrects = []
    leaky_cols = []

    for i in range(max_nlags):
        c = "leaked_target_" + str(i)

        print('Processing lag', i)
        train_leak[c] = _get_leak(train_leak, cols, i)
        leaky_cols.append(c)

        # Keep the leak columns ahead of the two summary columns. A plain column
        # reorder replaces re-joining the full-width `train` frame on every lag.
        train_leak = train_leak[
            ["ID", "target"] + cols + leaky_cols + ["compiled_leak", "nonzero_mean"]
        ].copy()

        # Rows that had no leak before this lag take this lag's value.
        zeroleak = train_leak["compiled_leak"] == 0
        train_leak.loc[zeroleak, "compiled_leak"] = train_leak.loc[zeroleak, c]

        leaky_value_counts.append(int((train_leak["compiled_leak"] > 0).sum()))
        _correct_counts = int((train_leak["compiled_leak"] == train_leak["target"]).sum())
        # Guard against a lag prefix that has not produced any leak yet.
        leaky_value_corrects.append(
            _correct_counts / leaky_value_counts[-1] if leaky_value_counts[-1] else float("nan")
        )
        print("Leak values found in train", leaky_value_counts[-1])
        print(
            "% of correct leaks values in train ",
            leaky_value_corrects[-1]
        )

        # Rows flagged by the pre-update mask are scored with nonzero_mean
        # (see Notes: this includes the rows that lag i has just filled).
        filled = train_leak["compiled_leak"].mask(zeroleak, train_leak["nonzero_mean"])
        scores.append(
            np.sqrt(mean_squared_error(y, np.log1p(filled).fillna(nan_fill_log_target)))
        )
        print(
            'Score (filled with nonzero mean)',
            scores[-1]
        )

    result = dict(
        score=scores,
        leaky_count=leaky_value_counts,
        leaky_correct=leaky_value_corrects,
    )
    return train_leak, result

In [8]:
# Run the leak search over the training rows. `result` holds, per lag, the
# score, the number of leaks found and the share of leaks equal to the target.
train_leak, result = compiled_leak_result()

Processing lag 0
Leak values found in train 1351
% of correct leaks values in train  0.9955588452997779
Score (filled with nonzero mean) 1.5138333391635188
Processing lag 1
Leak values found in train 1947
% of correct leaks values in train  0.9964047252182845
Score (filled with nonzero mean) 1.2922048129527162
Processing lag 2
Leak values found in train 2340
% of correct leaks values in train  0.9935897435897436
Score (filled with nonzero mean) 1.1732829046778304
Processing lag 3
Leak values found in train 2586
% of correct leaks values in train  0.9930394431554525
Score (filled with nonzero mean) 1.084326373672634
Processing lag 4
Leak values found in train 2754
% of correct leaks values in train  0.9934640522875817
Score (filled with nonzero mean) 1.0327870440015579
Processing lag 5
Leak values found in train 2899
% of correct leaks values in train  0.9931010693342532
Score (filled with nonzero mean) 0.9940324299498775
Processing lag 6
Leak values found in train 3014
% of correct lea

In [9]:
# Pick the lag index with the lowest score. result['score'][i] is measured
# before lag i's leaks are merged in, so index i corresponds to using lags
# 0 .. i-1 - which is what rewrite_compiled_leak(..., best_lag) applies.
best_lag = np.argmin(result['score'])
best_score = result['score'][best_lag]
print('best_score', best_score, '\nbest_lag', best_lag)

best_score 0.7371911838169722 
best_lag 29


In [10]:
def rewrite_compiled_leak(leak_df, lag):
    """Rebuild ``compiled_leak`` from the first ``lag`` ``leaked_target_<i>`` columns.

    For every row the result is the first non-zero value among
    ``leaked_target_0 .. leaked_target_{lag-1}``, or 0 when all of them are 0.

    Parameters
    ----------
    leak_df : pandas.DataFrame
        Frame holding the ``leaked_target_<i>`` columns. It is modified in place.
    lag : int
        Number of leading lag columns to combine (``range(lag)``).

    Returns
    -------
    pandas.DataFrame
        The same ``leak_df`` object, with ``compiled_leak`` (re)written.
    """
    # Start from a float column: the leaked values are floats, and writing them
    # into an integer column relies on a silent upcast that recent pandas
    # versions deprecate. It also keeps the dtype independent of the data.
    leak_df["compiled_leak"] = 0.0
    for i in range(lag):
        c = "leaked_target_" + str(i)
        # Only rows still without a leak take this lag's value.
        zeroleak = leak_df["compiled_leak"] == 0
        leak_df.loc[zeroleak, "compiled_leak"] = leak_df.loc[zeroleak, c]
    return leak_df

In [11]:
# Names of all per-lag leak columns present in train_leak, as a plain list so
# that it can be concatenated with other column lists.
leaky_cols = list(filter(lambda name: 'leaked_target_' in name, train_leak.columns))

In [12]:
# Recompute "compiled_leak" from leaked_target_0 .. leaked_target_{best_lag-1}
# (rewrite_compiled_leak iterates range(lag)), then preview the leak columns.
train_leak = rewrite_compiled_leak(train_leak, best_lag)
train_leak[['ID', 'target']+leaky_cols+['compiled_leak']].head()

Unnamed: 0,ID,target,leaked_target_0,leaked_target_1,leaked_target_2,leaked_target_3,leaked_target_4,leaked_target_5,leaked_target_6,leaked_target_7,...,leaked_target_29,leaked_target_30,leaked_target_31,leaked_target_32,leaked_target_33,leaked_target_34,leaked_target_35,leaked_target_36,leaked_target_37,compiled_leak
0,000d6aaf2,38000000.0,38000000.0,38000000.0,38000000.0,0.0,38000000.0,0.0,38000000.0,0.0,...,38000000.0,0.0,38000000.0,0.0,38000000.0,38000000.0,0.0,0.0,0.0,38000000.0
1,000fbd867,600000.0,600000.0,0.0,0.0,0.0,0.0,0.0,600000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,600000.0
2,0027d6b71,10000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0028cbf45,2000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2000000.0
4,002a68644,14400000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Leak columns only, with 0.0 (the value used when no leak was found) turned into NaN.
train_res = train_leak[leaky_cols+['compiled_leak']].replace(0.0, np.nan)

In [14]:
def compiled_leak_result_test(max_nlags):
    """Search the test rows for leaked targets, lag by lag.

    Test-set counterpart of ``compiled_leak_result``: it cannot compute scores or
    correctness rates, so it only reports how many leaks have been found after
    each lag. Reads the notebook-level ``test``, ``cols`` and ``transact_cols``.

    Parameters
    ----------
    max_nlags : int
        Number of lags to scan.

    Returns
    -------
    test_leak : pandas.DataFrame
        ``["ID", "target"] + cols``, one ``leaked_target_<i>`` column per lag,
        then ``compiled_leak`` (first non-zero leak over the scanned lags) and
        ``nonzero_mean`` (expm1 of the mean log1p of each row's non-zero
        transaction values).
    result : dict
        ``leaky_count``: rows with a positive compiled leak after each lag.
    """
    # .copy(): columns are added below, which must not be done on a slice of `test`.
    test_leak = test[["ID", "target"] + cols].copy()
    # Float initialisation: the column later receives float leak values.
    test_leak["compiled_leak"] = 0.0
    test_leak["nonzero_mean"] = test[transact_cols].apply(
        lambda x: np.expm1(np.log1p(x[x != 0]).mean()), axis=1
    )

    leaky_value_counts = []
    leaky_cols = []

    for i in range(max_nlags):
        c = "leaked_target_" + str(i)

        print('Processing lag', i)
        test_leak[c] = _get_leak(test_leak, cols, i)
        leaky_cols.append(c)

        # Keep the leak columns ahead of the two summary columns. A plain column
        # reorder replaces re-joining the full-width `test` frame on every lag.
        test_leak = test_leak[
            ["ID", "target"] + cols + leaky_cols + ["compiled_leak", "nonzero_mean"]
        ].copy()

        # Rows that had no leak before this lag take this lag's value.
        zeroleak = test_leak["compiled_leak"] == 0
        test_leak.loc[zeroleak, "compiled_leak"] = test_leak.loc[zeroleak, c]

        leaky_value_counts.append(int((test_leak["compiled_leak"] > 0).sum()))
        print("Leak values found in test", leaky_value_counts[-1])

    result = dict(
        leaky_count=leaky_value_counts,
    )
    return test_leak, result

In [15]:
# 38 equals len(cols) - 2 for the 40-entry `cols` list, i.e. the same number of
# lags that compiled_leak_result() scans on the training rows.
test_leak, test_result = compiled_leak_result_test(max_nlags=38)

Processing lag 0
Leak values found in test 2963
Processing lag 1
Leak values found in test 4215
Processing lag 2
Leak values found in test 4960
Processing lag 3
Leak values found in test 5503
Processing lag 4
Leak values found in test 5917
Processing lag 5
Leak values found in test 6208
Processing lag 6
Leak values found in test 6426
Processing lag 7
Leak values found in test 6583
Processing lag 8
Leak values found in test 6742
Processing lag 9
Leak values found in test 6872
Processing lag 10
Leak values found in test 6983
Processing lag 11
Leak values found in test 7080
Processing lag 12
Leak values found in test 7159
Processing lag 13
Leak values found in test 7243
Processing lag 14
Leak values found in test 7303
Processing lag 15
Leak values found in test 7366
Processing lag 16
Leak values found in test 7420
Processing lag 17
Leak values found in test 7462
Processing lag 18
Leak values found in test 7499
Processing lag 19
Leak values found in test 7540
Processing lag 20
Leak values 

In [16]:
# Turn the per-lag result lists into a table (one column per result key) and
# display it transposed so that the lags run across the columns.
test_result = pd.DataFrame(test_result)
test_result.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
leaky_count,2963,4215,4960,5503,5917,6208,6426,6583,6742,6872,...,7835,7887,7949,8025,8117,8277,8478,8685,9012,9501


In [17]:
# Apply the lag count selected on the training rows to the test rows as well.
test_leak = rewrite_compiled_leak(test_leak, best_lag)

In [18]:
# First few test rows for which a positive leak value was recovered.
test_leak.loc[test_leak["compiled_leak"] > 0, "compiled_leak"].head(5)

10    20000000.00
43     2943750.00
48     4000000.00
54     2022666.66
59    20000000.00
Name: compiled_leak, dtype: float64

In [19]:
# The matching raw test rows; "target" still holds the placeholder mean here.
test[test_leak["compiled_leak"] > 0].head(5)

Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466,target
10,0009efcc5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4000000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5944923.0
43,0041861b4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5944923.0
48,0045c123e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21700000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5944923.0
54,004dd971b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5944923.0
59,00551f45d,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21910000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5944923.0


In [20]:
# Test rows with a recovered leak become extra training rows: the leak value is
# used as their target and also exposed as the `leak` / `log_leak` features.
# (Assigning the full compiled_leak Series aligns on the index, so only the
# selected rows receive a value.)
new_train = test.loc[test_leak["compiled_leak"] > 0].assign(
    target=test_leak["compiled_leak"]
)
new_train = new_train.assign(
    leak=new_train["target"],
    log_leak=np.log1p(new_train["target"]),
)

# The original training rows keep their true target and get the same two
# leak columns (0 where no leak was found).
_temp_train = train.assign(leak=train_leak['compiled_leak'])
_temp_train['log_leak'] = np.log1p(_temp_train["leak"])

# Stack both parts under a fresh 0..n-1 index.
new_train = pd.concat([_temp_train, new_train], ignore_index=True)

# Test rows without a leak are the ones the model still has to predict.
new_test = (
    test.loc[test_leak["compiled_leak"] == 0]
    .reset_index(drop=True)
    .assign(leak=0, log_leak=0)
)

In [21]:
# Per-feature report read from disk; keep the features whose reported rmse is
# at most 0.7925, together with those rmse values.
report = pd.read_csv("./input/feature_report.csv")
good_features = report.loc[report['rmse'].le(0.7925), "feature"].values
rmses = report.loc[report['rmse'].le(0.7925), 'rmse'].values

In [22]:
# Training label on the log1p scale (taken before zeros are turned into NaN).
target = np.log1p(new_train['target'])

folds = KFold(n_splits=5, shuffle=True, random_state=1)

# Row-level summary statistics created by this cell, in the order they are
# appended to the frames and to the final feature list.
agg_features = ['log_of_mean', 'mean_of_log', 'log_of_median', 'nb_nans',
                'the_sum', 'the_std', 'the_kur']

# Raw feature columns the statistics are computed over. The aggregate columns
# and `predictions` are excluded as well so that re-running this cell does not
# feed previously derived columns back into the statistics.
non_features = ['ID', 'leak', 'log_leak', 'target', 'predictions'] + agg_features
features = [f for f in new_train if f not in non_features]


def _add_row_aggregates(frame, feature_cols):
    """Return ``frame`` with zeros turned into NaN and per-row statistics appended.

    Zeros are replaced in every column (including ``leak`` / ``log_leak``), so a
    NaN there marks "no leak". The statistics are computed over ``feature_cols``
    only and therefore ignore the former zero entries.
    """
    frame = frame.replace(0, np.nan)
    values = frame[feature_cols]
    frame['log_of_mean'] = np.log1p(values.mean(axis=1))
    frame['mean_of_log'] = np.log1p(values).mean(axis=1)
    frame['log_of_median'] = np.log1p(values.median(axis=1))
    frame['nb_nans'] = values.isnull().sum(axis=1)
    frame['the_sum'] = np.log1p(values.sum(axis=1))
    frame['the_std'] = values.std(axis=1)
    frame['the_kur'] = values.kurtosis(axis=1)
    return frame


new_train = _add_row_aggregates(new_train, features)
new_test = _add_row_aggregates(new_test, features)

# Final model inputs: the pre-selected raw features, the log leak and the
# row statistics computed above.
features = good_features.tolist()
features = features + ['log_leak'] + agg_features

In [23]:
# Hyper-parameters are the same for every fold, so they are written once here;
# each fold receives its own shallow copy.
# NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments used
# below were removed from lgb.train in LightGBM 4.0 - confirm the installed version.
lgb_params = {
    'objective': 'regression',
    'num_leaves': 58,
    'subsample': 0.6143,
    'colsample_bytree': 0.6453,
    'min_split_gain': np.power(10, -2.5988),
    'reg_alpha': np.power(10, -2.2887),
    'reg_lambda': np.power(10, 1.7570),
    'min_child_weight': np.power(10, -0.1477),
    'verbose': -1,
    'seed': 3,
    'boosting_type': 'gbdt',
    'max_depth': -1,
    'learning_rate': 0.05,
    'metric': 'l2',
}

# Full training matrix; the raw data is kept so that fold rows can be sliced
# out of dtrain.data for prediction.
dtrain = lgb.Dataset(data=new_train[features],
                     label=target, free_raw_data=False)
dtrain.construct()

# Accumulators: out-of-fold predictions for the training rows and the
# fold-averaged prediction for the remaining test rows (both on the log1p scale).
oof_preds = np.zeros(new_train.shape[0])
new_test['target'] = 0

for trn_idx, val_idx in folds.split(new_train):
    clf = lgb.train(
        params=dict(lgb_params),
        train_set=dtrain.subset(trn_idx),
        valid_sets=dtrain.subset(val_idx),
        num_boost_round=10000,
        early_stopping_rounds=100,
        verbose_eval=0
    )

    oof_preds[val_idx] = clf.predict(dtrain.data.iloc[val_idx])
    new_test['target'] += clf.predict(new_test[features]) / folds.n_splits

    # Per-fold RMSE on the held-out rows.
    print(mean_squared_error(target.iloc[val_idx],
                             oof_preds[val_idx]) ** .5)

# Where a leak is known, it replaces the model's out-of-fold prediction.
new_train['predictions'] = oof_preds
new_train.loc[new_train['leak'].notnull(), 'predictions'] = np.log1p(
    new_train.loc[new_train['leak'].notnull(), 'leak']
)

print('OOF SCORE : %9.6f'
      % (mean_squared_error(target, oof_preds) ** .5))
print('OOF SCORE with LEAK : %9.6f'
      % (mean_squared_error(target, new_train['predictions']) ** .5))



0.43709352821423636
0.38842839561550246
0.4178494131247261
0.3756164175243576
0.36897310192136373
OOF SCORE :  0.398438
OOF SCORE with LEAK :  0.391405


In [24]:
# The fold-averaged predictions are on the log1p scale (the model was trained
# on log1p targets); map them back with expm1.
# NOTE: re-running this cell without re-running the training cell applies
# expm1 a second time.
new_test['target'] = np.expm1(new_test['target'])
new_test.head(11)

Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,target,leak,log_leak,log_of_mean,mean_of_log,log_of_median,nb_nans,the_sum,the_std,the_kur
0,000137c73,,,,,,,,,,...,1961166.0,,,17.159699,14.608343,15.428193,4913,21.516408,112156200.0,63.987338
1,00021489f,,,,,,,,,,...,1402277.0,,,15.854051,13.910498,13.983117,4979,18.338957,10213670.0,1.32253
2,0004d7953,,,,,,,,,,...,1559789.0,,,15.59572,14.059439,14.508658,4892,20.19084,11525890.0,19.21089
3,00056a333,,,,,,,,,,...,3682574.0,,,16.507243,15.878736,16.198495,4871,21.294735,13984620.0,10.568926
4,00056d8eb,,,,,,,,,,...,999647.6,,,16.626928,14.869434,15.036607,4983,18.706369,36913640.0,7.851651
5,0005fc190,,,,,,,,,,...,2617852.0,,,16.238255,15.286862,15.753069,4969,19.329298,13522890.0,0.916292
6,000787e86,,,,,,,,,,...,1756854.0,,,16.688396,13.88565,14.607098,4966,19.907272,30246050.0,4.019249
7,0008510a0,,,,,,,,,,...,1762901.0,,,16.302419,15.266272,15.332173,4984,18.248329,16637160.0,1.843609
8,000895faf,,,24617120.0,,,,,,,...,3242100.0,,,16.25726,14.830404,15.456254,4963,19.589464,15894700.0,4.704776
9,000986fba,,,,,,,,,12675000.0,...,1567430.0,,,15.701677,14.765847,15.053885,4843,20.698889,10194390.0,20.910049


In [None]:
#sub = test[["ID"]]
#sub["target"] = test_leak["compiled_leak"]
#sub.loc[sub["target"] > 0, "target"] = np.expm1(oof_preds[len(train):])
#sub.loc[sub["target"] == 0, "target"] = new_test['target'].values

In [25]:
# Build the submission: the leaked value where one was found, the model
# prediction everywhere else.
# .copy(): columns are written below, which must not be done on a slice of `test`.
sub = test[["ID"]].copy()
sub["target"] = test_leak["compiled_leak"]
# new_test holds exactly the rows with compiled_leak == 0, in the same order,
# so its predictions can be written positionally.
sub.loc[sub["target"] == 0, "target"] = new_test['target'].values
sub.head(12)

Unnamed: 0,ID,target
0,000137c73,1961166.0
1,00021489f,1402277.0
2,0004d7953,1559789.0
3,00056a333,3682574.0
4,00056d8eb,999647.6
5,0005fc190,2617852.0
6,000787e86,1756854.0
7,0008510a0,1762901.0
8,000895faf,3242100.0
9,000986fba,1567430.0


In [None]:
# Write the submission to the working directory; the file name records the
# lag count that was used to compile the leak.
sub.to_csv(f"lgb_and_leak_{best_lag}.csv", index=False)
print(f"lgb_and_leak_{best_lag}.csv saved")

In [34]:
#train_res = train_leak[["ID"]+leaky_cols+['compiled_leak']].replace(0.0, np.nan)
#train_res.to_csv('train_leak.csv', index=False)
#print(f"train_leak.csv saved")

train_leak.csv saved


In [35]:
#xtest_res = test_leak[["ID"]+leaky_cols+['compiled_leak']].replace(0.0, np.nan)
#xtest_res.to_csv('test_leak.csv', index=False)
#xprint(f"test_leak.csv saved")

test_leak.csv saved
