In [1]:
import numpy as np
import pandas as pd
import time
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss

In [2]:
df_train = pd.read_csv('train_feature.csv')
df_test = pd.read_csv('test_feature.csv')

In [17]:
train_clean = pd.read_csv('train_clean.csv')
test_clean = pd.read_csv('test_clean.csv', sep='\t')

In [4]:
df_train.head(3)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,elapsed_time,hist_transactions_count,hist_category_1_sum,hist_category_1_mean,hist_category_2_1.0_mean,...,installments_purchase_amount_max,installments_purchase_amount_std,city_id_purchase_amount_mean,city_id_purchase_amount_min,city_id_purchase_amount_max,city_id_purchase_amount_std,category_1_installments_mean,category_1_installments_min,category_1_installments_max,category_1_installments_std
0,2017-06-01,C_ID_92a2005557,5,2,1,245,13.0,0.0,0.0,1.0,...,-0.575835,,-0.458993,-0.606593,-0.296112,0.155803,0.0,0.0,0.0,
1,2017-01-01,C_ID_3d0044924f,4,1,0,396,11.0,2.0,0.181818,0.818182,...,-0.725956,,-0.725956,-0.725956,-0.725956,,1.0,1.0,1.0,
2,2016-08-01,C_ID_d639edf6cd,2,2,0,549,2.0,0.0,0.0,0.0,...,-0.700326,,-0.700326,-0.700326,-0.700326,,0.0,0.0,0.0,


In [5]:
train_clean.head(3)

Unnamed: 0,first_active_month,card_id,feature_3,target,month,year,elapsed_time,feature_1_1,feature_1_2,feature_1_3,...,new_month_lag_min,new_month_lag_max,new_category_2_3.0_mean,new_category_2_2.0_mean,new_purchase_date_ptp,new_purchase_date_min,new_purchase_date_max,new_category_3_B_mean,new_city_id_nunique,outliers
0,2017-06-01,C_ID_92a2005557,1,-0.820283,6,2017,245,0,0,0,...,1.0,2.0,0.0,0.0,4742309.0,1520259000.0,1525001000.0,0.0,3.0,0
1,2017-01-01,C_ID_3d0044924f,0,0.392913,1,2017,396,0,0,0,...,1.0,2.0,0.0,0.0,4887632.0,1517505000.0,1522393000.0,1.0,1.0,0
2,2016-08-01,C_ID_d639edf6cd,0,0.688056,8,2016,549,0,1,0,...,2.0,2.0,0.0,0.0,0.0,1524937000.0,1524937000.0,0.0,1.0,0


In [23]:
df_train['outliers'] = train_clean['outliers']
df_train['target'] = train_clean['target']

In [24]:
df_train = df_train[df_train['outliers'] == 0]
target = df_train['target']
del df_train['target']
features = [c for c in df_train.columns if c not in ['card_id', 'first_active_month','outliers']]
categorical_feats = [c for c in features if 'feature_' in c]

In [25]:
param = {'objective':'regression',
         'num_leaves': 31,
         'min_data_in_leaf': 25,
         'max_depth': 7,
         'learning_rate': 0.01,
         'lambda_l1':0.13,
         "boosting": "gbdt",
         "feature_fraction":0.85,
         'bagging_freq':8,
         "bagging_fraction": 0.9 ,
         "metric": 'rmse',
         "verbosity": -1,
         "random_state": 2333}

In [26]:
%%time
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2333)
oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train,df_train['outliers'].values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][features], label=target.iloc[trn_idx])#, categorical_feature=categorical_feats)
    val_data = lgb.Dataset(df_train.iloc[val_idx][features], label=target.iloc[val_idx])#, categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval= 100, early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(df_test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))

fold 0
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 1.60478	valid_1's rmse: 1.61613
[200]	training's rmse: 1.57349	valid_1's rmse: 1.58931
[300]	training's rmse: 1.55877	valid_1's rmse: 1.57884
[400]	training's rmse: 1.54895	valid_1's rmse: 1.57295
[500]	training's rmse: 1.54139	valid_1's rmse: 1.56928
[600]	training's rmse: 1.53542	valid_1's rmse: 1.56711
[700]	training's rmse: 1.53007	valid_1's rmse: 1.56576
[800]	training's rmse: 1.52525	valid_1's rmse: 1.56457
[900]	training's rmse: 1.52081	valid_1's rmse: 1.56379
[1000]	training's rmse: 1.51679	valid_1's rmse: 1.5632
[1100]	training's rmse: 1.51274	valid_1's rmse: 1.56286
[1200]	training's rmse: 1.50895	valid_1's rmse: 1.56262
[1300]	training's rmse: 1.50512	valid_1's rmse: 1.5623
[1400]	training's rmse: 1.50151	valid_1's rmse: 1.56218
[1500]	training's rmse: 1.49785	valid_1's rmse: 1.56184
[1600]	training's rmse: 1.4943	valid_1's rmse: 1.5617
[1700]	training's rmse: 1.49076	valid_1's rmse:

In [27]:
model_without_outliers = pd.DataFrame({"card_id":df_test["card_id"].values})
model_without_outliers["target"] = predictions

In [29]:
df_train = pd.read_csv('train_feature.csv')
df_test = pd.read_csv('test_feature.csv')

In [35]:
df_train['outliers'] = train_clean['outliers']

In [36]:
target = df_train['outliers']
del df_train['outliers']

In [37]:
features = [c for c in df_train.columns if c not in ['card_id', 'first_active_month']]
categorical_feats = [c for c in features if 'feature_' in c]

In [38]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': 6,
         'learning_rate': 0.01,
         "boosting": "rf",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'binary_logloss',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "random_state": 2333}

In [39]:
%%time
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))
feature_importance_df = pd.DataFrame()

start = time.time()


for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train.values, target.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(df_train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(df_test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(log_loss(target, oof)))

fold n°0




Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0444838	valid_1's binary_logloss: 0.0470287
[200]	training's binary_logloss: 0.0444626	valid_1's binary_logloss: 0.0470584
Early stopping, best iteration is:
[27]	training's binary_logloss: 0.0444635	valid_1's binary_logloss: 0.0469785
fold n°1
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.045163	valid_1's binary_logloss: 0.0460864
[200]	training's binary_logloss: 0.0450993	valid_1's binary_logloss: 0.0460215
[300]	training's binary_logloss: 0.0451133	valid_1's binary_logloss: 0.046037
[400]	training's binary_logloss: 0.0450984	valid_1's binary_logloss: 0.0460208
Early stopping, best iteration is:
[203]	training's binary_logloss: 0.0450944	valid_1's binary_logloss: 0.0460152
fold n°2
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0453457	valid_1's binary_logloss: 0.0446483
[200]	training'

In [40]:
df_outlier_prob = pd.DataFrame({"card_id":df_test["card_id"].values})
df_outlier_prob["target"] = predictions
df_outlier_prob.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,0.107539
1,C_ID_130fd0cbdd,0.002639
2,C_ID_b709037bc5,0.004452
3,C_ID_d27d835a9f,0.0022
4,C_ID_2b5e3df5c2,0.0022


In [41]:
outlier_id = pd.DataFrame(df_outlier_prob.sort_values(by='target',ascending = False).head(25000)['card_id'])

In [42]:
best_submission = pd.read_csv('submit.csv')

In [43]:
most_likely_liers = best_submission.merge(outlier_id,how='right')
most_likely_liers.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-4.706186
1,C_ID_6d8dba8475,-0.495479
2,C_ID_7f1041e8e1,-5.099685
3,C_ID_22e4a47c72,0.633485
4,C_ID_b54cfad8b2,-0.943807


In [44]:
%%time
for card_id in most_likely_liers['card_id']:
    model_without_outliers.loc[model_without_outliers['card_id']==card_id,'target']\
    = most_likely_liers.loc[most_likely_liers['card_id']==card_id,'target'].values

Wall time: 5min 56s


In [45]:
model_without_outliers.to_csv("submission.csv", index=False)