In [1]:
import numpy as np
import pandas as pd
import time
import warnings
import gc
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns
import datetime
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import log_loss
import lightgbm as lgb
from sklearn import preprocessing
warnings.filterwarnings('ignore')
plt.style.use('seaborn')

## Simple processing train and test

In [2]:
train = pd.read_csv('../data/train.csv', parse_dates=["first_active_month"])
test = pd.read_csv('../data/test.csv', parse_dates=["first_active_month"])
test["target"] = -9999
data = pd.concat([train, test])
data["year"] = data["first_active_month"].apply(lambda x: x.year)
data["month"] = data["first_active_month"].apply(lambda x: x.month)
data["day"] = data["first_active_month"].apply(lambda x: x.day)
data["dayofyear"] = data["first_active_month"].apply(lambda x: x.dayofyear)
data['week'] = data["first_active_month"].dt.weekofyear
data['dayofweek'] = data['first_active_month'].dt.dayofweek
data['days'] = (datetime.date(2018, 2, 1) - data['first_active_month'].dt.date).dt.days
data["quarter"] = data["first_active_month"].apply(lambda x: x.quarter)
data["is_month_start"] = data["first_active_month"].apply(lambda x: x.is_month_start)
data["days_feature1"] = data["days"] * data["feature_1"]
data["days_feature2"] = data["days"] * data["feature_2"]
data["days_feature3"] = data["days"] * data["feature_3"]

In [3]:
def pro_trans(trans):  
    
    trans["authorized_flag"] = trans["authorized_flag"].map({"Y": 1, "N": 0})
    trans["category_1"] = trans["category_1"].map({"Y": 1, "N":0})
    trans["purchase_date"] = pd.to_datetime(trans["purchase_date"])
    trans["year"] = trans["purchase_date"].apply(lambda x: x.year)
    trans["month"] = trans["purchase_date"].apply(lambda x: x.month)
    trans["weekofyear"] = trans["purchase_date"].apply(lambda x: x.weekofyear)
    trans["dayofweek"] = trans["purchase_date"].apply(lambda x: x.dayofweek)
    trans["weekend"] = (trans["purchase_date"].apply(lambda x: x.dayofweek) >= 5).astype(int)
    trans["hour"] = trans["purchase_date"].apply(lambda x: x.hour)
    trans["quarter"] = trans["purchase_date"].apply(lambda x: x.quarter)
    trans["minute"] = trans["purchase_date"].apply(lambda x: x.minute)
    trans["month_diff"] = ((datetime.datetime.today() - trans["purchase_date"]).apply(lambda x: x.days)) // 30
    trans["month_diff"] += trans["month_lag"]
    trans["month_diff2"] = trans["month"] - trans["month_lag"]
    trans["category_2"] = trans["category_2"].fillna(value = 2.0)
    trans["category_3"] = trans["category_3"].fillna(value = "A")
    trans["merchant_id"] = trans["merchant_id"].fillna(value = "M_ID_00a6ca8a8a")
    lbl = preprocessing.LabelEncoder()
    trans["category_3"] = lbl.fit_transform(list(trans["category_3"].values))
    
    agg_func = {"mean": ["mean"]}
    for col in ["category_2", "category_3"]:
        trans[col+"_pa_mean"] = trans["purchase_amount"].groupby(trans[col]).agg("mean")
        trans[col+"_pa_max"] = trans["purchase_amount"].groupby(trans[col]).agg("max")
        trans[col+"_pa_min"] = trans["purchase_amount"].groupby(trans[col]).agg("min")
        trans[col+"_pa_var"] = trans["purchase_amount"].groupby(trans[col]).agg("var")
        trans[col+"_im_mean"] = trans["installments"].groupby(trans[col]).agg("mean")
        trans[col+"_im_max"] = trans["installments"].groupby(trans[col]).agg("max")
        trans[col+"_im_min"] = trans["installments"].groupby(trans[col]).agg("min")
        trans[col+"_im_var"] = trans["installments"].groupby(trans[col]).agg("var")
    for col in ["month", "hour"]:
        trans[col+"_pa_mean"] = trans["purchase_amount"].groupby(trans[col]).agg("mean")
        trans[col+"_pa_max"] = trans["purchase_amount"].groupby(trans[col]).agg("max")
        trans[col+"_pa_min"] = trans["purchase_amount"].groupby(trans[col]).agg("min")
        trans[col+"_pa_var"] = trans["purchase_amount"].groupby(trans[col]).agg("var")
        trans[col+"_im_mean"] = trans["installments"].groupby(trans[col]).agg("mean")
        trans[col+"_im_max"] = trans["installments"].groupby(trans[col]).agg("max")
        trans[col+"_im_min"] = trans["installments"].groupby(trans[col]).agg("min")
        trans[col+"_im_var"] = trans["installments"].groupby(trans[col]).agg("var")
    trans_data = trans
    
    return trans_data

In [4]:
# Taking Reference from Other Kernels
def trans_agg(trans, nunique_col, prefix):
    agg_func = {"purchase_date":["max", "min"],
                "month_diff": ["max", "min", "mean", "var"],
                "weekend": ["max", "min", "mean", "sum"],
                "authorized_flag": ["max", "min", "mean", "sum"],
                "category_1": ["max", "min", "mean", "sum"],
                "category_2": ["max", "min", "mean", "sum"],
                "category_3": ["max", "min", "mean", "sum"],
                "installments": ["max", "min", "mean", "std", "sum"],
                "purchase_amount": ["max", "min", "mean", "std", "sum"],
                "merchant_id": ["nunique"],
                "month_lag": ["mean", "max", "min", "nunique", "var"],
                "month_diff": ["mean", "max", "min", "nunique", "var"],
                "card_id": ["size", "nunique"],
                "month": ["max", "min", "nunique"],
                "hour": ["max", "min", "nunique"],
                "weekofyear": ["max", "min", "nunique"],
                "dayofweek": ["max", "min", "nunique"],
                "year": ["max", "min", "nunique"],
                "subsector_id": ["max", "min", "nunique"],
                "merchant_category_id": ["max", "min", "nunique"]}
    agg_trans = trans.groupby([nunique_col]).agg(agg_func)
    agg_trans.columns = [prefix + '_'.join(col).strip() for col in agg_trans.columns.values]
    agg_trans.reset_index(inplace=True)
    df = (trans.groupby(nunique_col).size().reset_index(name='{}transactions_count'.format(prefix)))
    agg_trans = pd.merge(df, agg_trans, on=nunique_col, how='left')

    return agg_trans

In [5]:
# Feature Engineering - Adding new features inspired by Chau's first kernel
def data_add_feat(data, prefix):
    data[prefix + "purchase_date_max"] = pd.to_datetime(data[prefix + "purchase_date_max"])
    data[prefix + "purchase_date_min"] = pd.to_datetime(data[prefix + "purchase_date_min"])
    data[prefix + "purchase_date_diff"] = (data[prefix + "purchase_date_max"] - data[prefix + "purchase_date_min"]).dt.days   
    data[prefix + "purchase_date_average"] = data[prefix + "purchase_date_diff"] / data[prefix + "card_id_size"]
    data[prefix + "purchase_date_uptonow"] = (datetime.datetime.today() - data[prefix + "purchase_date_max"]).dt.days
    data[prefix + "first_buy"] = (data[prefix + "purchase_date_min"] - data["first_active_month"]).dt.days
    data[prefix + "last_buy"] = (data[prefix + "purchase_date_max"] - data["first_active_month"]).dt.days
    for feature in [prefix + "purchase_date_max", prefix + "purchase_date_min"]:
        data[feature] = data[feature].astype(np.int64) * 1e-9
    
    return data

## Processing historical transactions

In [6]:
hist_trans = pd.read_csv('../data/historical_transactions.csv')
hist_trans_pro = pro_trans(hist_trans)
hist_card_trans = trans_agg(hist_trans_pro, nunique_col="card_id", prefix="hist_")
# del hist_trans_pro
gc.collect()

hist_data = data.merge(hist_card_trans, on="card_id", how="left")
gc.collect()

hist_data = data_add_feat(hist_data, prefix="hist_")
gc.collect()


106

## Processing new_transactions

In [26]:
new_trans = pd.read_csv('../data/new_merchant_transactions.csv')
new_trans_pro = pro_trans(new_trans)
merge_trans = trans_agg(new_trans, nunique_col="card_id", prefix='new_')
# gc.collect()

data = pd.merge(hist_data, merge_trans, on='card_id', how='left')
# gc.collect()

data = data_add_feat(data, prefix="new_")
# gc.collect()


In [None]:
# del hist_trans
del hist_trans_pro
del hist_card_trans

### Add features base on merchant_id for hist、new trans

In [8]:
def merchant_id_pro(trans, prefix):
    df = trans[["card_id", "merchant_id"]]
    grouped = trans.groupby("merchant_id")["installments", "purchase_amount"].mean().reset_index()
    grouped.columns = ["merchant_id", "merchant_mean_installments", "merchant_mean_purchase_amount"]
    trans_last = df.merge(grouped, on="merchant_id", how="left")
    
    grouped = trans.groupby("merchant_id")["installments", "purchase_amount"].std().reset_index()
    grouped.columns = ["merchant_id", "merchant_std_installments", "merchant_std_purchase_amount"]
    trans_last = trans_last.merge(grouped, on="merchant_id", how="left")
    
    return trans_last
    

In [None]:
hist_trans_last = merchant_id_pro(hist_trans, prefix="new_")
hist_trans_last = hist_trans_last.groupby("card_id", as_index=False).mean()
hist_data = data.merge(hist_trans_last, on="card_id", how="left")
hist_data1 = hist_data[["card_id", "merchant_mean_installments", "merchant_mean_purchase_amount",
                 "merchant_std_installments", "merchant_std_purchase_amount"]]
data = data.merge(hist_data1, on="card_id", how="left")

In [9]:
# new_trans = pd.read_csv('../data/new_merchant_transactions.csv')
new_trans_last = merchant_id_pro(new_trans, prefix="new_")
new_trans_last = new_trans_last.groupby("card_id", as_index=False).mean()
new_data = data.merge(new_trans_last, on="card_id", how="left")
new_data1 = new_data[["card_id", "merchant_mean_installments", "merchant_mean_purchase_amount",
                 "merchant_std_installments", "merchant_std_purchase_amount"]]
data = data.merge(new_data1, on="card_id", how="left")

In [28]:
# added new feature - Interactive
data['card_id_total'] = data['new_card_id_size'] + data['hist_card_id_size']
# data['purchase_amount_total'] = data['new_purchase_amount_sum'] + data['hist_purchase_amount_sum']
data["purchase_amount_mean"] = data["new_purchase_amount_mean"] + data["hist_purchase_amount_mean"]
data["purchase_amount_max"] = data["new_purchase_amount_max"] + data["hist_purchase_amount_max"]
data["purchase_amount_min"] = data["new_purchase_amount_min"] + data["hist_purchase_amount_min"]
data["purchase_amount_std"] = data["new_purchase_amount_std"] + data["hist_purchase_amount_std"]

# data["installments_total"] = data["new_installments_sum"] + data["hist_installments_sum"]
data["installments_mean"] = data["new_installments_mean"] + data["hist_installments_mean"]
data["installments_max"] = data["new_installments_max"] + data["hist_installments_max"]
data["installments_min"] = data["new_installments_min"] + data["hist_installments_min"]
data["installments_std"] = data["new_installments_std"] + data["hist_installments_std"]

gc.collect()

105

In [29]:
data["hist_month_nunique_hist_month_diff_mean_add"] = data["hist_month_nunique"] + data["hist_month_diff_mean"]   
data["hist_month_nunique_hist_month_diff_mean_sub"] = data["hist_month_nunique"] - data["hist_month_diff_mean"]   
data["hist_month_nunique_hist_month_diff_mean_mul"] = data["hist_month_nunique"] * data["hist_month_diff_mean"]
data["hist_month_nunique_hist_month_diff_mean_div"] = data["hist_month_nunique"] / data["hist_month_diff_mean"]

data["hist_month_nunique_hist_authorized_flag_mean_add"] = data["hist_month_nunique"] + data["hist_authorized_flag_mean"]    
data["hist_month_nunique_hist_authorized_flag_mean_sub"] = data["hist_month_nunique"] - data["hist_authorized_flag_mean"]
data["hist_month_nunique_hist_authorized_flag_mean_mul"] = data["hist_month_nunique"] * data["hist_authorized_flag_mean"]
data["hist_month_nunique_hist_authorized_flag_mean_div"] = data["hist_month_nunique"] / data["hist_authorized_flag_mean"]

data["hist_month_diff_mean_hist_authorized_flag_mean_add"] = data["hist_month_diff_mean"] + data["hist_authorized_flag_mean"]   
data["hist_month_diff_mean_hist_authorized_flag_mean_sub"] = data["hist_month_diff_mean"] - data["hist_authorized_flag_mean"]
data["hist_month_diff_mean_hist_authorized_flag_mean_mul"] = data["hist_month_diff_mean"] * data["hist_authorized_flag_mean"]
data["hist_month_diff_mean_hist_authorized_flag_mean_div"] = data["hist_month_diff_mean"] / data["hist_authorized_flag_mean"]

data["hist_month_nunique_new_purchase_date_diff_add"] = data["hist_month_nunique"] + data["new_purchase_date_diff"]
data["hist_month_nunique_new_purchase_date_diff_sub"] = data["hist_month_nunique"] - data["new_purchase_date_diff"]
data["hist_month_nunique_new_purchase_date_diff_mul"] = data["hist_month_nunique"] * data["new_purchase_date_diff"]
data["hist_month_nunique_new_purchase_date_diff_div"] = data["hist_month_nunique"] / data["new_purchase_date_diff"]

data["hist_month_diff_mean_new_purchase_date_diff_add"] = data["hist_month_diff_mean"] + data["new_purchase_date_diff"]
data["hist_month_diff_mean_new_purchase_date_diff_sub"] = data["hist_month_diff_mean"] - data["new_purchase_date_diff"]
data["hist_month_diff_mean_new_purchase_date_diff_mul"] = data["hist_month_diff_mean"] * data["new_purchase_date_diff"]
data["hist_month_diff_mean_new_purchase_date_diff_div"] = data["hist_month_diff_mean"] / data["new_purchase_date_diff"]

data["hist_authorized_flag_mean_new_purchase_date_diff_add"] = data["hist_authorized_flag_mean"] + data["new_purchase_date_diff"]    
data["hist_authorized_flag_mean_new_purchase_date_diff_sub"] = data["hist_authorized_flag_mean"] - data["new_purchase_date_diff"]
data["hist_authorized_flag_mean_new_purchase_date_diff_mul"] = data["hist_authorized_flag_mean"] * data["new_purchase_date_diff"]
data["hist_authorized_flag_mean_new_purchase_date_diff_div"] = data["hist_authorized_flag_mean"] / data["new_purchase_date_diff"]

data["hist_month_nunique_hist_month_lag_mean_add"] = data["hist_month_nunique"] + data["hist_month_lag_mean"]
data["hist_month_nunique_hist_month_lag_mean_sub"] = data["hist_month_nunique"] - data["hist_month_lag_mean"]
data["hist_month_nunique_hist_month_lag_mean_mul"] = data["hist_month_nunique"] * data["hist_month_lag_mean"]
data["hist_month_nunique_hist_month_lag_mean_div"] = data["hist_month_nunique"] / data["hist_month_lag_mean"]

data["hist_month_diff_mean_hist_month_lag_mean_add"] = data["hist_month_diff_mean"] + data["hist_month_lag_mean"]
data["hist_month_diff_mean_hist_month_lag_mean_sub"] = data["hist_month_diff_mean"] - data["hist_month_lag_mean"]
data["hist_month_diff_mean_hist_month_lag_mean_mul"] = data["hist_month_diff_mean"] * data["hist_month_lag_mean"]
data["hist_month_diff_mean_hist_month_lag_mean_div"] = data["hist_month_diff_mean"] / data["hist_month_lag_mean"]

data["hist_authorized_flag_mean_hist_month_lag_mean_add"] = data["hist_authorized_flag_mean"] + data["hist_month_lag_mean"]
data["hist_authorized_flag_mean_hist_month_lag_mean_sub"] = data["hist_authorized_flag_mean"] - data["hist_month_lag_mean"]
data["hist_authorized_flag_mean_hist_month_lag_mean_mul"] = data["hist_authorized_flag_mean"] * data["hist_month_lag_mean"]
data["hist_authorized_flag_mean_hist_month_lag_mean_div"] = data["hist_authorized_flag_mean"] / data["hist_month_lag_mean"]

data["new_purchase_date_diff_hist_month_lag_mean_add"] = data["new_purchase_date_diff"] + data["hist_month_lag_mean"]
data["new_purchase_date_diff_hist_month_lag_mean_sub"] = data["new_purchase_date_diff"] - data["hist_month_lag_mean"]
data["new_purchase_date_diff_hist_month_lag_mean_mul"] = data["new_purchase_date_diff"] * data["hist_month_lag_mean"]
data["new_purchase_date_diff_hist_month_lag_mean_div"] = data["new_purchase_date_diff"] / data["hist_month_lag_mean"]


In [31]:
# Check for missing values in training set
nulls = np.sum(data.isnull())
nullcols = nulls.loc[(nulls != 0)]
dtypes = data.dtypes
dtypes2 = dtypes.loc[(nulls != 0)]
info = pd.concat([nullcols, dtypes2], axis=1).sort_values(by=0, ascending=False)

numeric_dtypes = ['float64']
numerics = []
for i in data.columns:
    if data[i].dtype in numeric_dtypes:
        numerics.append(i)
train = data[data["target"] != -9999]
test = data[data["target"] == -9999]
test = test.drop(["target"], axis=1)

train['outliers'] = 0
train.loc[train['target'] < -30, 'outliers'] = 1
train['outliers'].value_counts()

for features in ['feature_1', 'feature_2', 'feature_3']:
    order_label = train.groupby([features])['outliers'].mean()
    train[features] = train[features].map(order_label)
    test[features] = test[features].map(order_label)

In [32]:
train_df = train
test_df = test

In [33]:
train_df = train_df[train_df["outliers"] == 0]
target = train_df["target"]
del train_df["target"]
features = [c for c in train_df.columns if c not in ["card_id", "first_active_month", "outliers"]]
categorical_feats = [c for c in features if "feature_" in c]

In [34]:
%%time
param = {'num_leaves': 31,
         'min_data_in_leaf': 32, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.001,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 10,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": -1}

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df,train_df['outliers'].values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval= -1,
                    early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))
    

fold 0
Training until validation scores don't improve for 200 rounds.
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 1.51502	valid_1's rmse: 1.55964
fold 1
Training until validation scores don't improve for 200 rounds.
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 1.51466	valid_1's rmse: 1.55663
fold 2
Training until validation scores don't improve for 200 rounds.
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 1.51548	valid_1's rmse: 1.55285
fold 3
Training until validation scores don't improve for 200 rounds.
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 1.51326	valid_1's rmse: 1.56575
fold 4
Training until validation scores don't improve for 200 rounds.
Did not meet early stopping. Best iteration is:
[10000]	training's rmse: 1.51606	valid_1's rmse: 1.55362
CV score: 1.55770 
Wall time: 38min 45s


In [35]:
model_without_outliers = pd.DataFrame({"card_id":test["card_id"].values})
model_without_outliers["target"] = predictions

In [36]:
model_without_outliers.to_csv("../submission/"+str(int(time.strftime("%Y%m%d%H%M%S", time.localtime(time.time())))) +".csv", index=False)    

In [37]:
train_df = train
test_df = test
target = train_df["outliers"]
del train_df["outliers"]
del train_df["target"]

features = [c for c in train_df.columns if c not in ["card_id", "first_active_month"]]
categorical_feats = [c for c in features if "feature_" in c]

In [38]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': -1,
         'learning_rate': 0.001,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 10,
         "metric": 'binary_logloss',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": -1}

folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(train_df))
predictions2 = np.zeros(len(test_df))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=-1, 
                    early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions2 += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(log_loss(target, oof)))


fold n°0
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[5302]	training's binary_logloss: 0.0293103	valid_1's binary_logloss: 0.0446472
fold n°1
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[5036]	training's binary_logloss: 0.030062	valid_1's binary_logloss: 0.0434941
fold n°2
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[5936]	training's binary_logloss: 0.0288192	valid_1's binary_logloss: 0.0425688
fold n°3
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[5013]	training's binary_logloss: 0.0290901	valid_1's binary_logloss: 0.0475513
fold n°4
Training until validation scores don't improve for 200 rounds.
Early stopping, best iteration is:
[5629]	training's binary_logloss: 0.0292013	valid_1's binary_logloss: 0.0431724
CV score: 0.04429 


In [39]:
df_outlier_prob = pd.DataFrame({"card_id": test_df["card_id"].values})
df_outlier_prob["target"] = predictions2

In [40]:
outlier_id = pd.DataFrame(df_outlier_prob.sort_values(by="target", ascending=False).head(20000)["card_id"])
best_submission = pd.read_csv("../submission/6911+6912+0103163343_2.csv")
most_likely_liers = best_submission.merge(outlier_id, how="right")

In [41]:
for card_id in most_likely_liers["card_id"]:
    model_without_outliers.loc[model_without_outliers["card_id"] == card_id, "target"] = \
    most_likely_liers.loc[most_likely_liers["card_id"] == card_id, "target"].values

In [42]:
model_without_outliers.to_csv("../submission/"+str(int(time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))))+".csv", index=False)             