In [None]:
import numpy as np
import pandas as pd
import time
import warnings
import gc
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns
import datetime
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import log_loss
import lightgbm as lgb
from sklearn import preprocessing
warnings.filterwarnings('ignore')
plt.style.use('seaborn')

In [None]:
# Reduce the memory usage - Inspired by Panchajanya Banerjee
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to the narrowest fitting dtype.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    range contains the column's min and max; float columns are cast to float16,
    float32 or float64 by the same range test.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place and also returned.
    verbose : bool, default True
        When true, print the final memory footprint and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.

    Notes
    -----
    Only the value *range* is checked, not precision: a float column that fits
    the float16 range is cast to float16 even if that rounds its values.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column whose max is exactly 127 still fits int8.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # An all-NaN (or empty) column fails every comparison and falls
            # through to float64, as before.
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

## Simple processing of train and test

In [None]:
# Load train/test with the activation month parsed as a datetime.
train = reduce_mem_usage(pd.read_csv('../data/train.csv', parse_dates=["first_active_month"]))
test = reduce_mem_usage(pd.read_csv('../data/test.csv', parse_dates=["first_active_month"]))
# -999 is the sentinel that later cells use to split the combined frame back
# into train and test rows.
test["target"] = -999
# ignore_index avoids duplicated row labels in the stacked frame.
data = pd.concat([train, test], ignore_index=True)

# Calendar features derived from the activation month.
first_active = data["first_active_month"].dt
data["year"] = first_active.year
data["month"] = first_active.month
data["day"] = first_active.day
data["dayofyear"] = first_active.dayofyear
if hasattr(first_active, "isocalendar"):
    # `.dt.weekofyear` was removed in pandas 2.0; cast to float so missing
    # dates become NaN instead of a nullable-integer <NA>.
    data['week'] = first_active.isocalendar().week.astype("float64")
else:
    data['week'] = first_active.weekofyear
data['dayofweek'] = first_active.dayofweek
# Days elapsed between activation and the fixed reference date 2018-02-01.
# Timestamp arithmetic keeps a timedelta dtype (missing dates -> NaN).
data['days'] = (pd.Timestamp("2018-02-01") - data['first_active_month']).dt.days

### project features

In [None]:
# (feature column, reference date) pairs for the "days until date" features.
# The dates are kept exactly as the notebook had them.
_HOLIDAY_DATES = (
    ("New_year_2017", "2017-01-01"),
    ("Mlk_day_2017", "2017-01-16"),
    ("President_day_2017", "2017-02-20"),
    ("Valentine’s_day_2017", "2017-02-14"),
    ("Patrick_day_2017", "2017-03-17"),
    ("Earth_day_2017", "2017-04-24"),
    # This window used to be written into 'Patrick_day_2017', silently
    # replacing the 03-17 feature; it now has its own column.
    # NOTE(review): which holiday 2017-04-05 stands for is not stated - confirm.
    ("April_05_day_2017", "2017-04-05"),
    ("Mothers_day_2017", "2017-05-14"),
    ("Memorial_day_2017", "2017-05-29"),
    ("Fathers_day_2017", "2017-06-18"),
    ("Independence_day_2017", "2017-07-04"),
    ("Labor_day_2017", "2017-09-05"),
    ("Columbus_day_2017", "2017-10-09"),
    ("Halloweens_day_2017", "2017-10-31"),
    ("Global_shopping_2017", "2017-11-11"),
    ("Thanksgiving_Day_2017", "2017-11-23"),
    ("Christmas_Eve_day_2017", "2017-12-24"),
    ("New_year_2018", "2018-01-01"),
    ("Mlk_day_2018", "2018-01-16"),
)

# Statistics broadcast per category level by pro_trans.
_CATEGORY_STATS = ("mean", "sum", "max", "min", "count", "std", "var")


def _iso_week_of_year(dates):
    """Return the ISO week number of a datetime Series on old and new pandas."""
    accessor = dates.dt
    if hasattr(accessor, "isocalendar"):
        week = accessor.isocalendar().week
        # Plain numpy dtypes: float when dates are missing, int otherwise.
        return week.astype("float64") if week.isna().any() else week.astype("int64")
    return accessor.weekofyear


def _days_until(purchase_dates, holiday, window=60):
    """Whole days from each purchase to ``holiday``; 0 unless 0 < days < window."""
    days = (pd.to_datetime(holiday) - purchase_dates).dt.days
    return days.where((days > 0) & (days < window), 0)


def pro_trans(trans, ref_date=None, category_stats=_CATEGORY_STATS):
    """Clean a transactions frame in place and add row-level features.

    Parameters
    ----------
    trans : pandas.DataFrame
        Transactions with at least ``authorized_flag``, ``category_1``,
        ``category_2``, ``category_3``, ``merchant_id``, ``purchase_date``,
        ``purchase_amount``, ``installments`` and ``month_lag``.
    ref_date : datetime-like, optional
        "Now" used for ``month_diff``. Defaults to the current time, which is
        the historical behaviour; pass a fixed date for reproducible features.
    category_stats : sequence of str, optional
        Group statistics of ``purchase_amount`` and ``installments`` to
        broadcast per level of each category column. Pass ``()`` to skip them
        (each statistic adds six columns to the frame).

    Returns
    -------
    pandas.DataFrame
        The same ``trans`` object with the added/cleaned columns.
    """
    ref_date = pd.Timestamp(datetime.datetime.today() if ref_date is None else ref_date)

    # Binary flags -> 0/1.
    trans["authorized_flag"] = trans["authorized_flag"].map({"Y": 1, "N": 0})
    trans["category_1"] = trans["category_1"].map({"Y": 1, "N": 0})

    # Calendar decomposition of the purchase timestamp.
    trans["purchase_date"] = pd.to_datetime(trans["purchase_date"])
    purchase = trans["purchase_date"].dt
    trans["year"] = purchase.year
    trans["month"] = purchase.month
    trans["day"] = purchase.day
    trans["weekofyear"] = _iso_week_of_year(trans["purchase_date"])
    trans["dayofweek"] = purchase.dayofweek
    trans["weekend"] = (purchase.weekday >= 5).astype(int)
    trans["hour"] = purchase.hour
    trans["minute"] = purchase.minute

    # Age of the purchase in 30-day months relative to ref_date, shifted by month_lag.
    trans["month_diff"] = ((ref_date - trans["purchase_date"]).dt.days) // 30
    trans["month_diff"] += trans["month_lag"]

    # Fixed fill values for missing categorical fields.
    trans["category_2"] = trans["category_2"].fillna(2.0)
    trans["category_3"] = trans["category_3"].fillna("A")
    trans["merchant_id"] = trans["merchant_id"].fillna("M_ID_00a6ca8a8a")

    # Per-category statistics go into NEW columns. The previous code assigned a
    # multi-column aggregation result to the category column itself, which
    # raises and would otherwise destroy the 0/1 category values.
    for col in ["category_1", "category_2", "category_3"]:
        keys = trans[col]
        if keys.dtype == np.float16:
            # float16 keys (possible after reduce_mem_usage) are poorly
            # supported as group labels; widen them for grouping only.
            keys = keys.astype(np.float32)
        for value_col in ("purchase_amount", "installments"):
            grouped = trans[value_col].groupby(keys)
            for stat in category_stats:
                trans["{}_{}_{}".format(col, value_col, stat)] = grouped.transform(stat)

    # Days remaining until each reference date (0 outside the 60-day window),
    # computed vectorised instead of with a per-row Python lambda.
    for column, holiday in _HOLIDAY_DATES:
        trans[column] = _days_until(trans["purchase_date"], holiday)

    return trans
    

In [None]:
# Taking Reference from Other Kernels
def trans_agg(trans, prefix):
    """Aggregate a processed transactions frame to one row per ``card_id``.

    Parameters
    ----------
    trans : pandas.DataFrame
        Transactions carrying every column named in the aggregation spec below.
    prefix : str
        String prepended to every aggregated column name (e.g. ``'hist_'``).

    Returns
    -------
    pandas.DataFrame
        One row per card with ``card_id``, ``<prefix>transactions_count`` and
        one ``<prefix><column>_<stat>`` column per requested statistic.
    """
    agg_func = {"purchase_date": ["max", "min", "mean", "nunique"],
                # The dict used to list "month_diff" twice; the later entry
                # (without "nunique") was the one in effect and is kept.
                "month_diff": ["max", "min", "mean"],
                "weekend": ["sum", "mean", "nunique"],
                "authorized_flag": ["max", "min", "sum", "mean", "nunique"],
                "category_1": ["sum", "mean", "nunique"],
                "installments": ["sum", "max", "min", "mean", "std", "median", "var"],
                "purchase_amount": ["sum", "max", "min", "mean", "std", "median", "var"],
                "month_lag": ["max", "min", "mean", "std", "var"],
                "card_id": ["size"],
                "month": ["nunique"],
                "hour": ["nunique"],
                "weekofyear": ["nunique"],
                "dayofweek": ["nunique"],
                "year": ["nunique"],
                "subsector_id": ["nunique"],
                "merchant_category_id": ["nunique"]}

    # Mean "days until date" per card for every holiday-window column
    # ("Patrick_day_2017" was listed twice before; once is enough).
    holiday_columns = ["New_year_2017", "Mlk_day_2017", "President_day_2017",
                       "Valentine’s_day_2017", "Patrick_day_2017", "Earth_day_2017",
                       "Mothers_day_2017", "Memorial_day_2017", "Fathers_day_2017",
                       "Independence_day_2017", "Labor_day_2017", "Columbus_day_2017",
                       "Halloweens_day_2017", "Global_shopping_2017",
                       "Thanksgiving_Day_2017", "Christmas_Eve_day_2017",
                       "New_year_2018", "Mlk_day_2018"]
    for column in holiday_columns:
        agg_func[column] = ["mean"]
    # Optional window column: aggregated only when the frame provides it.
    if "April_05_day_2017" in trans.columns:
        agg_func["April_05_day_2017"] = ["mean"]

    agg_trans = trans.groupby(['card_id']).agg(agg_func)
    # Flatten the (column, stat) MultiIndex into '<prefix><column>_<stat>'.
    agg_trans.columns = [prefix + '_'.join(col).strip() for col in agg_trans.columns.values]
    agg_trans = agg_trans.reset_index()
    # The per-card row count equals the 'size' aggregate above, so reuse it
    # instead of running a second groupby and a merge.
    agg_trans.insert(1, '{}transactions_count'.format(prefix), agg_trans[prefix + "card_id_size"])

    return agg_trans

In [None]:
# Feature Engineering - Adding new features inspired by Chau's first kernel
def data_add_feat(data, prefix, ref_date=None):
    """Derive purchase-date features from the ``<prefix>`` aggregates, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Card-level frame containing ``first_active_month`` (datetime),
        ``<prefix>purchase_date_max/min/mean`` and ``<prefix>card_id_size``.
    prefix : str
        Aggregate prefix, e.g. ``'hist_'`` or ``'new_'``.
    ref_date : datetime-like, optional
        "Now" used for ``<prefix>purchase_date_uptonow``. Defaults to the
        current time (historical behaviour); pass a fixed date for
        reproducible features.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the new columns; the three
        ``purchase_date_*`` aggregates are converted to float epoch seconds.
    """
    ref_date = pd.Timestamp(datetime.datetime.today() if ref_date is None else ref_date)
    date_max = prefix + "purchase_date_max"
    date_min = prefix + "purchase_date_min"
    date_mean = prefix + "purchase_date_mean"
    for column in (date_max, date_min, date_mean):
        data[column] = pd.to_datetime(data[column])

    span_days = (data[date_max] - data[date_min]).dt.days
    data[prefix + "purchase_date_diff1"] = span_days
    data[prefix + "purchase_date_diff2"] = (data[date_max] - data[date_mean]).dt.days
    # '<prefix>purchase_date_diff' is read just below and by later notebook
    # cells but was never created; it is the first-to-last purchase span.
    data[prefix + "purchase_date_diff"] = span_days

    data[prefix + "purchase_date_average"] = data[prefix + "purchase_date_diff"] / data[prefix + "card_id_size"]
    data[prefix + "purchase_date_uptonow"] = (ref_date - data[date_max]).dt.days
    data[prefix + "first_buy"] = (data[date_min] - data["first_active_month"]).dt.days
    data[prefix + "last_buy"] = (data[date_max] - data["first_active_month"]).dt.days

    # Convert the datetime aggregates to float seconds since the Unix epoch.
    # Timedelta division maps missing dates to NaN (an int64 cast turns NaT
    # into a huge negative sentinel or raises), and the mean column is
    # converted too so no datetime column reaches the model.
    epoch = pd.Timestamp("1970-01-01")
    one_second = pd.Timedelta(seconds=1)
    for feature in (date_max, date_min, date_mean):
        data[feature] = (data[feature] - epoch) / one_second

    return data

## Processing historical transactions

In [None]:
# Load the historical transactions, add row-level features, then aggregate
# them to one row per card with a 'hist_' column prefix.
hist_trans = reduce_mem_usage(pd.read_csv('../data/historical_transactions.csv'))
hist_trans_pro = pro_trans(hist_trans)
merge_trans = trans_agg(hist_trans_pro, prefix='hist_')
# hist_trans_pro is intentionally not deleted: pro_trans returns the frame it
# was given, and hist_trans is read again later to build merch_card.
gc.collect()

# Attach the per-card aggregates; cards without history get NaN features.
data = pd.merge(data, merge_trans, on='card_id', how='left')
del merge_trans
gc.collect()

# Derive date-based features from the 'hist_' aggregates.
data = data_add_feat(data, prefix="hist_")
gc.collect()


In [None]:
# Card -> merchant pairs taken from the historical transactions; duplicates
# per card are removed in the next cell.
merch_card = hist_trans[["card_id", "merchant_id"]].copy()

In [None]:
# Keep a single merchant per card: the one on the card's last row in merch_card.
merch_card = merch_card.drop_duplicates(["card_id"], keep="last")

In [None]:
# Attach each card's merchant_id so merchant attributes can be joined later.
data = pd.merge(data, merch_card, on="card_id", how="left")

## Processing new transactions

In [None]:
# Same pipeline as for the historical transactions, applied to the
# new-merchant transactions with a 'new_' column prefix.
new_trans = reduce_mem_usage(pd.read_csv('../data/new_merchant_transactions.csv'))
new_trans = pro_trans(new_trans)
merge_trans = trans_agg(new_trans, prefix='new_')
# new_trans is left in memory (deletion disabled).
gc.collect()

# Attach the per-card aggregates; cards without new transactions get NaN features.
data = pd.merge(data, merge_trans, on='card_id', how='left')
del merge_trans
gc.collect()

# Derive date-based features from the 'new_' aggregates.
data = data_add_feat(data, prefix="new_")
gc.collect()


In [None]:
# Sanity check: (rows, columns) of the combined feature frame after both merges.
data.shape

### merchant project

In [None]:
# Load the merchant attributes and encode their categorical columns as numbers.
merchants = pd.read_csv("../data/merchants.csv")

# Same Y/N convention as the transaction tables (Y -> 1, N -> 0); the flags
# were previously inverted here.
yes_no_codes = {"Y": 1, "N": 0}
for flag_col in ["category_1", "category_4"]:
    merchants[flag_col] = merchants[flag_col].map(yes_no_codes)

# Contiguous ordinal codes for the A..E ranges ("E" used to map to 5, skipping 4).
range_codes = {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4}
for range_col in ["most_recent_sales_range", "most_recent_purchases_range"]:
    merchants[range_col] = merchants[range_col].map(range_codes)

In [None]:
# Keep only merchants that are referenced by a card in `data`.
merchants_index = merchants["merchant_id"].isin(data["merchant_id"])
merchants = merchants[merchants_index]
# One row per merchant_id (first occurrence kept) so the join cannot duplicate cards.
merchants = merchants.drop_duplicates("merchant_id")
data = pd.merge(data, merchants, on="merchant_id", how="left")
# The raw id was only needed as the join key.
data = data.drop(["merchant_id"], axis=1)

### Add new feature —— new and hist cross features

In [None]:
# added new feature - Interactive
data['card_id_total'] = data['new_card_id_size'] + data['hist_card_id_size']
data['purchase_amount_total'] = data['new_purchase_amount_sum'] + data['hist_purchase_amount_sum']
data["purchase_amount_mean"] = data["new_purchase_amount_mean"] + data["hist_purchase_amount_mean"]
data["purchase_amount_max"] = data["new_purchase_amount_max"] + data["hist_purchase_amount_max"]
data["purchase_amount_min"] = data["new_purchase_amount_min"] + data["hist_purchase_amount_min"]
data["purchase_amount_std"] = data["new_purchase_amount_std"] + data["hist_purchase_amount_std"]

data["installments_total"] = data["new_installments_sum"] + data["hist_installments_sum"]
data["installments_mean"] = data["new_installments_mean"] + data["hist_installments_mean"]
data["installments_max"] = data["new_installments_max"] + data["hist_installments_max"]
data["installments_min"] = data["new_installments_min"] + data["hist_installments_min"]
data["installments_std"] = data["new_installments_std"] + data["hist_installments_std"]

gc.collect()

### Additional features

In [None]:
# Pairwise arithmetic features: for every (left, right) pair below, create
# '<left>_<right>_add', '_sub', '_mul' and '_div' columns.
feature_pairs = [
    ("hist_month_nunique", "hist_month_diff_mean"),
    ("hist_month_nunique", "hist_authorized_flag_mean"),
    ("hist_month_diff_mean", "hist_authorized_flag_mean"),
    ("hist_month_nunique", "new_purchase_date_diff"),
    ("hist_month_diff_mean", "new_purchase_date_diff"),
    ("hist_authorized_flag_mean", "new_purchase_date_diff"),
    ("hist_month_nunique", "hist_month_lag_mean"),
    ("hist_month_diff_mean", "hist_month_lag_mean"),
    ("hist_authorized_flag_mean", "hist_month_lag_mean"),
    ("new_purchase_date_diff", "hist_month_lag_mean"),
]
pair_operations = [
    ("add", lambda left, right: left + right),
    ("sub", lambda left, right: left - right),
    ("mul", lambda left, right: left * right),
    ("div", lambda left, right: left / right),
]
for left_name, right_name in feature_pairs:
    for op_suffix, combine in pair_operations:
        data[left_name + "_" + right_name + "_" + op_suffix] = combine(data[left_name], data[right_name])


#### Check for missing values in training set

In [None]:
# Summarise missing values in the combined (train + test) feature frame.
# DataFrame.sum is called directly: np.sum(DataFrame) depends on an axis
# default that pandas has deprecated in favour of a scalar reduction.
nulls = data.isnull().sum()
nullcols = nulls.loc[(nulls != 0)]
dtypes = data.dtypes
dtypes2 = dtypes.loc[(nulls != 0)]
# One row per column with missing values: column 0 is the null count,
# column 1 the dtype, sorted by null count descending.
info = pd.concat([nullcols, dtypes2], axis=1).sort_values(by=0, ascending=False)

# Names of the float64 columns.
numeric_dtypes = ['float64']
numerics = [column for column in data.columns if data[column].dtype in numeric_dtypes]


In [None]:
# Split the combined frame back into train/test using the -999 sentinel.
# .copy() makes `train` an independent frame, so the column assignments below
# do not write into a slice of `data`.
train = data[data["target"] != -999].copy()
test = data[data["target"] == -999].drop(["target"], axis=1)

# Flag the rows with target below -30 as outliers.
train['outliers'] = 0
train.loc[train['target'] < -30, 'outliers'] = 1
print(train['outliers'].value_counts())

# Replace each anonymised feature level by the outlier rate observed for that
# level in train, and apply the same mapping to test.
for feature_col in ['feature_1', 'feature_2', 'feature_3']:
    order_label = train.groupby([feature_col])['outliers'].mean()
    train[feature_col] = train[feature_col].map(order_label)
    test[feature_col] = test[feature_col].map(order_label)

## Combining a model with a model trained without outliers

In [None]:
# Working names for the modelling cells. These are references to the same
# objects, not copies.
train_df = train
test_df = test

### Part1: Training model without outliers

In [None]:
# Part 1 trains only on the non-outlier rows.
train_df = train_df[train_df["outliers"] == 0]
# Separate the regression target from the feature frame.
target = train_df["target"]
del train_df["target"]
# Every remaining column except identifiers and the outlier flag is a feature.
features = [c for c in train_df.columns if c not in ["card_id", "first_active_month", "outliers"]]
# Columns whose name contains "feature_".
categorical_feats = [c for c in features if "feature_" in c]

In [None]:
%%time
param = {'num_leaves': 31,
         'min_data_in_leaf': 32, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.001,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 8,
         "bagging_fraction": 0.9,
         "bagging_seed": 10,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 6}

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df,train_df['outliers'].values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval= -1,
                    early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))
    

In [None]:
# Rank features by their importance averaged over the folds and keep the
# names of (at most) the top 1000.
mean_importance = feature_importance_df[["feature", "importance"]].groupby("feature").mean()
cols = mean_importance.sort_values(by="importance", ascending=False).index[:1000]

# Per-fold importance rows of the selected features.
best_features = feature_importance_df[feature_importance_df["feature"].isin(cols)]
ranked_features = best_features.sort_values(by="importance", ascending=False)

# Horizontal bar chart, saved next to the notebook's other figures.
plt.figure(figsize=(14,25))
sns.barplot(x="importance", y="feature", data=ranked_features)
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('../fig/lgbm_importances.png')

In [None]:
# Test-set predictions of the regressor trained without outliers, keyed by card.
model_without_outliers = pd.DataFrame({"card_id": test["card_id"].values,
                                       "target": predictions})

### Part2: Training model for outliers classification

In [None]:
# Part 2 uses every training row; the label is the binary outlier flag.
target = train["outliers"]
# Build the feature frame with drop() rather than `del` on an alias of
# `train`: deleting through the alias removed the columns from `train`
# itself, so the cell could not be run a second time.
train_df = train.drop(["outliers", "target"], axis=1)
test_df = test

features = [c for c in train_df.columns if c not in ["card_id", "first_active_month"]]
# Columns whose name contains "feature_".
categorical_feats = [c for c in features if "feature_" in c]

In [None]:
# Part 2: 5-fold LightGBM binary classifier for the outlier flag. Produces
# out-of-fold probabilities (`oof`) and fold-averaged test probabilities
# (`predictions2`).
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': -1,
         'learning_rate': 0.001,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 10,
         "metric": 'binary_logloss',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 6}

folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(train_df))
predictions2 = np.zeros(len(test_df))
# Collected per fold and concatenated once after the loop.
fold_importances = []

# KFold only needs the number of rows, so the frame is passed directly rather
# than materialising `train_df.values` as one large array.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df)):
    print("fold n°{}".format(fold_))
    # The feature_* columns hold mean-encoded floats (outlier rate per level).
    # They are passed as ordinary numeric features, as in Part 1: LightGBM
    # expects categorical features to be integer codes, so flagging these
    # fractional values as categorical would discard their information.
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=-1, 
                    early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    fold_importances.append(fold_importance_df)
    
    # Average the test probabilities over the folds.
    predictions2 += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

print("CV score: {:<8.5f}".format(log_loss(target, oof)))


In [None]:
# Predicted outlier probability for every test card.
df_outlier_prob = pd.DataFrame({"card_id": test_df["card_id"].values,
                                "target": predictions2})

### Part 3: combining submission:

In [None]:
# card_ids of the 25000 test cards with the highest predicted outlier probability.
outlier_id = pd.DataFrame(df_outlier_prob.sort_values(by="target", ascending=False).head(25000)["card_id"])
# A previously saved submission file, read from disk.
best_submission = pd.read_csv("../submission/6911+6912+26121119.csv")
# Right join: one row per selected card_id with the columns of best_submission
# (missing where a card is absent from that file).
most_likely_liers = best_submission.merge(outlier_id, how="right")

In [None]:
# For the cards flagged as likely outliers, replace the no-outlier model's
# prediction with the target from `most_likely_liers`. This is done with one
# vectorised lookup instead of a boolean-mask scan per card.
replacement_target = (most_likely_liers
                      .dropna(subset=["target"])
                      .drop_duplicates("card_id", keep="last")
                      .set_index("card_id")["target"])
blended_target = model_without_outliers["card_id"].map(replacement_target)
# Cards without a usable replacement keep the model's own prediction, so the
# submission never receives NaN.
model_without_outliers["target"] = blended_target.fillna(model_without_outliers["target"])

In [None]:
# Write the blended predictions to a file named after the current local
# timestamp (YYYYmmddHHMMSS).
submission_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))
submission_path = "../submission/" + str(int(submission_stamp)) + ".csv"
model_without_outliers.to_csv(submission_path, index=False)