In [None]:
import numpy as np
import pandas as pd
import time
import warnings
import gc
import matplotlib.pyplot as plt
import datetime
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
from sklearn import preprocessing
warnings.filterwarnings('ignore')
plt.style.use('seaborn')


In [None]:
# Reduce the memory usage - Inspired by Panchajanya Banerjee
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that holds their range.

    Integer columns are narrowed to the first of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive). Float columns
    are narrowed to float16 or float32 on the same range test, otherwise
    float64. Columns whose dtype is not in ``numerics`` are left untouched.

    NOTE: the float test only checks the value *range*; float16/float32 keep
    fewer significant digits than float64, so narrowed float columns can lose
    precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to the dtype's min/max still fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max are NaN (comparisons are False).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a frame with a zero-byte footprint reports 0% rather than nan.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

## Basic preprocessing of train and test

In [None]:
# Load the card-level train/test tables, parsing first_active_month as a datetime.
# NOTE(review): reduce_mem_usage narrows float columns by range only; if `target`
# is a float column in train.csv it may be stored as float16 here - confirm that
# this precision loss is acceptable for the regression target.
train = reduce_mem_usage(pd.read_csv('../data/train.csv', parse_dates=["first_active_month"]))
test = reduce_mem_usage(pd.read_csv('../data/test.csv', parse_dates=["first_active_month"]))
test["target"] = -999  # sentinel used later to split the combined frame back into train/test
data = pd.concat([train, test])

# Calendar features derived from the card's first active month.
first_active = data["first_active_month"]
data["year"] = first_active.dt.year
data["month"] = first_active.dt.month
data["day"] = first_active.dt.day
data["dayofyear"] = first_active.dt.dayofyear
if hasattr(first_active.dt, "isocalendar"):
    # `.dt.weekofyear` was removed in pandas 2; cast the nullable result to a plain float column.
    data['week'] = first_active.dt.isocalendar().week.astype("float64")
else:
    data['week'] = first_active.dt.weekofyear
data['dayofweek'] = first_active.dt.dayofweek
# Days between activation and the fixed reference date 2018-02-01. Subtracting on
# datetime64 directly (instead of on `.dt.date` objects) keeps a timedelta dtype,
# so `.dt.days` is valid and missing dates become NaN.
data['days'] = (pd.Timestamp(2018, 2, 1) - first_active).dt.days

In [None]:
def pro_trans(trans, ref_date=None):
    """Clean a transactions frame in place and add date / holiday features.

    Steps, in order:
      * map the "Y"/"N" flags ``authorized_flag`` and ``category_1`` to 1/0;
      * parse ``purchase_date`` and derive year, month, weekofyear, dayofweek,
        weekend (1 when weekday >= 5) and hour;
      * ``month_diff`` = whole 30-day periods between ``ref_date`` and the
        purchase, plus ``month_lag``;
      * fill missing ``category_2`` / ``category_3`` / ``merchant_id`` with
        fixed defaults and integer-encode ``category_3`` by sorted unique value;
      * for each holiday, store the number of days from the purchase to the
        holiday when that number is in (0, 100), else 0.

    Parameters
    ----------
    trans : pandas.DataFrame
        Needs the columns authorized_flag, category_1, purchase_date,
        month_lag, category_2, category_3 and merchant_id. Modified in place.
    ref_date : datetime-like, optional
        Reference date for ``month_diff``. Defaults to the current date and
        time (the historical behaviour), which makes the feature depend on
        when the notebook is run; pass a fixed date for reproducible features.

    Returns
    -------
    pandas.DataFrame
        The same ``trans`` object.
    """
    ref_date = pd.Timestamp(datetime.datetime.today() if ref_date is None else ref_date)

    trans["authorized_flag"] = trans["authorized_flag"].map({"Y": 1, "N": 0})
    trans["category_1"] = trans["category_1"].map({"Y": 1, "N": 0})
    trans["purchase_date"] = pd.to_datetime(trans["purchase_date"])
    purchase_date = trans["purchase_date"]

    trans["year"] = purchase_date.dt.year
    trans["month"] = purchase_date.dt.month
    if hasattr(purchase_date.dt, "isocalendar"):
        # `.dt.weekofyear` was removed in pandas 2; cast the nullable result to a plain float column.
        trans["weekofyear"] = purchase_date.dt.isocalendar().week.astype("float64")
    else:
        trans["weekofyear"] = purchase_date.dt.weekofyear
    trans["dayofweek"] = purchase_date.dt.dayofweek
    trans["weekend"] = (purchase_date.dt.weekday >= 5).astype(int)
    trans["hour"] = purchase_date.dt.hour
    trans["month_diff"] = (ref_date - purchase_date).dt.days // 30
    trans["month_diff"] += trans["month_lag"]

    trans["category_2"] = trans["category_2"].fillna(2.0)
    trans["category_3"] = trans["category_3"].fillna("A")
    trans["merchant_id"] = trans["merchant_id"].fillna("M_ID_00a6ca8a8a")
    # Sorted-unique integer codes: the same mapping LabelEncoder.fit_transform
    # produces for this input, without needing scikit-learn here.
    trans["category_3"] = np.unique(np.asarray(list(trans["category_3"].values)), return_inverse=True)[1]

    # (feature column, holiday date). Column order matches the original implementation.
    holidays = [
        ('Christmas_Eva_2017', '2017-12-24'),      # Christmas Eve
        ('Christmas_day_2017', '2017-12-25'),      # Christmas Day
        ('Mothers_day_2017', '2017-05-14'),        # Mothers Day: May 14 2017 (was mistyped as 05-04)
        ('fathers_day_2017', '2017-08-13'),        # Fathers Day: August 13 2017
        ('Children_day_2017', '2017-10-12'),       # Childrens Day: October 12 2017
        ('Black_Friday_2017', '2017-11-24'),       # Black Friday: November 24 2017
        ('Valentine_day_2017', '2017-06-12'),      # Valentines Day (June 12)
        ('Global_shopping_2017', '2017-11-11'),    # Global Shopping: 11-11
        # NOTE(review): the original comment said "1-1-2018" but the date used is
        # 2017-01-01; kept as coded - confirm which New Year was intended.
        ('New_year_2017', '2017-01-01'),
        ('Independence_day_2017', '2017-07-04'),   # Independence Day: 7-4-2017
        ('Valentine’s_day_2017', '2017-02-14'),    # Valentine's Day: 2-14-2017
        ('Halloween_day_2017', '2017-10-30'),      # Halloween: 10-30-2017
        ('Mothers_day_2018', '2018-05-13'),        # Mothers Day: May 13 2018
    ]
    for column, holiday in holidays:
        days_before = (pd.to_datetime(holiday) - purchase_date).dt.days
        # Vectorised form of `x if 0 < x < 100 else 0`.
        trans[column] = days_before.where((days_before > 0) & (days_before < 100), 0)

    return trans
    

In [None]:
# Taking Reference from Other Kernels
def trans_agg(trans, prefix):
    """Aggregate a processed transactions frame to one row per ``card_id``.

    Each aggregated column is named ``prefix + column + '_' + statistic``
    (e.g. ``hist_purchase_amount_sum``). A ``{prefix}transactions_count``
    column holding the number of rows per card is placed right after
    ``card_id``.

    Parameters
    ----------
    trans : pandas.DataFrame
        Transactions carrying every column listed in ``agg_func`` below.
    prefix : str
        String prepended to every generated column name.

    Returns
    -------
    pandas.DataFrame
        One row per card: card_id, the transaction count, then the aggregates.
    """
    basic_stats = ["count", "sum", "max", "min", "mean"]
    spread_stats = ["count", "sum", "max", "min", "mean", "std", "var"]
    range_stats = ["count", "max", "min", "nunique"]
    holiday_stats = ["count", "max", "min", "mean", "std"]

    agg_func = {"purchase_date": ["max", "min"],
                # "month_diff" used to appear twice in this dict, so the second entry
                # silently replaced the first and "mean" was never computed. The two
                # lists are merged here, which adds the {prefix}month_diff_mean column.
                "month_diff": ["count", "max", "min", "mean", "nunique"],
                "weekend": basic_stats,
                "authorized_flag": basic_stats,
                "category_1": basic_stats,
                "category_2": basic_stats,
                "category_3": basic_stats,
                "installments": spread_stats,
                "purchase_amount": spread_stats,
                "merchant_id": ["nunique"],
                "month_lag": range_stats,
                "card_id": ["size", "nunique"],
                "month": range_stats,
                "hour": range_stats,
                "weekofyear": range_stats,
                "dayofweek": range_stats,
                "year": range_stats,
                "subsector_id": range_stats,
                "merchant_category_id": range_stats}
    # Holiday distance features, in the same order as the original dict.
    for holiday_col in ["Christmas_day_2017", "Mothers_day_2017", "fathers_day_2017",
                        "Children_day_2017", "Black_Friday_2017", "Valentine_day_2017",
                        "Christmas_Eva_2017", "Global_shopping_2017", "Independence_day_2017",
                        "New_year_2017", "Valentine’s_day_2017", "Halloween_day_2017",
                        "Mothers_day_2018"]:
        agg_func[holiday_col] = holiday_stats

    by_card = trans.groupby(['card_id'])
    agg_trans = by_card.agg(agg_func)
    # Flatten the (column, statistic) MultiIndex into prefixed single-level names.
    agg_trans.columns = [prefix + '_'.join(col).strip() for col in agg_trans.columns.values]
    agg_trans = agg_trans.reset_index()
    counts = by_card.size().reset_index(name='{}transactions_count'.format(prefix))

    return pd.merge(counts, agg_trans, on='card_id', how='left')

In [None]:
# Feature Engineering - Adding new features inspired by Chau's first kernel
def data_add_feat(data, prefix, ref_date=None):
    """Add purchase-date summary features for one transaction source, in place.

    Reads ``{prefix}purchase_date_max``, ``{prefix}purchase_date_min``,
    ``{prefix}card_id_size`` and ``first_active_month`` and writes:

      * ``{prefix}purchase_date_diff``    - days between first and last purchase
      * ``{prefix}purchase_date_average`` - that span divided by the transaction count
      * ``{prefix}purchase_date_uptonow`` - days from the last purchase to ``ref_date``
      * ``{prefix}first_buy``             - days from ``first_active_month`` to the first purchase

    Finally the two purchase-date columns are replaced by seconds since
    1970-01-01 as floats.

    Parameters
    ----------
    data : pandas.DataFrame
        Card-level frame holding the columns listed above. Modified in place.
    prefix : str
        Column prefix identifying the transaction source (e.g. ``"hist_"``).
    ref_date : datetime-like, optional
        Reference date for ``purchase_date_uptonow``. Defaults to the current
        date and time (the historical behaviour), which makes the feature
        depend on when the notebook is run.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    max_col = prefix + 'purchase_date_max'
    min_col = prefix + 'purchase_date_min'
    ref_date = pd.Timestamp(datetime.datetime.today() if ref_date is None else ref_date)

    data[max_col] = pd.to_datetime(data[max_col])
    data[min_col] = pd.to_datetime(data[min_col])
    data[prefix + 'purchase_date_diff'] = (data[max_col] - data[min_col]).dt.days
    data[prefix + 'purchase_date_average'] = data[prefix + 'purchase_date_diff'] / data[prefix + 'card_id_size']
    data[prefix + 'purchase_date_uptonow'] = (ref_date - data[max_col]).dt.days
    data[prefix + 'first_buy'] = (data[min_col] - data['first_active_month']).dt.days

    # Convert to epoch seconds through a timedelta rather than astype(np.int64):
    # rows with no transactions carry NaT, which the integer cast either rejects
    # or turns into a huge negative sentinel. Here they stay NaN.
    epoch = pd.Timestamp("1970-01-01")
    for feature in (max_col, min_col):
        data[feature] = (data[feature] - epoch).dt.total_seconds()

    return data

## Processing historical transactions

In [None]:
# Load the historical transactions and downcast their numeric columns to save memory.
hist_trans = reduce_mem_usage(pd.read_csv('../data/historical_transactions.csv'))

In [None]:
# Clean the historical transactions, then collapse them to one row per card
# with every aggregate column prefixed "hist_".
hist_trans = pro_trans(hist_trans)
merge_trans = trans_agg(hist_trans, prefix='hist_')
# The row-level frame is no longer needed once aggregated; release it.
del hist_trans
gc.collect()

In [None]:
# Left join keeps every card in `data`; cards absent from the aggregates get NaN features.
data = pd.merge(data, merge_trans, on='card_id', how='left')
del merge_trans
gc.collect()

In [None]:
# Derive the purchase-date summary features for the "hist_" columns.
data = data_add_feat(data, prefix="hist_")
gc.collect()

In [None]:
# Checkpoint: persist the card-level frame with the historical features attached.
data.to_csv("../data_feat/data_hist.csv", index=False)

## Processing new merchant transactions

In [None]:
# Optional resume point: uncomment the next line to reload the checkpoint written
# above instead of recomputing the historical features.
# data = pd.read_csv("../data_feat/data_hist.csv")
# Load the new-merchant transactions and downcast their numeric columns.
new_trans = reduce_mem_usage(pd.read_csv('../data/new_merchant_transactions.csv'))

In [None]:
# Same pipeline as for the historical transactions, with aggregates prefixed "new_".
new_trans = pro_trans(new_trans)
merge_trans = trans_agg(new_trans, prefix='new_')
# The row-level frame is no longer needed once aggregated; release it.
del new_trans
gc.collect()

In [None]:
# Left join keeps every card in `data`; cards absent from the aggregates get NaN "new_" features.
data = pd.merge(data, merge_trans, on='card_id', how='left')
del merge_trans
gc.collect()

In [None]:
# Derive the purchase-date summary features for the "new_" columns.
data = data_add_feat(data, prefix="new_")
gc.collect()

In [None]:
# added new feature - Interactive
data['card_id_total'] = data['new_card_id_size'] + data['hist_card_id_size']
data['purchase_amount_total'] = data['new_purchase_amount_sum'] + data['hist_purchase_amount_sum']

gc.collect()

In [None]:
# Check for missing values in training set
nulls = np.sum(data.isnull())
nullcols = nulls.loc[(nulls != 0)]
dtypes = data.dtypes
dtypes2 = dtypes.loc[(nulls != 0)]
info = pd.concat([nullcols, dtypes2], axis=1).sort_values(by=0, ascending=False)

numeric_dtypes = ['float64']
numerics = []
for i in data.columns:
    if data[i].dtype in numeric_dtypes:
        numerics.append(i)


In [None]:
# Split the combined frame back apart using the -999 sentinel written into test.
# `.copy()` / `.drop()` give independent frames, so the column assignments below
# do not write into a slice of `data`.
train = data[data["target"] != -999].copy()
test = data[data["target"] == -999].drop(["target"], axis=1)

# Flag outliers: cards whose target is below -30.
train['outliers'] = (train['target'] < -30).astype(int)
print(train['outliers'].value_counts())

# Replace each categorical feature by the outlier rate of its category in train.
# NOTE(review): the rates are computed on the full training set, including rows
# that later serve as validation folds, so CV scores may be slightly optimistic.
for features in ['feature_1', 'feature_2', 'feature_3']:
    order_label = train.groupby([features])['outliers'].mean()
    train[features] = train[features].map(order_label)
    test[features] = test[features].map(order_label)

## Training the model

In [None]:
import time
# Every column except identifiers, the raw date, the label and the outlier flag is a feature.
df_train_columns = [c for c in train.columns if c not in ['card_id', 'first_active_month', 'target', 'outliers']]
target = train['target']
param = {'num_leaves': 31,
         'min_data_in_leaf': 32,
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.001,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": -1}
# Stratify on the outlier flag so every fold sees a similar share of outliers.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018)
oof = np.zeros(len(train))            # out-of-fold predictions for the training rows
predictions = np.zeros(len(test))     # test predictions, averaged over the folds
fold_importances = []                 # one importance frame per fold, concatenated once below

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['outliers'].values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][df_train_columns], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train.iloc[val_idx][df_train_columns], label=target.iloc[val_idx])

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=-1,
                    early_stopping_rounds=100)
    oof[val_idx] = clf.predict(train.iloc[val_idx][df_train_columns], num_iteration=clf.best_iteration)

    fold_importances.append(pd.DataFrame({"Feature": df_train_columns,
                                          "importance": clf.feature_importance(),
                                          "fold": fold_ + 1}))

    predictions += clf.predict(test[df_train_columns], num_iteration=clf.best_iteration) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

# Report the out-of-fold RMSE. As a bare mid-cell expression it was computed but never shown.
print("CV score: {:<8.5f}".format(np.sqrt(mean_squared_error(oof, target))))

# Write a timestamped submission; relies on `test` rows being in sample_submission order.
sample_submission = pd.read_csv('../data/sample_submission.csv')
sample_submission['target'] = predictions
sample_submission.to_csv("../submission/" + time.strftime("%Y%m%d%H%M%S") + ".csv", index=False)

## Combining the model with a model trained without outliers

In [None]:
# Checkpoint: persist the final train/test feature frames for the blending section below.
train.to_csv("../data_feat/train.csv", index=False)
test.to_csv("../data_feat/test.csv", index=False)

In [None]:
import pandas as pd
import numpy as np
import time
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss

In [None]:
%%time
# Reload the feature frames written by the checkpoint cell above.
# No parse_dates is passed, so date columns come back as plain strings.
train = pd.read_csv("../data_feat/train.csv")
test = pd.read_csv("../data_feat/test.csv")

### Part 1: Training a model without outliers

In [None]:
# Keep only non-outlier rows. The label is pulled out and dropped without mutating
# a filtered slice; `outliers` stays in train_df because the CV split reads it.
target = train.loc[train["outliers"] == 0, "target"]
train_df = train[train["outliers"] == 0].drop(["target"], axis=1)
features = [c for c in train_df.columns if c not in ["card_id", "first_active_month", "outliers"]]
categorical_feats = [c for c in features if "feature_" in c]

In [None]:
%%time
# LightGBM regression settings for the model trained on non-outlier rows only.
param = {'num_leaves': 31,
         'min_data_in_leaf': 32, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.001,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 8,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 6}

# NOTE(review): train_df was filtered to outliers == 0 in the preceding cell, so the
# stratification labels passed to split() below are all identical - confirm whether
# a plain KFold was intended.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018)
oof = np.zeros(len(train_df))          # out-of-fold predictions for the training rows
predictions = np.zeros(len(test))      # test predictions, averaged over the folds
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df,train_df['outliers'].values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval= -1,
                    early_stopping_rounds = 200)
    # Score the held-out fold with the early-stopped iteration count.
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    # Collect this fold's feature importances.
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    # Each fold contributes 1/n_splits of the final test prediction.
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

# Out-of-fold RMSE over the non-outlier rows.
print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))
    

In [None]:
# Test-set predictions of the model trained without outliers, keyed by card_id.
model_without_outliers = pd.DataFrame({"card_id":test["card_id"].values})
model_without_outliers["target"] = predictions

### Part 2: Training a model for outlier classification

In [None]:
# Build the classifier inputs without mutating `train`. Previously train_df was an
# alias of train, so the `del` statements removed columns from train itself and
# this cell (and Part 1) failed when re-run.
target = train["outliers"]
train_df = train.drop(["outliers", "target"], axis=1)
test_df = test

features = [c for c in train_df.columns if c not in ["card_id", "first_active_month"]]
categorical_feats = [c for c in features if "feature_" in c]

In [None]:
# LightGBM settings for the binary outlier classifier (random-forest boosting mode).
param = {'num_leaves': 31,
         'min_data_in_leaf': 30,
         'objective':'binary',
         'max_depth': 6,
         'learning_rate': 0.005,
         "boosting": "rf",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'binary_logloss',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 6}
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(train_df))           # out-of-fold outlier probabilities
predictions2 = np.zeros(len(test_df))   # test outlier probabilities, averaged over the folds
fold_importances = []                   # one importance frame per fold, concatenated once below

start = time.time()


for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
    print("fold n°{}".format(fold_))
    # feature_1..3 were replaced by float outlier rates in the mean-encoding step, so
    # they are passed as ordinary numeric columns. Declaring them via
    # categorical_feature makes LightGBM truncate the fractions to integer category
    # codes, which erases the encoded signal.
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=-1,
                    early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)

    fold_importances.append(pd.DataFrame({"feature": features,
                                          "importance": clf.feature_importance(),
                                          "fold": fold_ + 1}))

    predictions2 += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

# Out-of-fold log loss of the outlier classifier.
print("CV score: {:<8.5f}".format(log_loss(target, oof)))


In [None]:
# Predicted outlier probability per test card (stored under the column name "target").
df_outlier_prob = pd.DataFrame({"card_id": test_df["card_id"].values})
df_outlier_prob["target"] = predictions2

### Part 3: Combining the submissions

In [None]:
# Take the 25000 test cards with the highest predicted outlier probability.
outlier_id = pd.DataFrame(df_outlier_prob.sort_values(by="target", ascending=False).head(25000)["card_id"])
# NOTE(review): presumably an earlier submission from a model trained with outliers
# included - confirm the provenance of this file.
best_submission = pd.read_csv("../submission/6921_6922_55.csv")
# Right join keeps exactly the flagged cards; with no `on=` given, pandas joins on
# the columns the two frames share.
most_likely_liers = best_submission.merge(outlier_id, how="right")

In [None]:
# For every flagged card, replace the no-outlier model's prediction with the value
# from most_likely_liers. One vectorised lookup replaces a Python loop that scanned
# both frames once per card.
replacement_by_card = most_likely_liers.set_index("card_id")["target"]
is_flagged = model_without_outliers["card_id"].isin(replacement_by_card.index)
model_without_outliers.loc[is_flagged, "target"] = (
    model_without_outliers.loc[is_flagged, "card_id"].map(replacement_by_card).values
)

In [None]:
# Write the blended predictions to a submission file named after the current local timestamp.
model_without_outliers.to_csv("../submission/"+str(int(time.strftime("%Y%m%d%H%M%S", time.localtime(time.time()))))+".csv", index=False)