In [1]:
import pandas as pd
import numpy as np
import time
import datetime
from sklearn import preprocessing
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import log_loss

In [2]:
# Load the training set and derive calendar features from `first_active_month`.
train = pd.read_csv('../../data/train.csv')
train["first_active_month"] = pd.to_datetime(train["first_active_month"])

# Vectorised `.dt` accessors replace the per-row `apply(lambda ...)` calls.
first_active = train["first_active_month"]
train["month"] = first_active.dt.month
train["day"] = first_active.dt.day
train["dayofyear"] = first_active.dt.dayofyear
# `Series.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week` is the
# documented replacement. It returns a nullable UInt32 column, so cast back to a
# plain NumPy dtype (int64, or float64 with NaN when a date is missing).
iso_week = first_active.dt.isocalendar().week
train['week'] = iso_week.astype("int64") if iso_week.notna().all() else iso_week.astype("float64")
train['dayofweek'] = first_active.dt.dayofweek
# Whole days between the activation month and the fixed reference date 2018-02-01.
# Subtracting datetime64 values directly avoids the object-dtype detour through
# `.dt.date`, whose result does not reliably expose the `.dt` accessor.
train['days'] = (pd.Timestamp(2018, 2, 1) - first_active).dt.days
train["quarter"] = first_active.dt.quarter
# train["is_month_start"] = train["first_active_month"].apply(lambda x: x.is_month_start)

# Interactions between the elapsed-days feature and the three anonymised features.
train["days_feature1"] = train["days"] * train["feature_1"]
train["days_feature2"] = train["days"] * train["feature_2"]
train["days_feature3"] = train["days"] * train["feature_3"]

In [3]:
# Read the new-merchant transactions CSV into `new`; the following cell adds
# feature columns to this frame in place.
new = pd.read_csv("../../data/new_merchant_transactions.csv")

In [4]:
# Encode the Y/N flags, derive purchase-time features and fill missing
# categoricals on `new`. The frame is mutated in place, so run this cell once
# per load: re-applying the Y/N mapping to already-encoded values yields NaN.
yes_no = {"Y": 1, "N": 0}
new["authorized_flag"] = new["authorized_flag"].map(yes_no)
new["category_1"] = new["category_1"].map(yes_no)

new["purchase_date"] = pd.to_datetime(new["purchase_date"])
# Vectorised `.dt` accessors replace the per-row `apply(lambda ...)` calls,
# which are very slow on a transaction-level table.
purchased = new["purchase_date"].dt
new["month"] = purchased.month
# ISO week number (same value as `Timestamp.weekofyear`); cast the nullable
# UInt32 result back to a plain NumPy dtype.
iso_week = purchased.isocalendar().week
new["weekofyear"] = iso_week.astype("int64") if iso_week.notna().all() else iso_week.astype("float64")
new["dayofweek"] = purchased.dayofweek
new["weekend"] = (purchased.dayofweek >= 5).astype(int)  # Saturday=5, Sunday=6
new["hour"] = purchased.hour
new["quarter"] = purchased.quarter
new["minute"] = purchased.minute

# The original used `datetime.datetime.today()`, which made `month_diff` depend
# on the day the notebook was executed. Pin the reference date instead.
# NOTE(review): 2019-01-25 reproduces the `month_diff` values recorded in this
# notebook's saved output; confirm, and use the same date for any test-set features.
MONTH_DIFF_REFERENCE = pd.Timestamp("2019-01-25")
new["month_diff"] = (MONTH_DIFF_REFERENCE - new["purchase_date"]).dt.days // 30
new["month_diff"] += new["month_lag"]
new["month_diff2"] = new["month"] - new["month_lag"]

# Fill missing categoricals with fixed values.
# NOTE(review): presumably the most frequent values of each column - confirm.
new["category_2"] = new["category_2"].fillna(value = 2.0)
new["category_3"] = new["category_3"].fillna(value = "A")
new["merchant_id"] = new["merchant_id"].fillna(value = "M_ID_00a6ca8a8a")

# Integer-encode category_3; `lbl` keeps the fitted mapping.
lbl = preprocessing.LabelEncoder()
new["category_3"] = lbl.fit_transform(list(new["category_3"].values))

In [5]:
# Preview the first rows of `new` with the derived feature columns.
new.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,...,subsector_id,month,weekofyear,dayofweek,weekend,hour,quarter,minute,month_diff,month_diff2
0,1,C_ID_415bb3a509,107,0,1,1,307,M_ID_b0c793002c,1,-0.557574,...,19,3,10,6,1,14,1,57,11,2
1,1,C_ID_415bb3a509,140,0,1,1,307,M_ID_88920c89e8,1,-0.56958,...,19,3,12,0,0,18,1,53,11,2
2,1,C_ID_415bb3a509,330,0,1,1,507,M_ID_ad5237ef6b,2,-0.551037,...,14,4,17,3,0,14,2,8,11,2
3,1,C_ID_415bb3a509,-1,1,1,1,661,M_ID_9e84cda3b1,1,-0.671925,...,8,3,10,2,0,9,1,43,11,2
4,1,C_ID_ef55cf8d4b,-1,1,1,1,166,M_ID_3c86fa3831,1,-0.659904,...,29,3,12,3,0,21,1,7,11,2


In [6]:
# Flag rows whose target lies below -30 as outliers (1) and everything else as 0,
# then show the class balance.
is_outlier = train['target'] < -30
train['outliers'] = is_outlier.astype("int64")
train['outliers'].value_counts()

0    199710
1      2207
Name: outliers, dtype: int64

In [7]:
# Keep only the transactions whose card also appears in the training set.
train_card_ids = train["card_id"].unique()
new_index = new["card_id"].isin(train_card_ids)
new_train = new.loc[new_index]

In [8]:
# Attach every matching transaction to its training row. A left join keeps cards
# that have no transactions (their transaction columns are NaN). Columns present
# in both frames (month, dayofweek, quarter) receive pandas' default _x/_y suffixes.
new_train_data = pd.merge(train, new_train, how="left", on="card_id")

In [9]:
# Per-card outlier statistics: number of flagged rows, row count, and a smoothed
# rate (+3 in the denominator damps the rate for cards with few rows).
# The dict-renaming form `agg({"new_name": "func"})` on a single column raises
# SpecificationError on pandas >= 1.0, so aggregate by function name and rename.
# NOTE(review): the rate is built from the `outliers` label of the same rows it
# is later merged onto, so it leaks the target; consider an out-of-fold encoding.
card = (
    new_train_data.groupby("card_id")["outliers"]
    .agg(["sum", "count"])
    .rename(columns={"sum": "card_sum", "count": "card_cnt"})
    .reset_index()
)
card["card_cvr"] = card["card_sum"] / (card["card_cnt"] + 3)

In [10]:
# Preview the per-card aggregates.
card.head()

Unnamed: 0,card_id,card_sum,card_cnt,card_cvr
0,C_ID_00007093c1,0,2,0.0
1,C_ID_0001506ef0,0,2,0.0
2,C_ID_000183fdda,0,11,0.0
3,C_ID_00027503e2,0,1,0.0
4,C_ID_0002c7c2c1,0,6,0.0


In [11]:
# Per-merchant outlier statistics: flagged rows, row count, and a smoothed rate
# (+3 in the denominator damps merchants with few rows). Rows with a missing
# merchant_id are excluded by groupby.
# Dict-renaming in `agg` raises SpecificationError on pandas >= 1.0, so aggregate
# by function name and rename afterwards.
# NOTE(review): in-sample target encoding of `outliers`; consider out-of-fold.
merchant = (
    new_train_data.groupby("merchant_id")["outliers"]
    .agg(["sum", "count"])
    .rename(columns={"sum": "merchant_sum", "count": "merchant_cnt"})
    .reset_index()
)
merchant["merchant_cvr"] = merchant["merchant_sum"] / (merchant["merchant_cnt"] + 3)

In [12]:
# Per-city outlier statistics: flagged rows, row count, and a smoothed rate
# (+3 in the denominator damps cities with few rows).
# Dict-renaming in `agg` raises SpecificationError on pandas >= 1.0, so aggregate
# by function name and rename afterwards.
# NOTE(review): in-sample target encoding of `outliers`; consider out-of-fold.
city = (
    new_train_data.groupby("city_id")["outliers"]
    .agg(["sum", "count"])
    .rename(columns={"sum": "city_sum", "count": "city_cnt"})
    .reset_index()
)
city["city_cvr"] = city["city_sum"] / (city["city_cnt"] + 3)

In [13]:
# Per-merchant-category outlier statistics: flagged rows, row count, and a
# smoothed rate (+3 in the denominator damps categories with few rows).
# Dict-renaming in `agg` raises SpecificationError on pandas >= 1.0, so aggregate
# by function name and rename afterwards.
# NOTE(review): in-sample target encoding of `outliers`; consider out-of-fold.
mertcate = (
    new_train_data.groupby("merchant_category_id")["outliers"]
    .agg(["sum", "count"])
    .rename(columns={"sum": "mertcate_sum", "count": "mertcate_cnt"})
    .reset_index()
)
mertcate["mertcate_cvr"] = mertcate["mertcate_sum"] / (mertcate["mertcate_cnt"] + 3)

In [14]:
# Per-state outlier statistics: flagged rows, row count, and a smoothed rate
# (+3 in the denominator damps states with few rows).
# Dict-renaming in `agg` raises SpecificationError on pandas >= 1.0, so aggregate
# by function name and rename afterwards.
# NOTE(review): in-sample target encoding of `outliers`; consider out-of-fold.
state = (
    new_train_data.groupby("state_id")["outliers"]
    .agg(["sum", "count"])
    .rename(columns={"sum": "state_sum", "count": "state_cnt"})
    .reset_index()
)
state["state_cvr"] = state["state_sum"] / (state["state_cnt"] + 3)

In [15]:
# Per-subsector outlier statistics: flagged rows, row count, and a smoothed rate
# (+3 in the denominator damps subsectors with few rows).
# Dict-renaming in `agg` raises SpecificationError on pandas >= 1.0, so aggregate
# by function name and rename afterwards.
# NOTE(review): in-sample target encoding of `outliers`; consider out-of-fold.
subsector = (
    new_train_data.groupby("subsector_id")["outliers"]
    .agg(["sum", "count"])
    .rename(columns={"sum": "subsector_sum", "count": "subsector_cnt"})
    .reset_index()
)
subsector["subsector_cvr"] = subsector["subsector_sum"] / (subsector["subsector_cnt"] + 3)

In [16]:
# Left-join each aggregate table back onto the transaction-level frame, in the
# order card -> merchant -> city -> merchant category -> state -> subsector.
rate_tables = [
    (card, "card_id"),
    (merchant, "merchant_id"),
    (city, "city_id"),
    (mertcate, "merchant_category_id"),
    (state, "state_id"),
    (subsector, "subsector_id"),
]
new_data = new_train_data
for rate_table, join_key in rate_tables:
    new_data = new_data.merge(rate_table, on=join_key, how="left")

In [None]:
# Dense descending rank of `card_cvr` inside each grouping key; one new column
# per key, named card_cvr_<tag>_rank.
# NOTE(review): `card_cvr` was joined on card_id, so it is constant within a
# card and `card_cvr_card_rank` is always 1 (or NaN) - likely an uninformative feature.
card_cvr_groupings = [
    ("card", "card_id"),
    ("merchant", "merchant_id"),
    ("city", "city_id"),
    ("mertcate", "merchant_category_id"),
    ("state", "state_id"),
    ("subsector", "subsector_id"),
]
for tag, group_key in card_cvr_groupings:
    grouped_rate = new_data.groupby(group_key, as_index=False)["card_cvr"]
    new_data["card_cvr_" + tag + "_rank"] = grouped_rate.rank(ascending=False, method='dense')

In [None]:
# Preview the first rows of the merged feature frame.
new_data.head()

In [None]:
# Dense descending rank of `merchant_cvr` inside each grouping key; one new
# column per key, named merchant_cvr_<tag>_rank.
# NOTE(review): `merchant_cvr` was joined on merchant_id, so it is constant within
# a merchant and `merchant_cvr_merchant_rank` is always 1 (or NaN).
merchant_cvr_groupings = [
    ("card", "card_id"),
    ("merchant", "merchant_id"),
    ("city", "city_id"),
    ("mertcate", "merchant_category_id"),
    ("state", "state_id"),
    ("subsector", "subsector_id"),
]
for tag, group_key in merchant_cvr_groupings:
    grouped_rate = new_data.groupby(group_key, as_index=False)["merchant_cvr"]
    new_data["merchant_cvr_" + tag + "_rank"] = grouped_rate.rank(ascending=False, method='dense')

In [None]:
# Dense descending rank of `city_cvr` inside each grouping key; one new column
# per key, named city_cvr_<tag>_rank.
# NOTE(review): `city_cvr` was joined on city_id, so it is constant within a
# city and `city_cvr_city_rank` is always 1 (or NaN).
city_cvr_groupings = [
    ("card", "card_id"),
    ("merchant", "merchant_id"),
    ("city", "city_id"),
    ("mertcate", "merchant_category_id"),
    ("state", "state_id"),
    ("subsector", "subsector_id"),
]
for tag, group_key in city_cvr_groupings:
    grouped_rate = new_data.groupby(group_key, as_index=False)["city_cvr"]
    new_data["city_cvr_" + tag + "_rank"] = grouped_rate.rank(ascending=False, method='dense')

In [None]:
# Dense descending rank of `mertcate_cvr` inside each grouping key; one new
# column per key, named mertcate_cvr_<tag>_rank.
# NOTE(review): `mertcate_cvr` was joined on merchant_category_id, so it is
# constant within a category and `mertcate_cvr_mertcate_rank` is always 1 (or NaN).
mertcate_cvr_groupings = [
    ("card", "card_id"),
    ("merchant", "merchant_id"),
    ("city", "city_id"),
    ("mertcate", "merchant_category_id"),
    ("state", "state_id"),
    ("subsector", "subsector_id"),
]
for tag, group_key in mertcate_cvr_groupings:
    grouped_rate = new_data.groupby(group_key, as_index=False)["mertcate_cvr"]
    new_data["mertcate_cvr_" + tag + "_rank"] = grouped_rate.rank(ascending=False, method='dense')