In [1]:
import numpy as np
import pandas as pd
import time
import warnings
import gc
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import seaborn as sns
import datetime
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import log_loss
import lightgbm as lgb
from sklearn import preprocessing
warnings.filterwarnings('ignore')
plt.style.use('seaborn')

## 1 Simple processing of train and test

In [2]:
# Load the card tables; `first_active_month` is parsed as a datetime column.
train = pd.read_csv('../../data/train.csv', parse_dates=["first_active_month"])
test = pd.read_csv('../../data/test.csv', parse_dates=["first_active_month"])
# Placeholder target so the test rows share the train schema in the stacked frame.
test["target"] = -9999
# ignore_index gives the stacked frame unique row labels (train and test both
# start at 0, which would otherwise leave duplicated labels).
data = pd.concat([train, test], ignore_index=True)

# Calendar features of the activation date, via the vectorised `.dt` accessors.
data["month"] = data["first_active_month"].dt.month
data["day"] = data["first_active_month"].dt.day
data["dayofyear"] = data["first_active_month"].dt.dayofyear
# `Series.dt.weekofyear` no longer exists in pandas >= 2.0; the per-Timestamp
# attribute is available in every pandas version and returns the same ISO week.
data['week'] = data["first_active_month"].apply(lambda ts: ts.weekofyear)
data['dayofweek'] = data['first_active_month'].dt.dayofweek
# Days between activation and 2018-02-01. Subtracting datetimes directly keeps a
# timedelta dtype, so `.dt.days` is valid (subtracting `datetime.date` objects
# yields an object column instead).
data['days'] = (pd.Timestamp("2018-02-01") - data['first_active_month']).dt.days
data["quarter"] = data["first_active_month"].dt.quarter
data["is_month_start"] = data["first_active_month"].dt.is_month_start
# Interactions between card age in days and the three anonymised card features.
data["days_feature1"] = data["days"] * data["feature_1"]
data["days_feature2"] = data["days"] * data["feature_2"]
data["days_feature3"] = data["days"] * data["feature_3"]


## 2 Processing historical_transactions and new_merchant_transactions

In [3]:
def pro_trans(trans, reference_date=None):
    """
    Clean a transactions table in place and add row-level features.

    Intended for both historical_transactions and new_merchant_transactions.

    Parameters
    ----------
    trans : pandas.DataFrame
        Raw transactions. Columns read: authorized_flag, category_1, category_2,
        category_3, merchant_id, purchase_date, month_lag, purchase_amount and
        installments. The frame is modified in place.
    reference_date : datetime-like, optional
        The "now" used for ``month_diff``. Defaults to ``datetime.datetime.today()``,
        which makes the feature depend on the day the notebook is run; pass a
        fixed date for reproducible features.

    Returns
    -------
    pandas.DataFrame
        The same object as ``trans`` (not a copy).

    Notes
    -----
    Not idempotent: the "Y"/"N" flags are mapped to 1/0, so running this twice on
    the same frame turns them into NaN. Re-read the CSV before calling again.
    """
    if reference_date is None:
        reference_date = datetime.datetime.today()
    reference_date = pd.Timestamp(reference_date)

    # "Y"/"N" -> 1/0; any other value becomes NaN.
    trans["authorized_flag"] = trans["authorized_flag"].map({"Y": 1, "N": 0})
    trans["category_1"] = trans["category_1"].map({"Y": 1, "N": 0})

    # Calendar features, using vectorised `.dt` accessors instead of per-row apply.
    trans["purchase_date"] = pd.to_datetime(trans["purchase_date"])
    purchase_date = trans["purchase_date"]
    trans["month"] = purchase_date.dt.month
    # `Series.dt.weekofyear` was removed in pandas 2.0; the per-Timestamp attribute
    # works on every pandas version and returns the ISO week number.
    trans["weekofyear"] = purchase_date.apply(lambda ts: ts.weekofyear)
    trans["dayofweek"] = purchase_date.dt.dayofweek
    trans["weekend"] = (trans["dayofweek"] >= 5).astype(int)
    trans["hour"] = purchase_date.dt.hour
    trans["quarter"] = purchase_date.dt.quarter
    trans["minute"] = purchase_date.dt.minute

    # Whole 30-day periods between the purchase and the reference date, shifted by month_lag.
    trans["month_diff"] = (reference_date - purchase_date).dt.days // 30 + trans["month_lag"]
    trans["month_diff2"] = trans["month"] - trans["month_lag"]

    # Fill missing categorical values with fixed defaults.
    trans["category_2"] = trans["category_2"].fillna(value=2.0)
    trans["category_3"] = trans["category_3"].fillna(value="A")
    trans["merchant_id"] = trans["merchant_id"].fillna(value="M_ID_00a6ca8a8a")

    # Integer-encode category_3 by sorted label order (same codes as a LabelEncoder)
    # without materialising a Python list of every row.
    trans["category_3"] = pd.factorize(trans["category_3"], sort=True)[0]

    # Per-group statistics broadcast back to every row. `transform` returns a
    # result aligned with the rows; assigning an `agg` result would align the
    # group keys against the row index and fill the column with unrelated or
    # missing values.
    # NOTE(review): this adds 32 float columns that trans_agg does not read;
    # worth confirming they are needed given the size of the input.
    stat_names = ("mean", "max", "min", "var")
    for col in ["category_2", "category_3", "month", "hour"]:
        keys = trans[col]
        for source, tag in (("purchase_amount", "_pa_"), ("installments", "_im_")):
            for stat in stat_names:
                trans[col + tag + stat] = trans[source].groupby(keys).transform(stat)

    return trans

In [4]:
# Taking Reference from Other Kernels
def trans_agg(trans, nunique_col, prefix, reference_date=None):
    """
    Aggregate processed transactions to one row per ``nunique_col`` value.

    Parameters
    ----------
    trans : pandas.DataFrame
        Transactions carrying every column named in ``agg_func`` below (the
        output of ``pro_trans`` plus the raw id columns).
    nunique_col : str
        Column to group by, e.g. ``"card_id"``.
    prefix : str
        Prepended to every generated column name, e.g. ``"hist_"``.
    reference_date : datetime-like, optional
        The "now" used for ``<prefix>purchase_date_uptonow``. Defaults to
        ``datetime.datetime.today()``; pass a fixed date for reproducible features.

    Returns
    -------
    pandas.DataFrame
        One row per group. Columns are ``nunique_col``,
        ``<prefix>transactions_count``, one ``<prefix><column>_<stat>`` column per
        entry of ``agg_func``, and the derived ``purchase_date_diff``,
        ``purchase_date_average`` and ``purchase_date_uptonow`` columns.
        ``purchase_date_max`` / ``purchase_date_min`` are returned as seconds
        since 1970-01-01.
    """
    if reference_date is None:
        reference_date = datetime.datetime.today()
    reference_date = pd.Timestamp(reference_date)

    # Each key appears once: a repeated dict key would silently keep only the
    # last list of statistics.
    agg_func = {"purchase_date": ["max", "min"],
                "month_diff": ["mean", "max", "min", "nunique", "var"],
                "weekend": ["max", "min", "mean", "sum"],
                "authorized_flag": ["max", "min", "mean", "sum"],
                "category_1": ["max", "min", "mean", "sum"],
                "category_2": ["max", "min", "mean", "sum"],
                "category_3": ["max", "min", "mean", "sum"],
                "installments": ["max", "min", "mean", "std", "sum"],
                "purchase_amount": ["max", "min", "mean", "std", "sum"],
                "month_lag": ["mean", "max", "min", "nunique", "var"],
                "card_id": ["size", "nunique"],
                "month": ["max", "min", "nunique"],
                "hour": ["max", "min", "nunique"],
                "weekofyear": ["max", "min", "nunique"],
                "dayofweek": ["max", "min", "nunique"],
                "merchant_id": ["nunique"],
                "city_id": ["nunique"],
                "state_id": ["nunique"],
                "subsector_id": ["max", "min", "nunique"],
                "merchant_category_id": ["max", "min", "nunique"]}
    agg_trans = trans.groupby(nunique_col).agg(agg_func)
    # Flatten the (column, stat) MultiIndex into "<prefix><column>_<stat>".
    agg_trans.columns = [prefix + '_'.join(col).strip() for col in agg_trans.columns.values]
    agg_trans = agg_trans.reset_index()

    # The number of rows per group is already available as "<prefix>card_id_size",
    # so reuse it instead of grouping the whole table a second time. It goes right
    # after the key column, where the count column has always been placed.
    agg_trans.insert(1, '{}transactions_count'.format(prefix), agg_trans[prefix + "card_id_size"])

    date_max = prefix + "purchase_date_max"
    date_min = prefix + "purchase_date_min"
    agg_trans[date_max] = pd.to_datetime(agg_trans[date_max])
    agg_trans[date_min] = pd.to_datetime(agg_trans[date_min])
    # Span of activity in whole days, and that span divided by the number of rows.
    agg_trans[prefix + "purchase_date_diff"] = (agg_trans[date_max] - agg_trans[date_min]).dt.days
    agg_trans[prefix + "purchase_date_average"] = agg_trans[prefix + "purchase_date_diff"] / agg_trans[prefix + "card_id_size"]
    agg_trans[prefix + "purchase_date_uptonow"] = (reference_date - agg_trans[date_max]).dt.days

    # Store the dates as epoch seconds. Going through a timedelta keeps the result
    # correct whatever resolution the datetime column uses (casting to int64 and
    # scaling by 1e-9 is only right for nanosecond resolution).
    epoch = pd.Timestamp("1970-01-01")
    for feature in [date_max, date_min]:
        agg_trans[feature] = (agg_trans[feature] - epoch).dt.total_seconds()

    return agg_trans

### 2.1 Processing historical_transactions based on card_id

In [5]:
# Read the historical transactions, add row-level features, then aggregate per card.
# NOTE(review): the recorded output of this cell ends in a MemoryError, and a later
# cell repeats these same steps before merging the result onto `data`.
hist_trans = pd.read_csv(filepath_or_buffer='../../data/historical_transactions.csv')
print("Reading data over.")
hist_trans_pro = pro_trans(trans=hist_trans)
print("Processing trans_agg function...")
hist_card_trans = trans_agg(
    trans=hist_trans_pro,
    nunique_col="card_id",
    prefix='hist_',
)

Reading data over.
Processing trans_agg function...


MemoryError: 

In [None]:
# Preview the first rows only rather than rendering the whole aggregated frame.
hist_card_trans.head()

In [None]:
# Card-level features from the historical transactions, aligned to the cards in `data`.
hist_trans = pd.read_csv(filepath_or_buffer='../../data/historical_transactions.csv')
print("Reading data over.")
hist_trans_pro = pro_trans(trans=hist_trans)
print("Processing trans_agg function...")
hist_card_trans = trans_agg(
    trans=hist_trans_pro,
    nunique_col="card_id",
    prefix='hist_',
)
print("Merge df and hist_card_trans...")
# One-column frame of card ids; the left merge keeps every card present in `data`.
df = pd.DataFrame({"card_id": data["card_id"]})
hist_card = pd.merge(df, hist_card_trans, on='card_id', how='left')
# Drop the intermediate names; `hist_trans` itself stays bound for the later cells.
del hist_trans_pro, hist_card_trans
gc.collect()


### 2.2 Processing new_merchant_transactions based on card_id

In [None]:
# Card-level features from the new-merchant transactions, aligned to the cards in `data`.
new_trans = pd.read_csv('../../data/new_merchant_transactions.csv')
print("Reading data over.")
new_trans_pro = pro_trans(new_trans)
print("Processing trans_agg function...")
new_card_trans = trans_agg(new_trans_pro, nunique_col="card_id", prefix='new_')
# The progress message now names the frame that is actually merged in this cell.
print("Merge df and new_card_trans...")
# One-column frame of card ids; the left merge keeps every card present in `data`.
df = pd.DataFrame()
df["card_id"] = data["card_id"]
new_card = df.merge(new_card_trans, on='card_id', how='left')
# Drop the intermediate names; `new_trans` itself stays bound for the later cells.
del new_trans_pro
del new_card_trans
gc.collect()


In [None]:
# Persist both card-level feature tables without the row index.
# NOTE(review): assumes the ./feats_group directory already exists.
hist_card.to_csv(path_or_buf="./feats_group/hist_card.csv", index=False)
new_card.to_csv(path_or_buf="./feats_group/new_card.csv", index=False)

### 2.3 Processing historical and new transactions based on merchant_id

In [None]:
def id_pro(trans, ID, prefix):
    """
    Attach per-``ID`` statistics of installments and purchase_amount to each row.

    Parameters
    ----------
    trans : pandas.DataFrame
        Transactions with at least ``card_id``, ``ID``, ``installments`` and
        ``purchase_amount`` columns.
    ID : str
        Column whose groups are summarised, e.g. ``"merchant_id"``.
    prefix : str
        Prepended to the generated column names.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with columns ``card_id``, ``ID``
        and ``<prefix><ID>_<column>_<stat>`` for the mean, sum, max, min, std and
        var of both value columns. Rows whose ``ID`` is missing get NaN statistics.
    """
    value_cols = ["installments", "purchase_amount"]
    stat_names = ["mean", "sum", "max", "min", "std", "var"]

    # One grouped pass computes every statistic. The value columns are selected
    # with a list: selecting them with a bare tuple raises in pandas >= 2.0.
    grouped = trans.groupby(ID)[value_cols].agg(stat_names)

    # Order the columns statistic by statistic (installments first), matching the
    # layout produced when each statistic was merged in one at a time.
    ordered = [(col, stat) for stat in stat_names for col in value_cols]
    grouped = grouped[ordered]
    grouped.columns = [prefix + ID + "_" + col + "_" + stat for col, stat in ordered]
    grouped = grouped.reset_index()

    # A single left merge broadcasts the group statistics back onto every row.
    return trans[["card_id", ID]].merge(grouped, on=ID, how="left")
    

In [None]:
# Per-merchant statistics attached to each historical transaction. These features
# come from the historical table, so they carry the "hist_" prefix; reusing "new_"
# makes them collide with the new-transaction features when the tables are joined.
hist_trans_merch = id_pro(hist_trans, ID="merchant_id", prefix="hist_")
# Average the merchant statistics over each card's transactions. merchant_id is a
# string column, so restrict the mean to numeric columns explicitly.
hist_trans_merch = hist_trans_merch.groupby("card_id", as_index=False).mean(numeric_only=True)
df = pd.DataFrame()
df["card_id"] = data["card_id"]
# Left merge keeps every card present in `data`.
hist_merch = df.merge(hist_trans_merch, on="card_id", how="left")

gc.collect()

In [None]:
# Per-merchant statistics attached to each new-merchant transaction.
new_trans_merch = id_pro(new_trans, ID="merchant_id", prefix="new_")
# Average the merchant statistics over each card's transactions. merchant_id is a
# string column, so restrict the mean to numeric columns explicitly.
new_trans_merch = new_trans_merch.groupby("card_id", as_index=False).mean(numeric_only=True)
df = pd.DataFrame()
df["card_id"] = data["card_id"]
# Left merge keeps every card present in `data`.
new_merch = df.merge(new_trans_merch, on="card_id", how="left")

gc.collect()

### 2.4 Processing historical and new transactions based on city_id

In [None]:
# Per-city statistics attached to each historical transaction. These features come
# from the historical table, so they carry the "hist_" prefix; reusing "new_" makes
# them collide with the new-transaction features when the tables are joined.
hist_trans_city = id_pro(hist_trans, ID="city_id", prefix="hist_")
# Average the city statistics over each card's transactions.
hist_trans_city = hist_trans_city.groupby("card_id", as_index=False).mean()
df = pd.DataFrame()
df["card_id"] = data["card_id"]
hist_city = df.merge(hist_trans_city, on="card_id", how="left")
# The averaged id itself is not a feature; keep only the statistics.
hist_city = hist_city.drop("city_id", axis=1)

gc.collect()

In [None]:
# Per-city statistics of the new-merchant transactions, averaged per card.
new_trans_city = id_pro(trans=new_trans, ID="city_id", prefix="new_")
new_trans_city = new_trans_city.groupby("card_id", as_index=False).mean()
# One-column frame of card ids; the left merge keeps every card present in `data`.
df = pd.DataFrame({"card_id": data["card_id"]})
new_city = pd.merge(df, new_trans_city, on="card_id", how="left")
# The averaged id itself is not a feature; keep only the statistics.
new_city = new_city.drop(columns="city_id")

gc.collect()

### 2.5 Processing historical and new transactions based on merchant_category_id

In [None]:
# Per-merchant-category statistics attached to each historical transaction. These
# features come from the historical table, so they carry the "hist_" prefix; reusing
# "new_" makes them collide with the new-transaction features when the tables are joined.
hist_trans_merchcate = id_pro(hist_trans, ID="merchant_category_id", prefix="hist_")
# Average the category statistics over each card's transactions.
hist_trans_merchcate = hist_trans_merchcate.groupby("card_id", as_index=False).mean()
df = pd.DataFrame()
df["card_id"] = data["card_id"]
hist_merchcate = df.merge(hist_trans_merchcate, on="card_id", how="left")
# The averaged id itself is not a feature; keep only the statistics.
hist_merchcate = hist_merchcate.drop("merchant_category_id", axis=1)

gc.collect()

In [None]:
# Per-merchant-category statistics of the new-merchant transactions, averaged per card.
new_trans_merchcate = id_pro(trans=new_trans, ID="merchant_category_id", prefix="new_")
new_trans_merchcate = new_trans_merchcate.groupby("card_id", as_index=False).mean()
# One-column frame of card ids; the left merge keeps every card present in `data`.
df = pd.DataFrame({"card_id": data["card_id"]})
new_merchcate = pd.merge(df, new_trans_merchcate, on="card_id", how="left")
# The averaged id itself is not a feature; keep only the statistics.
new_merchcate = new_merchcate.drop(columns="merchant_category_id")

gc.collect()

### 2.6 Processing historical and new transactions based on state_id

In [None]:
# Per-state statistics attached to each historical transaction. These features come
# from the historical table, so they carry the "hist_" prefix; reusing "new_" makes
# them collide with the new-transaction features when the tables are joined.
hist_trans_state = id_pro(hist_trans, ID="state_id", prefix="hist_")
# Average the state statistics over each card's transactions. The result must be
# bound to `hist_trans_state` (a misspelled target left the per-transaction rows in
# place, so the merge below produced one row per transaction instead of per card).
hist_trans_state = hist_trans_state.groupby("card_id", as_index=False).mean()
df = pd.DataFrame()
df["card_id"] = data["card_id"]
hist_state = df.merge(hist_trans_state, on="card_id", how="left")
# The averaged id itself is not a feature; keep only the statistics.
hist_state = hist_state.drop("state_id", axis=1)

gc.collect()

In [None]:
# Per-state statistics of the new-merchant transactions, averaged per card.
new_trans_state = id_pro(trans=new_trans, ID="state_id", prefix="new_")
new_trans_state = new_trans_state.groupby("card_id", as_index=False).mean()
# One-column frame of card ids; the left merge keeps every card present in `data`.
df = pd.DataFrame({"card_id": data["card_id"]})
new_state = pd.merge(df, new_trans_state, on="card_id", how="left")
# The averaged id itself is not a feature; keep only the statistics.
new_state = new_state.drop(columns="state_id")

gc.collect()

### 2.7 Processing historical and new transactions based on subsector_id

In [None]:
# Per-subsector statistics attached to each historical transaction. These features
# come from the historical table, so they carry the "hist_" prefix; reusing "new_"
# makes them collide with the new-transaction features when the tables are joined.
hist_trans_subsector = id_pro(hist_trans, ID="subsector_id", prefix="hist_")
# Average the subsector statistics over each card's transactions.
hist_trans_subsector = hist_trans_subsector.groupby("card_id", as_index=False).mean()
df = pd.DataFrame()
df["card_id"] = data["card_id"]
hist_subsector = df.merge(hist_trans_subsector, on="card_id", how="left")
# The averaged id itself is not a feature; keep only the statistics.
hist_subsector = hist_subsector.drop("subsector_id", axis=1)

gc.collect()

In [None]:
# Per-subsector statistics of the new-merchant transactions, averaged per card.
new_trans_subsector = id_pro(trans=new_trans, ID="subsector_id", prefix="new_")
new_trans_subsector = new_trans_subsector.groupby("card_id", as_index=False).mean()
# One-column frame of card ids; the left merge keeps every card present in `data`.
df = pd.DataFrame({"card_id": data["card_id"]})
new_subsector = pd.merge(df, new_trans_subsector, on="card_id", how="left")
# The averaged id itself is not a feature; keep only the statistics.
new_subsector = new_subsector.drop(columns="subsector_id")

gc.collect()

In [None]:
# Join every historical feature table onto the card-level table, one left merge
# per table, always keyed on card_id.
hist_data = (
    hist_card
    .merge(hist_merch, on="card_id", how="left")
    .merge(hist_city, on="card_id", how="left")
    .merge(hist_merchcate, on="card_id", how="left")
    .merge(hist_state, on="card_id", how="left")
    .merge(hist_subsector, on="card_id", how="left")
)

In [None]:
# Join every new-transaction feature table onto the card-level table, one left
# merge per table, always keyed on card_id.
new_data = (
    new_card
    .merge(new_merch, on="card_id", how="left")
    .merge(new_city, on="card_id", how="left")
    .merge(new_merchcate, on="card_id", how="left")
    .merge(new_state, on="card_id", how="left")
    .merge(new_subsector, on="card_id", how="left")
)

In [None]:
# Combine the historical and new-transaction features; rows follow `hist_data`.
data1 = pd.merge(hist_data, new_data, on="card_id", how="left")

In [None]:
# Write the combined feature table to disk without the row index.
data1.to_csv(path_or_buf="../../data_feat/data1.csv", index=False)