In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
import gc
import time
import sys
import datetime
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import BayesianRidge
import lightgbm as lgb

In [2]:
new_transactions = pd.read_csv("../data/new_merchant_transactions.csv", parse_dates=["purchase_date"])
historical_transactions = pd.read_csv("../data/historical_transactions.csv", parse_dates=["purchase_date"])

In [3]:
new_transactions.head(2)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_415bb3a509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19
1,Y,C_ID_415bb3a509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19


In [4]:
def binarize(df):
    for col in ["authorized_flag", "category_1"]:
        df[col] = df[col].map({"Y":1, "N":0})
    return df

historical_transactions = binarize(historical_transactions)
new_transactions = binarize(new_transactions)

In [5]:
%%time
def read_data(data_file):
    df = pd.read_csv(data_file)
    df["first_active_month"] = pd.to_datetime(df["first_active_month"])
    df["elapsed"] = (datetime.date(2018, 2, 1) - df["first_active_month"].dt.date).dt.days
    return df

train = read_data("../data/train.csv")
test = read_data("../data/test.csv")
target = train["target"]
del train["target"]
gc.collect

Wall time: 1.32 s


In [6]:
def reduce_mem_usage(df, verbose=True):
    numerics =["int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum()/1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print("Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".
                      format(end_mem, 100*(start_mem - end_mem)/start_mem))
    return df

### feature engineering

In [7]:
%%time
historical_transactions = pd.get_dummies(historical_transactions, columns=["category_2", "category_3"])
new_transactions = pd.get_dummies(new_transactions, columns=["category_2", "category_3"])
historical_transactions = reduce_mem_usage(historical_transactions)
new_transactions = reduce_mem_usage(new_transactions)
agg_fun = {"authorized_flag": ["sum", "mean"]}
auth_mean = historical_transactions.groupby(["card_id"]).agg(agg_fun)
auth_mean.columns = ["_".join(col).strip() for col in auth_mean.columns.values]
auth_mean.reset_index(inplace=True)
authorized_transactions = historical_transactions[historical_transactions["authorized_flag"] == 1]
historical_transactions = historical_transactions[historical_transactions["authorized_flag"] == 0]
gc.collect()

Mem. usage decreased to 1304.89 Mb (54.8% reduction)
Mem. usage decreased to 84.24 Mb (56.7% reduction)
Wall time: 23.7 s


In [8]:
historical_transactions.head(2)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id,category_2_1.0,category_2_2.0,category_2_3.0,category_2_4.0,category_2_5.0,category_3_A,category_3_B,category_3_C
115,0,C_ID_4e6213e9bc,88,0,0,842,M_ID_22c9cfa265,-10,-0.730379,2017-04-07 12:58:09,16,37,1,0,0,0,0,1,0,0
132,0,C_ID_4e6213e9bc,88,0,0,367,M_ID_86ec983688,-5,-0.723782,2017-09-17 22:40:27,16,16,1,0,0,0,0,1,0,0


In [9]:
new_transactions.head(2)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id,category_2_1.0,category_2_2.0,category_2_3.0,category_2_4.0,category_2_5.0,category_3_A,category_3_B,category_3_C
0,1,C_ID_415bb3a509,107,0,1,307,M_ID_b0c793002c,1,-0.557617,2018-03-11 14:57:36,9,19,1,0,0,0,0,0,1,0
1,1,C_ID_415bb3a509,140,0,1,307,M_ID_88920c89e8,1,-0.569336,2018-03-19 18:53:37,9,19,1,0,0,0,0,0,1,0
