In [1]:
# Execute the shared base notebook inside this kernel. Names used below that are
# not defined in this notebook (e.g. `pd`, `DATA_PATH`) presumably come from it.
%run 0-Base.ipynb

In [2]:
# Stage directories under DATA_PATH: this notebook reads its input from
# SOURCE_PATH and writes its result to TARGET_PATH.
SOURCE_PATH = f"{DATA_PATH}/1-reformatted"
TARGET_PATH = f"{DATA_PATH}/2-feature-engineered"

In [3]:
# Load the transactions table from the stage-1 ("1-reformatted") directory.
%time transactions_df = pd.read_feather(f"{SOURCE_PATH}/transactions.feather")

CPU times: user 1.17 s, sys: 718 ms, total: 1.89 s
Wall time: 3.61 s


In [4]:
%time transactions_df[    "authorized_purchase_amount"] = transactions_df.purchase_amount.where( transactions_df.authorized_flag)
%time transactions_df["not_authorized_purchase_amount"] = transactions_df.purchase_amount.where(~transactions_df.authorized_flag)

CPU times: user 103 ms, sys: 25.8 ms, total: 129 ms
Wall time: 130 ms
CPU times: user 90.3 ms, sys: 46.8 ms, total: 137 ms
Wall time: 134 ms


In [5]:
%%time

transactions_df["purchase_year" ] = transactions_df.purchase_date.dt.year
transactions_df["purchase_month"] = transactions_df.purchase_date.dt.month

CPU times: user 4.73 s, sys: 90.1 ms, total: 4.82 s
Wall time: 4.82 s


In [6]:
# One-hot encode `installments` into int8 indicator columns named
# `installments_<value>`. get_dummies replaces the original column, so this
# cell cannot be re-run without reloading transactions_df first.
%time transactions_df = pd.get_dummies(transactions_df, columns=["installments"], dtype="int8")

CPU times: user 1.94 s, sys: 653 ms, total: 2.59 s
Wall time: 2.59 s


In [7]:
# Aggregation spec passed to groupby(...).agg(): column name -> list of
# aggregation function names. The resulting columns are later flattened to
# "<column>_<function>".
agg = {
    # Non-null count and mean of the authorization flag.
    "authorized_flag": ["count", "mean"],

    # First observed calendar year / month in each group.
    **{f"purchase_{part}": ["first"] for part in ("year", "month")},

    # Range and average of amounts: overall, authorized-only, not-authorized-only.
    **{
        f"{prefix}purchase_amount": ["min", "mean", "max"]
        for prefix in ("", "authorized_", "not_authorized_")
    },

    # Mean of each one-hot installments indicator, i.e. the fraction of rows
    # in the group having that installments value.
    **{
        f"installments_{value}": ["mean"]
        for value in (-1, *range(13), 999)
    },
}

In [8]:
%time aggregated_transactions_df = transactions_df.groupby(["card_id", "month_lag"]).agg(agg)
aggregated_transactions_df.columns = [f"{col}_{fn}" for col, fn in aggregated_transactions_df.columns]
aggregated_transactions_df = aggregated_transactions_df.rename(columns={"authorized_flag_count": "count"}).reset_index()

CPU times: user 16.2 s, sys: 2.38 s, total: 18.5 s
Wall time: 18.6 s


Define a synthetic feature — *season of year* — encoded as an integer: 1 = winter (Dec–Feb), 2 = spring (Mar–May), 3 = summer (Jun–Aug), 4 = autumn (Sep–Nov):

In [9]:
%time aggregated_transactions_df["season"] = (aggregated_transactions_df["purchase_month_first"] % 12 + 3) // 3

CPU times: user 60.4 ms, sys: 86 µs, total: 60.5 ms
Wall time: 60.1 ms


In [10]:
# Show column dtypes before the memory-reduction step that follows.
aggregated_transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044128 entries, 0 to 3044127
Data columns (total 31 columns):
card_id                                object
month_lag                              int64
count                                  int64
authorized_flag_mean                   float64
purchase_year_first                    int64
purchase_month_first                   int64
purchase_amount_min                    float32
purchase_amount_mean                   float32
purchase_amount_max                    float32
authorized_purchase_amount_min         float32
authorized_purchase_amount_mean        float32
authorized_purchase_amount_max         float32
not_authorized_purchase_amount_min     float32
not_authorized_purchase_amount_mean    float32
not_authorized_purchase_amount_max     float32
installments_-1_mean                   float64
installments_0_mean                    float64
installments_1_mean                    float64
installments_2_mean                    float64
ins

In [11]:
# Project helper. Judging by the .info() listings before and after this cell,
# it downcasts numeric columns (e.g. int64 -> int8/int16, float64 -> float32)
# and leaves the object column as is.
from elo_competition.data import reduce_mem_usage

%time aggregated_transactions_df = reduce_mem_usage(aggregated_transactions_df)

CPU times: user 689 ms, sys: 326 ms, total: 1.02 s
Wall time: 1.02 s


In [12]:
# Show column dtypes again to compare against the listing taken before reduce_mem_usage.
aggregated_transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3044128 entries, 0 to 3044127
Data columns (total 31 columns):
card_id                                object
month_lag                              int8
count                                  int16
authorized_flag_mean                   float32
purchase_year_first                    int16
purchase_month_first                   int8
purchase_amount_min                    float32
purchase_amount_mean                   float32
purchase_amount_max                    float32
authorized_purchase_amount_min         float32
authorized_purchase_amount_mean        float32
authorized_purchase_amount_max         float32
not_authorized_purchase_amount_min     float32
not_authorized_purchase_amount_mean    float32
not_authorized_purchase_amount_max     float32
installments_-1_mean                   float32
installments_0_mean                    float32
installments_1_mean                    float32
installments_2_mean                    float32
insta

In [13]:
%time aggregated_transactions_df.to_feather(f"{TARGET_PATH}/aggregated-transactions-by-card-id.feather")

CPU times: user 535 ms, sys: 218 ms, total: 753 ms
Wall time: 382 ms
