In [1]:
%run ../0-utils/0-Base.ipynb

Read data about customers (`train` and `test`):

In [2]:
# Customers frames keyed by data set name ("train" / "test").
customers_dfs = dict()

for clazz in ("train", "test"):
    df = pd.read_csv(f"../data/raw/{clazz}.csv", parse_dates=["first_active_month"])

    # test.csv has one missing first_active_month. Fill only that column:
    # a frame-wide fillna would also write the timestamp into any other
    # column that happens to contain NaN.
    # NOTE(review): 2017-01 is hard-coded; presumably it was precomputed as the
    # most popular first_active_month among train.csv rows with the same
    # feature_x values - confirm.
    if "test" == clazz:
        df["first_active_month"] = df["first_active_month"].fillna(pd.to_datetime("2017-01"))

    # Add a placeholder target of 0 when the file has none, so that both
    # frames expose the same columns.
    if "target" not in df.columns:
        df["target"] = 0

    # get_dummies(columns=...) replaces the encoded columns, so keep the
    # originals and re-attach them after the one-hot columns.
    feature_1_backup = df.feature_1
    feature_2_backup = df.feature_2

    df = pd.get_dummies(df, columns=["feature_1", "feature_2"])

    df["feature_1"] = feature_1_backup
    df["feature_2"] = feature_2_backup

    customers_dfs[clazz] = df

# Preview both frames, truncated to 6 rows each.
with pd.option_context("display.max_rows", 6):
    for customers_df in customers_dfs.values():
        display(customers_df)

Unnamed: 0,first_active_month,card_id,feature_3,target,feature_1_1,feature_1_2,feature_1_3,feature_1_4,feature_1_5,feature_2_1,feature_2_2,feature_2_3,feature_1,feature_2
0,2017-06-01,C_ID_92a2005557,1,-0.820283,0,0,0,0,1,0,1,0,5,2
1,2017-01-01,C_ID_3d0044924f,0,0.392913,0,0,0,1,0,1,0,0,4,1
2,2016-08-01,C_ID_d639edf6cd,0,0.688056,0,1,0,0,0,0,1,0,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201914,2017-08-01,C_ID_7666735b3d,0,0.093494,0,0,0,1,0,0,0,1,4,3
201915,2016-07-01,C_ID_73f5a0efd0,1,-4.676589,0,0,1,0,0,0,1,0,3,2
201916,2017-07-01,C_ID_92c9984c58,1,-1.859413,0,0,1,0,0,1,0,0,3,1


Unnamed: 0,first_active_month,card_id,feature_3,target,feature_1_1,feature_1_2,feature_1_3,feature_1_4,feature_1_5,feature_2_1,feature_2_2,feature_2_3,feature_1,feature_2
0,2017-04-01,C_ID_0ab67a22ab,1,0,0,0,1,0,0,0,0,1,3,3
1,2017-01-01,C_ID_130fd0cbdd,0,0,0,1,0,0,0,0,0,1,2,3
2,2017-08-01,C_ID_b709037bc5,1,0,0,0,0,0,1,1,0,0,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123620,2016-09-01,C_ID_21d56d950c,1,0,0,0,0,0,1,1,0,0,5,1
123621,2017-06-01,C_ID_6c46fc5a9d,0,0,0,1,0,0,0,1,0,0,2,1
123622,2016-10-01,C_ID_87e7979a5f,1,0,0,0,0,0,1,1,0,0,5,1


Define a sample partition of customers:

In [3]:
# Partition the train customers the same way as the "new" transactions, keep
# partition 13 only and discard the helper "part" column afterwards.
sample_customers_df = (
    add_part(customers_dfs["train"], n_parts=TRANSACTIONS_N_PARTS["new"])
    .loc[lambda customers: customers.part == 13]
    .drop("part", axis=1)
)

# Preview the sample, truncated to 6 rows.
with pd.option_context("display.max_rows", 6):
    display(sample_customers_df)

Unnamed: 0,first_active_month,card_id,feature_3,target,feature_1_1,feature_1_2,feature_1_3,feature_1_4,feature_1_5,feature_2_1,feature_2_2,feature_2_3,feature_1,feature_2
43,2017-11-01,C_ID_1919f9080e,0,0.393388,0,1,0,0,0,0,0,1,2,3
225,2017-01-01,C_ID_ae77d244b6,0,-33.219281,0,1,0,0,0,1,0,0,2,1
243,2017-09-01,C_ID_ee5bb1f392,0,-1.230539,0,1,0,0,0,0,0,1,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201682,2017-08-01,C_ID_5ac91f1427,0,-2.315998,0,1,0,0,0,1,0,0,2,1
201899,2017-10-01,C_ID_58e359763e,1,-2.702214,0,0,0,0,1,1,0,0,5,1
201910,2017-10-01,C_ID_9072609210,0,-1.753271,0,1,0,0,0,0,0,1,2,3


Define a function for reading and processing a partition with transactions:

In [4]:
from datetime import datetime

# Columns read with the pandas "category" dtype.
transactions_cat_cols = ("city_id",
                         "category_3",
                         "merchant_category_id",
                         "merchant_id",
                         "state_id",
                         "subsector_id")

transactions_dtype = {cat_col: "category" for cat_col in transactions_cat_cols}

# Length of an average Gregorian month (365.2425 days / 12). Used instead of
# np.timedelta64(1, "M"), which has no fixed duration: older pandas releases
# converted it to this same average, newer ones reject the "M" unit.
_AVERAGE_MONTH = pd.Timedelta(days=30.436875)

def read_transactions_part(part_file_name, reference_date=None):
    """Read one partitioned transactions CSV and prepare it for feature synthesis.

    Parameters
    ----------
    part_file_name : str
        File name inside ``../data/1-partitioned/``.
    reference_date : datetime-like, optional
        Date that ``month_diff`` is measured against. Defaults to the moment
        of the call (the original behaviour), which makes ``month_diff``
        change from run to run; pass a fixed date for reproducible features.

    Returns
    -------
    pandas.DataFrame
        The transactions with:
        * an ``ID`` column holding the row number,
        * ``month_diff``: whole average months between ``purchase_date`` and
          ``reference_date``, plus ``month_lag``,
        * ``authorized_flag`` / ``category_1`` / ``category_3`` mapped to
          integers and ``category_2`` cast to integer, missing values as -1,
        * one-hot columns for ``category_2`` and ``category_3`` next to the
          integer-encoded originals.
    """
    transactions_df = pd.read_csv(f"../data/1-partitioned/{part_file_name}",
                                  parse_dates=["purchase_date"],
                                  dtype=transactions_dtype)

    # Expose the row number as an explicit "ID" column.
    transactions_df = transactions_df.reset_index().rename(columns={"index": "ID"})

    if reference_date is None:
        reference_date = datetime.today()

    elapsed = pd.Timestamp(reference_date) - transactions_df.purchase_date
    transactions_df["month_diff"] = elapsed // _AVERAGE_MONTH
    transactions_df["month_diff"] += transactions_df.month_lag

    cat_cols_mapping = dict(
        authorized_flag=dict(Y=1, N=0),
        category_1=dict(Y=1, N=0),
        category_3=dict(A=0, B=1, C=2),
    )

    for col, mapping in cat_cols_mapping.items():
        # Map through plain objects: category_3 is read as a categorical, and
        # a categorical result could not accept the -1 fill value used below.
        transactions_df[col] = transactions_df[col].astype(object).map(mapping)

    # Missing (or unmapped) values are encoded as -1.
    for col in ("authorized_flag", "category_1", "category_2", "category_3"):
        transactions_df[col] = transactions_df[col].fillna(-1).astype(int)

    # get_dummies(columns=...) replaces the encoded columns, so keep the
    # originals and re-attach them after the one-hot columns.
    category_2_backup = transactions_df.category_2
    category_3_backup = transactions_df.category_3

    transactions_df = pd.get_dummies(transactions_df, columns=["category_2", "category_3"])

    transactions_df["category_2"] = category_2_backup
    transactions_df["category_3"] = category_3_backup

    return transactions_df

Read a sample partition with `read_transactions_part(...)`:

In [5]:
%%time

# Load the partition file "new.013.csv" through read_transactions_part.
sample_transactions_df = read_transactions_part("new.013.csv")

# Preview the result, truncated to 6 rows.
with pd.option_context("display.max_rows", 6): display(sample_transactions_df)

Unnamed: 0,ID,authorized_flag,card_id,city_id,category_1,installments,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id,month_diff,category_2_-1,category_2_1,category_2_2,category_2_3,category_2_4,category_2_5,category_3_-1,category_3_0,category_3_1,category_3_2,category_2,category_3
0,0,1,C_ID_be71a60d23,143,0,0,19,M_ID_ab392e3714,1,-0.701753,2018-03-01 05:47:19,5,36,12,0,0,0,0,0,1,0,1,0,0,5,0
1,1,1,C_ID_be71a60d23,333,0,0,307,M_ID_1fb0d13ea0,1,-0.716855,2018-03-08 18:27:49,9,19,12,0,1,0,0,0,0,0,1,0,0,1,0
2,2,1,C_ID_be71a60d23,143,0,0,166,M_ID_12c118b802,2,-0.736389,2018-04-30 17:44:44,5,29,11,0,0,0,0,0,1,0,1,0,0,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20756,20756,1,C_ID_7750eb98a6,20,0,1,818,M_ID_1b5a3c1373,2,2.825675,2018-04-19 12:02:33,19,12,11,0,0,0,1,0,0,0,0,1,0,3,1
20757,20757,1,C_ID_438aea7835,53,0,0,557,M_ID_8013941341,1,-0.665765,2018-03-28 08:57:19,20,29,11,0,0,0,0,0,1,0,1,0,0,5,0
20758,20758,1,C_ID_611d4d588f,302,0,1,307,M_ID_ee5a6f6a35,2,-0.731881,2018-03-11 15:07:49,7,19,12,0,0,0,1,0,0,0,0,1,0,3,1


CPU times: user 182 ms, sys: 2.31 ms, total: 185 ms
Wall time: 191 ms


Define a function for building an entity set from `customers_df` and `transactions_df`:

In [6]:
import featuretools as ft
import featuretools.variable_types as vtypes

# featuretools variable types for the transactions entity: the two binary
# flags are booleans, each integer-encoded category column is ordinal and
# each of its one-hot columns is a boolean.
transactions_vtypes = {
    "authorized_flag": vtypes.Boolean,
    "category_1": vtypes.Boolean,
}

for _encoded_col, _levels in (
    ("category_2", (1, 2, 3, 4, 5, -1)),
    ("category_3", (0, 1, 2, -1)),
):
    transactions_vtypes[_encoded_col] = vtypes.Ordinal
    transactions_vtypes.update(
        {f"{_encoded_col}_{_level}": vtypes.Boolean for _level in _levels}
    )

# featuretools variable types for the customers entity.
customers_vtypes = dict(
    feature_1=vtypes.Ordinal,
    feature_2=vtypes.Ordinal,
    feature_3=vtypes.Boolean,
)

def build_entity_set(customers_df, transactions_df):
    """Assemble a featuretools EntitySet from customers and their transactions.

    The set holds a "customers" entity indexed by ``card_id`` and a
    "transactions" entity indexed by ``ID``, linked through ``card_id``.
    Interesting values are registered on the transaction flags, on
    ``installments`` (-1 and 999) and on every ``category_2_*`` /
    ``category_3_*`` column of ``transactions_df``.

    Returns the populated ``ft.EntitySet``.
    """
    entity_set = ft.EntitySet()

    entity_set.entity_from_dataframe(
        entity_id="customers",
        dataframe=customers_df,
        index="card_id",
        variable_types=customers_vtypes,
    )
    entity_set.entity_from_dataframe(
        entity_id="transactions",
        dataframe=transactions_df,
        index="ID",
        variable_types=transactions_vtypes,
    )

    # One customer (parent) has many transactions (children).
    parent_key = entity_set["customers"]["card_id"]
    child_key = entity_set["transactions"]["card_id"]
    entity_set.add_relationships([ft.Relationship(parent_key, child_key)])

    fixed_interesting_values = (
        ("category_1", [1]),
        ("authorized_flag", [1]),
        ("installments", [-1, 999]),
    )
    for column, values in fixed_interesting_values:
        entity_set["transactions"][column].interesting_values = values

    # Every one-hot column derived from category_2 / category_3 is
    # interesting when it is set.
    dummy_prefixes = ("category_2_", "category_3_")
    for column in transactions_df.columns:
        if column.startswith(dummy_prefixes):
            entity_set["transactions"][column].interesting_values = [1]

    return entity_set

Build an entity set for sample data:

In [7]:
# Entity set for the sample customers partition and its transactions.
es = build_entity_set(sample_customers_df, sample_transactions_df)

Do deep feature synthesis:

In [8]:
# features_only=True: only the feature definitions are requested here; the
# feature matrix itself is computed separately with ft.calculate_feature_matrix.
features = ft.dfs(entityset=es, target_entity="customers", features_only=True)

print("Number of features: %d" % len(features))

# Last expression of the cell: shows the list of feature definitions.
features

Number of features: 89


[<Feature: target>,
 <Feature: feature_1_1>,
 <Feature: feature_1_2>,
 <Feature: feature_1_3>,
 <Feature: feature_1_4>,
 <Feature: feature_1_5>,
 <Feature: feature_2_1>,
 <Feature: feature_2_2>,
 <Feature: feature_2_3>,
 <Feature: feature_1>,
 <Feature: feature_2>,
 <Feature: feature_3>,
 <Feature: SUM(transactions.installments)>,
 <Feature: SUM(transactions.month_lag)>,
 <Feature: SUM(transactions.purchase_amount)>,
 <Feature: SUM(transactions.month_diff)>,
 <Feature: STD(transactions.installments)>,
 <Feature: STD(transactions.month_lag)>,
 <Feature: STD(transactions.purchase_amount)>,
 <Feature: STD(transactions.month_diff)>,
 <Feature: MAX(transactions.installments)>,
 <Feature: MAX(transactions.month_lag)>,
 <Feature: MAX(transactions.purchase_amount)>,
 <Feature: MAX(transactions.month_diff)>,
 <Feature: SKEW(transactions.installments)>,
 <Feature: SKEW(transactions.month_lag)>,
 <Feature: SKEW(transactions.purchase_amount)>,
 <Feature: SKEW(transactions.month_diff)>,
 <Feature: 

In [9]:
# Evaluate the feature definitions on the sample entity set
# (verbose=True prints a progress bar).
feature_matrix_df = ft.calculate_feature_matrix(features=features, entityset=es, verbose=True)

# Preview the matrix, truncated to 6 rows.
with pd.option_context("display.max_rows", 6): display(feature_matrix_df)

Elapsed: 00:13 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 10/10 chunks


Unnamed: 0_level_0,target,feature_1_1,feature_1_2,feature_1_3,feature_1_4,feature_1_5,feature_2_1,feature_2_2,feature_2_3,feature_1,feature_2,feature_3,SUM(transactions.installments),SUM(transactions.month_lag),SUM(transactions.purchase_amount),SUM(transactions.month_diff),STD(transactions.installments),STD(transactions.month_lag),STD(transactions.purchase_amount),STD(transactions.month_diff),MAX(transactions.installments),MAX(transactions.month_lag),MAX(transactions.purchase_amount),MAX(transactions.month_diff),SKEW(transactions.installments),SKEW(transactions.month_lag),SKEW(transactions.purchase_amount),SKEW(transactions.month_diff),MIN(transactions.installments),MIN(transactions.month_lag),MIN(transactions.purchase_amount),MIN(transactions.month_diff),MEAN(transactions.installments),MEAN(transactions.month_lag),MEAN(transactions.purchase_amount),MEAN(transactions.month_diff),COUNT(transactions),PERCENT_TRUE(transactions.authorized_flag),PERCENT_TRUE(transactions.category_1),PERCENT_TRUE(transactions.category_2_1),PERCENT_TRUE(transactions.category_2_2),PERCENT_TRUE(transactions.category_2_3),PERCENT_TRUE(transactions.category_2_4),PERCENT_TRUE(transactions.category_2_5),PERCENT_TRUE(transactions.category_2_-1),PERCENT_TRUE(transactions.category_3_0),PERCENT_TRUE(transactions.category_3_1),PERCENT_TRUE(transactions.category_3_2),PERCENT_TRUE(transactions.category_3_-1),NUM_UNIQUE(transactions.city_id),NUM_UNIQUE(transactions.merchant_category_id),NUM_UNIQUE(transactions.merchant_id),NUM_UNIQUE(transactions.state_id),NUM_UNIQUE(transactions.subsector_id),NUM_UNIQUE(transactions.category_2),NUM_UNIQUE(transactions.category_3),MODE(transactions.city_id),MODE(transactions.merchant_category_id),MODE(transactions.merchant_id),MODE(transactions.state_id),MODE(transactions.subsector_id),MODE(transactions.category_2),MODE(transactions.category_3),DAY(first_active_month),YEAR(first_active_month),MONTH(first_active_month),WEEKDAY(first_active_month),COUNT(transactions 
WHERE category_2_3 = 1),COUNT(transactions WHERE category_2_4 = 1),COUNT(transactions WHERE category_2_-1 = 1),COUNT(transactions WHERE installments = -1),COUNT(transactions WHERE category_3_0 = 1),COUNT(transactions WHERE authorized_flag = 1),COUNT(transactions WHERE category_2_2 = 1),COUNT(transactions WHERE category_3_-1 = 1),COUNT(transactions WHERE category_2_1 = 1),COUNT(transactions WHERE installments = 999),COUNT(transactions WHERE category_3_2 = 1),COUNT(transactions WHERE category_1 = 1),COUNT(transactions WHERE category_3_1 = 1),COUNT(transactions WHERE category_2_5 = 1),NUM_UNIQUE(transactions.DAY(purchase_date)),NUM_UNIQUE(transactions.YEAR(purchase_date)),NUM_UNIQUE(transactions.MONTH(purchase_date)),NUM_UNIQUE(transactions.WEEKDAY(purchase_date)),MODE(transactions.DAY(purchase_date)),MODE(transactions.YEAR(purchase_date)),MODE(transactions.MONTH(purchase_date)),MODE(transactions.WEEKDAY(purchase_date))
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1
C_ID_0033871586,-1.052924,1,0,0,0,0,0,1,0,1,2,0,2.0,2.0,-1.015973,42.0,0.0,0.000000,0.299635,0.000000,1.0,1.0,-0.296112,21.0,,,,,1.0,1.0,-0.719860,21.0,1.0,1.00,-0.507986,21.00,2.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,17,557,M_ID_59946bf50f,22,29,4.0,1.0,1,2015,9,1,0.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,1.0,1.0,2.0,21.0,2017.0,5.0,4.0
C_ID_005c9dbbfa,0.263760,0,0,0,0,1,1,0,0,5,1,1,10.0,14.0,-5.590839,211.0,0.0,0.516398,0.164790,0.316228,1.0,2.0,-0.229996,22.0,0.0,0.484123,1.105720,3.162278,1.0,1.0,-0.713849,21.0,1.0,1.40,-0.559084,21.10,10.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,9.0,10.0,2.0,8.0,1.0,1.0,158,278,M_ID_00a6ca8a8a,15,37,1.0,1.0,1,2016,7,4,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,10.0,0.0,0.0,0.0,10.0,0.0,8.0,1.0,2.0,6.0,21.0,2017.0,5.0,2.0
C_ID_008c25003f,-0.863420,0,0,1,0,0,0,1,0,3,2,1,0.0,5.0,-2.622322,45.0,0.0,0.500000,0.082733,0.500000,0.0,2.0,-0.551563,12.0,0.0,2.000000,0.650971,2.000000,0.0,1.0,-0.726968,11.0,0.0,1.25,-0.655580,11.25,4.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,4.0,4.0,1.0,4.0,1.0,1.0,281,278,M_ID_20d2b5a421,21,16,5.0,0.0,1,2016,6,2,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,1.0,2.0,4.0,8.0,2018.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C_ID_ffea783298,0.518986,0,0,0,0,1,0,1,0,5,2,1,0.0,3.0,0.714496,51.0,0.0,0.000000,1.303920,0.000000,0.0,1.0,1.741483,17.0,0.0,0.000000,1.708078,0.000000,0.0,1.0,-0.585854,17.0,0.0,1.00,0.238165,17.00,3.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,3.0,3.0,2.0,3.0,1.0,1.0,88,340,M_ID_00a6ca8a8a,16,27,1.0,0.0,1,2017,5,0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,1.0,2.0,8.0,2017.0,9.0,5.0
C_ID_ffee45f941,0.951472,0,0,1,0,0,1,0,0,3,1,1,0.0,2.0,-0.698883,11.0,,,,,0.0,2.0,-0.698883,11.0,,,,,0.0,2.0,-0.698883,11.0,0.0,2.00,-0.698883,11.00,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,28,705,M_ID_86a84b8d7f,9,33,1.0,0.0,1,2017,9,4,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,28.0,2018.0,4.0,5.0
C_ID_fff7031631,0.289718,0,0,0,0,1,1,0,0,5,1,1,6.0,1.0,1.394220,15.0,,,,,6.0,1.0,1.394220,15.0,,,,,6.0,1.0,1.394220,15.0,6.0,1.00,1.394220,15.00,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,140,840,M_ID_1688d3651e,9,20,1.0,2.0,1,2016,10,5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,24.0,2017.0,11.0,4.0


In [13]:
%%time

from multiprocessing import Pool

def calculate_feature_matrix_for(transactions_part_file_name, customers_part_df):
    """Compute the feature matrix for one customers/transactions partition.

    Reads the transactions partition file, builds an entity set from it and
    ``customers_part_df``, and evaluates the notebook-level ``features``
    definitions on that entity set.

    Returns the feature matrix produced by ``ft.calculate_feature_matrix``.
    """
    entity_set = build_entity_set(
        customers_part_df,
        read_transactions_part(transactions_part_file_name),
    )

    return ft.calculate_feature_matrix(features=features, entityset=entity_set)

# Full feature matrices, keyed by customers class (the keys of customers_dfs).
feature_matrix_dfs = dict()

for customers_clazz, customers_df in customers_dfs.items():
    for transactions_clazz, n_parts in TRANSACTIONS_N_PARTS.items():
        # Presumably add_part attaches a partition id as a "part" column
        # (the groupby below relies on that column) - verify in the utils notebook.
        customers_with_part_df = add_part(customers_df, n_parts=n_parts)
        
        processes_args = []
        
        # One task per partition: the partition's transactions file name plus
        # the customers assigned to that partition.
        # NOTE(review): customers_part_df still carries the "part" column here,
        # whereas the sample cell drops it before building the entity set.
        for part, customers_part_df in customers_with_part_df.groupby("part"):
            transactions_part_file_name = "%s.%03d.csv" % (transactions_clazz, part)
            processes_args.append((transactions_part_file_name, customers_part_df))
        
        # Run the partitions through a pool of 8 worker processes.
        with Pool(8) as p:
            feature_matrix_part_dfs = p.starmap(
                calculate_feature_matrix_for,
                processes_args,
            )

        # Stack the per-partition matrices into a single frame.
        feature_matrix_df = pd.concat(
            feature_matrix_part_dfs,
            sort=False,
        )
        
        # NOTE(review): the key is the customers class only, so without the
        # `break` below every further transactions class would overwrite this entry.
        feature_matrix_dfs[customers_clazz] = feature_matrix_df
        
        # Stops after the first entry of TRANSACTIONS_N_PARTS.
        # NOTE(review): that this entry is the "old" transactions depends on the
        # dict's order, which is not visible in this notebook - confirm.
        break # only old transactions for now

CPU times: user 3.04 s, sys: 544 ms, total: 3.59 s
Wall time: 16min


In [14]:
%%time

import os

# Write one CSV per customers class. The former display option context was
# dropped: nothing is displayed here and it has no effect on to_csv.
output_dir = "../data/2-feature-engineered"

# Create the target directory on a fresh checkout instead of failing in to_csv.
os.makedirs(output_dir, exist_ok=True)

for feature_matrix_clazz, feature_matrix_df in feature_matrix_dfs.items():
    feature_matrix_df.to_csv(f"{output_dir}/{feature_matrix_clazz}.csv")

CPU times: user 20.3 s, sys: 152 ms, total: 20.5 s
Wall time: 20.5 s


In [None]:
# Display options applied while listing the featuretools primitives.
# NOTE(review): 0 is presumably meant as "no limit" for both options - confirm
# that the installed pandas version interprets 0 that way.
context = ("display.max_rows",     0,
           "display.max_colwidth", 0,)

with pd.option_context(*context): display(ft.list_primitives())

Add a `fraud` boolean flag?

In [None]:
# Mean authorized_flag per card_id, sorted ascending (lowest rates first).
# NOTE(review): no visible cell assigns `transactions_df` at notebook scope (it
# is only a local name inside read_transactions_part) - presumably
# sample_transactions_df or a frame from the utils notebook is meant; verify.
authorized_card_rate = transactions_df.groupby("card_id").authorized_flag.mean().sort_values()

with pd.option_context("display.max_rows", 10): display(authorized_card_rate)

`999` installments look like a `fraud` indicator too

In [None]:
# Mean authorized_flag for every distinct installments value.
# NOTE(review): no visible cell assigns `transactions_df` at notebook scope - verify
# which frame is meant before running this cell.
transactions_df.groupby("installments").authorized_flag.mean()

Use `np.log` for `purchase_amount`?

In [None]:
# Side-by-side histograms (shared y axis): raw purchase_amount on the left,
# log(purchase_amount + 1) on the right.
# NOTE(review): no visible cell assigns `transactions_df` at notebook scope - verify
# which frame is meant before running this cell.
_, axs = plt.subplots(1, 2, figsize=(6, 3), sharey=True)
axs[0].hist(transactions_df.purchase_amount)
# NOTE(review): the logarithm is undefined for purchase_amount <= -1 - confirm
# the value range of the column.
axs[1].hist(np.log(transactions_df.purchase_amount + 1))

plt.show()