In [67]:
%run ../0-utils/0-Base.ipynb

Read data about customers (`train` and `test`):

In [68]:
customers_dfs = dict()

for clazz in ("train", "test"):
    df = pd.read_csv(f"../data/raw/{clazz}.csv", parse_dates=["first_active_month"])

    # test.csv has one missing value - replace it with the most popular 
    # first_active_month for rows in train.csv that have the same feature_x
    if "test" == clazz: df.fillna(pd.to_datetime("2017-01"), inplace=True)
        
    if "target" not in df.columns: df["target"] = 0

    feature_1_backup = df.feature_1
    feature_2_backup = df.feature_2

    df = pd.get_dummies(df, columns=["feature_1", "feature_2"])

    df["feature_1"] = feature_1_backup
    df["feature_2"] = feature_2_backup
    
    display(df)
    
    customers_dfs[clazz] = df

Unnamed: 0,first_active_month,card_id,feature_3,target,feature_1_1,feature_1_2,feature_1_3,feature_1_4,feature_1_5,feature_2_1,feature_2_2,feature_2_3,feature_1,feature_2
0,2017-06-01,C_ID_92a2005557,1,-0.820283,0,0,0,0,1,0,1,0,5,2
1,2017-01-01,C_ID_3d0044924f,0,0.392913,0,0,0,1,0,1,0,0,4,1
2,2016-08-01,C_ID_d639edf6cd,0,0.688056,0,1,0,0,0,0,1,0,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201914,2017-08-01,C_ID_7666735b3d,0,0.093494,0,0,0,1,0,0,0,1,4,3
201915,2016-07-01,C_ID_73f5a0efd0,1,-4.676589,0,0,1,0,0,0,1,0,3,2
201916,2017-07-01,C_ID_92c9984c58,1,-1.859413,0,0,1,0,0,1,0,0,3,1


Unnamed: 0,first_active_month,card_id,feature_3,target,feature_1_1,feature_1_2,feature_1_3,feature_1_4,feature_1_5,feature_2_1,feature_2_2,feature_2_3,feature_1,feature_2
0,2017-04-01,C_ID_0ab67a22ab,1,0,0,0,1,0,0,0,0,1,3,3
1,2017-01-01,C_ID_130fd0cbdd,0,0,0,1,0,0,0,0,0,1,2,3
2,2017-08-01,C_ID_b709037bc5,1,0,0,0,0,0,1,1,0,0,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123620,2016-09-01,C_ID_21d56d950c,1,0,0,0,0,0,1,1,0,0,5,1
123621,2017-06-01,C_ID_6c46fc5a9d,0,0,0,1,0,0,0,1,0,0,2,1
123622,2016-10-01,C_ID_87e7979a5f,1,0,0,0,0,0,1,1,0,0,5,1


Define a sample partition of customers:

In [69]:
sample_customers_df = add_part(customers_dfs["train"], n_parts=TRANSACTIONS_N_PARTS["new"])
sample_customers_df = sample_customers_df.loc[lambda x: x.part == 13].drop("part", axis=1)

display(sample_customers_df)

Unnamed: 0,first_active_month,card_id,feature_3,target,feature_1_1,feature_1_2,feature_1_3,feature_1_4,feature_1_5,feature_2_1,feature_2_2,feature_2_3,feature_1,feature_2
43,2017-11-01,C_ID_1919f9080e,0,0.393388,0,1,0,0,0,0,0,1,2,3
225,2017-01-01,C_ID_ae77d244b6,0,-33.219281,0,1,0,0,0,1,0,0,2,1
243,2017-09-01,C_ID_ee5bb1f392,0,-1.230539,0,1,0,0,0,0,0,1,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201682,2017-08-01,C_ID_5ac91f1427,0,-2.315998,0,1,0,0,0,1,0,0,2,1
201899,2017-10-01,C_ID_58e359763e,1,-2.702214,0,0,0,0,1,1,0,0,5,1
201910,2017-10-01,C_ID_9072609210,0,-1.753271,0,1,0,0,0,0,0,1,2,3


Define a function for reading and processing a partition with transactions:

In [70]:
from datetime import datetime
from pandas.api.types import CategoricalDtype

transactions_cat_cols = ("city_id",
                         "category_3",
                         "merchant_category_id",
                         "state_id",
                         "subsector_id")

transactions_dtype = {cat_col: "category" for cat_col in transactions_cat_cols}

transactions_dtype["installments"] = CategoricalDtype(
    categories=np.arange(-1, 13).tolist() + [999],
    ordered=True,
)

def read_transactions_part(part_file_name):
    transactions_df = pd.read_csv(f"../data/1-partitioned/{part_file_name}",
                                  parse_dates=["purchase_date"],
                                  dtype=transactions_dtype)
    
    transactions_df = transactions_df.reset_index().rename(columns={"index": "ID"})

    days_diff = datetime.today() - transactions_df.purchase_date
    transactions_df["month_diff"] = days_diff // np.timedelta64(1, "M")
    transactions_df["month_diff"] += transactions_df.month_lag
    
    cat_cols_mapping = dict(
        authorized_flag=dict(
            Y=1,
            N=0,
        ),
        category_1=dict(
            Y=1,
            N=0,
        ),
        category_3=dict(
            A=0,
            B=1,
            C=2,
        ),
    )
    
    for col, mapping in cat_cols_mapping.items():
        transactions_df[col] = transactions_df[col].map(mapping)
    
    for col in ("authorized_flag", "category_1", "category_2", "category_3"):
        transactions_df[col] = transactions_df[col].fillna(-1).astype(int)
        
    category_2_backup = transactions_df.category_2
    category_3_backup = transactions_df.category_3
    installments_backup = transactions_df.installments

    transactions_df = pd.get_dummies(transactions_df, columns=["category_2", "category_3", "installments"])
    
    transactions_df["category_2"] = category_2_backup
    transactions_df["category_3"] = category_3_backup
    transactions_df["installments"] = installments_backup

    return transactions_df

Read a sample partition with `read_transactions_part(...)`:

In [71]:
%%time

sample_transactions_df = read_transactions_part("new.013.csv")

display(sample_transactions_df)

Unnamed: 0,ID,authorized_flag,card_id,city_id,category_1,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,state_id,subsector_id,month_diff,category_2_-1,category_2_1,category_2_2,category_2_3,category_2_4,category_2_5,category_3_-1,category_3_0,category_3_1,category_3_2,installments_-1,installments_0,installments_1,installments_2,installments_3,installments_4,installments_5,installments_6,installments_7,installments_8,installments_9,installments_10,installments_11,installments_12,installments_999,category_2,category_3,installments
0,0,1,C_ID_be71a60d23,143,0,19,M_ID_ab392e3714,1,-0.701753,2018-03-01 05:47:19,5,36,12,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0
1,1,1,C_ID_be71a60d23,333,0,307,M_ID_1fb0d13ea0,1,-0.716855,2018-03-08 18:27:49,9,19,12,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,2,1,C_ID_be71a60d23,143,0,166,M_ID_12c118b802,2,-0.736389,2018-04-30 17:44:44,5,29,11,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20756,20756,1,C_ID_7750eb98a6,20,0,818,M_ID_1b5a3c1373,2,2.825675,2018-04-19 12:02:33,19,12,11,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1
20757,20757,1,C_ID_438aea7835,53,0,557,M_ID_8013941341,1,-0.665765,2018-03-28 08:57:19,20,29,11,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0
20758,20758,1,C_ID_611d4d588f,302,0,307,M_ID_ee5a6f6a35,2,-0.731881,2018-03-11 15:07:49,7,19,12,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,3,1,1


CPU times: user 136 ms, sys: 0 ns, total: 136 ms
Wall time: 135 ms


In [72]:
%%time

merchants_df = pd.read_csv("../data/raw/merchants.csv")
merchants_df = merchants_df.drop_duplicates("merchant_id")
merchants_df = merchants_df.reset_index().rename(columns={"index": "ID"})

cat_cols_mapping = dict(
    category_1=dict(
        Y=1,
        N=0,
    ),
    category_4=dict(
        Y=1,
        N=0,
    ),
    most_recent_sales_range=dict(
        A=0,
        B=1,
        C=2,
        D=3,
        E=4,
    ),
    most_recent_purchases_range=dict(
        A=0,
        B=1,
        C=2,
        D=3,
        E=4,
    ),
)

for col, mapping in cat_cols_mapping.items():
    merchants_df[col] = merchants_df[col].map(mapping)

for col in ("category_1", "category_2", "category_4", "most_recent_sales_range", "most_recent_purchases_range"):
    merchants_df[col] = merchants_df[col].fillna(-1).astype(int)

category_2_backup = merchants_df.category_2
most_recent_sales_range_backup = merchants_df.most_recent_sales_range
most_recent_purchases_range_backup = merchants_df.most_recent_purchases_range

merchants_df = pd.get_dummies(merchants_df, columns=["category_2",
                                                     "most_recent_sales_range",
                                                     "most_recent_purchases_range"])

merchants_df["category_2"] = category_2_backup
merchants_df["most_recent_sales_range"] = most_recent_sales_range_backup
merchants_df["most_recent_purchases_range"] = most_recent_purchases_range_backup

merchants_df = merchants_df.fillna(0)

display(merchants_df)

Unnamed: 0,ID,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,avg_sales_lag3,avg_purchases_lag3,active_months_lag3,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2_-1,category_2_1,category_2_2,category_2_3,category_2_4,category_2_5,most_recent_sales_range_0,most_recent_sales_range_1,most_recent_sales_range_2,most_recent_sales_range_3,most_recent_sales_range_4,most_recent_purchases_range_0,most_recent_purchases_range_1,most_recent_purchases_range_2,most_recent_purchases_range_3,most_recent_purchases_range_4,category_2,most_recent_sales_range,most_recent_purchases_range
0,0,M_ID_838061e48c,8353,792,9,-0.057471,-0.057471,0,-0.40,9.666667,3,-2.25,18.666667,6,-2.32,13.916667,12,0,242,9,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,4,4
1,1,M_ID_9339d880ad,3184,840,20,-0.057471,-0.057471,0,-0.72,1.750000,3,-0.74,1.291667,6,-0.57,1.687500,12,0,22,16,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,4,4
2,2,M_ID_e726bbae1e,447,690,1,-0.057471,-0.057471,0,-82.13,260.000000,2,-82.13,260.000000,2,-82.13,260.000000,2,0,-1,5,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,5,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
334630,334693,M_ID_f2045dd267,35,561,7,-0.057471,-0.057471,0,0.96,0.982957,3,0.90,0.924769,6,0.74,0.750763,8,1,160,21,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,5,0,0
334631,334694,M_ID_9139332ccc,35,511,7,-0.057471,-0.057471,1,0.94,0.919558,3,0.82,0.783000,6,0.65,0.584000,12,1,-1,-1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,-1,0,0
334632,334695,M_ID_cd2c0b07e9,35,606,17,-0.057471,-0.057471,0,0.90,0.913902,3,0.73,0.744417,6,0.53,0.540334,10,1,69,9,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0


CPU times: user 1.08 s, sys: 31.9 ms, total: 1.12 s
Wall time: 1.11 s


Define a function for building an entry set for `customers_df` and `transactions_df`:

In [73]:
import featuretools as ft
import featuretools.variable_types as vtypes

transactions_vtypes = dict(authorized_flag=vtypes.Boolean,
                           category_1=vtypes.Boolean)

for ord_col in ("installments", "category_2", "category_3"):
    transactions_vtypes[ord_col] = vtypes.Ordinal
    
    for bool_col in filter(lambda x: x.startswith(ord_col + "_"), sample_transactions_df.columns):
        transactions_vtypes[bool_col] = vtypes.Boolean
        
customers_vtypes = {
    "feature_1": vtypes.Ordinal,
    "feature_2": vtypes.Ordinal,
    "feature_3": vtypes.Boolean,
}

merchants_vtypes = dict(category_1=vtypes.Boolean,
                        category_4=vtypes.Boolean)

for ord_col in ("category_2", "most_recent_sales_range", "most_recent_purchases_range"):
    merchants_vtypes[ord_col] = vtypes.Ordinal
    
    for bool_col in filter(lambda x: x.startswith(ord_col + "_"), merchants_df.columns):
        merchants_vtypes[bool_col] = vtypes.Boolean

def build_entity_set(customers_df, transactions_df, merchants_df):
    es = ft.EntitySet()
    
    es.entity_from_dataframe(entity_id="customers",
                             dataframe=customers_df,
                             index="card_id",
                             variable_types=customers_vtypes)

    es.entity_from_dataframe(entity_id="transactions",
                             dataframe=transactions_df,
                             index="ID",
                             variable_types=transactions_vtypes)
    
    es.entity_from_dataframe(entity_id="merchants",
                             dataframe=merchants_df,
                             index="merchant_id",
                             variable_types=merchants_vtypes)

    customer_transactions_relationship = ft.Relationship(es["customers"]["card_id"],
                                                         es["transactions"]["card_id"])
    
    merchants_transactions_relationship = ft.Relationship(es["merchants"]["merchant_id"],
                                                          es["transactions"]["merchant_id"])

    es.add_relationships([customer_transactions_relationship,
                          merchants_transactions_relationship])

    es["transactions"]["category_1"].interesting_values = [1]
    es["transactions"]["authorized_flag"].interesting_values = [1]

    for col in ("category_2", "category_3", "installments"):
        for dummy_col in filter(lambda x: x.startswith(col + "_"), transactions_df.columns):
            es["transactions"][dummy_col].interesting_values = [1]
            
    es["merchants"]["category_1"].interesting_values = [1]
    es["merchants"]["category_4"].interesting_values = [1]
    
    for col in ("category_2", "most_recent_sales_range", "most_recent_purchases_range"):
        for dummy_col in filter(lambda x: x.startswith(col + "_"), merchants_df.columns):
            es["merchants"][dummy_col].interesting_values = [1]

    return es

Build an entity set for sample data:

In [74]:
es = build_entity_set(sample_customers_df, sample_transactions_df, merchants_df)

Do deep feature synthesis:

In [75]:
features = ft.dfs(entityset=es, target_entity="customers", features_only=True)

print("Number of features: %d" % len(features))

display(features)

Number of features: 221


[<Feature: target>,
 <Feature: feature_1_1>,
 <Feature: feature_1_2>,
 <Feature: feature_1_3>,
 <Feature: feature_1_4>,
 <Feature: feature_1_5>,
 <Feature: feature_2_1>,
 <Feature: feature_2_2>,
 <Feature: feature_2_3>,
 <Feature: feature_1>,
 <Feature: feature_2>,
 <Feature: feature_3>,
 <Feature: SUM(transactions.month_lag)>,
 <Feature: SUM(transactions.purchase_amount)>,
 <Feature: SUM(transactions.month_diff)>,
 <Feature: STD(transactions.month_lag)>,
 <Feature: STD(transactions.purchase_amount)>,
 <Feature: STD(transactions.month_diff)>,
 <Feature: MAX(transactions.month_lag)>,
 <Feature: MAX(transactions.purchase_amount)>,
 <Feature: MAX(transactions.month_diff)>,
 <Feature: SKEW(transactions.month_lag)>,
 <Feature: SKEW(transactions.purchase_amount)>,
 <Feature: SKEW(transactions.month_diff)>,
 <Feature: MIN(transactions.month_lag)>,
 <Feature: MIN(transactions.purchase_amount)>,
 <Feature: MIN(transactions.month_diff)>,
 <Feature: MEAN(transactions.month_lag)>,
 <Feature: MEAN(

In [76]:
feature_matrix_df = ft.calculate_feature_matrix(features=features, entityset=es, verbose=True)

display(feature_matrix_df)

Elapsed: 00:33 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 10/10 chunks


Unnamed: 0_level_0,target,feature_1_1,feature_1_2,feature_1_3,feature_1_4,feature_1_5,feature_2_1,feature_2_2,feature_2_3,feature_1,feature_2,feature_3,SUM(transactions.month_lag),SUM(transactions.purchase_amount),SUM(transactions.month_diff),STD(transactions.month_lag),STD(transactions.purchase_amount),STD(transactions.month_diff),MAX(transactions.month_lag),MAX(transactions.purchase_amount),MAX(transactions.month_diff),SKEW(transactions.month_lag),SKEW(transactions.purchase_amount),SKEW(transactions.month_diff),MIN(transactions.month_lag),MIN(transactions.purchase_amount),MIN(transactions.month_diff),MEAN(transactions.month_lag),MEAN(transactions.purchase_amount),MEAN(transactions.month_diff),COUNT(transactions),PERCENT_TRUE(transactions.authorized_flag),PERCENT_TRUE(transactions.category_1),PERCENT_TRUE(transactions.installments_-1),PERCENT_TRUE(transactions.installments_0),PERCENT_TRUE(transactions.installments_1),PERCENT_TRUE(transactions.installments_2),PERCENT_TRUE(transactions.installments_3),PERCENT_TRUE(transactions.installments_4),PERCENT_TRUE(transactions.installments_5),PERCENT_TRUE(transactions.installments_6),PERCENT_TRUE(transactions.installments_7),PERCENT_TRUE(transactions.installments_8),PERCENT_TRUE(transactions.installments_9),PERCENT_TRUE(transactions.installments_10),PERCENT_TRUE(transactions.installments_11),PERCENT_TRUE(transactions.installments_12),PERCENT_TRUE(transactions.installments_999),PERCENT_TRUE(transactions.category_2_-1),PERCENT_TRUE(transactions.category_2_1),PERCENT_TRUE(transactions.category_2_2),PERCENT_TRUE(transactions.category_2_3),PERCENT_TRUE(transactions.category_2_4),PERCENT_TRUE(transactions.category_2_5),PERCENT_TRUE(transactions.category_3_-1),PERCENT_TRUE(transactions.category_3_0),PERCENT_TRUE(transactions.category_3_1),PERCENT_TRUE(transactions.category_3_2),NUM_UNIQUE(transactions.city_id),NUM_UNIQUE(transactions.merchant_category_id),NUM_UNIQUE(transactions.merchant_id),NUM_UNIQUE(transactions.state_id),NUM_UNIQUE(transactions.subsector_id),NUM_UNIQUE(transactions.installments),NUM_UNIQUE(transactions.category_2),NUM_UNIQUE(transactions.category_3),MODE(transactions.city_id),MODE(transactions.merchant_category_id),MODE(transactions.merchant_id),MODE(transactions.state_id),MODE(transactions.subsector_id),MODE(transactions.installments),MODE(transactions.category_2),MODE(transactions.category_3),DAY(first_active_month),YEAR(first_active_month),MONTH(first_active_month),WEEKDAY(first_active_month),SUM(transactions.merchants.ID),SUM(transactions.merchants.merchant_group_id),SUM(transactions.merchants.merchant_category_id),SUM(transactions.merchants.subsector_id),SUM(transactions.merchants.numerical_1),SUM(transactions.merchants.numerical_2),SUM(transactions.merchants.avg_sales_lag3),SUM(transactions.merchants.avg_purchases_lag3),SUM(transactions.merchants.active_months_lag3),SUM(transactions.merchants.avg_sales_lag6),SUM(transactions.merchants.avg_purchases_lag6),SUM(transactions.merchants.active_months_lag6),SUM(transactions.merchants.avg_sales_lag12),SUM(transactions.merchants.avg_purchases_lag12),SUM(transactions.merchants.active_months_lag12),SUM(transactions.merchants.city_id),SUM(transactions.merchants.state_id),STD(transactions.merchants.ID),STD(transactions.merchants.merchant_group_id),STD(transactions.merchants.merchant_category_id),STD(transactions.merchants.subsector_id),STD(transactions.merchants.numerical_1),STD(transactions.merchants.numerical_2),STD(transactions.merchants.avg_sales_lag3),STD(transactions.merchants.avg_purchases_lag3),STD(transactions.merchants.active_months_lag3),STD(transactions.merchants.avg_sales_lag6),STD(transactions.merchants.avg_purchases_lag6),STD(transactions.merchants.active_months_lag6),STD(transactions.merchants.avg_sales_lag12),STD(transactions.merchants.avg_purchases_lag12),STD(transactions.merchants.active_months_lag12),STD(transactions.merchants.city_id),STD(transactions.merchants.state_id),MAX(transactions.merchants.ID),MAX(transactions.merchants.merchant_group_id),MAX(transactions.merchants.merchant_category_id),MAX(transactions.merchants.subsector_id),MAX(transactions.merchants.numerical_1),MAX(transactions.merchants.numerical_2),MAX(transactions.merchants.avg_sales_lag3),MAX(transactions.merchants.avg_purchases_lag3),MAX(transactions.merchants.active_months_lag3),MAX(transactions.merchants.avg_sales_lag6),MAX(transactions.merchants.avg_purchases_lag6),MAX(transactions.merchants.active_months_lag6),MAX(transactions.merchants.avg_sales_lag12),MAX(transactions.merchants.avg_purchases_lag12),MAX(transactions.merchants.active_months_lag12),MAX(transactions.merchants.city_id),MAX(transactions.merchants.state_id),SKEW(transactions.merchants.ID),SKEW(transactions.merchants.merchant_group_id),SKEW(transactions.merchants.merchant_category_id),SKEW(transactions.merchants.subsector_id),SKEW(transactions.merchants.numerical_1),SKEW(transactions.merchants.numerical_2),SKEW(transactions.merchants.avg_sales_lag3),SKEW(transactions.merchants.avg_purchases_lag3),SKEW(transactions.merchants.active_months_lag3),SKEW(transactions.merchants.avg_sales_lag6),SKEW(transactions.merchants.avg_purchases_lag6),SKEW(transactions.merchants.active_months_lag6),SKEW(transactions.merchants.avg_sales_lag12),SKEW(transactions.merchants.avg_purchases_lag12),SKEW(transactions.merchants.active_months_lag12),SKEW(transactions.merchants.city_id),SKEW(transactions.merchants.state_id),MIN(transactions.merchants.ID),MIN(transactions.merchants.merchant_group_id),MIN(transactions.merchants.merchant_category_id),MIN(transactions.merchants.subsector_id),MIN(transactions.merchants.numerical_1),MIN(transactions.merchants.numerical_2),MIN(transactions.merchants.avg_sales_lag3),MIN(transactions.merchants.avg_purchases_lag3),MIN(transactions.merchants.active_months_lag3),MIN(transactions.merchants.avg_sales_lag6),MIN(transactions.merchants.avg_purchases_lag6),MIN(transactions.merchants.active_months_lag6),MIN(transactions.merchants.avg_sales_lag12),MIN(transactions.merchants.avg_purchases_lag12),MIN(transactions.merchants.active_months_lag12),MIN(transactions.merchants.city_id),MIN(transactions.merchants.state_id),MEAN(transactions.merchants.ID),MEAN(transactions.merchants.merchant_group_id),MEAN(transactions.merchants.merchant_category_id),MEAN(transactions.merchants.subsector_id),MEAN(transactions.merchants.numerical_1),MEAN(transactions.merchants.numerical_2),MEAN(transactions.merchants.avg_sales_lag3),MEAN(transactions.merchants.avg_purchases_lag3),MEAN(transactions.merchants.active_months_lag3),MEAN(transactions.merchants.avg_sales_lag6),MEAN(transactions.merchants.avg_purchases_lag6),MEAN(transactions.merchants.active_months_lag6),MEAN(transactions.merchants.avg_sales_lag12),MEAN(transactions.merchants.avg_purchases_lag12),MEAN(transactions.merchants.active_months_lag12),MEAN(transactions.merchants.city_id),MEAN(transactions.merchants.state_id),COUNT(transactions WHERE category_3_2 = 1),COUNT(transactions WHERE installments_10 = 1),COUNT(transactions WHERE category_2_5 = 1),COUNT(transactions WHERE category_2_4 = 1),COUNT(transactions WHERE installments_1 = 1),COUNT(transactions WHERE installments_2 = 1),COUNT(transactions WHERE installments_4 = 1),COUNT(transactions WHERE installments_3 = 1),COUNT(transactions WHERE category_3_0 = 1),COUNT(transactions WHERE installments_5 = 1),COUNT(transactions WHERE category_1 = 1),COUNT(transactions WHERE category_2_1 = 1),COUNT(transactions WHERE installments_0 = 1),COUNT(transactions WHERE authorized_flag = 1),COUNT(transactions WHERE installments_999 = 1),COUNT(transactions WHERE installments_7 = 1),COUNT(transactions WHERE installments_11 = 1),COUNT(transactions WHERE installments_-1 = 1),COUNT(transactions WHERE category_3_1 = 1),COUNT(transactions WHERE installments_8 = 1),COUNT(transactions WHERE category_3_-1 = 1),COUNT(transactions WHERE category_2_-1 = 1),COUNT(transactions WHERE installments_12 = 1),COUNT(transactions WHERE installments_6 = 1),COUNT(transactions WHERE category_2_2 = 1),COUNT(transactions WHERE installments_9 = 1),COUNT(transactions WHERE category_2_3 = 1),NUM_UNIQUE(transactions.DAY(purchase_date)),NUM_UNIQUE(transactions.YEAR(purchase_date)),NUM_UNIQUE(transactions.MONTH(purchase_date)),NUM_UNIQUE(transactions.WEEKDAY(purchase_date)),NUM_UNIQUE(transactions.merchants.category_2),NUM_UNIQUE(transactions.merchants.most_recent_sales_range),NUM_UNIQUE(transactions.merchants.most_recent_purchases_range),MODE(transactions.DAY(purchase_date)),MODE(transactions.YEAR(purchase_date)),MODE(transactions.MONTH(purchase_date)),MODE(transactions.WEEKDAY(purchase_date)),MODE(transactions.merchants.category_2),MODE(transactions.merchants.most_recent_sales_range),MODE(transactions.merchants.most_recent_purchases_range)
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1
C_ID_0033871586,-1.052924,1,0,0,0,0,0,1,0,1,2,0,2.0,-1.015973,42.0,0.000000,0.299635,0.000000,1.0,-0.296112,21.0,,,,1.0,-0.719860,21.0,1.00,-0.507986,21.000000,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,17,557,M_ID_59946bf50f,22,29,1.0,4.0,1.0,1,2015,9,1,623909.0,32328.0,637.0,66.0,5.338256,5.288682,2.12,2.237366,6.0,2.11,2.272989,12.0,2.12,2.288885,24.0,34.0,44.0,30078.201151,22809.850548,337.289935,5.656854,3.813927,3.778873,0.042426,0.074123,0.0,0.007071,0.056186,0.0,0.028284,0.034495,0.000000,0.000000,0.000000,333223.0,32293.0,557.0,37.0,5.365982,5.316408,1.09,1.171096,3.0,1.06,1.176224,6.0,1.08,1.168834,12.0,17.0,22.0,,,,,,,,,,,,,,,,,,290686.0,35.0,80.0,29.0,-0.027726,-0.027726,1.03,1.066270,3.0,1.05,1.096765,6.0,1.04,1.120051,12.0,17.0,22.0,311954.50,16164.000000,318.500000,33.000000,2.669128,2.644341,1.060000,1.118683,3.0,1.055000,1.136494,6.0,1.0600,1.144443,12.000000,17.000000,22.000000,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,2.0,1.0,2.0,2.0,21.0,2017.0,5.0,4.0,4.0,1.0,1.0
C_ID_005c9dbbfa,0.263760,0,0,0,0,1,1,0,0,5,1,1,14.0,-5.590839,211.0,0.516398,0.164790,0.316228,2.0,-0.229996,22.0,0.484123,1.105720,3.162278,1.0,-0.713849,21.0,1.40,-0.559084,21.100000,10.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,9.0,10.0,2.0,8.0,1.0,1.0,1.0,158,278,M_ID_00a6ca8a8a,15,37,1.0,1.0,1.0,1,2016,7,4,2768547.0,192659.0,3446.0,255.0,19.096464,18.323101,16.84,22.999230,30.0,18.89,27.215321,60.0,19.41,28.095112,115.0,1262.0,134.0,64072.732627,21516.986135,138.702079,10.091250,5.091162,5.089138,1.847672,3.660250,0.0,2.301832,4.627782,0.0,2.340282,4.731604,1.581139,67.040286,5.059644,334455.0,51314.0,511.0,37.0,16.232717,16.153398,6.93,12.705128,3.0,8.42,15.855769,6.0,8.57,16.219780,12.0,158.0,15.0,-2.154834,0.623436,-0.485482,-0.392085,3.036741,3.040893,3.134418,3.148772,0.0,3.126683,3.130222,0.0,3.107501,3.114541,-3.162278,-1.778781,-3.162278,110969.0,35.0,80.0,7.0,-0.057471,-0.057471,0.97,0.984944,3.0,0.95,0.946009,6.0,0.95,0.941234,7.0,-1.0,-1.0,276854.70,19265.900000,344.600000,25.500000,1.909646,1.832310,1.684000,2.299923,3.0,1.889000,2.721532,6.0,1.9410,2.809511,11.500000,126.200000,13.400000,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,10.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,1.0,2.0,6.0,2.0,4.0,4.0,21.0,2017.0,5.0,2.0,1.0,3.0,2.0
C_ID_008c25003f,-0.863420,0,0,1,0,0,0,1,0,3,2,1,5.0,-2.622322,45.0,0.500000,0.082733,0.500000,2.0,-0.551563,12.0,2.000000,0.650971,2.000000,1.0,-0.726968,11.0,1.25,-0.655580,11.250000,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,4.0,4.0,1.0,4.0,1.0,1.0,1.0,281,278,M_ID_20d2b5a421,21,16,0.0,5.0,0.0,1,2016,6,2,862923.0,15403.0,1600.0,59.0,1.277183,1.247438,4.20,4.390214,12.0,4.24,4.248700,24.0,4.33,4.126240,48.0,1124.0,84.0,119340.619962,6994.076178,153.107805,16.070159,0.707325,0.712328,0.078740,0.111023,0.0,0.071647,0.178500,0.0,0.097425,0.227884,0.000000,0.000000,0.000000,327368.0,14319.0,623.0,37.0,1.380190,1.380190,1.14,1.233333,3.0,1.13,1.297222,6.0,1.18,1.271296,12.0,281.0,21.0,-0.373765,1.974325,1.657466,1.237254,1.998949,1.998196,-0.360516,0.119710,0.0,-1.174585,0.811870,0.0,-0.970827,-0.184554,0.000000,0.000000,0.000000,74980.0,35.0,278.0,2.0,-0.047556,-0.057471,0.95,0.966562,3.0,0.96,0.888506,6.0,0.95,0.769253,12.0,281.0,21.0,215730.75,3850.750000,400.000000,14.750000,0.319296,0.311860,1.050000,1.097553,3.0,1.060000,1.062175,6.0,1.0825,1.031560,12.000000,281.000000,21.000000,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,4.0,1.0,2.0,3.0,8.0,2018.0,3.0,3.0,5.0,2.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C_ID_ffea783298,0.518986,0,0,0,0,1,0,1,0,5,2,1,3.0,0.714496,52.0,0.000000,1.303920,0.577350,1.0,1.741483,18.0,0.000000,1.708078,1.732051,1.0,-0.585854,17.0,1.00,0.238165,17.333333,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,3.0,3.0,2.0,3.0,1.0,1.0,1.0,88,340,M_ID_00a6ca8a8a,16,27,0.0,1.0,0.0,1,2017,5,0,645531.0,106487.0,919.0,43.0,-0.023688,-0.083178,8.89,14.634862,9.0,10.54,17.847866,18.0,10.56,18.127237,31.0,86.0,14.0,111800.234924,52064.460137,223.410683,11.015141,0.068692,0.051519,3.437329,6.778667,0.0,4.250474,8.579572,0.0,4.374460,8.813922,2.886751,51.384174,9.814955,333272.0,95269.0,511.0,27.0,0.071423,0.031763,6.93,12.705128,3.0,8.42,15.855769,6.0,8.57,16.219780,12.0,88.0,16.0,0.550333,1.643143,-0.662724,1.668034,1.732051,1.732051,1.722556,1.731075,0.0,1.727738,1.731490,0.0,1.728376,1.731940,-1.732051,1.732051,1.732051,110969.0,35.0,68.0,7.0,-0.047556,-0.057471,0.86,0.889036,3.0,0.96,0.923298,6.0,0.90,0.920539,7.0,-1.0,-1.0,215177.00,35495.666667,306.333333,14.333333,-0.007896,-0.027726,2.963333,4.878287,3.0,3.513333,5.949289,6.0,3.5200,6.042412,10.333333,28.666667,4.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,1.0,2.0,2.0,3.0,3.0,8.0,2017.0,9.0,5.0,-1.0,1.0,2.0
C_ID_ffee45f941,0.951472,0,0,1,0,0,1,0,0,3,1,1,2.0,-0.698883,11.0,,,,2.0,-0.698883,11.0,,,,2.0,-0.698883,11.0,2.00,-0.698883,11.000000,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,28,705,M_ID_86a84b8d7f,9,33,0.0,1.0,0.0,1,2017,9,4,202708.0,11604.0,705.0,33.0,-0.057471,-0.057471,0.98,0.991870,3.0,1.00,0.998152,6.0,1.07,1.017738,12.0,28.0,9.0,,,,,,,,,,,,,,,,,,202708.0,11604.0,705.0,33.0,-0.057471,-0.057471,0.98,0.991870,3.0,1.00,0.998152,6.0,1.07,1.017738,12.0,28.0,9.0,,,,,,,,,,,,,,,,,,202708.0,11604.0,705.0,33.0,-0.057471,-0.057471,0.98,0.991870,3.0,1.00,0.998152,6.0,1.07,1.017738,12.0,28.0,9.0,202708.00,11604.000000,705.000000,33.000000,-0.057471,-0.057471,0.980000,0.991870,3.0,1.000000,0.998152,6.0,1.0700,1.017738,12.000000,28.000000,9.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,28.0,2018.0,4.0,5.0,1.0,3.0,3.0
C_ID_fff7031631,0.289718,0,0,0,0,1,1,0,0,5,1,1,1.0,1.394220,15.0,,,,1.0,1.394220,15.0,,,,1.0,1.394220,15.0,1.00,1.394220,15.000000,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,140,840,M_ID_1688d3651e,9,20,6.0,1.0,2.0,1,2016,10,5,269634.0,106.0,840.0,20.0,-0.047556,-0.057471,1.40,1.505747,3.0,1.39,1.821839,6.0,1.28,1.770115,12.0,140.0,9.0,,,,,,,,,,,,,,,,,,269634.0,106.0,840.0,20.0,-0.047556,-0.057471,1.40,1.505747,3.0,1.39,1.821839,6.0,1.28,1.770115,12.0,140.0,9.0,,,,,,,,,,,,,,,,,,269634.0,106.0,840.0,20.0,-0.047556,-0.057471,1.40,1.505747,3.0,1.39,1.821839,6.0,1.28,1.770115,12.0,140.0,9.0,269634.00,106.000000,840.000000,20.000000,-0.047556,-0.057471,1.400000,1.505747,3.0,1.390000,1.821839,6.0,1.2800,1.770115,12.000000,140.000000,9.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,24.0,2017.0,11.0,4.0,1.0,3.0,4.0


In [77]:
%%time

from multiprocessing import Pool

def calculate_feature_matrix_for(transactions_part_file_name, customers_part_df):
    transactions_part_df = read_transactions_part(transactions_part_file_name)

    part_entity_set = build_entity_set(
        customers_part_df,
        transactions_part_df,
        merchants_df,
    )
    
    feature_matrix_part_df = ft.calculate_feature_matrix(
        features=features, entityset=part_entity_set,
    )
    
    return feature_matrix_part_df

feature_matrix_dfs = dict()

for customers_clazz, customers_df in customers_dfs.items():
    for transactions_clazz, n_parts in TRANSACTIONS_N_PARTS.items():
        customers_with_part_df = add_part(customers_df, n_parts=n_parts)
        
        processes_args = []
        
        for part, customers_part_df in customers_with_part_df.groupby("part"):
            transactions_part_file_name = "%s.%03d.csv" % (transactions_clazz, part)
            processes_args.append((transactions_part_file_name, customers_part_df))
        
        with Pool(8) as p:
            feature_matrix_part_dfs = p.starmap(
                calculate_feature_matrix_for,
                processes_args,
            )

        feature_matrix_df = pd.concat(
            feature_matrix_part_dfs,
            sort=False,
        )
        
        feature_matrix_dfs[customers_clazz] = feature_matrix_df
        
        break # only old transactions for now

CPU times: user 4.3 s, sys: 996 ms, total: 5.3 s
Wall time: 36min 27s


In [78]:
%%time

for feature_matrix_clazz, feature_matrix_df in feature_matrix_dfs.items():
    feature_matrix_df.to_csv(f"../data/2-feature-engineered/{feature_matrix_clazz}.csv")

CPU times: user 57.3 s, sys: 355 ms, total: 57.6 s
Wall time: 57.6 s


In [15]:
context = ("display.max_rows",     0,
           "display.max_colwidth", 0,)

with pd.option_context(*context): display(ft.list_primitives())

Unnamed: 0,name,type,description
0,skew,aggregation,Finds the maximum non-null value of a numeric feature.
1,trend,aggregation,Finds the maximum non-null value of a numeric feature.
2,min,aggregation,Finds the maximum non-null value of a numeric feature.
3,avg_time_between,aggregation,Finds the maximum non-null value of a numeric feature.
4,median,aggregation,Finds the maximum non-null value of a numeric feature.
5,time_since_last,aggregation,Finds the maximum non-null value of a numeric feature.
6,n_most_common,aggregation,Finds the maximum non-null value of a numeric feature.
7,all,aggregation,Finds the maximum non-null value of a numeric feature.
8,mode,aggregation,Finds the maximum non-null value of a numeric feature.
9,percent_true,aggregation,Finds the maximum non-null value of a numeric feature.


Add a `fraud` boolean flag?

In [None]:
authorized_card_rate = transactions_df.groupby("card_id").authorized_flag.mean().sort_values()

with pd.option_context("display.max_rows", 10): display(authorized_card_rate)

`999` installments looks like a `fraud` indicator too

In [None]:
transactions_df.groupby("installments").authorized_flag.mean()

Use `np.log` for `purchase_amount`?

In [3]:
_, axs = plt.subplots(1, 2, figsize=(6, 3), sharey=True)
axs[0].hist(transactions_df.purchase_amount)
axs[1].hist(np.log(transactions_df.purchase_amount + 1))

plt.show()

NameError: name 'plt' is not defined