In [1]:
%run ../0-utils/0-Base.ipynb

Define helpers for reading data about _customers_ and their _transactions_:

In [8]:
# Root directory of the formatted data parts; the readers below look for
# files under "<PARTS_DIR_PATH>/<zero-padded part number>/".
PARTS_DIR_PATH = "../data/2-formatted"

def read_part_customers_df(part, clazz):
    """Read the customers CSV of one data part.

    Parameters
    ----------
    part : int
        Part number; it is zero-padded to three digits to form the
        sub-directory name.
    clazz : str
        Base name of the CSV file inside the part directory (this notebook
        uses "train" and "test").

    Returns
    -------
    pandas.DataFrame
        Contents of ``{PARTS_DIR_PATH}/{part:03d}/{clazz}.csv``.
    """
    # One f-string with a format spec instead of an f-string that is then
    # %-formatted: with the mixed form any "%" in the directory or in `clazz`
    # would be interpreted as a format directive.
    return pd.read_csv(f"{PARTS_DIR_PATH}/{part:03d}/{clazz}.csv")

# Sanity check: show the "test" and "train" customers of part 13, in that order.
display(*(read_part_customers_df(13, split) for split in ("test", "train")))

def read_part_transactions_df(part):
    """Read the transactions CSV of one data part.

    Parameters
    ----------
    part : int
        Part number; it is zero-padded to three digits to form the
        sub-directory name.

    Returns
    -------
    pandas.DataFrame
        Contents of ``{PARTS_DIR_PATH}/{part:03d}/transactions.csv`` with the
        ``purchase_date`` column parsed as datetimes.
    """
    # One f-string with a format spec instead of an f-string that is then
    # %-formatted: with the mixed form any "%" in the directory would be
    # interpreted as a format directive.
    path = f"{PARTS_DIR_PATH}/{part:03d}/transactions.csv"
    return pd.read_csv(path, parse_dates=["purchase_date"])

# Sanity check: show the transactions of part 13.
display(read_part_transactions_df(13))

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
0,2017-03,C_ID_6b0ac2784f,3,2,1
1,2017-07,C_ID_b89dd01483,2,1,0
2,2017-08,C_ID_f49551e57a,3,1,1
...,...,...,...,...,...
472,2016-08,C_ID_70ac403a28,2,2,0
473,2016-08,C_ID_bc5ce56532,4,2,0
474,2016-12,C_ID_a2659c5784,3,1,1


Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-09,C_ID_ee5bb1f392,2,3,0,-1.230539
1,2017-08,C_ID_1d437b41aa,2,1,0,2.836237
2,2017-06,C_ID_7fc141d648,3,2,1,1.478886
...,...,...,...,...,...,...
711,2017-09,C_ID_b1aa9f2c57,5,2,1,0.538566
712,2016-07,C_ID_5927e17379,3,3,1,2.056873
713,2017-03,C_ID_fada3441a1,2,1,0,-2.961032


Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,new_merchant
0,1,C_ID_33a7af4259,289,0,1,1,705,M_ID_bbe2b7164c,-2,-0.696870,2017-12-13 20:25:49,5,5,33,0
1,1,C_ID_33a7af4259,289,0,1,1,705,M_ID_bbe2b7164c,-1,-0.738839,2018-01-25 17:34:15,5,5,33,0
2,1,C_ID_33a7af4259,289,0,1,1,705,M_ID_bbe2b7164c,-1,-0.696824,2018-01-05 20:13:56,5,5,33,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106565,1,C_ID_abbe13345f,-1,1,1,1,302,M_ID_b9f9332438,2,-0.523013,2018-04-07 15:56:55,-1,-1,22,1
106566,1,C_ID_81cad21f6c,199,0,0,0,705,M_ID_3df8f6ff05,1,-0.725901,2018-02-07 16:43:10,4,14,33,1
106567,1,C_ID_8e0ff5ca03,254,0,1,1,703,M_ID_83a46908e3,1,-0.667267,2018-03-17 08:38:48,1,15,29,1


Define the months for which transaction data will be collected:

In [9]:
# Monthly periods from 2017-01 through 2018-03, both ends inclusive.
# Building the PeriodIndex directly avoids the month-end `date_range` round
# trip, whose end bound "2018-04" silently excluded April 2018 and whose "M"
# alias is deprecated for `date_range` in recent pandas releases.
dates = pd.period_range(start="2017-01", end="2018-03", freq="M")
display(dates)

PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06',
             '2017-07', '2017-08', '2017-09', '2017-10', '2017-11', '2017-12',
             '2018-01', '2018-02', '2018-03'],
            dtype='period[M]', freq='M')

In [34]:
# Name of the feature set to build. process_part below handles
# "purchase_amount" and "purchase_amount_by_authorized_flag"; the same name
# is used for the output file when the features are saved.
feature_set = "purchase_amount_by_authorized_flag"

# Number of features produced per (customer, month) for each supported
# value of the module-level `feature_set`.
_NUM_FEATURES_BY_FEATURE_SET = {
    "purchase_amount": 3,
    "purchase_amount_by_authorized_flag": 6,
}


def _aggregate_month_transactions(transactions_df, feature_set):
    """Return a (1, n) array of purchase_amount statistics for one customer-month.

    For "purchase_amount" the row holds min, mean and max over all given
    transactions. For "purchase_amount_by_authorized_flag" it holds min, mean
    and max of the rows with authorized_flag == 1 followed by the same three
    statistics of the rows with authorized_flag == 0.
    """
    agg = dict(purchase_amount=("min", "mean", "max"))

    if feature_set == "purchase_amount":
        return transactions_df.agg(agg).T.values

    if feature_set == "purchase_amount_by_authorized_flag":
        # NOTE(review): when one of the two subsets is empty its statistics
        # are expected to come out as NaN -- verify on the pandas version in use.
        x1 = transactions_df[transactions_df.authorized_flag == 1].agg(agg).T.values
        x2 = transactions_df[transactions_df.authorized_flag == 0].agg(agg).T.values
        return np.concatenate((x1, x2), axis=1)

    raise ValueError(f"Unsupported feature_set: {feature_set!r}")


def process_part(part, clazz):
    """Build per-customer monthly purchase features for one data part.

    Reads the module-level ``feature_set`` (which statistics to compute) and
    ``dates`` (the monthly periods to cover).

    Parameters
    ----------
    part : int
        Part number, forwarded to the part readers.
    clazz : str
        "train" or "test"; selects the customers file. Targets are collected
        and customers are sub-sampled only for "train".

    Returns
    -------
    (X, y) : tuple of lists
        ``X`` holds one array of shape ``(len(dates), num_features)`` per kept
        customer; months without transactions are rows of NaN. ``y`` holds the
        matching ``target`` values for "train" and is empty otherwise.

    Raises
    ------
    ValueError
        If ``feature_set`` is not one of the supported names.
    """
    # Fail early and explicitly on an unknown feature set; previously this
    # surfaced as an unbound local variable inside the month loop.
    if feature_set not in _NUM_FEATURES_BY_FEATURE_SET:
        raise ValueError(
            f"Unsupported feature_set: {feature_set!r}; "
            f"expected one of {sorted(_NUM_FEATURES_BY_FEATURE_SET)}"
        )
    num_features = _NUM_FEATURES_BY_FEATURE_SET[feature_set]

    part_customers_df = read_part_customers_df(part, clazz)
    # Budget of customers accepted regardless of their target: a fifth of the part.
    max_num_customers = int(part_customers_df.shape[0] / 5)

    part_transactions_df = read_part_transactions_df(part)
    part_transactions_df = part_transactions_df.assign(
        year=part_transactions_df.purchase_date.dt.year,
        month=part_transactions_df.purchase_date.dt.month,
    )
    # Index by (card_id, year, month) and sort so that one customer-month can
    # be looked up directly.
    part_transactions_df = (
        part_transactions_df
        .set_index(["card_id", "year", "month"])
        .sort_index()
    )
    transactions_index = part_transactions_df.index

    X, y = [], []

    for customer in part_customers_df.itertuples(index=False):
        # Sub-sampling of the training data: once the budget has dropped
        # below zero, only customers with target <= -33 are kept.
        # The counter is decremented only for kept customers and the check is
        # a strict "< 0", so budget + 1 customers pass unconditionally. This
        # is intentionally left unchanged: altering it would change which
        # rows are produced.
        # NOTE(review): -33 presumably marks outlier targets -- confirm.
        if clazz == "train" and max_num_customers < 0 and customer.target > -33:
            continue

        X_parts = []

        for date in dates:
            ix = (customer.card_id, date.year, date.month)

            # `Index.contains` was removed in pandas 1.0; the `in` operator is
            # the supported membership test.
            if ix in transactions_index:
                X_part = _aggregate_month_transactions(
                    part_transactions_df.loc[ix], feature_set
                )
            else:
                # No transactions in this month: one row of NaN placeholders.
                X_part = np.full((1, num_features), np.nan)

            X_parts.append(X_part)

        # Stack the monthly rows into a (len(dates), num_features) array.
        X.append(np.concatenate(X_parts))

        if clazz == "train":
            y.append(customer.target)

        max_num_customers -= 1

    return X, y

def process_train_part(part):
    """Run ``process_part`` on the "train" customers of the given part."""
    return process_part(part, "train")
def process_test_part(part):
    """Run ``process_part`` on the "test" customers of the given part."""
    return process_part(part, "test")

# X, y = process_train_part(13);

In [35]:
%%time

from multiprocessing import Pool

with Pool(8) as pool: results = pool.map(process_train_part, range(TRANSACTIONS_N_PARTS))

CPU times: user 262 ms, sys: 183 ms, total: 445 ms
Wall time: 7min 6s


In [36]:
# Every entry of `results` is an (X, y) pair for one part; stack the features
# and the targets of all parts along the first axis.
X = np.concatenate([part_X for part_X, _ in results])
y = np.concatenate([part_y for _, part_y in results])

In [38]:
# Persist the stacked training features in a file named after the active feature set.
np.save(f"../data/3-feature-engineered/train/{feature_set}.npy", X)

# Saving the targets is disabled here.
# NOTE(review): presumably y.npy was written by an earlier run -- confirm that
# its rows still line up with the rows of X saved above.
# np.save("../data/3-feature-engineered/train/y.npy", y)