In [1]:
import pandas as pd
import numpy as np
import gc
import time
from utils import add_groupby_feats

%load_ext autoreload
%autoreload 2

In [2]:
mem_div = 1024**2 # denominator to convert memory usage to readable sizes
repeat = 5 # repeat code to get an average of execution time

# Load Data
- Original Loading

In [3]:
time_naive = []
for cnt in range(repeat):
    start = time.time()
    prior = pd.read_csv('order_products__prior.csv')
    train = pd.read_csv('order_products__train.csv')
    orders = pd.read_csv('orders.csv')
    time_naive.append(time.time() - start)

time_naive = round(np.array(time_naive).mean(), 3)
mem_naive = prior.memory_usage().sum() + train.memory_usage().sum() 
mem_naive += orders.memory_usage().sum()
mem_naive = round(mem_naive / mem_div, 3)

- Optimized loading

In [4]:
%%writefile Feat_Eng_Opt.py

import pandas as pd
import numpy as np
import time
from multiprocessing import Pool

def read_csv_with_dtype(path, dtype_dict):
    ''' Input params:
            path - string, dtype_dict - dictionary
        Output:
            df - pandas dataframe'''
    df = pd.read_csv(path, dtype = dtype_dict)
    return df

def load_data_opt(repeat = 5, mem_div = 1024**2):
    time_opt = []
    dtype_dict = {
        'order_id': np.uint32, 'product_id': np.uint16, 'add_to_cart_order': np.uint8,
        'reordered': np.uint8
    }
    # parameters to be passed for multiprocessing
    load_params = [
        ('order_products__prior.csv', dtype_dict), ('order_products__train.csv', dtype_dict),
        ('orders.csv', {
            'order_id': np.uint32, 'user_id': np.uint32, 'eval_set': 'category',
            'order_number': np.uint8, 'order_dow': 'category', 'order_hour_of_day': 'category',
            'days_since_prior_order': np.float16})]
    
    for cnt in range(repeat):
        start = time.time()
        # multiprocessing with 3 processes
        with Pool(3) as p:
            dfs = p.starmap(read_csv_with_dtype, load_params)

        time_opt.append(time.time() - start)

    time_opt = round(np.array(time_opt).mean(), 3)
    prior, train, orders = list(dfs)
    mem_opt = prior.memory_usage().sum() + train.memory_usage().sum() 
    mem_opt += orders.memory_usage().sum()
    mem_opt = round(mem_opt / mem_div, 3)
    return time_opt, mem_opt, dfs
    

Overwriting Feat_Eng_Opt.py


- Comparison

In [5]:
from Feat_Eng_Opt import load_data_opt

time_opt, mem_opt, dfs_opt = load_data_opt()
print(f'Original average datasets loading time is {time_naive} secs.')
print(f'Optimized average datasets loading time is {time_opt} secs.\n')
print(f'Original memory usage of datasets is {mem_naive} MBs.')
print(f'Optimized memory usage of datasets is {mem_opt} MBs.')

Original average datasets loading time is 10.886 secs.
Optimized average datasets loading time is 9.009 secs.

Original memory usage of datasets is 1214.783 MBs.
Optimized memory usage of datasets is 303.696 MBs.


# Product Features

In [6]:
# _user_buy_product_times: how many times have the user bought the product
col_user_buy_product_times = '_user_buy_product_times'

# aggregation functions to be applied
prod_agg = {'user_id': ['count'], 'reordered': ['sum'],
            col_user_buy_product_times: [lambda x: sum(x == 1), lambda x: sum(x == 2)]}

#_prod_tot_cnts: how many times have this product be bought
#_prod_reorder_tot_cnts: how many times have this product be reorderd
#_prod_buy_first_time_total_cnt: how many unique users have bought this product
#_prod_buy_second_time_total_cnt: how many unique users have reordered this product
col_tot_cnts, col_reorder_tot_cnts = '_prod_tot_cnts', '_prod_reorder_tot_cnts'
col_buy_first_time_total_cnt = '_prod_buy_first_time_total_cnt'
col_buy_second_time_total_cnt = '_prod_buy_second_time_total_cnt'
prod_cols = [col_tot_cnts, col_reorder_tot_cnts, col_buy_first_time_total_cnt,
             col_buy_second_time_total_cnt]

#_prod_reorder_prob: among the users who have bought this product, what percentage of them reordered it
#_prod_reorder_ratio: among the products that have been bought, how many of them were reoredred products
#_prod_reorder_times: average reorder times among the users who have bought this product
col_prod_reorder_prob = '_prod_reorder_prob'
col_prod_reorder_ratio = '_prod_reorder_ratio'
col_prod_reorder_times = '_prod_reorder_times'

prior_opt, train_opt, orders_opt = dfs_opt

- Original processing

In [7]:
time_naive = []
for cnt in range(3):
    start = time.time()
    # add order info to priors
    prior_orders = orders.set_index('order_id').join(prior.set_index('order_id'),
                                                     how = 'inner')

    prior_orders[col_user_buy_product_times] = [p + 1 for p 
                                                in prior_orders.groupby(['user_id',
                                                                         'product_id']).cumcount()]

    prod = add_groupby_feats(prior_orders, ['product_id'], prod_agg, prod_cols)
    prod[col_prod_reorder_prob] = prod[col_buy_second_time_total_cnt] / prod[col_buy_first_time_total_cnt]
    prod[col_prod_reorder_ratio] = prod[col_reorder_tot_cnts] / prod[col_tot_cnts]
    prod[col_prod_reorder_times] = 1 + prod[col_reorder_tot_cnts] / prod[col_buy_first_time_total_cnt]
    time_naive.append(time.time() - start)

time_naive = round(np.array(time_naive).mean(), 3)
mem_naive = round(prod.memory_usage().sum() / mem_div, 3)

- Optimized processing

In [8]:
%%writefile Feat_Eng_Opt.py

import time, os
import numpy as np
import pandas as pd
from functools import partial
from multiprocessing import Pool
from utils import *

def prod_feat_opt(df_left, df_right, po_col, prod_agg, prod_new_cols, prod_proc_cols,
                  repeat = 5, mem_div = 1024**2):
    '''
    Input params:
        df_left - pandas dataframe, df_right - pandas dataframe, po_col - string,
        prod_agg - dictionary, prod_new_cols - list, prod_proc_cols - list
    Output:
        time_opt - float, mem_opt - float, prod - pandas dataframe
    '''
    time_opt = []
    col_tot, col_reorder, col_1st, col_2nd = prod_new_cols
    col_prob, col_ratio, col_times = prod_proc_cols
    for cnt in range(repeat):
        start = time.time()
        # add order info to priors
        merge_mul_partial = partial(merge_mul, 'order_id', 'inner', df_left)
        cpu_cnt = os.cpu_count()
        with Pool(cpu_cnt) as p:
            pos = p.map(merge_mul_partial, np.array_split(df_right, cpu_cnt))
            
        prior_orders = pd.concat(list(pos))
        
        prior_orders[po_col] = prior_orders.groupby(['user_id', 'product_id']).cumcount() + 1
        
        prod = add_groupby_feats(prior_orders, ['product_id'], prod_agg, prod_new_cols)
        prod[col_prob] = prod[col_2nd] / prod[col_1st]
        prod[col_ratio] = prod[col_reorder] / prod[col_tot]
        prod[col_times] = 1 + prod[col_reorder] / prod[col_1st]
        time_opt.append(time.time() - start)
        
    time_opt = round(np.array(time_opt).mean(), 3)
    # optimize memory usage
    prod[['product_id', col_2nd]] = prod[['product_id', col_2nd]].apply(np.uint16)
    prod[[col_tot, col_1st]] = prod[[col_tot, col_1st]].apply(np.uint32)
    prod[prod.select_dtypes(np.float64).columns] = prod.select_dtypes(np.float64).astype(np.float16)
    
    mem_opt = round(prod.memory_usage().sum() / mem_div, 3)
    return time_opt, mem_opt, prod, prior_orders
    

Overwriting Feat_Eng_Opt.py


- Comparison

In [9]:
from Feat_Eng_Opt import prod_feat_opt

time_opt, mem_opt, prod_opt, prior_orders_opt = prod_feat_opt(prior_opt, orders_opt,
                                                              col_user_buy_product_times,
                                                              prod_agg, prod_cols,
                                                              [col_prod_reorder_prob,
                                                               col_prod_reorder_ratio,
                                                               col_prod_reorder_times], 3)

In [10]:
print(f'Original average product features processing time is {time_naive} secs.')
print(f'Optimized average product features processing time is {time_opt} secs.\n')
print(f'Original memory usage of newly generated product features is {mem_naive} MBs.')
print(f'Optimized memory usage of newly generated product features is {mem_opt} MBs.')

Original average product features processing time is 79.381 secs.
Optimized average product features processing time is 61.231 secs.

Original memory usage of newly generated product features is 3.032 MBs.
Optimized memory usage of newly generated product features is 0.948 MBs.


# User Features

In [11]:
gc.collect()

# _user_total_orders: how many orders have the user placed in the prior dataset
# _user_sum_days_since_prior_order: sum of days since prior order
# _user_mean_days_since_prior_order: average number of days since prior order
col_user_order_cnt = '_user_total_orders'
user_cols = [col_user_order_cnt, '_user_sum_days_since_prior_order',
             '_user_mean_days_since_prior_order']

# aggregation functions to be applied
user_agg_1 = {'order_number': ['max'], 'days_since_prior_order': ['sum', 'mean']}

user_agg_2 = {'reordered': ['sum'], 'product_id': ['count', lambda x: x.nunique()],
              'order_number': [lambda x: sum(x > 1)]}

# _user_reorder_ratio: number of reorders / number of orders after 1st order
# _user_total_products: number of products the user has bought
# _user_distinct_products: number of unique products the user has bought
col_user_reorder_ratio = '_user_reorder_ratio'
col_user_total_products = '_user_total_products'
col_user_distinct_products = '_user_distinct_products'

col_reorder_sum = 'reordered_sum'
user_cols_2 = [col_reorder_sum, col_user_total_products, col_user_distinct_products,
               'order_number']

# _user_average_basket: average number of products per order for this user
col_user_average_basket = '_user_average_basket'

- Original processing

In [12]:
time_naive = []
for cnt in range(repeat):
    start = time.time()
    user1 = add_groupby_feats(orders[orders['eval_set'] == 'prior'].copy(), ['user_id'],
                              user_agg_1, user_cols)

    user2 = add_groupby_feats(prior_orders, ['user_id'], user_agg_2, user_cols_2)
    user2[col_user_reorder_ratio] = user2[col_reorder_sum] / user2['order_number']
    user2.drop([col_reorder_sum, 'order_number'], axis = 1, inplace = True)
    users = user1.merge(user2, how = 'inner')
    users[col_user_average_basket] = users[col_user_total_products] / users[col_user_order_cnt]
    user3 = orders[orders['eval_set'] != "prior"][['user_id', 'order_id', 'eval_set',
                                                   'days_since_prior_order']]

    user3.rename(index = str, columns = {'days_since_prior_order': 'time_since_last_order'},
                 inplace = True)

    users = users.merge(user3, how='inner')

    time_naive.append(time.time() - start)
    
time_naive = round(np.array(time_naive).mean(), 3)
del user1, user2, user3
mem_naive = round(users.memory_usage().sum() / mem_div, 3)

- Optimized processing

In [13]:
%%writefile Feat_Eng_Opt.py

from multiprocessing import Pool
from utils import *
from functools import partial
import os, time
import numpy as np
import pandas as pd

def df_larger_sub(col, val, df):
    '''
    Input params:
        df - pandas dataframe, col - string, val - int
    Output:
        df_sub - pandas dataframe
    '''
    df_sub = df[df[col] > val]
    return df_sub

def merge_idx(df_left, df_right):
    '''
    Input params:
        df_left - pandas dataframe, df_right - pandas dataframe
    Output:
        df - pandas dataframe
    '''
    df = df_left.merge(df_right, how = 'inner', left_index = True, right_index = True)
    return df

def user_feat_opt(df1, df2, df2_cols, col_add, col_reorder, col_users, repeat = 5,
                  mem_div = 1024**2):
    
    '''
    Input params:
        df1 - pandas dataframe, df2 - pandas dataframe, df_cols - list, col - string
    Output:
        time_opt - float, mem_opt - float, users - pandas dataframe
    '''
    time_opt = []
    # aggregation functions to be applied
    user_agg_1 = {'order_number': ['max'], 'days_since_prior_order': ['sum', 'mean']}    
    user_agg_2 = {'reordered': ['sum'], 'product_id': ['count', 'nunique']}
    
    df1_prior = df1[df1['eval_set'] == 'prior']
    df1_non_prior = df1[df1['eval_set'] != "prior"][['user_id', 'order_id', 'eval_set',
                                                     'days_since_prior_order']]
    
    df1_non_prior.rename(index = str, columns = {'days_since_prior_order':
                                                 'time_since_last_order'}, inplace = True)
    
    for cnt in range(repeat):
        start = time.time()
        user1 = add_groupby_feats(df1_prior, ['user_id'], user_agg_1, col_users)
        user21 = add_groupby_feats(df2, ['user_id'], user_agg_2,
                                   [col_reorder] + df2_cols).set_index('user_id')
        
        cpu_cnt = os.cpu_count()
        df_larger_sub_partial = partial(df_larger_sub, 'order_number', 1)
        # multiprocessing
        with Pool(cpu_cnt) as p:
            d = p.map(df_larger_sub_partial, np.array_split(df2, cpu_cnt))
            
        df3 = pd.concat(list(d))
        user22 = df3.groupby(['user_id']).agg({'order_number': 'count'})
        merge_partial = partial(merge_idx, user21)
        # multiprocessing
        with Pool(cpu_cnt) as p:
            d = p.map(merge_partial, np.array_split(user22, cpu_cnt))
            
        user2 = pd.concat(list(d))
        user2[col_add] = user2[col_reorder] / user2['order_number']
        user2.drop([col_reorder, 'order_number'], axis = 1, inplace = True)
        user2.reset_index(inplace = True)
        del user21, user22
        u1, u2 = user1.set_index('user_id'), user2.set_index('user_id')
        join_mul_partial = partial(join_mul, u1)
        # multiprocessing
        with Pool(cpu_cnt) as p:
            d = p.map(join_mul_partial, np.array_split(u2, cpu_cnt))
            
        users = pd.concat(list(d))
        u2 = df1_non_prior.set_index('user_id')
        join_mul_partial = partial(join_mul, users)
        # multiprocessing
        with Pool(cpu_cnt) as p:
            d = p.map(join_mul_partial, np.array_split(u2, cpu_cnt))
            
        users = pd.concat(list(d)).reset_index()
        time_opt.append(time.time() - start)
        
    time_opt = round(np.array(time_opt).mean(), 3)
    # memory usage optimization
    users[df2_cols] = users[df2_cols].apply(np.uint16)
    users[users.select_dtypes(np.float64).columns] = users.select_dtypes(np.float64).astype(np.float16)
    
    mem_opt = round(users.memory_usage().sum() / mem_div, 3)
    return time_opt, mem_opt, users

Overwriting Feat_Eng_Opt.py


- Comparison

In [14]:
from Feat_Eng_Opt import user_feat_opt

time_opt, mem_opt, users_opt = user_feat_opt(orders_opt, prior_orders_opt,
                                             [col_user_total_products,
                                              col_user_distinct_products],
                                             col_user_reorder_ratio, col_reorder_sum,
                                             user_cols)

In [15]:
print(f'Original average product features processing time is {time_naive} secs.')
print(f'Optimized average product features processing time is {time_opt} secs.\n')
print(f'Original memory usage of newly generated product features is {mem_naive} MBs.')
print(f'Optimized memory usage of newly generated product features is {mem_opt} MBs.')

Original average product features processing time is 53.64 secs.
Optimized average product features processing time is 33.602 secs.

Original memory usage of newly generated product features is 18.879 MBs.
Optimized memory usage of newly generated product features is 5.113 MBs.


# User - Product Features

In [16]:
# _up_order_count: number of times the user has purchased this product
# _up_first_order_number: order number when the user first purchased this product
# _up_last_order_number: order number when the user last purchased this product
# _up_average_cart_position: the product's average position in cart of this user
col_up_order_count = '_up_order_count'
col_user_prod_1st_order_num = '_up_first_order_number'
col_user_prod_last_order_num = '_up_last_order_number'

# _up_order_rate: (number of times the product has been bought by user) / number of orders of the user
# _up_order_since_last_order: (most recent order number of the user) - (most recent order number containing the product)
# _up_order_rate_since_first_order: (number of times that the user has bought this product) / (number of orders within this purchasing history framework)
col_rate = '_up_order_rate'
col_since_last = '_up_order_since_last_order'
col_since_1st ='_up_order_rate_since_first_order'

- Original processing

In [17]:
time_naive = []
for cnt in range(3):
    start = time.time()
    data = add_groupby_feats(prior_orders, ['user_id', 'product_id'],
                             {'order_number': ['count', 'min', 'max'],
                              'add_to_cart_order': ['mean']}, [col_up_order_count,
                                                               col_user_prod_1st_order_num,
                                                               col_user_prod_last_order_num,
                                                               '_up_average_cart_position'])
    
    data = prod.merge(data, how = 'inner', on = 'product_id').merge(users, how = 'inner',
                                                                    on = 'user_id')
    
    data[col_rate] = data[col_up_order_count] / data[col_user_order_cnt]
    data[col_since_last] = data[col_user_order_cnt] - data[col_user_prod_last_order_num]
    data[col_since_1st] = data[col_up_order_count] / (data[col_user_order_cnt] - data[col_user_prod_1st_order_num] + 1)
    train_orders = orders[['order_id',
                           'user_id']].set_index('order_id').join(train.set_index('order_id'),
                                                                  how = 'right')
    
    data = data.set_index(['user_id',
                           'product_id']).join(train_orders[['user_id', 'product_id',
                                                             'reordered']].set_index(['user_id',
                                                                                     'product_id']),
                                               how = 'left')
    
    time_naive.append(time.time() - start)
    
time_naive = round(np.array(time_naive).mean(), 3)
mem_naive = round(data.memory_usage().sum() / mem_div, 3)

- Optimized processing

In [18]:
time_naive = []
for cnt in range(3):
    start = time.time()
    data_opt = add_groupby_feats(prior_orders_opt, ['user_id', 'product_id'],
                                 {'order_number': ['count', 'min', 'max'],
                                  'add_to_cart_order': ['mean']},
                                 [col_up_order_count, col_user_prod_1st_order_num,
                                  col_user_prod_last_order_num, '_up_average_cart_position'])
    
    d, p = data_opt.set_index('product_id'), prod_opt.set_index('product_id')
    u = users_opt.set_index('user_id')
    data_opt = d.join(p, how = 'inner').reset_index().set_index('user_id').join(u, how = 'inner').reset_index()
    
    data_opt[col_rate] = data_opt[col_up_order_count] / data_opt[col_user_order_cnt]
    data_opt[col_since_last] = data_opt[col_user_order_cnt] - data_opt[col_user_prod_last_order_num]
    data_opt[col_since_1st] = data_opt[col_up_order_count] / (data_opt[col_user_order_cnt] - data_opt[col_user_prod_1st_order_num] + 1)
    train_orders_opt = train_opt.merge(orders_opt[['order_id', 'user_id']], how = 'left',
                                       on = 'order_id')
    
    data_opt = data_opt.merge(train_orders_opt[['user_id', 'product_id', 'reordered']],
                              how = 'left', on = ['user_id', 'product_id'])
    
    time_naive.append(time.time() - start)
    
time_naive = round(np.array(time_naive).mean(), 3)
mem_naive = round(data_opt.memory_usage().sum() / mem_div, 3)

- Comparison

In [19]:
print(f'Original average product features processing time is {time_naive} secs.')
print(f'Optimized average product features processing time is {time_opt} secs.\n')
print(f'Original memory usage of newly generated product features is {mem_naive} MBs.')
print(f'Optimized memory usage of newly generated product features is {mem_opt} MBs.')

Original average product features processing time is 42.932 secs.
Optimized average product features processing time is 33.602 secs.

Original memory usage of newly generated product features is 1307.22 MBs.
Optimized memory usage of newly generated product features is 5.113 MBs.
