# Data Import & Connection to BigQuery

In [35]:
import itertools
from datetime import datetime
from itertools import product

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns
from google.cloud import bigquery

from functions import *

# Notebook-wide display settings and the shared BigQuery client.
# Render every column of a DataFrame instead of truncating wide frames.
pd.options.display.max_columns = None
# Show floats with two decimals rather than scientific notation.
pd.set_option('display.float_format', '{:.2f}'.format)
# Single client reused by all queries and load jobs in this notebook.
client = bigquery.Client(project="continente-lced-feup")

In [2]:
# Interactive browser login that saves Application Default Credentials on this machine
# (see the "Credentials saved to file" message in the output below).
# NOTE(review): this cell comes after the one that builds `client`; on a machine without
# saved credentials the login presumably has to run first — confirm.
!gcloud auth application-default login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=5CLzg2McUnAHdc3ggwcUzpsThcxDvp&access_type=offline&code_challenge=fn42U_gQkCVlEIumYoRkIP0FZX7uv_aQQWS2J3kCj6g&code_challenge_method=S256


Credentials saved to file: [/Users/vp/.config/gcloud/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).

Quota project "continente-lced-feup" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the r

# Data Loading

In [3]:
# Customers joined to their transactions and to the product dimension, restricted to a
# fixed list of subcategories, the ]25;35] age segment and non-negative quantities.
transactions_sql = """
   SELECT *
   FROM 
       tables_raw.dim_customer 
       JOIN tables_raw.fact_transaction USING(CUSTOMER_ACCOUNT_NR_MASK)
       JOIN tables_raw.dim_product USING(SKU)
    WHERE 
        SUBCAT_CD_EXT IN (140304, 50401, 30301, 20201, 10301, 80103, 60102, 60401, 30401, 10101, 100102, 100204, 140204, 
        20302, 30201, 50203, 90201, 170101, 80404, 50402, 70202, 100201, 80409, 10303, 20306, 80411, 10102, 20305, 60105, 80110, 
        140301, 30202, 90202, 100101, 80105, 80104, 50202, 50303, 70204, 60306, 80403, 10302, 10201, 80405, 170304, 170303, 80406, 
        140201, 60302, 30403, 30304, 20204, 170106, 140205, 10204, 60404, 50301, 50302, 20205, 60406, 20301, 80407, 20203, 70201, 100205,
        60106, 170302, 50201, 60301, 10205, 30203, 80401, 100202, 30302, 170111, 10202, 70203, 60303, 170109, 60403, 30402, 140302, 30208, 60307, 
        80107, 50403, 60103, 20307, 60305, 60101, 170307, 80414, 80415, 60405, 20303, 80402, 30204, 30206, 170310, 60304, 140206, 10203, 30205, 60107, 
        70206, 170108, 90203, 90204, 30207, 140303, 30303, 80408, 140202, 50304, 80101, 170313, 100203, 60402, 170305, 50305, 50404, 20202, 170110, 
        170105, 170112, 170301, 10206, 10208, 20304, 80102, 70205, 10207, 10305, 170309, 170114, 80111, 90206, 30306, 30305, 140203, 80413) 
        AND SEG_AGE_DSC = ']25;35]'
        AND QTY >= 0
    ORDER BY CUSTOMER_ACCOUNT_NR_MASK DESC, TIME_KEY ASC
   """
query = client.query(transactions_sql)

# .result() blocks until the query job has finished; the rows are then pulled into pandas.
df = query.result().to_dataframe()

# Raw data statistics

In [20]:
# Full copy of the raw customer dimension, used only for the descriptive checks below.
customer_sql = """
    SELECT *
    FROM tables_raw.dim_customer
    """
query = client.query(customer_sql)

# Wait for the job to finish, then materialise the rows as a DataFrame.
customer = query.result().to_dataframe()

In [31]:
# Column names of the raw customer dimension.
customer.columns

Index(['CUSTOMER_ACCOUNT_NR_MASK', 'GENDER', 'FAMILY_MEMBERS', 'CP4',
       'seg_lifestyle_cd', 'seg_lifestyle_dsc', 'SEG_AGE', 'SEG_AGE_DSC',
       'seg_lifestage_cd', 'seg_lifestage_dsc'],
      dtype='object')

In [30]:
# Summary statistics of the numeric customer columns.
# NOTE(review): FAMILY_MEMBERS reaches 99 in the output below, which looks like a
# placeholder rather than a real household size — confirm before using it as a feature.
customer.describe()

Unnamed: 0,CUSTOMER_ACCOUNT_NR_MASK,FAMILY_MEMBERS,seg_lifestyle_cd,seg_lifestage_cd
count,93748.0,74865.0,93749.0,93749.0
mean,41185196.93,3.02,2.19,4.11
std,39645028.96,4.81,0.87,1.54
min,3506.0,0.0,1.0,1.0
25%,7333072.0,2.0,1.0,3.0
50%,22431879.0,3.0,2.0,4.0
75%,79581269.25,4.0,3.0,5.0
max,133928977.0,99.0,4.0,6.0


In [32]:
# Count of missing values in each customer column.
customer.isnull().sum()

CUSTOMER_ACCOUNT_NR_MASK        1
GENDER                       8324
FAMILY_MEMBERS              18884
CP4                          8600
seg_lifestyle_cd                0
seg_lifestyle_dsc               0
SEG_AGE                         0
SEG_AGE_DSC                     0
seg_lifestage_cd                0
seg_lifestage_dsc               0
dtype: int64

In [33]:
# Total number of missing cells across all customer columns, computed from the frame
# itself so the figure cannot go stale when the underlying table changes.
customer.isnull().sum().sum()

35809

In [34]:
# Number of customer rows that are exact duplicates of an earlier row.
customer.duplicated().sum()

0

In [22]:
# Full copy of the raw location dimension, used only for the descriptive checks below.
location_sql = """
    SELECT *
    FROM tables_raw.dim_location
    """
query = client.query(location_sql)

# Wait for the job to finish, then materialise the rows as a DataFrame.
location = query.result().to_dataframe()

In [36]:
# Column names of the raw location dimension.
location.columns

Index(['LOCATION_CD', 'LOCATION_DSC', 'LOC_BRAND_CD', 'LOC_BRAND_DSC', 'cp7'], dtype='object')

In [37]:
# Summary statistics of the numeric location columns.
location.describe()

Unnamed: 0,LOCATION_CD,LOC_BRAND_CD,cp7
count,373.0,373.0,373.0
mean,2725.22,285.4,4414.05
std,2846.7,77.2,2259.32
min,1.0,143.0,1000.0
25%,289.0,302.0,2724.0
50%,1704.0,302.0,4250.0
75%,4420.0,303.0,4920.0
max,9665.0,888.0,9950.0


In [38]:
# Count of missing values in each location column.
location.isnull().sum()

LOCATION_CD      0
LOCATION_DSC     0
LOC_BRAND_CD     0
LOC_BRAND_DSC    0
cp7              0
dtype: int64

In [39]:
# Number of location rows that are exact duplicates of an earlier row.
location.duplicated().sum()

0

In [23]:
# Full copy of the raw product dimension, used only for the descriptive checks below.
product_sql = """
    SELECT *
    FROM tables_raw.dim_product
    """
query = client.query(product_sql)

# NOTE(review): this rebinds the name `product`, which the import cell binds to
# itertools.product; any later bare `product(...)` call would hit this DataFrame instead.
product = query.result().to_dataframe()

In [40]:
# Count of missing values in each product column.
product.isnull().sum()

SKU                   0
PRODUCT_DSC           0
UNIT_BASE_CD_EXT      0
UNIT_BASE_DSC_EXT     0
SUBCAT_CD_EXT         0
SUBCAT_DSC_EXT        0
CAT_CD_EXT            0
CAT_DSC_EXT           0
BIZ_UNIT_CD_EXT       0
BIZ_UNIT_DSC_EXT      0
DEPARTMENT_CD_EXT     0
DEPARTMENT_DSC_EXT    0
PRODUCT_SHORT_DSC     0
BRAND_DSC             0
BRAND_TYPE_CD         0
PRICE_RANGE           0
CONVERSION_FACTOR     0
CAPACITY_UNIT         0
dtype: int64

In [41]:
# Number of product rows that are exact duplicates of an earlier row.
product.duplicated().sum()

0

In [24]:
# Full copy of the raw transaction fact table, used only for the descriptive checks below.
# The describe() output further down reports about 66.6M rows, so this download is heavy.
transaction_sql = """
    SELECT *
    FROM tables_raw.fact_transaction
    """
query = client.query(transaction_sql)

# Wait for the job to finish, then materialise the rows as a DataFrame.
transaction = query.result().to_dataframe()

In [44]:
# Summary statistics of the numeric transaction columns.
# NOTE(review): QTY and the sales amounts have negative minima in the output below
# (presumably returns or corrections — confirm).
transaction.describe()

Unnamed: 0,TIME_KEY,TRANSACTION_ID_MASK,CUSTOMER_ACCOUNT_NR_MASK,LOC_BRAND_CD,LOCATION_CD,SKU,PRODUCT_KEY,QTY,NET_SLS_AMT,GROSS_SLS_AMT,PROD_DSCNT_ISSUED_AMT,TRANS_DSCNT_RAT_AMT,DIRECT_DSCNT_AMT
count,66579316.0,66579316.0,66568608.0,66579316.0,66579316.0,66579316.0,66579316.0,66579316.0,66579316.0,66579316.0,66579316.0,66579316.0,66579316.0
mean,20215672.36,-53611862945.91,32885613.29,243.88,1294.83,5346997.09,82777543042.2,1.29,2.41,2.77,0.01,0.13,0.53
std,5013.16,8.00408431398784e+17,34168384.52,77.84,1947.7,1812027.19,1812027187931.08,1.51,4.24,4.8,0.18,0.45,2.13
min,20210101.0,6.919050137126249e+18,3506.0,143.0,1.0,2000022.0,2000022010001.0,-949.05,-8049.02,-8531.96,-68.95,-130.0,0.0
25%,20210703.0,7.279063339485043e+18,6209604.0,143.0,215.0,4230946.0,4230946010001.0,1.0,0.97,1.09,0.0,0.0,0.0
50%,20211231.0,7.461252190221281e+18,12373151.0,302.0,333.0,5697239.0,5697239010001.0,1.0,1.62,1.89,0.0,0.0,0.0
75%,20220706.0,8.795113279699681e+18,58658275.0,303.0,1902.0,7046400.0,7046400010001.0,1.0,2.82,3.18,0.0,0.14,0.28
max,20221231.0,9.138164897063896e+18,133928977.0,888.0,9665.0,98892503.0,98892503010001.0,2111.11,8057.08,8540.5,487.5,367.22,999.0


In [42]:
# Count of missing values in each transaction column.
transaction.isnull().sum()

TIME_KEY                        0
TRANSACTION_ID_MASK             0
CUSTOMER_ACCOUNT_NR_MASK    10708
LOC_BRAND_CD                    0
LOCATION_CD                     0
POS_TP_CD                       0
SKU                             0
PRODUCT_KEY                     0
QTY                             0
NET_SLS_AMT                     0
GROSS_SLS_AMT                   0
PROD_DSCNT_ISSUED_AMT           0
TRANS_DSCNT_RAT_AMT             0
DIRECT_DSCNT_AMT                0
dtype: int64

In [43]:
# Number of transaction rows that are exact duplicates of an earlier row.
transaction.duplicated().sum()

0

# Data Preparation (more to be done...)

In [4]:
# Columns from the joined tables that the feature engineering below does not use.
unused_columns = [
    'LOC_BRAND_CD', 'PROD_DSCNT_ISSUED_AMT', 'NET_SLS_AMT', 'TRANS_DSCNT_RAT_AMT', 'DIRECT_DSCNT_AMT',
    'seg_lifestyle_dsc', 'SEG_AGE', 'SEG_AGE_DSC', 'seg_lifestage_dsc',
    'UNIT_BASE_DSC_EXT', 'SUBCAT_DSC_EXT', 'BIZ_UNIT_DSC_EXT', 'DEPARTMENT_DSC_EXT',
    'PRODUCT_SHORT_DSC', 'BRAND_DSC', 'BRAND_TYPE_CD', 'CONVERSION_FACTOR', 'CAPACITY_UNIT', 'PRODUCT_DSC', 'SKU',
    'LOCATION_CD', 'GROSS_SLS_AMT', 'CP4', 'CAT_DSC_EXT', 'PRODUCT_KEY', 'POS_TP_CD', 'PRICE_RANGE',
]
# Rebinds `df`, so running this cell twice raises because the columns are already gone.
df = df.drop(columns=unused_columns)

In [5]:
# Count of missing values per remaining column; only GENDER and FAMILY_MEMBERS have any
# in the output below.
df.isnull().sum()

CUSTOMER_ACCOUNT_NR_MASK         0
GENDER                      207890
FAMILY_MEMBERS              778011
seg_lifestyle_cd                 0
seg_lifestage_cd                 0
TIME_KEY                         0
TRANSACTION_ID_MASK              0
QTY                              0
UNIT_BASE_CD_EXT                 0
SUBCAT_CD_EXT                    0
CAT_CD_EXT                       0
BIZ_UNIT_CD_EXT                  0
DEPARTMENT_CD_EXT                0
dtype: int64

In [8]:
# Summary statistics for every remaining column, including the non-numeric GENDER.
df.describe(include='all')

Unnamed: 0,CUSTOMER_ACCOUNT_NR_MASK,GENDER,FAMILY_MEMBERS,seg_lifestyle_cd,seg_lifestage_cd,TIME_KEY,TRANSACTION_ID_MASK,QTY,UNIT_BASE_CD_EXT,SUBCAT_CD_EXT,CAT_CD_EXT,BIZ_UNIT_CD_EXT,DEPARTMENT_CD_EXT
count,4066247.0,3858357,3288236.0,4066247.0,4066247.0,4066247.0,4066247.0,4066247.0,4066247.0,4066247.0,4066247.0,4066247.0,4066247.0
unique,,2,,,,,,,,,,,
top,,F,,,,,,,,,,,
freq,,2546533,,,,,,,,,,,
mean,53169321.95,,2.76,1.81,4.22,20215846.71,-699585902733.85,1.42,5800117.13,58001.14,579.98,5.77,10.0
std,35562730.03,,3.53,0.78,0.79,5009.83,8.027785665767329e+17,1.36,4181744.18,41817.44,418.17,4.18,0.0
min,9450.0,,0.0,1.0,1.0,20210101.0,6.920988597626136e+18,0.27,1010101.0,10101.0,101.0,1.0,10.0
25%,19342043.0,,1.0,1.0,4.0,20210712.0,7.284105638989963e+18,1.0,2030203.0,20302.0,203.0,2.0,10.0
50%,58149188.0,,2.0,2.0,4.0,20220112.0,7.467197965326499e+18,1.0,5040402.0,50404.0,504.0,5.0,10.0
75%,83514782.0,,4.0,2.0,5.0,20220712.0,8.798150124427613e+18,1.0,8040606.0,80406.0,804.0,8.0,10.0


In [10]:
# Number of rows that are exact duplicates of an earlier row.
# NOTE(review): the output below is non-zero and no visible cell drops these rows;
# presumably they are distinct purchase lines that became identical once SKU and the
# other columns were dropped — confirm that keeping them is intended.
df.duplicated().sum()

377312

# Feature Engineering & ML Dataset Development

In [6]:
# Parse TIME_KEY (YYYYMMDD) into real timestamps and derive the calendar windows
# used by every feature below.
purchase_dates = pd.to_datetime(df['TIME_KEY'], format='%Y%m%d')
df['TIME_KEY'] = purchase_dates

df['MONTH'] = purchase_dates.dt.month
df['QUARTER'] = purchase_dates.dt.quarter
# `semester` is a project helper applied to the month number.
df['SEMESTER'] = df['MONTH'].apply(semester)
df['YEAR'] = purchase_dates.dt.year

In [7]:
# Keep only the customers selected by the project helper filter_customers
# (a frequency-score filter, per the original note).
df = filter_customers(df)
# Order rows chronologically, then by transaction id; the final dataset is shuffled again later.
chronological_keys = ['TIME_KEY', 'TRANSACTION_ID_MASK', 'YEAR', 'MONTH']
df = df.sort_values(by=chronological_keys)

In [8]:
# First purchase month of every customer, as a "YYYY-MM" string in column TIME_KEY.
# Taking the earliest TIME_KEY per customer gives the same answer as "first row after a
# chronological sort", but does not depend on how `df` happens to be ordered and avoids
# assigning into a slice of `df` (SettingWithCopyWarning).
first_purchase = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['TIME_KEY'].min()

# groupby sorts by customer id; reset_index turns the id back into a regular column so the
# result has exactly the two columns CUSTOMER_ACCOUNT_NR_MASK and TIME_KEY.
df_first_transaction = pd.to_datetime(first_purchase).dt.strftime('%Y-%m').reset_index()

In [9]:
# Distinct customers, subcategories, months and years present in the filtered transactions.
customer_ids = df['CUSTOMER_ACCOUNT_NR_MASK'].unique()
category_ids = df['SUBCAT_CD_EXT'].unique()
months = df['MONTH'].unique()
years = df['YEAR'].unique()

# One row per (customer, subcategory, month, year) combination.
# itertools.product is referenced through the module on purpose: the bare name `product`
# is rebound to the product-dimension DataFrame earlier in the notebook, so calling
# `product(...)` here fails when the notebook is run top to bottom.
ml_dataset = pd.DataFrame(list(itertools.product(customer_ids, category_ids, months, years)),
                          columns=['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT', 'MONTH', 'YEAR'])

# Quarter and semester follow directly from the month number.
quarter_map = {1: 1, 2: 1, 3: 1, 4: 2, 5: 2, 6: 2, 7: 3, 8: 3, 9: 3, 10: 4, 11: 4, 12: 4}
semester_map = {1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 2, 8: 2, 9: 2, 10: 2, 11: 2, 12: 2}

ml_dataset['QUARTER'] = ml_dataset['MONTH'].map(quarter_map)
ml_dataset['SEMESTER'] = ml_dataset['MONTH'].map(semester_map)

In [10]:
# "YYYY-MM" label of every grid row, built directly from YEAR and MONTH. Zero-padding the
# month makes plain string comparison agree with chronological order, and avoids parsing
# and re-formatting every row through datetime.
ml_dataset['TIME_KEY_agg'] = (
    ml_dataset['YEAR'].astype(str) + '-' + ml_dataset['MONTH'].astype(str).str.zfill(2)
)

# Attach each customer's first-purchase month (column TIME_KEY of df_first_transaction).
ml_dataset = pd.merge(ml_dataset, df_first_transaction, on='CUSTOMER_ACCOUNT_NR_MASK', how='left')

# Keep only the months at or after the customer's first purchase.
ml_dataset = ml_dataset[ml_dataset['TIME_KEY_agg'] >= ml_dataset['TIME_KEY']]

# The two helper columns are no longer needed.
ml_dataset = ml_dataset.drop(columns=['TIME_KEY_agg', 'TIME_KEY'])

# Per the original note, this removes roughly 8M rows that predate the first purchase.

In [11]:
# Order the grid by customer, subcategory and then time before computing features.
feature_sort_keys = ['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT', 'YEAR', 'MONTH']
ml_dataset = ml_dataset.sort_values(by=feature_sort_keys)

In [12]:
# TOTAL NUMBER OF ORDERS (per customer)

# One column per calendar window, produced by the project helper count_unique_transactions
# over the customer's rows in that window.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    # The yearly window groups by YEAR alone; the others add the sub-year period.
    window_keys = ['CUSTOMER_ACCOUNT_NR_MASK', 'YEAR'] + ([] if period == 'YEAR' else [period])
    feature = f'CUST_NUM_TRANSACTIONS_{period}'
    df[feature] = df.groupby(window_keys)['TRANSACTION_ID_MASK'].transform(count_unique_transactions)
    df[feature] = df[feature].astype(int)

# Carry each window's feature over to the modelling grid.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    ml_dataset = create_aggregations(df, ml_dataset, period, 'CUST_NUM_TRANSACTIONS', 'CUST', False)

In [13]:
# TOTAL QUANTITY BOUGHT BY CUSTOMER

# Running (cumulative) sum of QTY within each customer/window group, in current row order.
# NOTE(review): the cast to int truncates fractional totals (QTY has non-integer values).
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    window_keys = ['CUSTOMER_ACCOUNT_NR_MASK', 'YEAR'] + ([] if period == 'YEAR' else [period])
    feature = f'CUST_TOTAL_QTY_BOUGHT_{period}'
    df[feature] = df.groupby(window_keys)['QTY'].transform(pd.Series.cumsum)
    df[feature] = df[feature].astype(int)


# Carry each window's feature over to the modelling grid.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    ml_dataset = create_aggregations(df, ml_dataset, period, 'CUST_TOTAL_QTY_BOUGHT', 'CUST', False)

In [14]:
# NUMBER OF UNIQUE SUBCATEGORIES BOUGHT (per customer)

# One column per calendar window, produced by the project helper count_unique over the
# subcategory codes of the customer's rows in that window.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    window_keys = ['CUSTOMER_ACCOUNT_NR_MASK', 'YEAR'] + ([] if period == 'YEAR' else [period])
    feature = f'CUST_NUM_UNIQUE_SUBCAT_{period}'
    df[feature] = df.groupby(window_keys)['SUBCAT_CD_EXT'].transform(count_unique)
    df[feature] = df[feature].astype(int)


# Carry each window's feature over to the modelling grid.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    ml_dataset = create_aggregations(df, ml_dataset, period, 'CUST_NUM_UNIQUE_SUBCAT', 'CUST', False)

In [15]:
# AVERAGE DAYS SINCE THE CUSTOMER'S PREVIOUS TRANSACTION

# The project helper calculate_rolling_avg adds the per-window columns to df.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    df = calculate_rolling_avg(df, period, 'CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUST')

# Carry each window's feature over to the modelling grid.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    ml_dataset = create_aggregations(df, ml_dataset, period, 'CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUST', True)

# Regression feature: the value of the monthly column on the NEXT row of the same
# (customer, subcategory) group; the last row of each group gets NaN.
# NOTE(review): "next row" is "next month" only if ml_dataset is still ordered by
# YEAR, MONTH within each group at this point — confirm create_aggregations keeps row order.
monthly_by_pair = ml_dataset.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT'])['CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION_MONTH']
ml_dataset['REG_AVG_DAYS_SINCE_PRIOR_TRANSACTION_MONTH'] = monthly_by_pair.shift(-1)

In [16]:
# AVERAGE BASKET SIZE

# Basket size per window = value of the project helper `count` over the customer's rows in
# the window, divided by the customer's transaction count for the same window.
# Requires the CUST_NUM_TRANSACTIONS_* columns created earlier.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    window_keys = ['CUSTOMER_ACCOUNT_NR_MASK', 'YEAR'] + ([] if period == 'YEAR' else [period])
    lines_col = f'CUST_NUM_SUBCAT_{period}'
    df[lines_col] = df.groupby(window_keys)['SUBCAT_CD_EXT'].transform(count)
    df[f'CUST_AVG_BASKET_SIZE_{period}'] = df[lines_col] / df[f'CUST_NUM_TRANSACTIONS_{period}']


# Carry each window's feature over to the modelling grid.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    ml_dataset = create_aggregations(df, ml_dataset, period, 'CUST_AVG_BASKET_SIZE', 'CUST', True)

Subcategory features

In [17]:
# TOTAL NUMBER OF ORDERS (per subcategory)

# One column per calendar window, produced by the project helper count_unique_transactions
# over the subcategory's rows in that window.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    # The yearly window groups by YEAR alone; the others add the sub-year period.
    window_keys = ['SUBCAT_CD_EXT', 'YEAR'] + ([] if period == 'YEAR' else [period])
    feature = f'SUBCAT_NUM_TRANSACTIONS_{period}'
    df[feature] = df.groupby(window_keys)['TRANSACTION_ID_MASK'].transform(count_unique_transactions)
    df[feature] = df[feature].astype(int)


# Carry each window's feature over to the modelling grid.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    ml_dataset = create_aggregations(df, ml_dataset, period, 'SUBCAT_NUM_TRANSACTIONS', 'SUBCAT', False)

In [18]:
# TOTAL QUANTITY BOUGHT BY SUBCATEGORY

# Running (cumulative) sum of QTY within each subcategory/window group, in current row order.
# NOTE(review): the cast to int truncates fractional totals (QTY has non-integer values).
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    window_keys = ['SUBCAT_CD_EXT', 'YEAR'] + ([] if period == 'YEAR' else [period])
    feature = f'SUBCAT_TOTAL_QTY_BOUGHT_{period}'
    df[feature] = df.groupby(window_keys)['QTY'].transform(pd.Series.cumsum)
    df[feature] = df[feature].astype(int)


# Carry each window's feature over to the modelling grid.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    ml_dataset = create_aggregations(df, ml_dataset, period, 'SUBCAT_TOTAL_QTY_BOUGHT', 'SUBCAT', False)

In [19]:
# NUMBER OF UNIQUE CUSTOMERS WHO BOUGHT FROM A SUBCATEGORY

# One column per calendar window, produced by the project helper count_unique over the
# customer ids of the subcategory's rows in that window.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    window_keys = ['SUBCAT_CD_EXT', 'YEAR'] + ([] if period == 'YEAR' else [period])
    feature = f'SUBCAT_NUM_UNIQUE_CUST_{period}'
    df[feature] = df.groupby(window_keys)['CUSTOMER_ACCOUNT_NR_MASK'].transform(count_unique)
    df[feature] = df[feature].astype(int)


# Carry each window's feature over to the modelling grid.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    ml_dataset = create_aggregations(df, ml_dataset, period, 'SUBCAT_NUM_UNIQUE_CUST', 'SUBCAT', False)

Customer-Subcategory features

In [20]:
# TOTAL NUMBER OF ORDERS FOR A SUBCATEGORY BY A SPECIFIC CUSTOMER

# One column per calendar window, produced by the project helper count_unique_transactions
# over the rows of each (customer, subcategory) pair in that window.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    # The yearly window groups by YEAR alone; the others add the sub-year period.
    window_keys = ['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT', 'YEAR'] + ([] if period == 'YEAR' else [period])
    feature = f'CUSTSUBCAT_NUM_TRANSACTIONS_{period}'
    df[feature] = df.groupby(window_keys)['TRANSACTION_ID_MASK'].transform(count_unique_transactions)
    df[feature] = df[feature].astype(int)


# Carry each window's feature over to the modelling grid.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    ml_dataset = create_aggregations(df, ml_dataset, period, 'CUSTSUBCAT_NUM_TRANSACTIONS', 'CUSTSUBCAT', False)

In [21]:
# TOTAL QUANTITY BOUGHT FOR A SUBCATEGORY BY A SPECIFIC CUSTOMER

# Running (cumulative) sum of QTY within each (customer, subcategory)/window group,
# in current row order.
# NOTE(review): the cast to int truncates fractional totals (QTY has non-integer values).
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    window_keys = ['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT', 'YEAR'] + ([] if period == 'YEAR' else [period])
    feature = f'CUSTSUBCAT_TOTAL_QTY_BOUGHT_{period}'
    df[feature] = df.groupby(window_keys)['QTY'].transform(pd.Series.cumsum)
    df[feature] = df[feature].astype(int)


# Carry each window's feature over to the modelling grid.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    ml_dataset = create_aggregations(df, ml_dataset, period, 'CUSTSUBCAT_TOTAL_QTY_BOUGHT', 'CUSTSUBCAT', False)

In [23]:
# AVERAGE DAYS SINCE THE CUSTOMER'S PREVIOUS TRANSACTION IN THE SAME SUBCATEGORY

# The project helper calculate_rolling_avg adds the per-window columns to df.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    df = calculate_rolling_avg(df, period, 'CUSTSUBCAT_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUSTSUBCAT')

# Carry each window's feature over to the modelling grid.
for period in ('MONTH', 'QUARTER', 'SEMESTER', 'YEAR'):
    ml_dataset = create_aggregations(df, ml_dataset, period, 'CUSTSUBCAT_AVG_DAYS_SINCE_PRIOR_TRANSACTION', 'CUSTSUBCAT', True)

Other features

In [24]:
# Result of the project helper fill_missing_values applied to the transaction frame; the
# cells below read its customer attributes (GENDER, FAMILY_MEMBERS, segment codes).
# Note: this rebinds `customer`, which earlier held the raw dim_customer table.
customer = fill_missing_values(df)  # Fills the missing values of the customer columns

In [25]:
# Lookup from customer id to GENDER; when an id occurs on several rows the last one wins.
customer_dict = {
    customer_id: gender
    for customer_id, gender in zip(customer['CUSTOMER_ACCOUNT_NR_MASK'], customer['GENDER'])
}

# Attach the gender to every grid row; ids missing from the lookup become NaN.
ml_dataset['GENDER'] = ml_dataset['CUSTOMER_ACCOUNT_NR_MASK'].map(customer_dict)

In [26]:
# Lookup from customer id to FAMILY_MEMBERS; when an id occurs on several rows the last one wins.
# NOTE(review): the ml_dataset preview further down shows FAMILY_MEMBERS as "(3, 8)" on
# every visible row, so fill_missing_values seems to yield a pair/range here rather than
# a single number — confirm this is intended before modelling.
family_dict = {
    customer_id: family_size
    for customer_id, family_size in zip(customer['CUSTOMER_ACCOUNT_NR_MASK'], customer['FAMILY_MEMBERS'])
}

# Attach the household size to every grid row; ids missing from the lookup become NaN.
ml_dataset['FAMILY_MEMBERS'] = ml_dataset['CUSTOMER_ACCOUNT_NR_MASK'].map(family_dict)

In [27]:
# Lookup from customer id to lifestyle segment code; when an id occurs on several rows the last one wins.
lifestyle_dict = {
    customer_id: lifestyle_code
    for customer_id, lifestyle_code in zip(customer['CUSTOMER_ACCOUNT_NR_MASK'], customer['seg_lifestyle_cd'])
}

# Attach the lifestyle segment to every grid row; ids missing from the lookup become NaN.
ml_dataset['SEG_LIFESTYLE_CD'] = ml_dataset['CUSTOMER_ACCOUNT_NR_MASK'].map(lifestyle_dict)

In [28]:
# Lookup from customer id to lifestage segment code; when an id occurs on several rows the last one wins.
lifestage_dict = {
    customer_id: lifestage_code
    for customer_id, lifestage_code in zip(customer['CUSTOMER_ACCOUNT_NR_MASK'], customer['seg_lifestage_cd'])
}

# Attach the lifestage segment to every grid row; ids missing from the lookup become NaN.
ml_dataset['SEG_LIFESTAGE_CD'] = ml_dataset['CUSTOMER_ACCOUNT_NR_MASK'].map(lifestage_dict)

In [29]:
# Subcategory, category and price range of every product; the next cell uses the
# subcategory and category codes to build a lookup.
product_categories_sql = """
   SELECT SUBCAT_CD_EXT, CAT_CD_EXT, PRICE_RANGE
   FROM tables_raw.dim_product
   """
query = client.query(product_categories_sql)

# .result() blocks until the query job has finished.
products = query.result().to_dataframe()

In [30]:
# Lookup from subcategory code to its parent category code. It gets its own name so that
# `lifestage_dict` (customer id -> lifestage segment) is not overwritten with unrelated content.
subcat_to_cat = dict(zip(products['SUBCAT_CD_EXT'], products['CAT_CD_EXT']))

# Attach the parent category to every grid row; subcategories missing from the lookup become NaN.
ml_dataset['CAT_CD_EXT'] = ml_dataset['SUBCAT_CD_EXT'].map(subcat_to_cat)

In [33]:
# Preview of the assembled feature table (pandas shows only the first and last rows).
ml_dataset

Unnamed: 0,CUSTOMER_ACCOUNT_NR_MASK,SUBCAT_CD_EXT,MONTH,YEAR,QUARTER,SEMESTER,CUST_NUM_TRANSACTIONS_MONTH,CUST_NUM_TRANSACTIONS_QUARTER,CUST_NUM_TRANSACTIONS_SEMESTER,CUST_NUM_TRANSACTIONS_YEAR,CUST_TOTAL_QTY_BOUGHT_MONTH,CUST_TOTAL_QTY_BOUGHT_QUARTER,CUST_TOTAL_QTY_BOUGHT_SEMESTER,CUST_TOTAL_QTY_BOUGHT_YEAR,CUST_NUM_UNIQUE_SUBCAT_MONTH,CUST_NUM_UNIQUE_SUBCAT_QUARTER,CUST_NUM_UNIQUE_SUBCAT_SEMESTER,CUST_NUM_UNIQUE_SUBCAT_YEAR,CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION_MONTH,CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION_QUARTER,CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION_SEMESTER,CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION_YEAR,REG_AVG_DAYS_SINCE_PRIOR_TRANSACTION_MONTH,CUST_AVG_BASKET_SIZE_MONTH,CUST_AVG_BASKET_SIZE_QUARTER,CUST_AVG_BASKET_SIZE_SEMESTER,CUST_AVG_BASKET_SIZE_YEAR,SUBCAT_NUM_TRANSACTIONS_MONTH,SUBCAT_NUM_TRANSACTIONS_QUARTER,SUBCAT_NUM_TRANSACTIONS_SEMESTER,SUBCAT_NUM_TRANSACTIONS_YEAR,SUBCAT_TOTAL_QTY_BOUGHT_MONTH,SUBCAT_TOTAL_QTY_BOUGHT_QUARTER,SUBCAT_TOTAL_QTY_BOUGHT_SEMESTER,SUBCAT_TOTAL_QTY_BOUGHT_YEAR,SUBCAT_NUM_UNIQUE_CUST_MONTH,SUBCAT_NUM_UNIQUE_CUST_QUARTER,SUBCAT_NUM_UNIQUE_CUST_SEMESTER,SUBCAT_NUM_UNIQUE_CUST_YEAR,CUSTSUBCAT_NUM_TRANSACTIONS_MONTH,CUSTSUBCAT_NUM_TRANSACTIONS_QUARTER,CUSTSUBCAT_NUM_TRANSACTIONS_SEMESTER,CUSTSUBCAT_NUM_TRANSACTIONS_YEAR,CUSTSUBCAT_TOTAL_QTY_BOUGHT_MONTH,CUSTSUBCAT_TOTAL_QTY_BOUGHT_QUARTER,CUSTSUBCAT_TOTAL_QTY_BOUGHT_SEMESTER,CUSTSUBCAT_TOTAL_QTY_BOUGHT_YEAR,CUSTSUBCAT_AVG_DAYS_SINCE_PRIOR_TRANSACTION_MONTH,CUSTSUBCAT_AVG_DAYS_SINCE_PRIOR_TRANSACTION_QUARTER,CUSTSUBCAT_AVG_DAYS_SINCE_PRIOR_TRANSACTION_SEMESTER,CUSTSUBCAT_AVG_DAYS_SINCE_PRIOR_TRANSACTION_YEAR,GENDER,FAMILY_MEMBERS,SEG_LIFESTYLE_CD,SEG_LIFESTAGE_CD,CAT_CD_EXT
0,31655,10101,1,2021,1,1,2,2,2,2,9,9,9,9,6,6,6,6,8.00,8.00,8.00,8.00,0.00,4.00,4.00,4.00,4.00,2733,2733,2733,2733,4304,4304,4304,4304,1741,1741,1741,1741,0,0,0,0,0,0,0,0,0.00,0.00,0.00,0.00,F,"(3, 8)",2,4,101
1,31655,10101,2,2021,1,1,1,3,3,3,7,16,16,16,5,10,10,10,0.00,17.50,17.50,17.50,1.00,6.00,4.67,4.67,4.67,2640,5373,5373,5373,4140,8444,8444,8444,1676,2504,2504,2504,1,1,1,1,2,2,2,2,0.00,0.00,0.00,0.00,F,"(3, 8)",2,4,101
2,31655,10101,3,2021,1,1,2,5,5,5,16,32,32,32,5,14,14,14,1.00,15.25,15.25,15.25,1.00,6.50,5.40,5.40,5.40,2924,8297,8297,8297,4686,13130,13130,13130,1863,3090,3090,3090,0,0,0,0,0,0,0,0,0.00,0.00,0.00,0.00,F,"(3, 8)",2,4,101
3,31655,10101,4,2021,2,1,2,2,7,7,9,9,41,41,7,7,18,18,1.00,1.00,16.83,16.83,0.00,4.00,4.00,5.00,5.00,2577,2577,10874,10874,4200,4200,17330,17330,1760,1760,3511,3511,0,0,0,0,0,0,0,0,0.00,0.00,0.00,0.00,F,"(3, 8)",2,4,101
4,31655,10101,5,2021,2,1,1,3,8,8,12,21,53,53,8,12,22,22,0.00,7.00,16.29,16.29,0.00,10.00,6.00,5.62,5.62,2824,5401,13698,13698,4815,9015,22145,22145,1885,2731,3890,3890,0,0,0,0,0,0,0,0,0.00,0.00,0.00,0.00,F,"(3, 8)",2,4,101
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22197202,133624448,170305,12,2022,4,2,1,1,1,1,2,2,2,2,2,2,2,2,0.00,0.00,0.00,0.00,,2.00,2.00,2.00,2.00,103,188,272,421,114,214,306,479,91,138,178,258,0,0,0,0,0,0,0,0,0.00,0.00,0.00,0.00,F,"(3, 8)",4,6,1703
22197203,133624448,170307,12,2022,4,2,1,1,1,1,2,2,2,2,2,2,2,2,0.00,0.00,0.00,0.00,,2.00,2.00,2.00,2.00,580,840,1192,1724,993,1465,2029,2950,491,657,862,1126,0,0,0,0,0,0,0,0,0.00,0.00,0.00,0.00,F,"(3, 8)",4,6,1703
22197204,133624448,170309,12,2022,4,2,1,1,1,1,2,2,2,2,2,2,2,2,0.00,0.00,0.00,0.00,,2.00,2.00,2.00,2.00,131,223,342,567,152,250,395,656,117,186,264,372,0,0,0,0,0,0,0,0,0.00,0.00,0.00,0.00,F,"(3, 8)",4,6,1703
22197205,133624448,170310,12,2022,4,2,1,1,1,1,2,2,2,2,2,2,2,2,0.00,0.00,0.00,0.00,,2.00,2.00,2.00,2.00,375,603,854,1271,485,814,1177,1687,326,464,598,810,0,0,0,0,0,0,0,0,0.00,0.00,0.00,0.00,F,"(3, 8)",4,6,1703


Target

In [38]:
# Restore the customer / subcategory / time ordering, then let the project helper
# compute_target add the prediction target.
target_sort_keys = ['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT', 'YEAR', 'MONTH']
ml_dataset = compute_target(ml_dataset.sort_values(by=target_sort_keys))

# Load dataset into BigQuery

In [39]:
# Shuffle the rows. The fixed random_state makes the row order identical on every run, so
# the uploaded table and anything split from it downstream are reproducible.
ml_dataset = ml_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

In [40]:
#### SAVE DATAFRAME TO BIGQUERY ####
# Start the load job, then block until it has finished.
# NOTE(review): no job_config is passed, so the library's default write disposition
# applies; if that default appends, re-running this cell duplicates rows in the
# destination table — confirm.
load_job = client.load_table_from_dataframe(ml_dataset, 'tables_staging.df_models_1')
load_job.result()

LoadJob<project=continente-lced-feup, location=europe-southwest1, id=988de8eb-9888-4ede-9b6f-45d42f4f3b4b>