# Data Import & Connection to BigQuery

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from google.cloud import bigquery

# set display options to show all columns
pd.set_option('display.max_columns', None)
# Set the float format to display numbers without scientific notation
pd.options.display.float_format = '{:.2f}'.format
# Set the client for future queries to BigQuery
client = bigquery.Client(project = "continente-lced-feup")

In [2]:
!gcloud auth application-default login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=JWkQmCIZr8LTJxVwz5pHh1lcjKKqVn&access_type=offline&code_challenge=89RQUF6VNoU9RTI2_TshiojGT5VQuJo9I-4NG0pk4BU&code_challenge_method=S256


Credentials saved to file: [C:\Users\iankk\AppData\Roaming\gcloud\application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).

Quota project "egd-project" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning t

# Data Loading

In [3]:
query = client.query("""
   SELECT *
   FROM 
       tables_raw.fact_transaction 
       JOIN tables_raw.dim_customer USING(CUSTOMER_ACCOUNT_NR_MASK)
       JOIN tables_raw.dim_location USING(LOCATION_CD)
       JOIN tables_raw.dim_product USING(SKU)
   WHERE
       SEG_AGE_DSC="]25;35]"
   LIMIT 100000
   """)

df = query.result().to_dataframe() # Wait for the job to complete.
df

Unnamed: 0,SKU,LOCATION_CD,CUSTOMER_ACCOUNT_NR_MASK,TIME_KEY,TRANSACTION_ID_MASK,LOC_BRAND_CD,POS_TP_CD,PRODUCT_KEY,QTY,NET_SLS_AMT,GROSS_SLS_AMT,PROD_DSCNT_ISSUED_AMT,TRANS_DSCNT_RAT_AMT,DIRECT_DSCNT_AMT,GENDER,FAMILY_MEMBERS,CP4,seg_lifestyle_cd,seg_lifestyle_dsc,SEG_AGE,SEG_AGE_DSC,seg_lifestage_cd,seg_lifestage_dsc,LOCATION_DSC,LOC_BRAND_CD_1,LOC_BRAND_DSC,cp7,PRODUCT_DSC,UNIT_BASE_CD_EXT,UNIT_BASE_DSC_EXT,SUBCAT_CD_EXT,SUBCAT_DSC_EXT,CAT_CD_EXT,CAT_DSC_EXT,BIZ_UNIT_CD_EXT,BIZ_UNIT_DSC_EXT,DEPARTMENT_CD_EXT,DEPARTMENT_DSC_EXT,PRODUCT_SHORT_DSC,BRAND_DSC,BRAND_TYPE_CD,PRICE_RANGE,CONVERSION_FACTOR,CAPACITY_UNIT
0,6759523,2632,58149188.00,20221112,8766115481888522219,302,P,6759523010001,-1.00,-2.00,-2.00,0.00,0.00,0.00,F,2,2810,4,Sem Valor,SA_3,]25;35],4,Family with Kids,CBD CHARNECA CAPARIC,302,continente bom dia,2820,TAÇA LISA BRANCA,38010503,38010503 - taças/pratos,380105,380105 - pequeno almoço,3801,3801 - mesa,38,38 - Casa-Mesa/Mobiliário,21,21 - CASA,TAÇA CEREAIS,KASA,MP,ECONOMICA,1.00,UN
1,4379971,8524,12117951.00,20211217,7283100217535699831,302,P,4379971010001,1.00,3.00,3.69,0.00,0.00,0.00,F,2,2830,2,Família,SA_3,]25;35],4,Family with Kids,CBD BAIXA BANHEIRA,302,continente bom dia,2835,COCKTAIL BAR ROYAL MELON 75CL,17031304,17031304 - Rtd/Cocktail,170313,170313 - espumant estrangeiro,1703,1703 - espirit/espum/fortif,17,17 - Vinho e Espirituosas,10,10 - ALIMENTAR,COCKT BAR ROYAL,BAR ROYAL,MF,SECUNDARIA,0.75,LT
2,6544394,2076,83612158.00,20220120,7316968592084935238,302,P,6544394010001,1.00,3.00,3.69,0.00,0.00,0.00,F,4,2605,1,Qualidade,SA_3,]25;35],4,Family with Kids,CBD CANEÇAS,302,continente bom dia,1685,COCKTAIL BAR ROYAL STRAWBERRY 75CL,17031304,17031304 - Rtd/Cocktail,170313,170313 - espumant estrangeiro,1703,1703 - espirit/espum/fortif,17,17 - Vinho e Espirituosas,10,10 - ALIMENTAR,COCKT BAR ROYAL,BAR ROYAL,MF,SECUNDARIA,0.75,LT
3,5719353,329,26570946.00,20210824,7492427219493738561,302,P,5719353010002,1.00,3.00,3.69,0.00,0.00,0.00,F,4,4905,1,Qualidade,SA_3,]25;35],5,Active Adults,CBD VIZ VIANA EST.,302,continente bom dia,4900,LAMINA DESCART BIC FLEX 3 CLASSIC 3+1UN,9010801,09010801 - pr. barbear impulso,90108,090108 - pr. homem impulso,901,0901 - produtos para homem,9,09 - Beleza,10,10 - ALIMENTAR,LÂM DESC BIC,BIC,MF,SECUNDARIA,3.00,UN
4,5316720,4418,19433355.00,20221126,9103077598547050382,302,P,5316720010001,1.00,3.00,3.69,0.00,0.00,3.70,F,4,3080,1,Qualidade,SA_3,]25;35],5,Active Adults,CBD BUARCOS FIG FOZ,302,continente bom dia,3080,LOCAO CORPO LACTOVIT NUTRICAO 400ML,9040101,09040101 - Leite/Lo�ão/Gel,90401,090401 - corpo,904,0904 - cuidados do corpo,9,09 - Beleza,10,10 - ALIMENTAR,LOCAO LACTOVIT,LACTOVIT,MF,SECUNDARIA,0.40,LT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,6840653,3886,92228356.00,20210601,7408351512713966652,303,P,6840653010001,1.00,3.97,4.21,0.00,0.63,2.28,F,0,0000,1,Qualidade,SA_3,]25;35],4,Family with Kids,MDL LISBOA MARECHAL,303,continente modelo,1800,TOALHITAS BEBÉ DODOT PURE 3X48UN,5040201,05040201 - toalhitas bebé,50402,050402 - toalhitas bebé&crian,504,0504 - hig e prote�ão bebé,5,05 - Higiene,10,10 - ALIMENTAR,TOALHITAS BEBÉ,DODOT,MF,SECUNDARIA,1.00,UN
99996,6840653,241,80239180.00,20220314,8847230074213977340,303,P,6840653010001,1.00,3.97,4.21,0.00,0.00,2.28,F,2,6000,2,Família,SA_3,]25;35],4,Family with Kids,MDL C. BRANCO,303,continente modelo,6000,TOALHITAS BEBÉ DODOT PURE 3X48UN,5040201,05040201 - toalhitas bebé,50402,050402 - toalhitas bebé&crian,504,0504 - hig e prote�ão bebé,5,05 - Higiene,10,10 - ALIMENTAR,TOALHITAS BEBÉ,DODOT,MF,SECUNDARIA,1.00,UN
99997,7049892,3366,71475037.00,20220524,8880003564909201412,303,P,7049892010001,-1.00,-2.19,-2.19,0.00,0.00,0.00,F,3,3050,1,Qualidade,SA_3,]25;35],4,Family with Kids,MDL FIGUEIRA FOZ,303,continente modelo,3080,TOALHITAS CFARME WATER ESSENCIAL 56UN,5040205,05040205 - toalh be/cri eco/bio,50402,050402 - toalhitas bebé&crian,504,0504 - hig e prote�ão bebé,5,05 - Higiene,10,10 - ALIMENTAR,TOA WATER 56UN,CORINE FARME,MF,SECUNDARIA,1.00,UN
99998,6283101,842,19514535.00,20220625,8913150315591533253,303,P,6283101010001,2.00,2.84,2.98,0.00,0.00,0.00,F,2,9060,3,Preço,SA_3,]25;35],4,Family with Kids,MDL CANCELA,303,continente modelo,9050,TOALHITAS SUAVIDADE CNT DO BEBÉ 64UN,5040201,05040201 - toalhitas bebé,50402,050402 - toalhitas bebé&crian,504,0504 - hig e prote�ão bebé,5,05 - Higiene,10,10 - ALIMENTAR,TOALH. SUAV,CONT.DO BEBÉ,MP,PROPRIA,64.00,UN


# Data Preparation (more to be done...)

In [4]:
df = df.drop(columns=['LOC_BRAND_CD','PROD_DSCNT_ISSUED_AMT','NET_SLS_AMT','TRANS_DSCNT_RAT_AMT','DIRECT_DSCNT_AMT','CP4',
                 'seg_lifestyle_dsc','SEG_AGE','SEG_AGE_DSC','seg_lifestage_dsc','LOCATION_DSC','LOC_BRAND_DSC','cp7',
                 'UNIT_BASE_DSC_EXT','SUBCAT_CD_EXT','SUBCAT_DSC_EXT','BIZ_UNIT_DSC_EXT','DEPARTMENT_DSC_EXT',
                'PRODUCT_SHORT_DSC','BRAND_DSC','BRAND_TYPE_CD','CONVERSION_FACTOR','CAPACITY_UNIT'])

df  # Remover devoluções?

Unnamed: 0,SKU,LOCATION_CD,CUSTOMER_ACCOUNT_NR_MASK,TIME_KEY,TRANSACTION_ID_MASK,POS_TP_CD,PRODUCT_KEY,QTY,GROSS_SLS_AMT,GENDER,FAMILY_MEMBERS,seg_lifestyle_cd,seg_lifestage_cd,LOC_BRAND_CD_1,PRODUCT_DSC,UNIT_BASE_CD_EXT,CAT_CD_EXT,CAT_DSC_EXT,BIZ_UNIT_CD_EXT,DEPARTMENT_CD_EXT,PRICE_RANGE
0,6759523,2632,58149188.00,20221112,8766115481888522219,P,6759523010001,-1.00,-2.00,F,2,4,4,302,TAÇA LISA BRANCA,38010503,3801,3801 - mesa,38,21,ECONOMICA
1,4379971,8524,12117951.00,20211217,7283100217535699831,P,4379971010001,1.00,3.69,F,2,2,4,302,COCKTAIL BAR ROYAL MELON 75CL,17031304,1703,1703 - espirit/espum/fortif,17,10,SECUNDARIA
2,6544394,2076,83612158.00,20220120,7316968592084935238,P,6544394010001,1.00,3.69,F,4,1,4,302,COCKTAIL BAR ROYAL STRAWBERRY 75CL,17031304,1703,1703 - espirit/espum/fortif,17,10,SECUNDARIA
3,5719353,329,26570946.00,20210824,7492427219493738561,P,5719353010002,1.00,3.69,F,4,1,5,302,LAMINA DESCART BIC FLEX 3 CLASSIC 3+1UN,9010801,901,0901 - produtos para homem,9,10,SECUNDARIA
4,5316720,4418,19433355.00,20221126,9103077598547050382,P,5316720010001,1.00,3.69,F,4,1,5,302,LOCAO CORPO LACTOVIT NUTRICAO 400ML,9040101,904,0904 - cuidados do corpo,9,10,SECUNDARIA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,6840653,3886,92228356.00,20210601,7408351512713966652,P,6840653010001,1.00,4.21,F,0,1,4,303,TOALHITAS BEBÉ DODOT PURE 3X48UN,5040201,504,0504 - hig e prote�ão bebé,5,10,SECUNDARIA
99996,6840653,241,80239180.00,20220314,8847230074213977340,P,6840653010001,1.00,4.21,F,2,2,4,303,TOALHITAS BEBÉ DODOT PURE 3X48UN,5040201,504,0504 - hig e prote�ão bebé,5,10,SECUNDARIA
99997,7049892,3366,71475037.00,20220524,8880003564909201412,P,7049892010001,-1.00,-2.19,F,3,1,4,303,TOALHITAS CFARME WATER ESSENCIAL 56UN,5040205,504,0504 - hig e prote�ão bebé,5,10,SECUNDARIA
99998,6283101,842,19514535.00,20220625,8913150315591533253,P,6283101010001,2.00,2.98,F,2,3,4,303,TOALHITAS SUAVIDADE CNT DO BEBÉ 64UN,5040201,504,0504 - hig e prote�ão bebé,5,10,PROPRIA


In [5]:
df.isnull().sum()  # Maybe we shouldn't remove null values... What they mean? Insert values so that we don't lose precious data

SKU                             0
LOCATION_CD                     0
CUSTOMER_ACCOUNT_NR_MASK        0
TIME_KEY                        0
TRANSACTION_ID_MASK             0
POS_TP_CD                       0
PRODUCT_KEY                     0
QTY                             0
GROSS_SLS_AMT                   0
GENDER                       4991
FAMILY_MEMBERS              18748
seg_lifestyle_cd                0
seg_lifestage_cd                0
LOC_BRAND_CD_1                  0
PRODUCT_DSC                     0
UNIT_BASE_CD_EXT                0
CAT_CD_EXT                      0
CAT_DSC_EXT                     0
BIZ_UNIT_CD_EXT                 0
DEPARTMENT_CD_EXT               0
PRICE_RANGE                     0
dtype: int64

# Feature Engineering

Explicit Features Engineering

In [6]:
# convert the 'TIME_KEY' column to datetime format
df['TIME_KEY'] = pd.to_datetime(df['TIME_KEY'], format='%Y%m%d')

# create new columns for the day, week, day of the week, month, quarter, and year
df['DAY'] = df['TIME_KEY'].dt.day
df['WEEK'] = df['TIME_KEY'].dt.isocalendar().week
df['DOW'] = df['TIME_KEY'].dt.dayofweek
df['MONTH'] = df['TIME_KEY'].dt.month
df['QUARTER'] = df['TIME_KEY'].dt.quarter
df['YEAR'] = df['TIME_KEY'].dt.year

Customer Feature Engineering

In [7]:
# TOTAL NUMBER OF ITEMS BOUGHT

# sort the data by customer account and date
df = df.sort_values(['CUSTOMER_ACCOUNT_NR_MASK', 'TIME_KEY'])

# group the data by customer account and count the number of transactions up to that date
df['CUST_NUM_ITEMS_BOUGHT'] = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['TIME_KEY'].cumcount()+1

In [8]:
# TOTAL NUMBER OF PRIOR ORDERS

def count_unique_transactions(customer_transactions):
    prev_transaction = None
    count = -1
    counts = []
    for transaction in customer_transactions:
        if transaction != prev_transaction:
            count += 1
        counts.append(count)
        prev_transaction = transaction
    return counts

df['CUST_NUM_PRIOR_TRANSACTIONS'] = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['TRANSACTION_ID_MASK'].transform(count_unique_transactions)
df['CUST_NUM_PRIOR_TRANSACTIONS'] = df['CUST_NUM_PRIOR_TRANSACTIONS'].astype(int)

In [9]:
# UNIQUE NUMBER OF CATEGORIES BOUGHT

def count_unique(customer_categories):
    prev_categories = []
    count = 0
    counts = []
    for category in customer_categories:
        if category not in prev_categories:
            count += 1
        counts.append(count)
        prev_categories.append(category)
    return counts

df['CUST_NUM_UNIQUE_CATEGORIES'] = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['CAT_CD_EXT'].transform(count_unique)
df['CUST_NUM_UNIQUE_CATEGORIES'] = df['CUST_NUM_UNIQUE_CATEGORIES'].astype(int)

In [10]:
# UNIQUE NUMBER OF ORDERS??? Makes sense at all?

In [11]:
# DAYS SINCE LAST TRANSACTION

df['LAST_TIME_KEY'] = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['TIME_KEY'].shift()
df['CUST_DAYS_SINCE_LAST_TRANSACTION'] = (df['TIME_KEY'] - df['LAST_TIME_KEY']).dt.days.fillna(-1)
df['CUST_DAYS_SINCE_LAST_TRANSACTION'] = df['CUST_DAYS_SINCE_LAST_TRANSACTION'].astype(int)
df = df.drop(columns=['LAST_TIME_KEY'])

In [12]:
# AVERAGE DAYS SINCE LAST TRANSACTION

def calculate_rolling_avg(df):
    customer_groups = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')
    for name, group in customer_groups:
        # Calculate the rolling average for each customer
        rolling_avg = group['CUST_DAYS_SINCE_LAST_TRANSACTION'].rolling(window=1, min_periods=1).mean()
        # Reset the rolling average to 0 for the first transaction of each customer
        rolling_avg.iloc[0] = 0
        # Set the rolling average values back into the original dataframe
        df.loc[group.index, 'rolling_avg_days_since_prior_transaction'] = rolling_avg.values
    # Calculate the sum and count of the rolling average for each customer
    df['rolling_avg_days_since_prior_transaction_sum'] = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['rolling_avg_days_since_prior_transaction'].cumsum()
    # Calculate the customer average for the rolling average
    df['CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION'] = df['rolling_avg_days_since_prior_transaction_sum'] / df['CUST_NUM_PRIOR_TRANSACTIONS']
    # Round the customer average to 2 decimal places
    df['CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION'] = df['CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION'].round(2).fillna(0)
    df = df.drop(columns=['rolling_avg_days_since_prior_transaction','rolling_avg_days_since_prior_transaction_sum'])
    return df


df = calculate_rolling_avg(df)

In [13]:
# SIZE OF THE LAST BASKET

def add_cust_last_transaction_count(df):
    df = df.sort_values(['CUSTOMER_ACCOUNT_NR_MASK', 'TIME_KEY'])
    
    df['CUST_LAST_BASKET_SIZE'] = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['TRANSACTION_ID_MASK'].apply(lambda x: x.shift().map(x.value_counts()).fillna(0)).astype(int)
    
    # update values for rows with repeated transaction IDs
    for _, group in df.groupby('TRANSACTION_ID_MASK'):
        if len(group) > 1:
            first_row_index = group.index[0]
            df.loc[group.index, 'CUST_LAST_BASKET_SIZE'] = df.at[first_row_index, 'CUST_LAST_BASKET_SIZE'] 
    return df

df = add_cust_last_transaction_count(df)

In [14]:
# AVERAGE LAST BASKET SIZE (NOT 100% CORRECT YET)

def calculate_rolling_avg_basket(df):
    df = df.sort_values(['CUSTOMER_ACCOUNT_NR_MASK', 'TIME_KEY'])
    customer_groups = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')
    for name, group in customer_groups:
        # Calculate the rolling average for each customer
        rolling_avg = group['CUST_LAST_BASKET_SIZE'].rolling(window=1, min_periods=1).mean()
        # Reset the rolling average to 0 for the first transaction of each customer
        rolling_avg.iloc[0] = 0
        # Set the rolling average values back into the original dataframe
        df.loc[group.index, 'rolling_avg_days_since_prior_transaction'] = rolling_avg.values
    # Calculate the sum and count of the rolling average for each customer
    df['rolling_avg_days_since_prior_transaction_sum'] = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['rolling_avg_days_since_prior_transaction'].cumsum()
    # Calculate the customer average for the rolling average
    df['CUST_AVG_LAST_BASKET_SIZE'] = df['rolling_avg_days_since_prior_transaction_sum'] / df['CUST_NUM_PRIOR_TRANSACTIONS']
    # Round the customer average to 2 decimal places
    df['CUST_AVG_LAST_BASKET_SIZE'] = df['CUST_AVG_LAST_BASKET_SIZE'].round(2).fillna(0)
    df = df.drop(columns=['rolling_avg_days_since_prior_transaction','rolling_avg_days_since_prior_transaction_sum'])
    return df


df = calculate_rolling_avg_basket(df)

Category features

In [15]:
# TOTAL NUMBER OF CATEGORIES BOUGHT

df = df.sort_values('TIME_KEY')
df['CAT_NUM_CAT_BOUGHT'] = df.groupby('CAT_DSC_EXT')['TIME_KEY'].rank(method='first')
df['CAT_NUM_CAT_BOUGHT'] = df['CAT_NUM_CAT_BOUGHT'].astype(int)
df = df.sample(frac = 1)

In [16]:
# NUMBER OF CUSTOMERS WHO BOUGHT A CATEGORY

df.sort_values(['CUSTOMER_ACCOUNT_NR_MASK', 'CAT_DSC_EXT', 'TIME_KEY'], inplace=True)
first_appearance_mask = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'CAT_DSC_EXT'])['TIME_KEY'].transform(lambda x: x == x.min())
df['CAT_CUST_BOUGHT_CAT'] = first_appearance_mask.groupby(df['CAT_DSC_EXT']).cumsum()
df['CAT_CUST_BOUGHT_CAT'].fillna(0, inplace=True)
df['CAT_CUST_BOUGHT_CAT'] = df['CAT_CUST_BOUGHT_CAT'].astype(int)

Customer-category features

In [17]:
# TOTAL NUMBER OF REORDERS PER CUSTOMER

df['CAT_REORDER_NUM'] = 0
for customer, category in df[['CUSTOMER_ACCOUNT_NR_MASK', 'CAT_DSC_EXT']].drop_duplicates().values:
    mask = (df['CUSTOMER_ACCOUNT_NR_MASK'] == customer) & (df['CAT_DSC_EXT'] == category)
    df.loc[mask, 'CAT_REORDER_NUM'] = df[mask]['TIME_KEY'].rank(method='min') - 1

KeyboardInterrupt: 

In [18]:
# NUMBER OF CUSTOMERS WHO BOUGHT ONLT ONE TIME

counts = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'CAT_DSC_EXT']).size().to_dict()
def is_unique(row):
    if counts[(row['CUSTOMER_ACCOUNT_NR_MASK'], row['CAT_DSC_EXT'])] == 1:
        return 1
    else:
        return 0


df['CAT_CUS_ONLY_1T'] = df.apply(is_unique, axis=1)

In [19]:
# DAYS SINCE A CUSTOMER BOUGHT A CATEGORY
# DAYS SINCE THE FIRST CATEGORY WAS BOUGHT


df = df.sort_values(['CUSTOMER_ACCOUNT_NR_MASK', 'TIME_KEY'])
df['DAYS_SINCE_LAST_PURCHASE'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'CAT_DSC_EXT'])['TIME_KEY'].diff().dt.days.fillna(0)
df['DAYS_SINCE_FIRST_PURCHASE'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'CAT_DSC_EXT'])['DAYS_SINCE_LAST_PURCHASE'].cumsum()
df['DAYS_SINCE_FIRST_PURCHASE'] = df['DAYS_SINCE_FIRST_PURCHASE'].astype(int)

df['DAYS_SINCE_FIRST_PURCHASE'] = pd.to_timedelta(df['DAYS_SINCE_FIRST_PURCHASE'], unit='D')
df['DAYS_SINCE_LAST_PURCHASE'] = pd.to_timedelta(df['DAYS_SINCE_LAST_PURCHASE'], unit='D')

# df.drop('DAYS_SINCE_LAST_PURCHASE', axis=1, inplace=True)

df['DAYS_SINCE_LAST_PURCHASE'] = df['DAYS_SINCE_LAST_PURCHASE'].dt.floor('D')
df['DAYS_SINCE_FIRST_PURCHASE'] = df['DAYS_SINCE_FIRST_PURCHASE'].dt.floor('D')

In [20]:
#LABEL

df['LABEL'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'CAT_DSC_EXT'])['TIME_KEY'].apply(lambda x: ((x >= pd.Timestamp('2022-01-01')) & ((x - x.max()) <= pd.Timedelta(days=365))).astype(int))

In [21]:
df = df.sample(frac = 1)
df.head(10)

Unnamed: 0,SKU,LOCATION_CD,CUSTOMER_ACCOUNT_NR_MASK,TIME_KEY,TRANSACTION_ID_MASK,POS_TP_CD,PRODUCT_KEY,QTY,GROSS_SLS_AMT,GENDER,FAMILY_MEMBERS,seg_lifestyle_cd,seg_lifestage_cd,LOC_BRAND_CD_1,PRODUCT_DSC,UNIT_BASE_CD_EXT,CAT_CD_EXT,CAT_DSC_EXT,BIZ_UNIT_CD_EXT,DEPARTMENT_CD_EXT,PRICE_RANGE,DAY,WEEK,DOW,MONTH,QUARTER,YEAR,CUST_NUM_ITEMS_BOUGHT,CUST_NUM_PRIOR_TRANSACTIONS,CUST_NUM_UNIQUE_CATEGORIES,CUST_DAYS_SINCE_LAST_TRANSACTION,CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION,CUST_LAST_BASKET_SIZE,CUST_AVG_LAST_BASKET_SIZE,CAT_NUM_CAT_BOUGHT,CAT_CUST_BOUGHT_CAT,CAT_REORDER_NUM,CAT_CUS_ONLY_1T,DAYS_SINCE_LAST_PURCHASE,DAYS_SINCE_FIRST_PURCHASE,LABEL
39341,2595860,4121,6554495.0,2021-03-05,6996172590076725228,P,2595860010001,2.0,2.9,F,3.0,2,5,302,MINI TOSTAS CONTINENTE 350GR,16030205,1603,1603 - pão industrial&tosta,16,12,PROPRIA,5,9,4,3,1,2021,2,1,2,45,45.0,1,1.0,183,157,0.0,0,0 days,0 days,0
21886,7214074,247,83748199.0,2021-05-04,7379247172656540348,P,7214074010001,1.0,1.88,M,,3,5,303,IOG OIKOS DANONE AMORA 4*110G,8040302,804,0804 - iogurtes e sobrem.,8,10,SECUNDARIA,4,18,1,5,2,2021,2,1,2,62,62.0,1,1.0,900,2389,0.0,1,0 days,0 days,0
33267,3807647,245,10344850.0,2022-07-07,8672876355407761699,P,3807647010001,2.0,5.16,F,0.0,1,5,303,AGUA C/GAS PET .PEDRAS LIMÃO 4*33CL,3040303,304,0304 - águas,3,10,SECUNDARIA,7,27,3,7,3,2022,13,9,9,152,56.56,1,1.89,1525,230,1.0,0,152 days,152 days,1
47588,4233565,2754,83520001.0,2021-10-09,7213049291742119219,P,4233565010001,2.0,1.54,F,,2,4,302,SACOS P/CONGELA�ÃO CONTINENTE 50 UND,6040504,604,0604 - prod papel e consum.,6,10,PROPRIA,9,40,5,10,4,2021,10,9,9,3,30.89,1,1.0,763,1092,0.0,1,0 days,0 days,0
10110,6579062,4961,69169289.0,2021-12-27,7293117933008839769,P,6579062010001,1.0,4.19,F,0.0,3,5,302,QJ MINI BABYBEL LIGHT 12*20G,13010603,1301,1301 - queijos,13,12,SECUNDARIA,27,52,0,12,4,2021,40,36,15,19,9.61,1,1.22,1419,1287,0.0,0,118 days,217 days,0
34681,7371625,4895,6752523.0,2022-10-30,8787806158274192514,P,7371625010001,5.0,1.05,F,0.0,1,4,303,(AT) CACETINHO 87G (CONG),16010501,1601,1601 - pão tradicional,16,12,SECUNDARIA,30,43,6,10,4,2022,113,97,40,4,6.79,1,1.36,2198,168,4.0,0,14 days,617 days,1
88901,5063155,231,86769304.0,2022-11-28,9106146357010108116,P,5063155010001,1.52,1.5,F,4.0,2,4,303,CENOURA KG,15020602,1502,1502 - legumes,15,12,SECUNDARIA,28,48,0,11,4,2022,86,76,37,8,9.07,1,1.24,4912,2240,0.0,0,597 days,663 days,1
72821,4353196,1908,58308505.0,2022-05-31,8924076632449941610,P,4353196010001,1.0,2.29,F,5.0,3,5,302,(I) CARAMELOS DE FRUTA 500G,2020301,202,0202 - doçaria,2,10,ECONOMICA,31,22,1,5,2,2022,1,0,1,-1,0.0,0,0.0,1882,965,0.0,1,0 days,0 days,1
74271,4240474,3883,19104132.0,2022-04-15,8842007618508859900,P,4240474010001,1.0,0.64,F,,3,5,302,COELHO CEMOI CHOCO LEITE 60G,2020501,202,0202 - doçaria,2,10,SECUNDARIA,15,15,4,4,2,2022,15,12,12,77,35.33,1,1.33,1718,409,0.0,1,0 days,0 days,1
71267,7408497,278,102771240.0,2022-07-09,8675905531418401999,P,7408497010001,1.0,2.49,F,3.0,3,5,303,GEL MAG CHO LEITE AMÊN CONTINENTE 6*120M,7030101,703,0703 - sobremesas cong,7,10,SECUNDARIA,9,27,5,7,3,2022,5,4,5,20,82.25,1,1.0,662,638,0.0,1,0 days,0 days,1


### Load dataset into BigQuery

In [24]:
#### SAVE DATAFRAME TO BIGQUERY
#df.to_gbq(destination_table='data_mining_dataset.data_mining_dataset', project_id='data-mining-310618', if_exists='replace')
client = bigquery.Client(project = "continente-lced-feup")
client.load_table_from_dataframe(df, 'tables_staging.df_models').result()

  ", ".join(field.name for field in unknown_type_fields)


LoadJob<project=continente-lced-feup, location=europe-southwest1, id=f09aa090-edd8-4f6e-96d5-d51ba8b1c76d>