# Data Import & Connection to BigQuery

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from google.cloud import bigquery

# Render every column when a DataFrame is displayed (no truncation)
pd.set_option("display.max_columns", None)
# Show floats with two fixed decimals rather than scientific notation
pd.set_option("display.float_format", "{:.2f}".format)
# BigQuery client that the query cells below reuse
client = bigquery.Client(project="continente-lced-feup")



In [2]:
# Interactive browser login that stores Application Default Credentials (ADC) on this machine.
# NOTE(review): the BigQuery client is constructed in the cell above, i.e. before this login runs;
# on a machine without cached credentials this cell presumably has to be executed first - confirm.
!gcloud auth application-default login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=cyvg2FTo3bLRayouIsWtmQBbvUK0LW&access_type=offline&code_challenge=_qcV-EbcHj-SPFaTXJ_TwF3keqOSpskqi9ut9aWr2kg&code_challenge_method=S256


Credentials saved to file: [C:\Users\luish\AppData\Roaming\gcloud\application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).
Cannot find a quota project to add to ADC. You might receive a "quota exceeded" or "API not enabled" error. Run $ gcloud auth application-default set-quota-project to add a 

# Data Loading

In [3]:
# Submit a query that joins the transaction fact table with the customer, location and
# product dimensions, keeps only rows whose SEG_AGE_DSC is "]25;35]" and returns at most
# 100000 rows.
# NOTE(review): there is no ORDER BY, so which 100000 rows come back is presumably not
# guaranteed to be stable between runs - confirm.
query = client.query("""
   SELECT *
   FROM 
       tables_raw.fact_transaction 
       JOIN tables_raw.dim_customer USING(CUSTOMER_ACCOUNT_NR_MASK)
       JOIN tables_raw.dim_location USING(LOCATION_CD)
       JOIN tables_raw.dim_product USING(SKU)
   WHERE
       SEG_AGE_DSC="]25;35]"
   LIMIT 100000
   """)

df = query.result().to_dataframe() # Wait for the job to complete.
# Display the loaded frame as the cell output
df

Unnamed: 0,SKU,LOCATION_CD,CUSTOMER_ACCOUNT_NR_MASK,TIME_KEY,TRANSACTION_ID_MASK,LOC_BRAND_CD,POS_TP_CD,PRODUCT_KEY,QTY,NET_SLS_AMT,GROSS_SLS_AMT,PROD_DSCNT_ISSUED_AMT,TRANS_DSCNT_RAT_AMT,DIRECT_DSCNT_AMT,GENDER,FAMILY_MEMBERS,CP4,seg_lifestyle_cd,seg_lifestyle_dsc,SEG_AGE,SEG_AGE_DSC,seg_lifestage_cd,seg_lifestage_dsc,LOCATION_DSC,LOC_BRAND_CD_1,LOC_BRAND_DSC,cp7,PRODUCT_DSC,UNIT_BASE_CD_EXT,UNIT_BASE_DSC_EXT,SUBCAT_CD_EXT,SUBCAT_DSC_EXT,CAT_CD_EXT,CAT_DSC_EXT,BIZ_UNIT_CD_EXT,BIZ_UNIT_DSC_EXT,DEPARTMENT_CD_EXT,DEPARTMENT_DSC_EXT,PRODUCT_SHORT_DSC,BRAND_DSC,BRAND_TYPE_CD,PRICE_RANGE,CONVERSION_FACTOR,CAPACITY_UNIT
0,6749843,4578,80240952.00,20210627,7397271847911420510,302,P,6749843010001,1.00,0.00,0.00,0.00,0.00,0.10,F,0,3885,1,Qualidade,SA_3,]25;35],5,Active Adults,CBD ESMORIZ,302,continente bom dia,3885,CARTÃO DÁ TROCAS,78010602,78010602 - cartões dá,780106,780106 - packs_cart presente,7801,7801 - publica�ões,78,78 - Serviços,23,23 - BAZAR,CARTÃO DÁ TROCA,SONAE,MF,SECUNDARIA,1.00,UN
1,6437055,6989,35145587.00,20221216,9088179255559281379,302,P,6437055010001,2.00,3.00,3.18,0.00,0.00,0.00,F,2,8375,1,Qualidade,SA_3,]25;35],5,Active Adults,CBD FARO BOM JOAO,302,continente bom dia,8000,BEBIDA DE ARROZ CONTINENTE BIO 1L,8050150,08050150 - Beb Veg Arroz bio,80501,080501 - Beb Vegetal Arroz,805,0805 - Bebida&Creme Vegetal,8,08 - Laticínios/Beb. Veg.,10,10 - ALIMENTAR,BEBIDA ARROZ BI,CONTINENTE BIO,MP,SECUNDARIA,1.00,LT
2,7182869,3583,35206138.00,20220724,8653905904922031452,302,P,7182869010001,1.00,3.00,3.69,0.00,0.00,0.00,M,0,2865,3,Preço,SA_3,]25;35],4,Family with Kids,CBD SETUBAL VARIANTE,302,continente bom dia,2900,DOCE CASA MATEUS 4 FRUTOS 0%280G,14040201,14040201 - Compotas,140402,140402 - compotas,1404,1404 - mel e compotas,14,14 - Pequeno Almoço,10,10 - ALIMENTAR,DOCE4 FRUTOS 0%,CASA MATEUS,MF,SECUNDARIA,0.28,KG
3,6437059,1911,35203400.00,20211112,7248048036018236791,302,P,6437059010001,2.00,3.00,3.18,0.00,0.00,0.00,M,2,2890,1,Qualidade,SA_3,]25;35],4,Family with Kids,CBD ALCOCHETE,302,continente bom dia,2890,BEBIDA DE AMÊNDOA CONTINENTE BIO 1L,8050250,08050250 - Beb Veg Frtsec bio,80502,080502 - Beb Vegetal Frt seco,805,0805 - Bebida&Creme Vegetal,8,08 - Laticínios/Beb. Veg.,10,10 - ALIMENTAR,BEBIDA AMÊNDOA,CONTINENTE BIO,MP,SECUNDARIA,1.00,LT
4,7289692,8736,87555786.00,20210530,7404314561606875777,302,P,7289692010001,-1.00,-4.00,-4.00,0.00,0.00,0.00,F,1,4415,1,Qualidade,SA_3,]25;35],5,Active Adults,CBD CARVALHOS,302,continente bom dia,4415,CAIXA ORGANIZA�ÃO COM 9 DIVISÓRIAS CZ1,50010404,50010404 - organiza�ão bzr,500104,500104 - organiza�ão bzr,5001,5001 - casa bzr,50,50 - Bazarão,20,20 - NOVOS BAZAR,CAIXA DE ORGANI,CHINA LIGHT,MF,ECONOMICA,1.00,UN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,6840653,206,87626770.00,20210110,6978148042397353434,303,P,6840653010001,1.00,3.97,4.21,0.00,0.00,2.28,M,1,2870,3,Preço,SA_3,]25;35],4,Family with Kids,MDL MONTIJO,303,continente modelo,2870,TOALHITAS BEBÉ DODOT PURE 3X48UN,5040201,05040201 - toalhitas bebé,50402,050402 - toalhitas bebé&crian,504,0504 - hig e prote�ão bebé,5,05 - Higiene,10,10 - ALIMENTAR,TOALHITAS BEBÉ,DODOT,MF,SECUNDARIA,1.00,UN
99996,6840653,288,58118605.00,20210808,7439149271581960070,303,P,6840653010001,1.00,3.97,4.21,0.00,0.42,2.28,M,4,4780,1,Qualidade,SA_3,]25;35],4,Family with Kids,MDL S.FELIX MARINHA,303,continente modelo,4410,TOALHITAS BEBÉ DODOT PURE 3X48UN,5040201,05040201 - toalhitas bebé,50402,050402 - toalhitas bebé&crian,504,0504 - hig e prote�ão bebé,5,05 - Higiene,10,10 - ALIMENTAR,TOALHITAS BEBÉ,DODOT,MF,SECUNDARIA,1.00,UN
99997,6840653,1393,83591831.00,20211201,7301947962008974660,303,P,6840653010001,2.00,7.94,8.42,0.00,0.00,4.56,F,2,4750,1,Qualidade,SA_3,]25;35],4,Family with Kids,MDL BARCELOS,303,continente modelo,4750,TOALHITAS BEBÉ DODOT PURE 3X48UN,5040201,05040201 - toalhitas bebé,50402,050402 - toalhitas bebé&crian,504,0504 - hig e prote�ão bebé,5,05 - Higiene,10,10 - ALIMENTAR,TOALHITAS BEBÉ,DODOT,MF,SECUNDARIA,1.00,UN
99998,6840653,2075,104624095.00,20220122,7319192877063524329,303,P,6840653010001,2.00,7.94,8.42,0.00,0.59,4.56,F,,5090,2,Família,SA_3,]25;35],4,Family with Kids,MDL MIRANDELA,303,continente modelo,5370,TOALHITAS BEBÉ DODOT PURE 3X48UN,5040201,05040201 - toalhitas bebé,50402,050402 - toalhitas bebé&crian,504,0504 - hig e prote�ão bebé,5,05 - Higiene,10,10 - ALIMENTAR,TOALHITAS BEBÉ,DODOT,MF,SECUNDARIA,1.00,UN


# Data Preparation (more to be done...)

In [4]:
# Columns that are not used by the feature engineering below
columns_to_drop = [
    'LOC_BRAND_CD', 'PROD_DSCNT_ISSUED_AMT', 'NET_SLS_AMT', 'TRANS_DSCNT_RAT_AMT',
    'DIRECT_DSCNT_AMT', 'CP4',
    'seg_lifestyle_dsc', 'SEG_AGE', 'SEG_AGE_DSC', 'seg_lifestage_dsc',
    'LOCATION_DSC', 'LOC_BRAND_DSC', 'cp7',
    'UNIT_BASE_DSC_EXT', 'SUBCAT_CD_EXT', 'SUBCAT_DSC_EXT', 'BIZ_UNIT_DSC_EXT',
    'DEPARTMENT_DSC_EXT',
    'PRODUCT_SHORT_DSC', 'BRAND_DSC', 'BRAND_TYPE_CD', 'CONVERSION_FACTOR',
    'CAPACITY_UNIT',
]
df = df.drop(columns=columns_to_drop)

df  # TODO: should returns be removed?

Unnamed: 0,SKU,LOCATION_CD,CUSTOMER_ACCOUNT_NR_MASK,TIME_KEY,TRANSACTION_ID_MASK,POS_TP_CD,PRODUCT_KEY,QTY,GROSS_SLS_AMT,GENDER,FAMILY_MEMBERS,seg_lifestyle_cd,seg_lifestage_cd,LOC_BRAND_CD_1,PRODUCT_DSC,UNIT_BASE_CD_EXT,CAT_CD_EXT,BIZ_UNIT_CD_EXT,DEPARTMENT_CD_EXT,PRICE_RANGE
0,6749843,4578,80240952.00,20210627,7397271847911420510,P,6749843010001,1.00,0.00,F,0,1,5,302,CARTÃO DÁ TROCAS,78010602,7801,78,23,SECUNDARIA
1,6437055,6989,35145587.00,20221216,9088179255559281379,P,6437055010001,2.00,3.18,F,2,1,5,302,BEBIDA DE ARROZ CONTINENTE BIO 1L,8050150,805,8,10,SECUNDARIA
2,7182869,3583,35206138.00,20220724,8653905904922031452,P,7182869010001,1.00,3.69,M,0,3,4,302,DOCE CASA MATEUS 4 FRUTOS 0%280G,14040201,1404,14,10,SECUNDARIA
3,6437059,1911,35203400.00,20211112,7248048036018236791,P,6437059010001,2.00,3.18,M,2,1,4,302,BEBIDA DE AMÊNDOA CONTINENTE BIO 1L,8050250,805,8,10,SECUNDARIA
4,7289692,8736,87555786.00,20210530,7404314561606875777,P,7289692010001,-1.00,-4.00,F,1,1,5,302,CAIXA ORGANIZA�ÃO COM 9 DIVISÓRIAS CZ1,50010404,5001,50,20,ECONOMICA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,6840653,206,87626770.00,20210110,6978148042397353434,P,6840653010001,1.00,4.21,M,1,3,4,303,TOALHITAS BEBÉ DODOT PURE 3X48UN,5040201,504,5,10,SECUNDARIA
99996,6840653,288,58118605.00,20210808,7439149271581960070,P,6840653010001,1.00,4.21,M,4,1,4,303,TOALHITAS BEBÉ DODOT PURE 3X48UN,5040201,504,5,10,SECUNDARIA
99997,6840653,1393,83591831.00,20211201,7301947962008974660,P,6840653010001,2.00,8.42,F,2,1,4,303,TOALHITAS BEBÉ DODOT PURE 3X48UN,5040201,504,5,10,SECUNDARIA
99998,6840653,2075,104624095.00,20220122,7319192877063524329,P,6840653010001,2.00,8.42,F,,2,4,303,TOALHITAS BEBÉ DODOT PURE 3X48UN,5040201,504,5,10,SECUNDARIA


In [5]:
# Missing values per column.
# Open question: dropping the rows with nulls would lose data - find out what the nulls mean
# and consider imputing values instead.
df.isna().sum()

SKU                             0
LOCATION_CD                     0
CUSTOMER_ACCOUNT_NR_MASK        0
TIME_KEY                        0
TRANSACTION_ID_MASK             0
POS_TP_CD                       0
PRODUCT_KEY                     0
QTY                             0
GROSS_SLS_AMT                   0
GENDER                       5126
FAMILY_MEMBERS              18930
seg_lifestyle_cd                0
seg_lifestage_cd                0
LOC_BRAND_CD_1                  0
PRODUCT_DSC                     0
UNIT_BASE_CD_EXT                0
CAT_CD_EXT                      0
BIZ_UNIT_CD_EXT                 0
DEPARTMENT_CD_EXT               0
PRICE_RANGE                     0
dtype: int64

# Feature Engineering

Explicit Feature Engineering

In [6]:
# Parse the YYYYMMDD 'TIME_KEY' values into datetimes so the .dt accessor can be used
df['TIME_KEY'] = pd.to_datetime(df['TIME_KEY'], format='%Y%m%d')

# Calendar features: day of month, ISO week, day of week (Monday=0), month, quarter, year
df['DAY'] = df['TIME_KEY'].dt.day
# Series.dt.week is deprecated (it emits a FutureWarning) and was removed in pandas 2.0;
# isocalendar().week yields the same ISO week number. Cast from UInt32 to a plain int
# so the dtype matches the other calendar columns.
df['WEEK'] = df['TIME_KEY'].dt.isocalendar().week.astype(int)
df['DOW'] = df['TIME_KEY'].dt.dayofweek
df['MONTH'] = df['TIME_KEY'].dt.month
df['QUARTER'] = df['TIME_KEY'].dt.quarter
df['YEAR'] = df['TIME_KEY'].dt.year

  df['WEEK'] = df['TIME_KEY'].dt.week


Customer Feature Engineering

In [7]:
# TOTAL NUMBER OF ITEMS BOUGHT

# Order rows chronologically within each customer
df = df.sort_values(by=['CUSTOMER_ACCOUNT_NR_MASK', 'TIME_KEY'])

# 1-based running row counter per customer (one row = one purchased item line)
df['CUST_NUM_ITEMS_BOUGHT'] = 1 + df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['TIME_KEY'].cumcount()

In [8]:
# TOTAL NUMBER OF PRIOR ORDERS

def count_unique_transactions(customer_transactions):
    """Return, for each position, how many transaction changes preceded it.

    The sequence is walked in order; the counter starts at -1 and is bumped
    every time the current id differs from the one just before it (the first
    element is compared against ``None``). Consecutive repeats therefore share
    the same value, and the first run of a non-``None`` id gets 0.

    Parameters
    ----------
    customer_transactions : iterable
        Transaction ids of one customer, in the order they should be scanned.

    Returns
    -------
    list of int
        One counter value per input element.
    """
    running_counts = []
    changes_so_far = -1
    previous_id = None
    for current_id in customer_transactions:
        # a new run starts whenever the id differs from its predecessor
        changes_so_far = changes_so_far + 1 if current_id != previous_id else changes_so_far
        previous_id = current_id
        running_counts.append(changes_so_far)
    return running_counts

# Per customer, number of transaction changes seen before each row (0 for the first basket)
df['CUST_NUM_PRIOR_TRANSACTIONS'] = (
    df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['TRANSACTION_ID_MASK']
    .transform(count_unique_transactions)
    .astype(int)
)

In [9]:
# UNIQUE NUMBER OF CATEGORIES BOUGHT

def count_unique(customer_categories):
    """Return the running number of distinct values seen in a sequence.

    Element ``i`` of the result is the count of different values among the
    first ``i + 1`` elements of ``customer_categories``.

    A set is used for the membership bookkeeping, so the scan is linear in
    the input length instead of re-scanning an ever-growing list for every
    element. Values must be hashable.

    Parameters
    ----------
    customer_categories : iterable
        Category values of one customer, in the order they should be scanned.

    Returns
    -------
    list of int
        Running distinct count, one entry per input element.
    """
    seen = set()
    counts = []
    for category in customer_categories:
        seen.add(category)  # no-op when the category was already seen
        counts.append(len(seen))
    return counts

# Per customer, running number of distinct category codes bought up to and including each row
df['CUST_NUM_UNIQUE_CATEGORIES'] = (
    df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['CAT_CD_EXT']
    .transform(count_unique)
    .astype(int)
)

In [10]:
# TODO: UNIQUE NUMBER OF ORDERS - does this feature make sense at all?

In [11]:
# DAYS SINCE LAST TRANSACTION

# Date of the same customer's previous row (NaT on each customer's first row)
previous_purchase_date = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['TIME_KEY'].shift()
days_elapsed = (df['TIME_KEY'] - previous_purchase_date).dt.days
# -1 marks rows that have no earlier row for the customer
df['CUST_DAYS_SINCE_LAST_TRANSACTION'] = days_elapsed.fillna(-1).astype(int)

In [12]:
# AVERAGE DAYS SINCE LAST TRANSACTION

def calculate_rolling_avg(df):
    """Add ``CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION`` to a copy of ``df``.

    For every row the value is the customer's running sum of
    ``CUST_DAYS_SINCE_LAST_TRANSACTION`` divided by
    ``CUST_NUM_PRIOR_TRANSACTIONS``, rounded to two decimals. The first row
    of each customer (in the frame's current row order) contributes 0 to the
    running sum instead of its stored value.

    Rows whose ``CUST_NUM_PRIOR_TRANSACTIONS`` is 0 get 0 rather than
    NaN/inf, because an average over zero prior transactions is undefined.

    The computation is vectorised (no per-customer Python loop) and the
    input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``CUSTOMER_ACCOUNT_NR_MASK``,
        ``CUST_DAYS_SINCE_LAST_TRANSACTION`` and
        ``CUST_NUM_PRIOR_TRANSACTIONS``. Row order defines "first" and the
        accumulation order.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra column appended as the last column.
    """
    customer_ids = df['CUSTOMER_ACCOUNT_NR_MASK']

    # First row of each customer contributes 0 to the running sum
    is_first_row = df.groupby('CUSTOMER_ACCOUNT_NR_MASK').cumcount() == 0
    day_gaps = df['CUST_DAYS_SINCE_LAST_TRANSACTION'].astype(float).mask(is_first_row, 0.0)

    # Group by the raw key array so the grouping is positional and does not rely on index alignment
    running_gap_total = day_gaps.groupby(customer_ids.to_numpy()).cumsum()

    # Blank out zero denominators so the division yields NaN (later filled with 0) instead of inf
    prior_transactions = df['CUST_NUM_PRIOR_TRANSACTIONS'].where(
        df['CUST_NUM_PRIOR_TRANSACTIONS'] != 0
    )

    average_gap = (running_gap_total / prior_transactions).round(2).fillna(0)
    return df.assign(CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION=average_gap)


# Adds the CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION column
df = calculate_rolling_avg(df)

In [13]:
# SIZE OF THE LAST BASKET

def add_cust_last_transaction_count(df):
    """Add ``CUST_LAST_BASKET_SIZE``: row count of the customer's preceding basket.

    Steps (on a copy sorted by customer and date):

    1. Each row is tagged with the number of rows that share its
       (customer, transaction id) pair - the basket size.
    2. Within each customer that size is shifted down by one row, so every
       row sees the basket size of the row before it (0 for the customer's
       first row).
    3. All rows of one transaction id take the value of that transaction's
       first row, so a multi-row basket reports the size of the basket that
       preceded it rather than its own.

    Basket sizes are shifted instead of transaction ids, which avoids casting
    large integer ids to float (and the precision loss that comes with it),
    and no ``groupby.apply`` is used, so the result does not depend on the
    pandas ``group_keys`` default.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``CUSTOMER_ACCOUNT_NR_MASK``, ``TIME_KEY`` and
        ``TRANSACTION_ID_MASK``.

    Returns
    -------
    pandas.DataFrame
        Sorted copy of ``df`` with the integer ``CUST_LAST_BASKET_SIZE`` column.
    """
    df = df.sort_values(['CUSTOMER_ACCOUNT_NR_MASK', 'TIME_KEY'])

    # 1. number of rows in each (customer, transaction) basket, broadcast to its rows
    df['CUST_LAST_BASKET_SIZE'] = (
        df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'TRANSACTION_ID_MASK'])['TRANSACTION_ID_MASK']
        .transform('size')
    )

    # 2. look one row back within the customer; the first row has no predecessor -> 0
    df['CUST_LAST_BASKET_SIZE'] = (
        df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['CUST_LAST_BASKET_SIZE']
        .shift()
        .fillna(0)
        .astype(int)
    )

    # 3. every row of a transaction inherits the value of the transaction's first row
    first_row_value = df.groupby('TRANSACTION_ID_MASK')['CUST_LAST_BASKET_SIZE'].transform('first')
    # rows with a missing transaction id are not grouped; keep their own value
    df['CUST_LAST_BASKET_SIZE'] = first_row_value.fillna(df['CUST_LAST_BASKET_SIZE']).astype(int)
    return df

# Adds the CUST_LAST_BASKET_SIZE column (the returned frame is sorted by customer and date)
df = add_cust_last_transaction_count(df)

In [18]:
# AVERAGE LAST BASKET SIZE (NOT 100% CORRECT YET)

def calculate_rolling_avg_basket(df):
    """Add ``CUST_AVG_LAST_BASKET_SIZE`` to a sorted copy of ``df``.

    For every row the value is the customer's running sum of
    ``CUST_LAST_BASKET_SIZE`` divided by ``CUST_NUM_PRIOR_TRANSACTIONS``,
    rounded to two decimals. The first row of each customer contributes 0 to
    the running sum. Rows whose ``CUST_NUM_PRIOR_TRANSACTIONS`` is 0 get 0
    rather than NaN/inf.

    NOTE(review): the running sum is accumulated per row while the
    denominator counts transactions, so a basket with several rows adds the
    previous basket size once per row. This is presumably why the feature
    was marked "not 100% correct yet"; the arithmetic is kept as it was -
    confirm the intended definition before changing it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``CUSTOMER_ACCOUNT_NR_MASK``, ``TIME_KEY``,
        ``CUST_LAST_BASKET_SIZE`` and ``CUST_NUM_PRIOR_TRANSACTIONS``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` sorted by customer and date with the extra column.
    """
    df = df.sort_values(['CUSTOMER_ACCOUNT_NR_MASK', 'TIME_KEY'])

    # First row of each customer contributes 0 to the running sum
    is_first_row = df.groupby('CUSTOMER_ACCOUNT_NR_MASK').cumcount() == 0
    basket_sizes = df['CUST_LAST_BASKET_SIZE'].astype(float).mask(is_first_row, 0.0)

    # Group by the raw key array so the grouping is positional and does not rely on index alignment
    running_size_total = basket_sizes.groupby(df['CUSTOMER_ACCOUNT_NR_MASK'].to_numpy()).cumsum()

    # Blank out zero denominators so the division yields NaN (later filled with 0) instead of inf
    prior_transactions = df['CUST_NUM_PRIOR_TRANSACTIONS'].where(
        df['CUST_NUM_PRIOR_TRANSACTIONS'] != 0
    )

    df['CUST_AVG_LAST_BASKET_SIZE'] = (running_size_total / prior_transactions).round(2).fillna(0)
    return df


# Adds the CUST_AVG_LAST_BASKET_SIZE column (the returned frame is sorted by customer and date)
df = calculate_rolling_avg_basket(df)

Category features

In [None]:
# TOTAL NUMBER OF CATEGORIES BOUGHT

# Chronological order, so the rank below is a running purchase counter per category
df = df.sort_values('TIME_KEY')
# method='first' breaks same-day ties by row order, giving 1, 2, 3, ... within each category
df['CAT_NUM_CAT_BOUGHT'] = (
    df.groupby('CAT_DSC_EXT')['TIME_KEY'].rank(method='first').astype(int)
)
# Shuffle the rows; the seed is fixed so a Restart & Run All reproduces the same order
df = df.sample(frac=1, random_state=42)

In [None]:
# NUMBER OF CUSTOMERS WHO BOUGHT A CATEGORY

pair_keys = ['CUSTOMER_ACCOUNT_NR_MASK', 'CAT_DSC_EXT']
# Reassign instead of sorting in place, so the cell has no hidden in-place mutation
df = df.sort_values(pair_keys + ['TIME_KEY'])

# Flag exactly one row per (customer, category): the earliest one after the sort above.
# Comparing TIME_KEY with the pair's minimum date instead would flag every row bought on
# that first day and count the same customer several times.
first_appearance_mask = ~df.duplicated(subset=pair_keys, keep='first')

# Running number of flagged customers per category.
# NOTE(review): rows are ordered by customer id here, so the running count follows customer
# order rather than time - confirm this is the intended definition.
# The result is assigned in one expression: calling fillna(inplace=True) on a column
# selection does not update the frame under pandas Copy-on-Write.
df['CAT_CUST_BOUGHT_CAT'] = (
    first_appearance_mask.groupby(df['CAT_DSC_EXT']).cumsum().fillna(0).astype(int)
)

Customer-category features

In [None]:
# TOTAL NUMBER OF REORDERS PER CUSTOMER

# For each (customer, category) pair: how many of the pair's rows have an earlier date
# (rank with method='min' minus 1, so same-day rows share a value and the first purchase is 0).
# A single grouped rank replaces the per-pair loop, which rebuilt a full-frame boolean
# mask for every pair.
df['CAT_REORDER_NUM'] = (
    df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'CAT_DSC_EXT'])['TIME_KEY']
    .rank(method='min')
    .sub(1)
    .fillna(0)  # rows with a missing key or date keep the default of 0
    .astype(int)
)

In [None]:
# NUMBER OF CUSTOMERS WHO BOUGHT ONLY ONE TIME

# Row count per (customer, category) pair; kept as a dict for the row-wise helper below
counts = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'CAT_DSC_EXT']).size().to_dict()
def is_unique(row):
    """Return 1 if the row's (customer, category) pair occurs exactly once in ``counts``, else 0.

    Row-wise helper kept for backward compatibility; the column below is
    computed with a vectorised equivalent.
    """
    return 1 if counts[(row['CUSTOMER_ACCOUNT_NR_MASK'], row['CAT_DSC_EXT'])] == 1 else 0


# Vectorised form of df.apply(is_unique, axis=1): broadcast each pair's row count to its
# rows and flag the pairs that appear exactly once
df['CAT_CUS_ONLY_1T'] = (
    df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'CAT_DSC_EXT'])['CAT_DSC_EXT']
    .transform('size')
    .eq(1)
    .astype(int)
)

In [None]:
# DAYS SINCE A CUSTOMER BOUGHT A CATEGORY
# DAYS SINCE THE FIRST CATEGORY WAS BOUGHT


df = df.sort_values(['CUSTOMER_ACCOUNT_NR_MASK', 'TIME_KEY'])
category_pair = ['CUSTOMER_ACCOUNT_NR_MASK', 'CAT_DSC_EXT']

# Gap in days to the previous row of the same (customer, category); 0 on the pair's first row
df['DAYS_SINCE_LAST_PURCHASE'] = (
    df.groupby(category_pair)['TIME_KEY'].diff().dt.days.fillna(0)
)
# Accumulated gaps = days elapsed since the pair's first row
df['DAYS_SINCE_FIRST_PURCHASE'] = (
    df.groupby(category_pair)['DAYS_SINCE_LAST_PURCHASE'].cumsum().astype(int)
)

# Store both features as whole-day timedeltas
for day_column in ('DAYS_SINCE_FIRST_PURCHASE', 'DAYS_SINCE_LAST_PURCHASE'):
    df[day_column] = pd.to_timedelta(df[day_column], unit='D').dt.floor('D')

In [None]:
# LABEL

# Latest purchase date of each (customer, category) pair, broadcast to every row of the pair.
# transform keeps the frame's own index, so the result can be assigned straight back;
# groupby.apply returns a differently indexed result on pandas >= 2.0 (group_keys defaults to True).
pair_last_purchase = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'CAT_DSC_EXT'])['TIME_KEY'].transform('max')

# 1 when the row is dated 2022-01-01 or later AND lies no more than 365 days after the pair's
# last purchase.
# NOTE(review): TIME_KEY minus the pair's maximum is never positive, so the second condition
# is always true as written and the label reduces to "dated 2022 or later" - confirm the
# intended window before relying on this label.
df['LABEL'] = (
    (df['TIME_KEY'] >= pd.Timestamp('2022-01-01'))
    & ((df['TIME_KEY'] - pair_last_purchase) <= pd.Timedelta(days=365))
).astype(int)

In [21]:
# Shuffle the rows; the seed is fixed so a Restart & Run All reproduces the same order
df = df.sample(frac=1, random_state=42)
# Preview the first ten rows of the shuffled frame
df.head(10)

Unnamed: 0,SKU,LOCATION_CD,CUSTOMER_ACCOUNT_NR_MASK,TIME_KEY,TRANSACTION_ID_MASK,POS_TP_CD,PRODUCT_KEY,QTY,GROSS_SLS_AMT,GENDER,FAMILY_MEMBERS,seg_lifestyle_cd,seg_lifestage_cd,LOC_BRAND_CD_1,PRODUCT_DSC,UNIT_BASE_CD_EXT,CAT_CD_EXT,BIZ_UNIT_CD_EXT,DEPARTMENT_CD_EXT,PRICE_RANGE,DAY,WEEK,DOW,MONTH,QUARTER,YEAR,CUST_NUM_ITEMS_BOUGHT,CUST_NUM_PRIOR_TRANSACTIONS,CUST_NUM_UNIQUE_CATEGORIES,CUST_DAYS_SINCE_LAST_TRANSACTION,CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION,CUST_LAST_BASKET_SIZE,CUST_AVG_LAST_BASKET_SIZE
51196,7209848,1707,9450.0,2021-02-08,6971105328407029615,P,7209848010001,1.0,0.84,F,3,2,5,302,CANELA PAU CONTINENTE 17G,1020404,102,1,10,SECUNDARIA,8,6,0,2,1,2021,1,0,1,-1,0.0,0,0.0
7829,2907920,4860,9450.0,2022-05-14,8870991484202297335,P,2907920010001,2.0,1.24,F,3,2,5,302,(I)RISSOIS CAMARAO SOLARA CONG UN,18010201,1801,18,12,SECUNDARIA,14,19,5,5,2,2022,2,1,2,460,460.0,1,1.0
25760,5697161,222,31655.0,2021-02-16,7015142966983204826,P,5697161010001,1.0,1.29,F,3,2,4,303,IOG MYTHOS CNT MORANGO 4*125G,8040302,804,8,10,PROPRIA,16,7,1,2,1,2021,1,0,1,-1,0.0,0,0.0
22833,6949938,222,31655.0,2021-07-16,7453425676156549230,P,6949938010001,1.0,1.46,F,3,2,4,303,Q C/SAB YOCO SUIS MOR/MAC/PER/ALP 6*42G,8040503,804,8,10,SECUNDARIA,16,28,4,7,3,2021,2,1,1,150,150.0,1,1.0
30256,2050555,222,31655.0,2021-12-26,7292114608739590201,P,2050555010001,3.0,3.03,F,3,2,4,303,"REF. S/GAS JOI LAR./ MARACUJA 1,5L",3020401,302,3,10,SECUNDARIA,26,51,6,12,4,2021,3,2,2,163,156.5,1,1.0
10382,2824596,222,31655.0,2022-09-29,8722014037223134572,P,2824596010001,0.3,2.75,F,3,2,4,303,QJ FLAMENGO BARRA INT TERRA NOSTRA KG,13010101,1301,13,12,SECUNDARIA,29,39,3,9,3,2022,4,3,3,277,196.67,1,1.0
51601,7151135,222,31655.0,2022-11-14,8768018954966553904,P,7151135010001,1.0,1.55,F,3,2,4,303,CALVÉ MAIONESE TD 240G,1020102,102,1,10,SECUNDARIA,14,46,0,11,4,2022,5,4,4,46,159.0,1,1.0
36496,7394615,1054,36467.0,2021-06-10,7417365885809941026,P,7394615010001,6.0,6.0,M,5,2,4,303,(I) O NOSSO CROISSANT 120GR (AB)(AV),16020603,1602,16,12,SECUNDARIA,10,23,3,6,2,2021,1,0,1,-1,0.0,0,0.0
19022,7621886,1054,36467.0,2022-10-07,8730022861244227230,P,7621886010001,1.0,2.79,M,5,2,4,303,CREME P/BARRAR PLANTA SM 400G,8020301,802,8,10,SECUNDARIA,7,40,4,10,4,2022,2,1,2,484,484.0,1,1.0
65741,4953341,471,37912.0,2021-01-06,6972859340408002950,P,4953341010001,1.0,3.59,M,7,2,4,303,CHAMPÔ ELVIVE FULL RESIST 400ML,9020101,902,9,10,PREMIUM,6,1,2,1,1,2021,1,0,1,-1,0.0,0,0.0
