# Data Import & Connection to BigQuery

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from google.cloud import bigquery
# set display options to show all columns
pd.set_option('display.max_columns', None)
# Set the float format to display numbers without scientific notation
pd.options.display.float_format = '{:.2f}'.format
# Set the client for future queries to BigQuery
client = bigquery.Client(project = "continente-lced-feup")

In [3]:
!gcloud auth application-default login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=0a2qTbhOXrSd0ntr9VpTECsdgpzJxq&access_type=offline&code_challenge=pDwYlzqeoVoA9JZB_7qsG-BKKmUy-kwKtHXBbY1z2KQ&code_challenge_method=S256


Credentials saved to file: [C:\Users\luish\AppData\Roaming\gcloud\application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).
Cannot find a quota project to add to ADC. You might receive a "quota exceeded" or "API not enabled" error. Run $ gcloud auth application-default set-quota-project to add a 

# Data Loading

In [37]:
query = client.query("""
   SELECT *
   FROM 
       tables_raw.fact_transaction 
       JOIN tables_raw.dim_customer USING(CUSTOMER_ACCOUNT_NR_MASK)
       JOIN tables_raw.dim_location USING(LOCATION_CD)
       JOIN tables_raw.dim_product USING(SKU)
   WHERE
       SEG_AGE_DSC="]25;35]"
       AND CAT_CD_EXT IN (1701, 0804, 1703, 0302, 0801, 0203, 0102, 0603, 0503, 0902, 0604, 1002, 0601, 0303, 0202, 1403, 1402, 0103, 0702, 0504, 0101, 0502, 0304, 1001)
   LIMIT 100000
   """)

df = query.result().to_dataframe() # Wait for the job to complete.
df

Unnamed: 0,SKU,LOCATION_CD,CUSTOMER_ACCOUNT_NR_MASK,TIME_KEY,TRANSACTION_ID_MASK,LOC_BRAND_CD,POS_TP_CD,PRODUCT_KEY,QTY,NET_SLS_AMT,GROSS_SLS_AMT,PROD_DSCNT_ISSUED_AMT,TRANS_DSCNT_RAT_AMT,DIRECT_DSCNT_AMT,GENDER,FAMILY_MEMBERS,CP4,seg_lifestyle_cd,seg_lifestyle_dsc,SEG_AGE,SEG_AGE_DSC,seg_lifestage_cd,seg_lifestage_dsc,LOCATION_DSC,LOC_BRAND_CD_1,LOC_BRAND_DSC,cp7,PRODUCT_DSC,UNIT_BASE_CD_EXT,UNIT_BASE_DSC_EXT,SUBCAT_CD_EXT,SUBCAT_DSC_EXT,CAT_CD_EXT,CAT_DSC_EXT,BIZ_UNIT_CD_EXT,BIZ_UNIT_DSC_EXT,DEPARTMENT_CD_EXT,DEPARTMENT_DSC_EXT,PRODUCT_SHORT_DSC,BRAND_DSC,BRAND_TYPE_CD,PRICE_RANGE,CONVERSION_FACTOR,CAPACITY_UNIT
0,7085749,282,29430523.00,20210312,7002964964819120454,302,P,7085749010001,1.00,2.00,2.46,0.00,0.00,1.33,M,2,4505,2,Família,SA_3,]25;35],4,Family with Kids,CBD VIZ S. JOÃO VER,302,continente bom dia,4520,ANTI-RAÇAS RAID GANCHOS PERF LAV 2UN,6030503,06030503 - Anti-Traças,60305,060305 - inseticidas,603,0603 - Limpeza e Conforto,6,06 - Limpeza do Lar,10,10 - ALIMENTAR,RAID GANCHO 2UN,RAID,MF,SECUNDARIA,2.00,UN
1,6340248,2404,83710943.00,20210626,7396055308971025169,302,P,6340248010001,1.00,3.00,3.69,0.00,0.00,0.00,M,0,3850,3,Preço,SA_3,]25;35],5,Active Adults,CBD ALBERGARIA,302,continente bom dia,3850,SIDRA C/ALC.T/P CONTINENTE 6X33CL,3030402,03030402 - Sidra Original - TP,30304,030304 - Sidras,303,0303 - cervejas,3,03 - Soft Drinks,10,10 - ALIMENTAR,SIDRA CONT 6X33,CONTINENTE,MP,PROPRIA,1.98,LT
2,6841782,2632,106200134.00,20210305,6996172658162819381,302,P,6841782010001,1.00,3.00,3.69,0.00,0.55,2.06,F,0,2855,1,Qualidade,SA_3,]25;35],4,Family with Kids,CBD CHARNECA CAPARIC,302,continente bom dia,2820,CERV S/ ALC HEINEKEN 0.0% T/P 6X25CL,3030602,03030602 - S/ Alc. Brancas - TP,30306,030306 - Cervejas sem álcool,303,0303 - cervejas,3,03 - Soft Drinks,10,10 - ALIMENTAR,HEINEKEN,HEINEKEN ZERO,MF,SECUNDARIA,1.50,LT
3,6340248,9663,6036837.00,20220918,8747052483133782708,302,P,6340248010001,1.00,3.00,3.69,0.00,0.00,0.00,F,2,4430,2,Família,SA_3,]25;35],4,Family with Kids,CBD CANEDO,302,continente bom dia,4525,SIDRA C/ALC.T/P CONTINENTE 6X33CL,3030402,03030402 - Sidra Original - TP,30304,030304 - Sidras,303,0303 - cervejas,3,03 - Soft Drinks,10,10 - ALIMENTAR,SIDRA CONT 6X33,CONTINENTE,MP,PROPRIA,1.98,LT
4,6891551,2401,35264545.00,20221019,8776851041952592805,302,P,6891551010001,2.00,3.00,3.18,0.00,0.00,0.00,F,3,4520,3,Preço,SA_3,]25;35],5,Active Adults,CBD BARBOSA BOCAGE,302,continente bom dia,1050,CAPPUCCINO GO CHILL DELTA 230 ML,8010104,08010104 - Pasteuriz.Aromatizad,80101,080101 - leite pasteurizado,801,0801 - leites e natas,8,08 - Laticínios/Beb. Veg.,10,10 - ALIMENTAR,CAPPUCCINO GO C,DELTA,MF,SECUNDARIA,0.23,LT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2952760,3192,595418.00,20210524,7364278627519205963,303,P,2952760010001,2.00,1.63,2.00,0.00,0.25,1.00,F,3,2675,3,Preço,SA_3,]25;35],4,Family with Kids,MDL ALTA LISBOA,303,continente modelo,1750,"REF.C/GÁS FANTA MARACUJÁ ZERO 1,5 L",3020301,03020301 - Ref. C/Gas Regulares,30203,030203 - Ref com gás,302,0302 - refrigerantes,3,03 - Soft Drinks,10,10 - ALIMENTAR,REF.C/GÁS FANTA,FANTA,MF,SECUNDARIA,1.50,LT
99996,2051511,201,103819501.00,20210820,7488229349175656838,303,P,2051511010001,1.00,0.48,0.59,0.00,0.00,0.06,,,,3,Preço,SA_3,]25;35],6,No Value,MDL ALBUFEIRA,303,continente modelo,8200,REF.C/GAS ANANAS FANTA LATA 33CL,3020301,03020301 - Ref. C/Gas Regulares,30203,030203 - Ref com gás,302,0302 - refrigerantes,3,03 - Soft Drinks,10,10 - ALIMENTAR,REF.C/GAS FANTA,FANTA,MF,?,0.33,LT
99997,2274633,279,59881194.00,20221231,9138164800309256985,303,P,2274633010001,1.00,2.43,2.99,0.00,0.00,1.00,M,1,3320,3,Preço,SA_3,]25;35],3,Family w/ Young Adul,MDL FUNDÃO,303,continente modelo,6230,REF.C/GAS MACA CHAMPOMY 75CL,3020301,03020301 - Ref. C/Gas Regulares,30203,030203 - Ref com gás,302,0302 - refrigerantes,3,03 - Soft Drinks,10,10 - ALIMENTAR,R.C/G CHAMPOMY,CHAMPOMY,MF,?,0.75,LT
99998,3621849,801,87674438.00,20220410,8838244452710486998,303,P,3621849010001,1.00,0.94,1.09,0.00,0.00,0.40,M,1,9500,3,Preço,SA_3,]25;35],4,Family with Kids,MDL P Delgad-Açores,303,continente modelo,9500,"REF.C/GÁS FANTA UVA ZERO 1,5L",3020302,03020302 - C/ Gas Low Cal,30203,030203 - Ref com gás,302,0302 - refrigerantes,3,03 - Soft Drinks,10,10 - ALIMENTAR,REF.C/GÁS FANTA,FANTA,MF,SECUNDARIA,1.50,LT


# Data Preparation (more to be done...)

In [38]:
df = df.drop(columns=['LOC_BRAND_CD','PROD_DSCNT_ISSUED_AMT','NET_SLS_AMT','TRANS_DSCNT_RAT_AMT','DIRECT_DSCNT_AMT','CP4',
                 'seg_lifestyle_dsc','SEG_AGE','SEG_AGE_DSC','seg_lifestage_dsc','LOCATION_DSC','LOC_BRAND_DSC','cp7',
                 'UNIT_BASE_DSC_EXT','SUBCAT_DSC_EXT','BIZ_UNIT_DSC_EXT','DEPARTMENT_DSC_EXT',
                'PRODUCT_SHORT_DSC','BRAND_DSC','BRAND_TYPE_CD','CONVERSION_FACTOR','CAPACITY_UNIT'])

df  # Remover devoluções?

Unnamed: 0,SKU,LOCATION_CD,CUSTOMER_ACCOUNT_NR_MASK,TIME_KEY,TRANSACTION_ID_MASK,POS_TP_CD,PRODUCT_KEY,QTY,GROSS_SLS_AMT,GENDER,FAMILY_MEMBERS,seg_lifestyle_cd,seg_lifestage_cd,LOC_BRAND_CD_1,PRODUCT_DSC,UNIT_BASE_CD_EXT,SUBCAT_CD_EXT,CAT_CD_EXT,CAT_DSC_EXT,BIZ_UNIT_CD_EXT,DEPARTMENT_CD_EXT,PRICE_RANGE
0,7085749,282,29430523.00,20210312,7002964964819120454,P,7085749010001,1.00,2.46,M,2,2,4,302,ANTI-RAÇAS RAID GANCHOS PERF LAV 2UN,6030503,60305,603,0603 - Limpeza e Conforto,6,10,SECUNDARIA
1,6340248,2404,83710943.00,20210626,7396055308971025169,P,6340248010001,1.00,3.69,M,0,3,5,302,SIDRA C/ALC.T/P CONTINENTE 6X33CL,3030402,30304,303,0303 - cervejas,3,10,PROPRIA
2,6841782,2632,106200134.00,20210305,6996172658162819381,P,6841782010001,1.00,3.69,F,0,1,4,302,CERV S/ ALC HEINEKEN 0.0% T/P 6X25CL,3030602,30306,303,0303 - cervejas,3,10,SECUNDARIA
3,6340248,9663,6036837.00,20220918,8747052483133782708,P,6340248010001,1.00,3.69,F,2,2,4,302,SIDRA C/ALC.T/P CONTINENTE 6X33CL,3030402,30304,303,0303 - cervejas,3,10,PROPRIA
4,6891551,2401,35264545.00,20221019,8776851041952592805,P,6891551010001,2.00,3.18,F,3,3,5,302,CAPPUCCINO GO CHILL DELTA 230 ML,8010104,80101,801,0801 - leites e natas,8,10,SECUNDARIA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2952760,3192,595418.00,20210524,7364278627519205963,P,2952760010001,2.00,2.00,F,3,3,4,303,"REF.C/GÁS FANTA MARACUJÁ ZERO 1,5 L",3020301,30203,302,0302 - refrigerantes,3,10,SECUNDARIA
99996,2051511,201,103819501.00,20210820,7488229349175656838,P,2051511010001,1.00,0.59,,,3,6,303,REF.C/GAS ANANAS FANTA LATA 33CL,3020301,30203,302,0302 - refrigerantes,3,10,?
99997,2274633,279,59881194.00,20221231,9138164800309256985,P,2274633010001,1.00,2.99,M,1,3,3,303,REF.C/GAS MACA CHAMPOMY 75CL,3020301,30203,302,0302 - refrigerantes,3,10,?
99998,3621849,801,87674438.00,20220410,8838244452710486998,P,3621849010001,1.00,1.09,M,1,3,4,303,"REF.C/GÁS FANTA UVA ZERO 1,5L",3020302,30203,302,0302 - refrigerantes,3,10,SECUNDARIA


In [39]:
df.isnull().sum()  # Maybe we shouldn't remove null values... What they mean? Insert values so that we don't lose precious data

SKU                             0
LOCATION_CD                     0
CUSTOMER_ACCOUNT_NR_MASK        0
TIME_KEY                        0
TRANSACTION_ID_MASK             0
POS_TP_CD                       0
PRODUCT_KEY                     0
QTY                             0
GROSS_SLS_AMT                   0
GENDER                       4956
FAMILY_MEMBERS              18848
seg_lifestyle_cd                0
seg_lifestage_cd                0
LOC_BRAND_CD_1                  0
PRODUCT_DSC                     0
UNIT_BASE_CD_EXT                0
SUBCAT_CD_EXT                   0
CAT_CD_EXT                      0
CAT_DSC_EXT                     0
BIZ_UNIT_CD_EXT                 0
DEPARTMENT_CD_EXT               0
PRICE_RANGE                     0
dtype: int64

# Feature Engineering

Explicit Features Engineering

In [40]:
# convert the 'TIME_KEY' column to datetime format
df['TIME_KEY'] = pd.to_datetime(df['TIME_KEY'], format='%Y%m%d')

# create new columns for the day, week, day of the week, month, quarter, and year
df['DAY'] = df['TIME_KEY'].dt.day
df['WEEK'] = df['TIME_KEY'].dt.week
df['DOW'] = df['TIME_KEY'].dt.dayofweek
df['MONTH'] = df['TIME_KEY'].dt.month
df['QUARTER'] = df['TIME_KEY'].dt.quarter
df['YEAR'] = df['TIME_KEY'].dt.year

  df['WEEK'] = df['TIME_KEY'].dt.week


Customer Feature Engineering

In [41]:
# TOTAL NUMBER OF ITEMS BOUGHT

# sort the data by customer account and date
df = df.sort_values(['CUSTOMER_ACCOUNT_NR_MASK', 'TIME_KEY'])

# group the data by customer account and count the number of transactions up to that date
df['CUST_NUM_ITEMS_BOUGHT'] = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['TIME_KEY'].cumcount()+1

In [42]:
# TOTAL NUMBER OF PRIOR ORDERS

def count_unique_transactions(customer_transactions):
    prev_transaction = None
    count = -1
    counts = []
    for transaction in customer_transactions:
        if transaction != prev_transaction:
            count += 1
        counts.append(count)
        prev_transaction = transaction
    return counts

df['CUST_NUM_PRIOR_TRANSACTIONS'] = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['TRANSACTION_ID_MASK'].transform(count_unique_transactions)
df['CUST_NUM_PRIOR_TRANSACTIONS'] = df['CUST_NUM_PRIOR_TRANSACTIONS'].astype(int)

In [44]:
# UNIQUE NUMBER OF SUBCATEGORIES BOUGHT

def count_unique(customer_categories):
    prev_categories = []
    count = 0
    counts = []
    for category in customer_categories:
        if category not in prev_categories:
            count += 1
        counts.append(count)
        prev_categories.append(category)
    return counts

df['CUST_NUM_UNIQUE_SUBCAT'] = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['SUBCAT_CD_EXT'].transform(count_unique)
df['CUST_NUM_UNIQUE_SUBCAT'] = df['CUST_NUM_UNIQUE_SUBCAT'].astype(int)

In [11]:
# UNIQUE NUMBER OF ORDERS??? Makes sense at all?

In [45]:
# DAYS SINCE LAST TRANSACTION

df['LAST_TIME_KEY'] = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['TIME_KEY'].shift()
df['CUST_DAYS_SINCE_LAST_TRANSACTION'] = (df['TIME_KEY'] - df['LAST_TIME_KEY']).dt.days.fillna(-1)
df['CUST_DAYS_SINCE_LAST_TRANSACTION'] = df['CUST_DAYS_SINCE_LAST_TRANSACTION'].astype(int)
df = df.drop(columns=['LAST_TIME_KEY'])

In [46]:
# AVERAGE DAYS SINCE LAST TRANSACTION

def calculate_rolling_avg(df):
    customer_groups = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')
    for name, group in customer_groups:
        # Calculate the rolling average for each customer
        rolling_avg = group['CUST_DAYS_SINCE_LAST_TRANSACTION'].rolling(window=1, min_periods=1).mean()
        # Reset the rolling average to 0 for the first transaction of each customer
        rolling_avg.iloc[0] = 0
        # Set the rolling average values back into the original dataframe
        df.loc[group.index, 'rolling_avg_days_since_prior_transaction'] = rolling_avg.values
    # Calculate the sum and count of the rolling average for each customer
    df['rolling_avg_days_since_prior_transaction_sum'] = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['rolling_avg_days_since_prior_transaction'].cumsum()
    # Calculate the customer average for the rolling average
    df['CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION'] = df['rolling_avg_days_since_prior_transaction_sum'] / df['CUST_NUM_PRIOR_TRANSACTIONS']
    # Round the customer average to 2 decimal places
    df['CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION'] = df['CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION'].round(2).fillna(0)
    df = df.drop(columns=['rolling_avg_days_since_prior_transaction','rolling_avg_days_since_prior_transaction_sum'])
    return df


df = calculate_rolling_avg(df)

In [47]:
# SIZE OF THE LAST BASKET

def add_cust_last_transaction_count(df):
    df = df.sort_values(['CUSTOMER_ACCOUNT_NR_MASK', 'TIME_KEY'])
    
    df['CUST_LAST_BASKET_SIZE'] = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['TRANSACTION_ID_MASK'].apply(lambda x: x.shift().map(x.value_counts()).fillna(0)).astype(int)
    
    # update values for rows with repeated transaction IDs
    for _, group in df.groupby('TRANSACTION_ID_MASK'):
        if len(group) > 1:
            first_row_index = group.index[0]
            df.loc[group.index, 'CUST_LAST_BASKET_SIZE'] = df.at[first_row_index, 'CUST_LAST_BASKET_SIZE'] 
    return df

df = add_cust_last_transaction_count(df)

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df['CUST_LAST_BASKET_SIZE'] = df.groupby('CUSTOMER_ACCOUNT_NR_MASK')['TRANSACTION_ID_MASK'].apply(lambda x: x.shift().map(x.value_counts()).fillna(0)).astype(int)


In [49]:
# AVERAGE BASKET SIZE - V2 -Ian's Fix
# Remove transactions with the same negative quantity
def calculate_avg_basket(df):
    df_avg_bkt_size = df[df['QTY'] > 0]
    df_avg_bkt_size = df[['CUSTOMER_ACCOUNT_NR_MASK','TRANSACTION_ID_MASK','SUBCAT_CD_EXT']]
    df_avg_bkt_size = df_avg_bkt_size.groupby(['CUSTOMER_ACCOUNT_NR_MASK','TRANSACTION_ID_MASK']).count()
    df_avg_bkt_size = df_avg_bkt_size.groupby('CUSTOMER_ACCOUNT_NR_MASK').mean().round(0).astype(int)
    df_avg_bkt_size.reset_index(inplace=True)
    df_avg_bkt_size = df_avg_bkt_size.rename(columns={'SUBCAT_CD_EXT':'CUST_AVG_BASKET_SIZE'})
    df = df.merge(df_avg_bkt_size, on='CUSTOMER_ACCOUNT_NR_MASK', how='left')
    return df

df = calculate_avg_basket(df)

In [48]:
df.head(40)

Unnamed: 0,SKU,LOCATION_CD,CUSTOMER_ACCOUNT_NR_MASK,TIME_KEY,TRANSACTION_ID_MASK,POS_TP_CD,PRODUCT_KEY,QTY,GROSS_SLS_AMT,GENDER,FAMILY_MEMBERS,seg_lifestyle_cd,seg_lifestage_cd,LOC_BRAND_CD_1,PRODUCT_DSC,UNIT_BASE_CD_EXT,SUBCAT_CD_EXT,CAT_CD_EXT,CAT_DSC_EXT,BIZ_UNIT_CD_EXT,DEPARTMENT_CD_EXT,PRICE_RANGE,DAY,WEEK,DOW,MONTH,QUARTER,YEAR,CUST_NUM_ITEMS_BOUGHT,CUST_NUM_PRIOR_TRANSACTIONS,CUST_NUM_UNIQUE_SUBCAT,CUST_DAYS_SINCE_LAST_TRANSACTION,CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION,CUST_LAST_BASKET_SIZE
57894,2005830,222,31655.0,2021-02-16,7015142966983204826,P,2005830010001,2.0,1.7,F,3,2,4,303,BATATA FRITA ONDULADA CONTINENTE 200 G,1010102,10101,101,0101 - aperitivos,1,10,PROPRIA,16,7,1,2,1,2021,1,0,1,-1,0.0,0
81982,5319185,222,31655.0,2021-04-22,7043062624436512118,P,5319185010001,1.0,3.99,F,3,2,4,303,TOALHITAS BEBÉ CHICCO 3X72UN,5040201,50402,504,0504 - hig e prote�ão bebé,5,10,PREMIUM,22,16,3,4,2,2021,2,1,2,65,65.0,1
15374,4647030,222,31655.0,2021-07-21,7457186623997306704,P,4647030010001,1.0,1.29,F,3,2,4,303,ÓLEO ALIMENTAR PÔR DO SOL 1L,10010101,100101,1001,1001 - gorduras líquidas,10,10,ECONOMICA,21,29,2,7,3,2021,3,2,3,90,77.5,1
88793,2003827,222,31655.0,2021-07-23,7423061002543857694,P,2003827010001,2.0,3.98,F,3,2,4,303,CARAMELOS PENHA FRUTA 250G,2020301,20203,202,0202 - doçaria,2,10,SECUNDARIA,23,29,4,7,3,2021,4,3,4,2,52.33,1
16573,7068359,222,31655.0,2021-11-04,7274871734290529305,P,7068359010001,1.0,2.79,F,3,2,4,303,"AZEITE 5 SOLDOS 0,75L",10010201,100102,1001,1001 - gorduras líquidas,10,10,SECUNDARIA,4,44,3,11,4,2021,5,4,5,104,65.25,1
46593,5832855,222,31655.0,2021-11-04,7274871734290529305,P,5832855010001,1.0,1.34,F,3,2,4,303,BOLACHA DE ARROZ CNT C/54% CHOCO 122G,2030609,20306,203,0203 - bolachas,2,10,PROPRIA,4,44,3,11,4,2021,6,4,6,0,65.25,1
3346,2050555,222,31655.0,2022-12-26,9134159825966191975,P,2050555010001,3.0,3.12,F,3,2,4,303,"REF. S/GAS JOI LAR./ MARACUJA 1,5L",3020401,30204,302,0302 - refrigerantes,3,10,SECUNDARIA,26,52,0,12,4,2022,7,5,7,417,135.6,2
85727,7244148,1054,36467.0,2021-06-10,7417365885809941026,P,7244148010001,1.0,2.84,M,5,2,4,303,TABLETE EXTRAFINO LEITE 270G,2020101,20201,202,0202 - doçaria,2,10,SECUNDARIA,10,23,3,6,2,2021,1,0,1,-1,0.0,0
66371,6445542,1054,36467.0,2021-11-30,7302068296398464643,P,6445542010001,1.0,3.14,M,5,2,4,303,RECARGAS MOPA SWIFFER HUMIDAS 12 PANOS,6030704,60307,603,0603 - Limpeza e Conforto,6,10,SECUNDARIA,30,48,1,11,4,2021,2,1,2,173,173.0,1
4354,7273120,203,37912.0,2021-03-14,7005184781989662809,P,7273120010001,1.0,29.99,M,7,2,4,143,WHISKY SINGLETON 12 ANOS 70CL C/2 COPOS,17030303,170303,1703,1703 - espirit/espum/fortif,17,10,SECUNDARIA,14,10,6,3,1,2021,1,0,1,-1,0.0,0


Category features

In [50]:
# TOTAL NUMBER OF SUBCATEGORIES BOUGHT

df = df.sort_values('TIME_KEY')
df['CAT_NUM_SUBCAT_BOUGHT'] = df.groupby('SUBCAT_CD_EXT')['TIME_KEY'].rank(method='first')
df['CAT_NUM_SUBCAT_BOUGHT'] = df['CAT_NUM_SUBCAT_BOUGHT'].astype(int)
df = df.sample(frac = 1)

In [51]:
# NUMBER OF CUSTOMERS WHO BOUGHT A CATEGORY

df.sort_values(['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT', 'TIME_KEY'], inplace=True)
first_appearance_mask = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT'])['TIME_KEY'].transform(lambda x: x == x.min())
df['SUBCAT_CUST_BOUGHT'] = first_appearance_mask.groupby(df['SUBCAT_CD_EXT']).cumsum()
df['SUBCAT_CUST_BOUGHT'].fillna(0, inplace=True)
df['SUBCAT_CUST_BOUGHT'] = df['SUBCAT_CUST_BOUGHT'].astype(int)

Customer-category features

In [19]:
# TOTAL NUMBER OF REORDERS PER CUSTOMER

df['CAT_REORDER_NUM'] = 0
for customer, category in df[['CUSTOMER_ACCOUNT_NR_MASK', 'CAT_DSC_EXT']].drop_duplicates().values:
    mask = (df['CUSTOMER_ACCOUNT_NR_MASK'] == customer) & (df['CAT_DSC_EXT'] == category)
    df.loc[mask, 'CAT_REORDER_NUM'] = df[mask]['TIME_KEY'].rank(method='min') - 1

KeyboardInterrupt: 

In [52]:
# NUMBER OF CUSTOMERS WHO BOUGHT ONLY ONE TIME

counts = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT']).size().to_dict()
def is_unique(row):
    if counts[(row['CUSTOMER_ACCOUNT_NR_MASK'], row['SUBCAT_CD_EXT'])] == 1:
        return 1
    else:
        return 0


df['SUBCAT_CUS_ONLY_1T'] = df.apply(is_unique, axis=1)

In [57]:
# DAYS SINCE A CUSTOMER BOUGHT A SUBCATEGORY
# DAYS SINCE THE FIRST CATEGORY WAS BOUGHT


df = df.sort_values(['CUSTOMER_ACCOUNT_NR_MASK', 'TIME_KEY'])
df['DAYS_SINCE_LAST_PURCHASE'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT'])['TIME_KEY'].diff().dt.days.fillna(0)
df['DAYS_SINCE_FIRST_PURCHASE'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT'])['DAYS_SINCE_LAST_PURCHASE'].cumsum()
df['DAYS_SINCE_FIRST_PURCHASE'] = df['DAYS_SINCE_FIRST_PURCHASE'].astype(int)

df['DAYS_SINCE_FIRST_PURCHASE'] = pd.to_timedelta(df['DAYS_SINCE_FIRST_PURCHASE'], unit='D')
df['DAYS_SINCE_LAST_PURCHASE'] = pd.to_timedelta(df['DAYS_SINCE_LAST_PURCHASE'], unit='D')

# df.drop('DAYS_SINCE_LAST_PURCHASE', axis=1, inplace=True)

df['DAYS_SINCE_LAST_PURCHASE'] = df['DAYS_SINCE_LAST_PURCHASE'].dt.floor('D')
df['DAYS_SINCE_FIRST_PURCHASE'] = df['DAYS_SINCE_FIRST_PURCHASE'].dt.floor('D')

In [54]:
#LABEL

df['LABEL'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT'])['TIME_KEY'].apply(lambda x: ((x >= pd.Timestamp('2022-01-01')) & ((x - x.max()) <= pd.Timedelta(days=365))).astype(int))

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df['LABEL'] = df.groupby(['CUSTOMER_ACCOUNT_NR_MASK', 'SUBCAT_CD_EXT'])['TIME_KEY'].apply(lambda x: ((x >= pd.Timestamp('2022-01-01')) & ((x - x.max()) <= pd.Timedelta(days=365))).astype(int))


In [58]:
df = df.sample(frac = 1)
df.head(20)

Unnamed: 0,SKU,LOCATION_CD,CUSTOMER_ACCOUNT_NR_MASK,TIME_KEY,TRANSACTION_ID_MASK,POS_TP_CD,PRODUCT_KEY,QTY,GROSS_SLS_AMT,GENDER,FAMILY_MEMBERS,seg_lifestyle_cd,seg_lifestage_cd,LOC_BRAND_CD_1,PRODUCT_DSC,UNIT_BASE_CD_EXT,SUBCAT_CD_EXT,CAT_CD_EXT,CAT_DSC_EXT,BIZ_UNIT_CD_EXT,DEPARTMENT_CD_EXT,PRICE_RANGE,DAY,WEEK,DOW,MONTH,QUARTER,YEAR,CUST_NUM_ITEMS_BOUGHT,CUST_NUM_PRIOR_TRANSACTIONS,CUST_NUM_UNIQUE_SUBCAT,CUST_DAYS_SINCE_LAST_TRANSACTION,CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION,CUST_LAST_BASKET_SIZE,CUST_AVG_BASKET_SIZE,CAT_NUM_SUBCAT_BOUGHT,SUBCAT_CUST_BOUGHT,SUBCAT_CUS_ONLY_1T,DAYS_SINCE_LAST_PURCHASE,DAYS_SINCE_FIRST_PURCHASE,LABEL
35267,4950222,1943,34678896.0,2022-12-14,9085079739940421100,P,4950222010001,1.0,0.75,M,3.0,2,4,303,*AGUA S/GAS CONTINENTE 6L,3040101,30401,304,0304 - águas,3,10,PROPRIA,14,50,2,12,4,2022,14,10,11,102,62.6,1,1,3035,720,1,0 days,0 days,1
67534,4920353,326,79628652.0,2022-03-25,8858220392088941464,P,4920353010001,1.0,1.85,M,2.0,1,4,302,KETCHUP TD CALVE 275G,1020103,10201,102,0102 - temperos,1,10,SECUNDARIA,25,12,4,3,1,2022,11,8,10,35,37.5,1,1,804,715,1,0 days,0 days,1
2130,7099734,464,1394767.0,2022-02-19,8824351927297953530,P,7099734010001,1.0,2.99,F,5.0,1,3,143,AZ VE OLIVE SERRA SELECIONADO PET750ML,10010203,100102,1001,1001 - gorduras líquidas,10,10,SECUNDARIA,19,7,5,2,1,2022,8,5,8,142,66.0,1,1,492,23,1,0 days,0 days,1
54817,2627786,1501,58524204.0,2022-12-19,9091336520010538967,P,2627786010001,1.0,4.04,F,1.0,1,4,303,BOMB MON CHERI LICOR C/CEREJA T15 150G,2020105,20201,202,0202 - doçaria,2,10,LIDER,19,51,0,12,4,2022,15,13,12,109,54.08,1,1,2666,992,1,0 days,0 days,1
44445,6804954,3466,40044972.0,2021-06-13,7420312049671057698,P,6804954010001,1.0,1.15,F,1.0,1,5,302,IOG CONTINENTE NATURAL 8*125G,8040101,80401,804,0804 - iogurtes e sobrem.,8,10,PROPRIA,13,23,6,6,2,2021,1,0,1,-1,0.0,0,2,168,267,1,0 days,0 days,0
53993,5907926,3464,58455302.0,2021-08-10,7477156104337510367,P,5907926010001,1.0,0.49,F,2.0,2,4,302,GUARDANAPOS CONTINENTE BRANCOS 100UN,6040301,60403,604,0604 - prod papel e consum.,6,10,PROPRIA,10,32,1,8,3,2021,4,3,4,24,25.33,1,1,276,407,0,0 days,0 days,0
5971,4626253,5751,4186037.0,2021-07-21,7457186595537047569,P,4626253010001,1.0,2.21,F,5.0,2,4,302,DEO ROLL ON NIVEA PURE&NATURALS 50ML,5030101,50301,503,0503 - higiene corporal,5,10,LIDER,21,29,2,7,3,2021,5,3,5,47,63.33,1,1,164,26,1,0 days,0 days,0
27147,4403919,262,19598649.0,2022-07-20,8651115345105187804,P,4403919010001,2.0,5.28,M,1.0,2,4,303,OLEO GIRASSOL FULA PURO 1L,10010102,100101,1001,1001 - gorduras líquidas,10,10,SECUNDARIA,20,29,2,7,3,2022,22,16,17,7,33.56,1,2,561,172,1,0 days,0 days,1
65615,7304170,1415,75980641.0,2021-07-24,7425194662592929272,P,7304170010001,1.0,1.99,M,0.0,1,5,303,(I)STICKS PERFUMADOS CNT MAÇA CANELA,6030606,60306,603,0603 - Limpeza e Conforto,6,10,PROPRIA,24,29,5,7,3,2021,9,8,8,21,24.12,1,1,159,337,1,0 days,0 days,0
70424,6482860,273,80390373.0,2021-03-21,7048005890866597392,P,6482860010001,3.0,4.77,M,3.0,2,5,303,MOLHO FRANCESINHA CONTINENTE 500G,1020204,10202,102,0102 - temperos,1,10,SECUNDARIA,21,11,6,3,1,2021,2,1,2,73,73.0,1,1,97,373,1,0 days,0 days,0


In [None]:
df

Unnamed: 0,SKU,LOCATION_CD,CUSTOMER_ACCOUNT_NR_MASK,TIME_KEY,TRANSACTION_ID_MASK,POS_TP_CD,PRODUCT_KEY,QTY,GROSS_SLS_AMT,GENDER,FAMILY_MEMBERS,seg_lifestyle_cd,seg_lifestage_cd,LOC_BRAND_CD_1,PRODUCT_DSC,UNIT_BASE_CD_EXT,CAT_CD_EXT,CAT_DSC_EXT,BIZ_UNIT_CD_EXT,DEPARTMENT_CD_EXT,PRICE_RANGE,DAY,WEEK,DOW,MONTH,QUARTER,YEAR,CUST_NUM_ITEMS_BOUGHT,CUST_NUM_PRIOR_TRANSACTIONS,CUST_NUM_UNIQUE_CATEGORIES,CUST_DAYS_SINCE_LAST_TRANSACTION,CUST_AVG_DAYS_SINCE_PRIOR_TRANSACTION,CUST_LAST_BASKET_SIZE,CUST_AVG_BASKET_SIZE,CAT_NUM_CAT_BOUGHT,CAT_CUST_BOUGHT_CAT,CAT_REORDER_NUM,CAT_CUS_ONLY_1T,DAYS_SINCE_LAST_PURCHASE,DAYS_SINCE_FIRST_PURCHASE,LABEL
91150,7473538,215,102861382.00,2022-01-23,7320163240166623192,P,7473538010001,1.00,1.79,F,3,1,5,143,IOG MAG C DANONE NATUR COCO 4*115G,8040603,804,0804 - iogurtes e sobrem.,8,10,SECUNDARIA,23,3,6,1,1,2022,18,12,12,14,21.58,1,1,2760,2759,1,0,14 days,14 days,1
83668,3872611,3194,87715510.00,2022-12-28,9136133435023531632,P,3872611010001,1.00,0.99,F,2,3,4,302,BOL.C/PEPITAS CHOCOLATE CONTINENTE 150G,2030205,203,0203 - bolachas,2,10,PROPRIA,28,52,2,12,4,2022,40,28,23,0,17.11,2,1,3439,1778,8,0,26 days,479 days,1
22293,7173279,3698,19090893.00,2022-09-15,8743013460133673207,P,7173279010001,1.00,1.09,M,2,2,4,302,BATATA FRITA OND KETCHUP CONTINENTE 170G,1010104,101,0101 - aperitivos,1,10,SECUNDARIA,15,37,3,9,3,2022,17,14,7,20,44.21,1,1,1993,343,0,1,0 days,0 days,1
53395,7589297,2403,58414300.00,2022-09-05,8696755861140737237,P,7589297010001,1.00,12.49,F,1,1,4,302,DET LÍQ MÁQ ROUPA SKIP BABY 60D,6010201,601,0601 - limp. e tratam roupa,6,10,SECUNDARIA,5,36,0,9,3,2022,50,33,28,0,16.70,1,1,880,519,0,1,0 days,0 days,1
70393,7279167,9665,80389522.00,2021-08-30,7462187326230047474,P,7279167010001,1.00,1.75,F,5,3,5,302,PACK CROIS FOLHADO 5 + 1 OFERTA(AB),16020603,1602,1602 - pastelaria,16,12,SECUNDARIA,30,35,0,8,3,2021,3,2,3,58,109.00,1,1,665,1051,0,0,0 days,0 days,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94217,6458580,324,104629652.00,2021-12-02,7302885218764091002,P,6458580010001,2.00,1.38,F,,1,5,302,CROISSANT FOLHADO CREME OVO (AB),16020603,1602,1602 - pastelaria,16,12,SECUNDARIA,2,48,3,12,4,2021,2,1,2,6,6.00,1,1,932,1380,0,1,0 days,0 days,0
37802,6683336,3698,34965200.00,2022-01-10,7342979663741549570,P,6683336010001,1.00,2.69,F,3,2,4,302,CHOURIÇO DA GUARDA CONTINENTE 200G PM,13020902,1302,1302 - Charcutaria,13,12,SECUNDARIA,10,2,0,1,1,2022,25,24,17,2,14.88,1,1,1289,673,1,0,195 days,195 days,1
20972,6410386,1908,18937039.00,2022-11-23,9101294664818024232,P,6410386010001,1.00,1.29,F,3,2,5,302,CEBOLA P/ ASSAR CNT 500G,15020702,1502,1502 - legumes,15,12,SECUNDARIA,23,47,2,11,4,2022,31,25,19,38,18.24,1,1,4899,548,4,0,104 days,247 days,1
95287,2224229,340,108150126.00,2021-09-25,7236058670342128018,P,2224229010001,1.00,5.99,,,1,3,302,PAPEL ALUMINIO VILEDA 25M,6040502,604,0604 - prod papel e consum.,6,10,LIDER,25,38,5,9,3,2021,3,2,3,11,26.50,1,1,739,1414,0,0,0 days,0 days,0


In [None]:
df.