# EDA RFM

In [1]:
import pandas as pd
import datetime as dt

In [2]:
# Load the cleaned dataset from the processed-data folder.
caminho_dados = '../data/processed/online_retail_clean_uk.csv'
df = pd.read_csv(caminho_dados)

In [3]:
# Parse InvoiceDate into datetimes so it supports the date arithmetic used for recency.
datas_fatura = pd.to_datetime(df['InvoiceDate'])
df['InvoiceDate'] = datas_fatura

In [4]:
# Reference date for the RFM analysis: one day after the most recent invoice
# in the dataset (not the actual current date).
ultima_fatura = df['InvoiceDate'].max()
um_dia = dt.timedelta(days=1)
data_referencia = ultima_fatura + um_dia
print(f"Data de referência para RFM: {data_referencia}")

Data de referência para RFM: 2011-12-10 12:49:00


In [6]:
# Build one row per customer with the three RFM metrics:
#   Recency   - whole days between the reference date and the customer's latest invoice
#   Frequency - number of distinct invoices
#   Monetary  - sum of Total_Price
# Named aggregation yields the final column names directly, so no rename step is needed.
rfm = df.groupby("Customer ID").agg(
    Recency=('InvoiceDate', lambda datas: (data_referencia - datas.max()).days),
    Frequency=('Invoice', 'nunique'),
    Monetary=('Total_Price', 'sum'),
)

rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346,326,12,77556.46
12608,405,1,415.79
12745,487,2,723.85
12746,541,1,254.55
12747,2,26,9276.54


In [10]:
# Number of customers, followed by summary statistics of the RFM table.
print(len(rfm))
print(rfm.describe())

5350
           Recency    Frequency       Monetary
count  5350.000000  5350.000000    5350.000000
mean    203.000935     6.269346    2751.990190
std     209.959231    11.995550   12080.466564
min       1.000000     1.000000       2.950000
25%      26.000000     1.000000     336.167500
50%      98.500000     3.000000     849.910000
75%     382.000000     7.000000    2214.905000
max     739.000000   336.000000  608821.650000


In [11]:
# Score labels for splitting each metric into 5 groups (quintiles).
# Recency labels run in reverse (5 down to 1) so that the lowest recency
# values receive the highest score; Frequency and Monetary run 1 up to 5.
N_GRUPOS = 5
r_labels = range(N_GRUPOS, 0, -1)
f_labels = range(1, N_GRUPOS + 1)
m_labels = range(1, N_GRUPOS + 1)

In [12]:
# Bin each metric into 5 quantile-based groups and attach the score labels.
# Frequency is binned on its rank instead of its raw value: method='first'
# breaks ties by order of appearance, which gives qcut distinct bin edges even
# though many customers share the same invoice count.
ranking_frequencia = rfm['Frequency'].rank(method='first')
rfm['R_Score'] = pd.qcut(rfm['Recency'], 5, labels=r_labels)
rfm['F_Score'] = pd.qcut(ranking_frequencia, 5, labels=f_labels)
rfm['M_Score'] = pd.qcut(rfm['Monetary'], 5, labels=m_labels)

# Segment code: the R, F and M scores concatenated as text (e.g. '555').
r_txt = rfm['R_Score'].astype(str)
f_txt = rfm['F_Score'].astype(str)
m_txt = rfm['M_Score'].astype(str)
rfm['RFM_Segment'] = r_txt + f_txt + m_txt


In [13]:
# Preview the first five customers, now including the score columns.
rfm.head(n=5)

Unnamed: 0_level_0,Recency,Frequency,Monetary,R_Score,F_Score,M_Score,RFM_Segment
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12346,326,12,77556.46,2,5,5,255
12608,405,1,415.79,2,1,2,212
12745,487,2,723.85,1,2,3,123
12746,541,1,254.55,1,1,1,111
12747,2,26,9276.54,5,5,5,555


In [None]:
def segmentar_clientes(row):
    """Map one customer's R/F/M scores to a named segment.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series or a dict)
        Must provide the keys 'R_Score', 'F_Score' and 'M_Score'. Each value
        is converted with ``int()``, so categorical or string scores work too.
        Presumably the scores are the 1-5 quintile labels produced earlier in
        the notebook -- the rules below do not enforce that range.

    Returns
    -------
    str
        The segment label. Rules are checked from top to bottom and the first
        match wins; a row that matches no rule is labelled 'Comum'.
    """
    r = int(row['R_Score'])
    f = int(row['F_Score'])
    m = int(row['M_Score'])

    # High on all three dimensions.
    if r >= 4 and f >= 4 and m >= 4:
        return 'VIP / Elite'
    # At least mid-range on all three dimensions (and not already VIP).
    if r >= 3 and f >= 3 and m >= 3:
        return 'Cliente Fiel'
    # High recency score with the lowest frequency score.
    if r >= 4 and f == 1:
        return 'Novo Cliente'
    # NOTE(review): this rule caps R at 4, so R=5 with F=2 is not matched by
    # any rule and ends up as 'Comum' -- confirm that this is intended.
    if 3 <= r <= 4 and f <= 2:
        return 'Promissor'
    # Low recency score, but used to buy frequently.
    if r <= 2 and f >= 4:
        return 'Risco de Churn'
    # Low recency score and low frequency score.
    if r <= 2 and f <= 2:
        return 'Inativo / Perdido'
    return 'Comum'
    
# Label every customer row by row, then show how many customers fall in each segment.
segmentos = rfm.apply(segmentar_clientes, axis=1)
rfm['Segmento'] = segmentos
print(rfm['Segmento'].value_counts())

Segmento
Inativo / Perdido    1393
VIP / Elite          1193
Cliente Fiel         1036
Comum                 761
Promissor             499
Risco de Churn        318
Novo Cliente          150
Name: count, dtype: int64


In [15]:
# Move the index (Customer ID) back into a regular column.
rfm_final = rfm.reset_index(drop=False)

In [16]:
# Write the segmented table to CSV without the DataFrame index, then preview it.
caminho_saida = '../data/processed/rfm_segmentacao_final.csv'
rfm_final.to_csv(caminho_saida, index=False)
print("Arquivo salvo com sucesso")
print(rfm_final.head())

Arquivo salvo com sucesso
   Customer ID  Recency  Frequency  Monetary R_Score F_Score M_Score  \
0        12346      326         12  77556.46       2       5       5   
1        12608      405          1    415.79       2       1       2   
2        12745      487          2    723.85       1       2       3   
3        12746      541          1    254.55       1       1       1   
4        12747        2         26   9276.54       5       5       5   

  RFM_Segment           Segmento  
0         255     Risco de Churn  
1         212  Inativo / Perdido  
2         123  Inativo / Perdido  
3         111  Inativo / Perdido  
4         555        VIP / Elite  
