In [1]:
import pandas as pd

# Read the customer feature table from disk into the working frame
features_path = '../data/customer_features.csv'
rfm_df = pd.read_csv(features_path)

# Show the first rows as a quick sanity check
rfm_df.head()


Unnamed: 0,CustomerID,LastPurchase,FirstPurchase,OrderCount,TotalSpent,AvgOrderValue,Recency
0,12346.0,2011-01-18 10:01:00,2011-01-18 10:01:00,1,77183.6,77183.6,325
1,12347.0,2011-12-07 15:52:00,2010-12-07 14:57:00,182,4310.0,23.681319,1
2,12348.0,2011-09-25 13:13:00,2010-12-16 19:09:00,31,1797.24,57.975484,74
3,12349.0,2011-11-21 09:51:00,2011-11-21 09:51:00,73,1757.55,24.076027,18
4,12350.0,2011-02-02 16:01:00,2011-02-02 16:01:00,17,334.4,19.670588,309


In [2]:
# RFM values
# 'Recency' is already present in the loaded table under its final name, so
# it needs no assignment; only Frequency and Monetary are derived here as
# copies of the raw feature columns.
rfm_df['Frequency'] = rfm_df['OrderCount']
rfm_df['Monetary'] = rfm_df['TotalSpent']


In [3]:
# Quintile labels: 1 is the weakest bucket, 5 the strongest
ascending_labels = [1, 2, 3, 4, 5]
descending_labels = ascending_labels[::-1]

# Recency: a lower value is better, so the labels run in reverse and the
# lowest quintile receives the top score.
recency_bins = pd.qcut(rfm_df['Recency'], 5, labels=descending_labels)
rfm_df['R_Score'] = recency_bins.astype(int)

# Frequency: a higher value is better. The values are ranked first, with
# ties ordered by position (method='first'), and the ranks are binned --
# presumably so that repeated counts cannot produce duplicate bin edges.
frequency_rank = rfm_df['Frequency'].rank(method='first')
frequency_bins = pd.qcut(frequency_rank, 5, labels=ascending_labels)
rfm_df['F_Score'] = frequency_bins.astype(int)

# Monetary: a higher value is better; binned directly on the raw values.
monetary_bins = pd.qcut(rfm_df['Monetary'], 5, labels=ascending_labels)
rfm_df['M_Score'] = monetary_bins.astype(int)


In [4]:
# Build the three-character segment code by concatenating the R, F and M
# scores as strings (e.g. scores 5, 5, 5 give '555')
r_digit = rfm_df['R_Score'].astype(str)
f_digit = rfm_df['F_Score'].astype(str)
m_digit = rfm_df['M_Score'].astype(str)
rfm_df['RFM_Segment'] = r_digit + f_digit + m_digit

# Total RFM score: row-wise sum of the three individual scores
score_columns = ['R_Score', 'F_Score', 'M_Score']
rfm_df['RFM_Score'] = rfm_df[score_columns].sum(axis=1)

# Segment labeling
def rfm_label(row):
    """Translate a row's total RFM score into a segment name.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key 'RFM_Score' (for example a DataFrame
        row passed in by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        'Champions' for a score of 13 or more, 'Loyal Customers' for 10-12,
        'Potential' for 6-9, and 'At Risk' for anything else (including a
        score that compares false against every threshold, such as NaN).
    """
    score = row['RFM_Score']

    # Thresholds are listed from the highest floor down; the first one the
    # score reaches decides the label.
    tiers = (
        (13, 'Champions'),
        (10, 'Loyal Customers'),
        (6, 'Potential'),
    )
    for floor, label in tiers:
        if score >= floor:
            return label

    return 'At Risk'

# Label every customer row by row, store the labels, then preview the table
segment_labels = rfm_df.apply(rfm_label, axis=1)
rfm_df['Segment'] = segment_labels
rfm_df.head()


Unnamed: 0,CustomerID,LastPurchase,FirstPurchase,OrderCount,TotalSpent,AvgOrderValue,Recency,Frequency,Monetary,R_Score,F_Score,M_Score,RFM_Segment,RFM_Score,Segment
0,12346.0,2011-01-18 10:01:00,2011-01-18 10:01:00,1,77183.6,77183.6,325,1,77183.6,1,1,5,115,7,Potential
1,12347.0,2011-12-07 15:52:00,2010-12-07 14:57:00,182,4310.0,23.681319,1,182,4310.0,5,5,5,555,15,Champions
2,12348.0,2011-09-25 13:13:00,2010-12-16 19:09:00,31,1797.24,57.975484,74,31,1797.24,2,3,4,234,9,Potential
3,12349.0,2011-11-21 09:51:00,2011-11-21 09:51:00,73,1757.55,24.076027,18,73,1757.55,4,4,4,444,12,Loyal Customers
4,12350.0,2011-02-02 16:01:00,2011-02-02 16:01:00,17,334.4,19.670588,309,17,334.4,1,2,2,122,5,At Risk


In [5]:
# Write the scored table to disk without the DataFrame index, then confirm
segments_path = '../data/rfm_segments.csv'
rfm_df.to_csv(segments_path, index=False)
print("✅ Saved: rfm_segments.csv")


✅ Saved: rfm_segments.csv


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count of each segment.
# The column names are assigned explicitly because reset_index() names the
# resulting columns differently across pandas versions.
segment_counts = rfm_df['Segment'].value_counts().reset_index()
segment_counts.columns = ['Segment', 'Count']

# Draw on an explicit Axes rather than the implicit pyplot state.
fig, ax = plt.subplots(figsize=(10, 5))

# Passing `palette` without `hue` is deprecated in seaborn 0.13 (removal
# planned for 0.14), so the x variable is also assigned to `hue`;
# dodge=False keeps one full-width bar per segment.
sns.barplot(
    data=segment_counts,
    x='Segment',
    y='Count',
    hue='Segment',
    palette='viridis',
    dodge=False,
    ax=ax,
)

# The hue legend would only repeat the x tick labels; drop it if this
# seaborn version created one.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

# Plain-text title: the chart emoji has no glyph in matplotlib's default
# font and would render as an empty box with a "Glyph missing" warning.
ax.set_title("Customer Count by Segment")
ax.set_ylabel("Number of Customers")
ax.set_xlabel("Segment")
ax.grid(True)
plt.show()
