# Task Overview
Objective: Analyze customer purchase behavior using RFM (Recency, Frequency, Monetary) analysis and segment customers accordingly.

In [4]:
# Step 1: Load Dataset
import os
from pathlib import Path

import pandas as pd

# Location of the "Online Retail" workbook.
# The default is the original author's local path; on any other machine set the
# ONLINE_RETAIL_XLSX environment variable to the workbook's location instead of
# editing this cell.
DEFAULT_DATA_PATH = r"C:\Users\geeze\Downloads\Data Analytics Internship by Elevvo Pathways\Level_2\Task_3_Customer_Segmentation_Using_RFM_Analysis\online+retail\Online Retail.xlsx"
DATA_PATH = Path(os.environ.get("ONLINE_RETAIL_XLSX", DEFAULT_DATA_PATH))

# Load data (raises FileNotFoundError if DATA_PATH does not exist)
df = pd.read_excel(DATA_PATH)
df.head()


KeyboardInterrupt: 

In [None]:
# Step 2: Data Cleaning

# Build one boolean mask per cleaning rule, then apply them together.

# Rows that carry a customer identifier
has_customer = df['CustomerID'].notna()

# Returns: invoices whose number starts with 'C'
is_return = df['InvoiceNo'].astype(str).str.startswith('C')

# Rows with strictly positive quantity and unit price
is_positive_sale = df['Quantity'].gt(0) & df['UnitPrice'].gt(0)

df = df[has_customer & ~is_return & is_positive_sale]


In [None]:
# Step 3: Feature Engineering for RFM

import datetime  # not used in this cell; kept in case other cells rely on it

# Reference date for recency: one day after the latest invoice in the data,
# so the most recently active customer gets Recency == 1 rather than 0.
snapshot_date = df['InvoiceDate'].max() + pd.Timedelta(days=1)

# Revenue per invoice line. No earlier cell creates a 'TotalPrice' column, so
# derive it from Quantity * UnitPrice unless the frame already provides one.
# This must happen BEFORE grouping: aggregating a non-existent column raises
# a KeyError.
if 'TotalPrice' in df.columns:
    line_totals = df['TotalPrice']
else:
    line_totals = df['Quantity'] * df['UnitPrice']

# Create RFM table (one row per customer):
#   Recency   - days between snapshot_date and the customer's last invoice
#   Frequency - number of distinct invoices
#   Monetary  - total revenue across all invoice lines
# `assign` works on a temporary copy, so `df` itself is left unchanged.
rfm = (
    df.assign(TotalPrice=line_totals)
    .groupby('CustomerID')
    .agg(
        Recency=('InvoiceDate', lambda dates: (snapshot_date - dates.max()).days),
        Frequency=('InvoiceNo', 'nunique'),
        Monetary=('TotalPrice', 'sum'),
    )
)

# Remove customers with zero spending; copy so that later column assignments
# operate on an independent frame rather than a slice.
rfm = rfm[rfm['Monetary'] > 0].copy()


In [None]:
# Step 4: RFM Scoring

# Each metric is cut into quartiles and scored 1-4, where 4 is always "best".
# pd.qcut assigns labels from the lowest-valued bin to the highest, so:
#   - Recency: smaller is better (bought more recently) -> labels run 4..1
#   - Frequency / Monetary: larger is better             -> labels run 1..4
# Using the descending labels for all three would give the lowest spenders and
# least frequent buyers the top score.
r_labels = [4, 3, 2, 1]
f_labels = m_labels = [1, 2, 3, 4]

r_quartiles = pd.qcut(rfm['Recency'], q=4, labels=r_labels)
# Frequency is heavily tied (many customers share the same small counts);
# ranking with method="first" breaks ties so the quartile edges stay unique.
f_quartiles = pd.qcut(rfm['Frequency'].rank(method="first"), q=4, labels=f_labels)
m_quartiles = pd.qcut(rfm['Monetary'], q=4, labels=m_labels)

rfm['R'] = r_quartiles.astype(int)
rfm['F'] = f_quartiles.astype(int)
rfm['M'] = m_quartiles.astype(int)

# Combined score: sum of the three component scores (range 3-12).
rfm['RFM_Score'] = rfm[['R','F','M']].sum(axis=1)



In [None]:
# Step 5: Segment Customers

def segment(x):
    """Map a total RFM score to a customer segment name.

    The score is compared against descending lower bounds and the first
    bound it meets decides the segment:

        >= 9 -> 'Champions'
        >= 7 -> 'Loyal Customers'
        >= 5 -> 'Potential Loyalist'
        >= 3 -> 'Needs Attention'
        else -> 'At Risk'

    Note: when the score is the sum of three 1-4 component scores (as built
    in the scoring step of this notebook) it is never below 3, so 'At Risk'
    only acts as a fallback.
    """
    tiers = (
        (9, 'Champions'),
        (7, 'Loyal Customers'),
        (5, 'Potential Loyalist'),
        (3, 'Needs Attention'),
    )
    for lower_bound, label in tiers:
        if x >= lower_bound:
            return label
    return 'At Risk'

# Label every customer by passing their total RFM score through segment().
rfm['Segment'] = rfm['RFM_Score'].map(segment)


In [None]:
# Step 6: Visualization

import seaborn as sns
import matplotlib.pyplot as plt

# Segment count: one bar per segment, largest segment first
segment_order = rfm['Segment'].value_counts().index
ax = sns.countplot(data=rfm, x='Segment', order=segment_order)
# Tilt the segment names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title("Customer Segments Count")
plt.show()


In [None]:
# Heatmap of average R, F, M per segment
segment_avg = rfm.groupby('Segment')[['Recency','Frequency','Monetary']].mean().round(1)
# fmt=".1f" matches the one-decimal rounding above. seaborn's default
# annotation format (".2g") keeps only two significant digits, so large
# Monetary means would be printed in scientific notation.
sns.heatmap(segment_avg, annot=True, fmt=".1f", cmap='YlGnBu')
plt.title("RFM Average by Segment")
plt.show()

In [None]:
# Save the segmented RFM table (including its index) as an Excel workbook
# in the current working directory.
rfm.to_excel(excel_writer="rfm_segments.xlsx")
