# Introduction

Every business wants to understand its customers better in order to grow and succeed. In this project, we use a popular method called RFM analysis—which stands for Recency, Frequency, and Monetary value—to study customer shopping patterns from an online retail dataset. By looking at how recently customers made purchases, how often they buy, and how much they spend, we can group customers into different segments. This helps businesses identify their most loyal customers, those who might be at risk of leaving, and new or potential customers. With these insights, companies can create more effective marketing strategies and build stronger relationships with customers.

Installing the required Python libraries

In [None]:
%pip install plotly

In [None]:
%pip install nbformat --upgrade

Python code

In [5]:
import pandas as pd
import datetime as dt
import plotly.express as px
import plotly.graph_objects as go
import plotly.colors

In [9]:
# Load the raw transactions and preview the first rows.
# NOTE(review): the path is absolute and specific to one machine.
csv_path = r"C:\Users\HARMANPREET KAUR\Downloads\online_retail.csv\online_retail.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [11]:
data.tail()


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.1,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France
541908,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France


In [12]:
# Drop rows that have no CustomerID; the per-customer groupby further down
# is keyed on that column.
data.dropna(subset=['CustomerID'], inplace=True)

In [14]:
# Parse the invoice timestamps and derive the monetary value of each row
# (quantity times unit price).
data = data.assign(
    InvoiceDate=pd.to_datetime(data['InvoiceDate']),
    TotalPrice=data['Quantity'] * data['UnitPrice'],
)

In [15]:
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34


In [16]:
# Reference point for recency: one day after the latest invoice in the data,
# so the most recent purchase yields a recency of 1 day rather than 0.
reference_date = data['InvoiceDate'].max() + dt.timedelta(days=1)

In [18]:
# Aggregate one row per customer: days since the latest invoice, number of
# rows, and total spend.
# NOTE(review): this rebinds `reference_date` from the timestamp to the
# aggregated frame, so re-running this cell on its own no longer subtracts
# from the timestamp — consider a separate name for the result.
reference_date = data.groupby('CustomerID').agg({
    # whole days between the reference timestamp and the customer's latest invoice
    'InvoiceDate': lambda x: (reference_date - x.max()).days,
    # NOTE(review): 'count' tallies rows (one per line item), not distinct
    # invoices — confirm this is the intended definition of frequency
    'InvoiceNo': 'count',
    # total of TotalPrice over all of the customer's rows
    'TotalPrice': 'sum'
}).reset_index()

In [20]:
# Give the aggregated columns their RFM names and preview the result.
rfm_column_names = {
    'InvoiceDate': 'Recency',
    'InvoiceNo': 'Frequency',
    'TotalPrice': 'Monetary',
}
rfm = reference_date.rename(columns=rfm_column_names)
rfm.head()

Unnamed: 0,CustomerID,Recency,Frequency,Monetary
0,12346.0,326,2,0.0
1,12347.0,2,182,4310.0
2,12348.0,75,31,1797.24
3,12349.0,19,73,1757.55
4,12350.0,310,17,334.4


In [23]:
# Quartile cut points (25%, 50%, 75%) of the rfm columns; the scoring
# functions below look thresholds up in this table.
quartile_levels = [0.25, 0.5, 0.75]
quantiles = rfm.quantile(q=quartile_levels)

# assigning RFM scores for Recency
def RScore(x, p, d):
    """Return a 1-4 score for *x* where smaller values score higher.

    Args:
        x: The value to score.
        p: Key of the metric inside *d* (e.g. ``'Recency'``).
        d: Mapping such that ``d[p][q]`` is the cut point for quantile *q*
            (0.25, 0.50 and 0.75 are read).

    Returns:
        4 if *x* is at or below the 0.25 cut point, 3 up to the 0.50 cut
        point, 2 up to the 0.75 cut point, otherwise 1.
    """
    cut_points = d[p]
    # Walk the cut points from lowest to highest; the first one that
    # bounds x decides the score.
    for score, level in ((4, 0.25), (3, 0.50), (2, 0.75)):
        if x <= cut_points[level]:
            return score
    return 1

# assigning RFM scores for Frequency and Monetary
def FMScore(x, p, d):
    """Return a 1-4 score for *x* where larger values score higher.

    Args:
        x: The value to score.
        p: Key of the metric inside *d* (e.g. ``'Frequency'``).
        d: Mapping such that ``d[p][q]`` is the cut point for quantile *q*
            (0.25, 0.50 and 0.75 are read).

    Returns:
        1 if *x* is at or below the 0.25 cut point, 2 up to the 0.50 cut
        point, 3 up to the 0.75 cut point, otherwise 4.
    """
    cut_points = d[p]
    # Walk the cut points from lowest to highest; the first one that
    # bounds x decides the score.
    for score, level in ((1, 0.25), (2, 0.50), (3, 0.75)):
        if x <= cut_points[level]:
            return score
    return 4

# Score each metric column against its own quartiles, then preview the result.
for score_col, metric_col, scorer in (
    ('R', 'Recency', RScore),
    ('F', 'Frequency', FMScore),
    ('M', 'Monetary', FMScore),
):
    rfm[score_col] = rfm[metric_col].apply(scorer, args=(metric_col, quantiles))

rfm.head()

Unnamed: 0,CustomerID,Recency,Frequency,Monetary,R,F,M
0,12346.0,326,2,0.0,1,1,1
1,12347.0,2,182,4310.0,4,4,4
2,12348.0,75,31,1797.24,2,2,4
3,12349.0,19,73,1757.55,3,3,4
4,12350.0,310,17,334.4,1,1,2


In [25]:
# Concatenate the three digit scores into a code such as "444", and add the
# numeric sum of R, F and M as `rfm_score`.
# The previous chained assignment also bound a stray `segment` name to this
# same frame; nothing reads it, so only `rfm` is assigned here.
rfm = rfm.assign(RFM=rfm['R'].astype(str) + rfm['F'].astype(str) + rfm['M'].astype(str))
rfm["rfm_score"] = rfm[['R', 'F', 'M']].sum(axis=1)
rfm.head()

Unnamed: 0,CustomerID,Recency,Frequency,Monetary,R,F,M,RFM,rfm_score
0,12346.0,326,2,0.0,1,1,1,111,3
1,12347.0,2,182,4310.0,4,4,4,444,12
2,12348.0,75,31,1797.24,2,2,4,224,8
3,12349.0,19,73,1757.55,3,3,4,334,10
4,12350.0,310,17,334.4,1,1,2,112,4


In [27]:


def assign_segment(score):
    """Map a summed RFM score to a value-tier label.

    Args:
        score: The summed score to classify.

    Returns:
        ``'low value'`` for scores up to 6, ``'medium value'`` up to 9,
        ``'high value'`` up to 12, and ``'very high value'`` above that.
    """
    # NOTE(review): when the score is the sum of three 1-4 scores it never
    # exceeds 12, so the final label is not produced for such input.
    tiers = (
        (6, 'low value'),
        (9, 'medium value'),
        (12, 'high value'),
    )
    for upper_bound, label in tiers:
        if score <= upper_bound:
            return label
    return 'very high value'

# Label every customer with the value tier of their summed score.
rfm['segment'] = rfm['rfm_score'].map(assign_segment)
rfm.head()

Unnamed: 0,CustomerID,Recency,Frequency,Monetary,R,F,M,RFM,rfm_score,segment
0,12346.0,326,2,0.0,1,1,1,111,3,low value
1,12347.0,2,182,4310.0,4,4,4,444,12,high value
2,12348.0,75,31,1797.24,2,2,4,224,8,medium value
3,12349.0,19,73,1757.55,3,3,4,334,10,high value
4,12350.0,310,17,334.4,1,1,2,112,4,low value


In [31]:
# Number of customers per value tier as a two-column frame
# ('Segment', 'Count'), ordered alphabetically by tier name.
segment_counts = (
    rfm['segment']
    .value_counts()
    .rename_axis('Segment')
    .reset_index(name='Count')
    .sort_values('Segment')
)


Bar Chart 

In [33]:
# Bar chart of customer counts per value tier, coloured by tier with the
# pastel qualitative palette.
axis_labels = {'Segment': 'RFM Segment', 'Count': 'Number of Customers'}
fig = px.bar(
    segment_counts,
    x='Segment',
    y='Count',
    color='Segment',
    labels=axis_labels,
    title='customer distribution by RFM Segment',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.show()


In [37]:
# Label customers by score band; anything outside every band keeps "".
# NOTE(review): a score of 3 is the lowest possible sum (least recent,
# least frequent, lowest spend) yet is labelled 'New customers' — confirm
# the label order is intended.
rfm['RFM_customer segment'] = ""
score = rfm['rfm_score']
score_bands = (
    # (inclusive lower bound, exclusive upper bound or None, label)
    (9, None, 'VIP/loyal customers'),
    (6, 9, 'Potential loyal customers'),
    (5, 6, 'At risk customers'),
    (4, 5, 'Lost customers'),
    (3, 4, 'New customers'),
)
for lower, upper, band_label in score_bands:
    in_band = score >= lower
    if upper is not None:
        in_band = in_band & (score < upper)
    rfm.loc[in_band, 'RFM_customer segment'] = band_label

# NOTE(review): this rebinds `segment_counts` (previously the per-tier frame)
# to a Series of counts indexed by customer-segment label.
segment_counts = rfm['RFM_customer segment'].value_counts().sort_index()

In [39]:
# Customers per (value tier, customer segment) pair, largest groups first.
segment_product_counts = (
    rfm.groupby(['segment', 'RFM_customer segment'])
    .size()
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)

In [40]:
# Treemap nesting customer segments inside value tiers, sized by customer
# count and coloured by tier.
treemap_levels = ['segment', 'RFM_customer segment']
fig = px.treemap(
    segment_product_counts,
    path=treemap_levels,
    values='Count',
    color='segment',
    title='Treemap of Customer Segments by RFM',
)
fig.show()

In [41]:
# Keep only the customers labelled as VIP/loyal.
is_vip = rfm['RFM_customer segment'] == 'VIP/loyal customers'
vip_segment = rfm[is_vip]

In [43]:
# One box plot per raw metric for the VIP/loyal customers.
fig = go.Figure()
for metric in ('Recency', 'Frequency', 'Monetary'):
    fig.add_trace(go.Box(y=vip_segment[metric], name=metric))
fig.show()


In [44]:
# Pairwise correlation of the R, F and M scores within the VIP/loyal group.
score_columns = ['R', 'F', 'M']
correlation_matrix = vip_segment[score_columns].corr()

In [45]:
# Heatmap of the score correlation matrix; both axes carry the column names.
axis_names = correlation_matrix.columns
heatmap_trace = go.Heatmap(
    z=correlation_matrix.values,
    x=axis_names,
    y=axis_names,
    colorscale='Viridis',
    colorbar={'title': 'Correlation '},
)
fig_heatmap = go.Figure(data=heatmap_trace)
fig_heatmap.show()

In [47]:
# Bar chart comparing the number of customers in each RFM customer segment,
# with the VIP/loyal bar drawn in its own highlight colour.
pastel_colors = plotly.colors.qualitative.Pastel
vip_color = 'rgb(158,202,225)'

# The highlight has to match the label actually stored in
# 'RFM_customer segment' ('VIP/loyal customers'); the earlier comparison
# against 'champion' never matched, so no bar was ever highlighted.
# The palette index wraps around so extra segments cannot run past its end.
bar_colors = [
    vip_color if segment == 'VIP/loyal customers' else pastel_colors[i % len(pastel_colors)]
    for i, segment in enumerate(segment_counts.index)
]

fig = go.Figure(
    data=[
        go.Bar(
            x=segment_counts.index,
            y=segment_counts.values,
            marker=dict(color=bar_colors)
        )
    ]
)

# Outline and transparency shared by every bar.
marker_line_color = 'rgb(8,48,107)'
marker_line_width = 1.5
opacity = 0.6
fig.update_traces(marker_line_color=marker_line_color, marker_line_width=marker_line_width, opacity=opacity)

# Titles and axis captions; the legend is hidden because the x axis already
# names each segment.
fig.update_layout(title='comparison of rfm segments ' ,
                 xaxis_title='RFM Segment',
                 yaxis_title='Number of Customers',
                 showlegend=False)

fig.show()


## Project Summary

This project analyzes customer purchasing behavior using the RFM (Recency, Frequency, Monetary) model on an online retail dataset. The workflow includes data cleaning, RFM metric calculation, customer segmentation, and visualization of segment distributions. The goal is to identify key customer groups such as VIP/loyal customers, potential loyal customers, at-risk customers, lost customers, and new customers, enabling targeted marketing strategies and improved customer relationship management.