# MLG382 
- ## Project 2 
- Group M

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from datetime import date, datetime, timedelta
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import warnings
import numpy as np
import seaborn as sns
import datetime as dt
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Silence every warning category (Warning is the base class) for the session.
warnings.simplefilter(action='ignore', category=Warning)

# Business Understanding

## Problem

A small to medium enterprise wants to improve its marketing strategy by tailoring it to different groups of customers according to their behaviour, hoping to increase sales. The business wants to know which customers it should offer incentives to, in order to encourage them to make more purchases and spend more money. It also wants to know which customers are loyal and big spenders, in order to offer them rewards for supporting the business.

## Approach

According to the business needs, a customer segmentation based on behaviour is required. A Recency, Frequency, Monetary (RFM) analysis is a good reflection of customer behaviour. Once an RFM analysis is done, customers can be segmented into groups by use of k-means clustering. The behaviour of these groups can be determined by looking at the mean of each group, and a different marketing strategy can then be recommended for a specific customer based on which group they fall under.


## Data

The business provides a data set containing the following information about their sales: InvoiceNo, StockCode, Description, Quantity, InvoiceDate, UnitPrice, CustomerID and Country. For a behavioural customer segmentation the geographical location is irrelevant. For the RFM data analysis we only need the InvoiceNo, Quantity, InvoiceDate, UnitPrice and CustomerID.


# Data Preparation

In [None]:
# Import data.
# Forward slashes resolve on every operating system; a backslash-separated
# literal needs escaping and only works on Windows.
raw_df = pd.read_csv('../data/Online_Retail.csv')
raw_df.head()


In [None]:
# Understand data structure.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
raw_df.info()

In [None]:
# Number of missing values in each column of the raw data.
missing_per_column = raw_df.isna().sum()
missing_per_column

In [None]:
# Data Cleaning
# Drop the 'Country' and 'Description' columns; neither feeds the RFM
# features. 'InvoiceNo' stays because the frequency count is built from it.
processed_df = raw_df.drop(columns=['Country', 'Description'])

# Remove rows with quantity less than or equal to zero, i.e. keep strictly
# positive quantities only.
processed_df = processed_df[processed_df['Quantity'] > 0]

# Remove rows with missing CustomerID: they cannot be attributed to a customer.
processed_df = processed_df.dropna(subset=['CustomerID'])

# Reset the index after removing rows so it is contiguous again.
processed_df = processed_df.reset_index(drop=True)

# Display the first few rows of the processed DataFrame
print(processed_df.head())

In [None]:
# Data Processing: fix column dtypes before aggregating.
processed_df['Quantity'] = processed_df['Quantity'].astype(int)
# NOTE(review): if CustomerID was parsed as a float column, astype(str) yields
# IDs such as '17850.0'; casting to int first would drop the suffix - confirm
# the column dtype before changing this.
processed_df['CustomerID'] = processed_df['CustomerID'].astype(str)
# info() prints its own report and returns None, so it is not wrapped in print().
processed_df.info()

In [None]:
# Monetary: total amount spent per customer.
processed_df['Amount'] = processed_df['Quantity'] * processed_df['UnitPrice']

# reset_index() returns a new object, so its result must be assigned; this
# turns the grouped Series into a DataFrame with CustomerID as a real column.
rfm_ds_n = processed_df.groupby('CustomerID')['Amount'].sum().reset_index()
rfm_ds_n.columns = ['CustomerID', 'Amount']
print(rfm_ds_n)

In [None]:
# Frequency: number of non-null InvoiceNo entries recorded per customer.
# NOTE(review): count() tallies rows, so an invoice spread over several rows
# is counted once per row - confirm this is the intended definition
# (nunique() would count distinct invoices instead).
invoice_counts = processed_df.groupby('CustomerID')['InvoiceNo'].count()
rfm_ds_f = invoice_counts.reset_index(name='Frequency')
print(rfm_ds_f)

In [None]:
# Recency: whole days between each customer's most recent purchase and the
# latest invoice date found in the data.
processed_df['InvoiceDate'] = pd.to_datetime(
    processed_df['InvoiceDate'], format='%m/%d/%Y %H:%M'
)

# Series.max() is vectorised and skips missing timestamps, unlike the Python
# builtin max(), which compares element by element.
max_date = processed_df['InvoiceDate'].max()
processed_df['Diff'] = max_date - processed_df['InvoiceDate']

# The smallest gap per customer corresponds to their latest purchase.
rfm_ds_p = processed_df.groupby('CustomerID')['Diff'].min().reset_index()
rfm_ds_p.columns = ['CustomerID', 'Diff']
rfm_ds_p['Diff'] = rfm_ds_p['Diff'].dt.days
print(rfm_ds_p)

In [None]:
# Join the monetary, frequency and recency tables on CustomerID, keeping only
# customers present in all three, then give the columns their RFM names.
amount_and_frequency = pd.merge(rfm_ds_n, rfm_ds_f, how='inner', on='CustomerID')
rfm_ds_final = pd.merge(amount_and_frequency, rfm_ds_p, how='inner', on='CustomerID')
rfm_ds_final.columns = ['CustomerID', 'Amount', 'Frequency', 'Recency']
print(rfm_ds_final)

In [None]:
# Outliers
# Apply the 1.5 * IQR rule one feature at a time. Each pass filters the rows
# that survived the previous pass, in the order Amount -> Recency -> Frequency,
# so the quartiles of later features are computed on already-filtered data.
for feature in ('Amount', 'Recency', 'Frequency'):
    values = rfm_ds_final[feature]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    # Strict inequalities: values exactly on a fence are dropped.
    within_fences = (values > Q1 - 1.5*IQR) & (values < Q3 + 1.5*IQR)
    rfm_ds_final = rfm_ds_final[within_fences]

In [None]:
# Rows and columns left in the RFM table after outlier removal.
n_rows, n_cols = rfm_ds_final.shape
print((n_rows, n_cols))

In [None]:
# Scaling: min-max scale the three RFM features with MinMaxScaler.
X = rfm_ds_final.loc[:, ['Amount', 'Frequency', 'Recency']]
scaler = MinMaxScaler()
rfm_ds_scaled = scaler.fit(X).transform(X)

In [None]:
# Wrap the scaled values in a DataFrame with named feature columns.
rfm_ds_scaled = pd.DataFrame(
    rfm_ds_scaled,
    columns=['Amount', 'Frequency', 'Recency'],
)
rfm_ds_scaled.head()

# Data Visualisation

In [None]:
# Plot from an independent copy so the scaled table used for modelling
# is never modified by the visualisation cells.
rfm_df = rfm_ds_scaled.copy(deep=True)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# rfm_df only carries the scaled Amount/Frequency/Recency columns, so the
# customer identifiers needed for the x-axis are attached here. The scaled
# table was built row-for-row from rfm_ds_final, hence the positional
# (to_numpy) assignment rather than an index-aligned one.
per_customer_df = rfm_df.assign(
    CustomerID=rfm_ds_final['CustomerID'].to_numpy()
)

# (column to plot, title and y-axis label, colour) for each RFM metric.
metric_specs = [
    ('Recency', 'Recency', 'blue'),
    ('Frequency', 'Frequency', 'red'),
    ('Amount', 'Monetary', 'green'),
]

# One scatter plot per metric first, then one bar plot per metric.
for draw_plot in (sns.scatterplot, sns.barplot):
    for column, label, colour in metric_specs:
        plt.figure(figsize=(10, 6))
        draw_plot(data=per_customer_df, x='CustomerID', y=column,
                  color=colour, edgecolor='k')
        plt.title(label)
        plt.xlabel('CustomerID')
        plt.ylabel(label)
        plt.show()


In [None]:
# Pairwise scatter plots of the scaled RFM features.
# The 'Amount' column is presented as 'Monetary' on axes and in titles.
display_names = {'Frequency': 'Frequency', 'Recency': 'Recency', 'Amount': 'Monetary'}
feature_pairs = [
    ('Frequency', 'Recency'),
    ('Frequency', 'Amount'),
    ('Recency', 'Amount'),
]

for x_col, y_col in feature_pairs:
    x_name = display_names[x_col]
    y_name = display_names[y_col]

    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=rfm_df, x=x_col, y=y_col, sizes=(50, 500),
                    alpha=0.6, edgecolor='k', color='blue')
    plt.title(f'{x_name} vs {y_name}')
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.show()

In [None]:
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# 3D scatter of the scaled features: Recency (x), Frequency (y), Amount (z).
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

scatter = ax.scatter(
    rfm_df['Recency'],
    rfm_df['Frequency'],
    rfm_df['Amount'],
    c='blue',
    alpha=0.6,
    edgecolor='k',
    s=50,
)

# Axis labels and title in one call; 'Amount' is shown as 'Monetary'.
ax.set(
    xlabel='Recency',
    ylabel='Frequency',
    zlabel='Monetary',
    title='3D Plot of Recency, Frequency, and Monetary',
)

plt.show()

# Modelling

In [None]:
# Model creation: initial 3-cluster k-means fit on the scaled RFM features.
# random_state is fixed (the same seed the final pipeline uses) so the cluster
# assignments are reproducible from run to run.
kmeans = KMeans(n_clusters=3, max_iter=50, random_state=42)
kmeans.fit(rfm_ds_scaled)
lbs = kmeans.labels_
print(lbs)

In [None]:
# Model creation (this cell repeats the preceding cell's 3-cluster fit).
# random_state is fixed (the same seed the final pipeline uses) so the cluster
# assignments are reproducible from run to run.
kmeans = KMeans(n_clusters=3, max_iter=50, random_state=42)
kmeans.fit(rfm_ds_scaled)
lbs = kmeans.labels_
print(lbs)

In [None]:
# Silhouette score for each candidate number of clusters.
# random_state is fixed so the scores (and therefore the choice of k) do not
# change between runs because of random centroid initialisation.
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]
for num_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50, random_state=42)
    cluster_labels = kmeans.fit_predict(rfm_ds_scaled)
    silhouette_avg = silhouette_score(rfm_ds_scaled, cluster_labels)
    print(f'For n_clusters={num_clusters}, the silhouette score is {silhouette_avg}')

In [None]:
# Work on an independent copy: the cluster labels are later written into X,
# and without the copy that write would land in rfm_ds_final, which is itself
# a filtered slice of the merged RFM table.
X = rfm_ds_final.copy()
X.head()

In [None]:
# Final model: standardise the RFM features, then cluster them with k-means.
feature_columns = ['Amount', 'Frequency', 'Recency']
final_model = make_pipeline(
    StandardScaler(),
    KMeans(n_clusters=3, random_state=42)
)

# Fit on the behavioural features only. X also holds CustomerID (an
# identifier, not a behaviour) and, once the cell below has run, the 'labels'
# column; neither may influence the clustering.
labels = final_model.fit_predict(X[feature_columns])
print(labels[:10])

In [None]:
# Store each customer's cluster assignment, read from the fitted k-means step
# of the pipeline, alongside their RFM values.
fitted_kmeans = final_model.named_steps["kmeans"]
X['labels'] = fitted_kmeans.labels_
print(labels[:10])

In [None]:
import plotnine as pn
from plotnine import *

# Cluster label as text, mapped to the point fill.
cluster_fill = X['labels'].astype(str)

# x-axis: recency in days with thousands separators, limited to 0-400.
x_axis = scale_x_continuous(
    name='Recency (Days)',
    labels=lambda breaks: [f'{x:,.0f}' for x in breaks],
    limits=(0, 400),
)

# y-axis: amount with an 'R' (Rand) prefix and thousands separators.
y_axis = scale_y_continuous(
    name='Amount (Rands)',
    labels=lambda breaks: [f'R{y:,.0f}' for y in breaks],
)

# Assemble the plot layer by layer.
p = ggplot(X, aes(x='Recency', y='Amount'))
p = p + geom_point(aes(fill=cluster_fill))       # one point per customer
p = p + geom_smooth(color="blue", se=False)      # blue smooth trend line
p = p + x_axis + y_axis
p = p + labs(title="Customer Clusters: Recency vs Amount")
p = p + theme_classic()

# Show the plot
print(p)


In [None]:
import plotnine as pn
from plotnine import *

# Cluster label as text, mapped to the point fill.
cluster_fill = X['labels'].astype(str)

# x-axis: frequency with thousands separators, limited to 0-250.
x_axis = scale_x_continuous(
    name='Frequency',
    labels=lambda breaks: [f'{x:,.0f}' for x in breaks],
    limits=(0, 250),
)

# y-axis: amount with an 'R' (Rand) prefix and thousands separators.
y_axis = scale_y_continuous(
    name='Amount (Rands)',
    labels=lambda breaks: [f'R{y:,.0f}' for y in breaks],
)

# Assemble the plot layer by layer.
p = ggplot(X, aes(x='Frequency', y='Amount'))
p = p + geom_point(aes(fill=cluster_fill))       # one point per customer
p = p + geom_smooth(color="blue", se=False)      # blue smooth trend line
p = p + x_axis + y_axis
p = p + labs(title="Customer Clusters: Frequency vs Amount")
p = p + theme_classic()

# Show the plot
print(p)

# Web Application

# Future Work And Reflection

# Conclusion