<a href="https://colab.research.google.com/github/Keerthana8888/Customer-Segmentation/blob/main/Code%20file%20/%20Segmentation_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Libraries And Dataset


In [None]:
# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans , AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import LabelEncoder
from scipy.cluster.hierarchy import linkage , dendrogram
import warnings
warnings.simplefilter(action = 'ignore')

In [None]:
# Read the Online Retail workbook into a DataFrame.
# The path is hard-coded to the Colab /content directory.
dataset = pd.read_excel('/content/Online Retail.xlsx')

In [None]:
# Print column dtypes and non-null counts for the raw data.
dataset.info()

In [None]:
# Count missing values per column.
dataset.isnull().sum()

In [None]:
# Drop, in place, every row that has a missing value in any column.
dataset.dropna(inplace=True)

In [None]:
# Re-inspect dtypes and non-null counts after dropping incomplete rows.
dataset.info()

# K-Means Clustering

In [None]:
# RFM feature engineering
# Value of each row: quantity multiplied by unit price.
dataset['TotalPrice'] = dataset['Quantity'] * dataset['UnitPrice']

# Reference date for recency: the latest invoice date in the whole dataset.
# Computed once here instead of inside the per-customer lambda, where the
# full column would be rescanned for every customer group.
latest_invoice_date = dataset['InvoiceDate'].max()

# Build one row per CustomerID:
#   InvoiceDate -> whole days between the customer's latest invoice and the
#                  reference date
#   InvoiceNo   -> count of non-null InvoiceNo entries for the customer
#   TotalPrice  -> sum of TotalPrice over the customer's rows
# NOTE(review): 'count' counts rows, not distinct invoices; if frequency is
# meant to be the number of separate orders, 'nunique' may be intended - confirm.
customer_dataset = dataset.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (latest_invoice_date - x.max()).days,
    'InvoiceNo': 'count',
    'TotalPrice': 'sum',
})

# Rename the aggregated columns to their RFM names.
customer_dataset.rename(columns={'InvoiceDate': 'recency',
                                 'InvoiceNo': 'frequency',
                                 'TotalPrice': 'monetary'}, inplace=True)

customer_dataset.info()

In [None]:
# Show 10 randomly chosen customers (no random_state is set, so the rows
# differ between runs).
customer_dataset.sample(10)

In [None]:
# Summary statistics for the recency, frequency and monetary columns.
customer_dataset.describe()

In [None]:
# Pairwise plots of the recency, frequency and monetary features.
sns.pairplot(customer_dataset)

In [None]:
# Min-max scaling is used because the goal is only to bring the features onto
# a common range, not to centre them on a zero mean.
scaler = MinMaxScaler().fit(customer_dataset)
norm_customer = scaler.transform(customer_dataset)

print(norm_customer)

In [None]:
# Fit K-Means once per candidate number of clusters and record each model's
# inertia (the SSE value plotted below).
cluster_range = range(1, 11)
sse = []
for k in cluster_range:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=33).fit(norm_customer)
    sse.append(kmeans.inertia_)

# Elbow plot: SSE against the number of clusters.
plt.plot(cluster_range, sse, marker='o')
plt.title('Elbow plot')
plt.xlabel('n_clusters')
plt.ylabel('sse')

In [None]:
# Create the final K-Means model; n_init and random_state match the values
# used in the elbow search.
# NOTE(review): n_clusters=4 is presumably read off the elbow plot - confirm.
fin_kmeans = KMeans(n_clusters=4, n_init=10, random_state=33)

# Fit on the scaled customer features.
fin_kmeans.fit(norm_customer)

In [None]:
# Wrap the scaled feature matrix in a DataFrame that reuses the columns and
# index of customer_dataset, then attach each row's K-Means cluster label.
fin_dataset = pd.DataFrame(
    norm_customer,
    index=customer_dataset.index,
    columns=customer_dataset.columns,
).assign(cluster=fin_kmeans.labels_)
fin_dataset

In [None]:
# Mean of each feature per K-Means cluster.
rfm_features = ['recency', 'frequency', 'monetary']
cluster_mean = fin_dataset.groupby('cluster')[rfm_features].mean().reset_index()

# One bar chart per feature, stacked vertically in a single figure.
fig, axes = plt.subplots(nrows=3, figsize=(4, 6))
for ax, feature in zip(axes, rfm_features):
    sns.barplot(cluster_mean, x='cluster', y=feature, ax=ax)

plt.tight_layout()

In [None]:
# Number of rows assigned to each K-Means cluster, largest cluster first.
fin_dataset['cluster'].value_counts()

# Hierarchical Clustering

In [None]:
# Build the hierarchical linkage of the scaled customer features using
# Ward's method.
merging = linkage(norm_customer, method='ward')
# Draw the tree with leaf labels rotated 90 degrees.
# NOTE(review): every row of norm_customer becomes a leaf, so the x-axis
# labels may be unreadable for large inputs - confirm this is acceptable.
dendrogram(merging, leaf_rotation=90)
plt.xlabel('Data Points')
plt.ylabel('Euclidean distance')
plt.show()

In [None]:
# Agglomerative clustering with Ward linkage on Euclidean distance, cut at
# 3 clusters.
# NOTE(review): 3 clusters here versus 4 for K-Means; presumably chosen from
# the dendrogram - confirm.
hc = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='ward')
# Fit on the scaled features and keep one cluster label per row.
cluster_hc = hc.fit_predict(norm_customer)

In [None]:
# Wrap the scaled feature matrix in a DataFrame that reuses the columns and
# index of customer_dataset, then attach the hierarchical cluster labels.
with_hc_dataset = pd.DataFrame(
    norm_customer,
    index=customer_dataset.index,
    columns=customer_dataset.columns,
).assign(cluster=cluster_hc)
with_hc_dataset.head()

In [None]:
# Number of rows assigned to each hierarchical cluster, smallest cluster first.
with_hc_dataset['cluster'].value_counts(ascending=True)

In [None]:
# Mean of each feature per hierarchical cluster.
rfm_features = ['recency', 'frequency', 'monetary']
cluster_mean = with_hc_dataset.groupby('cluster')[rfm_features].mean().reset_index()

# One bar chart per feature, stacked vertically in a single figure.
fig, axes = plt.subplots(nrows=3, figsize=(4, 6))
for ax, feature in zip(axes, rfm_features):
    sns.barplot(cluster_mean, x='cluster', y=feature, ax=ax)

# Match the K-Means cluster figure: with three rows in a 4x6 inch figure the
# axis labels of neighbouring subplots can overlap unless the layout is tightened.
plt.tight_layout()
plt.show()