# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw transaction table from the Excel export.
data_path = r'C:\Users\hamza\Downloads\ML Tasks\Customer Segmentation Dataset.xlsx'
customer_data = pd.read_excel(data_path)

# Make sure invoice dates are real timestamps so date arithmetic works later.
customer_data['InvoiceDate'] = pd.to_datetime(customer_data['InvoiceDate'])

# Keep only rows with a strictly positive quantity and unit price.
has_positive_amounts = (customer_data['Quantity'] > 0) & (customer_data['UnitPrice'] > 0)

# Take an explicit copy of the filtered rows: adding a column to a filtered
# selection otherwise triggers pandas' SettingWithCopyWarning and leaves it
# ambiguous whether the assignment targets a view or a copy.
customer_data = customer_data.loc[has_positive_amounts].copy()

# Monetary value of each row.
customer_data['TotalPrice'] = customer_data['Quantity'] * customer_data['UnitPrice']

# Reference date for recency: one day after the most recent invoice in the
# data, so the most recently active customer has a recency of at least 1 day.
snapshot_date = customer_data['InvoiceDate'].max() + pd.Timedelta(days=1)

# Build one row per customer with the three RFM features:
#   Recency   - days between the snapshot date and the customer's last invoice
#   Frequency - number of distinct invoices placed by the customer
#   Monetary  - total amount spent by the customer
# Frequency uses 'nunique' rather than 'count': 'count' tallies rows, and an
# invoice that spans several rows (presumably one per line item -- each row
# carries its own Quantity and UnitPrice) would be counted once per row,
# inflating the number of purchases.
rfm = customer_data.groupby('CustomerID').agg(
    Recency=('InvoiceDate', lambda dates: (snapshot_date - dates.max()).days),
    Frequency=('InvoiceNo', 'nunique'),
    Monetary=('TotalPrice', 'sum'),
)

# Rescale every RFM feature to the [0, 1] range so that no single feature
# dominates the distance computations used by the clustering algorithms.
scaler = MinMaxScaler().fit(rfm)
scaled_values = scaler.transform(rfm)

# Re-wrap the scaled array so it keeps the customer index and column names.
rfm_scaled = pd.DataFrame(scaled_values, index=rfm.index, columns=rfm.columns)

# Fit one K-Means model for each candidate cluster count from 2 to 9.
candidate_models = [
    KMeans(n_clusters=n_clusters, random_state=42).fit(rfm_scaled)
    for n_clusters in range(2, 10)
]

# Within-cluster sum of squared distances for each candidate (elbow method).
sse = [model.inertia_ for model in candidate_models]

# Mean silhouette coefficient for each candidate's cluster assignment.
silhouette_scores = [
    silhouette_score(rfm_scaled, model.labels_) for model in candidate_models
]

# Plot the elbow curve (SSE) and the silhouette score against the number of
# clusters. The two metrics live on different scales (silhouette is bounded
# by [-1, 1], SSE is not), so each gets its own y-axis; sharing one axis would
# flatten the silhouette curve and make it unreadable.
k_values = range(2, 10)  # same candidate cluster counts the metrics were computed for

elbow_fig, sse_axis = plt.subplots(figsize=(10, 6))
sse_lines = sse_axis.plot(
    k_values, sse, marker='o', color='tab:blue', label='SSE (Elbow Method)'
)
sse_axis.set_xlabel('Number of Clusters')
sse_axis.set_ylabel('SSE')

# Secondary y-axis sharing the same x-axis for the silhouette score.
silhouette_axis = sse_axis.twinx()
silhouette_lines = silhouette_axis.plot(
    k_values, silhouette_scores, marker='x', color='tab:orange', label='Silhouette Score'
)
silhouette_axis.set_ylabel('Silhouette Score')

sse_axis.set_title('Elbow Method and Silhouette Score')

# Lines drawn on twin axes do not share a legend automatically; merge them.
all_lines = sse_lines + silhouette_lines
sse_axis.legend(all_lines, [line.get_label() for line in all_lines])
plt.show()

# Select the clustering inputs by name. Positional slicing (iloc[:, :-1],
# iloc[:, :-2]) only works while the label columns are appended in exactly
# this order, and re-running the cell would silently feed previously stored
# cluster labels back in as features.
feature_columns = ['Recency', 'Frequency', 'Monetary']
rfm_features = rfm_scaled[feature_columns]

# K-Means with 4 clusters; labels are stored next to the scaled features.
kmeans = KMeans(n_clusters=4, random_state=42)
rfm_scaled['KMeans_Cluster'] = kmeans.fit_predict(rfm_features)

# DBSCAN on the same scaled RFM features; it labels noise points as -1.
dbscan = DBSCAN(eps=0.3, min_samples=5)
rfm_scaled['DBSCAN_Cluster'] = dbscan.fit_predict(rfm_features)

# Project the scaled RFM features onto two principal components for plotting.
pca = PCA(n_components=2)
rfm_pca = pca.fit_transform(rfm_features)

# Scatter plot of the customers in the 2-D PCA space, coloured by their
# K-Means cluster label.
first_component = rfm_pca[:, 0]
second_component = rfm_pca[:, 1]

plt.figure(figsize=(12, 8))
cluster_axis = sns.scatterplot(
    x=first_component,
    y=second_component,
    hue=rfm_scaled['KMeans_Cluster'],
    palette='Set1',
    s=80,
)
cluster_axis.set_title('K-Means Clustering Visualization with PCA')
cluster_axis.set_xlabel('PCA Component 1')
cluster_axis.set_ylabel('PCA Component 2')
cluster_axis.legend(title='Cluster')
plt.show()

# Attach each customer's K-Means label to the unscaled RFM table so the
# per-cluster averages are expressed in the original units.
rfm = rfm.assign(Cluster=rfm_scaled['KMeans_Cluster'])

# Mean of every RFM column within each cluster.
cluster_summary = rfm.groupby('Cluster').agg('mean')
print("Cluster Summary:\n", cluster_summary)

# Write the labelled RFM table (indexed by customer) to CSV.
output_path = r'C:\Users\hamza\Downloads\ML Tasks\RFM_Clustered.csv'
rfm.to_csv(output_path)
print("Preprocessed RFM dataset and clustering results saved.")
