Download the dataset and load the data

In [None]:
%pip install kagglehub
%pip install pandas
%pip install numpy
%pip install scikit-learn
%pip install matplotlib

import kagglehub
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

In [None]:
# Fetch the Online Retail dataset through kagglehub. `path` holds the value
# returned by dataset_download and is used below to locate the CSV file.
KAGGLE_DATASET = "thedevastator/online-retail-transaction-data"
path = kagglehub.dataset_download(KAGGLE_DATASET)

# Report where the dataset files were placed
print(f"Path to dataset files: {path}")


Display the dataset and ETL process

In [None]:
# Read the transactions CSV from the downloaded dataset directory
data = pd.read_csv(f"{path}/online_retail.csv")

# Preview the first rows (uncomment data.info() for dtypes and null counts)
data.head()
#data.info()

In [None]:

# Check the total number of rows before cleaning
print(f"Total rows before cleaning: {data.shape[0]}")

# Cleaning the data, written as one chain so every step returns a new frame:
#   1. parse InvoiceDate as datetime;
#   2. keep only rows whose Quantity and UnitPrice are strictly positive
#      (this drops zero values as well as negative ones);
#   3. add TotalPrice = Quantity * UnitPrice.
# Using assign() on the filtered result (instead of assigning a column into a
# boolean-indexed slice) avoids pandas' SettingWithCopyWarning.
data = (
    data
    .assign(InvoiceDate=pd.to_datetime(data['InvoiceDate']))
    .loc[lambda df: (df['Quantity'] > 0) & (df['UnitPrice'] > 0)]
    .assign(TotalPrice=lambda df: df['Quantity'] * df['UnitPrice'])
)

# Check the total number of rows after cleaning
print(f"Total rows after cleaning: {data.shape[0]}")

# Display the first few rows of the cleaned dataset
data.head()



Compute the per-customer RFM (Recency, Frequency, Monetary) features used by the clustering analysis

In [None]:
# Reference point for recency: the latest invoice timestamp in the cleaned data
current_date = data['InvoiceDate'].max()

# RFM table, one row per CustomerID, built with named aggregations:
#   Recency   - whole days between the customer's latest invoice and current_date
#   Frequency - count of non-null InvoiceNo entries (i.e. rows) for the customer
#   Monetary  - sum of TotalPrice for the customer
# NOTE(review): Frequency counts invoice lines, not distinct invoices - confirm
# that this is the intended definition.
rfm = data.groupby('CustomerID').agg(
    Recency=('InvoiceDate', lambda dates: (current_date - dates.max()).days),
    Frequency=('InvoiceNo', 'count'),
    Monetary=('TotalPrice', 'sum'),
)

# 3. Standardise the three features with StandardScaler; the scaled array is
# what the KMeans and DBSCAN cells below are fitted on.
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm)

rfm.head()

Kmeans

In [None]:
# 4. Elbow method: fit KMeans for k = 1..10 and collect the inertia of each fit
k_range = range(1, 11)
inertia = [
    KMeans(n_clusters=k, random_state=42).fit(rfm_scaled).inertia_
    for k in k_range
]

# Number of clusters chosen from the elbow (assumed to be at k=4)
optimal_k = 4
elbow_inertia = inertia[optimal_k - 1]

# Elbow plot: inertia curve with the chosen k highlighted
elbow_fig, elbow_ax = plt.subplots(figsize=(8, 5))
elbow_ax.plot(k_range, inertia, marker='o', label="Inertia")

# Red marker on the elbow point
elbow_ax.scatter(optimal_k, elbow_inertia, color='red', s=100,
                 edgecolors='black', linewidth=2, zorder=5)

# Annotate the elbow with its inertia value, slightly right of and below the marker
elbow_ax.text(optimal_k + 0.1, elbow_inertia - 500,
              f'Inertia: {elbow_inertia:,.2f}', fontsize=12)

elbow_ax.set_xlabel('Number of clusters (k)')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Method for Determining Optimal Number of Clusters')

plt.show()

# 5. Fit KMeans with the chosen number of clusters and store each customer's label
kmeans_optimal = KMeans(n_clusters=optimal_k, random_state=42)
rfm['Cluster'] = kmeans_optimal.fit_predict(rfm_scaled)

# 6. Cluster sizes: absolute customer counts and their share of the total (in %)
cluster_sizes = rfm['Cluster'].value_counts()
cluster_counts = rfm['Cluster'].value_counts(normalize=True) * 100

# Report both figures for every cluster, largest cluster first
print("\nNumber and percentage of customers in each cluster:")
for cluster, size in cluster_sizes.items():
    percentage = cluster_counts[cluster]
    print(f"Cluster {cluster}: {size} customers ({percentage:.2f}%)")


# 7. 3D scatter plot of the customers in RFM space, one colour per KMeans cluster
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Palette for the clusters. It is defined here, under its own name, because the
# loop below previously indexed a `colors` list that was commented out, which
# raised NameError on a fresh kernel (and KeyError once the DBSCAN cell had
# bound `colors` to a dict keyed by label).
cluster_colors = ['blue', 'purple', 'yellow', 'orange']

for i, cluster in enumerate(np.unique(rfm['Cluster'])):
    members = rfm[rfm['Cluster'] == cluster]
    ax.scatter(members['Recency'],
               members['Frequency'],
               members['Monetary'],
               # Cycle through the palette so a larger optimal_k cannot overrun it
               c=cluster_colors[i % len(cluster_colors)],
               label=f'Cluster {cluster}', s=50, alpha=0.6)

# Add labels and title
ax.set_xlabel('Recency')
ax.set_ylabel('Frequency')
ax.set_zlabel('Monetary')
ax.set_title('3D Visualization of Clusters based on RFM')

# Add legend
ax.legend(title="Clusters")

# Show the plot
plt.show()


# Mean of each RFM variable within every cluster
rfm_columns = ['Recency', 'Frequency', 'Monetary']
cluster_centroids = rfm.groupby('Cluster')[rfm_columns].mean()
print(cluster_centroids)


In [None]:
# Per-sample silhouette coefficients for the KMeans labelling
silhouette_values = silhouette_samples(rfm_scaled, kmeans_optimal.labels_)

# Mean silhouette over all samples
silhouette_avg = silhouette_values.mean()
print(f"Average silhouette score: {silhouette_avg:.4f}")

# Silhouette plot: one horizontal band per cluster, stacked along the y axis
fig, ax = plt.subplots(figsize=(8, 6))

y_lower, y_upper = 0, 0
for i in range(optimal_k):
    # Scores of the samples assigned to cluster i, in ascending order
    cluster_silhouette_values = np.sort(silhouette_values[kmeans_optimal.labels_ == i])

    # The band occupies one y position per sample of the cluster
    y_upper = y_lower + cluster_silhouette_values.shape[0]
    band_rows = np.arange(y_lower, y_upper)
    ax.fill_betweenx(band_rows, 0, cluster_silhouette_values, alpha=0.7, label=f"Cluster {i}")

    # The next band starts where this one ended
    y_lower = y_upper

# Dashed vertical line at the average silhouette score
ax.axvline(x=silhouette_avg, color="red", linestyle="--", label=f"Average Silhouette: {silhouette_avg:.4f}")

# Labels, title and legend
ax.set(
    xlabel="Silhouette Score",
    ylabel="Cluster",
    title="Silhouette Score Distribution for Each Cluster",
)
ax.legend()

plt.show()

DBSCAN

In [None]:
# Number of neighbours used for the k-distance graph
k = 8

# Fit the neighbour index on the scaled RFM features
nearest_neighbors = NearestNeighbors(n_neighbors=k).fit(rfm_scaled)

# Distances (and indices) of the k nearest neighbours of every point.
# NOTE(review): the query set is the fitted set itself, so each point is
# presumably returned as its own first neighbour at distance 0, making column
# k-1 the distance to the (k-1)-th *other* point - confirm this is intended.
distances, indices = nearest_neighbors.kneighbors(rfm_scaled)

# Last neighbour column, sorted ascending, gives the k-distance curve
k_distances = np.sort(distances[:, k-1])

# Plot the k-distance graph
kdist_fig, kdist_ax = plt.subplots(figsize=(10, 6))
kdist_ax.plot(k_distances, label="k-distance")
kdist_ax.set_xlabel("Points sorted by distance to 8th nearest neighbor")
kdist_ax.set_ylabel("8-distance")
kdist_ax.set_title("K-distance Graph for DBSCAN (k = 8)")
kdist_ax.legend()
kdist_ax.grid()
plt.show()


In [None]:
# Mean distance between the noise points and their k-th nearest neighbour
def calculate_noise_distance(rfm_scaled, labels, k=8):
    """Return the mean k-th-neighbour distance of the DBSCAN noise points.

    Parameters
    ----------
    rfm_scaled : array indexable with a boolean mask
        Feature matrix the labels refer to; the neighbour index is fitted on it.
    labels : array
        Cluster label per row of ``rfm_scaled``; ``-1`` marks noise.
    k : int, default 8
        Number of neighbours requested for every noise point.

    Returns
    -------
    float
        Mean of the last (k-th) neighbour-distance column over all noise
        points, or ``np.nan`` when no point is labelled as noise.

    Notes
    -----
    NOTE(review): the noise points are rows of the fitted matrix, so each one
    is presumably returned as its own nearest neighbour (distance 0) and the
    last column is the distance to the (k-1)-th other point - confirm.
    """
    # Rows labelled as noise by DBSCAN
    is_noise = labels == -1
    noise_points = rfm_scaled[is_noise]

    # No outliers: nothing to average
    if len(noise_points) == 0:
        return np.nan

    # Query the k nearest neighbours of every noise point among all points
    neighbour_model = NearestNeighbors(n_neighbors=k).fit(rfm_scaled)
    neighbour_distances, _ = neighbour_model.kneighbors(noise_points)

    # Average the distance to the farthest of the k neighbours
    return neighbour_distances[:, -1].mean()

# Grid evaluation of DBSCAN over epsilon and minPts
def test_dbscan_combinations(rfm_scaled, epsilon_range, min_pts_range, k=8):
    """Run DBSCAN for every (epsilon, minPts) pair and tabulate the metrics.

    Parameters
    ----------
    rfm_scaled : array
        Feature matrix to cluster.
    epsilon_range : iterable
        Values passed to DBSCAN as ``eps``.
    min_pts_range : iterable
        Values passed to DBSCAN as ``min_samples``.
    k : int, default 8
        Forwarded to ``calculate_noise_distance``.

    Returns
    -------
    pandas.DataFrame
        One row per combination with the columns ``epsilon``, ``min_pts``,
        ``n_clusters`` (noise label excluded), ``silhouette_avg`` (``-1`` when
        fewer than two clusters were found) and ``noise_distance``.
    """
    def evaluate(epsilon, min_pts):
        # Cluster with the current parameter pair
        labels = DBSCAN(eps=epsilon, min_samples=min_pts).fit_predict(rfm_scaled)

        # Number of clusters, not counting the noise label (-1)
        n_clusters = len(set(labels) - {-1})

        # The silhouette is only computed with at least two clusters; otherwise -1
        if n_clusters > 1:
            silhouette_avg = silhouette_score(rfm_scaled, labels)
        else:
            silhouette_avg = -1

        return {
            'epsilon': epsilon,
            'min_pts': min_pts,
            'n_clusters': n_clusters,
            'silhouette_avg': silhouette_avg,
            'noise_distance': calculate_noise_distance(rfm_scaled, labels, k),
        }

    # Same iteration order as two nested loops: epsilon outer, minPts inner
    return pd.DataFrame([
        evaluate(epsilon, min_pts)
        for epsilon in epsilon_range
        for min_pts in min_pts_range
    ])

# Parameter grid: 10 evenly spaced epsilon values in [1, 2], minPts from 5 to 20
epsilon_range = np.linspace(1, 2, 10)
min_pts_range = range(5, 21)

# Evaluate every (epsilon, minPts) combination
results_df = test_dbscan_combinations(rfm_scaled, epsilon_range, min_pts_range)

# Show the results table
print(results_df)

# Select the combination with the highest silhouette score
best_row_label = results_df['silhouette_avg'].idxmax()
best_combination = results_df.loc[best_row_label]
print("\nBest combination based on silhouette score:")
print(best_combination)


################################################################################################
# Provisional code for the metric heat maps - it does not work yet.
#
# It relies on the live definitions earlier in this cell (calculate_noise_distance,
# test_dbscan_combinations, epsilon_range, min_pts_range, results_df, best_combination),
# so the commented-out duplicates of those are not repeated here.
#
# TODO: the sketch below references noise_distance_pivot, n_clusters_pivot,
# silhouette_pivot and axes, none of which is defined in this cell. They have to be
# created (presumably pivots of results_df and a three-panel figure) before the
# lines can be uncommented.

# Heatmap: Mean Noise Distance
#sns.heatmap(noise_distance_pivot, ax=axes[0], annot=True, fmt=".2f", cmap="Reds", cbar_kws={'label': 'Mean Noise Distance'})
#axes[0].set_title("METRIC: Mean Noise Points Distance")
#axes[0].set_xlabel("N")
#axes[0].set_ylabel("EPSILON")

# Heatmap: Number of Clusters
#sns.heatmap(n_clusters_pivot, ax=axes[1], annot=True, fmt="d", cmap="Purples", cbar_kws={'label': 'Number of Clusters'})
#axes[1].set_title("METRIC: Number of Clusters")
#axes[1].set_xlabel("N")
#axes[1].set_ylabel("EPSILON")

# Heatmap: Silhouette Score
#sns.heatmap(silhouette_pivot, ax=axes[2], annot=True, fmt=".2f", cmap="Blues", cbar_kws={'label': 'Silhouette Score'})
#axes[2].set_title("METRIC: Silhouette")
#axes[2].set_xlabel("N")
#axes[2].set_ylabel("EPSILON")

# Save or show the plots
#plt.tight_layout()
#plt.show()




In [None]:
# Build a labelled DataFrame of the scaled features for plotting.
# It gets its own name so that `rfm_scaled` stays the plain three-feature input
# the earlier cells are fitted on: overwriting it with a frame that also carries
# a 'Cluster' column made any re-run of those cells cluster on the labels too.
rfm_dbscan = pd.DataFrame(rfm_scaled, columns=['Recency', 'Frequency', 'Monetary'])

# Apply the DBSCAN algorithm
epsilon = 1.888889  # previously determined epsilon value
min_pts = 6         # previously determined min_pts value
dbscan = DBSCAN(eps=epsilon, min_samples=min_pts)
dbscan_labels = dbscan.fit_predict(rfm_dbscan)

# Store the cluster label of every customer (-1 marks noise)
rfm_dbscan['Cluster'] = dbscan_labels

# Create the figure for the 3D scatter plot
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

# Custom colours: blue and yellow for the first two clusters, red for noise.
# Any further label DBSCAN returns gets a colour from `extra_colors`, so the
# lookup below cannot raise KeyError when more than two clusters are found.
colors = {0: 'blue', 1: 'yellow', -1: 'red'}
extra_colors = ['green', 'purple', 'orange', 'cyan', 'magenta', 'brown']
# Labels actually present, clusters in ascending order and noise last
present_labels = sorted({int(label) for label in dbscan_labels},
                        key=lambda label: (label == -1, label))
for label in present_labels:
    colors.setdefault(label, extra_colors[label % len(extra_colors)])
point_colors = [colors[int(label)] for label in dbscan_labels]

# Plot points in 3D space, colored by cluster
scatter = ax.scatter(
    rfm_dbscan['Recency'], rfm_dbscan['Frequency'], rfm_dbscan['Monetary'],
    c=point_colors, marker='o', s=10
)

# Set axis labels
ax.set_xlabel('Recency')
ax.set_ylabel('Frequency')
ax.set_zlabel('Monetary')

# Set the plot title
ax.set_title('3D Scatter Plot of RFM Data with DBSCAN Clusters')

# Manually create the legend with empty proxy scatters, one entry per label
# that actually occurs in the result
for label in present_labels:
    legend_text = 'Noise' if label == -1 else f'Cluster {label}'
    ax.scatter([], [], [], c=colors[label], label=legend_text, s=30)

# Add the legend
ax.legend()

# Show the plot
plt.show()