# Import dependencies

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

# load the cleaned data

In [2]:
# Input files: one frame per clustering approach explored in this project.
KM_CSV = "../data/kmeans_data.csv"
KP_CSV = "../data/k_pro_data.csv"

km_data, kp_data = pd.read_csv(KM_CSV), pd.read_csv(KP_CSV)

In [3]:
# Preview the first rows of the frame read from kmeans_data.csv.
km_data.head()

Unnamed: 0,minutes_watched,clv,clv_wins,region_USA/Canada/As,region_West_EU,channel_Friend,channel_Google,channel_Instagram,channel_LinkedIn,channel_Other,channel_Twitter,channel_YouTube
0,3197.0,205.42,205.42,0,0,0,0,0,1,0,0,0
1,63.0,149.99,149.99,1,0,0,1,0,0,0,0,0
2,605.0,119.0,119.0,0,0,0,0,0,0,0,0,0
3,20.0,240.0,240.0,0,0,0,0,0,1,0,0,0
4,245.0,184.36,184.36,0,1,0,0,0,1,0,0,0


In [4]:
# Preview the first rows of the frame read from k_pro_data.csv.
kp_data.head()

Unnamed: 0.1,Unnamed: 0,minutes_watched,clv,clv_wins,region,channel
0,0,3197.0,205.42,205.42,Rest_of_the_world,LinkedIn
1,1,63.0,149.99,149.99,USA/Canada/As,Google
2,2,605.0,119.0,119.0,Rest_of_the_world,Facebook
3,3,20.0,240.0,240.0,Rest_of_the_world,LinkedIn
4,4,245.0,184.36,184.36,West_EU,LinkedIn


# Dropping repeated columns

In [5]:
# Build the K-Means feature frame.
# - `clv` is dropped as a repeat of `clv_wins` (presumably its winsorized
#   version — TODO confirm against the cleaning notebook).
# - "Unnamed: N" columns are leftover row indexes from the CSV round-trip;
#   they are row numbers, not customer features, so they must not reach the model.
index_artifacts = [col for col in km_data.columns if str(col).startswith("Unnamed")]
kmd = km_data.drop(columns=["clv", *index_artifacts])

In [6]:
# Same de-duplication for the mixed-type frame: remove the `clv` column.
kpd = kp_data.drop("clv", axis="columns")

---

# Scaling

In [7]:
from sklearn.preprocessing import MinMaxScaler , StandardScaler

In [8]:
# Scaler instance applied to the continuous columns in the following cell.
scale = StandardScaler()

In [9]:
# Standardize only the continuous columns; the one-hot columns are left as 0/1.
# Note: this overwrites the columns of `km_data` in place.
numeric_cols = ["minutes_watched", "clv_wins"]
scaled_values = scale.fit_transform(km_data.loc[:, numeric_cols])
km_data.loc[:, numeric_cols] = scaled_values

In [10]:
# Inspect the frame after scaling: `minutes_watched` and `clv_wins` are now
# standardized, while `clv` still holds its raw values.
km_data

Unnamed: 0,minutes_watched,clv,clv_wins,region_USA/Canada/As,region_West_EU,channel_Friend,channel_Google,channel_Instagram,channel_LinkedIn,channel_Other,channel_Twitter,channel_YouTube
0,0.173536,205.42,1.414667,0,0,0,0,0,1,0,0,0
1,-0.255845,149.99,0.535605,1,0,0,1,0,0,0,0,0
2,-0.181587,119.00,0.044135,0,0,0,0,0,0,0,0,0
3,-0.261736,240.00,1.963070,0,0,0,0,0,1,0,0,0
4,-0.230910,184.36,1.080677,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3829,-0.262969,79.00,-0.590224,1,0,0,0,0,0,0,0,1
3830,-0.260640,79.00,-0.590224,1,0,0,0,0,0,0,0,0
3831,-0.256667,36.00,-1.272159,0,0,0,0,0,0,0,0,0
3832,-0.258448,36.00,-1.272159,0,0,0,1,0,0,0,0,0


In [11]:
# Rebuild the K-Means feature frame from the scaled data.
# - `clv` is dropped as a repeat of `clv_wins`.
# - "Unnamed: N" columns are leftover row indexes from the CSV round-trip.
#   Left in, the unscaled row number (0..N-1) would dominate the Euclidean
#   distances next to standardized / 0-1 features and the clusters would
#   mostly reflect row order rather than customer behaviour.
index_artifacts = [col for col in km_data.columns if str(col).startswith("Unnamed")]
kmd = km_data.drop(columns=["clv", *index_artifacts])

# Model

In [12]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

In [13]:
import mlflow 
import mlflow.sklearn

In [16]:
# Point MLflow at the tracking server and select the experiment that
# collects every run produced by this notebook.
TRACKING_URI = "http://localhost:5000"
EXPERIMENT_NAME = "Customer_Segmentation_Experiment_1"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

In [17]:

#-----------------------
# 1-2. fit K-Means for every candidate K and log each run to MLflow
#-----------------------
K_VALUES = range(2, 24)  # candidate cluster counts: K = 2 .. 23

# Loop-invariant: the same five-row sample is attached to every logged model.
input_example = kmd[:5]

results = []

for k in K_VALUES:
    with mlflow.start_run(run_name=f"KMeans_K_{k}"):

        kmeans = KMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(kmd)

        # Silhouette is maximized and Davies-Bouldin minimized when picking K below.
        metrics = {
            'Silhouette': silhouette_score(kmd, labels),
            'Davies_Bouldin': davies_bouldin_score(kmd, labels),
            'Inertia': kmeans.inertia_,
        }
        results.append({'K': k, **metrics})

        mlflow.log_param("K", k)
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        mlflow.sklearn.log_model(
            sk_model=kmeans,
            name=f"kmeans_model_k_{k}",
            input_example=input_example)

# One row per K with all collected metrics
results_df = pd.DataFrame(results)
print(results_df)

#-----------------------
# 3. choosing best K
#-----------------------
best_k_sil = results_df.loc[results_df['Silhouette'].idxmax(), 'K']
best_k_dbi = results_df.loc[results_df['Davies_Bouldin'].idxmin(), 'K']

print(f"\n✅ best K depending on Silhouette = {best_k_sil}")
print(f"✅ best K depending on Davies–Bouldin = {best_k_dbi}")

#-----------------------
# 4. plotting
#-----------------------
fig, (ax_sil, ax_dbi) = plt.subplots(1, 2, figsize=(10, 4))

ax_sil.plot(results_df['K'], results_df['Silhouette'], 'o-')
ax_sil.set_xlabel('K')
ax_sil.set_ylabel('Silhouette Score')
ax_sil.set_title('Silhouette vs K')

ax_dbi.plot(results_df['K'], results_df['Davies_Bouldin'], 'o-', color='orange')
ax_dbi.set_xlabel('K')
ax_dbi.set_ylabel('Davies–Bouldin Index')
ax_dbi.set_title('Davies–Bouldin vs K')

fig.tight_layout()
plt.show()

KeyboardInterrupt: 

In [None]:
# Elbow curve: inertia recorded for each K in the sweep.
fig, ax = plt.subplots()
ax.plot(results_df["K"], results_df["Inertia"], 'o-')
ax.set_xlabel("K")
ax.set_ylabel("Inertia")
ax.set_title("Elbow Method")
plt.show()


In [None]:
from sklearn.metrics import silhouette_samples
import matplotlib.cm as cm

# Per-sample silhouette plot for the chosen number of clusters.
best_k = 10
kmeans = KMeans(n_clusters=best_k, random_state=42)
labels = kmeans.fit_predict(kmd)
sample_values = silhouette_samples(kmd, labels)

y_lower = 10  # vertical gap left before the first cluster band
cluster_tick_positions = []
for i in range(best_k):
    # Sorted silhouette values of the samples assigned to cluster i.
    ith_cluster_silhouette_values = np.sort(sample_values[labels == i])
    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i
    color = cm.nipy_spectral(float(i) / best_k)
    plt.fill_betweenx(np.arange(y_lower, y_upper),
                      0, ith_cluster_silhouette_values,
                      facecolor=color, edgecolor=color, alpha=0.7)
    # Remember the middle of this band so the y-axis can name the cluster.
    cluster_tick_positions.append((y_lower + y_upper) / 2)
    y_lower = y_upper + 10  # gap between consecutive cluster bands

# Label each band with its cluster id; the raw ticks would otherwise be
# cumulative sample offsets, which do not match the "Cluster Label" axis title.
plt.yticks(cluster_tick_positions, [str(i) for i in range(best_k)])
plt.xlabel("Silhouette Coefficient Values")
plt.ylabel("Cluster Label")
plt.title(f"Silhouette Plot for K = {best_k}")
plt.show()


In [None]:
# Within-cluster sum of squares (inertia) for every run of the sweep, in order.
wcss = [run["Inertia"] for run in results]

In [None]:
# Take the x-axis from the K values actually recorded in `results` instead of
# repeating the sweep range by hand: the two always have the same length, even
# if the sweep was interrupted part-way or its range is changed later.
k_range = [run["K"] for run in results]
plt.figure(figsize=(8,5))
plt.plot(k_range, wcss, 'bo-', markersize=8)
plt.xlabel('Number of Clusters K')
plt.ylabel('WCSS (Inertia)')
plt.title('Elbow Method For Optimal K')
plt.grid(True)
plt.show()

# Conclusion
- The best K is 10 according to the clustering metrics computed above.