In [2]:
import pandas as pd
import numpy as np
from kmodes.kmodes import KModes
from sklearn.metrics import silhouette_score

In [3]:
#import our dataset
# NOTE(review): this is an absolute, machine-specific location; it is kept in one named
# constant so the notebook can be repointed without touching the loading code.
DATA_PATH = r'C:\Users\Keshet\programmingyear3\final project\MLFP20000PPLFILTERED.csv'
df = pd.read_csv(DATA_PATH, index_col='_id')

#standardizing the df column names to lower and removing spaces to work comfortably
df.columns = df.columns.str.replace(' ', '').str.lower()

#adding the zero back to the phone number that was dropped
# Missing numbers must stay missing: a plain astype(str) would turn NaN into the text
# 'nan' ('0nan' after prefixing), which a later dropna() no longer treats as missing.
phone = df['phonenumber']
if pd.api.types.is_float_dtype(phone):
    # A numeric column containing NaN is read as float; go through nullable Int64
    # so the digits do not pick up a trailing '.0' when converted to text.
    phone = phone.astype('Int64')
df['phonenumber'] = ('0' + phone.astype(str)).where(phone.notna())

In [4]:
# Keep only fully populated records (any row with a missing value is discarded)
df = df.dropna()

In [5]:
# Restrict the frame to the answer columns that K-Modes will cluster on
clustering_columns = ['wantstotravelto', 'isspontanious', 'wantstoleaveon']
df_cluster = df.loc[:, clustering_columns]
df_cluster.head()

Unnamed: 0_level_0,wantstotravelto,isspontanious,wantstoleaveon
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
66472652705d24319dfd51ee,"Central Asia - For example, Kazakhstan, Uzbeki...",No,February
66472652705d24319dfd51f6,"Southern Europe - For example, Italy, Spain, G...",Yes,August
66472652705d24319dfd51fd,"Central Asia - For example, Kazakhstan, Uzbeki...",Yes,April
66472652705d24319dfd5201,"Eastern Europe - For example, Russia, Poland, ...",Yes,May
66472652705d24319dfd5205,"Southern Europe - For example, Italy, Spain, G...",Yes,March


In [6]:
# Sanity check: (rows, columns) of the frame after incomplete rows were dropped
print(df.shape)

(20218, 18)


In [7]:
# Function to perform K-Modes clustering
def cluster_data(df_cluster, n_clusters, random_state=None):
    """Cluster categorical rows with K-Modes and return the labels and the fitted model.

    Parameters
    ----------
    df_cluster : DataFrame holding only the columns to cluster on.
    n_clusters : number of clusters requested from K-Modes.
    random_state : optional seed forwarded to KModes. The default (None) keeps the
        previous unseeded behaviour; pass an int to make a run repeatable.

    Returns
    -------
    (clusters, km) : the per-row labels returned by ``fit_predict`` and the fitted
        ``KModes`` estimator.
    """
    km = KModes(
        n_clusters=n_clusters,
        init='Huang',
        n_init=20,
        verbose=1,
        random_state=random_state,
    )
    clusters = km.fit_predict(df_cluster)
    return clusters, km

In [8]:
def merge_small_clusters(df, cluster_column='cluster', min_size=10, clustering_columns=None):
    """Fold every undersized cluster into the most similar adequately sized cluster.

    Similarity is the number of columns whose per-cluster mode differs (fewer is
    closer). Ties go to the larger target, because targets are visited in
    ``value_counts`` order.

    Parameters
    ----------
    df : DataFrame containing ``cluster_column``. It is modified in place and also returned.
    cluster_column : name of the column holding the cluster labels.
    min_size : clusters with fewer rows than this are merged away (default 10).
    clustering_columns : columns used to compare clusters. Pass the columns the
        clustering was built on so unrelated fields (ids, phone numbers, ...) do not
        influence which cluster is "nearest". The default (None) compares every
        column of ``df``, as earlier versions did.

    Returns
    -------
    The same DataFrame, with small clusters relabelled.
    """
    cluster_sizes = df[cluster_column].value_counts()
    too_small = cluster_sizes[cluster_sizes < min_size].index.tolist()
    targets = cluster_sizes[cluster_sizes >= min_size].index.tolist()
    if not too_small or not targets:
        return df  # Nothing to merge, or no cluster large enough to merge into

    if clustering_columns is None:
        compare_columns = list(df.columns)
    else:
        compare_columns = list(clustering_columns)

    def _mode_profile(label):
        # Most frequent value of each compared column within one cluster
        members = df.loc[df[cluster_column] == label, compare_columns]
        return members.mode().iloc[0]

    # Target profiles are computed once, before any merge, so the result does not
    # depend on the order in which the small clusters are processed.
    target_profiles = {label: _mode_profile(label) for label in targets}

    for small_label in too_small:
        small_profile = _mode_profile(small_label)
        nearest_cluster = min(
            targets,
            key=lambda label: (small_profile != target_profiles[label]).sum(),
        )
        df.loc[df[cluster_column] == small_label, cluster_column] = nearest_cluster
    return df

In [9]:
# Maximum desired number of rows in any cluster; clusters above this are candidates for splitting
max_size = 60

In [10]:
def split_large_clusters(df, max_size, cluster_column='cluster', clustering_columns=None):
    """Break every cluster larger than ``max_size`` into sub-clusters.

    Each oversized cluster is re-clustered with K-Modes on ``clustering_columns`` and
    its rows are relabelled ``"<old label>_<sub index>"``. A cluster whose rows are all
    identical on those columns cannot be separated by K-Modes, so it is dealt out
    round-robin into equally sized groups instead.

    Parameters
    ----------
    df : DataFrame containing ``cluster_column``. It is modified in place and also returned.
    max_size : largest acceptable cluster size.
    cluster_column : name of the column holding the cluster labels.
    clustering_columns : columns to sub-cluster on. The default (None) uses every
        column except ``cluster_column``.

    Returns
    -------
    The same DataFrame, with oversized clusters relabelled. A K-Modes split can still
    leave a sub-cluster above ``max_size``; call again to split further.
    """
    cluster_sizes = df[cluster_column].value_counts()
    too_large = cluster_sizes[cluster_sizes > max_size].index.tolist()
    if not too_large:
        return df  # No large clusters to split

    if clustering_columns is None:
        feature_columns = [col for col in df.columns if col != cluster_column]
    else:
        feature_columns = list(clustering_columns)

    # Sub-cluster labels are strings such as "57_0"; make sure the label column can
    # hold them next to the original (typically integer) labels.
    if df[cluster_column].dtype != object:
        df[cluster_column] = df[cluster_column].astype(object)

    for cluster in too_large:
        in_cluster = df[cluster_column] == cluster
        members = df.loc[in_cluster, feature_columns]
        # Aim for 75% of max_size per sub-cluster to leave headroom for uneven splits
        num_sub_clusters = int(np.ceil(len(members) / (max_size * 0.75)))
        if num_sub_clusters <= 1:
            continue
        if len(members.drop_duplicates()) < 2:
            # All rows are identical on the clustering columns: deal them out evenly
            sub_clusters = np.arange(len(members)) % num_sub_clusters
        else:
            sub_k = KModes(n_clusters=num_sub_clusters, init='Huang', n_init=5, verbose=1)
            sub_clusters = sub_k.fit_predict(members)
        # Boolean-mask assignment is positional, so it is safe with duplicate index labels
        df.loc[in_cluster, cluster_column] = [f"{cluster}_{i}" for i in sub_clusters]
    return df

In [11]:
def verify_clusters(df, cluster_column='cluster'):
    """Print whether any row of ``df`` has a null value in ``cluster_column``."""
    has_unassigned = df[cluster_column].isnull().any()
    message = "There are unclustered rows." if has_unassigned else "All rows are clustered."
    print(message)

In [12]:
# Initial estimate of k: the number of clusters requested from the first K-Modes run
k = 200

In [None]:
# Run K-Modes once with k clusters and attach the resulting label to every record
clusters, km = cluster_data(df_cluster, k)
df = df.assign(cluster=clusters)

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 1508, cost: 9397.0
Run 1, iteration: 2/100, moves: 321, cost: 9397.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 1578, cost: 9426.0
Run 2, iteration: 2/100, moves: 134, cost: 9407.0
Run 2, iteration: 3/100, moves: 148, cost: 9407.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 1500, cost: 9456.0
Run 3, iteration: 2/100, moves: 193, cost: 9450.0
Run 3, iteration: 3/100, moves: 36, cost: 9450.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 1285, cost: 9445.0
Run 4, iteration: 2/100, moves: 576, cost: 9430.0
Run 4, iteration: 3/100, moves: 87, cost: 9430.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...


In [None]:
# One-hot encode the categorical clustering columns so a numeric distance metric can be applied to them
df_encoded = pd.get_dummies(df_cluster)

In [None]:
# Score the initial K-Modes labelling (`clusters`) against the one-hot encoded answers.
# NOTE(review): Euclidean distance on one-hot columns is presumably not the dissimilarity
# K-Modes optimised (category mismatches); consider metric='hamming' -- confirm before comparing scores.
score = silhouette_score(df_encoded, clusters, metric='euclidean')  # distance is computed on the binary indicator columns
print("Silhouette Score: ", score)

In [None]:
# First pass: Iterative merging and splitting
iteration_limit = 50
for iteration in range(iteration_limit):
    # Compare the whole size distribution, not just the number of clusters: a pass that
    # merges one cluster and splits another leaves the count unchanged while still
    # changing the grouping. Sizes are also unaffected by sub-cluster relabelling.
    sizes_before = sorted(df['cluster'].value_counts().tolist())
    df = merge_small_clusters(df, 'cluster')
    # Use the configured limit rather than a second hard-coded copy of it
    df = split_large_clusters(df, max_size, 'cluster', clustering_columns)
    sizes_after = sorted(df['cluster'].value_counts().tolist())
    if sizes_before == sizes_after:
        break  # Stop once a full merge+split pass no longer changes the cluster sizes

In [None]:
# Apply additional splitting specifically for clusters still too large
additional_rounds = 5  # Number of additional rounds to attempt
for _ in range(additional_rounds):
    large_clusters = df['cluster'].value_counts()
    large_clusters = large_clusters[large_clusters > max_size].index.tolist()
    if not large_clusters:
        break  # Exit loop if no large clusters remain
    # Split against the same limit the check above uses (previously a literal 60)
    df = split_large_clusters(df, max_size, 'cluster', clustering_columns)

In [None]:
# Report whether any row is left without a cluster label after merging and splitting
verify_clusters(df)

In [None]:
# Save the results
# The frame is indexed by the record's `_id`; write the index out so each saved row
# can still be matched back to its source record (index=False silently dropped it).
df.to_csv('clustered_data.csv', index=True)

In [None]:
# Lift the row cap so the complete per-cluster size table is printed
pd.set_option('display.max_rows', None)

# Number of rows assigned to each cluster label, largest first
cluster_sizes = df['cluster'].value_counts()
print(cluster_sizes)

In [None]:
# Count the distinct cluster labels and report the total
total_clusters = df['cluster'].nunique()
print(f"Total number of clusters: {total_clusters}")

In [None]:
# Specify the cluster label you want to view
cluster_label_to_view = 57

# After the split pass the label column mixes original labels (e.g. 57) with string
# sub-cluster labels (e.g. "57_0"), so compare as text and include the sub-clusters;
# a plain `== 57` returns an empty frame once cluster 57 has been split.
labels_as_text = df['cluster'].astype(str)
wanted_label = str(cluster_label_to_view)
in_requested_cluster = (labels_as_text == wanted_label) | labels_as_text.str.startswith(wanted_label + '_')

# Filter the DataFrame to only include rows from the specified cluster
specific_cluster = df[in_requested_cluster]

# Display the records from the specified cluster
display(specific_cluster)
