# 1. Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.cluster import KMeans

from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import os
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances_argmin



# Reading Dataset

In [None]:
# Load the world-cities CSV and report its columns, shape and row count.
# The path is built with os.path.join instead of a hard-coded backslash
# string so that it resolves on POSIX systems as well as on Windows, and so
# that os.path.basename below yields just the file name on every platform.
dataSetPath = os.path.join("..", "1_DataSets", "worldcities.csv")

dataset = pd.read_csv(dataSetPath)
# File name only (e.g. "worldcities.csv"); reused later to name output folders.
dataSetName = os.path.basename(dataSetPath)

print(f"Dataset Columns: {dataset.columns.to_list()}")
print("****************************")
print(f"Dataset Shape: {dataset.shape}")
print("****************************")

# Row count of the raw file, kept to report how many rows later steps remove.
totalNumberOfRowInActualDataset = len(dataset)
print(f"Total Rows in dataset File: {totalNumberOfRowInActualDataset}")


# Pre-processing Steps

In [None]:
# Step 1: Remove fully duplicated rows in place, then report the new size
# and how many rows were dropped compared with the raw file.
dataset.drop_duplicates(inplace=True)
numberOfRowAfterRemovingDuplicates = dataset.shape[0]
removedDuplicateRows = totalNumberOfRowInActualDataset - numberOfRowAfterRemovingDuplicates

print(f"Dataset Shape after removing duplcates: {dataset.shape}")
print("****************************")
print(f"Remaining Rows in Dataset: {numberOfRowAfterRemovingDuplicates}")
print("****************************")
print(f"Total Duplicates: {removedDuplicateRows}")

In [None]:
# Step 2: Discard every row that has at least one missing value.
# The result is a new DataFrame; `dataset` itself is left untouched.
cleanedDataset = dataset.dropna(axis=0, how="any")
numberOfRowAfterRemovingNullValues = cleanedDataset.shape[0]
removedNullRows = numberOfRowAfterRemovingDuplicates - numberOfRowAfterRemovingNullValues

print(f"Remaining Rows in Dataset: {numberOfRowAfterRemovingNullValues}")
print("****************************")
print(f"Removed Number of rows containing Null Values: {removedNullRows}")

In [None]:
# Distinct values found in the last column of the cleaned dataset.
# Note: despite its name, `target_column` holds those unique values,
# not the column itself.
last_column_name = cleanedDataset.columns[-1]
target_column = cleanedDataset[last_column_name].unique()
print(target_column)

In [None]:
# Preview the first rows of the cleaned dataset (pandas shows five by default).
cleanedDataset.head()

In [None]:
# Print a concise summary of the cleaned dataset: columns, non-null counts and dtypes.
cleanedDataset.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the cleaned dataset.
cleanedDataset.describe()

# K-Means Clustering, Calculating Centroids Manually

In [None]:
# Clustering hyper-parameters: number of clusters and iteration cap.
k = 5
max_iterations = 30

# Feature matrix for clustering: one row per city, columns are (lat, lng).
coordinates = cleanedDataset.loc[:, ['lat', 'lng']].to_numpy()

# Folder that receives one PNG per iteration; created if it does not exist yet.
output_dir = f"kmeans_steps_{dataSetName}_Features_lat_lng"
os.makedirs(output_dir, exist_ok=True)

# Seed the global NumPy RNG so the starting centroids are reproducible, then
# use k distinct, randomly chosen data points as the initial centroids.
np.random.seed(42)
initial_rows = np.random.choice(len(coordinates), size=k, replace=False)
centroids = coordinates[initial_rows]

# Function to plot clusters and centroids
def plot_clusters(coordinates, labels, centroids, iteration):
    """Save a scatter plot of the current clustering to ``output_dir``.

    Args:
        coordinates: 2-D array with one row per point; column 0 is drawn on
            the y axis (latitude) and column 1 on the x axis (longitude).
        labels: 1-D array holding the cluster index assigned to each row of
            ``coordinates``.
        centroids: 2-D array with one row per cluster, same column layout as
            ``coordinates``. Its length determines how many clusters are drawn.
        iteration: Iteration number shown in the title and used in the file
            name (``iteration_<iteration>.png``).

    Returns:
        None. The figure is written to the module-level ``output_dir`` and
        then closed.
    """
    # Derive the cluster count from the centroids instead of the global `k`
    # so the function stays correct for any number of clusters.
    n_clusters = len(centroids)

    colors = ['red', 'blue', 'green', 'purple', 'orange']
    if n_clusters > len(colors):
        # More clusters than hand-picked colours: fall back to a colormap so
        # indexing never goes out of range.
        cmap = plt.get_cmap('tab20')
        colors = [cmap(index % cmap.N) for index in range(n_clusters)]

    plt.figure(figsize=(10, 6))

    # Plot each cluster
    for cluster in range(n_clusters):
        cluster_points = coordinates[labels == cluster]
        plt.scatter(
            cluster_points[:, 1],
            cluster_points[:, 0],
            s=100,
            color=colors[cluster],
            label=f'Cluster {cluster + 1}',
        )

    # Plot centroids
    plt.scatter(centroids[:, 1], centroids[:, 0], c='black', marker='x', s=200, label='Centroids')
    plt.title(f'K-Means Clustering - Iteration {iteration}')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.legend()
    plt.grid(True)

    # Save the plot. The caller already counts iterations from 1, so the
    # number is used as-is to keep the file name in step with the title.
    file_name = os.path.join(output_dir, f"iteration_{iteration}.png")
    plt.savefig(file_name)
    # Close the figure so figures do not accumulate across iterations.
    plt.close()

# Manual K-Means iteration: alternate between assigning every point to its
# nearest centroid and moving each centroid to the mean of its points, until
# the centroids stop changing or max_iterations is reached.
for iteration in range(1, max_iterations + 1):
    # Assign clusters based on the closest centroid
    labels = pairwise_distances_argmin(coordinates, centroids)

    # Plot the current state of clusters and centroids
    plot_clusters(coordinates, labels, centroids, iteration)

    # Recalculate centroids as the mean of assigned points.
    # A cluster that received no points keeps its previous centroid: the mean
    # of an empty selection is NaN, which would break the next assignment step.
    new_centroids = centroids.astype(float)
    for cluster in range(k):
        members = coordinates[labels == cluster]
        if len(members) > 0:
            new_centroids[cluster] = members.mean(axis=0)

    # Check for convergence (centroids exactly unchanged means the
    # assignments can no longer change either)
    if np.array_equal(centroids, new_centroids):
        print(f"Convergence reached at iteration {iteration}")
        break

    centroids = new_centroids
else:
    # The loop ran out of iterations without hitting the break above.
    print(f"Stopped after {max_iterations} iterations without reaching convergence")

In [None]:
# Preview the first rows of the cleaned dataset (pandas shows five by default).
cleanedDataset.head()