# 1. Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.cluster import KMeans

from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import os
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances_argmin



# Reading Dataset

In [2]:
# Build the path from its components so the notebook also runs on systems
# where "\" is not a path separator (on Windows this yields the same string
# as the previous hard-coded literal).
dataSetPath = os.path.join("..", "1_DataSets", "worldcities.csv")

dataset = pd.read_csv(dataSetPath)
# File name without its directory part (e.g. "worldcities.csv").
dataSetName = os.path.basename(dataSetPath)

print(f"Dataset Columns: {dataset.columns.to_list()}")
print("****************************")
print(f"Dataset Shape: {dataset.shape}")
print("****************************")

# Row count of the file as loaded, before any cleaning step.
totalNumberOfRowInActualDataset = len(dataset)
print(f"Total Rows in dataset File: {totalNumberOfRowInActualDataset}")


Dataset Columns: ['city', 'city_ascii', 'lat', 'lng', 'country', 'iso2', 'iso3', 'admin_name', 'capital', 'population', 'id']
****************************
Dataset Shape: (47868, 11)
****************************
Total Rows in dataset File: 47868


# Pre-processing Steps

In [3]:
# Step 1: Remove fully identical rows. `dataset` is modified in place.
dataset.drop_duplicates(inplace=True)

# Gather the figures first, then report them.
numberOfRowAfterRemovingDuplicates = dataset.shape[0]
duplicateRowCount = totalNumberOfRowInActualDataset - numberOfRowAfterRemovingDuplicates

print(f"Dataset Shape after removing duplcates: {dataset.shape}")
print("****************************")
print(f"Remaining Rows in Dataset: {numberOfRowAfterRemovingDuplicates}")
print("****************************")
print(f"Total Duplicates: {duplicateRowCount}")

Dataset Shape after removing duplcates: (47868, 11)
****************************
Remaining Rows in Dataset: 47868
****************************
Total Duplicates: 0


In [4]:
# Step 2: Handle missing values.
# A row is kept only if every one of its columns is populated; the original
# `dataset` is left untouched and the result is stored under a new name.
# NOTE(review): no `subset` is given, so a null in any column drops the row,
# even though only `lat`/`lng` are used for clustering - confirm this is intended.
cleanedDataset = dataset.dropna(axis=0, how="any")

numberOfRowAfterRemovingNullValues = cleanedDataset.shape[0]
removedNullRowCount = numberOfRowAfterRemovingDuplicates - numberOfRowAfterRemovingNullValues

print(f"Remaining Rows in Dataset: {numberOfRowAfterRemovingNullValues}")
print("****************************")
print(f"Removed Number of rows containing Null Values: {removedNullRowCount}")

Remaining Rows in Dataset: 12764
****************************
Removed Number of rows containing Null Values: 35104


In [5]:
# Distinct values found in the last column of the cleaned frame.
target_column = cleanedDataset[cleanedDataset.columns[-1]].unique()
print(target_column)

[1392685764 1360771077 1356872604 ... 1591425685 1705541759 1434044820]


In [6]:
# Preview the first rows of the cleaned frame.
cleanedDataset.head()

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6897,139.6922,Japan,JP,JPN,Tōkyō,primary,37732000.0,1392685764
1,Jakarta,Jakarta,-6.175,106.8275,Indonesia,ID,IDN,Jakarta,primary,33756000.0,1360771077
2,Delhi,Delhi,28.61,77.23,India,IN,IND,Delhi,admin,32226000.0,1356872604
3,Guangzhou,Guangzhou,23.13,113.26,China,CN,CHN,Guangdong,admin,26940000.0,1156237133
4,Mumbai,Mumbai,19.0761,72.8775,India,IN,IND,Mahārāshtra,admin,24973000.0,1356226629


In [7]:
# Column dtypes and non-null counts of the cleaned frame.
cleanedDataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12764 entries, 0 to 47652
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   city        12764 non-null  object 
 1   city_ascii  12764 non-null  object 
 2   lat         12764 non-null  float64
 3   lng         12764 non-null  float64
 4   country     12764 non-null  object 
 5   iso2        12764 non-null  object 
 6   iso3        12764 non-null  object 
 7   admin_name  12764 non-null  object 
 8   capital     12764 non-null  object 
 9   population  12764 non-null  float64
 10  id          12764 non-null  int64  
dtypes: float64(3), int64(1), object(7)
memory usage: 1.2+ MB


In [8]:
# Summary statistics for the numeric columns of the cleaned frame.
cleanedDataset.describe()

Unnamed: 0,lat,lng,population,id
count,12764.0,12764.0,12764.0,12764.0
mean,25.78232,6.130952,241719.9,1421425000.0
std,23.571168,63.193506,1221234.0,261086700.0
min,-54.9333,-178.1585,2.0,1004003000.0
25%,9.1176,-47.8319,15313.0,1170399000.0
50%,32.4575,15.2693,32942.5,1398791000.0
75%,45.4167,38.9667,95042.0,1643345000.0
max,70.6634,179.2,37732000.0,1901975000.0


# K-Means Clustering, Calculating Centroids Manually

In [13]:
# --- Clustering configuration ---
k = 5                # number of clusters
max_iterations = 30  # upper bound on assignment/update rounds

# Feature matrix: one row per city, columns are (lat, lng) in that order.
coordinates = cleanedDataset[['lat', 'lng']].to_numpy()

# Directory that receives one PNG per iteration; created if it is missing.
output_dir = f"kmeans_steps_{dataSetName}_Features_lat_lng"
os.makedirs(output_dir, exist_ok=True)

# Seed NumPy's global generator so the same k distinct rows are picked as
# starting centroids on every run.
np.random.seed(42)
initial_indices = np.random.choice(len(coordinates), size=k, replace=False)
centroids = coordinates[initial_indices]

# Function to plot clusters and centroids
def plot_clusters(coordinates, labels, centroids, iteration):
    """Save a scatter plot of the current clustering state as a PNG.

    The figure is written to ``output_dir`` (module-level variable) as
    ``iteration_<iteration>.png`` and closed afterwards; nothing is shown
    inline and nothing is returned.

    Parameters
    ----------
    coordinates : ndarray
        2-column array of points; column 0 is plotted on the y axis
        (latitude) and column 1 on the x axis (longitude).
    labels : ndarray
        Cluster index of every row in ``coordinates``.
    centroids : ndarray
        2-column array with one row per cluster, same column order as
        ``coordinates``.
    iteration : int
        Iteration number used in both the plot title and the file name.
    """
    colors = ['red', 'blue', 'green', 'purple', 'orange']

    fig, ax = plt.subplots(figsize=(10, 6))

    # Plot each cluster. The cluster count comes from `centroids` rather than
    # a global, and the palette wraps around so more than five clusters no
    # longer raise IndexError (colours repeat in that case).
    for cluster in range(len(centroids)):
        cluster_points = coordinates[labels == cluster]
        ax.scatter(
            cluster_points[:, 1],
            cluster_points[:, 0],
            s=100,
            color=colors[cluster % len(colors)],
            label=f'Cluster {cluster + 1}',
        )

    # Plot centroids
    ax.scatter(centroids[:, 1], centroids[:, 0], c='black', marker='x', s=200, label='Centroids')
    ax.set_title(f'K-Means Clustering - Iteration {iteration}')
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    ax.legend()
    ax.grid(True)

    # Save the plot. The file name uses the same number as the title; the
    # previous `iteration + 1` made "Iteration 1" land in iteration_2.png.
    file_name = os.path.join(output_dir, f"iteration_{iteration}.png")
    fig.savefig(file_name)
    # Close explicitly so figures do not accumulate across iterations.
    plt.close(fig)

# Manual k-means: alternate between assigning points to the nearest centroid
# and moving each centroid to the mean of its assigned points.
for iteration in range(1, max_iterations + 1):
    # Assign clusters based on the closest centroid
    labels = pairwise_distances_argmin(coordinates, centroids)

    # Plot the current state of clusters and centroids
    plot_clusters(coordinates, labels, centroids, iteration)

    # Recalculate centroids as the mean of assigned points. A cluster that
    # received no points keeps its previous centroid: the mean of an empty
    # selection is NaN, which would corrupt every later distance computation
    # and make the equality check below fail forever.
    new_centroids = np.array([
        coordinates[labels == cluster].mean(axis=0)
        if np.any(labels == cluster)
        else centroids[cluster]
        for cluster in range(k)
    ])

    # Check for convergence (centroids did not move at all)
    if np.array_equal(centroids, new_centroids):
        print(f"Convergence reached at iteration {iteration}")
        break

    centroids = new_centroids
else:
    # Only reached when the loop ran out of iterations without a `break`.
    print(f"No convergence after {max_iterations} iterations")

Convergence reached at iteration 24


In [11]:
# Preview the first rows of the cleaned frame again.
cleanedDataset.head()

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6897,139.6922,Japan,JP,JPN,Tōkyō,primary,37732000.0,1392685764
1,Jakarta,Jakarta,-6.175,106.8275,Indonesia,ID,IDN,Jakarta,primary,33756000.0,1360771077
2,Delhi,Delhi,28.61,77.23,India,IN,IND,Delhi,admin,32226000.0,1356872604
3,Guangzhou,Guangzhou,23.13,113.26,China,CN,CHN,Guangdong,admin,26940000.0,1156237133
4,Mumbai,Mumbai,19.0761,72.8775,India,IN,IND,Mahārāshtra,admin,24973000.0,1356226629
