In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.impute import KNNImputer

In [7]:
# Load data
df = pd.read_csv(r"./World_development_mesurement.csv")
# df1 is a snapshot taken before any renaming, so it keeps the original
# headers; later cells read the 'Country' column back from it.
df1 = df.copy()
# Working frame: replace the spaces in every header with underscores.
df.columns = [header.replace(' ', '_') for header in df.columns]
df1.head()

Unnamed: 0,Birth Rate,Business Tax Rate,CO2 Emissions,Country,Days to Start Business,Ease of Business,Energy Usage,GDP,Health Exp % GDP,Health Exp/Capita,...,Life Expectancy Male,Mobile Phone Usage,Number of Records,Population 0-14,Population 15-64,Population 65+,Population Total,Population Urban,Tourism Inbound,Tourism Outbound
0,0.02,,87931.0,Algeria,,,26998.0,"$54,790,058,957",0.035,$60,...,67.0,0.0,1,0.342,0.619,0.039,31719449,0.599,"$102,000,000","$193,000,000"
1,0.05,,9542.0,Angola,,,7499.0,"$9,129,594,819",0.034,$22,...,44.0,0.0,1,0.476,0.499,0.025,13924930,0.324,"$34,000,000","$146,000,000"
2,0.043,,1617.0,Benin,,,1983.0,"$2,359,122,303",0.043,$15,...,53.0,0.0,1,0.454,0.517,0.029,6949366,0.383,"$77,000,000","$50,000,000"
3,0.027,,4276.0,Botswana,,,1836.0,"$5,788,311,645",0.047,$152,...,49.0,0.1,1,0.383,0.587,0.029,1755375,0.532,"$227,000,000","$209,000,000"
4,0.046,,1041.0,Burkina Faso,,,,"$2,610,959,139",0.051,$12,...,49.0,0.0,1,0.468,0.505,0.028,11607944,0.178,"$23,000,000","$30,000,000"


In [8]:
# Helper that turns formatted numeric text (currency, thousands separators,
# percent signs) into floats; it is applied to every cell of the frame.
def Stringfunction(x):
    """Convert a formatted numeric string to ``float``; pass anything else through.

    Parameters
    ----------
    x : object
        A single cell value. Strings such as ``"$54,790,058,957"`` or
        ``"3.5%"`` have every ``$``, ``,`` and ``%`` removed before conversion.

    Returns
    -------
    object
        ``float`` when ``x`` is a string; otherwise ``x`` itself, unchanged
        (floats, NaN, ints, ``None``, ...).

    Raises
    ------
    ValueError
        If ``x`` is a string that is still not a valid number after the
        symbols have been stripped (for example a country name).
    """
    if isinstance(x, str):
        for symbol in ('$', ',', '%'):
            x = x.replace(symbol, '')
        return float(x)
    # Non-string values need no clean-up. The previous implementation retried
    # the string clean-up here behind a bare ``except:``, which only ever fell
    # through to returning the value while also hiding unrelated errors
    # (including KeyboardInterrupt).
    return x
# 'Country' holds names rather than numbers, so it is set aside before the
# numeric clean-up (Stringfunction raises ValueError on non-numeric text).
df = df.drop('Country', axis=1)
# Apply Stringfunction to each element of the dataframe, one column at a time.
# Series.map is used because DataFrame.applymap is deprecated since pandas 2.1,
# while this form behaves the same on older and newer releases.
df = df.apply(lambda column: column.map(Stringfunction))
# Re-attach the country names from the untouched copy of the raw data.
df['Country'] = df1['Country']
# Dropping unnecessary columns
df = df.drop(['Number_of_Records', 'Ease_of_Business'], axis=1)


  df = df.applymap(Stringfunction) # Applymap aplies function to each element of the dataframe


In [9]:
# Handling missing values: k-nearest-neighbours imputation with 3 neighbours
imputer = KNNImputer(n_neighbors=3)
# The imputer is given every column except the 'Country' names.
df_impute = df.drop(columns='Country')
imputed = imputer.fit_transform(df_impute)
# fit_transform returns a bare array, so wrap it back into a labelled frame.
df_imputed = pd.DataFrame(data=imputed, columns=df_impute.columns)

In [10]:
# Dropping features with high missing values and unnecessary features, then
# re-attaching the country names taken from the raw copy.
df_imputed = (
    df_imputed
    .drop(columns=[
        'Business_Tax_Rate',
        'Hours_to_do_Tax',
        'Days_to_Start_Business',
        'Lending_Interest',
        'Health_Exp/Capita',
    ])
    .assign(Country=df1['Country'])
)


In [11]:
# Handling outliers using IQR: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# are pulled back to the nearest bound; no rows are removed.
for col in df_imputed.columns:
    if col == 'Country':
        continue  # not a numeric feature
    Q1, Q3 = df_imputed[col].quantile(0.25), df_imputed[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    df_imputed[col] = df_imputed[col].clip(lower=lower_bound, upper=upper_bound)


In [12]:
# Scaling data: fit a min-max scaler on every column except 'Country'.
# The fitted scaler object is kept because a later cell pickles it.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df_imputed.drop(columns=['Country']))

In [13]:
# PCA for dimensionality reduction: keep four principal components
pca = PCA(n_components=4)
pca_values = pca.fit_transform(scaled_data)
# Plain NumPy copy of the four component scores, used as the clustering input
pca_data = np.array(pca_values)
# K-Means clustering on the PCA scores (3 clusters, fixed random_state)
kmeans_pca = KMeans(n_clusters=3, random_state=0).fit(pca_data)




In [14]:
# Assigning labels to the data: one cluster id per fitted row
labels = kmeans_pca.labels_
df['Cluster'] = labels
print(df['Cluster'])

# Bundle the fitted scaler, PCA and K-Means objects and pickle them together.
model = dict(scaler=scaler, pca=pca, kmean=kmeans_pca)
with open('./trained_model_clustering.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

0       1
1       1
2       1
3       1
4       1
       ..
2699    0
2700    2
2701    0
2702    2
2703    0
Name: Cluster, Length: 2704, dtype: int32
