**-- Importing Libraries**

In [151]:
# Install the 'country_list' library
!pip install country_list

# Import necessary libraries
from country_list import countries_for_language  # Library for getting country names
import pandas as pd  # Data manipulation
from sklearn.impute import SimpleImputer  # Data imputation
import matplotlib.pyplot as plt # Plotting
import seaborn as sns # Data visualization
import numpy as np #Numerical operations
from sklearn.preprocessing import StandardScaler #Data scaling
from sklearn.cluster import KMeans #K-Means clustering



## Tasks:

-- *1. Data Collection:*

    - Access the United Nations data repository or API to retrieve relevant datasets on population and economic indicators for various countries.
    - Download or fetch the necessary datasets in a format suitable for analysis (e.g., CSV, Excel, JSON).

-- **Importing Datasets**

In [152]:
# Population indicator datasets: raw CSV extracts stored under predata/.
# The order of this tuple must match the order of the names unpacked below.
raw_csv_files = (
    # DATA 1 Population Growth Rates in Urban areas and Capital cities
    'predata/SYB61_253_Population Growth Rates in Urban areas and Capital cities.csv',
    # DATA 2 Population, Surface Area and Density
    'predata/SYB65_1_202209_Population, Surface Area and Density.csv',
    # DATA 3 Population Growth, Fertility and Mortality Indicators
    'predata/SYB65_246_202209_Population Growth, Fertility and Mortality Indicators.csv',
    # DATA 4 International Migrants and Refugees
    'predata/SYB65_327_202209_International Migrants and Refugees.csv',
)

# Read each file with pandas' default CSV options, one frame per dataset
df_PGR, df_PSD, df_PGF, df_IMR = map(pd.read_csv, raw_csv_files)

    - Inspect the data for missing values, duplicates, and outliers. Address missing values and outliers using appropriate techniques.

-- Cleaning Datasets

In [153]:
# Display the growth-rates frame as loaded (long format: one row per Country/Year/Series)
df_PGR

Unnamed: 0,Country,Year,Series,Value
0,Brazil,2005,Capital city population (as a percentage of to...,1.8
1,Brazil,2005,Capital city population (thousands),3301
2,Brazil,2005,Rural population (percent growth rate per annum),-0.5
3,Brazil,2005,Urban population (percent growth rate per annum),1.7
4,India,2005,Capital city population (as a percentage of to...,1.6
...,...,...,...,...
67,Turkey,2015,Urban population (percent growth rate per annum),2.4
68,Vietnam,2015,Capital city population (as a percentage of to...,3.9
69,Vietnam,2015,Capital city population (thousands),3657
70,Vietnam,2015,Rural population (percent growth rate per annum),0.1


In [154]:
def _long_to_wide(long_df):
    """Reshape a long (Country, Year, Series, Value) frame to one column per Series.

    The result has one row per (Country, Year) pair, with 'Country' and 'Year'
    restored as ordinary columns and the leftover 'Series' label removed from
    the column axis.
    """
    wide_df = long_df.pivot(index=['Country', 'Year'], columns='Series', values='Value')
    wide_df = wide_df.reset_index()
    # pivot() names the column axis after the pivoted column; clear that label
    wide_df.columns.name = None
    return wide_df


# NOTE: each name is overwritten with its wide version, so this cell cannot be
# re-run without reloading the CSVs (the 'Series'/'Value' columns are gone).
df_PGR = _long_to_wide(df_PGR)
df_PSD = _long_to_wide(df_PSD)
df_PGF = _long_to_wide(df_PGF)
df_IMR = _long_to_wide(df_IMR)


In [155]:
# Display the pivoted migrants frame to check the wide layout (one column per Series)
df_IMR

Unnamed: 0,Country,Year,International migrant stock: Both sexes (% total population),International migrant stock: Female (% total Population),International migrant stock: Male (% total Population)
0,Brazil,2005,0.3,0.3,0.4
1,Brazil,2010,0.3,0.3,0.3
2,Brazil,2015,0.3,0.3,0.4
3,Brazil,2020,0.5,0.5,0.6
4,India,2005,0.5,0.5,0.5
5,India,2010,0.5,0.5,0.4
6,India,2015,0.4,0.4,0.4
7,India,2020,0.4,0.4,0.3
8,Japan,2005,1.6,1.7,1.5
9,Japan,2010,1.7,1.8,1.5


In [156]:
# Remove rows that contain NaN values.
# DataFrame.dropna() returns a NEW frame and leaves the original untouched, so
# the result has to be assigned back; a bare `df.dropna()` only affects what
# the cell displays.
df_PGR = df_PGR.dropna()
df_PSD = df_PSD.dropna()

# NaN rows in df_PGF and df_IMR are kept: their dropna calls were disabled.
# NOTE(review): presumably deliberate — confirm before enabling.
# df_PGF = df_PGF.dropna()
# df_IMR = df_IMR.dropna()

# Show the cleaned frame as the cell output
df_PSD

Unnamed: 0,Country,Year,Population aged 0 to 14 years old (percentage),Population aged 60+ years old (percentage),Population density,Population mid-year estimates (millions),Sex ratio (males per 100 females)
0,Brazil,2010,24.8,10.3,23.5,196.35,97.2
1,Brazil,2015,22.5,11.9,24.6,205.19,97.0
2,Brazil,2020,20.8,13.8,25.5,213.2,96.6
3,Brazil,2022,20.3,14.6,25.8,215.31,96.5
4,India,2010,31.0,7.8,417.3,1240.61,107.1
5,India,2015,28.6,8.9,444.9,1322.87,106.9
6,India,2020,26.1,10.2,469.7,1396.39,106.8
7,India,2022,25.3,10.5,476.7,1417.17,106.6
8,Japan,2010,13.2,31.5,340.0,128.11,95.0
9,Japan,2015,12.7,33.9,337.7,127.25,94.8


-- Inspecting data types and exporting all the DataFrames to CSV

In [157]:
# Collect the per-column dtypes of each frame (same order as the datasets were loaded)
column_data_types_1, column_data_types_2, column_data_types_3, column_data_types_4 = (
    frame.dtypes for frame in (df_PGR, df_PSD, df_PGF, df_IMR)
)

# Print the four listings one after another
for dtype_listing in (
    column_data_types_1,
    column_data_types_2,
    column_data_types_3,
    column_data_types_4,
):
    print(dtype_listing)

Country                                                          object
Year                                                              int64
Capital city population (as a percentage of total population)    object
Capital city population (thousands)                              object
Rural population (percent growth rate per annum)                 object
Urban population (percent growth rate per annum)                 object
dtype: object
Country                                           object
Year                                               int64
Population aged 0 to 14 years old (percentage)    object
Population aged 60+ years old (percentage)        object
Population density                                object
Population mid-year estimates (millions)          object
Sex ratio (males per 100 females)                 object
dtype: object
Country                                                     object
Year                                                         int64
Infant 

In [158]:
# Convert columns by position (column number) to float.
# Per the dtypes listing, position 3 of df_PGR is
# "Capital city population (thousands)" and position 5 of df_PSD is
# "Population mid-year estimates (millions)". Both are stored as text
# (object dtype); ',' characters are stripped before casting
# (presumably thousands separators — confirm against the raw CSVs).
columns_to_convert_1 = [3]  # Column positions in df_PGR
columns_to_convert_2 = [5]  # Column positions in df_PSD


def _comma_text_to_float(series):
    """Return `series` as float, removing ',' from its string entries.

    Non-string entries (NaN, or numbers left behind by an earlier run of this
    cell) are passed through unchanged, so re-running the cell does not turn
    already-converted values into NaN.

    Raises:
        ValueError: if a string entry is still not parseable as a float
            after the commas are removed.
    """
    without_commas = series.map(
        lambda value: value.replace(',', '') if isinstance(value, str) else value
    )
    return without_commas.astype(float)


# Assign by column LABEL instead of `df.iloc[:, pos] = ...`: positional
# assignment writes the floats into the existing object column in place
# (pandas >= 2.0), so the column's dtype would silently remain `object`.
for column_pos in columns_to_convert_1:
    column_label = df_PGR.columns[column_pos]
    df_PGR[column_label] = _comma_text_to_float(df_PGR[column_label])

for column_pos in columns_to_convert_2:
    column_label = df_PSD.columns[column_pos]
    df_PSD[column_label] = _comma_text_to_float(df_PSD[column_label])

In [159]:

# Renumber every frame's row index from 0, discarding the previous index
df_PGR, df_PSD, df_PGF, df_IMR = (
    frame.reset_index(drop=True) for frame in (df_PGR, df_PSD, df_PGF, df_IMR)
)

# Destination file -> frame to write; files are written in the order listed
csv_exports = {
    'data/Population Growth Rates.csv': df_PGR,
    'data/Population, Surface Area and Density.csv': df_PSD,
    'data/Population Growth, Fertility and Mortality.csv': df_PGF,
    'data/International Migrants and Refugees.csv': df_IMR,
}

# index=False keeps the row index out of the CSV files
for csv_path, frame in csv_exports.items():
    frame.to_csv(csv_path, index=False)

