**-- Importing Libraries**

In [281]:
# Install the 'country_list' library
!pip install country_list

# Import necessary libraries
from country_list import countries_for_language  # Library for getting country names
import pandas as pd  # Data manipulation
from sklearn.impute import SimpleImputer  # Data imputation
import matplotlib.pyplot as plt # Plotting
import seaborn as sns # Data visualization
import numpy as np #Numerical operations
from sklearn.preprocessing import StandardScaler #Data scaling
from sklearn.cluster import KMeans #K-Means clustering



## Tasks:

-- *1. Data Collection:*

    - Access the United Nations data repository or API to retrieve relevant datasets on population and economic indicators for various countries.
    - Download or fetch the necessary datasets in a format suitable for analysis (e.g., CSV, Excel, JSON).

-- **Importing Datasets**

In [282]:
## Economic indicators
# Every file is read with skiprows=1, i.e. the first line of the CSV is
# skipped and the second line is used as the header row.


def _read_syb_table(csv_path):
    """Read one SYB65 CSV file, skipping its first line."""
    return pd.read_csv(csv_path, skiprows=1)


# DATA 1: exchange rates (SYB65_130)
tf_ExR = _read_syb_table('data/SYB65_130_202209_Exchange Rates.csv')

# DATA 2: tourist/visitor arrivals and expenditure (SYB65_176)
tf_TVAE = _read_syb_table('data/SYB65_176_202209_Tourist-Visitors Arrival and Expenditure.csv')

# DATA 3: GDP and GDP per capita (SYB65_230)
tf_GDP = _read_syb_table('data/SYB65_230_202209_GDP and GDP Per Capita.csv')

    - Inspect the data for missing values, duplicates, and outliers. Address missing values and outliers using appropriate techniques.

-- Cleaning Datasets

In [283]:
# Dropping the footnote, source and series-type columns from each raw table.
# The raw frames (tf_*) are left untouched; the trimmed copies are named df_*.
columns_to_drop_1 = ['National currency footnote', 'Footnotes', 'Source']
columns_to_drop_2 = [
    'Tourism arrivals series type',
    'Tourism arrivals series type footnote',
    'Footnotes',
    'Source',
]
columns_to_drop_3 = ['Footnotes', 'Source']

df_ExR = tf_ExR.drop(columns_to_drop_1, axis=1)
df_TVAE = tf_TVAE.drop(columns_to_drop_2, axis=1)
df_GDP = tf_GDP.drop(columns_to_drop_3, axis=1)

In [284]:
# Preview df_GDP after the 'Footnotes' and 'Source' columns were dropped
df_GDP

Unnamed: 0,Region/Country/Area,Unnamed: 1,Year,Series,Value
0,1,"Total, all countries or areas",1995,GDP in current prices (millions of US dollars),31247262
1,1,"Total, all countries or areas",2005,GDP in current prices (millions of US dollars),47730924
2,1,"Total, all countries or areas",2010,GDP in current prices (millions of US dollars),66461443
3,1,"Total, all countries or areas",2015,GDP in current prices (millions of US dollars),75133208
4,1,"Total, all countries or areas",2018,GDP in current prices (millions of US dollars),86357998
...,...,...,...,...,...
6781,716,Zimbabwe,2010,GDP real rates of growth (percent),19.7
6782,716,Zimbabwe,2015,GDP real rates of growth (percent),1.8
6783,716,Zimbabwe,2018,GDP real rates of growth (percent),4.8
6784,716,Zimbabwe,2019,GDP real rates of growth (percent),-8.3


In [285]:
# New header lists, one per table. All three tables share the same first
# three key columns; only the series/value columns differ.
_key_columns = ['Region_ID', 'Country/Region', 'Year']

new_column_names_1 = [
    *_key_columns,
    'Exchange rates_Series',
    'National currency',
    'Exchange_rates_value',
]
new_column_names_2 = [
    *_key_columns,
    'Tourism expenditure(millions of US dollars)_Series',
    'Tourism expenditure_value',
]
new_column_names_3 = [
    *_key_columns,
    'GDP per capita (US dollars)_Series',
    'GDP per capita_value',
]

In [286]:
# Replace each frame's header with its new name list.
# Assignment is positional: the i-th new name replaces the i-th existing column.
for frame, header in (
    (df_ExR, new_column_names_1),
    (df_TVAE, new_column_names_2),
    (df_GDP, new_column_names_3),
):
    frame.columns = header


In [287]:
# Reshape each table from long to wide: one row per
# (Region_ID, Country/Region, Year) and one column per distinct series name.
pivot_keys = ['Region_ID', 'Country/Region', 'Year']

df_ExR = (
    df_ExR
    .pivot(index=pivot_keys, columns='Exchange rates_Series', values='Exchange_rates_value')
    .reset_index()
    .rename_axis(None, axis=1)  # clear the leftover columns-axis name
)
df_TVAE = (
    df_TVAE
    .pivot(index=pivot_keys, columns='Tourism expenditure(millions of US dollars)_Series', values='Tourism expenditure_value')
    .reset_index()
    .rename_axis(None, axis=1)
)
df_GDP = (
    df_GDP
    .pivot(index=pivot_keys, columns='GDP per capita (US dollars)_Series', values='GDP per capita_value')
    .reset_index()
    .rename_axis(None, axis=1)
)

In [288]:
# Inspect df_ExR after pivoting: one row per (Region_ID, Country/Region, Year),
# one column per exchange-rate series
df_ExR

Unnamed: 0,Region_ID,Country/Region,Year,Exchange rates: end of period (national currency per US dollar),Exchange rates: period average (national currency per US dollar)
0,4,Afghanistan,1995,47.5,36.6
1,4,Afghanistan,2005,50.4,49.5
2,4,Afghanistan,2010,45.3,46.5
3,4,Afghanistan,2015,68.1,61.1
4,4,Afghanistan,2018,75.0,72.1
...,...,...,...,...,...
1800,894,Zambia,2015,11.0,8.6
1801,894,Zambia,2018,11.9,10.5
1802,894,Zambia,2019,14.1,12.9
1803,894,Zambia,2020,21.2,18.3


In [289]:
# Drop the series columns that are not kept, addressing every column by name.
# The exchange-rate table previously dropped "the last column" by position
# (`iloc[:, :-1]`); re-running the cell then silently removed the remaining
# value column as well. A name-based drop raises a KeyError on a second run instead.
drop_col1 = ['Exchange rates: period average (national currency per US dollar)']
df_ExR = df_ExR.drop(columns=drop_col1)

drop_col2 = ['Tourist/visitor arrivals (thousands)']
df_TVAE = df_TVAE.drop(columns=drop_col2)

drop_col3 = [
    'GDP in constant 2010 prices (millions of US dollars)',
    'GDP in current prices (millions of US dollars)',
]
df_GDP = df_GDP.drop(columns=drop_col3)

-- Checking column data types, converting values to numeric, and exporting the dataframes to CSV

In [290]:
# Collect the dtype table of each frame, then print the three tables in order
column_data_types_1, column_data_types_2, column_data_types_3 = (
    frame.dtypes for frame in (df_ExR, df_TVAE, df_GDP)
)
print(column_data_types_1, column_data_types_2, column_data_types_3, sep='\n')

Region_ID                                                           int64
Country/Region                                                     object
Year                                                                int64
Exchange rates: end of period (national currency per US dollar)    object
dtype: object
Region_ID                                        int64
Country/Region                                  object
Year                                             int64
Tourism expenditure (millions of US dollars)    object
dtype: object
Region_ID                              int64
Country/Region                        object
Year                                   int64
GDP per capita (US dollars)           object
GDP real rates of growth (percent)    object
dtype: object


In [291]:
# Convert the value columns (object dtype, text with ',' separators) to float.
# Columns are still selected by position, but the result is assigned by label:
# on pandas >= 2.0 `df.iloc[:, pos] = new_values` writes into the existing
# object column in place, so the dtype would silently stay `object`.
columns_to_convert_1 = [3]     # value column of df_ExR and of df_TVAE
columns_to_convert_2 = [3, 4]  # the two value columns of df_GDP


def _columns_to_float(frame, positions):
    """Return a copy of `frame` with the columns at `positions` cast to float.

    Text columns have their ',' characters removed before the cast. Columns
    that are already numeric are cast directly, so re-running the cell is safe.
    """
    converted = frame.copy()
    for column_pos in positions:
        label = converted.columns[column_pos]
        values = converted[label]
        if not pd.api.types.is_numeric_dtype(values):
            values = values.str.replace(',', '', regex=False)
        converted[label] = values.astype(float)
    return converted


df_ExR = _columns_to_float(df_ExR, columns_to_convert_1)
df_TVAE = _columns_to_float(df_TVAE, columns_to_convert_1)
df_GDP = _columns_to_float(df_GDP, columns_to_convert_2)

In [292]:
# Keep only the rows of the two countries under study
selected_countries = ['India', 'Iceland']

df_ExR = df_ExR.loc[df_ExR['Country/Region'].isin(selected_countries)]
df_TVAE = df_TVAE.loc[df_TVAE['Country/Region'].isin(selected_countries)]
df_GDP = df_GDP.loc[df_GDP['Country/Region'].isin(selected_countries)]

In [293]:
# Write each frame to its own CSV in the working directory, without the row index
for csv_name, frame in {
    'Exchange Rates.csv': df_ExR,
    'Tourism.csv': df_TVAE,
    'GDP per Capita.csv': df_GDP,
}.items():
    frame.to_csv(csv_name, index=False)

In [294]:
# Distinct 'Year' values of each frame, in order of first appearance
all_years_1, all_years_2, all_years_3 = (
    frame['Year'].unique() for frame in (df_ExR, df_TVAE, df_GDP)
)

In [295]:
def _export_by_year(frame, years, path_template):
    """Write one CSV per year containing the rows of `frame` for that year.

    `path_template` must contain a `{year}` placeholder. `to_csv` is called
    without `index=False`, so the row index is written as the first column.
    """
    for year in years:
        rows_for_year = frame[frame['Year'] == year]
        rows_for_year.to_csv(path_template.format(year=year))


_export_by_year(df_ExR, all_years_1, "data/{year}_exchange.csv")
_export_by_year(df_TVAE, all_years_2, "data/{year}__tourism.csv")
_export_by_year(df_GDP, all_years_3, "data/{year}_GDP.csv")
