# Dataset cleaning script

This notebook performs the following steps:

1. Imports the CSV
2. Removes the regions (aggregates such as world, Asia, etc.) from the dataset
3. Converts the data to long format, so that missing (N/A) rows can easily be dropped
4. Gets the features to include in training
5. Then creates dataset ready for training (with the dependent variable as index and independent variables as features)

# 1 Import the data (Excel workbook)

In [1]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Path of the Excel workbook holding the raw dataset
# (presumably the World Bank WDI export, judging by the file name -- TODO confirm).
file_path = "WDIEXCEL.xlsx"

# Load the data from the Excel workbook into a DataFrame.
# Later cells rely on the columns "Country Name", "Country Code",
# "Indicator Name", "Indicator Code" and one column per year.
data = pd.read_excel(file_path)


In [2]:
# Export a lookup table with one row per unique (indicator name, indicator code) pair.
indicator_columns = ['Indicator Name', 'Indicator Code']
indicator_name_and_code = data[indicator_columns].drop_duplicates()

indicator_name_and_code.to_csv(
    '../2_correlation_tests/indicator_name_and_code.csv',
    index=False,
)

# 2 Remove regions (and unused years) from the dataset

In [3]:
# ISO 3166-1 alpha-3 style codes of the individual countries to keep.
# Any row whose "Country Code" is not listed here (e.g. world or regional
# aggregates) is filtered out below.
countries_to_keep = [
    "AFG", "ALB", "DZA", "AND", "AGO", "ATG", "ARG", "ARM", "AUS", "AUT", "AZE", "BHS",
    "BHR", "BGD", "BRB", "BLR", "BEL", "BLZ", "BEN", "BTN", "BOL", "BIH", "BWA", "BRA",
    "BRN", "BGR", "BFA", "BDI", "CPV", "KHM", "CMR", "CAN", "CAF", "TCD", "CHL", "CHN",
    "COL", "COM", "COG", "COD", "CRI", "CIV", "HRV", "CUB", "CYP", "CZE", "DNK", "DJI",
    "DMA", "DOM", "ECU", "EGY", "SLV", "GNQ", "ERI", "EST", "SWZ", "ETH", "FJI", "FIN",
    "FRA", "GAB", "GMB", "GEO", "DEU", "GHA", "GRC", "GRD", "GTM", "GIN", "GNB", "GUY",
    "HTI", "HND", "HUN", "ISL", "IND", "IDN", "IRN", "IRQ", "IRL", "ISR", "ITA", "JAM",
    "JPN", "JOR", "KAZ", "KEN", "KIR", "PRK", "KOR", "KWT", "KGZ", "LAO", "LVA", "LBN",
    "LSO", "LBR", "LBY", "LIE", "LTU", "LUX", "MDG", "MWI", "MYS", "MDV", "MLI", "MLT",
    "MHL", "MRT", "MUS", "MEX", "FSM", "MDA", "MCO", "MNG", "MNE", "MAR", "MOZ", "MMR",
    "NAM", "NRU", "NPL", "NLD", "NZL", "NIC", "NER", "NGA", "MKD", "NOR", "OMN", "PAK",
    "PLW", "PAN", "PNG", "PRY", "PER", "PHL", "POL", "PRT", "QAT", "ROU", "RUS", "RWA",
    "KNA", "LCA", "VCT", "WSM", "SMR", "STP", "SAU", "SEN", "SRB", "SYC", "SLE", "SGP",
    "SVK", "SVN", "SLB", "SOM", "ZAF", "SSD", "ESP", "LKA", "SDN", "SUR", "SWE", "CHE",
    "SYR", "TJK", "TZA", "THA", "TLS", "TGO", "TON", "TTO", "TUN", "TUR", "TKM", "TUV",
    "UGA", "UKR", "ARE", "GBR", "USA", "URY", "UZB", "VUT", "VAT", "VEN", "VNM", "YEM",
    "ZMB", "ZWE",
]

# Column labels for the years to discard: "1960" up to and including "2000".
years = list(map(str, range(1960, 2001)))

# Descriptive name columns plus the discarded year columns.
columns_to_drop = ["Country Name", "Indicator Name", *years]

# Keep only the rows of individual countries (drops the combined country
# data such as world, asia, etc.), then remove the unwanted columns.
is_kept_country = data["Country Code"].isin(countries_to_keep)
preprocessed_data = data[is_kept_country].drop(columns=columns_to_drop)

# 3 Move format to long format (and drop N/A rows)

In [4]:
# Columns that identify a series; every remaining (year) column is unpivoted.
id_columns = ["Country Code", "Indicator Code"]

# Wide -> long: one row per (country, indicator, year) with a single "Value" column.
long_formatted_data = pd.melt(
    preprocessed_data,
    id_vars=id_columns,
    var_name="Year",
    value_name="Value",
)

# Discard every row that contains a missing value.
long_formatted_data.dropna(inplace=True)

# Persist the long-format table for the correlation tests.
long_formatted_data.to_csv(
    "../2_correlation_tests/long_formatted_data.csv",
    index=False,
)

# 4 Include features

In [30]:
# Cast Year to int so it is numeric when used as a merge key below.
long_formatted_data["Year"] = long_formatted_data["Year"].astype(int)


def _select_indicator(indicator_code, value_name):
    """Return the rows of ``long_formatted_data`` belonging to one indicator.

    The generic "Value" column is renamed to ``value_name``; all other
    columns are kept unchanged.
    """
    is_indicator = long_formatted_data["Indicator Code"] == indicator_code
    return long_formatted_data[is_indicator].rename(columns={"Value": value_name})


# One frame per indicator. Values are still on their original scale here;
# the log transform of selected columns happens in step 4.5.
gdp = _select_indicator("NY.GDP.PCAP.CD", "GDP")
life_expectancy = _select_indicator("SP.DYN.LE00.IN", "Life Expectancy")
underfivedeaths = _select_indicator("SH.DYN.MORT", "UnderFiveDeaths")
adultmortality = _select_indicator("SP.DYN.AMRT.MA", "MortRateAdult")
accessElectricity = _select_indicator("EG.ELC.ACCS.RU.ZS", "AccessElectricityRural")
cleanfuels = _select_indicator("EG.CFT.ACCS.ZS", "CleanFuels")
birthrate = _select_indicator("SP.DYN.TFRT.IN", "FertilityRate")
basicsanitation = _select_indicator("SH.STA.BASS.ZS", "BasicSanitation")
healthexpenditure = _select_indicator("SH.XPD.CHEX.PC.CD", "HealthExpenditure")

# Every merge joins on the (country, year) pair.
merge_keys = ["Country Code", "Year"]

# Start from GDP joined with the dependent variable (Life Expectancy).
features_data = pd.merge(
    gdp[merge_keys + ["GDP"]],
    life_expectancy[merge_keys + ["Life Expectancy"]],
    on=merge_keys,
)

# Add the remaining features one at a time. pd.merge defaults to an inner
# join, so only (country, year) pairs present for *every* indicator survive.
# The new feature is always the left operand, which keeps the row and column
# order identical to merging them by hand in this sequence.
remaining_features = [
    (underfivedeaths, "UnderFiveDeaths"),
    (adultmortality, "MortRateAdult"),
    (healthexpenditure, "HealthExpenditure"),
    (cleanfuels, "CleanFuels"),
    (birthrate, "FertilityRate"),
    (accessElectricity, "AccessElectricityRural"),
    (basicsanitation, "BasicSanitation"),
]
for feature_frame, feature_name in remaining_features:
    features_data = pd.merge(
        feature_frame[merge_keys + [feature_name]],
        features_data,
        on=merge_keys,
    )


## 4.5 Apply log transform

In [31]:
import numpy as np

# Columns that are replaced in place by their natural logarithm.
# NOTE(review): np.log returns -inf for 0 and NaN for negative inputs --
# confirm that none of these columns contain such values.
log_columns = [
    "GDP",
    "MortRateAdult",
    "AccessElectricityRural",
    "CleanFuels",
    "HealthExpenditure",
]
for log_column in log_columns:
    features_data[log_column] = np.log(features_data[log_column])

# 5 Create dataset suitable for training

Training datasets are normally formatted in such a way that the index corresponds to the dependent variable.

In [32]:
# Independent variables written to the training set, in output column order.
# "Country Code" and "Year" are not part of this selection.
feature_columns = [
    "GDP",
    "MortRateAdult",
    "CleanFuels",
    "UnderFiveDeaths",
    "FertilityRate",
    "BasicSanitation",
    "AccessElectricityRural",
    "HealthExpenditure",
]

# Use the dependent variable (Life Expectancy) as the index and keep only
# the selected feature columns.
dataset_final = features_data.set_index("Life Expectancy")[feature_columns]

# Display the final dataset
print(dataset_final)

# The index (Life Expectancy) is written as the first CSV column.
dataset_final.to_csv("../3_training/final_dataset.csv")

                      GDP  MortRateAdult  CleanFuels  UnderFiveDeaths  \
Life Expectancy                                                         
75.639           7.155911       4.692137    3.713572             25.8   
70.823           7.469986       5.027348    4.577799             40.1   
46.590           6.268081       6.176034    3.706228            198.9   
75.078           9.259410       5.050234    4.605170             15.1   
74.186           8.877518       5.162160    4.567468             19.0   
...                   ...            ...         ...              ...   
70.331           7.472676       5.355402    4.421247             14.7   
70.299           7.978570       5.227755    1.987874             24.0   
75.378           8.184889       5.094774    4.550714             20.9   
62.380           6.863628       5.808347    2.388763             59.7   
61.124           7.224532       6.022593    3.411148             51.8   

                 FertilityRate  BasicSanitation  A