# Dataset cleaning script

This script performs the following steps:

1. Imports the CSV
2. Removes the regions from the dataset
3. Converts the data to long format, so that rows with missing (N/A) values can easily be dropped
4. Gets the features to include in training
5. Creates a dataset ready for training (with the dependent variable as index and independent variables as features)

# 1 Import CSV (exported from Excel to CSV)

In [None]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Path of the CSV export to clean (relative to the working directory).
file_path = "WDIEXCEL.csv"

# Read the complete CSV into a DataFrame; filtering and reshaping happen later.
data = pd.read_csv(filepath_or_buffer=file_path)

# 2 Remove regions (and unused years) from dataset

In [None]:
# Three-letter country codes of the individual countries to retain.
# Any "Country Code" value not listed here (e.g. aggregates such as world or
# regional groupings) is filtered out of the dataset later on.
countries_to_keep = [
    "AFG", "ALB", "DZA", "AND", "AGO", "ATG", "ARG", "ARM", "AUS", "AUT", "AZE", "BHS",
    "BHR", "BGD", "BRB", "BLR", "BEL", "BLZ", "BEN", "BTN", "BOL", "BIH", "BWA", "BRA",
    "BRN", "BGR", "BFA", "BDI", "CPV", "KHM", "CMR", "CAN", "CAF", "TCD", "CHL", "CHN",
    "COL", "COM", "COG", "COD", "CRI", "CIV", "HRV", "CUB", "CYP", "CZE", "DNK", "DJI",
    "DMA", "DOM", "ECU", "EGY", "SLV", "GNQ", "ERI", "EST", "SWZ", "ETH", "FJI", "FIN",
    "FRA", "GAB", "GMB", "GEO", "DEU", "GHA", "GRC", "GRD", "GTM", "GIN", "GNB", "GUY",
    "HTI", "HND", "HUN", "ISL", "IND", "IDN", "IRN", "IRQ", "IRL", "ISR", "ITA", "JAM",
    "JPN", "JOR", "KAZ", "KEN", "KIR", "PRK", "KOR", "KWT", "KGZ", "LAO", "LVA", "LBN",
    "LSO", "LBR", "LBY", "LIE", "LTU", "LUX", "MDG", "MWI", "MYS", "MDV", "MLI", "MLT",
    "MHL", "MRT", "MUS", "MEX", "FSM", "MDA", "MCO", "MNG", "MNE", "MAR", "MOZ", "MMR",
    "NAM", "NRU", "NPL", "NLD", "NZL", "NIC", "NER", "NGA", "MKD", "NOR", "OMN", "PAK",
    "PLW", "PAN", "PNG", "PRY", "PER", "PHL", "POL", "PRT", "QAT", "ROU", "RUS", "RWA",
    "KNA", "LCA", "VCT", "WSM", "SMR", "STP", "SAU", "SEN", "SRB", "SYC", "SLE", "SGP",
    "SVK", "SVN", "SLB", "SOM", "ZAF", "SSD", "ESP", "LKA", "SDN", "SUR", "SWE", "CHE",
    "SYR", "TJK", "TZA", "THA", "TLS", "TGO", "TON", "TTO", "TUN", "TUR", "TKM", "TUV",
    "UGA", "UKR", "ARE", "GBR", "USA", "URY", "UZB", "VUT", "VAT", "VEN", "VNM", "YEM",
    "ZMB", "ZWE",
]

# Year columns to discard: "1960" through "2000" inclusive
# (the stop value of range() is exclusive, hence 2001).
years = list(map(str, range(1960, 2001)))

# Remove the descriptive name columns together with the unused year columns.
columns_to_drop = ["Country Name", "Indicator Name", *years]
preprocessed_data = data.drop(columns=columns_to_drop)

# Keep only rows of individual countries; combined entries
# (world, asia, etc.) are not in countries_to_keep and are filtered out.
is_country = preprocessed_data["Country Code"].isin(countries_to_keep)
preprocessed_data = preprocessed_data[is_country]

# 3 Move format to long format (and drop N/A rows)

In [None]:
# Identifier columns that stay fixed; every remaining column becomes a
# (Year, Value) pair, i.e. one row per country / indicator / year.
id_columns = ["Country Code", "Indicator Code"]
long_formatted_data = pd.melt(
    preprocessed_data,
    id_vars=id_columns,
    var_name="Year",
    value_name="Value",
)

# Discard every row that contains a missing value (modifies the frame in place).
long_formatted_data.dropna(inplace=True)

# 4 Include features

In [None]:
# The melted "Year" values come from column headers, so cast them to integers.
long_formatted_data["Year"] = long_formatted_data["Year"].astype(int)

# Split the long table into one frame per indicator, naming the value column
# after the indicator it holds.
# NOTE(review): the earlier comment mentioned a pending log/linear conversion
# of GDP; no transform is applied here — confirm what is intended.
is_gdp = long_formatted_data["Indicator Code"] == "NY.GDP.PCAP.CD"
is_life_expectancy = long_formatted_data["Indicator Code"] == "SP.DYN.LE00.IN"
gdp = long_formatted_data[is_gdp].rename(columns={"Value": "GDP"})
life_expectancy = long_formatted_data[is_life_expectancy].rename(
    columns={"Value": "Life Expectancy"}
)

# Join the two indicators on country and year (default inner join, so only
# country/year pairs present in both frames are kept).
merge_keys = ["Country Code", "Year"]
features_data = gdp[merge_keys + ["GDP"]].merge(
    life_expectancy[merge_keys + ["Life Expectancy"]],
    on=merge_keys,
)

# 5 Create dataset suitable for training

Training datasets are normally formatted in a way where the index corresponds to the dependent variable.

In [None]:
# Use the dependent variable (Life Expectancy) as the index and order the
# remaining columns as Country Code, GDP, Year.
# For now Country Code and Year are kept as features; for the final set
# Year and Country Code should be removed.
feature_columns = ["Country Code", "GDP", "Year"]
dataset_final = features_data.set_index("Life Expectancy")[feature_columns]

# Show the resulting dataset.
print(dataset_final)