# Import Required Libraries
Import pandas, numpy, and other necessary libraries for data processing.

In [1]:
import pandas as pd
import numpy as np
import os

# Load Process Dataset
Load the processdataset into a pandas DataFrame.

In [2]:
# Path to processdataset
processdataset_path = '../processdataset/'

# List of CSV files to load
csv_files = [
    'population_demographics_consolidated.csv',
    'economic_consolidated.csv',
    'health_hdi_consolidated.csv',
    'reference_regional_consolidated.csv',
    'environment_energy_consolidated.csv',
    'employment_consolidated.csv',
    'education_consolidated.csv',
    'urbanization_consolidated.csv'
]

# Load and merge dataframes
dfs = []
for file in csv_files:
    file_path = os.path.join(processdataset_path, file)
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        dfs.append(df)
    else:
        print(f"File not found: {file_path}")

# Merge all dataframes on 'Year'
if dfs:
    merged_df = dfs[0]
    for df in dfs[1:]:
        merged_df = pd.merge(merged_df, df, on='Year', how='outer')
    print("Merged dataframe shape:", merged_df.shape)
    print("Columns:", list(merged_df.columns))
else:
    print("No dataframes to merge")

Merged dataframe shape: (65, 94)
Columns: ['Year', 'PopulationDensity', 'Pop0to14Pct', 'Pop15to64Pct', 'Pop65PlusPct', 'BirthsTotal', 'DeathsTotal', 'BirthRatePer1000', 'DeathRatePer1000', 'FertilityRate', 'MedianAge', 'SexRatio', 'DependencyRatio', 'NetMigration', 'PopulationGrowth', 'RuralPopulation_x', 'UrbanPopulation_x', 'UrbanizationPct', 'UrbanGrowthRate_x', 'GDPTotalBillion', 'GDPPerCapita', 'GDPPPPBillion', 'GDPGrowthRate', 'GNIBillion', 'GNIPerCapita', 'GNIPerCapitaPPP', 'AdjustedNNIPerCapita', 'InflationRate', 'ExportsPercentGDP', 'ImportsPercentGDP', 'TradeBalance', 'FDINetInflowsMillion', 'UnemploymentRate_x', 'LifeExpectancy', 'LifeExpectancyMale', 'LifeExpectancyFemale', 'InfantMortalityRate', 'Under5MortalityRate', 'HealthExpenditureGDP', 'HumanCapitalIndex', 'HumanCapitalIndexMale', 'HumanCapitalIndexFemale', 'LearningAdjustedYears', 'World_Population', 'World_Urbanization_Pct', 'VN_Global_Rank', 'VN_Global_Rank_Population', 'VN_ASEAN_Rank', 'VN_ASEAN_Rank_Population',

# Process and Aggregate Data by Year
Filter and aggregate data for each year from 1955 to 2025, computing or deriving values for each specified column.

In [3]:
# Filter data for years 1955 to 2025
merged_df['Year'] = pd.to_numeric(merged_df['Year'], errors='coerce')
filtered_df = merged_df[(merged_df['Year'] >= 1955) & (merged_df['Year'] <= 2025)].copy()

# Calculate Population if not present
if 'Population' not in filtered_df.columns:
    if 'RuralPopulation' in filtered_df.columns and 'UrbanPopulation' in filtered_df.columns:
        filtered_df['Population'] = pd.to_numeric(filtered_df['RuralPopulation'], errors='coerce') + pd.to_numeric(filtered_df['UrbanPopulation'], errors='coerce')
    else:
        filtered_df['Population'] = np.nan

# Sort by Year
filtered_df = filtered_df.sort_values('Year').reset_index(drop=True)

print("Filtered dataframe shape:", filtered_df.shape)
print("Years range:", filtered_df['Year'].min(), "to", filtered_df['Year'].max())

Filtered dataframe shape: (65, 95)
Years range: 1960 to 2024


# Create New DataFrame with Columns
Construct a new DataFrame with columns: Year, Population, Vietnam Global Rank, ASEAN Population Rank, Vietnam's Share of Asian Pop (%), Country's Share of World Pop, Median Age, Regional Median Age, Global Median Age, Dependency Ratio (%), Sex Ratio (M/F), Pop Aged 0–14 (%), Pop Aged 15–64 (%), Pop Aged 65+ (%), GDP per Capita (USD), HDI, Unemployment Rate (%), GDP Growth Rate (%), FDI Net Inflows (million USD), GDP PPP per Capita (Int$), Fertility Rate, Life Expectancy, Birth Rate (‰), Death Rate (‰), Employment Agriculture (%), Employment Industry (%), Employment Services (%), Poverty Rate (%), Health Expenditure (% GDP), Rural Population, Urban Population, Energy Consumption per Capita (kWh), CO₂ Emissions per Capita (t), Agricultural Land (% Land), Forest Area (% Land), Human Capital Index (0-1), Renewable Energy Share (%).

In [4]:
# Define the column mapping
column_mapping = {
    'Year': 'Year',
    'Population': 'Population',
    'Vietnam Global Rank': 'VN_Global_Rank_Population',
    'ASEAN Population Rank': 'VN_ASEAN_Rank_Population',
    'Vietnam\'s Share of Asian Pop (%)': None,  # Not available
    'Country\'s Share of World Pop': None,  # Not available
    'Median Age': 'MedianAge',
    'Regional Median Age': None,  # Not available
    'Global Median Age': None,  # Not available
    'Dependency Ratio (%)': 'DependencyRatio',
    'Sex Ratio (M/F)': 'SexRatio',
    'Pop Aged 0–14 (%)': 'Pop0to14Pct',
    'Pop Aged 15–64 (%)': 'Pop15to64Pct',
    'Pop Aged 65+ (%)': 'Pop65PlusPct',
    'GDP per Capita (USD)': 'GDPPerCapita',
    'HDI': 'HDI',
    'Unemployment Rate (%)': 'UnemploymentRate',
    'GDP Growth Rate (%)': 'GDPGrowthRate',
    'FDI Net Inflows (million USD)': 'FDINetInflowsMillion',
    'GDP PPP per Capita (Int$)': 'GNIPerCapitaPPP',
    'Fertility Rate': 'FertilityRate',
    'Life Expectancy': 'LifeExpectancy',
    'Birth Rate (‰)': 'BirthRatePer1000',
    'Death Rate (‰)': 'DeathRatePer1000',
    'Employment Agriculture (%)': 'AgricultureEmploymentPct',
    'Employment Industry (%)': 'IndustryEmploymentPct',
    'Employment Services (%)': 'ServicesEmploymentPct',
    'Poverty Rate (%)': 'Poverty_Rate_215_Day',
    'Health Expenditure (% GDP)': 'HealthExpenditureGDP',
    'Rural Population': 'RuralPopulation',
    'Urban Population': 'UrbanPopulation',
    'Energy Consumption per Capita (kWh)': 'EnergyUsePerCapita',
    'CO₂ Emissions per Capita (t)': 'CO2EmissionsPerCapita',
    'Agricultural Land (% Land)': 'AgriLandPercent',
    'Forest Area (% Land)': 'ForestAreaPercent',
    'Human Capital Index (0-1)': 'HumanCapitalIndex',
    'Renewable Energy Share (%)': 'RenewableEnergyPercent'
}

# Create new dataframe with specified columns
new_df = pd.DataFrame()
for new_col, old_col in column_mapping.items():
    if old_col and old_col in filtered_df.columns:
        new_df[new_col] = filtered_df[old_col]
    else:
        new_df[new_col] = np.nan

print("New dataframe shape:", new_df.shape)
print("Columns:", list(new_df.columns))

New dataframe shape: (65, 37)
Columns: ['Year', 'Population', 'Vietnam Global Rank', 'ASEAN Population Rank', "Vietnam's Share of Asian Pop (%)", "Country's Share of World Pop", 'Median Age', 'Regional Median Age', 'Global Median Age', 'Dependency Ratio (%)', 'Sex Ratio (M/F)', 'Pop Aged 0–14 (%)', 'Pop Aged 15–64 (%)', 'Pop Aged 65+ (%)', 'GDP per Capita (USD)', 'HDI', 'Unemployment Rate (%)', 'GDP Growth Rate (%)', 'FDI Net Inflows (million USD)', 'GDP PPP per Capita (Int$)', 'Fertility Rate', 'Life Expectancy', 'Birth Rate (‰)', 'Death Rate (‰)', 'Employment Agriculture (%)', 'Employment Industry (%)', 'Employment Services (%)', 'Poverty Rate (%)', 'Health Expenditure (% GDP)', 'Rural Population', 'Urban Population', 'Energy Consumption per Capita (kWh)', 'CO₂ Emissions per Capita (t)', 'Agricultural Land (% Land)', 'Forest Area (% Land)', 'Human Capital Index (0-1)', 'Renewable Energy Share (%)']


# Save to CSV File
Export the new DataFrame to vietnam_population.csv.

In [5]:
# Save to CSV
output_path = '../processdataset/vietnam_population.csv'
new_df.to_csv(output_path, index=False)
print(f"Data saved to {output_path}")
print("First 5 rows:")
print(new_df.head())

Data saved to ../processdataset/vietnam_population.csv
First 5 rows:
   Year  Population  Vietnam Global Rank  ASEAN Population Rank  \
0  1960         NaN                  NaN                    NaN   
1  1961         NaN                  NaN                    NaN   
2  1962         NaN                  NaN                    NaN   
3  1963         NaN                  NaN                    NaN   
4  1964         NaN                  NaN                    NaN   

   Vietnam's Share of Asian Pop (%)  Country's Share of World Pop  Median Age  \
0                               NaN                           NaN    653817.0   
1                               NaN                           NaN    619282.0   
2                               NaN                           NaN    624174.5   
3                               NaN                           NaN    677340.5   
4                               NaN                           NaN    683751.0   

   Regional Median Age  Global Median Age