# Population data (2022–2025) for selected countries

This notebook cleans and structures **World population** data for the following countries:

- United Kingdom
- New Zealand
- India
- United States
- Canada
- Australia

**Target study period:** **2022–2025**.

> Note: the attached source file contains yearly values up to **2024**.  
> We will still create a 2025 row/column, but it will be **missing (NaN)** unless you provide a source that includes 2025.


# Imports

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Load raw data

In [2]:
DATA_PATH = Path(r"/Users/linaabdulsamad/Desktop/Bootcamp/Chocolate/datasets/jl_world_population.csv")

# The file is semicolon-separated (;) — important!
raw = pd.read_csv(DATA_PATH, sep=";")

raw.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,China,CHN,"Population, total",SP.POP.TOTL,667070000.0,660330000.0,665770000.0,682335000.0,698355000.0,715185000.0,...,1379860000.0,1387790000.0,1396215000.0,1402760000.0,1407745000.0,1411100000.0,1412360000.0,1412175000.0,1410710000.0,1408975000.0
1,India,IND,"Population, total",SP.POP.TOTL,435990338.0,446564729.0,457283090.0,468138575.0,479229598.0,490140146.0,...,1328024000.0,1343944000.0,1359657000.0,1374659000.0,1389030000.0,1402618000.0,1414204000.0,1425423000.0,1438070000.0,1450936000.0
2,United States,USA,"Population, total",SP.POP.TOTL,180671000.0,183691000.0,186538000.0,189242000.0,191889000.0,194303000.0,...,321815100.0,324353300.0,326608600.0,328529600.0,330226200.0,331577700.0,332099800.0,334017300.0,336806200.0,340111000.0
3,Russian Federation,RUS,"Population, total",SP.POP.TOTL,119897000.0,121236000.0,122591000.0,123960000.0,125345000.0,126745000.0,...,144640700.0,145015500.0,145293300.0,145398100.0,145453300.0,145245100.0,144746800.0,144236900.0,143826100.0,143533900.0
4,Low income,LIC,"Population, total",SP.POP.TOTL,114749038.0,117449222.0,120247507.0,123153845.0,126203534.0,129388226.0,...,488308300.0,501149300.0,514890500.0,529522000.0,544981000.0,560643100.0,576076600.0,591481500.0,607774700.0,624608100.0


# Select countries & reshape to a tidy format

In [3]:
countries = [
    "United Kingdom",
    "New Zealand",
    "India",
    "United States",
    "Canada",
    "Australia",
]

# Keep only the selected countries
df = raw[raw["Country Name"].isin(countries)].copy()

# Years we need
years = ["2022", "2023", "2024", "2025"]  # 2025 isn't present in source → will become NaN

# Ensure year columns exist (create missing ones as NaN)
for y in years:
    if y not in df.columns:
        df[y] = np.nan

# Keep only the columns we need
df = df[["Country Name", "Country Code"] + years]

# Tidy / long format: one row = (country, year, population)
pop_long = df.melt(
    id_vars=["Country Name", "Country Code"],
    value_vars=years,
    var_name="Year",
    value_name="Population"
)

pop_long["Year"] = pop_long["Year"].astype(int)

# Sort for readability
pop_long = pop_long.sort_values(["Country Name", "Year"]).reset_index(drop=True)

pop_long.head(12)

Unnamed: 0,Country Name,Country Code,Year,Population
0,Australia,AUS,2022,26018720.0
1,Australia,AUS,2023,26659920.0
2,Australia,AUS,2024,27196810.0
3,Australia,AUS,2025,
4,Canada,CAN,2022,38935930.0
5,Canada,CAN,2023,40083480.0
6,Canada,CAN,2024,41288600.0
7,Canada,CAN,2025,
8,India,IND,2022,1425423000.0
9,India,IND,2023,1438070000.0


# Quick checks

In [4]:
# Which years are missing?
pop_long.groupby("Year")["Population"].apply(lambda s: s.isna().sum())

Year
2022    0
2023    0
2024    0
2025    6
Name: Population, dtype: int64

In [5]:
# Basic stats (per year)
pop_long.groupby("Year")["Population"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022,6.0,316180100.0,556842700.0,5081700.0,29248024.25,53269967.0,267414000.0,1425423000.0
2023,6.0,319218500.0,561625400.0,5200000.0,30015812.5,54287742.0,269727700.0,1438070000.0
2024,6.0,322340900.0,566521400.0,5287500.0,30719758.75,55257299.5,272389700.0,1450936000.0
2025,0.0,,,,,,,


# Optional: wide format for joins

In [6]:
# Wide table (index=Year, columns=Country)
pop_wide = pop_long.pivot_table(index="Year", columns="Country Name", values="Population", aggfunc="first")

pop_wide

Country Name,Australia,Canada,India,New Zealand,United Kingdom,United States
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022,26018721.0,38935934.0,1425423000.0,5081700.0,67604000.0,334017321.0
2023,26659922.0,40083484.0,1438070000.0,5200000.0,68492000.0,336806231.0
2024,27196812.0,41288599.0,1450936000.0,5287500.0,69226000.0,340110988.0


In [7]:
locals().keys()

dict_keys(['__name__', '__doc__', '__package__', '__loader__', '__spec__', '__builtin__', '__builtins__', '_ih', '_oh', '_dh', 'In', 'Out', 'get_ipython', 'exit', 'quit', 'open', '_', '__', '___', '__vsc_ipynb_file__', '_i', '_ii', '_iii', '_i1', 'pd', 'np', 'Path', '_i2', 'DATA_PATH', 'raw', '_2', '_i3', 'countries', 'df', 'years', 'y', 'pop_long', '_3', '_i4', '_4', '_i5', '_5', '_i6', 'pop_wide', '_6', '_i7'])

# Save cleaned outputs

In [8]:
pop_long.to_csv(
    "/Users/linaabdulsamad/Desktop/Bootcamp/Chocolate/datasets/population_2022_2025_clean.csv",
    index=False
)

In [9]:
from dotenv import dotenv_values
from sqlalchemy import create_engine
from urllib.parse import quote_plus
import pandas as pd
from sqlalchemy import Integer, String, Float, DateTime, Date

config = dotenv_values()

user = config["POSTGRES_USER"]
password = quote_plus(config["POSTGRES_PASS"])
host = config["POSTGRES_HOST"]
port = config["POSTGRES_PORT"]
dbname = config["POSTGRES_DB"]
schema = config["POSTGRES_SCHEMA"]

In [10]:
url = f'postgresql://{user}:{password}@{host}:{port}/{dbname}'

In [11]:
schema

'air_force'

In [14]:
df_population = pd.read_csv("/Users/linaabdulsamad/Desktop/Bootcamp/Chocolate/datasets/population_2022_2025_clean.csv", index_col=0, parse_dates=True)

  df_population = pd.read_csv("/Users/linaabdulsamad/Desktop/Bootcamp/Chocolate/datasets/population_2022_2025_clean.csv", index_col=0, parse_dates=True)


In [16]:
df_population.head()

Unnamed: 0_level_0,Country Code,Year,Population
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Australia,AUS,2022,26018721.0
Australia,AUS,2023,26659922.0
Australia,AUS,2024,27196812.0
Australia,AUS,2025,
Canada,CAN,2022,38935934.0


In [18]:
df_population.dtypes

Country Code     object
Year              int64
Population      float64
dtype: object

In [21]:
engine = create_engine(url, echo=False)

df_population.to_sql(
    "jl_population_2022_2025", # name of the table in the database
    engine,
    schema = schema,
    if_exists="replace", # this replaces an existing table!
    index=True,
    dtype={
        "Country Name": String(),
        "Country Code": String(),
        "Year": Integer(),
        "Population": Float(),
    }
)

24