# 

In [14]:
import pandas as pd 
import numpy as np 
from bokeh.plotting import figure, show, output_notebook

output_notebook()

file_path = "data/births-and-deaths-projected-to-2100.csv"
data = pd.read_csv(file_path)

data.head()


Unnamed: 0,Entity,Code,Year,Deaths - Sex: all - Age: all - Variant: estimates,Deaths - Sex: all - Age: all - Variant: medium,Births - Sex: all - Age: all - Variant: estimates,Births - Sex: all - Age: all - Variant: medium
0,Afghanistan,AFG,1950,290972.0,,383985.0,
1,Afghanistan,AFG,1951,288752.0,,391002.0,
2,Afghanistan,AFG,1952,288059.0,,397663.0,
3,Afghanistan,AFG,1953,287712.0,,404666.0,
4,Afghanistan,AFG,1954,289189.0,,410428.0,


In [12]:
# inspect the data


data.info()

data.head()

# check for missing values

missing_values = data.isnull().sum()

print('Missing values per colume:')
print(missing_values)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38656 entries, 0 to 38655
Data columns (total 7 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Entity                                             38656 non-null  object 
 1   Code                                               35938 non-null  object 
 2   Year                                               38656 non-null  int64  
 3   Deaths - Sex: all - Age: all - Variant: estimates  18944 non-null  float64
 4   Deaths - Sex: all - Age: all - Variant: medium     19712 non-null  float64
 5   Births - Sex: all - Age: all - Variant: estimates  18722 non-null  float64
 6   Births - Sex: all - Age: all - Variant: medium     19481 non-null  float64
dtypes: float64(4), int64(1), object(2)
memory usage: 2.1+ MB
Missing values per colume:
Entity                                                   0
Code                    

## Helper function to convert values

The result will display in scientific notation so this helper function will convert values to integers and also handle NaN values
Safely converts a column in a DataFrame to integers, handling NaN values.

In [26]:
# try catch function to convert scientific notation and check for missing values

def safe_convert_to_int(df, column_name, fill_value=0):

    try:
        # Fill NaN values with the specified fill_value
        df[column_name] = df[column_name].fillna(fill_value).astype(int)
    except Exception as e:
        print(f"Error converting column '{column_name}' to int: {e}")
    return df



 # Rename Headers and Clean Null Data
 
The column headers in the dataset were unclear and there it included a txt file with descriptors the Birth/Death data projections were in a separate column thatn up to the present year , which resulted in thousands in null values in several columns .

Below we display the column descriptors, clean data and rename columns for clarity.

In [27]:
# Display the column descriptors for reference
with open("data/global_births_deaths_column_descriptors.txt", "r") as file:
    descriptors = file.read()
print("Column Descriptors:\n", descriptors)

# Separate the data based on the Year and column purpose
# Historical data: Years <= 2023, use 'Estimates' columns
historical_data = data[data["Year"] <= 2023][[
    "Entity", "Code", "Year",
    "Deaths - Sex: all - Age: all - Variant: estimates",
    "Births - Sex: all - Age: all - Variant: estimates"
]]

# Projected data: Years > 2023, use 'Projected' columns
projected_data = data[data["Year"] > 2023][[
    "Entity", "Code", "Year",
    "Deaths - Sex: all - Age: all - Variant: medium",
    "Births - Sex: all - Age: all - Variant: medium"
]]

# Rename columns for clarity and consistency
historical_data = historical_data.rename(columns={
    "Entity": "Country",
    "Code": "Country_Code",
    "Deaths - Sex: all - Age: all - Variant: estimates": "Deaths",
    "Births - Sex: all - Age: all - Variant: estimates": "Births"
})

projected_data = projected_data.rename(columns={
    "Entity": "Country",
    "Code": "Country_Code",
    "Deaths - Sex: all - Age: all - Variant: medium": "Deaths",
    "Births - Sex: all - Age: all - Variant: medium": "Births"
})

# Apply safe conversion to the Deaths and Births columns
historical_data = safe_convert_to_int(historical_data, "Deaths")
historical_data = safe_convert_to_int(historical_data, "Births")

projected_data = safe_convert_to_int(projected_data, "Deaths")
projected_data = safe_convert_to_int(projected_data, "Births")

# Verify the changes
print(historical_data.head())
print(projected_data.head())


# Verify the cleaned and separated data
print("Historical Data Sample:")
print(historical_data.head())

print("\nProjected Data Sample:")
print(projected_data.head())


Column Descriptors:
 Column Descriptors:
Entity: Country or area name
Code: Country or area code
Year: Year of observation or projection (2024-2100)
Deaths - Sex: all - Age: all - Variant: estimates: Historical death estimates
Deaths - Sex: all - Age: all - Variant: medium: Projected deaths (medium scenario)
Births - Sex: all - Age: all - Variant: estimates: Historical birth estimates
Births - Sex: all - Age: all - Variant: medium: Projected births (medium scenario)
       Country Country_Code  Year  Deaths  Births
0  Afghanistan          AFG  1950  290972  383985
1  Afghanistan          AFG  1951  288752  391002
2  Afghanistan          AFG  1952  288059  397663
3  Afghanistan          AFG  1953  287712  404666
4  Afghanistan          AFG  1954  289189  410428
        Country Country_Code  Year  Deaths   Births
74  Afghanistan          AFG  2024  243181  1492956
75  Afghanistan          AFG  2025  245867  1507838
76  Afghanistan          AFG  2026  248524  1520756
77  Afghanistan      

# Verify Data Subsets

- check the shap and columns
- validate year range
- Ensure there are no missing values in the subset 


In [17]:
# verify the basic information of the data

print('Historical Data Shape:', historical_data.shape)
print('Historical Data Columns:', historical_data.columns)

print('\nProjected Data Shape:', projected_data.shape)
print('Projected Data Columns:', projected_data.columns)

# validate the year range 
print('Year Range in Historical Data:', historical_data["Year"].min(), historical_data, "-", historical_data["Year"].max())
print('Year Range in Projected Data:', projected_data["Year"].min(), projected_data["Year"].max())

# check for missing values

print('Missing Values in Historical Data:')
print(historical_data.isnull().sum())

print('\nMissing Values in Projected Data:')
print(projected_data.isnull().sum())

# ensure country codes are also present in the projected data

common_countries = set(historical_data['Country']).intersection(set(projected_data['Country']))
print('Number of Common Countries:', len(common_countries))




Historical Data Shape: (18944, 5)
Historical Data Columns: Index(['Country', 'Country_Code', 'Year', 'Deaths', 'Births'], dtype='object')

Projected Data Shape: (19712, 5)
Projected Data Columns: Index(['Country', 'Country_Code', 'Year', 'Deaths', 'Births'], dtype='object')
Year Range in Historical Data: 1950            Country Country_Code  Year     Deaths    Births
0      Afghanistan          AFG  1950  290972.00  383985.0
1      Afghanistan          AFG  1951  288752.00  391002.0
2      Afghanistan          AFG  1952  288059.00  397663.0
3      Afghanistan          AFG  1953  287712.00  404666.0
4      Afghanistan          AFG  1954  289189.00  410428.0
...            ...          ...   ...        ...       ...
38574     Zimbabwe          ZWE  2019  126467.99  475267.0
38575     Zimbabwe          ZWE  2020  126365.00  481152.0
38576     Zimbabwe          ZWE  2021  138738.00  488642.0
38577     Zimbabwe          ZWE  2022  124995.00  496240.0
38578     Zimbabwe          ZWE  2023  1

In [28]:
#Address missing values 

historical_data.fillna({'Births': historical_data['Births'].mean(), 
                        'Country_Code': 'Unknown'}, inplace=True)

print(historical_data.isnull().sum())


# Aggregate data for visualization

global_historical = historical_data.groupby('Year')[["Deaths", "Births"]].sum().reset_index()
global_projected = projected_data.groupby('Year')[["Deaths", "Births"]].sum().reset_index()


# combine data for plotting 

global_data = pd.concat([global_historical, global_projected])
print(global_data.head())



Country         0
Country_Code    0
Year            0
Deaths          0
Births          0
dtype: int64
   Year     Deaths     Births
0  1950  315995016  581922863
1  1951  313786330  587191205
2  1952  308937347  617929968
3  1953  308173393  618959927
4  1954  304791063  637745325
