# 

In [31]:
import pandas as pd 

file_path = "data/births-and-deaths-projected-to-2100.csv"
data = pd.read_csv(file_path)

data.head()


Unnamed: 0,Entity,Code,Year,Deaths - Sex: all - Age: all - Variant: estimates,Deaths - Sex: all - Age: all - Variant: medium,Births - Sex: all - Age: all - Variant: estimates,Births - Sex: all - Age: all - Variant: medium
0,Afghanistan,AFG,1950,290972.0,,383985.0,
1,Afghanistan,AFG,1951,288752.0,,391002.0,
2,Afghanistan,AFG,1952,288059.0,,397663.0,
3,Afghanistan,AFG,1953,287712.0,,404666.0,
4,Afghanistan,AFG,1954,289189.0,,410428.0,


In [12]:
# inspect the data


data.info()

data.head()

# check for missing values

missing_values = data.isnull().sum()

print('Missing values per colume:')
print(missing_values)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38656 entries, 0 to 38655
Data columns (total 7 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Entity                                             38656 non-null  object 
 1   Code                                               35938 non-null  object 
 2   Year                                               38656 non-null  int64  
 3   Deaths - Sex: all - Age: all - Variant: estimates  18944 non-null  float64
 4   Deaths - Sex: all - Age: all - Variant: medium     19712 non-null  float64
 5   Births - Sex: all - Age: all - Variant: estimates  18722 non-null  float64
 6   Births - Sex: all - Age: all - Variant: medium     19481 non-null  float64
dtypes: float64(4), int64(1), object(2)
memory usage: 2.1+ MB
Missing values per colume:
Entity                                                   0
Code                    

## Helper function to convert values

The result will display in scientific notation so this helper function will convert values to integers and also handle NaN values
Safely converts a column in a DataFrame to integers, handling NaN values.

In [26]:
# try catch function to convert scientific notation and check for missing values

def safe_convert_to_int(df, column_name, fill_value=0):

    try:
        # Fill NaN values with the specified fill_value
        df[column_name] = df[column_name].fillna(fill_value).astype(int)
    except Exception as e:
        print(f"Error converting column '{column_name}' to int: {e}")
    return df



 # Rename Headers and Clean Null Data
 
The column headers in the dataset were unclear and there it included a txt file with descriptors the Birth/Death data projections were in a separate column thatn up to the present year , which resulted in thousands in null values in several columns .

Below we display the column descriptors, clean data and rename columns for clarity.

In [83]:
# Display the column descriptors for reference
with open("data/global_births_deaths_column_descriptors.txt", "r") as file:
    descriptors = file.read()
print("Column Descriptors:\n", descriptors)

# Separate the data based on the Year and column purpose
# Historical data: Years <= 2023, use 'Estimates' columns
historical_data = data[data["Year"] <= 2023][[
    "Entity", "Code", "Year",
    "Deaths - Sex: all - Age: all - Variant: estimates",
    "Births - Sex: all - Age: all - Variant: estimates"
]]

# Projected data: Years > 2023, use 'Projected' columns
projected_data = data[data["Year"] > 2023][[
    "Entity", "Code", "Year",
    "Deaths - Sex: all - Age: all - Variant: medium",
    "Births - Sex: all - Age: all - Variant: medium"
]]

# Rename columns for clarity and consistency
historical_data = historical_data.rename(columns={
    "Entity": "Country",
    "Code": "Country_Code",
    "Deaths - Sex: all - Age: all - Variant: estimates": "Deaths",
    "Births - Sex: all - Age: all - Variant: estimates": "Births"
})

projected_data = projected_data.rename(columns={
    "Entity": "Country",
    "Code": "Country_Code",
    "Deaths - Sex: all - Age: all - Variant: medium": "Deaths",
    "Births - Sex: all - Age: all - Variant: medium": "Births"
})


# address entity/code columns to just include the country and not region

historical_data = historical_data[historical_data["Country_Code"].notnull()]
projected_data = projected_data[projected_data["Country_Code"].notnull()]

# Apply safe conversion to the Deaths and Births columns
historical_data = safe_convert_to_int(historical_data, "Deaths")
historical_data = safe_convert_to_int(historical_data, "Births")

projected_data = safe_convert_to_int(projected_data, "Deaths")
projected_data = safe_convert_to_int(projected_data, "Births")

# Verify the changes
print(historical_data.head())
print(projected_data.head())


# Verify the cleaned and separated data
print("Historical Data Sample:")
print(historical_data.head())

print("\nProjected Data Sample:")
print(projected_data.head())


Column Descriptors:
 Column Descriptors:
Entity: Country or area name
Code: Country or area code
Year: Year of observation or projection (2024-2100)
Deaths - Sex: all - Age: all - Variant: estimates: Historical death estimates
Deaths - Sex: all - Age: all - Variant: medium: Projected deaths (medium scenario)
Births - Sex: all - Age: all - Variant: estimates: Historical birth estimates
Births - Sex: all - Age: all - Variant: medium: Projected births (medium scenario)
       Country Country_Code  Year  Deaths  Births
0  Afghanistan          AFG  1950  290972  383985
1  Afghanistan          AFG  1951  288752  391002
2  Afghanistan          AFG  1952  288059  397663
3  Afghanistan          AFG  1953  287712  404666
4  Afghanistan          AFG  1954  289189  410428
        Country Country_Code  Year  Deaths   Births
74  Afghanistan          AFG  2024  243181  1492956
75  Afghanistan          AFG  2025  245867  1507838
76  Afghanistan          AFG  2026  248524  1520756
77  Afghanistan      

# Verify Data Subsets

- check the shap and columns
- validate year range
- Ensure there are no missing values in the subset 


In [30]:
# verify the basic information of the data

print('Historical Data Shape:', historical_data.shape)
print('Historical Data Columns:', historical_data.columns)

print('\nProjected Data Shape:', projected_data.shape)
print('Projected Data Columns:', projected_data.columns)

# validate the year range 
print('Year Range in Historical Data:', historical_data["Year"].min(), historical_data, "-", historical_data["Year"].max())
print('Year Range in Projected Data:', projected_data["Year"].min(), projected_data["Year"].max())

# check for missing values

print('Missing Values in Historical Data:')
print(historical_data.isnull().sum())

print('\nMissing Values in Projected Data:')
print(projected_data.isnull().sum())

# ensure country codes are also present in the projected data

common_countries = set(historical_data['Country']).intersection(set(projected_data['Country']))
print('Number of Common Countries:', len(common_countries))



Historical Data Shape: (18944, 5)
Historical Data Columns: Index(['Country', 'Country_Code', 'Year', 'Deaths', 'Births'], dtype='object')

Projected Data Shape: (19712, 5)
Projected Data Columns: Index(['Country', 'Country_Code', 'Year', 'Deaths', 'Births'], dtype='object')
Year Range in Historical Data: 1950            Country Country_Code  Year  Deaths  Births
0      Afghanistan          AFG  1950  290972  383985
1      Afghanistan          AFG  1951  288752  391002
2      Afghanistan          AFG  1952  288059  397663
3      Afghanistan          AFG  1953  287712  404666
4      Afghanistan          AFG  1954  289189  410428
...            ...          ...   ...     ...     ...
38574     Zimbabwe          ZWE  2019  126467  475267
38575     Zimbabwe          ZWE  2020  126365  481152
38576     Zimbabwe          ZWE  2021  138738  488642
38577     Zimbabwe          ZWE  2022  124995  496240
38578     Zimbabwe          ZWE  2023  124411  496917

[18944 rows x 5 columns] - 2023
Year Ran

In [28]:
#Address missing values 

historical_data.fillna({'Births': historical_data['Births'].mean(), 
                        'Country_Code': 'Unknown'}, inplace=True)

print(historical_data.isnull().sum())


# Aggregate data for visualization

global_historical = historical_data.groupby('Year')[["Deaths", "Births"]].sum().reset_index()
global_projected = projected_data.groupby('Year')[["Deaths", "Births"]].sum().reset_index()


# combine data for plotting 

global_data = pd.concat([global_historical, global_projected])
print(global_data.head())



Country         0
Country_Code    0
Year            0
Deaths          0
Births          0
dtype: int64
   Year     Deaths     Births
0  1950  315995016  581922863
1  1951  313786330  587191205
2  1952  308937347  617929968
3  1953  308173393  618959927
4  1954  304791063  637745325


# Global Trends Line Chart 

Create a line chart to visualize the global trends in births and deaths from 1950 to 2100.
- Includes tooltip for additional information.

In [70]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool 
from bokeh.models import NumeralTickFormatter

output_notebook()

# create the figure

p = figure(
    title='Global Births and Deaths (1950-2100)',
    x_axis_label='Year',
    y_axis_label='count (millions)',
    width=800,
    height=400
)

# add lines for births and deaths

p.line(
    global_data['Year'],global_data['Births'] / 1e6,
    line_width=2,color='blue',legend_label='Births'
)

p.line(
    global_data['Year'],global_data['Deaths'] / 1e6,
    line_width=2,color='red',legend_label='Deaths'
)

# add hover tool

hover = HoverTool(tooltips=[('Year', '@x'), ('Value(Millions)', '@y')])
p.add_tools(hover)

# legend location
p.legend.location = 'top_left'
p.yaxis.formatter = NumeralTickFormatter(format='0.0a')

show(p)

# Inspect and clean non-country entities 

In [None]:
print("Unique 'Country' values in historical_data:")
print(historical_data["Country"].unique())

excluded_labels = [
    "World", "Africa (UN)", "Americas (UN)", "Asia (UN)", "Europe (UN)",
    "Latin America and the Caribbean (UN)", "High-income countries",
    "Upper-middle-income countries", "Lower-middle-income countries",
    "Least developed countries", "Land-locked developing countries (LLDC)",
    "Less developed regions", "Less developed regions excluding China"
]

# Remove excluded labels from historical and projected data
historical_data = historical_data[~historical_data["Country"].isin(excluded_labels)]
projected_data = projected_data[~projected_data["Country"].isin(excluded_labels)]

print("Unique 'Country' values after filtering:")
print(historical_data["Country"].unique())


# Country-Wise Bar Chart

**Dynamic Year Selection:**

- Use the dropdown to select a year.
- Chart updates automatically.

**Top 10 Countries**

- Filter to show the top 10 countries by births or Deaths and filter by year

**Hover Tool:**

- Displays exact values for births and deaths when hovering over bars.


In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, Select, CustomJS, HoverTool
from bokeh.layouts import column
from bokeh.models import Legend

output_notebook()

# Exclude non-country entries based on Country_Code
valid_countries = historical_data[historical_data["Country_Code"].notnull()]

# Manually exclude specific labels
excluded_labels = [
    "World", "Africa (UN)", "Americas (UN)", "Asia (UN)", "Europe (UN)",
    "Latin America and the Caribbean (UN)", "High-income countries",
    "Upper-middle-income countries", "Lower-middle-income countries",
    "Least developed countries", "Land-locked developing countries (LLDC)",
    "Less developed regions", "Less developed regions excluding China"
]
valid_countries = valid_countries[~valid_countries["Country"].isin(excluded_labels)]

# Prepare data for all years
def prepare_data():
    valid_data = valid_countries.copy()
    valid_data["Births"] = (valid_data["Births"] / 1e6).round(2)  # Convert to millions and round
    valid_data["Deaths"] = (valid_data["Deaths"] / 1e6).round(2)  # Convert to millions and round
    return valid_data.to_dict('records')

# Get data for a specific year and metric
def get_country_data(year, metric):
    filtered_data = valid_countries[valid_countries["Year"] == year].copy()
    filtered_data[metric] = (filtered_data[metric] / 1e6).round(2)  # Convert to millions
    return filtered_data.sort_values(by=metric, ascending=False).head(10)

# Initial setup
default_year = 2023
default_metric = "Births"
filtered_data = get_country_data(default_year, default_metric)

# Create ColumnDataSource
source = ColumnDataSource(data={
    "Country": filtered_data["Country"].tolist(),
    "Value": filtered_data[default_metric].tolist()
})

# Create the figure
p = figure(
    x_range=filtered_data["Country"].tolist(),
    title=f"Top 10 Countries by {default_metric} in {default_year}",
    x_axis_label="Country",
    y_axis_label="Count (Millions)",
    width=900,
    height=500,
    tools="pan,box_zoom,reset,save"
)

# Define custom colors for metrics
metric_colors = {
    "Births": "#2ca02c",  
    "Deaths": "#1f77b4"   
}#1f77b4

# Add bar glyph - Note: not setting legend_label here
vbar = p.vbar(
    x="Country", 
    top="Value", 
    width=0.4, 
    source=source, 
    color=metric_colors[default_metric]
)

# Create legend separately
legend = Legend(items=[
    (default_metric, [vbar])
])

# Add legend to plot
p.add_layout(legend, 'left')

# Customize legend
p.legend.click_policy = "mute"

# Rotate x-axis labels
p.xaxis.major_label_orientation = 0.8

# Add hover tool
p.add_tools(HoverTool(tooltips=[
    ("Country", "@Country"),
    ("Count (M)", "@Value{0.00}")  # 
]))

# Create year options
years = sorted(historical_data["Year"].unique())
year_options = [str(year) for year in years]

# Create dropdowns
year_dropdown = Select(
    title="Select Year",
    value=str(default_year),
    options=year_options
)

metric_dropdown = Select(
    title="Select Metric",
    value="Births",
    options=["Births", "Deaths"]
)

# Create JavaScript callback
callback = CustomJS(args=dict(
    source=source,
    plot=p,
    all_data=prepare_data(),
    year_dropdown=year_dropdown,
    metric_dropdown=metric_dropdown,
    vbar=vbar,
    metric_colors=metric_colors,
    legend=legend  
), code="""
    const year = parseInt(year_dropdown.value);
    const metric = metric_dropdown.value;

    // Filter data for selected year and metric
    const filtered = all_data
        .filter(row => row.Year === year)
        .sort((a, b) => b[metric] - a[metric])
        .slice(0, 10);
    
    // Update source data
    const new_data = {
        Country: filtered.map(row => row.Country),
        Value: filtered.map(row => row[metric])
    };
    
    // Update plot properties
    source.data = new_data;
    plot.x_range.factors = new_data.Country;
    plot.title.text = `Top 10 Countries by ${metric} in ${year}`;
    
    // Update bar color
    vbar.glyph.fill_color = metric_colors[metric];
    vbar.glyph.line_color = metric_colors[metric];
    
    // Update legend
    legend.items[0].label.value = metric;
    
    // Trigger a plot update
    plot.change.emit();
""")

# Connect callbacks to dropdowns
year_dropdown.js_on_change('value', callback)
metric_dropdown.js_on_change('value', callback)

# Show the plot
show(column(metric_dropdown, year_dropdown, p))

# Net Population Growth/Decline Map Visualization

The Net Population Growth/Decline Map is an interactive choropleth visualization that displays global population dynamics by showing the difference between births and deaths for each country over time. The map uses color encoding to represent population changes, with red indicating population decline (negative growth) and green showing population growth (positive growth).

- Uses GeoJSON format for country boundaries
- Color sensitivity is optimized using percentile-based bounds

**Color Scale**

- Red: Indicates population decline (negative net growth)
- Yellow: Represents near-zero or balanced growth
- Green: Shows population growth (positive net growth)
- Scale is centered on percentile-based bounds for optimal contrast

**Interactive Features**

- **Year Selection:** Users can select different years to view historical and projected population changes
- **Hover Information:** Displays country name and exact net growth/decline values
- **Zoom Capability:** Allows detailed examination of specific regions
- **Color Bar: Shows** the scale of growth/decline in millions


In [122]:
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import GeoJSONDataSource, ColorBar, CustomJS, Select, LinearColorMapper, NumeralTickFormatter, HoverTool
from bokeh.palettes import RdYlGn
from bokeh.layouts import column
import json
import numpy as np

output_notebook()

# Combine historical and projected data
historical_data["Net_Growth"] = historical_data["Births"] - historical_data["Deaths"]
projected_data["Net_Growth"] = projected_data["Births"] - projected_data["Deaths"]
combined_data = pd.concat([historical_data, projected_data])

# Load GeoJSON file
with open("data/countries.geo.json", "r") as file:
    geojson = json.load(file)

# Initial data for the default year
default_year = 2023
default_data = combined_data[combined_data["Year"] == default_year]

# Update GeoJSON features with net growth data
for feature in geojson["features"]:
    country_code = feature["id"]
    country_data = default_data[default_data["Country_Code"] == country_code]
    net_growth = country_data["Net_Growth"].iloc[0] if not country_data.empty else 0
    feature["properties"]["Net_Growth"] = float(net_growth)

# Create GeoJSONDataSource
geojson_data = GeoJSONDataSource(geojson=json.dumps(geojson))

# Calculate more sensitive bounds for the color mapper
# Get the 5th and 95th percentiles for a more focused range
q05 = combined_data["Net_Growth"].quantile(0.05)
q95 = combined_data["Net_Growth"].quantile(0.95)

# Create color mapper with reversed RdYlGn palette (so red is negative, green is positive)
color_mapper = LinearColorMapper(
    palette=RdYlGn[11][::-1],  # Reverse the palette
    low=q05,  # Use 5th percentile for more sensitivity
    high=q95,  # Use 95th percentile for more sensitivity
    nan_color='gray'
)

# Create the figure
p = figure(
    title=f"Net Population Growth/Decline Map ({default_year})",
    width=900,
    height=500,
    tools="pan,wheel_zoom,reset,save",
    x_axis_location=None,
    y_axis_location=None
)

p.grid.grid_line_color = None

# Add the patches
p.patches(
    "xs", "ys",
    source=geojson_data,
    fill_color={'field': 'Net_Growth', 'transform': color_mapper},
    line_color="black",
    line_width=0.5,
    fill_alpha=0.8
)

# Add color bar with correct formatter
color_bar = ColorBar(
    color_mapper=color_mapper,
    width=8,
    location=(0, 0),
    title="Net Growth/Decline (Millions)",
    formatter=NumeralTickFormatter(format="0.0a")
)
p.add_layout(color_bar, "right")

# Add hover tool
hover = HoverTool(tooltips=[
    ('Country', '@name'),
    ('Net Growth/Decline', '@Net_Growth{0,0.0}M')
])
p.add_tools(hover)

# Add year selector
years = sorted(combined_data["Year"].unique())
dropdown = Select(
    title="Select Year",
    value=str(default_year),
    options=[str(year) for year in years]
)

# JavaScript callback
callback = CustomJS(args=dict(
    source=geojson_data,
    all_data=combined_data.to_dict("records"),
    dropdown=dropdown,
    color_mapper=color_mapper,
    title=p.title
), code="""
    const selected_year = parseInt(dropdown.value);
    const data = all_data.filter(row => row.Year === selected_year);
    const geojson = JSON.parse(source.geojson);
    
    // Update each feature's Net_Growth value
    geojson.features.forEach(feature => {
        const country_code = feature.id;
        const country_data = data.find(row => row.Country_Code === country_code);
        feature.properties.Net_Growth = country_data ? country_data.Net_Growth : 0;
    });
    
    // Update the source and title
    source.geojson = JSON.stringify(geojson);
    title.text = `Net Population Growth/Decline Map (${selected_year})`;
""")

# Connect callback
dropdown.js_on_change("value", callback)

# Show the plot
show(column(dropdown, p))