# 

In [31]:
import pandas as pd 

file_path = "data/births-and-deaths-projected-to-2100.csv"
data = pd.read_csv(file_path)

data.head()


Unnamed: 0,Entity,Code,Year,Deaths - Sex: all - Age: all - Variant: estimates,Deaths - Sex: all - Age: all - Variant: medium,Births - Sex: all - Age: all - Variant: estimates,Births - Sex: all - Age: all - Variant: medium
0,Afghanistan,AFG,1950,290972.0,,383985.0,
1,Afghanistan,AFG,1951,288752.0,,391002.0,
2,Afghanistan,AFG,1952,288059.0,,397663.0,
3,Afghanistan,AFG,1953,287712.0,,404666.0,
4,Afghanistan,AFG,1954,289189.0,,410428.0,


In [12]:
# inspect the data


data.info()

data.head()

# check for missing values

missing_values = data.isnull().sum()

print('Missing values per colume:')
print(missing_values)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38656 entries, 0 to 38655
Data columns (total 7 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Entity                                             38656 non-null  object 
 1   Code                                               35938 non-null  object 
 2   Year                                               38656 non-null  int64  
 3   Deaths - Sex: all - Age: all - Variant: estimates  18944 non-null  float64
 4   Deaths - Sex: all - Age: all - Variant: medium     19712 non-null  float64
 5   Births - Sex: all - Age: all - Variant: estimates  18722 non-null  float64
 6   Births - Sex: all - Age: all - Variant: medium     19481 non-null  float64
dtypes: float64(4), int64(1), object(2)
memory usage: 2.1+ MB
Missing values per colume:
Entity                                                   0
Code                    

## Helper function to convert values

The result will display in scientific notation so this helper function will convert values to integers and also handle NaN values
Safely converts a column in a DataFrame to integers, handling NaN values.

In [26]:
# try catch function to convert scientific notation and check for missing values

def safe_convert_to_int(df, column_name, fill_value=0):

    try:
        # Fill NaN values with the specified fill_value
        df[column_name] = df[column_name].fillna(fill_value).astype(int)
    except Exception as e:
        print(f"Error converting column '{column_name}' to int: {e}")
    return df



 # Rename Headers and Clean Null Data
 
The column headers in the dataset were unclear and there it included a txt file with descriptors the Birth/Death data projections were in a separate column thatn up to the present year , which resulted in thousands in null values in several columns .

Below we display the column descriptors, clean data and rename columns for clarity.

In [27]:
# Display the column descriptors for reference
with open("data/global_births_deaths_column_descriptors.txt", "r") as file:
    descriptors = file.read()
print("Column Descriptors:\n", descriptors)

# Separate the data based on the Year and column purpose
# Historical data: Years <= 2023, use 'Estimates' columns
historical_data = data[data["Year"] <= 2023][[
    "Entity", "Code", "Year",
    "Deaths - Sex: all - Age: all - Variant: estimates",
    "Births - Sex: all - Age: all - Variant: estimates"
]]

# Projected data: Years > 2023, use 'Projected' columns
projected_data = data[data["Year"] > 2023][[
    "Entity", "Code", "Year",
    "Deaths - Sex: all - Age: all - Variant: medium",
    "Births - Sex: all - Age: all - Variant: medium"
]]

# Rename columns for clarity and consistency
historical_data = historical_data.rename(columns={
    "Entity": "Country",
    "Code": "Country_Code",
    "Deaths - Sex: all - Age: all - Variant: estimates": "Deaths",
    "Births - Sex: all - Age: all - Variant: estimates": "Births"
})

projected_data = projected_data.rename(columns={
    "Entity": "Country",
    "Code": "Country_Code",
    "Deaths - Sex: all - Age: all - Variant: medium": "Deaths",
    "Births - Sex: all - Age: all - Variant: medium": "Births"
})

# Apply safe conversion to the Deaths and Births columns
historical_data = safe_convert_to_int(historical_data, "Deaths")
historical_data = safe_convert_to_int(historical_data, "Births")

projected_data = safe_convert_to_int(projected_data, "Deaths")
projected_data = safe_convert_to_int(projected_data, "Births")

# Verify the changes
print(historical_data.head())
print(projected_data.head())


# Verify the cleaned and separated data
print("Historical Data Sample:")
print(historical_data.head())

print("\nProjected Data Sample:")
print(projected_data.head())


Column Descriptors:
 Column Descriptors:
Entity: Country or area name
Code: Country or area code
Year: Year of observation or projection (2024-2100)
Deaths - Sex: all - Age: all - Variant: estimates: Historical death estimates
Deaths - Sex: all - Age: all - Variant: medium: Projected deaths (medium scenario)
Births - Sex: all - Age: all - Variant: estimates: Historical birth estimates
Births - Sex: all - Age: all - Variant: medium: Projected births (medium scenario)
       Country Country_Code  Year  Deaths  Births
0  Afghanistan          AFG  1950  290972  383985
1  Afghanistan          AFG  1951  288752  391002
2  Afghanistan          AFG  1952  288059  397663
3  Afghanistan          AFG  1953  287712  404666
4  Afghanistan          AFG  1954  289189  410428
        Country Country_Code  Year  Deaths   Births
74  Afghanistan          AFG  2024  243181  1492956
75  Afghanistan          AFG  2025  245867  1507838
76  Afghanistan          AFG  2026  248524  1520756
77  Afghanistan      

# Verify Data Subsets

- check the shap and columns
- validate year range
- Ensure there are no missing values in the subset 


In [30]:
# verify the basic information of the data

print('Historical Data Shape:', historical_data.shape)
print('Historical Data Columns:', historical_data.columns)

print('\nProjected Data Shape:', projected_data.shape)
print('Projected Data Columns:', projected_data.columns)

# validate the year range 
print('Year Range in Historical Data:', historical_data["Year"].min(), historical_data, "-", historical_data["Year"].max())
print('Year Range in Projected Data:', projected_data["Year"].min(), projected_data["Year"].max())

# check for missing values

print('Missing Values in Historical Data:')
print(historical_data.isnull().sum())

print('\nMissing Values in Projected Data:')
print(projected_data.isnull().sum())

# ensure country codes are also present in the projected data

common_countries = set(historical_data['Country']).intersection(set(projected_data['Country']))
print('Number of Common Countries:', len(common_countries))



Historical Data Shape: (18944, 5)
Historical Data Columns: Index(['Country', 'Country_Code', 'Year', 'Deaths', 'Births'], dtype='object')

Projected Data Shape: (19712, 5)
Projected Data Columns: Index(['Country', 'Country_Code', 'Year', 'Deaths', 'Births'], dtype='object')
Year Range in Historical Data: 1950            Country Country_Code  Year  Deaths  Births
0      Afghanistan          AFG  1950  290972  383985
1      Afghanistan          AFG  1951  288752  391002
2      Afghanistan          AFG  1952  288059  397663
3      Afghanistan          AFG  1953  287712  404666
4      Afghanistan          AFG  1954  289189  410428
...            ...          ...   ...     ...     ...
38574     Zimbabwe          ZWE  2019  126467  475267
38575     Zimbabwe          ZWE  2020  126365  481152
38576     Zimbabwe          ZWE  2021  138738  488642
38577     Zimbabwe          ZWE  2022  124995  496240
38578     Zimbabwe          ZWE  2023  124411  496917

[18944 rows x 5 columns] - 2023
Year Ran

In [28]:
#Address missing values 

historical_data.fillna({'Births': historical_data['Births'].mean(), 
                        'Country_Code': 'Unknown'}, inplace=True)

print(historical_data.isnull().sum())


# Aggregate data for visualization

global_historical = historical_data.groupby('Year')[["Deaths", "Births"]].sum().reset_index()
global_projected = projected_data.groupby('Year')[["Deaths", "Births"]].sum().reset_index()


# combine data for plotting 

global_data = pd.concat([global_historical, global_projected])
print(global_data.head())



Country         0
Country_Code    0
Year            0
Deaths          0
Births          0
dtype: int64
   Year     Deaths     Births
0  1950  315995016  581922863
1  1951  313786330  587191205
2  1952  308937347  617929968
3  1953  308173393  618959927
4  1954  304791063  637745325


# Global Trends Line Chart 

Create a line chart to visualize the global trends in births and deaths from 1950 to 2100.
- Includes tooltip for additional information.

In [70]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool 
from bokeh.models import NumeralTickFormatter

output_notebook()

# create the figure

p = figure(
    title='Global Births and Deaths (1950-2100)',
    x_axis_label='Year',
    y_axis_label='count (millions)',
    width=800,
    height=400
)

# add lines for births and deaths

p.line(
    global_data['Year'],global_data['Births'] / 1e6,
    line_width=2,color='blue',legend_label='Births'
)

p.line(
    global_data['Year'],global_data['Deaths'] / 1e6,
    line_width=2,color='red',legend_label='Deaths'
)

# add hover tool

hover = HoverTool(tooltips=[('Year', '@x'), ('Value(Millions)', '@y')])
p.add_tools(hover)

# legend location
p.legend.location = 'top_left'
p.yaxis.formatter = NumeralTickFormatter(format='0.0a')

show(p)

In [None]:
# Inspect and clean non-country entities 

In [69]:
print("Unique 'Country' values in historical_data:")
print(historical_data["Country"].unique())

excluded_labels = [
    "World", "Africa (UN)", "Americas (UN)", "Asia (UN)", "Europe (UN)",
    "Latin America and the Caribbean (UN)", "High-income countries",
    "Upper-middle-income countries", "Lower-middle-income countries",
    "Least developed countries", "Land-locked developing countries (LLDC)",
    "Less developed regions", "Less developed regions excluding China"
]

# Remove excluded labels from historical and projected data
historical_data = historical_data[~historical_data["Country"].isin(excluded_labels)]
projected_data = projected_data[~projected_data["Country"].isin(excluded_labels)]

print("Unique 'Country' values after filtering:")
print(historical_data["Country"].unique())


Unique 'Country' values in historical_data:
['Afghanistan' 'Albania' 'Algeria' 'American Samoa' 'Andorra' 'Angola'
 'Anguilla' 'Antigua and Barbuda' 'Argentina' 'Armenia' 'Aruba'
 'Australia' 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh'
 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin' 'Bermuda' 'Bhutan'
 'Bolivia' 'Bonaire Sint Eustatius and Saba' 'Bosnia and Herzegovina'
 'Botswana' 'Brazil' 'British Virgin Islands' 'Brunei' 'Bulgaria'
 'Burkina Faso' 'Burundi' 'Cambodia' 'Cameroon' 'Canada' 'Cape Verde'
 'Cayman Islands' 'Central African Republic' 'Chad' 'Chile' 'China'
 'Colombia' 'Comoros' 'Congo' 'Cook Islands' 'Costa Rica' "Cote d'Ivoire"
 'Croatia' 'Cuba' 'Curacao' 'Cyprus' 'Czechia'
 'Democratic Republic of Congo' 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'East Timor' 'Ecuador' 'Egypt' 'El Salvador'
 'Equatorial Guinea' 'Eritrea' 'Estonia' 'Eswatini' 'Ethiopia'
 'Falkland Islands' 'Faroe Islands' 'Fiji' 'Finland' 'France'
 'French Guiana' 'French Polynesi

# Country-Wise Bar Chart

**Dynamic Year Selection:**

- Use the dropdown to select a year.
- Chart updates automatically.

**Top 10 Countries**

- Filter to show the top 10 countries by births.

**Interactive Legend:**

- Click to mute/unmute bars for births or deaths.

**Hover Tool:**

- Displays exact values for births and deaths when hovering over bars.


In [73]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, Select, CustomJS
from bokeh.layouts import column

output_notebook()

# Exclude non-country entries based on Country_Code
valid_countries = historical_data[historical_data["Country_Code"].notnull()]

# Alternatively, manually exclude specific labels
excluded_labels = [
    "World", "Africa (UN)", "Americas (UN)", "Asia (UN)", "Europe (UN)",
    "Latin America and the Caribbean (UN)", "High-income countries",
    "Upper-middle-income countries", "Lower-middle-income countries",
    "Least developed countries", "Land-locked developing countries (LLDC)",
    "Less developed regions", "Less developed regions excluding China"
]
valid_countries = valid_countries[~valid_countries["Country"].isin(excluded_labels)]


# Prepare data for all years
def prepare_data():
    # Use valid_countries directly
    valid_data = valid_countries.copy()
    valid_data["Births"] = (valid_data["Births"] / 1e6).round(2)  # Convert to millions and round
    valid_data["Deaths"] = (valid_data["Deaths"] / 1e6).round(2)  # Convert to millions and round
    
    # Create a list of dictionaries for JavaScript
    return valid_data.to_dict('records')

# Get data for a specific year
def get_country_data(year):
    # Use valid_countries directly
    filtered_data = valid_countries[valid_countries["Year"] == year].copy()
    filtered_data["Births"] = (filtered_data["Births"] / 1e6).round(2)
    filtered_data["Deaths"] = (filtered_data["Deaths"] / 1e6).round(2)
    return filtered_data.sort_values(by="Births", ascending=False).head(10)

# Initial setup
default_year = 2023
filtered_data = get_country_data(default_year)

# Create ColumnDataSource
source = ColumnDataSource(data={
    "Country": filtered_data["Country"].tolist(),
    "Births": filtered_data["Births"].tolist(),
    "Deaths": filtered_data["Deaths"].tolist()
})

# Create the figure
p = figure(
    x_range=filtered_data["Country"].tolist(),
    title=f"Top 10 Countries by Births and Deaths in {default_year}",
    x_axis_label="Country",
    y_axis_label="Count (Millions)",
    width=900,
    height=500,
    tools="pan,box_zoom,reset,save,hover"
)

# Add bar glyphs
p.vbar(x="Country", top="Births", width=0.4, source=source, color="blue", legend_label="Births")
p.vbar(x="Country", top="Deaths", width=0.4, source=source, color="red", legend_label="Deaths")

# Customize legend
p.legend.location = "top_left"
p.legend.click_policy = "mute"

# Rotate x-axis labels
p.xaxis.major_label_orientation = 0.8

# Add hover tool
p.add_tools(HoverTool(tooltips=[
    ("Country", "@Country"),
    ("Births (M)", "@Births{0.00}"),
    ("Deaths (M)", "@Deaths{0.00}")
]))

# Create year options
years = sorted(historical_data["Year"].unique())
year_options = [str(year) for year in years]

# Create dropdown
dropdown = Select(
    title="Select Year",
    value=str(default_year),
    options=year_options
)

# Create JavaScript callback
callback = CustomJS(args=dict(
    source=source,
    plot=p,
    all_data=prepare_data()
), code="""
    const year = parseInt(cb_obj.value);
    
    // Filter data for selected year
    const filtered = all_data
        .filter(row => row.Year === year)
        .sort((a, b) => b.Births - a.Births)
        .slice(0, 10);
    
    // Update source data
    const new_data = {
        Country: filtered.map(row => row.Country),
        Births: filtered.map(row => row.Births),
        Deaths: filtered.map(row => row.Deaths)
    };
    
    // Update plot properties
    source.data = new_data;
    plot.x_range.factors = new_data.Country;
    plot.title.text = `Top 10 Countries by Births and Deaths in ${year}`;
""")

# Connect callback to dropdown
dropdown.js_on_change('value', callback)

# Show the plot
show(column(dropdown, p))
