## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.linear_model import LinearRegression
import wbdata
import datetime as dt


## Define countries and timeframe

## Fetch Population Data from World Bank

In [7]:
import wbdata
import pandas as pd
from pathlib import Path

# Country codes of the African countries to request from the World Bank API
african_countries = [
    'DZA','AGO','BEN','BWA','BFA','BDI','CMR','CPV','CAF','TCD','COM','COG','COD',
    'DJI','EGY','GNQ','ERI','SWZ','ETH','GAB','GMB','GHA','GIN','GNB','CIV','KEN',
    'LSO','LBR','LBY','MDG','MWI','MLI','MRT','MUS','MAR','MOZ','NAM','NER','NGA',
    'RWA','STP','SEN','SYC','SLE','SOM','ZAF','SSD','SDN','TZA','TGO','TUN','UGA',
    'ZMB','ZWE'
]

# Indicator: Total Population (World Bank code -> column name in the result)
indicator = {'SP.POP.TOTL': 'Population'}

# Study window (both ends inclusive) and output location, kept in one place
# so they can be changed without touching the pipeline below.
START_YEAR = 2000
END_YEAR = 2020
OUTPUT_CSV = Path('../data/african_population_2000_2020.csv')

# Fetch the full series for every requested country
population_data = wbdata.get_dataframe(indicator, country=african_countries)

# Reset index so 'date' and 'country' become ordinary columns
population_data = population_data.reset_index()

# Parse 'date' and keep only its year. Values that cannot be parsed become
# NaT (errors='coerce'); their year is missing, so the range filter below
# drops those rows.
population_data['year'] = pd.to_datetime(population_data['date'], errors='coerce').dt.year

# Keep START_YEAR..END_YEAR (inclusive) and drop the raw 'date' column,
# which is no longer needed once 'year' exists.
in_window = population_data['year'].between(START_YEAR, END_YEAR)
population_data = population_data[in_window].drop(columns=['date'])

# Save to CSV. to_csv does not create missing directories, so make sure the
# target folder exists first.
OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
population_data.to_csv(OUTPUT_CSV, index=False)

print("✅ Population data fetched, cleaned, and saved successfully!")
population_data.head()

✅ Population data fetched, cleaned, and saved successfully!


Unnamed: 0,country,Population,year
4,Angola,33451132.0,2020
5,Angola,32375632.0,2019
6,Angola,31297155.0,2018
7,Angola,30234839.0,2017
8,Angola,29183070.0,2016


In [13]:
# Sanity check: dimensions of the frame as (rows, columns)
population_data.shape

(1134, 3)

In [15]:
# Number of missing values in each column
population_data.isna().sum()

country       0
Population    0
year          0
dtype: int64

In [17]:
# Order rows by country, then chronologically, and renumber the index from 0
population_data = (
    population_data
    .sort_values(by=['country', 'year'])
    .reset_index(drop=True)
)

In [19]:
# Year-over-year population growth (percent) for each country.
#
# The change is computed inside each country group on rows ordered by year,
# so the result does not depend on how `population_data` happens to be sorted
# when this cell runs. The assignment aligns on the index (which must be
# unique), leaving the frame's own row order untouched. Each country's
# earliest year has no predecessor and therefore comes out as NaN.
population_data['Growth_Rate'] = (
    population_data
    .sort_values(['country', 'year'])
    .groupby('country')['Population']
    .pct_change()
    * 100
)

In [21]:
# Replace missing growth rates with 0.
# NOTE(review): the filled zeros take part in later aggregates as if they
# were real observations of zero growth - confirm this is intended.
population_data = population_data.assign(
    Growth_Rate=lambda frame: frame['Growth_Rate'].fillna(0)
)

In [23]:
# Summary of growth rates
population_data.groupby('country')['Growth_Rate'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Algeria,21.0,1.702334,0.472947,0.0,1.474068,1.857453,2.012797,2.077534
Angola,21.0,3.517819,0.828523,0.0,3.513549,3.670842,3.850575,3.964516
Benin,21.0,2.867405,0.669024,0.0,2.967789,3.012139,3.086791,3.273035
Botswana,21.0,1.652193,0.457382,0.0,1.43777,1.768844,1.957039,2.131842
Burkina Faso,21.0,2.843592,0.679154,0.0,2.962185,3.006266,3.098013,3.196837
Burundi,21.0,3.23687,1.094131,0.0,2.860364,3.452747,3.767405,5.19666
Cabo Verde,21.0,0.607992,0.586982,0.0,0.066139,0.326217,1.157045,1.538879
Cameroon,21.0,2.723781,0.635528,0.0,2.785505,2.846025,2.898884,3.189141
Central African Republic,21.0,1.304017,1.049811,-0.796017,0.684711,1.693969,2.155926,2.421546
Chad,21.0,3.41678,0.847077,0.0,3.361665,3.544566,3.634476,4.555339


In [25]:
# Preview the first five rows of the frame
population_data.head()

Unnamed: 0,country,Population,year,Growth_Rate
0,Algeria,30903893.0,2000,0.0
1,Algeria,31331221.0,2001,1.382764
2,Algeria,31750835.0,2002,1.339284
3,Algeria,32175818.0,2003,1.338494
4,Algeria,32628286.0,2004,1.406236
