In [2]:
# importing the pandas library
import pandas as pd

In [3]:
# Read the cleaned Africa poverty CSV (the path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="africa_poverty_cleaned.csv")
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Country Name,Country Code,Year,Poverty Rate
0,Madagascar,MDG,1980,61.7
1,Morocco,MAR,1984,22.4
2,Botswana,BWA,1985,50.1
3,Cote d'Ivoire,CIV,1985,14.0
4,Nigeria,NGA,1985,51.1


In [4]:
# Print a structural summary of the frame: column names, non-null counts,
# dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260 entries, 0 to 259
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Country Name  260 non-null    object 
 1   Country Code  260 non-null    object 
 2   Year          260 non-null    int64  
 3   Poverty Rate  260 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 8.3+ KB


In [5]:
# Collect each distinct country name once, in order of first appearance
unique_countries = pd.unique(df['Country Name'])
unique_countries

array(['Madagascar', 'Morocco', 'Botswana', "Cote d'Ivoire", 'Nigeria',
       'Tunisia', 'Lesotho', 'Ghana', 'Mauritania', 'Algeria', 'Uganda',
       'Egypt, Arab Rep.', 'Guinea', 'Guinea-Bissau', 'Senegal',
       'Tanzania', 'Zambia', 'Burundi', 'Central African Republic',
       'Kenya', 'Niger', 'Namibia', 'South Africa', 'Burkina Faso',
       'Mali', 'Eswatini', 'Ethiopia', 'Cameroon', 'Mozambique', 'Malawi',
       'Gambia, The', 'Seychelles', 'Angola', 'Rwanda',
       'Sao Tome and Principe', 'Cabo Verde', 'Djibouti', 'Benin',
       'Sierra Leone', 'Chad', 'Congo, Dem. Rep.', 'Congo, Rep.', 'Gabon',
       'Mauritius', 'Togo', 'Liberia', 'Sudan', 'South Sudan', 'Zimbabwe',
       'Comoros', 'Equatorial Guinea'], dtype=object)

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Year,Poverty Rate
count,260.0,260.0
mean,2006.042308,46.520385
std,10.166668,26.053503
min,1980.0,0.0
25%,1998.0,26.625
50%,2007.0,45.8
75%,2015.0,68.45
max,2022.0,94.9


In [7]:
# Keep only the rows belonging to a hand-picked set of countries
countries_to_filter = ['Kenya', 'South Africa', 'Nigeria', 'Rwanda']
is_selected_country = df['Country Name'].isin(countries_to_filter)
Ky = df.loc[is_selected_country]
# Show up to the first 15 matching rows
Ky.head(n=15)

Unnamed: 0,Country Name,Country Code,Year,Poverty Rate
4,Nigeria,NGA,1985,51.1
27,Kenya,KEN,1992,32.0
29,Nigeria,NGA,1992,55.2
36,South Africa,ZAF,1993,44.6
40,Kenya,KEN,1994,32.0
54,Nigeria,NGA,1996,60.9
57,Kenya,KEN,1997,32.6
75,Rwanda,RWA,2000,82.3
80,South Africa,ZAF,2000,47.9
99,Nigeria,NGA,2003,51.1


In [8]:
# Importing necessary libraries for preprocessing and modeling
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [9]:
# Turn each country name into an integer label for the model.
# Note: the new column is named 'Country Code Encoded' but is derived from
# 'Country Name', not from the 'Country Code' column.
country_encoder = LabelEncoder()
df['Country Code Encoded'] = country_encoder.fit_transform(df['Country Name'])

In [10]:
# Order rows by country, then chronologically within each country, so that a
# per-country shift picks up the preceding observation
sort_keys = ['Country Name', 'Year']
df = df.sort_values(by=sort_keys)

In [11]:
# Lag-1 feature: the poverty rate from the same country's preceding row.
# Observations are not recorded every year, so this is the previous recorded
# value rather than necessarily the previous calendar year's value.
poverty_by_country = df.groupby('Country Name')['Poverty Rate']
df['Poverty Rate Lag1'] = poverty_by_country.shift(periods=1)

In [12]:
# Drop rows that contain missing values; the lag operation leaves each
# country's earliest row without a lag value.
# Reassign rather than mutate with inplace=True: it drops the same rows, but
# reads as an ordinary transformation and composes with method chaining.
df = df.dropna()

In [13]:
# Assemble the model inputs (X) and the value to predict (y)
feature_columns = ['Year', 'Poverty Rate Lag1', 'Country Code Encoded']
X = df.loc[:, feature_columns]
y = df.loc[:, 'Poverty Rate']
# Echo both so the selected columns and row count can be checked by eye
print(f"set X:\n{X}")
print(f" set Y:\n{y}")

set X:
     Year  Poverty Rate Lag1  Country Code Encoded
47   1995               12.3                     0
161  2011               11.8                     0
132  2008               27.0                     1
219  2018               22.1                     1
159  2011               70.5                     2
..    ...                ...                   ...
158  2010               72.0                    49
204  2015               71.4                    49
259  2022               67.9                    49
218  2017               35.7                    50
238  2019               44.7                    50

[209 rows x 3 columns]
 set Y:
47     11.8
161     0.0
132    22.1
219    39.3
159    68.4
       ... 
158    71.4
204    67.9
259    71.7
218    44.7
238    49.2
Name: Poverty Rate, Length: 209, dtype: float64


In [14]:
# Hold out 20% of the rows for testing. The rows are shuffled before splitting
# (seeded for repeatability), so this is a random split, not a chronological one.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [15]:
# Configure a small gradient-boosted tree regressor
xgb_params = {
    'n_estimators': 50,       # number of boosting rounds
    'max_depth': 4,           # depth limit for each tree
    'subsample': 0.8,         # share of training rows sampled for each tree
    'colsample_bytree': 0.8,  # share of feature columns sampled for each tree
}
model = XGBRegressor(**xgb_params)

In [16]:
# Fit the regressor on the training split
model.fit(X=X_train, y=y_train)

In [18]:
# Predict poverty rates for the held-out rows and display them
predictions = model.predict(X=X_test)
predictions

array([50.206722 , 28.129835 , 60.986633 , 73.85439  , 80.91115  ,
       13.735946 , 42.35414  , 29.512718 , 35.918068 , 41.556812 ,
       37.131054 ,  2.929102 , 73.011406 ,  5.6716285, 47.926735 ,
       75.59552  , 68.71342  , 72.090515 , 42.470604 , 44.385567 ,
       76.36022  ,  1.8744596, 49.781254 , 68.40421  , 79.01121  ,
       32.11384  , 20.755795 , 32.88572  , 42.554676 , 43.64565  ,
       77.98338  , 70.11246  , 77.71354  , 28.368082 , 81.079796 ,
       66.96339  , 27.051302 , 65.850006 , 36.58389  , 49.170837 ,
       46.453506 , 18.258177 ], dtype=float32)

In [19]:
# Score the held-out predictions: mean absolute error, and root mean squared
# error computed as the square root of the mean squared error
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
rmse = np.sqrt(mse)
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

Mean Absolute Error (MAE): 9.372414009344011
Root Mean Squared Error (RMSE): 11.99268620803985


In [20]:
# Plotting and interactive-widget dependencies
import matplotlib.pyplot as plt
# ipywidgets supplies the slider used to pick the number of forecast years and
# the dropdown used to pick the country
from ipywidgets import interact, IntSlider, Dropdown
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and data-quality warnings from pandas/xgboost.
# Consider narrowing the filter to the specific warning being hidden.
warnings.filterwarnings("ignore")

In [21]:
# Function to forecast
def forecast_poverty(model, df, country_name, years_to_forecast):
    """Recursively forecast a country's poverty rate, one year at a time.

    Starting from the country's most recent observation in ``df``, each step
    feeds the previous value (the last observed rate, then the previous
    prediction) back into ``model`` as the 'Poverty Rate Lag1' feature.
    Because predictions are chained, errors can accumulate over long horizons.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``. It is called with a one-row
        DataFrame whose columns are 'Year', 'Poverty Rate Lag1' and
        'Country Code Encoded', in that order.
    df : pandas.DataFrame
        Must contain the columns 'Country Name', 'Year', 'Poverty Rate' and
        'Country Code Encoded'.
    country_name : str
        Value to match exactly against 'Country Name'.
    years_to_forecast : int
        Number of consecutive years to forecast after the last observed year.
        Values of zero or below yield two empty lists.

    Returns
    -------
    tuple[list, list]
        ``(forecast_years, forecast_rates)``, two lists of equal length.

    Raises
    ------
    IndexError
        If ``df`` has no rows for ``country_name``.
    """
    country_data = df[df['Country Name'] == country_name].sort_values(by='Year')
    if country_data.empty:
        # The original code failed here too, but with an opaque
        # "index 0 is out of bounds" message. The exception type is kept so
        # existing handlers still work, while the message names the real cause.
        raise IndexError(
            f"No rows found for country {country_name!r}; cannot build a forecast"
        )

    last_year = country_data['Year'].max()
    in_last_year = country_data['Year'] == last_year
    # Seed the recursion with the rate observed in the most recent year.
    last_poverty_rate = country_data.loc[in_last_year, 'Poverty Rate'].iloc[0]
    country_code = country_data['Country Code Encoded'].iloc[0]

    forecast_years = []
    forecast_rates = []

    for i in range(1, years_to_forecast + 1):
        future_year = last_year + i
        X_future = pd.DataFrame({
            'Year': [future_year],
            'Poverty Rate Lag1': [last_poverty_rate],
            'Country Code Encoded': [country_code]
        })
        pred = model.predict(X_future)[0]
        forecast_years.append(future_year)
        forecast_rates.append(pred)
        # The prediction becomes the lag input for the following year.
        last_poverty_rate = pred

    return forecast_years, forecast_rates

In [22]:
# Function to update the chart
def update_chart(country_name, years_to_forecast):
    """Plot a country's historical poverty rates together with its forecast.

    Reads the module-level ``model`` and ``df``; the forecast itself comes
    from ``forecast_poverty``. Draws a new 10x5 figure and shows it.

    Parameters
    ----------
    country_name : str
        Country to chart, matched against the 'Country Name' column.
    years_to_forecast : int
        Number of years to forecast past the last observation.
    """
    forecast_years, forecast_rates = forecast_poverty(
        model, df, country_name, years_to_forecast
    )
    is_country = df['Country Name'] == country_name
    history = df.loc[is_country].sort_values(by='Year')

    fig, ax = plt.subplots(figsize=(10, 5))
    # Observed values as a solid line, forecast as a dashed orange line
    ax.plot(history['Year'], history['Poverty Rate'], marker='o', label='Historical')
    ax.plot(forecast_years, forecast_rates, linestyle='--', marker='x',
            color='orange', label='Forecast')
    ax.set_title(f'Poverty Rate Forecast for {country_name}')
    ax.set_xlabel('Year')
    ax.set_ylabel('Poverty Rate')
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()

# Build the controls: an alphabetised country dropdown and a forecast-horizon
# slider covering 1 to 15 years (initially 5)
country_list = list(df['Country Name'].unique())
country_dropdown = Dropdown(description='Country:', options=sorted(country_list))
year_slider = IntSlider(description='Years:', value=5, min=1, max=15, step=1)

# Redraw the chart whenever either control changes
interact(update_chart, years_to_forecast=year_slider, country_name=country_dropdown)

interactive(children=(Dropdown(description='Country:', options=('Algeria', 'Angola', 'Benin', 'Botswana', 'Bur…

<function __main__.update_chart(country_name, years_to_forecast)>