In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from ydata_profiling import ProfileReport

In [4]:
# loading the dataset
df = pd.read_csv('African crises/African_crises_dataset.csv')
df.head()

Unnamed: 0,country_number,country_code,country,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,currency_crises,inflation_crises,banking_crisis
0,1,DZA,Algeria,1870,1,0.052264,0,0,0.0,3.441456,0,0,0,crisis
1,1,DZA,Algeria,1871,0,0.052798,0,0,0.0,14.14914,0,0,0,no_crisis
2,1,DZA,Algeria,1872,0,0.052274,0,0,0.0,-3.718593,0,0,0,no_crisis
3,1,DZA,Algeria,1873,0,0.05168,0,0,0.0,11.203897,0,0,0,no_crisis
4,1,DZA,Algeria,1874,0,0.051308,0,0,0.0,-3.848561,0,0,0,no_crisis


In [5]:
# basic information on the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1059 entries, 0 to 1058
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   country_number                   1059 non-null   int64  
 1   country_code                     1059 non-null   object 
 2   country                          1059 non-null   object 
 3   year                             1059 non-null   int64  
 4   systemic_crisis                  1059 non-null   int64  
 5   exch_usd                         1059 non-null   float64
 6   domestic_debt_in_default         1059 non-null   int64  
 7   sovereign_external_debt_default  1059 non-null   int64  
 8   gdp_weighted_default             1059 non-null   float64
 9   inflation_annual_cpi             1059 non-null   float64
 10  independence                     1059 non-null   int64  
 11  currency_crises                  1059 non-null   int64  
 12  inflation_crises    

In [6]:
#checking for missing values
df.isnull().sum()

country_number                     0
country_code                       0
country                            0
year                               0
systemic_crisis                    0
exch_usd                           0
domestic_debt_in_default           0
sovereign_external_debt_default    0
gdp_weighted_default               0
inflation_annual_cpi               0
independence                       0
currency_crises                    0
inflation_crises                   0
banking_crisis                     0
dtype: int64

In [7]:
# Generate profile report
profile = ProfileReport(df, title="African Crises Dataset Profiling Report", explorative=True)
profile.to_widgets()  # Display the report in Jupyter Notebook/Lab

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [8]:
#checking for duplicates
print ("Numberof duplicates:", df.duplicated().sum())

Numberof duplicates: 0


In [9]:
df.columns

Index(['country_number', 'country_code', 'country', 'year', 'systemic_crisis',
       'exch_usd', 'domestic_debt_in_default',
       'sovereign_external_debt_default', 'gdp_weighted_default',
       'inflation_annual_cpi', 'independence', 'currency_crises',
       'inflation_crises', 'banking_crisis'],
      dtype='object')

In [10]:
# One-hot encode categorical variables
df = pd.get_dummies(df, columns=['country', 'banking_crisis'])

In [11]:
df.head()

Unnamed: 0,country_number,country_code,year,systemic_crisis,exch_usd,domestic_debt_in_default,sovereign_external_debt_default,gdp_weighted_default,inflation_annual_cpi,independence,...,country_Kenya,country_Mauritius,country_Morocco,country_Nigeria,country_South Africa,country_Tunisia,country_Zambia,country_Zimbabwe,banking_crisis_crisis,banking_crisis_no_crisis
0,1,DZA,1870,1,0.052264,0,0,0.0,3.441456,0,...,False,False,False,False,False,False,False,False,True,False
1,1,DZA,1871,0,0.052798,0,0,0.0,14.14914,0,...,False,False,False,False,False,False,False,False,False,True
2,1,DZA,1872,0,0.052274,0,0,0.0,-3.718593,0,...,False,False,False,False,False,False,False,False,False,True
3,1,DZA,1873,0,0.05168,0,0,0.0,11.203897,0,...,False,False,False,False,False,False,False,False,False,True
4,1,DZA,1874,0,0.051308,0,0,0.0,-3.848561,0,...,False,False,False,False,False,False,False,False,False,True


In [12]:
# Selecting target and feature variable (excluding target and any irrelevant columns)
target = 'systemic_crisis'
features = df.columns.drop([target, 'country_code', 'year'])

X = df[features]
y = df[target]

In [13]:
#splitting dataset 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Scaling the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# Training a Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_scaled, y_train)

In [16]:
# Making predictions on the test set
y_pred = rf_classifier.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9905660377358491


In [17]:
#confuion matrix
print ("Confusion Matrix:", confusion_matrix(y_test, y_pred))

Confusion Matrix: [[194   1]
 [  1  16]]


In [18]:
#classification report
print ("\n Classification Matrix:", classification_report(y_test, y_pred))


 Classification Matrix:               precision    recall  f1-score   support

           0       0.99      0.99      0.99       195
           1       0.94      0.94      0.94        17

    accuracy                           0.99       212
   macro avg       0.97      0.97      0.97       212
weighted avg       0.99      0.99      0.99       212



In [19]:
# Feature importance
feature_importance = pd.DataFrame({'feature': features, 'importance': rf_classifier.feature_importances_})
feature_importance = feature_importance.sort_values('importance', ascending=False)
print("\nTop 10 Most Important Features:")
print(feature_importance.head(10))


Top 10 Most Important Features:
                             feature  importance
23          banking_crisis_no_crisis    0.306305
22             banking_crisis_crisis    0.281400
1                           exch_usd    0.095873
5               inflation_annual_cpi    0.095167
0                     country_number    0.042868
3    sovereign_external_debt_default    0.023666
12                     country_Egypt    0.017767
8                   inflation_crises    0.016742
14                     country_Kenya    0.015077
11  country_Central African Republic    0.014745


## How to improve the model performance

1. Test different algorithms like Logistic Regression, Support Vector Machines, or Gradient Boosting Classifiers.
2. Feature engineering: Create new features or transform existing ones to capture more information.
3. Hyperparameter tuning: Use techniques like Grid Search or Random Search to find optimal hyperparameters for your model.
4. Combine multiple models to create a stronger predictor.
5. Address class imbalance: If the target variable is imbalanced, use techniques like SMOTE or class weighting.
6. Cross-validation: Use k-fold cross-validation to get a more robust estimate of model performance.
7. Feature selection: Use techniques like Recursive Feature Elimination to select the most important features.