In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Supervised models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Metrics & plots
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Unsupervised models
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Show every column when a DataFrame is displayed (no "..." truncation).
pd.options.display.max_columns = None


## 1. Data Preparation

In [None]:

# Location of the raw CSV export that the whole notebook works from.
path = "/mnt/data/Global finance data.csv"
df = pd.read_csv(filepath_or_buffer=path)

# Sanity check: (rows, columns) first, then a preview of the leading rows.
print(df.shape)
df.head()


In [None]:

# Print the frame's column dtypes and non-null counts.
df.info()


## 2. Supervised Learning — Classification


**Task:** Predict the categorical target `Banking_Sector_Health` (Weak/Moderate/Strong).  
We will:
1. Split data into features `X` and target `y`  
2. Build a preprocessing pipeline (imputation, scaling, one-hot encoding)  
3. Train **Decision Tree**, **Random Forest**, and **KNN**  
4. Evaluate Accuracy / Precision / Recall / F1 and show confusion matrices  


In [None]:

# Define target and features
target_col = "Banking_Sector_Health"
y = df[target_col].copy()

# Drop the target and the row identifiers from the features, keep the rest.
# "Cluster" is listed too: the K-Means section further down writes that column
# into `df`, so re-running this cell afterwards would otherwise silently add it
# to the feature set. errors='ignore' keeps the drop safe when a column is absent.
drop_cols = [target_col, "Country", "Date", "Cluster"]
X = df.drop(columns=drop_cols, errors='ignore')

# Separate numeric & categorical columns
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

# Numeric features: median imputation, then standardisation so that
# distance-based models (KNN) are not dominated by large-magnitude columns.
num_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical features: fill gaps with the mode, then one-hot encode.
# handle_unknown="ignore" encodes categories unseen during fit as all zeros
# instead of raising at predict time.
cat_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", num_transformer, num_cols),
        ("cat", cat_transformer, cat_cols)
    ]
)

# Train/test split (stratified so both folds keep the class proportions)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train.shape, X_test.shape, y_train.value_counts()


In [None]:

from sklearn.pipeline import Pipeline

def evaluate_clf(model, X_train, X_test, y_train, y_test, name="Model"):
    """Fit `model` behind the preprocessing step and report test-set quality.

    Builds a two-step pipeline (a copy of the module-level `preprocess`
    transformer followed by `model`), fits it on the training split, prints
    accuracy and macro-averaged precision/recall/F1 plus the full
    classification report, and draws a confusion-matrix heatmap.

    Parameters
    ----------
    model : estimator
        Unfitted classifier placed as the final pipeline step.
    X_train, X_test : feature tables for the training and test splits.
    y_train, y_test : target labels for the training and test splits.
    name : str, default "Model"
        Label used in the printed header and the plot title.

    Returns
    -------
    (pipe, metrics) : tuple
        The fitted pipeline and a dict with the keys "Accuracy",
        "Macro Precision", "Macro Recall" and "Macro F1".
    """
    # Clone so each pipeline owns its transformer: without this every returned
    # pipeline would share (and refit) the same global `preprocess` object.
    pipe = Pipeline(steps=[("preprocess", clone(preprocess)), ("model", model)])
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)

    # zero_division=0 scores a class that is never predicted as 0 instead of warning.
    metrics = {
        "Accuracy": accuracy_score(y_test, pred),
        "Macro Precision": precision_score(y_test, pred, average="macro", zero_division=0),
        "Macro Recall": recall_score(y_test, pred, average="macro", zero_division=0),
        "Macro F1": f1_score(y_test, pred, average="macro", zero_division=0)
    }
    print(f"\n{name} metrics:")
    for k, v in metrics.items():
        print(f"{k}: {v:.3f}")
    print("\nClassification report:")
    print(classification_report(y_test, pred, zero_division=0))

    # Use every class present in the truth OR the predictions; restricting the
    # labels to y_test alone would drop predictions of a class missing from it.
    labels = sorted(set(y_test.unique()) | set(pred))
    cm = confusion_matrix(y_test, pred, labels=labels)

    fig, ax = plt.subplots(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_title(f"{name} — Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    plt.show()
    return pipe, metrics

# Candidate classifiers; the tree-based ones are seeded for reproducibility.
dt_model = DecisionTreeClassifier(max_depth=None, random_state=42)
rf_model = RandomForestClassifier(n_estimators=300, max_depth=None, random_state=42)
knn_model = KNeighborsClassifier(n_neighbors=5)

# Fit and score each candidate on the same train/test split.
dt_pipe, dt_metrics = evaluate_clf(
    model=dt_model,
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test,
    name="Decision Tree",
)
rf_pipe, rf_metrics = evaluate_clf(
    model=rf_model,
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test,
    name="Random Forest",
)
knn_pipe, knn_metrics = evaluate_clf(
    model=knn_model,
    X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test,
    name="KNN",
)


In [None]:

# Side-by-side comparison: one row per model, one column per metric.
summary = pd.DataFrame(
    [
        {"Model": label, **scores}
        for label, scores in (
            ("Decision Tree", dt_metrics),
            ("Random Forest", rf_metrics),
            ("KNN", knn_metrics),
        )
    ]
)
summary


## 3. Unsupervised Learning — K-Means Clustering

In [None]:

features_clust = ["Inflation_Rate_Percent", "Unemployment_Rate_Percent"]
Xc = df[features_clust].copy()

# KMeans raises on NaN input, so fill any gaps with the column median (the same
# strategy the supervised pipeline uses). This is a no-op for complete data.
Xc = Xc.fillna(Xc.median())

# Standardise so both rates contribute equally to the Euclidean distances.
scaler = StandardScaler()
Xc_scaled = scaler.fit_transform(Xc)

# Elbow Method: record the within-cluster sum of squares for each candidate k
inertias = []
K_range = range(2, 8)
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto")
    kmeans.fit(Xc_scaled)
    inertias.append(kmeans.inertia_)

fig, ax = plt.subplots()
ax.plot(list(K_range), inertias, marker="o")
ax.set_title("Elbow Method")
ax.set_xlabel("k")
ax.set_ylabel("Inertia")
plt.show()

# Choose k=3 (consistent with 3 macro-states) and fit
k_opt = 3
kmeans = KMeans(n_clusters=k_opt, random_state=42, n_init="auto")
clusters = kmeans.fit_predict(Xc_scaled)
# Stored on `df` so the assignment can be inspected next to the raw columns.
df["Cluster"] = clusters

fig, ax = plt.subplots()
ax.scatter(Xc_scaled[:, 0], Xc_scaled[:, 1], c=clusters, cmap="Set1")
ax.set_title("Clusters (Inflation vs Unemployment, scaled)")
ax.set_xlabel(features_clust[0])
ax.set_ylabel(features_clust[1])
plt.show()

df[["Country"] + features_clust + ["Cluster"]].head(10)


In [None]:

# Optional PCA visualization with more features
pca_features = ["GDP_Growth_Rate_Percent", "Inflation_Rate_Percent",
                "Unemployment_Rate_Percent", "Political_Risk_Score"]
Xp = df[pca_features].copy()

# PCA and KMeans both raise on NaN input, so fill any gaps with the column
# median before scaling. This is a no-op for complete data.
Xp = Xp.fillna(Xp.median())
Xp_scaled = StandardScaler().fit_transform(Xp)

# Project to two components purely for plotting.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(Xp_scaled)

# Clusters are fitted on the full scaled feature set, not on the 2-D projection.
kmeans_p = KMeans(n_clusters=3, random_state=42, n_init="auto").fit(Xp_scaled)
clusters_p = kmeans_p.labels_

fig, ax = plt.subplots()
ax.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters_p, cmap="Set1")
ax.set_title("PCA View of Clusters (4 macro features)")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
plt.show()
