<a href="https://colab.research.google.com/github/NajlaZuhir/Financial-Distress-Predictor/blob/main/V2_MLZoomCamp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Imports**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score

from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

import joblib
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score



**Data preparation**

In [None]:
def read_dataframe(filename, threshold=-0.5):
    """Load the CSV and binarise its ``Financial Distress`` column.

    Parameters
    ----------
    filename : str or path-like
        CSV file containing a numeric ``Financial Distress`` column.
    threshold : float, default -0.5
        Cut-off applied to the raw score. Rows with a score strictly greater
        than ``threshold`` are encoded as 0; every other row is encoded as 1.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with ``Financial Distress`` replaced by 0/1 integers.

    Notes
    -----
    The encoding is written out explicitly instead of going through string
    labels and ``LabelEncoder``. The previous version tagged scores above the
    cut-off as 'Distressed' and relied on alphabetical label ordering to end
    up with 0 for that group; it also collapsed to all zeros whenever a file
    contained a single class. The 0/1 values produced here are the same as
    before whenever both classes are present.
    NOTE(review): 1 is presumably the "financially distressed" class (score at
    or below the cut-off, the minority class) - confirm against the dataset
    description.
    """
    df = pd.read_csv(filename)

    # score > threshold -> 0, otherwise -> 1 (fixed mapping, independent of
    # which classes happen to be present in this particular file).
    df['Financial Distress'] = np.where(df['Financial Distress'] > threshold, 0, 1)

    return df

In [None]:
def split_data(X, y, test_size=0.4, val_size=0.5, random_state=42):
    """Stratified train / validation / test split done in two passes.

    Parameters
    ----------
    X, y : array-like
        Features and target; ``y`` is used for stratification.
    test_size : float or int, default 0.4
        Share of the data held out from training (validation + test together).
    val_size : float or int, default 0.5
        Share of that held-out part that becomes the validation set; the
        remainder becomes the test set.
    random_state : int, default 42
        Seed passed to both splits.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
        With the defaults this is a 60 / 20 / 20 split.
    """
    # Pass 1: carve the training set away from the held-out portion.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Pass 2: divide the held-out portion. `val_size` is handed over as
    # `train_size` because the first pair returned here is the validation set.
    # Previously it was passed as `test_size`, which sized the *test* set
    # instead and only matched the parameter name when val_size == 0.5.
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, train_size=val_size, random_state=random_state, stratify=y_temp
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
def scaling_data(X_train, X_val, X_test, return_scaler=False):
    """Standardise the three splits with statistics learned on the training set.

    Parameters
    ----------
    X_train, X_val, X_test : array-like
        Feature matrices for each split.
    return_scaler : bool, default False
        When True the fitted ``StandardScaler`` is appended to the returned
        tuple so it can be persisted and reused at inference time. The default
        keeps the original three-element return value.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_val_scaled, X_test_scaled)`` or, with
        ``return_scaler=True``,
        ``(X_train_scaled, X_val_scaled, X_test_scaled, scaler)``.
    """
    scaler = StandardScaler()
    # Fit on the training split only; validation and test are transformed with
    # those same statistics so nothing from them leaks into the scaling.
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    if return_scaler:
        return X_train_scaled, X_val_scaled, X_test_scaled, scaler
    return X_train_scaled, X_val_scaled, X_test_scaled


In [None]:
def train_model(X_train_scaled, y_train):
    """Fit and return a class-balanced logistic regression.

    Parameters
    ----------
    X_train_scaled : array-like
        Training features (the callers in this notebook pass standardised data).
    y_train : array-like
        Training labels.

    Returns
    -------
    LogisticRegression
        The estimator after ``fit`` has been called on the given data.
    """
    hyperparams = {
        "solver": "liblinear",
        "class_weight": "balanced",  # reweight classes to counter imbalance
        "max_iter": 1000,
    }
    classifier = LogisticRegression(**hyperparams)
    classifier.fit(X_train_scaled, y_train)
    return classifier

In [None]:
def feature_selection(model, X_train, X_train_scaled, X_val_scaled, y_train, y_val,
                      threshold=0.005, window=5):
    """Pick the top-ranked features up to the point where validation F1 plateaus.

    Features are ranked by the absolute value of ``model.coef_[0]``. For every
    k from 1 to the number of features, a fresh logistic regression is fitted
    on the k best-ranked columns and scored on the validation split (macro F1
    and accuracy). The selected k is the first one after which the next
    ``window`` consecutive changes in macro F1 are all smaller than
    ``threshold`` in absolute value.

    Parameters
    ----------
    model : fitted estimator exposing ``coef_``
        Only ``coef_[0]`` is read, to rank the features.
    X_train : pandas.DataFrame
        Unscaled training frame; used solely for its column names/positions.
    X_train_scaled, X_val_scaled : numpy.ndarray
        Scaled matrices whose column order matches ``X_train.columns``.
    y_train, y_val : array-like
        Labels for the two splits.
    threshold : float, default 0.005
        Largest absolute F1 change still regarded as "flat".
    window : int, default 5
        Number of consecutive flat steps required to declare a plateau.

    Returns
    -------
    pandas.Index
        Names of the selected features, most important first. If no plateau is
        found (including when there are not more than ``window`` features),
        every feature is returned.
    """
    feature_importance = pd.Series(
        np.abs(model.coef_[0]),
        index=X_train.columns
    ).sort_values(ascending=False)

    # Resolve ranked names to column positions once, against the frame that was
    # passed in. (This used to read a notebook-level global named `X` and redo
    # the lookup on every iteration.)
    ranked_idx = [X_train.columns.get_loc(f) for f in feature_importance.index]

    f1_scores = []
    accuracy_scores = []
    feature_counts = range(1, len(feature_importance) + 1)

    for k in feature_counts:
        cols_idx = ranked_idx[:k]

        X_train_k = X_train_scaled[:, cols_idx]
        X_val_k = X_val_scaled[:, cols_idx]

        # Same hyper-parameters as train_model; a separate name keeps the
        # `model` argument from being overwritten inside the loop.
        candidate = LogisticRegression(
            max_iter=1000,
            class_weight="balanced",
            solver="liblinear"
        )

        candidate.fit(X_train_k, y_train)
        y_pred = candidate.predict(X_val_k)

        f1_scores.append(f1_score(y_val, y_pred, average="macro"))
        accuracy_scores.append(accuracy_score(y_val, y_pred))

    results_df = pd.DataFrame({
        "num_features": feature_counts,
        "f1_macro": f1_scores,
        "accuracy": accuracy_scores
    })

    # delta_f1[j] is the F1 gained by going from j to j + 1 features.
    results_df["delta_f1"] = results_df["f1_macro"].diff()

    plateau_point = None
    for i in range(len(results_df) - window):
        upcoming = results_df["delta_f1"].iloc[i + 1:i + window + 1]
        if upcoming.abs().max() < threshold:
            plateau_point = int(results_df["num_features"].iloc[i])
            break

    if plateau_point is None:
        # No plateau detected: keep everything, which is what the previous
        # implicit `index[:None]` slice did.
        return feature_importance.index[:]

    return feature_importance.index[:plateau_point]

In [None]:

def predict_financial_distress(new_data_df, trained_model, optimal_features, scaler=None):
    """Predict class labels for new rows using the selected features.

    Parameters
    ----------
    new_data_df : pandas.DataFrame
        New observations. Without ``scaler`` it only needs the columns named in
        ``optimal_features`` and must already be on the scale the model was
        trained on. With ``scaler`` it must hold the raw columns the scaler was
        fitted on, in the same order.
    trained_model : estimator exposing ``predict``
        Model fitted on the (scaled) ``optimal_features`` columns.
    optimal_features : sequence of str
        Column names, in the order the model was trained on.
    scaler : fitted transformer exposing ``transform``, optional
        When given, ``new_data_df`` is transformed first and the selected
        columns are taken from the scaled result. Default ``None`` applies no
        scaling, as before.

    Returns
    -------
    array-like
        Whatever ``trained_model.predict`` returns for the selected columns.

    Raises
    ------
    KeyError
        If a name in ``optimal_features`` is not a column of ``new_data_df``.
    """
    if scaler is None:
        # Hand the model a plain array: the models in this notebook are fitted
        # on ndarrays, not on DataFrames with feature names.
        X_new = new_data_df.loc[:, optimal_features].to_numpy()
    else:
        # Scale every column first (the scaler knows all of them), then pick
        # the selected ones by position.
        cols_idx = [new_data_df.columns.get_loc(f) for f in optimal_features]
        X_new = np.asarray(scaler.transform(new_data_df))[:, cols_idx]

    y_pred = trained_model.predict(X_new)
    return y_pred


In [None]:
def evaluate_model(model, X, y, dataset_name="Dataset"):
    """Score ``model`` on ``(X, y)`` and return the metrics as a tuple.

    Parameters
    ----------
    model : estimator exposing ``predict``
    X : array-like
        Features to predict on.
    y : array-like
        True labels.
    dataset_name : str, default "Dataset"
        Accepted for call-site readability; it is not used in the computation.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1, confusion_matrix, classification_report)``.
    """
    predictions = model.predict(X)
    return (
        accuracy_score(y, predictions),
        f1_score(y, predictions, average='macro'),
        confusion_matrix(y, predictions),
        classification_report(y, predictions),
    )



**Usage / Testing Script**

In [None]:

# =============================
# Step 1: Load dataset
# =============================
filename = "Financial Distress.csv"
df = read_dataframe(filename)

# Separate features and target
X = df.iloc[:, 3:]  # everything after the first three columns (x1 to x83)
y = df['Financial Distress']

# =============================
# Step 2: Split data
# =============================
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y, test_size=0.4, val_size=0.5)

# =============================
# Step 3: Scale data
# =============================
X_train_scaled, X_val_scaled, X_test_scaled = scaling_data(X_train, X_val, X_test)

# =============================
# Step 4: Train initial model
# =============================
# This all-feature model is only used to rank features by coefficient size.
model = train_model(X_train_scaled, y_train)

# =============================
# Step 5: Feature Selection (plateau-based)
# =============================
optimal_features = feature_selection(
    model,
    X_train,
    X_train_scaled,
    X_val_scaled,
    y_train,
    y_val
)

# =============================
# Step 6: Retrain model on optimal features
# =============================
# select only optimal features (positions in the scaled arrays follow X_train's columns)
cols_idx = [X_train.columns.get_loc(f) for f in optimal_features]
X_train_opt = X_train_scaled[:, cols_idx]
X_val_opt = X_val_scaled[:, cols_idx]
X_test_opt = X_test_scaled[:, cols_idx]

final_model = train_model(X_train_opt, y_train)

# =============================
# Step 7: Evaluate on Validation & Test
# =============================
acc_val, f1_val, cm_val, cr_val = evaluate_model(final_model, X_val_opt, y_val, "Validation Set")
acc_test, f1_test, cm_test, cr_test = evaluate_model(final_model, X_test_opt, y_test, "Test Set")

# =============================
# Step 8: Predict on new data (example)
# =============================
# Example: take the first 5 test rows as "new" data. final_model was trained on
# standardised features, so the example rows are taken from X_test_scaled and
# re-labelled with the original column names; feeding raw X_test values would
# put the inputs on a different scale from the one the model was fitted on.
new_data_df = pd.DataFrame(
    X_test_scaled[:5],
    columns=X_test.columns,
    index=X_test.index[:5],
)
predictions = predict_financial_distress(new_data_df, final_model, optimal_features)





In [None]:
# Report the selected features, then the same four metrics for each split,
# and finally the example predictions.
print("Optimal features:", optimal_features)

for header, acc, f1, cm, cr in (
    ("\n--- Validation ---", acc_val, f1_val, cm_val, cr_val),
    ("\n--- Test ---", acc_test, f1_test, cm_test, cr_test),
):
    print(header)
    print("Accuracy:", acc)
    print("F1-macro:", f1)
    print("Confusion Matrix:\n", cm)
    print("Classification Report:\n", cr)

print("\nPredictions for new data:", predictions)


Optimal features: Index(['x36', 'x26', 'x7', 'x25', 'x51', 'x53', 'x5', 'x54', 'x48', 'x81',
       'x33', 'x10', 'x13', 'x3', 'x21', 'x58', 'x30'],
      dtype='object')

--- Validation ---
Accuracy: 0.837874659400545
F1-macro: 0.5893754554215735
Confusion Matrix:
 [[593 114]
 [  5  22]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.84      0.91       707
           1       0.16      0.81      0.27        27

    accuracy                           0.84       734
   macro avg       0.58      0.83      0.59       734
weighted avg       0.96      0.84      0.89       734


--- Test ---
Accuracy: 0.854421768707483
F1-macro: 0.6097041702026292
Confusion Matrix:
 [[605 103]
 [  4  23]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.85      0.92       708
           1       0.18      0.85      0.30        27

    accuracy                           0.85       735
   m

In [None]:
import joblib

# Persist what is needed to reproduce predictions outside this notebook.
#
# - The classifier saved is `final_model` (trained on the selected features),
#   not `model`, which is the all-feature model used only for ranking.
# - scaling_data() keeps its scaler local, so no `scaler` exists at notebook
#   scope. An identical one is refitted here on the training split.
# - The scaler covers every column of X_train. At inference time, scale all
#   columns first and then keep the ones listed in `optimal_features`.
scaler = StandardScaler().fit(X_train)

joblib.dump(final_model, "financial_distress_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(optimal_features, "optimal_features.pkl")


['optimal_features.pkl']