Load Required Libraries

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Silence warnings so library chatter does not drown the notebook output.
warnings.filterwarnings('ignore')
# UndefinedMetricWarning has to be imported (from sklearn.exceptions) before it
# can be named as a category here; without the import this line raises
# NameError on a fresh kernel.
warnings.simplefilter(action='ignore', category=UndefinedMetricWarning)



Data Preparation

In [2]:
# Read the modelling table from disk.
df = pd.read_csv("df_model.csv")

# "loan_status" is the label; every remaining column is used as a feature.
y = df["loan_status"]
X = df.drop(columns=["loan_status"])

# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


Implement Decision Tree from Scratch

In [3]:
class DecisionTreeScratch:
    """Classification tree grown with Gini impurity, written with plain loops.

    Training rows are sequences whose last element is the class label.
    Internal nodes are dicts with the keys ``"index"`` (feature column),
    ``"value"`` (threshold), ``"left"`` and ``"right"``. A child is either
    another node dict or a bare class label (a leaf). Rows whose feature is
    strictly below the threshold go left; all other rows go right.

    Args:
        max_depth: Maximum number of split levels; the root split is depth 1.
        min_samples_split: Smallest group size that may be split again. Child
            groups with fewer rows become leaves. The root is always split.
    """

    def __init__(self, max_depth=10, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def _gini(self, groups, classes):
        """Return the size-weighted Gini impurity of a candidate split.

        Args:
            groups: Row groups produced by the split (empty groups are skipped).
            classes: Class labels whose proportions are measured.
        """
        gini = 0.0
        total_samples = sum(len(group) for group in groups)
        for group in groups:
            size = len(group)
            if size == 0:
                continue
            # Extract the labels once per group rather than once per class.
            labels = [row[-1] for row in group]
            score = 0.0
            for class_val in classes:
                proportion = labels.count(class_val) / size
                score += proportion**2
            gini += (1 - score) * (size / total_samples)
        return gini

    def _split(self, index, value, dataset):
        """Partition ``dataset`` into (rows with feature < value, the rest)."""
        left, right = [], []
        for row in dataset:
            if row[index] < value:
                left.append(row)
            else:
                right.append(row)
        return left, right

    def _best_split(self, dataset):
        """Search every feature/threshold pair and return the purest split.

        Returns:
            A node dict with ``"index"``, ``"value"`` and ``"groups"`` (the
            left/right row groups, consumed later by ``_split_node``).
        """
        class_values = list(set(row[-1] for row in dataset))
        best_index, best_value, best_score, best_groups = None, None, float("inf"), None
        for index in range(len(dataset[0]) - 1):
            # A repeated threshold yields the same split and the same score,
            # and the strict "<" below never lets it replace the first hit.
            # Visiting each distinct value once, in first-seen order, therefore
            # picks the same split as scanning every row.
            for value in dict.fromkeys(row[index] for row in dataset):
                groups = self._split(index, value, dataset)
                gini = self._gini(groups, class_values)
                if gini < best_score:
                    best_index, best_value, best_score, best_groups = index, value, gini, groups
        return {"index": best_index, "value": best_value, "groups": best_groups}

    def _terminal(self, group):
        """Return the most frequent label among the rows of ``group``."""
        outcomes = [row[-1] for row in group]
        return max(set(outcomes), key=outcomes.count)

    def _split_node(self, node, depth):
        """Recursively turn ``node["groups"]`` into children or leaves.

        Args:
            node: Node dict produced by ``_best_split``; modified in place.
            depth: Depth of ``node`` (the root is 1).
        """
        left, right = node["groups"]
        del node["groups"]
        # A split that leaves one side empty separates nothing: make it a leaf.
        if not left or not right:
            node["left"] = node["right"] = self._terminal(left + right)
            return
        if depth >= self.max_depth:
            node["left"], node["right"] = self._terminal(left), self._terminal(right)
            return
        for side, group in (("left", left), ("right", right)):
            # Honor min_samples_split: groups that are too small stay leaves.
            if len(group) < self.min_samples_split:
                node[side] = self._terminal(group)
            else:
                node[side] = self._best_split(group)
                self._split_node(node[side], depth + 1)

    def fit(self, X, y):
        """Grow the tree from features ``X`` and labels ``y``.

        Both inputs are converted with ``np.asarray``. The labels are stacked
        onto ``X`` as the last column, so NumPy promotes features and labels
        to one common dtype.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        dataset = np.hstack((X, y.reshape(-1, 1)))
        self.tree = self._best_split(dataset)
        self._split_node(self.tree, 1)

    def _predict(self, node, row):
        """Walk ``row`` down from ``node`` and return the label of its leaf."""
        if row[node["index"]] < node["value"]:
            if isinstance(node["left"], dict):
                return self._predict(node["left"], row)
            else:
                return node["left"]
        else:
            if isinstance(node["right"], dict):
                return self._predict(node["right"], row)
            else:
                return node["right"]

    def predict(self, X):
        """Return an array holding one predicted label per row of ``X``."""
        predictions = [self._predict(self.tree, row) for row in np.asarray(X)]
        return np.array(predictions)


Implement Random Forest from Scratch

In [4]:
class RandomForestScratch:
    """Bagged ensemble of ``DecisionTreeScratch`` classifiers.

    Every tree is trained on a bootstrap sample (rows drawn with replacement)
    and the forest predicts the label that most trees vote for.

    Args:
        n_trees: Number of trees to grow.
        max_depth: Forwarded to every tree.
        min_samples_split: Forwarded to every tree.
        sample_size: Bootstrap sample size as a fraction of the training rows.
        random_state: Optional integer seed for the bootstrap draws. ``None``
            (the default) keeps drawing from NumPy's global random state.
    """

    def __init__(self, n_trees=10, max_depth=10, min_samples_split=2, sample_size=1.0, random_state=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.sample_size = sample_size
        self.random_state = random_state
        self.trees = []

    def _bootstrap_sample(self, X, y, rng=None):
        """Draw rows of ``X``/``y`` with replacement.

        Args:
            X: Feature array indexed by row.
            y: Label array aligned with ``X``.
            rng: Optional ``np.random.RandomState``; the global ``np.random``
                functions are used when omitted.

        Returns:
            Tuple ``(X_sample, y_sample)``.
        """
        sampler = np.random if rng is None else rng
        n_samples = round(len(X) * self.sample_size)
        indices = sampler.choice(len(X), n_samples, replace=True)
        return X[indices], y[indices]

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on its own bootstrap sample."""
        X = np.asarray(X)
        y = np.asarray(y)
        rng = None if self.random_state is None else np.random.RandomState(self.random_state)
        # Start from an empty forest so that calling fit again replaces the
        # previous trees instead of stacking new ones on top of them.
        self.trees = []
        for _ in range(self.n_trees):
            X_sample, y_sample = self._bootstrap_sample(X, y, rng)
            tree = DecisionTreeScratch(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        """Return the majority-vote label for every row of ``X``.

        Ties go to the smallest label. For 0/1 labels this matches rounding
        the mean vote; unlike the mean it also works for any other label set.

        Raises:
            RuntimeError: If the forest has no trees (``fit`` was not called).
        """
        if not self.trees:
            raise RuntimeError("RandomForestScratch.predict called before fit")
        votes = np.array([tree.predict(X) for tree in self.trees])  # (n_trees, n_rows)
        if votes.size == 0:
            return votes[0]
        labels = np.unique(votes)
        # counts[k, j] = number of trees that voted labels[k] for row j.
        counts = np.array([(votes == label).sum(axis=0) for label in labels])
        return labels[counts.argmax(axis=0)]


Train Models

In [None]:
# The scratch forest indexes rows by position, so hand it plain NumPy arrays.
X_train_np, X_test_np = X_train.values, X_test.values
y_train_np, y_test_np = y_train.values, y_test.values

# The scratch forest draws its bootstrap samples from NumPy's global random
# state. Seed it so this cell builds the same forest on every run, just as
# random_state does for the scikit-learn model below.
np.random.seed(42)

# Train Random Forest from scratch
rf_scratch = RandomForestScratch(n_trees=10, max_depth=1)
rf_scratch.fit(X_train_np, y_train_np)

# Train scikit-learn Random Forest with the same number of trees and depth
rf_sklearn = RandomForestClassifier(n_estimators=10, max_depth=1, random_state=42)
rf_sklearn.fit(X_train, y_train)


Evaluate Models

In [None]:
# Predictions
y_pred_scratch = rf_scratch.predict(X_test_np)
y_pred_sklearn = rf_sklearn.predict(X_test)

# Accuracy comparison
accuracy_scratch = accuracy_score(y_test_np, y_pred_scratch)
accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn)

print(f"Scratch Random Forest Accuracy: {accuracy_scratch}")
print(f"Scikit-learn Random Forest Accuracy: {accuracy_sklearn}")

# Confusion matrices (scikit-learn puts true labels on rows, predictions on columns)
conf_matrix_scratch = confusion_matrix(y_test_np, y_pred_scratch)
conf_matrix_sklearn = confusion_matrix(y_test, y_pred_sklearn)

# One heatmap per model, side by side; axis labels let each panel stand alone.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
panels = (
    (conf_matrix_scratch, "Scratch RF Confusion Matrix"),
    (conf_matrix_sklearn, "Sklearn RF Confusion Matrix"),
)
for ax, (matrix, title) in zip(axes, panels):
    sns.heatmap(matrix, annot=True, fmt="d", ax=ax, cmap="Blues")
    ax.set_title(title)
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
plt.show()


In [4]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Read the modelling table from disk.
df = pd.read_csv("df_model.csv")

# Work on raw NumPy arrays: "loan_status" is the label, every other column is
# a feature.
y = df["loan_status"].values
X = df.drop(columns=["loan_status"]).values

# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Vectorised decision tree used as the base learner of the scratch random forest
class DecisionTreeScratch:
    """Classification tree grown with Gini impurity on NumPy arrays.

    The training data is one 2-D array whose last column holds the class
    label. Internal nodes are dicts with the keys ``"index"`` (feature
    column), ``"value"`` (threshold), ``"left"`` and ``"right"``. A child is
    either another node dict or a bare class label (a leaf). Rows whose
    feature is strictly below the threshold go left; every other row, NaN
    included, goes right.

    Args:
        max_depth: Maximum number of split levels; the root split is depth 1.
        min_samples_split: Smallest group size that may be split again. Child
            groups with fewer rows become leaves. The root is always split.
    """

    def __init__(self, max_depth=10, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def _gini(self, groups, classes):
        """Return the size-weighted Gini impurity of a candidate split.

        Args:
            groups: Row groups produced by the split (empty groups are skipped).
            classes: Class labels whose proportions are measured.
        """
        gini = 0.0
        total_samples = sum(len(group) for group in groups)
        for group in groups:
            size = len(group)
            if size == 0:
                continue
            labels = group[:, -1]
            score = 0.0
            for c in classes:
                proportion = np.sum(labels == c) / size
                score += proportion ** 2
            gini += (1 - score) * (size / total_samples)
        return gini

    def _split(self, index, value, dataset):
        """Partition ``dataset`` into (rows with feature < value, the rest)."""
        goes_left = dataset[:, index] < value
        # Use the complement rather than a second ">=" comparison. A NaN fails
        # both "<" and ">=", so two comparisons would drop the row from both
        # sides. The complement sends it right, which is where _predict routes
        # it too.
        return dataset[goes_left], dataset[~goes_left]

    def _best_split(self, dataset):
        """Search every feature/threshold pair and return the purest split.

        Returns:
            A node dict with ``"index"``, ``"value"`` and ``"groups"`` (the
            left/right row groups, consumed later by ``_split_node``).
        """
        class_values = np.unique(dataset[:, -1])
        best_index, best_value, best_score, best_groups = None, None, float("inf"), None
        for index in range(dataset.shape[1] - 1):
            column = dataset[:, index]
            # A repeated threshold yields the same split and the same score,
            # and the strict "<" below never lets it replace the first hit.
            # Visiting each distinct value once, in first-seen order, therefore
            # picks the same split as scanning every row.
            _, first_seen = np.unique(column, return_index=True)
            for value in column[np.sort(first_seen)]:
                groups = self._split(index, value, dataset)
                gini = self._gini(groups, class_values)
                if gini < best_score:
                    best_index, best_value, best_score, best_groups = index, value, gini, groups
        return {"index": best_index, "value": best_value, "groups": best_groups}

    def _terminal(self, group):
        """Return the most frequent label in ``group`` (smallest label on ties).

        Counting with ``np.unique`` keeps negative and non-integer labels
        intact; the label is returned in the dataset's dtype.
        """
        labels, counts = np.unique(group[:, -1], return_counts=True)
        return labels[counts.argmax()]

    def _split_node(self, node, depth):
        """Recursively turn ``node["groups"]`` into children or leaves.

        Args:
            node: Node dict produced by ``_best_split``; modified in place.
            depth: Depth of ``node`` (the root is 1).
        """
        left, right = node["groups"]
        del node["groups"]
        # A split that leaves one side empty separates nothing: make it a leaf.
        if not len(left) or not len(right):
            node["left"] = node["right"] = self._terminal(np.vstack((left, right)))
            return
        if depth >= self.max_depth:
            node["left"], node["right"] = self._terminal(left), self._terminal(right)
            return
        for side, group in (("left", left), ("right", right)):
            # Honor min_samples_split: groups that are too small stay leaves.
            if len(group) < self.min_samples_split:
                node[side] = self._terminal(group)
            else:
                node[side] = self._best_split(group)
                self._split_node(node[side], depth + 1)

    def fit(self, X, y):
        """Grow the tree from features ``X`` and labels ``y``.

        Both inputs are converted with ``np.asarray``. The labels are stacked
        onto ``X`` as the last column, so NumPy promotes features and labels
        to one common dtype.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        dataset = np.hstack((X, y.reshape(-1, 1)))
        self.tree = self._best_split(dataset)
        self._split_node(self.tree, 1)

    def _predict(self, node, row):
        """Walk ``row`` down from ``node`` and return the label of its leaf."""
        if row[node["index"]] < node["value"]:
            if isinstance(node["left"], dict):
                return self._predict(node["left"], row)
            else:
                return node["left"]
        else:
            if isinstance(node["right"], dict):
                return self._predict(node["right"], row)
            else:
                return node["right"]

    def predict(self, X):
        """Return an array holding one predicted label per row of ``X``."""
        return np.array([self._predict(self.tree, row) for row in np.asarray(X)])


# RandomForestScratch is defined in an earlier cell and draws its bootstrap
# samples from NumPy's global random state. Seed it so the scratch results
# are repeatable, just as random_state does for the scikit-learn model.
np.random.seed(42)

# Train Random Forest from scratch
rf_scratch = RandomForestScratch(n_trees=5, max_depth=1, min_samples_split=10)
rf_scratch.fit(X_train, y_train)

# Predict with Random Forest from scratch
y_pred_scratch = rf_scratch.predict(X_test)

# Train scikit-learn Random Forest
rf_sklearn = RandomForestClassifier(n_estimators=5, max_depth=1, random_state=42)
rf_sklearn.fit(X_train, y_train)
y_pred_sklearn = rf_sklearn.predict(X_test)

# Evaluate performance.
# zero_division=0 reports precision/F1 as 0 for a class that is never
# predicted without emitting an UndefinedMetricWarning for each such metric.
print("Accuracy (Scratch):", accuracy_score(y_test, y_pred_scratch))
print("Confusion Matrix (Scratch):\n", confusion_matrix(y_test, y_pred_scratch))
print("Classification Report (Scratch):\n", classification_report(y_test, y_pred_scratch, zero_division=0))

print("\nAccuracy (sklearn):", accuracy_score(y_test, y_pred_sklearn))
print("Confusion Matrix (sklearn):\n", confusion_matrix(y_test, y_pred_sklearn))
print("Classification Report (sklearn):\n", classification_report(y_test, y_pred_sklearn, zero_division=0))


Accuracy (Scratch): 0.7765921973991331
Confusion Matrix (Scratch):
 [[6987    0]
 [2010    0]]
Classification Report (Scratch):
               precision    recall  f1-score   support

           0       0.78      1.00      0.87      6987
           1       0.00      0.00      0.00      2010

    accuracy                           0.78      8997
   macro avg       0.39      0.50      0.44      8997
weighted avg       0.60      0.78      0.68      8997


Accuracy (sklearn): 0.7765921973991331
Confusion Matrix (sklearn):
 [[6987    0]
 [2010    0]]
Classification Report (sklearn):
               precision    recall  f1-score   support

           0       0.78      1.00      0.87      6987
           1       0.00      0.00      0.00      2010

    accuracy                           0.78      8997
   macro avg       0.39      0.50      0.44      8997
weighted avg       0.60      0.78      0.68      8997



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
