# Importing Libraries

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load California Housing Dataset

In [12]:

# Fetch the California Housing bundle and wrap its raw arrays in labelled
# pandas objects: one column per feature in X, the target as the Series y.
california = fetch_california_housing()
X = pd.DataFrame(data=california.data, columns=california.feature_names)
y = pd.Series(data=california.target, name='PRICE')

# Normalize numerical features

In [13]:

# Standardise every feature column to zero mean and unit (sample) standard
# deviation. A constant column has std == 0 and would turn into NaN through
# 0/0, so a zero std is replaced by 1, which leaves that column at exactly 0.
# NOTE(review): the mean/std are computed on the full dataset, i.e. before the
# train/test split performed further down, so test rows influence the scaling.
X = (X - X.mean()) / X.std().replace(0, 1)


# Split the dataset

In [14]:

# Hold out 20% of the rows for testing. Plain NumPy arrays are passed on so
# the from-scratch models below can use positional indexing; the fixed seed
# keeps the partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(),
    y.to_numpy(),
    test_size=0.2,
    random_state=42,
)

# Custom Linear Regression Model

In [5]:

class LinearRegressionScratch:
    """Ordinary least-squares linear regression with an intercept.

    After ``fit`` the coefficient vector is stored in ``self.theta`` with the
    intercept first, followed by one weight per feature column.
    """

    @staticmethod
    def _add_bias(X):
        """Return ``X`` as a float array with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        return np.c_[np.ones((X.shape[0], 1)), X]

    def fit(self, X, y):
        """Estimate ``self.theta`` from the training data.

        Parameters
        ----------
        X : array-like, one row per sample and one column per feature.
        y : array-like, one target value per sample.
        """
        X_b = self._add_bias(X)  # Add bias term
        y = np.asarray(y, dtype=float)
        # Solve the least-squares problem directly instead of inverting
        # X_b.T @ X_b: the explicit inverse fails (or returns garbage) when
        # features are collinear, whereas lstsq falls back to the
        # minimum-norm solution for rank-deficient inputs.
        self.theta, *_ = np.linalg.lstsq(X_b, y, rcond=None)

    def predict(self, X):
        """Return the fitted linear prediction for every row of ``X``."""
        return self._add_bias(X) @ self.theta

# Custom Decision Tree and Random Forest

In [6]:


class DecisionTree:
    """Regression tree grown by greedy variance-reduction splits.

    Every node is itself a ``DecisionTree``. A leaf predicts ``value`` (the
    mean target of the training samples that reached it); an internal node
    sends a sample to ``left`` when ``x[feature_index] <= threshold`` and to
    ``right`` otherwise. Feature values are assumed to contain no NaN.
    """

    def __init__(self, max_depth=3):
        self.max_depth = max_depth

    def fit(self, X, y, depth=0):
        """Grow the subtree rooted at this node.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features).
        y : array-like of shape (n_samples,).
        depth : depth of this node; callers leave it at 0, the recursion
            increments it for the children.

        Raises
        ------
        ValueError
            If no training samples are supplied.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if y.shape[0] == 0:
            raise ValueError("Cannot fit a DecisionTree on an empty training set.")

        self.feature_index = None
        self.threshold = None
        self.left = None
        self.right = None
        self.value = np.mean(y)

        # Stay a leaf once the depth budget is spent or the node is pure.
        if depth >= self.max_depth or np.all(y == y[0]):
            return

        self.feature_index, self.threshold = self._best_split(X, y)
        if self.feature_index is None:
            # Every feature is constant here, so nothing can separate the rows.
            return

        left_mask = X[:, self.feature_index] <= self.threshold
        right_mask = ~left_mask
        self.left = DecisionTree(self.max_depth)
        self.right = DecisionTree(self.max_depth)
        self.left.fit(X[left_mask], y[left_mask], depth + 1)
        self.right.fit(X[right_mask], y[right_mask], depth + 1)

    @staticmethod
    def _best_split(X, y):
        """Return ``(feature_index, threshold)`` minimising the size-weighted
        child variance, or ``(None, None)`` when no feature can split the data.

        Each feature is sorted once and every candidate threshold is scored
        from running sums, instead of rebuilding a boolean mask per threshold.
        """
        n_samples, n_features = X.shape
        # Variance is shift-invariant; centring the targets keeps the
        # "sum of squares minus squared sum" identity below well conditioned.
        centred = y - y.mean()

        best_cost = float('inf')
        best_feature, best_threshold = None, None
        for feature in range(n_features):
            order = np.argsort(X[:, feature], kind="stable")
            column = X[order, feature]
            # Position i is a valid cut when the next sorted value is larger:
            # "<= column[i]" then sends exactly i + 1 samples to the left and
            # leaves at least one sample on the right.
            cuts = np.flatnonzero(column[:-1] < column[1:])
            if cuts.size == 0:
                continue

            targets = centred[order]
            running_sum = np.cumsum(targets)
            running_sq = np.cumsum(targets ** 2)

            n_left = cuts + 1
            n_right = n_samples - n_left
            left_sum = running_sum[cuts]
            right_sum = running_sum[-1] - left_sum
            # Sum of squared deviations of each child around its own mean.
            left_sse = running_sq[cuts] - left_sum ** 2 / n_left
            right_sse = (running_sq[-1] - running_sq[cuts]) - right_sum ** 2 / n_right
            cost = (left_sse + right_sse) / n_samples

            # argmin returns the first minimum, i.e. the smallest threshold
            # among ties; the strict "<" keeps the earliest feature on ties.
            best_cut = int(np.argmin(cost))
            if cost[best_cut] < best_cost:
                best_cost = cost[best_cut]
                best_feature = feature
                best_threshold = column[cuts[best_cut]]

        return best_feature, best_threshold

    def predict_one(self, x):
        """Return the prediction for a single sample ``x`` (1-D, indexable)."""
        if self.feature_index is None:
            return self.value
        if x[self.feature_index] <= self.threshold:
            return self.left.predict_one(x)
        else:
            return self.right.predict_one(x)

    def predict(self, X):
        """Return one prediction per row of ``X`` as a float array."""
        X = np.asarray(X, dtype=float)
        if self.feature_index is None:
            return np.full(X.shape[0], self.value, dtype=float)

        # Route whole groups of rows at once instead of walking the tree
        # separately for every sample.
        goes_left = X[:, self.feature_index] <= self.threshold
        out = np.empty(X.shape[0], dtype=float)
        out[goes_left] = self.left.predict(X[goes_left])
        out[~goes_left] = self.right.predict(X[~goes_left])
        return out

class RandomForest:
    """Bagged ensemble of ``DecisionTree`` regressors.

    Each tree is trained on a bootstrap resample (drawn with replacement, same
    size as the training set) and the ensemble prediction is the mean of the
    individual tree predictions. Every tree sees all feature columns.

    Parameters
    ----------
    n_trees : number of trees to grow.
    max_depth : depth limit handed to every ``DecisionTree``.
    random_state : optional integer seed for the bootstrap sampling. With the
        default ``None`` the samples come from NumPy's global random state, so
        ``np.random.seed`` still controls them.
    """

    def __init__(self, n_trees=10, max_depth=3, random_state=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.random_state = random_state
        self.trees = []

    def _rng(self):
        """Return the object whose ``choice`` draws the bootstrap indices."""
        if self.random_state is None:
            return np.random  # module-level functions backed by the global state
        return np.random.RandomState(self.random_state)

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on its own bootstrap resample."""
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(X)
        # A fresh generator per call makes repeated fits with the same
        # random_state produce the same forest.
        rng = self._rng()

        self.trees = []
        for _ in range(self.n_trees):
            idxs = rng.choice(n_samples, n_samples, replace=True)
            tree = DecisionTree(self.max_depth)
            tree.fit(X[idxs], y[idxs])
            self.trees.append(tree)

    def predict(self, X):
        """Return the mean prediction of all trees for every row of ``X``.

        Raises
        ------
        RuntimeError
            If the forest holds no trees (``fit`` not called, or ``n_trees``
            is 0); averaging an empty ensemble would only produce NaN.
        """
        if not self.trees:
            raise RuntimeError("RandomForest has no trees; call fit() with n_trees >= 1 first.")
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return tree_preds.mean(axis=0)

# Custom Gradient Boosting Regressor (XGBoost-style)

In [7]:

class XGBoostRegressor:
    """Gradient-boosted regression trees for squared error.

    The model starts from the mean of the training targets and repeatedly
    fits a ``DecisionTree`` to the current residuals, adding each tree's
    output scaled by ``learning_rate``.

    Parameters
    ----------
    n_estimators : number of boosting rounds (trees).
    learning_rate : shrinkage applied to every tree's contribution.
    max_depth : depth limit handed to every ``DecisionTree``.
    """

    def __init__(self, n_estimators=10, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.base_prediction = None  # mean training target, set by fit()

    def fit(self, X, y):
        """Fit the boosting sequence on ``X`` (2-D) and ``y`` (1-D)."""
        y = np.asarray(y, dtype=float)
        self.trees = []
        # Remember the starting point on the instance so predict() does not
        # depend on whichever training targets happen to be in global scope.
        self.base_prediction = float(np.mean(y))
        pred = np.full(y.shape, self.base_prediction)
        for _ in range(self.n_estimators):
            residuals = y - pred
            tree = DecisionTree(self.max_depth)
            tree.fit(X, residuals)
            pred += self.learning_rate * tree.predict(X)
            self.trees.append(tree)

    def predict(self, X):
        """Return the boosted prediction for every row of ``X``.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called yet.
        """
        if self.base_prediction is None:
            raise RuntimeError("XGBoostRegressor is not fitted; call fit() first.")
        pred = np.full(np.asarray(X).shape[0], self.base_prediction)
        for tree in self.trees:
            pred += self.learning_rate * tree.predict(X)
        return pred


# Evaluation Function

In [8]:

def evaluate(model, name, X=None, y=None):
    """Score ``model`` on a dataset, print the metrics and return them.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    name : label used in the printed summary line.
    X, y : optional features and targets to score on. When omitted, the
        notebook-level hold-out set ``X_test`` / ``y_test`` is used.

    Returns
    -------
    (rmse, r2) : root-mean-squared error and R² of the predictions.
    """
    # Fall back to the global test split so existing two-argument calls
    # behave exactly as before.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    preds = model.predict(X)
    rmse = np.sqrt(mean_squared_error(y, preds))
    r2 = r2_score(y, preds)
    print(f"{name} RMSE: {rmse:.4f}, R²: {r2:.4f}")
    return rmse, r2

# Train and Evaluate All Models

In [9]:


# Fit the closed-form linear model on the training split, then report its
# hold-out metrics; the trailing call's (rmse, r2) tuple is the cell output.
print("Training Linear Regression...")
lin_reg = LinearRegressionScratch()
lin_reg.fit(X=X_train, y=y_train)
evaluate(model=lin_reg, name="Linear Regression")


Training Linear Regression...
Linear Regression RMSE: 0.7456, R²: 0.5758


(0.7455813830127764, 0.5757877060324508)

In [None]:
print("\nTraining Random Forest...")
# The forest draws its bootstrap samples from NumPy's global random state;
# seeding it here makes the reported metrics reproducible across runs.
np.random.seed(42)
rf = RandomForest(n_trees=10, max_depth=4)
rf.fit(X_train, y_train)
evaluate(rf, "Random Forest")

In [None]:
# Fit ten shallow boosting rounds on the training split, then report the
# hold-out metrics; the trailing call's (rmse, r2) tuple is the cell output.
print("\nTraining XGBoost...")
xgb = XGBoostRegressor(
    n_estimators=10,
    learning_rate=0.1,
    max_depth=3,
)
xgb.fit(X=X_train, y=y_train)
evaluate(model=xgb, name="XGBoost")

# Feature Importance Visualization

In [None]:
def compute_feature_importance(model, feature_names):
    """Count how often each feature is used as a split across an ensemble.

    Parameters
    ----------
    model : object with a ``trees`` iterable; every node must expose
        ``feature_index``, ``left`` and ``right``, with
        ``feature_index is None`` marking a leaf.
    feature_names : sequence mapping a column index to its name.

    Returns
    -------
    dict mapping every feature name to its number of splits (0 if unused),
    in the order of ``feature_names``.
    """
    importance = dict.fromkeys(feature_names, 0)

    # Walk all trees with an explicit stack instead of a recursive helper.
    pending = list(model.trees)
    while pending:
        node = pending.pop()
        if node.feature_index is None:
            continue  # leaf: contributes no split
        importance[feature_names[node.feature_index]] += 1
        pending.extend((node.left, node.right))

    return importance

In [None]:
def plot_importance(importance_dict, title):
    """Draw a horizontal bar chart of feature importances, largest first.

    Parameters
    ----------
    importance_dict : mapping of feature name to importance score.
    title : text shown above the chart.
    """
    ranked = sorted(importance_dict.items(), key=lambda pair: pair[1], reverse=True)
    features, scores = zip(*ranked)

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(x=scores, y=features, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Importance (split count)")
    fig.tight_layout()
    plt.show()

In [None]:

# Tally split counts per feature for both tree ensembles trained above.
print("\nVisualizing Feature Importance...")
rf_importance, xgb_importance = (
    compute_feature_importance(ensemble, california.feature_names)
    for ensemble in (rf, xgb)
)

In [None]:
# Bar chart of how often the random forest split on each feature.
plot_importance(
    importance_dict=rf_importance,
    title="Random Forest Feature Importance",
)

In [None]:
# Bar chart of how often the boosted trees split on each feature.
plot_importance(
    importance_dict=xgb_importance,
    title="XGBoost Feature Importance",
)