# Decision Tree and Random Forest Implementation

> In this notebook we implement the Decision Tree and Random Forest algorithms from scratch and then compare them with the implementations from the scikit-learn library.

In [1]:
import numpy as np
import pandas as pd
from error_define import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from dataset import *

In [2]:
# Load the dataset and take the underlying arrays of both returned objects
# (assumes load_dataset() returns a (features, target) pair of pandas objects).
X, y = (part.values for part in load_dataset())

# Hold out 33% of the rows for testing; the fixed seed keeps the split
# identical every time this cell is run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)



Importing the scratch implementation of Decision Tree from dec_tree.py and using it for training and prediction.

In [9]:
# Import the from-scratch DecisionTreeRegressor implemented in dec_tree.py.
from dec_tree import DecisionTreeRegressor

# Build the scratch tree with its default constructor arguments.
tree = DecisionTreeRegressor()

# Fit the model on the training split
tree.fit(X_train, y_train)
# Predict the test set; y_pred is what the evaluation cell below scores
y_pred = tree.predict(X_test)


Evaluating the performance of our DecisionTreeRegressor implementation

In [12]:
# Evaluate whatever predictions are currently stored in y_pred against y_test.
mse = mean_squared_error(y_test, y_pred)
# downside_square_error is a project helper brought in by a star import
# (presumably error_define); its exact definition is not visible here.
dse = downside_square_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"Downside Squared Error: {dse}")

Mean Squared Error: 2722.166545290609
Downside Squared Error: 1473.2391975308642


Using library implementation of Decision Tree Regressor from scikit-learn library to compare with our results

In [13]:
# NOTE: this import rebinds the name DecisionTreeRegressor from the scratch
# class (dec_tree.py) to scikit-learn's implementation.
from sklearn.tree import DecisionTreeRegressor

# Create the scikit-learn decision tree regressor used as the reference model.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at each split, so an unseeded tree can report different metrics
# from one run to the next.
decision_tree_regressor = DecisionTreeRegressor(random_state=42)

# Train the reference regressor on the training data.
decision_tree_regressor.fit(X_train, y_train)

# Predict the test data; y_pred is overwritten and scored by the next cell.
y_pred = decision_tree_regressor.predict(X_test)

Evaluating Performance of Sklearn DecisionTreeRegressor

In [14]:
# Evaluate the scikit-learn tree: y_pred was last written by the cell above.
mse = (mean_squared_error(y_test, y_pred))
# Project-specific metric (star-imported helper; definition not shown here).
dse = downside_square_error(y_test, y_pred)
print(f"Mean Squared Error(scikit-learn): {mse}")
print(f"Downside Squared Error(scikit-learn): {dse}")

Mean Squared Error(scikit-learn): 2899.0544458315994
Downside Squared Error(scikit-learn): 1544.4888334026912


In [3]:
# Repeat the decision-tree experiment on the dataset returned by
# load_dataset_features() (presumably the feature-engineered variant — confirm
# against dataset.py). This rebinds X, y and all four split variables.
X,y = load_dataset_features()
X = X.values
y = y.values
# Split the data into training and testing sets (same 33% hold-out and seed as before)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Re-import the from-scratch DecisionTreeRegressor from dec_tree.py; an earlier
# cell rebound this name to scikit-learn's class.
from dec_tree import DecisionTreeRegressor

tree = DecisionTreeRegressor()

# Fit the model
tree.fit(X_train, y_train)
# Predict the test set
y_pred = tree.predict(X_test)



In [6]:
# Evaluate the scratch tree's predictions (y_pred from the cell above).
mse = mean_squared_error(y_test, y_pred)
# Project-specific metric (star-imported helper; definition not shown here).
dse = downside_square_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"Downside Squared Error: {dse}")

Mean Squared Error: 587.848175891247
Downside Squared Error: 297.85436607019005


Using library implementation of Decision Tree Regressor from scikit-learn library to compare with our results

In [9]:
# NOTE: this import rebinds the name DecisionTreeRegressor from the scratch
# class (dec_tree.py) to scikit-learn's implementation.
from sklearn.tree import DecisionTreeRegressor

# Create the scikit-learn decision tree regressor used as the reference model.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at each split, so an unseeded tree can report different metrics
# from one run to the next.
decision_tree_regressor = DecisionTreeRegressor(max_depth=100, random_state=42)

# Train the reference regressor on the training data.
decision_tree_regressor.fit(X_train, y_train)

# Predict the test data; y_pred is overwritten and scored by the next cell.
y_pred = decision_tree_regressor.predict(X_test)

Evaluating Performance of Sklearn DecisionTreeRegressor

In [10]:
# Evaluate the scikit-learn tree: y_pred was last written by the cell above.
mse = (mean_squared_error(y_test, y_pred))
# Project-specific metric (star-imported helper; definition not shown here).
dse = downside_square_error(y_test, y_pred)
print(f"Mean Squared Error(scikit-learn): {mse}")
print(f"Downside Squared Error(scikit-learn): {dse}")

Mean Squared Error(scikit-learn): 782.5233562213899
Downside Squared Error(scikit-learn): 345.6849597725066


Implementation of Random Forest using scikit-learn's DecisionTreeRegressor as the base learner

In [11]:
from sklearn.tree import DecisionTreeRegressor

class RandomForestRegressor:
    """
    Bagging ensemble of regression trees whose predictions are averaged.

    Every tree is trained on a bootstrap resample (drawn with replacement and
    of the same size as the training set). The class expects a
    ``DecisionTreeRegressor`` with scikit-learn's constructor signature to be
    in scope when ``fit`` is called.
    """

    def __init__(self, n_estimators=10, max_depth=10, min_size=2, C=None, random_state=None):
        """
        Initialize the RandomForestRegressor.

        Parameters:
        - n_estimators (int): Number of decision trees in the forest.
        - max_depth (int): Maximum depth of each decision tree.
        - min_size (int): Minimum number of samples required to split a node
          (forwarded to each tree as ``min_samples_split``).
        - C (int or None): Number of features to consider when looking for the
          best split (forwarded to each tree as ``max_features``); None means
          all features.
        - random_state (int or None): Seed for the bootstrap sampling and the
          per-tree seeds. None (the default) keeps the previous behaviour of
          drawing from NumPy's global random state.
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_size = min_size
        self.C = C
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        """
        Fit the RandomForestRegressor to the training data.

        Any previously fitted trees are discarded, so calling ``fit`` twice
        does not grow the forest beyond ``n_estimators``.

        Parameters:
        - X (array-like): Training input samples.
        - y (array-like): Target values.

        Returns:
        - self: The fitted estimator.

        Raises:
        - ValueError: If X and y do not contain the same number of samples.
        """
        # Positional (NumPy) indexing is required for the bootstrap below.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}."
            )

        seeded = self.random_state is not None
        # Unseeded: keep using the global NumPy RNG exactly as before.
        rng = np.random.RandomState(self.random_state) if seeded else np.random
        n_samples = len(X)

        # Build into a fresh list so a refit replaces, rather than extends, the forest.
        trees = []
        for _ in range(self.n_estimators):
            indices = rng.choice(n_samples, n_samples, replace=True)  # Bootstrap sampling
            tree_seed = int(rng.randint(np.iinfo(np.int32).max)) if seeded else None
            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_size,
                max_features=self.C,
                random_state=tree_seed,
            )
            tree.fit(X[indices], y[indices])
            trees.append(tree)

        self.trees = trees
        return self

    def predict(self, X_test):
        """
        Predict target values for the test data.

        Parameters:
        - X_test (array-like): Test input samples.

        Returns:
        - predictions (array-like): Mean of the individual tree predictions,
          one value per test sample.

        Raises:
        - ValueError: If the forest holds no fitted trees.
        """
        if not self.trees:
            # Averaging over zero trees would silently yield NaN predictions.
            raise ValueError("RandomForestRegressor has no fitted trees; call fit() first.")

        # One column per tree, then average across columns for each sample.
        predictions = np.zeros((len(X_test), len(self.trees)))
        for i, tree in enumerate(self.trees):
            predictions[:, i] = tree.predict(X_test)
        return np.mean(predictions, axis=1)

Using the implemented Random Forest model for training and prediction

In [12]:
# Reload the original dataset and take the underlying arrays of both returned
# objects (assumes load_dataset() returns a (features, target) pair of pandas
# objects).
X, y = (part.values for part in load_dataset())

# Hold out 33% of the rows for testing; the fixed seed keeps the split
# identical every time this cell is run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [13]:
# Initialize the random forest regressor.
# RandomForestRegressor must be the class defined in this notebook here: it
# takes min_size and C, which scikit-learn's class does not accept.
random_forest = RandomForestRegressor(n_estimators=100, max_depth=50, min_size=2, C=None)

# Fit the random forest model to the training data
random_forest.fit(X_train, y_train)

# Make predictions on the test data; y_pred is scored by the next cell
y_pred = random_forest.predict(X_test)



Evaluating the performance of our RandomForestRegressor implementation

In [14]:
# Evaluate the scratch forest's predictions (y_pred from the cell above).
mse = mean_squared_error(y_test, y_pred)
# Project-specific metric (star-imported helper; definition not shown here).
dse = downside_square_error(y_test, y_pred)
print("Mean Squared Error:", mse)
print("Downside Squared Error:", dse)

Mean Squared Error: 1287.0119149993059
Downside Squared Error: 551.9467275574896


Using Random Forest Regressor from scikit-learn library to compare results with our implementation

In [17]:
# Import scikit-learn's forest under an alias so that it does not shadow the
# RandomForestRegressor class defined in this notebook.
from sklearn.ensemble import RandomForestRegressor as SklearnRandomForestRegressor

# Initialize the reference random forest with the same size and depth as the
# scratch model. random_state is fixed so the bootstrap samples, and therefore
# the reported metrics, are reproducible across runs.
sklearn_random_forest = SklearnRandomForestRegressor(
    n_estimators=100,
    max_depth=50,
    min_samples_split=2,
    random_state=42,
)

# Fit the random forest model to the training data
sklearn_random_forest.fit(X_train, y_train)

# Make predictions on the test data (kept separate from the scratch model's y_pred)
y_pred_sklearn = sklearn_random_forest.predict(X_test)


Evaluating the performance of the scikit-learn RandomForestRegressor

In [18]:
# Evaluate the scikit-learn forest's predictions (y_pred_sklearn) on the test targets.
mse_sklearn = mean_squared_error(y_test, y_pred_sklearn)
# Project-specific metric (star-imported helper; definition not shown here).
dse_sklearn = downside_square_error(y_test, y_pred_sklearn)
print("Mean Squared Error (scikit-learn RandomForestRegressor):", mse_sklearn)
print("Downside Squared Error (scikit-learn RandomForestRegressor):", dse_sklearn)

Mean Squared Error (scikit-learn RandomForestRegressor): 1277.8464548344277
Downside Squared Error (scikit-learn RandomForestRegressor): 547.3786408745048


## Evaluating on feature engineered dataset

In [19]:
from sklearn.tree import DecisionTreeRegressor

class RandomForestRegressor:
    """
    Bagging ensemble of regression trees whose predictions are averaged.

    Every tree is trained on a bootstrap resample (drawn with replacement and
    of the same size as the training set). The class expects a
    ``DecisionTreeRegressor`` with scikit-learn's constructor signature to be
    in scope when ``fit`` is called.
    """

    def __init__(self, n_estimators=10, max_depth=10, min_size=2, C=None, random_state=None):
        """
        Initialize the RandomForestRegressor.

        Parameters:
        - n_estimators (int): Number of decision trees in the forest.
        - max_depth (int): Maximum depth of each decision tree.
        - min_size (int): Minimum number of samples required to split a node
          (forwarded to each tree as ``min_samples_split``).
        - C (int or None): Number of features to consider when looking for the
          best split (forwarded to each tree as ``max_features``); None means
          all features.
        - random_state (int or None): Seed for the bootstrap sampling and the
          per-tree seeds. None (the default) keeps the previous behaviour of
          drawing from NumPy's global random state.
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_size = min_size
        self.C = C
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        """
        Fit the RandomForestRegressor to the training data.

        Any previously fitted trees are discarded, so calling ``fit`` twice
        does not grow the forest beyond ``n_estimators``.

        Parameters:
        - X (array-like): Training input samples.
        - y (array-like): Target values.

        Returns:
        - self: The fitted estimator.

        Raises:
        - ValueError: If X and y do not contain the same number of samples.
        """
        # Positional (NumPy) indexing is required for the bootstrap below.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(X)} and {len(y)}."
            )

        seeded = self.random_state is not None
        # Unseeded: keep using the global NumPy RNG exactly as before.
        rng = np.random.RandomState(self.random_state) if seeded else np.random
        n_samples = len(X)

        # Build into a fresh list so a refit replaces, rather than extends, the forest.
        trees = []
        for _ in range(self.n_estimators):
            indices = rng.choice(n_samples, n_samples, replace=True)  # Bootstrap sampling
            tree_seed = int(rng.randint(np.iinfo(np.int32).max)) if seeded else None
            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_size,
                max_features=self.C,
                random_state=tree_seed,
            )
            tree.fit(X[indices], y[indices])
            trees.append(tree)

        self.trees = trees
        return self

    def predict(self, X_test):
        """
        Predict target values for the test data.

        Parameters:
        - X_test (array-like): Test input samples.

        Returns:
        - predictions (array-like): Mean of the individual tree predictions,
          one value per test sample.

        Raises:
        - ValueError: If the forest holds no fitted trees.
        """
        if not self.trees:
            # Averaging over zero trees would silently yield NaN predictions.
            raise ValueError("RandomForestRegressor has no fitted trees; call fit() first.")

        # One column per tree, then average across columns for each sample.
        predictions = np.zeros((len(X_test), len(self.trees)))
        for i, tree in enumerate(self.trees):
            predictions[:, i] = tree.predict(X_test)
        return np.mean(predictions, axis=1)

In [21]:
# Load the dataset from load_dataset_features() and take the underlying arrays
# of both returned objects (assumes a (features, target) pair of pandas objects).
X, y = (part.values for part in load_dataset_features())

# Hold out 33% of the rows for testing; the fixed seed keeps the split
# identical every time this cell is run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [22]:
# Initialize the random forest regressor.
# RandomForestRegressor must be the class defined in this notebook here: it
# takes min_size and C, which scikit-learn's class does not accept.
random_forest = RandomForestRegressor(n_estimators=100, max_depth=50, min_size=2, C=None)

# Fit the random forest model to the training data
random_forest.fit(X_train, y_train)

# Make predictions on the test data; y_pred is scored by the next cell
y_pred = random_forest.predict(X_test)



Evaluating the performance of our RandomForestRegressor implementation

In [23]:
# Evaluate the scratch forest's predictions (y_pred from the cell above).
mse = mean_squared_error(y_test, y_pred)
# Project-specific metric (star-imported helper; definition not shown here).
dse = downside_square_error(y_test, y_pred)
print("Mean Squared Error:", mse)
print("Downside Squared Error:", dse)

Mean Squared Error: 343.535684297406
Downside Squared Error: 127.85881667533698


Using Random Forest Regressor from scikit-learn library to compare results with our implementation

In [24]:
# Import scikit-learn's forest under an alias so that it does not shadow the
# RandomForestRegressor class defined in this notebook.
from sklearn.ensemble import RandomForestRegressor as SklearnRandomForestRegressor

# Initialize the reference random forest with the same size and depth as the
# scratch model. random_state is fixed so the bootstrap samples, and therefore
# the reported metrics, are reproducible across runs.
sklearn_random_forest = SklearnRandomForestRegressor(
    n_estimators=100,
    max_depth=50,
    min_samples_split=2,
    random_state=42,
)

# Fit the random forest model to the training data
sklearn_random_forest.fit(X_train, y_train)

# Make predictions on the test data (kept separate from the scratch model's y_pred)
y_pred_sklearn = sklearn_random_forest.predict(X_test)


Evaluating the performance of the scikit-learn RandomForestRegressor

In [25]:
# Evaluate the scikit-learn forest's predictions (y_pred_sklearn) on the test targets.
mse_sklearn = mean_squared_error(y_test, y_pred_sklearn)
# Project-specific metric (star-imported helper; definition not shown here).
dse_sklearn = downside_square_error(y_test, y_pred_sklearn)
print("Mean Squared Error (scikit-learn RandomForestRegressor):", mse_sklearn)
print("Downside Squared Error (scikit-learn RandomForestRegressor):", dse_sklearn)

Mean Squared Error (scikit-learn RandomForestRegressor): 339.09980123109995
Downside Squared Error (scikit-learn RandomForestRegressor): 124.006240451173
