# Decision Tree and Random Forest Implementation

> In this notebook we implement the Decision Tree and Random Forest algorithms from scratch and then compare them with their counterparts from the scikit-learn library.

In [None]:
import numpy as np
import pandas as pd
from error_define import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [None]:
# Load the dataset from the CSV file (path is relative to the notebook's working directory).
cleaned_df = pd.read_csv(filepath_or_buffer='cleaned_shifted_data.csv')

In [None]:

# Sample the dataset.
# Cap the sample at the number of available rows: DataFrame.sample(n=...)
# without replacement raises ValueError when n exceeds the frame length.
sample_size = min(1000, len(cleaned_df))
sampled_df = cleaned_df.sample(n=sample_size, random_state=11)

# Filter relevant columns (AQI constituents)
relevant_columns = ['PM2.5 (µg/m³)', 'PM10 (µg/m³)', 'NO (µg/m³)', 'NO2 (µg/m³)',
                    'NOx (ppb)', 'NH3 (µg/m³)', 'SO2 (µg/m³)', 'CO (mg/m³)',
                    'Ozone (µg/m³)']
X = sampled_df[relevant_columns].values
y = sampled_df['AQI_calculated_shifted'].values

# Split the data into training and testing sets (33% held out; fixed seed so
# every model in the notebook is evaluated on the same split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Scale the features. The scaler is fitted on the training split only and then
# applied to the test split, so no test-set statistics leak into training.
# NOTE: the model cells in this notebook are fitted on the unscaled
# X_train / X_test; the scaled arrays are only produced here.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Importing the scratch implementation of Decision Tree from dec_tree.py and using it for training and prediction.

In [None]:
from dec_tree import DecisionTreeRegressor  # from-scratch DecisionTreeRegressor implemented in dec_tree.py

# Build the from-scratch tree with its default hyperparameters.
tree = DecisionTreeRegressor()

# Fit the model on the (unscaled) training split.
tree.fit(X_train, y_train)
# Predict the test set; y_pred is read by the evaluation cell that follows.
y_pred = tree.predict(X_test)


Evaluating the performance of our from-scratch DecisionTreeRegressor implementation

In [None]:
# Score the predictions currently held in y_pred against the held-out targets.
# downside_square_error is not defined in this notebook; presumably it comes
# from the star import of error_define -- confirm.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
dse = downside_square_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"Downside Squared Error: {dse}")

Using library implementation of Decision Tree Regressor from scikit-learn library to compare with our results

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Build scikit-learn's decision tree regressor (default hyperparameters) and
# train it on the training split in one step; fit() returns the fitted
# estimator itself, so the result can be bound directly.
decision_tree_regressor = DecisionTreeRegressor().fit(X_train, y_train)

# Predict the held-out test split with the trained regressor.
# This overwrites the y_pred produced by the from-scratch tree.
y_pred = decision_tree_regressor.predict(X_test)

Evaluating Performance of Sklearn DecisionTreeRegressor

In [None]:
# Score the scikit-learn decision tree predictions on the test split.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
dse = downside_square_error(y_test, y_pred)
print(f"Mean Squared Error(scikit-learn): {mse}")
print(f"Downside Squared Error(scikit-learn): {dse}")

Implementation of Random Forest using scikit-learn's `DecisionTreeRegressor` as the base learner

In [None]:
from sklearn.tree import DecisionTreeRegressor

class RandomForestRegressor:
    """
    Bagging ensemble of regression trees whose predictions are averaged.

    Each tree is a `DecisionTreeRegressor` (the name bound in the enclosing
    namespace; the hyperparameters are forwarded using scikit-learn's keyword
    names) trained on a bootstrap resample of the training data.
    """

    def __init__(self, n_estimators=10, max_depth=10, min_size=2, C=None, random_state=None):
        """
        Initialize the RandomForestRegressor.

        Parameters:
        - n_estimators (int): Number of decision trees in the forest.
        - max_depth (int): Maximum depth of each decision tree.
        - min_size (int): Minimum number of samples required to split a node
          (forwarded to each tree as `min_samples_split`).
        - C (int or None): Number of features to consider when looking for the
          best split (forwarded to each tree as `max_features`); None means
          all features.
        - random_state (int or None): Seed for the bootstrap sampling and the
          per-tree seeds. None (default) draws from NumPy's global random
          state, exactly as an unseeded run would.
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_size = min_size
        self.C = C
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        """
        Fit the RandomForestRegressor to the training data.

        Any previously fitted trees are discarded, so calling fit() again
        retrains the forest instead of growing it.

        Parameters:
        - X (array-like): Training input samples, one row per sample.
        - y (array-like): Target values, one per row of X.

        Returns:
        - self: The fitted forest.

        Raises:
        - ValueError: If X is empty or X and y have different lengths.
        """
        # Convert up front so lists / DataFrames support integer-array indexing.
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(X)
        if n_samples == 0:
            raise ValueError("Cannot fit a random forest on an empty dataset.")
        if len(y) != n_samples:
            raise ValueError(
                f"X and y have inconsistent lengths: {n_samples} != {len(y)}"
            )

        # Without a seed keep using the global NumPy generator (so an outer
        # np.random.seed(...) still controls the run); with a seed use a
        # private generator so results are reproducible in isolation.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)

        trees = []
        for _ in range(self.n_estimators):
            # Derive a per-tree seed only when the forest itself is seeded.
            if self.random_state is None:
                tree_seed = None
            else:
                tree_seed = int(rng.randint(np.iinfo(np.int32).max))
            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_size,
                max_features=self.C,
                random_state=tree_seed,
            )
            indices = rng.choice(n_samples, n_samples, replace=True)  # Bootstrap sampling
            tree.fit(X[indices], y[indices])
            trees.append(tree)

        # Replace (not extend) the ensemble only after every tree has trained.
        self.trees = trees
        return self

    def predict(self, X_test):
        """
        Predict target values for the test data.

        Parameters:
        - X_test (array-like): Test input samples.

        Returns:
        - predictions (ndarray): Mean of the individual trees' predictions,
          one value per row of X_test.

        Raises:
        - RuntimeError: If the forest has not been fitted yet.
        """
        if not self.trees:
            raise RuntimeError(
                "This RandomForestRegressor is not fitted yet; call fit() before predict()."
            )
        X_test = np.asarray(X_test)
        # One column per tree; averaging across columns gives the forest output.
        predictions = np.zeros((len(X_test), len(self.trees)))
        for i, tree in enumerate(self.trees):
            predictions[:, i] = tree.predict(X_test)
        return np.mean(predictions, axis=1)

Using the implemented Random Forest model for training and prediction

In [None]:
# The forest's bootstrap resampling draws from NumPy's global random
# generator, so seed it here to make this cell's results reproducible
# across Restart & Run All.
np.random.seed(42)

# Initialize the random forest regressor
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, min_size=2, C=None)

# Fit the random forest model to the training data
random_forest.fit(X_train, y_train)

# Make predictions on the test data
y_pred = random_forest.predict(X_test)



Evaluating the performance of the RandomForestRegressor from our implementation

In [None]:
# Score the custom random forest's predictions on the held-out test split.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
dse = downside_square_error(y_test, y_pred)
print("Mean Squared Error:", mse)
print("Downside Squared Error:", dse)

Using Random Forest Regressor from scikit-learn library to compare results with our implementation

In [None]:
# Import under an alias: a plain `import RandomForestRegressor` would rebind the
# name of the custom RandomForestRegressor class defined earlier in this
# notebook, so re-running the custom-forest cell afterwards would silently
# construct the scikit-learn estimator (and fail on its `min_size` argument).
from sklearn.ensemble import RandomForestRegressor as SklearnRandomForestRegressor

# Initialize the random forest regressor with the same size/depth settings as
# the custom forest; a fixed random_state keeps the comparison reproducible.
sklearn_random_forest = SklearnRandomForestRegressor(
    n_estimators=100, max_depth=10, min_samples_split=2, random_state=42
)

# Fit the random forest model to the training data
sklearn_random_forest.fit(X_train, y_train)

# Make predictions on the test data
y_pred_sklearn = sklearn_random_forest.predict(X_test)


Evaluating the performance of the RandomForestRegressor from the scikit-learn library

In [None]:
# Score the scikit-learn random forest's predictions on the held-out test split.
mse_sklearn = mean_squared_error(y_true=y_test, y_pred=y_pred_sklearn)
dse_sklearn = downside_square_error(y_test, y_pred_sklearn)
print("Mean Squared Error (scikit-learn RandomForestRegressor):", mse_sklearn)
print("Downside Squared Error (scikit-learn RandomForestRegressor):", dse_sklearn)