In [121]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.tree import DecisionTreeRegressor

In [122]:
# Relative path to the input CSV; it is resolved against the notebook's
# working directory, so the notebook must be run from the folder that
# contains `NSO_Population_Sex_dataset/`.
dataset_path = './NSO_Population_Sex_dataset/NSO_POPULATION_DATA_CLEANED.csv'

In [123]:
# Load the whole CSV into a DataFrame using pandas' default parsing options.
df = pd.read_csv(dataset_path)

In [124]:
# Features: the first three columns plus the last two (selected by position).
# Target: the column at position 3.
# NOTE(review): assumes the frame has at least six columns, so the trailing
# pair never overlaps the target column -- TODO confirm against the CSV schema.
X = df.iloc[:, [0, 1, 2, -2, -1]]
y = df.iloc[:, 3]

In [125]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. The fixed seed keeps the partition
# identical across kernel restarts, so the metrics reported below can be
# reproduced and the two forests are always compared on the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [126]:
from sklearn.ensemble import RandomForestRegressor
# Baseline model with default hyperparameters.
# API reference: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# The fixed seed makes the bootstrap samples, and therefore the reported
# metrics, reproducible from run to run.
regressor = RandomForestRegressor(random_state=42)

In [127]:
# Train the scikit-learn forest on the training split only.
regressor.fit(X_train, y_train)

In [128]:
# Predict the target for the held-out test rows with the scikit-learn forest.
prediction = regressor.predict(X_test)

In [129]:
# Score the scikit-learn forest on the held-out test split.
mae = mean_absolute_error(y_test, prediction)
mse = mean_squared_error(y_test, prediction)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")

Mean Absolute Error: 0.01025814236259967
Mean Squared Error: 0.0004177557796820712


In [130]:
class RandomForestRegressorSKLearnDecisionTreeRegressor:
    """Bagged ensemble of scikit-learn ``DecisionTreeRegressor`` models.

    Every tree is trained on a bootstrap sample of the training rows (drawn
    with replacement, same size as the training set). A prediction is the
    unweighted mean of the per-tree predictions. No ``max_features`` value is
    passed to the trees, so they use scikit-learn's default for that setting.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of trees to train.
    max_depth : int or None, default=None
        Forwarded to each ``DecisionTreeRegressor``.
    min_samples_split : int, default=2
        Forwarded to each ``DecisionTreeRegressor``.
    min_samples_leaf : int, default=1
        Forwarded to each ``DecisionTreeRegressor``.
    random_state : int or None, default=None
        Seed for the bootstrap draws and for the individual trees. With
        ``None`` the global NumPy random state is used, so results can still
        be fixed externally with ``np.random.seed``.

    Attributes
    ----------
    trees : list
        The fitted trees; empty until ``fit`` has been called.
    """

    def __init__(self, n_estimators=100, max_depth=None, min_samples_split=2,
                 min_samples_leaf=1, random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on its own bootstrap sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features; a DataFrame, ndarray or nested list.
        y : array-like of shape (n_samples,)
            Training targets.

        Notes
        -----
        Calling ``fit`` again discards the previously trained trees.
        """
        # np.asarray accepts pandas objects as well as plain arrays/lists,
        # unlike `.values`, which only exists on pandas containers.
        features = np.asarray(X)
        targets = np.asarray(y)
        n_samples = len(features)

        seeded = self.random_state is not None
        # Without a seed keep drawing from the global NumPy RNG, so notebooks
        # that rely on np.random.seed(...) behave exactly as before.
        rng = np.random.RandomState(self.random_state) if seeded else np.random

        # Start from an empty ensemble: refitting must replace the trees, not
        # append to the ones left over from an earlier call.
        self.trees = []
        for _ in range(self.n_estimators):
            tree = DecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                # Give each tree its own derived seed only when the forest is
                # seeded; otherwise leave scikit-learn's default untouched.
                random_state=rng.randint(np.iinfo(np.int32).max) if seeded else None,
            )

            # Bootstrap sampling: n_samples row indices drawn with replacement.
            indices = rng.choice(n_samples, n_samples, replace=True)

            tree.fit(features[indices], targets[indices])
            self.trees.append(tree)

    def predict(self, X):
        """Return the mean of the per-tree predictions for each row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)

        Raises
        ------
        ValueError
            If the forest holds no trained trees (``fit`` was not called).
        """
        if not self.trees:
            # Averaging over zero trees would silently yield NaNs.
            raise ValueError("This forest has no trained trees; call fit() before predict().")

        features = np.asarray(X)
        # One column per tree, one row per sample.
        per_tree = np.column_stack([tree.predict(features) for tree in self.trees])
        return per_tree.mean(axis=1)

In [131]:
# Create and train the custom random forest.
# Seed NumPy's global RNG first: the bootstrap draws inside fit() come from
# np.random, so without a seed the trained ensemble (and its metrics) would
# differ on every run.
np.random.seed(42)
random_forest = RandomForestRegressorSKLearnDecisionTreeRegressor(n_estimators=100, max_depth=None)
random_forest.fit(X_train, y_train)

In [132]:
# Make predictions on the test set with the custom forest
# (each value is the mean of the individual trees' predictions).
predictions = random_forest.predict(X_test)

In [133]:
# Score the custom forest on the same held-out test split.
# NOTE: this rebinds `mae` and `mse`, replacing the values computed for the
# scikit-learn forest earlier in the notebook.
mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")

Mean Absolute Error: 0.01030374186590192
Mean Squared Error: 0.0004156843626537644
