In [124]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.tree import DecisionTreeRegressor

In [125]:
# Location of the cleaned NSO population CSV, relative to the notebook's working directory
dataset_path = (
    './NSO_Population_Sex_dataset/'
    'NSO_POPULATION_DATA_CLEANED.csv'
)

In [126]:
# Read the whole CSV into a DataFrame using pandas' default parsing options
df = pd.read_csv(filepath_or_buffer=dataset_path)

In [127]:
# Columns are selected by position, not by name: the first three and the last
# two columns are the features, the column at position 3 is the target.
target_pos = 3
n_cols = df.shape[1]
feature_pos = [0, 1, 2, n_cols - 2, n_cols - 1]

# With fewer than six columns the "last two" positions would overlap the target
# (label leakage) or repeat one of the first three features, so fail loudly
# instead of training on a silently wrong feature matrix.
assert n_cols >= 6, (
    f"expected at least 6 columns (3 leading features, target, 2 trailing features), got {n_cols}"
)

X = df.iloc[:, feature_pos]
y = df.iloc[:, target_pos]

In [128]:
# Sanity check: display the dimensions of the feature matrix and the target vector
tuple(part.shape for part in (X, y))

((192, 5), (192,))

In [129]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The split is seeded so that
# Restart & Run All reproduces the same partition (and therefore the same metrics).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [130]:
from sklearn.ensemble import RandomForestRegressor

# Reference implementation with library defaults.
# Docs: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# random_state fixes the bootstrap sampling so the fitted forest is reproducible.
regressor = RandomForestRegressor(random_state=42)

In [131]:
# Train the scikit-learn forest on the training partition
regressor.fit(X=X_train, y=y_train)

In [132]:
# Predict the target for the held-out rows with the fitted forest
prediction = regressor.predict(X=X_test)

In [133]:
# Score the scikit-learn forest on the test partition: absolute and squared error
mae, mse = (
    metric(y_test, prediction)
    for metric in (mean_absolute_error, mean_squared_error)
)

print(
    f"Mean Absolute Error: {mae}",
    f"Mean Squared Error: {mse}",
    sep="\n",
)

Mean Absolute Error: 0.006682624082336712
Mean Squared Error: 0.00020596128886278466


In [134]:
# Collector for the hand-built ensemble: one array of test-set predictions per tree
tree_predictions = list()

# How many decision trees the hand-built ensemble trains
n_trees = 100

In [135]:
# Building the Random Forest by hand: n_trees decision trees, each fitted on a
# bootstrap resample of the training rows, each predicting the full test set.

# (Re)create the collector in this cell so that re-running it never stacks a
# second batch of predictions on top of an earlier run.
tree_predictions = []

# Seeded generator: bootstrap draws and per-tree seeds are reproducible across runs.
rng = np.random.default_rng(42)

# Loop-invariant array views, extracted once instead of on every iteration.
X_train_arr = X_train.values
y_train_arr = y_train.values
X_test_arr = X_test.values
n_train = len(X_train_arr)

for _ in range(n_trees):
    # Randomly sample row positions with replacement (bootstrap)
    indices = rng.integers(0, n_train, size=n_train)
    X_bootstrap = X_train_arr[indices]
    y_bootstrap = y_train_arr[indices]

    # Create and train a decision tree; seed it from the generator so that
    # any internal tie-breaking is deterministic as well
    tree = DecisionTreeRegressor(random_state=int(rng.integers(0, 2**31 - 1)))
    tree.fit(X_bootstrap, y_bootstrap)

    # Make predictions on the test set and keep them for later averaging
    tree_pred = tree.predict(X_test_arr)
    tree_predictions.append(tree_pred)

In [136]:
# Ensemble output: element-wise average over the per-tree prediction arrays
# (axis 0 runs over trees, so the result has one value per test row)
rf_predictions = np.asarray(tree_predictions).mean(axis=0)

In [137]:
# Score the hand-built ensemble on the same test partition.
# Note: this rebinds `mae` / `mse`, replacing the values computed for the scikit-learn forest.
mse = mean_squared_error(y_test, rf_predictions)
mae = mean_absolute_error(y_test, rf_predictions)

for label, value in (("Mean Absolute Error", mae), ("Mean Squared Error", mse)):
    print(f"{label}: {value}")

Mean Absolute Error: 0.006631928123447219
Mean Squared Error: 0.00016175899591449794
