# Random Forest Algorithm 

In [3]:
import pandas as pd

# Load the preprocessed train/test feature matrices and their label vectors.
# The directory is named once so all four reads stay in sync if it moves.
PROCESSED_DIR = "../data/processed"

X_train = pd.read_csv(f"{PROCESSED_DIR}/X_train_with_outliers_sel.csv")
X_test = pd.read_csv(f"{PROCESSED_DIR}/X_test_with_outliers_sel.csv")

# squeeze("columns") turns a single-column frame into a Series but never
# collapses the row axis; a bare squeeze() would return a scalar for a
# one-row file, which the estimator calls below could not consume.
y_train = pd.read_csv(f"{PROCESSED_DIR}/y_train.csv").squeeze("columns")
y_test = pd.read_csv(f"{PROCESSED_DIR}/y_test.csv").squeeze("columns")

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline: a forest with the library's default hyperparameters; the seed
# is fixed so repeated runs build the same trees.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X=X_train, y=y_train)

# Evaluate the baseline on the test split.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy of the Random Forest model: {accuracy}")

Accuracy of the Random Forest model: 0.7727272727272727


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values tried for each hyperparameter (3 x 4 = 12 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 20, None],
)

# Exhaustive search scored by accuracy under 5-fold cross-validation,
# with n_jobs=-1 to parallelize the candidate fits.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the winning estimator and report what the search selected.
best_rf_model = grid_search.best_estimator_
print(f'Best parameters found: {grid_search.best_params_}')
print(f'Best cross-validated accuracy: {grid_search.best_score_}')

Best parameters found: {'max_depth': 5, 'n_estimators': 200}
Best cross-validated accuracy: 0.7459816073570572


In [None]:
# Score the tuned model on the test split.
y_best_pred = best_rf_model.predict(X_test)
best_accuracy = accuracy_score(y_true=y_test, y_pred=y_best_pred)
print(f"Accuracy of the optimized Random Forest model: {best_accuracy}")

Accuracy of the optimized Random Forest model: 0.7857


In [10]:
from pickle import dump

# Save the best model to a file for future use.
# The context manager guarantees the handle is flushed and closed even if
# pickling raises; passing a bare open(...) to dump() left it dangling.
with open("../src/ENC/optimized_random_forest_model.sav", "wb") as model_file:
    dump(best_rf_model, model_file)
print("Optimized Random Forest model saved successfully.")

Optimized Random Forest model saved successfully.
