# In [None]:
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# ----------------------------
# User-defined Configuration
# ----------------------------

# Seed passed to the row shuffle and to the RandomForest (optional).
# Set to None to get a different, non-reproducible shuffle on every run.
random_state = 42

# Base name of the training-data CSV file; ".csv" is appended when the
# file is read, so do not include the extension here.
filename = "trainDataFile"

# Base name under which the trained model is stored; ".pkl" is appended
# when the file is written, so do not include the extension here.
model_name = "ExampleModelName"


# ----------------------------
# Data Loading and Preprocessing
# ----------------------------

# Read the CSV file and fill missing values while the rows are still in file
# order. ffill() copies the previous row's value into each gap, but it cannot
# fill gaps in the first row(s) because nothing precedes them; bfill() is
# chained so those leading NaNs are filled from the next valid row instead of
# leaking into the training data.
data = pd.read_csv(f"{filename}.csv")
data = data.ffill().bfill()

# Shuffle all rows (frac=1) and renumber the index; a fixed random_state
# makes the shuffled order reproducible.
data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

# Define predictor and target columns: predictors are "xy1" ... "xy150".
PREDICTORS = [f"xy{i}" for i in range(1, 151)]
TARGET = "preLoad"

# Scale the predictor columns in place. The fitted scaler is kept because it
# is stored alongside the model later in this script.
scaler = StandardScaler()
data[PREDICTORS] = scaler.fit_transform(data[PREDICTORS])

# Define Predictors and Targets.
X = data[PREDICTORS]
y = data[TARGET]

# ----------------------------
# RandomForest Model and Grid Search
# ----------------------------

# Base estimator whose hyperparameters are tuned below.
rf = RandomForestRegressor(random_state=random_state)

# Candidate values per hyperparameter: 3 x 3 x 3 = 27 combinations.
param_grid = dict(
    n_estimators=[100, 1000, 5000],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 4, 6],
)

# Exhaustive search over the grid, scored by negative MSE with 3-fold
# cross-validation; n_jobs=-1 requests all available processors.
grid_search_rf = GridSearchCV(
    rf,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

print("Starting Grid Search for RandomForest hyperparameters...")
grid_search_rf.fit(X, y)

# Keep a handle on the best estimator found by the search.
best_rf = grid_search_rf.best_estimator_

# The reported score is the negated MSE, so values closer to zero are better.
print("\nBest parameters for RandomForest:", grid_search_rf.best_params_)
print("Best CV MSE (negated):", grid_search_rf.best_score_)

# ----------------------------
# Save the Best Model
# ----------------------------

# Bundle the tuned model with the fitted scaler and the column names it was
# trained on, so all of them travel together in a single pickle file.
model_data = dict(
    model=best_rf,
    scaler=scaler,
    predictors=PREDICTORS,
    target=TARGET,
)

# Write the bundle to "<model_name>.pkl" in binary mode.
output_path = f"{model_name}.pkl"
with open(output_path, "wb") as pkl_file:
    pickle.dump(model_data, pkl_file)

print(f"\nBest RandomForest model saved to '{model_name}.pkl'.")
