In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from google.colab import drive
import pickle
drive.mount('/content/drive')

# Change directory to the location of the file
%cd /content/drive/Shareddrives/ACS6403 - Group Project/Deliverable 5 Final Project Report/Prediction Model/

input_data_1 = '/content/drive/Shareddrives/ACS6403 - Group Project/Preprocessed Data/input_data.txt'  # Update the path
output_data_1 = '/content/drive/Shareddrives/ACS6403 - Group Project/Preprocessed Data/output_data.txt'  # Update the path

# Load the input and output data
input_Data = pd.read_csv(input_data_1, sep=',')
input_Data.columns = ['Input 1', 'Input 2', 'Input 3']

output_Data = pd.read_csv(output_data_1, sep='\s+')
output_Data.columns = ['Output 1', 'Output 2', 'Output 3', 'Output 4', 'Output 5', 'Output 6', 'Output 7']

# Use one MinMaxScaler per table. Re-fitting a single scaler on the outputs
# overwrites the ranges learned from the inputs, which leaves no fitted object
# able to scale new inputs the same way.
# NOTE(review): both scalers are fitted on the complete tables, i.e. before the
# train/test split performed in the loop below, so test rows influence the
# learned ranges.
input_scaler = MinMaxScaler()
output_scaler = MinMaxScaler()

# Backward-compatible alias: `scaler` previously ended up fitted on the outputs.
scaler = output_scaler

# Scale input and output data column-wise, keeping the original column labels
input_Data_normalized = pd.DataFrame(input_scaler.fit_transform(input_Data), columns=input_Data.columns)
output_Data_normalized = pd.DataFrame(output_scaler.fit_transform(output_Data), columns=output_Data.columns)

# Per-output metric histories (one entry is appended per output column)
mse_list, r_squared_list, mae_list = [], [], []

# Hyperparameter candidates explored by the grid search
grid = dict(
    n_estimators=[100, 150],       # Number of trees in the forest
    max_depth=[8, 10],             # Maximum depth of the trees
    min_samples_split=[50, 100],   # Minimum number of samples required to split an internal node
    min_samples_leaf=[40],         # Minimum number of samples required to be at a leaf node
)

# Tune and score one Random Forest per normalized output column
for column in output_Data_normalized.columns:
    dependent = output_Data_normalized[column]   # target: the current output column
    independent = input_Data_normalized          # features: every input column

    # Hold out 20% of the rows for testing, with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        independent,
        dependent,
        test_size=0.2,
        random_state=1,
    )

    # Seeded base model whose hyperparameters are searched over `grid`
    rf_regressor = RandomForestRegressor(random_state=1)

    # 5-fold cross-validated search scored by negative mean absolute error
    grid_search = GridSearchCV(
        estimator=rf_regressor,
        param_grid=grid,
        scoring='neg_mean_absolute_error',
        cv=5,
    )
    grid_search.fit(X_train, y_train)

    # Predict the held-out rows with the winning configuration
    best_estimator = grid_search.best_estimator_
    predictions = best_estimator.predict(X_test)

    # Compute the three test-set metrics
    mse = mean_squared_error(y_test, predictions)
    r_squared = r2_score(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)

    # Record them in the per-output histories
    mse_list.append(mse)
    r_squared_list.append(r_squared)
    mae_list.append(mae)

    # Report them for this output
    print("Mean Squared Error for", column, ":", mse)
    print("R-squared for", column, ":", r_squared)
    print("Mean Absolute Error for", column, ":", mae)

    print('___________________________________________________________________________________')

# Only the estimator selected for the final output column is shown here
print("Best Estimator:", grid_search.best_estimator_)



Mounted at /content/drive
/content/drive/Shareddrives/ACS6403 - Group Project/Deliverable 5 Final Project Report/Prediction Model
Mean Squared Error for Output 1 : 1.3942611939728644e-05
R-squared for Output 1 : 0.9992336544429474
Mean Absolute Error for Output 1 : 0.002526347286784918
___________________________________________________________________________________
Mean Squared Error for Output 2 : 5.612698246830249e-06
R-squared for Output 2 : 0.9999273383952085
Mean Absolute Error for Output 2 : 0.0018425381521966404
___________________________________________________________________________________
Mean Squared Error for Output 3 : 1.0589465969691455e-05
R-squared for Output 3 : 0.99942145787602
Mean Absolute Error for Output 3 : 0.0021333571983395824
___________________________________________________________________________________
Mean Squared Error for Output 4 : 1.4531308851778308e-06
R-squared for Output 4 : 0.9999556797827455
Mean Absolute Error for Output 4 : 0.0003958975

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from joblib import dump

# Relies on input_Data and output_Data having been loaded as DataFrames by an
# earlier cell; this cell does not load them itself.

# Normalize input and output data with one MinMaxScaler per table. Re-fitting
# a single scaler on the outputs overwrites the ranges learned from the
# inputs, which leaves no fitted object able to scale new inputs the same way.
# NOTE(review): both scalers are fitted on the complete tables, i.e. before the
# train/test split performed in the loop below, so test rows influence the
# learned ranges.
input_scaler = MinMaxScaler()
output_scaler = MinMaxScaler()

# Backward-compatible alias: `scaler` previously ended up fitted on the outputs.
scaler = output_scaler

input_Data_normalized = pd.DataFrame(input_scaler.fit_transform(input_Data), columns=input_Data.columns)
output_Data_normalized = pd.DataFrame(output_scaler.fit_transform(output_Data), columns=output_Data.columns)

# Per-output metric histories (one entry is appended per output column)
mse_list, r_squared_list, mae_list = [], [], []

# Tuned model kept for each output column, in column order
best_estimators = list()

# Hyperparameter candidates explored by the grid search
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 10],
    min_samples_leaf=[1, 4],
)

# Tune, evaluate and persist one Random Forest per normalized output column
for column in output_Data_normalized.columns:
    dependent = output_Data_normalized[column]
    independent = input_Data_normalized

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(independent, dependent, test_size=0.2, random_state=1)

    # Initialize GridSearchCV
    grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=1),
                               param_grid=param_grid,
                               cv=3,  # Number of cross-validation folds
                               scoring='neg_mean_squared_error',
                               verbose=1)  # Show progress

    # Execute the grid search
    grid_search.fit(X_train, y_train)

    # Get the best estimator. GridSearchCV refits the winning configuration on
    # all of X_train by default (refit=True), so best_estimator_ is already
    # trained; fitting it again would only repeat that work.
    best_rf = grid_search.best_estimator_
    best_estimators.append(best_rf)

    # Predict on the test data using the trained model
    predictions = best_rf.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, predictions)
    mse_list.append(mse)
    r_squared = r2_score(y_test, predictions)
    r_squared_list.append(r_squared)
    mae = mean_absolute_error(y_test, predictions)
    mae_list.append(mae)

    print(f'Metrics for {column}: MSE={mse}, R2={r_squared}, MAE={mae}')

    # Save the best model for each output
    model_filename = f'/content/drive/Shareddrives/ACS6403 - Group Project/Deliverable 5 Final Project Report/Ensemble Model _ Final/randomforest_model_for_output_{column}.joblib'
    dump(best_rf, model_filename)
    # Pass the name itself: `{column}` outside an f-string is a set literal and
    # printed as {'Output 1'} rather than Output 1.
    print('Check 1 ', column)

# Print summary of best estimators
print("Best Estimators:", best_estimators)


Fitting 3 folds for each of 24 candidates, totalling 72 fits
Metrics for Output 1: MSE=2.9229793442421506e-08, R2=0.9999983934055947, MAE=0.00010458884271570335
Check 1  {'Output 1'}
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Metrics for Output 2: MSE=9.178566911902912e-08, R2=0.9999988117490516, MAE=0.0002163600846068118
Check 1  {'Output 2'}
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Metrics for Output 3: MSE=1.4242505352806597e-08, R2=0.9999992218786744, MAE=7.562668263347847e-05
Check 1  {'Output 3'}
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Metrics for Output 4: MSE=1.529679811949544e-09, R2=0.999999953345055, MAE=1.1648301422195244e-05
Check 1  {'Output 4'}
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Metrics for Output 5: MSE=6.861717424869329e-07, R2=0.9999404651566574, MAE=0.00022068806062426022
Check 1  {'Output 5'}
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Metrics for Output 6: MSE=5.69

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive. The recorded output of this cell shows
# Colab reporting the drive as already mounted when it was run.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
