In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

input_data_1 = '/content/drive/Shareddrives/ACS6403 - Group Project/Preprocessed Data/input_data.txt'  # Update the path
output_data_1 = '/content/drive/Shareddrives/ACS6403 - Group Project/Preprocessed Data/output_data.txt'  # Update the path

# Load the input and output data
input_Data = pd.read_csv(input_data_1, sep=',')
input_Data.columns = ['Input 1', 'Input 2', 'Input 3']

output_Data = pd.read_csv(output_data_1, sep='\s+')
output_Data.columns = ['Output 1', 'Output 2', 'Output 3', 'Output 4', 'Output 5', 'Output 6', 'Output 7']

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Scale input and output data
input_Data_normalized = scaler.fit_transform(input_Data)
input_Data_normalized = pd.DataFrame(input_Data_normalized, columns=input_Data.columns)
output_Data_normalized = scaler.fit_transform(output_Data)
output_Data_normalized = pd.DataFrame(output_Data_normalized, columns=output_Data.columns)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(input_Data_normalized, output_Data_normalized, test_size=0.2, random_state=42)

# Define the multi-output linear regression model
model = MultiOutputRegressor(LinearRegression())

# Define hyperparameters grid for tuning
param_grid = {
    'estimator__copy_X': [True, False],
    'estimator__fit_intercept': [True, False],
    'estimator__positive': [True, False]
}

# Define GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

# Perform grid search to find the best estimator
grid_search.fit(X_train, y_train)

# Get the best estimator
best_model = grid_search.best_estimator_

# Make predictions using the best estimator
y_pred = best_model.predict(X_test)

# Evaluate the best model
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (Best Model):", mae)
print("Best Hyperparameters:", grid_search.best_params_)


Mean Absolute Error (Best Model): 0.03914355372487015
Best Hyperparameters: {'estimator__copy_X': True, 'estimator__fit_intercept': True, 'estimator__positive': False}


In [3]:
# Calculate MAE for each output variable
mae_per_output = {}
for i in range(y_test.shape[1]):
    mae = mean_absolute_error(y_test.iloc[:, i], y_pred[:, i])
    mae_per_output[y_test.columns[i]] = mae

# Print MAE for each output variable
for output, mae in mae_per_output.items():
    print(f"Mean Absolute Error for {output}: {mae}")

Mean Absolute Error for Output 1: 0.025979658661468195
Mean Absolute Error for Output 2: 0.017577140882257473
Mean Absolute Error for Output 3: 0.024950497516051634
Mean Absolute Error for Output 4: 0.09814491260686858
Mean Absolute Error for Output 5: 0.05567454163080941
Mean Absolute Error for Output 6: 0.048091084136872395
Mean Absolute Error for Output 7: 0.0035870406397632616


In [4]:
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from joblib import dump

# Load the input and output data
input_Data = pd.read_csv(input_data_1, sep=',')
input_Data.columns = ['Input 1', 'Input 2', 'Input 3']

output_Data = pd.read_csv(output_data_1, sep='\s+')
output_Data.columns = ['Output 1', 'Output 2', 'Output 3', 'Output 4', 'Output 5', 'Output 6', 'Output 7']

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Scale input and output data
input_Data_normalized = scaler.fit_transform(input_Data)
input_Data_normalized = pd.DataFrame(input_Data_normalized, columns=input_Data.columns)
output_Data_normalized = scaler.fit_transform(output_Data)
output_Data_normalized = pd.DataFrame(output_Data_normalized, columns=output_Data.columns)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(input_Data_normalized, output_Data_normalized, test_size=0.2, random_state=42)

# Define the multi-output linear regression model
model = MultiOutputRegressor(LinearRegression())

# Define hyperparameters grid for tuning
param_grid = {
    'estimator__copy_X': [True, False],
    'estimator__fit_intercept': [True, False],
    'estimator__positive': [True, False]
}

# Define GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

# Perform grid search to find the best estimator
grid_search.fit(X_train, y_train)

# Get the best estimator
best_model = grid_search.best_estimator_

# Save each individual model
for i, estimator in enumerate(best_model.estimators_):
    dump(estimator, f'/content/drive/Shareddrives/ACS6403 - Group Project/Deliverable 5 Final Project Report/Ensemble Model _ Final/poly_model_for_output_{i+1}.joblib')
    print(f"Model for Output {i+1} saved.")

# Make predictions using the best estimator
y_pred = best_model.predict(X_test)

# Evaluate the best model
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (Best Model):", mae)
print("Best Hyperparameters:", grid_search.best_params_)

Model for Output 1 saved.
Model for Output 2 saved.
Model for Output 3 saved.
Model for Output 4 saved.
Model for Output 5 saved.
Model for Output 6 saved.
Model for Output 7 saved.
Mean Absolute Error (Best Model): 0.03914355372487015
Best Hyperparameters: {'estimator__copy_X': True, 'estimator__fit_intercept': True, 'estimator__positive': False}
