In [19]:
# Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

In [20]:
# Load the datasets
# One CSV per server, read in the same order as the variables they are bound to.
csv_paths = (
    '../datasets/MacBookPro1.csv',
    '../datasets/MacBookPro2.csv',
    '../datasets/RaspberryPi.csv',
    '../datasets/VM.csv',
)
df_macbook_pro1, df_macbook_pro2, df_raspberry_pi, df_vm = (
    pd.read_csv(csv_path) for csv_path in csv_paths
)

In [21]:
# Pre-processing steps
datasets = [df_macbook_pro1, df_macbook_pro2, df_raspberry_pi, df_vm]
server_names = ['MacBookPro1', 'MacBookPro2', 'RaspberryPi', 'VM']

# Every frame is cleaned in place and on its own, so the fill value and the
# outlier fences below are derived separately for each server.
# NOTE(review): 'Execution Time' is later used as the regression target, and it
# is imputed and capped here before the train/test split -- confirm that
# evaluating on already-cleaned targets is intended.
for server_name, frame in zip(server_names, datasets):
    # Parse 'Time' so the .dt accessor can be used further down
    frame['Time'] = pd.to_datetime(frame['Time'])

    # Fill missing execution times with this frame's median
    median_imputer = SimpleImputer(strategy='median')
    frame[['Execution Time']] = median_imputer.fit_transform(frame[['Execution Time']])

    # Cap (rather than drop) values lying more than 1.5 * IQR outside the quartiles
    q25, q75 = frame['Execution Time'].quantile([0.25, 0.75])
    fence_width = 1.5 * (q75 - q25)
    frame['Execution Time'] = frame['Execution Time'].clip(
        lower=q25 - fence_width,
        upper=q75 + fence_width,
    )

    # Calendar features derived from the parsed timestamp
    frame['Hour'] = frame['Time'].dt.hour
    frame['DayOfWeek'] = frame['Time'].dt.dayofweek

    # Tag each row with the server it was recorded on
    frame['ServerType'] = server_name

In [22]:
# Combine datasets
# Stack the per-server frames under a fresh 0..n-1 index, one-hot encode
# 'ServerType' keeping every level, then discard the 'Time' column.
df_combined = pd.get_dummies(
    pd.concat(datasets, ignore_index=True),
    columns=['ServerType'],
    drop_first=False,
).drop(columns='Time')

In [23]:
# Prepare features and target variable
# y is the 'Execution Time' column; X is every other column of df_combined.
target_column = 'Execution Time'
y = df_combined[target_column]
X = df_combined.drop(columns=[target_column])

In [24]:
# Split the data into training and testing sets
# 20% of the rows are held out for testing; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Initialize and train the model
# Hyperparameters are gathered in one mapping so they are easy to spot and tune.
forest_params = {
    'n_estimators': 100,
    'random_state': 42,
}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

In [26]:
# Predict and evaluate
# Score the held-out rows with each metric in turn; RMSE is derived from MSE.
predictions = model.predict(X_test)
mae, mse, r2 = (
    metric(y_test, predictions)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
rmse = np.sqrt(mse)

In [27]:
# Report the evaluation metrics, one per line
print(
    f'Mean Absolute Error: {mae}',
    f'Mean Squared Error: {mse}',
    f'Root Mean Squared Error: {rmse}',
    f'R2 Score: {r2}',
    sep='\n',
)

Mean Absolute Error: 0.03493515830265272
Mean Squared Error: 0.0027879717477100417
Root Mean Squared Error: 0.052801247596151
R2 Score: 0.9744019772279322
