In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

In [7]:
def preprocess_data():
    # Read the individual and collective monitoring CSV files
    individual_df = pd.read_csv('processed_individual_monitoring.csv')
    collective_df = pd.read_csv('processed_collective_monitoring.csv')

    # Merge the DataFrames on the Identifier column
    merged_df = pd.merge(collective_df, individual_df, on='Identifier', suffixes=('', '_Individual'))
    
    # Calculate the deviation in Elapsed Time
    merged_df['Elapsed Time Deviation'] = ((merged_df['Elapsed Time'] - merged_df['Elapsed Time_Individual']) / merged_df['Elapsed Time_Individual']) * 100
    
    # Remove the categorical columns and the Elapsed Time_Individual column
    numerical_df = merged_df.drop(columns=['URI', 'Method', 'Pricing', 'Identifier', 'Elapsed Time_Individual'])

    return numerical_df

In [8]:
def train_and_export_model(target_column, model_type='linear', k_best_features=5):
    numerical_df = preprocess_data()

    # Group by Timestamp and aggregate the individual numerical values
    aggregated_df = numerical_df.groupby('Timestamp').agg(['sum', 'mean', 'max', 'min'])

    # Flatten the MultiIndex columns
    aggregated_df.columns = ['_'.join(col).strip() for col in aggregated_df.columns.values]

    # Count the number of rows for each Timestamp and add it as a new attribute
    row_counts = numerical_df.groupby('Timestamp').size()
    aggregated_df['row_count'] = row_counts

    # Drop the Timestamp column
    aggregated_df = aggregated_df.reset_index(drop=True)

    # Ensure the target column is not dropped
    if target_column not in aggregated_df.columns:
        raise ValueError(f"Target column '{target_column}' not found in the data.")

    # Select all columns except those that include '_Individual' and the target_column
    columns_to_drop = [col for col in aggregated_df.columns if '_Individual' not in col and col != target_column and 'Elapsed Time' not in col]
    filtered_df = aggregated_df.drop(columns=columns_to_drop)

    # Separate features and target
    X = filtered_df.drop(columns=[target_column])
    y = filtered_df[target_column]

    # Select the feature selection method based on the model type
    if model_type == 'linear':
        selector = SelectKBest(score_func=f_regression, k=k_best_features)
    elif model_type in ['random_forest', 'svm', 'gradient_boosting', 'knn', 'decision_tree']:
        selector = SelectKBest(score_func=mutual_info_regression, k=k_best_features)
    else:
        raise ValueError(f"Model type '{model_type}' is not supported.")

    # Feature selection
    X_new = selector.fit_transform(X, y)

    # Get the selected feature names
    selected_features = X.columns[selector.get_support()]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

    # Select the model based on the parameter
    if model_type == 'linear':
        model = LinearRegression()
    elif model_type == 'random_forest':
        model = RandomForestRegressor(random_state=42)
    elif model_type == 'svm':
        model = SVR()
    elif model_type == 'gradient_boosting':
        model = GradientBoostingRegressor(random_state=42)
    elif model_type == 'knn':
        model = KNeighborsRegressor()
    elif model_type == 'decision_tree':
        model = DecisionTreeRegressor(random_state=42)
    else:
        raise ValueError(f"Model type '{model_type}' is not supported.")

    # Train the model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Display the metrics
    print("------------------------------------------------------------")
    print(f"Model Performance for {target_column} using {model_type}:")
    print(f"{'Metric':<20}{'Value':<20}")
    print(f"{'-'*40}")
    print(f"{'Mean Squared Error (MSE)':<20}{mse:<20.4f}")
    print(f"{'Mean Absolute Error (MAE)':<20}{mae:<20.4f}")
    print(f"{'Root Mean Squared Error (RMSE)':<20}{rmse:<20.4f}")
    print(f"{'R-squared (R2)':<20}{r2:<20.4f}")

    # Display the selected features
    print(f'Selected Features for {target_column}: {selected_features}')

    # Export the feature selector and the model
    joblib.dump(selector, f'feature_selector_{target_column}_{model_type}.joblib')
    joblib.dump(model, f'{model_type}_model_{target_column}.joblib')

    print(f"Feature selector and model for {target_column} using {model_type} have been saved.")

In [11]:
train_and_export_model('CPU Load_mean', model_type='linear', k_best_features=5)
train_and_export_model('CPU Load_mean', model_type='random_forest', k_best_features=5)
train_and_export_model('CPU Load_mean', model_type='svm', k_best_features=5)
train_and_export_model('CPU Load_mean', model_type='gradient_boosting', k_best_features=5)
train_and_export_model('CPU Load_mean', model_type='knn', k_best_features=5)
train_and_export_model('CPU Load_mean', model_type='decision_tree', k_best_features=5)
train_and_export_model('Memory Load_mean', model_type='linear', k_best_features=5)
train_and_export_model('Memory Load_mean', model_type='random_forest', k_best_features=5)
train_and_export_model('Memory Load_mean', model_type='svm', k_best_features=5)
train_and_export_model('Memory Load_mean', model_type='gradient_boosting', k_best_features=5)
train_and_export_model('Memory Load_mean', model_type='knn', k_best_features=5)
train_and_export_model('Memory Load_mean', model_type='decision_tree', k_best_features=5)

------------------------------------------------------------
Model Performance for CPU Load_mean using linear:
Metric              Value               
----------------------------------------
Mean Squared Error (MSE)477.4620            
Mean Absolute Error (MAE)16.4361             
Root Mean Squared Error (RMSE)21.8509             
R-squared (R2)      0.3965              
Selected Features for CPU Load_mean: Index(['CPU Load_Individual_sum', 'Memory Load_Individual_sum',
       'Elapsed Time Deviation_mean', 'Elapsed Time Deviation_max',
       'Elapsed Time Deviation_min'],
      dtype='object')
Feature selector and model for CPU Load_mean using linear have been saved.
------------------------------------------------------------
Model Performance for CPU Load_mean using random_forest:
Metric              Value               
----------------------------------------
Mean Squared Error (MSE)240.0403            
Mean Absolute Error (MAE)9.1408              
Root Mean Squared Error (RMSE