In [23]:
from sklearn.ensemble import IsolationForest
from scipy.stats import zscore
import pandas as pd
import re

In [24]:
def read_csv_file(filename):
    """Load a CSV file into a pandas DataFrame.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        The parsed DataFrame, or None when the file does not exist
        (a message naming the missing file is printed in that case).
    """
    try:
        frame = pd.read_csv(filename)
    except FileNotFoundError:
        # Report the missing file and signal the failure to the caller via None
        print(f"File {filename} not found.")
        frame = None
    return frame

In [25]:
# Read the CSV file. read_csv_file returns None when the file is missing, so
# stop here with a clear error instead of failing on the first column lookup.
df = read_csv_file('individual_monitoring.csv')
if df is None:
    raise FileNotFoundError("'individual_monitoring.csv' could not be loaded.")

# Trim the headers first so every later column lookup uses the clean name
df.columns = df.columns.str.strip()

# Remove all lines that contain the header. The .copy() yields an independent
# frame so the column assignments below do not write into a slice.
df = df[df['Timestamp'] != 'Timestamp'].copy()

# Reformat the values of the rows to make them exclusively numeric.
# regex=False makes the replacements literal on every pandas version.
df['CPU Load'] = (
    df['CPU Load']
    .str.replace('CPU Load: ', '', regex=False)
    .str.replace('%', '', regex=False)
    .astype(float)
)
df['Memory Load'] = (
    df['Memory Load']
    .str.replace('Memory Load: ', '', regex=False)
    .str.replace('GB', '', regex=False)
    .astype(float)
)
df['Elapsed Time'] = df['Elapsed Time'].str.replace('ms', '', regex=False).astype(float)

# Remove rows with CPU Load values of 100, 0, or -100
df = df[~df['CPU Load'].isin([100, 0, -100])].copy()

# Merge the columns URI, Method, and Pricing into a new column called Identifier
df['Identifier'] = df['URI'].str.strip() + '_' + df['Method'].str.strip() + '_' + df['Pricing'].str.strip()

# Remove the Timestamp, Request ID, URI, Method, and Pricing columns
df = df.drop(columns=['Timestamp', 'Request ID', 'URI', 'Method', 'Pricing'])

# Create a new dataframe for each possible identifier
dfs = [group for _, group in df.groupby('Identifier')]

# Columns used as features by both outlier detectors
feature_cols = ['CPU Load', 'Memory Load', 'Elapsed Time']

# Apply outlier detection to each dataframe
cleaned_dfs = []
for sub_df in dfs:
    # Apply Isolation Forest for outlier detection; fit_predict labels the
    # rows to keep with 1. Filtering with a mask avoids adding a temporary
    # column to a groupby slice.
    iso_forest = IsolationForest(contamination=0.05, random_state=42)
    inlier_mask = iso_forest.fit_predict(sub_df[feature_cols]) == 1
    sub_df = sub_df[inlier_mask]

    # Apply Z-score filtering for outlier detection. The absolute value makes
    # the filter two-sided. A column that is constant within the group yields
    # NaN z-scores; those are treated as 0 (no deviation) so the group is not
    # dropped entirely.
    z_scores = sub_df[feature_cols].apply(zscore)
    sub_df = sub_df[(z_scores.abs().fillna(0) < 3).all(axis=1)]

    cleaned_dfs.append(sub_df)

# Reform the dataframe; pd.concat raises on an empty list, so fall back to an
# empty frame with the same columns when there are no groups.
if cleaned_dfs:
    cleaned_df = pd.concat(cleaned_dfs, ignore_index=True)
else:
    cleaned_df = df.iloc[0:0].copy()

In [26]:
# Group by Identifier and calculate the mean of every remaining numeric column
# (CPU Load, Memory Load and Elapsed Time). The aggregation runs on cleaned_df
# so that the rows removed by the outlier detection stay out of the averages.
grouped_df = cleaned_df.groupby('Identifier').mean(numeric_only=True).reset_index()

In [27]:
# Persist the aggregated frame as CSV, leaving out the index column
grouped_df.to_csv(path_or_buf='processed_individual_monitoring.csv', index=False)

# Report that the export finished
print("The processed DataFrame has been saved to 'processed_individual_monitoring.csv'.")

The processed DataFrame has been saved to 'processed_individual_monitoring.csv'.
