In [1]:
import pandas as pd
import re
from sklearn.ensemble import IsolationForest
from scipy.stats import zscore

In [2]:
def read_csv_file(filename):
    
    try:
        # Read the CSV file into DataFrame
        df = pd.read_csv(filename)
        return df
    except FileNotFoundError:
        print(f"File {filename} not found.")
        return None

In [3]:
# Read the CSV file
df = pd.read_csv('collective_monitoring.csv')

# Remove all lines that contain the header
df = df[df['Timestamp'] != 'Timestamp']

# Trim the headers
df.columns = df.columns.str.strip()

# Reformat the values of the rows to make them exclusively numeric
df['CPU Load'] = df['CPU Load'].str.replace('CPU Load: ', '').str.replace('%', '').astype(float)
df['Memory Load'] = df['Memory Load'].str.replace('Memory Load: ', '').str.replace('GB', '').astype(float)
df['Elapsed Time'] = df['Elapsed Time'].str.replace('ms', '').astype(float)

print(df.shape[0])

# Remove rows with CPU Load values of 100, 0, or -100
df = df[(df['CPU Load'] != 0) & (df['CPU Load'] != -100)]

# Remove the RequestId column
df = df.drop(columns=['Request ID'])

# Merge the columns URI, Method, and Pricing into a new column called Identifier
df['Identifier'] = df['URI'].str.strip() + '_' + df['Method'].str.strip() + '_' + df['Pricing'].str.strip()

print(df.shape[0])

# Apply Isolation Forest for outlier detection
iso_forest = IsolationForest(contamination=0.05, random_state=42)
outliers = iso_forest.fit_predict(df[['CPU Load', 'Memory Load', 'Elapsed Time']])
df = df[outliers == 1]

# Apply Z-score filtering for outlier detection
z_scores = df[['CPU Load', 'Memory Load', 'Elapsed Time']].apply(zscore)
df = df[(z_scores < 3).all(axis=1)]

print(df.shape[0])

40267
20423
18875


In [4]:
# Write the new DataFrame to a CSV file
df.to_csv('processed_collective_monitoring.csv', index=False)

# Display a message indicating completion
print("The processed DataFrame has been saved to 'processed_collective_monitoring.csv'.")

The processed DataFrame has been saved to 'processed_collective_monitoring.csv'.
