In [3]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import os

# --------------------------
# Data Loading
# --------------------------
def load_and_merge_data(file_paths: dict, aggregate: bool = False) -> pd.DataFrame:
    """Load the VM inventory and per-cloud metrics CSVs and left-join them.

    Args:
        file_paths: Mapping of a label to a CSV path. The ``'inventory'``
            entry is read as the inventory file; every other entry is read
            as a metrics file.
        aggregate: When true, the five metric columns are averaged per
            ``InstanceName`` before merging (the aggregation drops ``Date``
            and any other metrics columns). When false, every metrics row is
            kept as-is.

    Returns:
        A DataFrame with the inventory columns ``InstanceName``, ``Size``,
        ``HourlyUSD``, ``MonthlyUSD`` and a derived ``CloudProvider`` column,
        left-merged with the metrics on ``InstanceName``. Inventory rows
        without matching metrics are kept with NaN metric values.

    Raises:
        ValueError: If ``file_paths`` has no entry other than ``'inventory'``.
    """
    instance_rename = {'VM_Name': 'InstanceName'}
    metrics_column_mapping = {
        'Percentage CPU (Average)': 'CPU Utilization (Average)',
        'Network In Total (Sum)': 'Network In (Sum)',
        'Network Out Total (Sum)': 'Network Out (Sum)',
        'Disk Read Bytes (Sum)': 'Disk Read Bytes (Sum)',
        'Disk Write Bytes (Sum)': 'Disk Write Bytes (Sum)',
    }

    # rename() silently ignores keys that are not present, so no column
    # existence check is needed.
    inventory_df = pd.read_csv(file_paths['inventory']).rename(columns=instance_rename)

    # Explicit copy: adding a column to a column-subset selection otherwise
    # risks SettingWithCopyWarning.
    keep_cols = ['InstanceName', 'Size', 'HourlyUSD', 'MonthlyUSD']
    inventory_df = inventory_df[keep_cols].copy()

    # The provider is the text before the first '-' in the instance name.
    # The vectorized .str accessor propagates NaN for missing names instead
    # of raising AttributeError like a per-row str.split() would.
    inventory_df['CloudProvider'] = (
        inventory_df['InstanceName'].str.split('-').str[0].str.upper()
    )

    metrics_frames = [
        pd.read_csv(path).rename(columns={**instance_rename, **metrics_column_mapping})
        for cloud_name, path in file_paths.items()
        if cloud_name != 'inventory'
    ]
    if not metrics_frames:
        raise ValueError(
            "file_paths must contain at least one metrics CSV besides 'inventory'"
        )

    all_metrics_df = pd.concat(metrics_frames, ignore_index=True)
    all_metrics_df['Date'] = pd.to_datetime(all_metrics_df['Date'], format='%d-%m-%Y')

    if aggregate:
        metric_means = {column: 'mean' for column in metrics_column_mapping.values()}
        metrics_df = all_metrics_df.groupby('InstanceName').agg(metric_means).reset_index()
    else:
        metrics_df = all_metrics_df

    return pd.merge(inventory_df, metrics_df, on='InstanceName', how='left')

# --------------------------
# Normalization with Filtering
# --------------------------
def normalize_data(df, save_path=None):
    """Filter, impute and min-max scale the metric and price columns.

    Steps:
        1. Drop every row that has an exact 0 in any of the columns listed in
           ``metrics_to_normalize``.
        2. Replace remaining NaNs in those columns with the column mean.
        3. Fit a ``MinMaxScaler`` (default settings) on those columns and
           store the result in new ``Normalized_<column>`` columns.

    Args:
        df: DataFrame containing the metric and price columns, e.g. the
            output of ``load_and_merge_data``. It is not modified.
        save_path: Optional CSV path; when truthy, the result is written
            there without the index and a summary line is printed.

    Returns:
        A tuple ``(normalized_df, scaler)``. ``normalized_df`` holds
        ``CloudProvider``, ``InstanceName`` and ``Size`` (whichever of them
        exist in the input) followed by the ``Normalized_*`` columns;
        ``scaler`` is the fitted ``MinMaxScaler``.
    """
    metrics_to_normalize = [
        'CPU Utilization (Average)',
        'Network In (Sum)',
        'Network Out (Sum)',
        'Disk Read Bytes (Sum)',
        'Disk Write Bytes (Sum)',
        'HourlyUSD',
        'MonthlyUSD'
    ]

    # Remove rows with 0 in ANY of the important metric columns.
    # The explicit copy makes the result an independent frame, so the
    # imputation below neither touches the caller's data nor depends on
    # chained-assignment behavior.
    nonzero_rows = (df[metrics_to_normalize] != 0).all(axis=1)
    filtered_df = df[nonzero_rows].copy()

    # NaN != 0 evaluates to True, so rows with missing values survive the
    # filter and must be imputed here. Assign the filled column back rather
    # than calling fillna(inplace=True) on a column selection: under pandas
    # Copy-on-Write the in-place form only modifies a temporary and leaves
    # the NaNs in place.
    for col in metrics_to_normalize:
        if filtered_df[col].isnull().any():
            filtered_df[col] = filtered_df[col].fillna(filtered_df[col].mean())

    # Scale
    scaler = MinMaxScaler()
    normalized_data = scaler.fit_transform(filtered_df[metrics_to_normalize])
    normalized_metrics_df = pd.DataFrame(
        normalized_data,
        columns=[f'Normalized_{col}' for col in metrics_to_normalize],
        index=filtered_df.index  # keep row alignment for the concat below
    )
    normalized_df = pd.concat([filtered_df, normalized_metrics_df], axis=1)

    # Keep only the identifying columns (if present) and the scaled values.
    keep_final = ['CloudProvider', 'InstanceName', 'Size'] + [
        col for col in normalized_df.columns if col.startswith("Normalized_")
    ]
    keep_final = [c for c in keep_final if c in normalized_df.columns]
    normalized_df = normalized_df[keep_final]

    if save_path:
        normalized_df.to_csv(save_path, index=False)
        print(f"Normalized dataset saved to: {save_path} with {len(normalized_df)} records")

    return normalized_df, scaler

# --------------------------
# Main
# --------------------------
def test_program():
    """Build and save the daily (non-aggregated) normalized VM dataset.

    Reads the inventory and the three per-cloud metrics CSVs from the current
    working directory. If any of them is missing, prints an error and returns
    without doing anything else.
    """
    file_paths = dict(
        inventory='multi_cloud_vm_inventory.csv',
        aws='aws_vm_metrics_daywise.csv',
        azure='azure_vm_metrics_daywise.csv',
        gcp='gcp_vm_metrics_daywise.csv',
    )

    # Guard clause: every input file must exist before any loading starts.
    if any(not os.path.exists(csv_path) for csv_path in file_paths.values()):
        print("Error: One or more CSV files not found.")
        return

    merged_daily = load_and_merge_data(file_paths, aggregate=False)
    normalize_data(merged_daily, save_path="normalized_vm_data_daily.csv")

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    test_program()


Normalized dataset saved to: normalized_vm_data_daily.csv with 704 records
