# 1

In [None]:
import pandas as pd

# Load the raw request metrics CSV.
file_path = "data/istio_request_2.2.csv"
df = pd.read_csv(file_path)

# Missing gRPC statuses are replaced by 0. The result is assigned back instead of
# calling fillna(inplace=True) on a column selection: that chained in-place form
# is deprecated and has no effect on the frame under pandas Copy-on-Write.
df['grpc_response_status'] = df['grpc_response_status'].fillna(0)

# Convert the response flags to stripped strings so they compare cleanly with '-'.
df['response_flags'] = df['response_flags'].astype(str).str.strip()

# A row is a 'success' only when response_code is 200, grpc_response_status is 0
# and response_flags is '-'; every other row is an 'error'.
# Boolean masks replace the row-wise apply(axis=1): same labels, no Python-level
# loop, and it also works on an empty frame.
is_success = (
    (df['response_code'] == 200)
    & (df['grpc_response_status'] == 0)
    & (df['response_flags'] == '-')
)
df['result'] = is_success.map({True: 'success', False: 'error'})

# Order the rows by source/destination pair and then by timestamp, so that all
# samples of one pair are contiguous and chronological. The next processing
# stage relies on this ordering to compute differences between consecutive rows.
df_sorted = df.sort_values(by=['source_workload', 'destination_workload', 'timestamp'])

# Save the labelled, sorted table.
df_sorted.to_csv("results sahra2/aggregated_istio_data.csv", index=False)




In [None]:

import pandas as pd

def _add_request_deltas(frame):
    """Return a copy of ``frame`` with per-interval deltas and latency columns.

    For every (source_workload, destination_workload) pair the cumulative
    columns ``total_request``, ``istio_request_bytes_sum`` and
    ``istio_request_duration_milliseconds_sum`` are differenced row to row.
    The first row of each pair has no predecessor, so its delta is set to 0.
    When ``new_request`` is 0 the byte and duration deltas are forced to 0 as
    well, and ``latency`` (duration delta / request delta) is 0 for those rows.

    Args:
        frame: rows already sorted chronologically within each pair.

    Returns:
        A new DataFrame with the columns ``new_request``,
        ``new_istio_request_bytes``, ``new_istio_request_duration_milliseconds``
        and ``latency`` appended; ``frame`` itself is not modified.
    """
    enriched = frame.copy()
    counters = ['total_request', 'istio_request_bytes_sum', 'istio_request_duration_milliseconds_sum']
    deltas = enriched.groupby(['source_workload', 'destination_workload'])[counters].diff().fillna(0)

    enriched['new_request'] = deltas['total_request']
    enriched['new_istio_request_bytes'] = deltas['istio_request_bytes_sum']
    enriched['new_istio_request_duration_milliseconds'] = deltas['istio_request_duration_milliseconds_sum']

    # No new requests in the interval: bytes and duration are recorded as zero too.
    no_new_requests = enriched['new_request'] == 0
    enriched.loc[no_new_requests, ['new_istio_request_bytes', 'new_istio_request_duration_milliseconds']] = 0

    # 0 / 0 yields NaN for intervals without requests; report those as latency 0.
    # The filled Series is assigned back (no chained in-place fillna, which is
    # deprecated and ineffective under pandas Copy-on-Write).
    enriched['latency'] = (
        enriched['new_istio_request_duration_milliseconds'] / enriched['new_request']
    ).fillna(0)
    return enriched


# Load the labelled table produced by the previous stage.
file_path = "results sahra2/aggregated_istio_data.csv"
df = pd.read_csv(file_path)

# Parse timestamps so that sorting is chronological.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Sort before splitting so every subset stays ordered within each pair.
df = df.sort_values(by=['source_workload', 'destination_workload', 'timestamp'])

# Successful requests only.
df_success = _add_request_deltas(df[df['result'] == 'success'])
df_success.to_csv("results sahra2/success_istio_data.csv", index=False)

# Split the errors by protocol.
is_error = df['result'] == 'error'
df_http_errors = df[is_error & (df['request_protocol'] == 'http')].copy()
df_grpc_errors = df[is_error & (df['request_protocol'] == 'grpc')].copy()

error_files = []  # DataFrames of every error category written to disk
error_keys = ['request_protocol', 'response_code', 'grpc_response_status', 'response_flags']

# HTTP errors: one file per (protocol, response code, gRPC status, flags) group.
# The gRPC status is part of the grouping but not of the HTTP file name.
http_groups = df_http_errors.groupby(error_keys)
for (request_protocol, response_code, grpc_status, response_flags), df_error in http_groups:
    df_error = _add_request_deltas(df_error)
    file_name = f"results sahra2/error_{request_protocol}_{response_code}_{response_flags}.csv"
    df_error.to_csv(file_name, index=False)
    error_files.append(df_error)

# gRPC errors: same processing, with the gRPC status included in the file name.
grpc_groups = df_grpc_errors.groupby(error_keys)
for (request_protocol, response_code, grpc_status, response_flags), df_error in grpc_groups:
    df_error = _add_request_deltas(df_error)
    file_name = f"results sahra2/error_{request_protocol}_{response_code}_{grpc_status}_{response_flags}.csv"
    df_error.to_csv(file_name, index=False)
    error_files.append(df_error)

# Merge successes and all error categories back into one chronological table.
df_final = pd.concat([df_success] + error_files).sort_values(by=['source_workload', 'destination_workload', 'timestamp'])

# Save the merged table.
df_final.to_csv("results sahra2/new_request_istio_data.csv", index=False)

print("Traitement termin√©. Fichier sauvegard√© sous 'new_request_istio_data.csv'.")

# 2

In [None]:

import pandas as pd 

# Read the per-interval request deltas written by the previous stage.
file_path = "results sahra2/new_request_istio_data.csv"
df = pd.read_csv(file_path)

# Timestamps are parsed so grouping and ordering are chronological.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values(by=['source_workload', 'destination_workload', 'timestamp'])

# Column order of the output table.
output_columns = [
    'timestamp', 'source_workload', 'destination_workload', 'total_request',
    'new_request', 'success_count', 'error_count', 'success_rate', 'error_rate',
    'duration_success_request', 'duration_error_request', 'average_latency',
    'new_istio_request_bytes_success', 'new_istio_request_bytes_error',
    'istio_request_bytes',
]

# One output row per (source, destination, timestamp): request counts, success
# and error rates, summed latencies and request bytes split by result.
grouped = df.groupby(['source_workload', 'destination_workload', 'timestamp'])
aggregated_rows = []

for (src, dst, ts), group in grouped:
    succeeded = group['result'] == 'success'
    failed = group['result'] == 'error'

    total_new_request = group['new_request'].sum()
    success_count = group.loc[succeeded, 'new_request'].sum()
    error_count = total_new_request - success_count

    # Rates are undefined (NaN) when the interval carried no new requests.
    success_rate = success_count / total_new_request if total_new_request > 0 else float('nan')
    error_rate = 1 - success_rate

    # Summed latency of successful and failed rows; 'average_latency' is their sum.
    duration_success_request = group.loc[succeeded, 'latency'].sum()
    duration_error_request = group.loc[failed, 'latency'].sum()
    average_latency = duration_success_request + duration_error_request

    # Request bytes split by result, plus their total.
    new_istio_request_bytes_success = group.loc[succeeded, 'new_istio_request_bytes'].sum()
    new_istio_request_bytes_error = group.loc[failed, 'new_istio_request_bytes'].sum()
    istio_request_bytes = new_istio_request_bytes_success + new_istio_request_bytes_error

    aggregated_rows.append({
        'timestamp': ts,
        'source_workload': src,
        'destination_workload': dst,
        'total_request': group['total_request'].max(),
        'new_request': total_new_request,
        'success_count': success_count,
        'error_count': error_count,
        'success_rate': success_rate,
        'error_rate': error_rate,
        'duration_success_request': duration_success_request,
        'duration_error_request': duration_error_request,
        'average_latency': average_latency,
        'new_istio_request_bytes_success': new_istio_request_bytes_success,
        'new_istio_request_bytes_error': new_istio_request_bytes_error,
        'istio_request_bytes': istio_request_bytes,
    })

# Assemble the final table with a fixed column order.
df_final = pd.DataFrame(aggregated_rows, columns=output_columns)

# Write it out.
output_file = "results sahra2/aggregated_istio_rates.csv"
df_final.to_csv(output_file, index=False)

print(f"Traitement termin√©. Fichier sauvegard√© sous {output_file}.")  # "Processing finished. File saved as ..."


# 3

In [None]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the per-timestamp rates table.
file_path = "results sahra2/aggregated_istio_rates.csv"
df = pd.read_csv(file_path)

# Parse timestamps so they can be resampled.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Window labels written to the 'time_window' column of the output. The labels are
# kept unchanged because other cells select rows by them.
time_windows = ['15S', '30S', '1min', '5min', '10min']

# Length of every window in seconds. Resampling uses a Timedelta built from this
# value instead of the label itself, because the upper-case 'S' frequency alias
# is deprecated in recent pandas releases.
window_seconds = {'15S': 15, '30S': 30, '1min': 60, '5min': 300, '10min': 600}

# One resampled frame per window size.
kpi_results = []

for window in time_windows:
    window_length = pd.Timedelta(seconds=window_seconds[window])
    seconds = window_length.total_seconds()

    # NOTE(review): success_rate / error_rate are unweighted means of the
    # per-timestamp rates, not success_count / new_request — confirm intent.
    df_resampled = (df
        .groupby(['source_workload', 'destination_workload'])  # per workload pair
        .resample(window_length, on='timestamp', label='right', closed='right')  # label bins by their end
        .agg({
            'total_request': 'max',
            'new_request': 'sum',
            'success_count': 'sum',
            'error_count': 'sum',
            'success_rate': 'mean',
            'error_rate': 'mean',
            'average_latency': 'sum',
            'istio_request_bytes': 'sum'
        })
        .reset_index()
    )

    # Throughput: request bytes per second of window.
    df_resampled['throughput'] = df_resampled['istio_request_bytes'] / seconds

    # Request rate: new requests per second of window.
    df_resampled['request_rate'] = df_resampled['new_request'] / seconds

    df_resampled['time_window'] = window
    kpi_results.append(df_resampled)

# Stack the results of all window sizes.
df_final = pd.concat(kpi_results)

# Drop the rows stamped with the hard-coded starting point "2025-03-10 16:09:00".
starting_point = pd.Timestamp("2025-03-10 16:09:00")
df_final = df_final[df_final['timestamp'] != starting_point]

# Save to CSV.
df_final.to_csv("results sahra2/kiali_kpi_metrics.csv", index=False)



# 4

In [None]:
import pandas as pd
import numpy as np

def _weighted_latency_percentiles(values, frequencies):
    """Return the p50/p90/p95/p99 of ``values`` weighted by ``frequencies``.

    Every value is repeated ``frequencies[i]`` times before the percentiles are
    taken. The counts come from a CSV and are floats, while ``np.repeat`` only
    accepts integer repeat counts, so they are rounded to integers here. Pairs
    with a NaN value, a NaN count or a non-positive count are ignored.

    Args:
        values: sequence of latency values.
        frequencies: sequence of request counts, same length as ``values``.

    Returns:
        A dict with keys ``p50_latency``, ``p90_latency``, ``p95_latency`` and
        ``p99_latency``, or ``None`` when no sample remains.
    """
    latencies = np.asarray(values, dtype=float)
    counts = np.asarray(frequencies, dtype=float)

    usable = ~np.isnan(latencies) & ~np.isnan(counts) & (counts > 0)
    if not usable.any():
        return None

    repeats = np.rint(counts[usable]).astype(int)
    samples = np.repeat(latencies[usable], repeats)
    if samples.size == 0:
        return None

    p50, p90, p95, p99 = np.percentile(samples, [50, 90, 95, 99])
    return {
        'p50_latency': p50,
        'p90_latency': p90,
        'p95_latency': p95,
        'p99_latency': p99,
    }


# Load the KPI table.
df = pd.read_csv("results sahra2/kiali_kpi_metrics.csv")

# Parse timestamps so they can be resampled.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# For every KPI window label, the interval over which its rows are collected.
interval_mapping = {
    '15S': '1min',  # 15 seconds -> 1 minute
    '30S': '2min',  # 30 seconds -> 2 minutes
    '1min': '4min',  # 1 minute -> 4 minutes
    '5min': '10min',  # 5 minutes -> 10 minutes
    '10min': '10min'  # 10 minutes -> 10 minutes
}

# Metric columns collected into lists for each interval.
list_columns = [
    col for col in df.columns
    if col not in ['timestamp', 'source_workload', 'destination_workload', 'time_window']
]

latency_results = []

for window, interval in interval_mapping.items():
    df_filtered = df[df['time_window'] == window].copy()
    if df_filtered.empty:
        print(f"No data for time window: {window}")
        continue

    df_filtered = df_filtered.set_index('timestamp')

    try:
        # Per workload pair and interval, gather every metric into a list.
        df_grouped = (df_filtered.groupby(['source_workload', 'destination_workload'])
                      .resample(interval)
                      .agg({col: list for col in list_columns})
                      .reset_index())
    except Exception as e:
        print(f"Error during resampling for window {window}: {e}")
        continue

    for idx, row in df_grouped.iterrows():
        values = row.get('average_latency', [])
        frequencies = row.get('new_request', [])

        # Skip intervals without data or with mismatched list lengths.
        if not isinstance(values, list) or not isinstance(frequencies, list):
            continue
        if not values or not frequencies or len(values) != len(frequencies):
            continue

        try:
            percentiles = _weighted_latency_percentiles(values, frequencies)
        except Exception as e:
            print(f"Error calculating percentiles for row {idx} in window {window}: {e}")
            continue
        if percentiles is None:
            continue

        result = {**row.to_dict(), **percentiles, 'time_window': window}
        latency_results.append(result)

# Build the final table.
df_latency = pd.DataFrame(latency_results)

# Save to CSV.
df_latency.to_csv("results sahra2/kiali_latency_percentiles.csv", index=False)

print("Traitement termin√©. Fichier sauvegard√© sous kiali_latency_percentiles.csv.")  # "Processing finished. File saved as ..."


# 5

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Read the KPI table and parse its timestamp column into datetimes.
file_path = "results sahra2/kiali_kpi_metrics.csv"
data = pd.read_csv(file_path).assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

# Colour coding of graph edges by success rate.
def get_edge_color(success_rate):
    """Return the edge colour for a success rate.

    "green" when the rate is strictly above 0.95, "yellow" when it is strictly
    above 0.80, and "red" otherwise. A value for which both comparisons are
    false (for example NaN) therefore maps to "red".
    """
    for lower_bound, colour in ((0.95, "green"), (0.80, "yellow")):
        if success_rate > lower_bound:
            return colour
    return "red"

# Time range covered by the data and the window sizes to plot.
start_date = data['timestamp'].min()
end_date = data['timestamp'].max()
window_sizes = ["1T", "5T", "10T", "30T"]

# Frequency string used for each window label. The labels still appear in the
# plot titles; the 'T' (minute) alias itself is deprecated in recent pandas
# releases, so date_range receives the equivalent 'min' spelling.
window_frequencies = {"1T": "1min", "5T": "5min", "10T": "10min", "30T": "30min"}

# Draw one call graph per time window and window size.
for window_size in window_sizes:
    date_generated = pd.date_range(start_date, end_date, freq=window_frequencies[window_size])
    for start, end in zip(date_generated[:-1], date_generated[1:]):
        # Rows whose timestamp falls in [start, end).
        window_data = data[(data['timestamp'] >= start) & (data['timestamp'] < end)]
        if window_data.empty:
            # Nothing to draw: skip instead of rendering an empty figure.
            continue

        # Directed graph: one edge per source -> destination pair.
        # NOTE(review): the KPI file mixes rows of several 'time_window'
        # aggregations; add_edge overwrites the attributes of an existing edge,
        # so the last row seen for a pair wins — confirm whether a single
        # time_window should be selected first.
        G = nx.DiGraph()
        for row in window_data.itertuples(index=False):
            # add_edge also creates both endpoint nodes.
            G.add_edge(
                row.source_workload,
                row.destination_workload,
                weight=row.new_request,
                color=get_edge_color(row.success_rate),
                success_count=row.success_count,
            )

        edge_colors = [G.edges[edge]['color'] for edge in G.edges]

        # Plot the graph with a fixed layout seed so node positions are repeatable.
        fig = plt.figure(figsize=(12, 10))
        pos = nx.spring_layout(G, seed=70, k=20)
        nx.draw(G, pos, with_labels=True, node_color="lightblue", edge_color=edge_colors,
                width=2, alpha=0.7, node_size=2800, font_size=8)

        # Label every edge with its success count.
        edge_labels = {(u, v): f"{d['success_count']}" for u, v, d in G.edges(data=True)}
        nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=7)

        plt.title(f"Graph from {start} to {end} (Window: {window_size})")
        plt.show()
        # Release the figure; many figures are created in this loop.
        plt.close(fig)


# 6

### 1ª (completa)
#### MODELOS: GCN, GAT, SAGE
#### VALIDACIÓN CRUZADA: ✅ Sí
#### MODULARIDAD: ✅ Alta
#### COMPLEJIDAD: 🔥 Alta
#### IDEAL PARA: Comparar modelos, evaluar


In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv, SAGEConv
from torch_geometric.data import Data
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
import warnings

# Load the latency percentile table.
data = pd.read_csv("results sahra2/kiali_latency_percentiles.csv")
data['timestamp'] = pd.to_datetime(data['timestamp'])


def _list_cell_mean(cell):
    """Reduce one CSV cell to a float.

    The percentile file is written from list-valued aggregations, so several
    metric columns arrive as strings such as "[0.0, 12.5, nan]". Such a string
    is split on commas and averaged, ignoring NaN entries. Any other value is
    converted with ``float``. Cells that cannot be parsed, and empty or all-NaN
    lists, yield NaN.
    """
    if isinstance(cell, str):
        tokens = [tok.strip() for tok in cell.strip().strip("[]()").split(",")]
        try:
            numbers = np.array([float(tok) for tok in tokens if tok], dtype=float)
        except ValueError:
            return np.nan
        numbers = numbers[~np.isnan(numbers)]
        return float(numbers.mean()) if numbers.size else np.nan
    try:
        return float(cell)
    except (TypeError, ValueError):
        return np.nan


# Feature columns. pd.to_numeric(errors='coerce') would turn every list-encoded
# column into NaN (and the column would then be dropped below), so each cell is
# reduced to its mean instead.
selected_columns = [
    "p50_latency", "p90_latency", "p95_latency", "p99_latency",
    "istio_request_bytes", "success_rate", "throughput", "average_latency"
]
for col in selected_columns:
    data[col] = data[col].map(_list_cell_mean)

# One feature row per source workload: the mean of every metric.
grouped = data.groupby('source_workload')[selected_columns].mean()
grouped = grouped.dropna(axis=1, how='all')  # drop columns with no value at all
# Remaining gaps are filled with the column mean so no NaN reaches the model.
grouped = grouped.fillna(grouped.mean())

# Standardise the features.
scaler = StandardScaler()
node_features_tensor = torch.tensor(scaler.fit_transform(grouped.values), dtype=torch.float)

# Node name -> row index in the feature matrix.
node_mapping = {node: idx for idx, node in enumerate(grouped.index)}

# Edge list: one edge per distinct (source, destination) pair whose endpoints
# both have a feature row. Duplicates are removed because the table repeats each
# pair once per timestamp and time window.
edge_df = data[['source_workload', 'destination_workload']].drop_duplicates()
edge_df = edge_df[
    edge_df['source_workload'].isin(grouped.index)
    & edge_df['destination_workload'].isin(grouped.index)
]
edge_df = edge_df.assign(
    source_workload=edge_df['source_workload'].map(node_mapping),
    destination_workload=edge_df['destination_workload'].map(node_mapping),
)
edge_index_tensor = torch.tensor(edge_df.to_numpy(dtype='int64').T, dtype=torch.long)

# Anomaly labels: a node is labelled 1 when its mean p99 latency is above the
# 0.95 quantile of the per-node means.
anomaly_series = data.groupby('source_workload')["p99_latency"].mean()
anomaly_series = anomaly_series.loc[grouped.index]
threshold = anomaly_series.quantile(0.95)
anomaly_labels = (anomaly_series > threshold).astype(int)
anomaly_labels_tensor = torch.tensor(anomaly_labels.values, dtype=torch.float)

# PyG graph object (the name `data` is reused here on purpose: the training
# function defined in this cell reads the graph from `data`).
data = Data(x=node_features_tensor, edge_index=edge_index_tensor)

# Model factory
def get_model(model_type, in_channels, hidden_channels, out_channels):
    """Build a two-layer GNN of the requested kind.

    Args:
        model_type: "GCN", "GAT" or "GraphSAGE".
        in_channels: size of the input node features.
        hidden_channels: output size of the first layer.
        out_channels: output size of the second layer.

    Returns:
        A freshly constructed ``torch.nn.Module`` whose forward pass applies the
        first layer, a ReLU, then the second layer.

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
    """
    # (name, layer constructor) pairs; GAT layers use 4 heads, not concatenated.
    layer_builders = (
        ("GCN", lambda n_in, n_out: GCNConv(n_in, n_out)),
        ("GAT", lambda n_in, n_out: GATConv(n_in, n_out, heads=4, concat=False)),
        ("GraphSAGE", lambda n_in, n_out: SAGEConv(n_in, n_out)),
    )

    class GNNModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            build_layer = next(
                (builder for name, builder in layer_builders if model_type == name),
                None,
            )
            if build_layer is None:
                raise ValueError("Modelo no soportado")
            self.conv1 = build_layer(in_channels, hidden_channels)
            self.conv2 = build_layer(hidden_channels, out_channels)

        def forward(self, x, edge_index):
            hidden = self.conv1(x, edge_index).relu()
            return self.conv2(hidden, edge_index)

    return GNNModel()

# Training and evaluation
def train_and_evaluate(model_type, train_idx, test_idx, epochs=100, patience=10):
    """Train one model on the training nodes and score it on the test nodes.

    The model is built with ``get_model`` (hidden size 32, one output) and
    trained full-batch on the module-level graph ``data`` against
    ``anomaly_labels_tensor`` with binary cross-entropy on logits. Training
    stops early once the training loss has not improved for ``patience``
    consecutive epochs.

    Args:
        model_type: model name understood by ``get_model``.
        train_idx: index tensor of the training nodes.
        test_idx: index tensor of the evaluation nodes.
        epochs: maximum number of epochs.
        patience: epochs without improvement tolerated before stopping.

    Returns:
        Tuple ``(f1, accuracy, auc)``; ``auc`` is NaN when ``roc_auc_score``
        raises ``ValueError``.
    """
    model = get_model(model_type, data.x.size(1), 32, 1)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

    best_loss = float("inf")
    stale_epochs = 0

    for _ in range(epochs):
        model.train()
        optimizer.zero_grad()
        logits = model(data.x, data.edge_index).squeeze()
        loss = F.binary_cross_entropy_with_logits(logits[train_idx], anomaly_labels_tensor[train_idx])
        loss.backward()
        optimizer.step()

        current_loss = loss.item()
        if current_loss < best_loss:
            best_loss = current_loss
            stale_epochs = 0
            continue

        stale_epochs += 1
        if stale_epochs >= patience:
            break

    model.eval()
    with torch.no_grad():
        logits = model(data.x, data.edge_index).squeeze()
        probs = torch.sigmoid(logits[test_idx])

    # Predictions use a 0.5 probability cut-off.
    preds = (probs > 0.5).int().numpy()
    true = anomaly_labels_tensor[test_idx].int().numpy()

    f1 = f1_score(true, preds)
    acc = accuracy_score(true, preds)
    try:
        auc = roc_auc_score(true, probs.numpy())
    except ValueError:
        auc = float('nan')

    return f1, acc, auc

# Cross-validation
y_np = anomaly_labels_tensor.numpy()
indices = np.arange(len(y_np))

# minlength=2 guarantees one count per class. Without it, labels that are all 0
# give a single-element bincount, so the "minority" count would be the size of
# the only class and stratified CV would run on single-class data.
class_counts = np.bincount(y_np.astype(int), minlength=2)
minority_class_count = int(class_counts.min())

if minority_class_count < 2:
    warnings.warn(
        f"No hay suficientes ejemplos en la clase minoritaria ({minority_class_count}) "
        "para realizar validaci√≥n cruzada. Se omite esta fase."
    )
else:
    # Never ask for more folds than there are minority samples.
    n_splits = min(5, minority_class_count)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    print("\n--- Validaci√≥n cruzada ---")
    for model_type in ["GCN", "GAT", "GraphSAGE"]:
        f1s, accs, aucs = [], [], []
        for train_idx, test_idx in skf.split(indices, y_np):
            train_idx = torch.tensor(train_idx, dtype=torch.long)
            test_idx = torch.tensor(test_idx, dtype=torch.long)
            f1, acc, auc = train_and_evaluate(model_type, train_idx, test_idx)
            f1s.append(f1)
            accs.append(acc)
            aucs.append(auc)

        # Folds whose AUC is undefined (NaN) are left out of the average; the
        # average itself is NaN only when every fold is undefined.
        mean_auc = float('nan') if np.all(np.isnan(aucs)) else np.nanmean(aucs)

        print(f"\nModelo: {model_type}")
        print(f"F1 promedio: {np.mean(f1s):.4f}")
        print(f"Accuracy promedio: {np.mean(accs):.4f}")
        print(f"AUC-ROC promedio: {mean_auc:.4f}")


### 2ª (media)
#### MODELO: Solo GCN
#### VALIDACIÓN CRUZADA: ❌ No
#### MODULARIDAD: ✅ Media
#### COMPLEJIDAD: ⭐ Media
#### IDEAL PARA: Prueba base de GCN


In [None]:
import torch
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
import pandas as pd
import numpy as np

# Load the latency percentile table.
data = pd.read_csv("results sahra2/kiali_latency_percentiles.csv")
data['timestamp'] = pd.to_datetime(data['timestamp'])


def _list_cell_mean(cell):
    """Reduce one CSV cell to a float.

    The percentile file is written from list-valued aggregations, so several
    metric columns arrive as strings such as "[0.0, 12.5, nan]". Such a string
    is split on commas and averaged, ignoring NaN entries. Any other value is
    converted with ``float``. Cells that cannot be parsed, and empty or all-NaN
    lists, yield NaN.
    """
    if isinstance(cell, str):
        tokens = [tok.strip() for tok in cell.strip().strip("[]()").split(",")]
        try:
            numbers = np.array([float(tok) for tok in tokens if tok], dtype=float)
        except ValueError:
            return np.nan
        numbers = numbers[~np.isnan(numbers)]
        return float(numbers.mean()) if numbers.size else np.nan
    try:
        return float(cell)
    except (TypeError, ValueError):
        return np.nan


# Metrics used as node features. pd.to_numeric(errors='coerce') would turn every
# list-encoded column into NaN (and the column would then be dropped below), so
# each cell is reduced to its mean instead.
selected_columns = [
    "p50_latency", "p90_latency", "p95_latency", "p99_latency",
    "istio_request_bytes", "success_rate", "throughput", "average_latency"
]
for col in selected_columns:
    data[col] = data[col].map(_list_cell_mean)

# One feature row per source workload: the mean of every metric.
grouped = data.groupby('source_workload')[selected_columns].mean()
grouped = grouped.dropna(axis=1, how='all')  # drop columns with no value at all
# Remaining gaps are filled with the column mean so no NaN reaches the model.
grouped = grouped.fillna(grouped.mean())

# Standardise the features.
scaler = StandardScaler()
node_features = scaler.fit_transform(grouped.values)
node_features_tensor = torch.tensor(node_features, dtype=torch.float)

# Node name -> row index in the feature matrix.
node_mapping = {node: idx for idx, node in enumerate(grouped.index)}

# Edge list: one edge per distinct (source, destination) pair whose endpoints
# both have a feature row. Duplicates are removed because the table repeats each
# pair once per timestamp and time window.
edge_df = data[['source_workload', 'destination_workload']].dropna().drop_duplicates()
edge_df = edge_df[
    edge_df['source_workload'].isin(grouped.index)
    & edge_df['destination_workload'].isin(grouped.index)
]
edge_df = edge_df.assign(
    source_workload=edge_df['source_workload'].map(node_mapping),
    destination_workload=edge_df['destination_workload'].map(node_mapping),
)
edge_index_tensor = torch.tensor(edge_df.to_numpy(dtype='int64').T, dtype=torch.long)

# Anomaly labels: a node is labelled 1 when its mean p99 latency is above the
# 0.95 quantile of the per-node means.
anomaly_series = data.groupby('source_workload')["p99_latency"].mean()
anomaly_series = anomaly_series.loc[grouped.index]
threshold = anomaly_series.quantile(0.95)
anomaly_labels = (anomaly_series > threshold).astype(int)
anomaly_labels_tensor = torch.tensor(anomaly_labels.values, dtype=torch.float)

# PyG graph object.
graph_data = Data(x=node_features_tensor, edge_index=edge_index_tensor)

# GCN model (similar to AddGraph)
class AddGraphModel(torch.nn.Module):
    """Two stacked GCNConv layers with a ReLU between them.

    ``forward`` returns the raw output of the second layer; no activation is
    applied to it.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        widths = (in_channels, hidden_channels, out_channels)
        self.conv1 = GCNConv(widths[0], widths[1])
        self.conv2 = GCNConv(widths[1], widths[2])

    def forward(self, x, edge_index):
        hidden = self.conv1(x, edge_index)
        hidden = hidden.relu()
        return self.conv2(hidden, edge_index)

# Fix the random state so the split, the weight initialisation and therefore the
# reported metrics are the same on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)
rng = np.random.default_rng(SEED)

# Random 80/20 split of the nodes into training and test sets.
train_ratio = 0.8
num_nodes = len(anomaly_labels_tensor)
train_size = int(train_ratio * num_nodes)
indices = rng.permutation(num_nodes)
train_idx = torch.tensor(indices[:train_size], dtype=torch.long)
test_idx = torch.tensor(indices[train_size:], dtype=torch.long)

# Build the model: input size from the feature matrix, 32 hidden units, 1 output.
model = AddGraphModel(
    in_channels=graph_data.x.size(1),
    hidden_channels=32,
    out_channels=1
)

# Full-batch training with binary cross-entropy on the training nodes.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
epochs = 100

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    out = model(graph_data.x, graph_data.edge_index).squeeze()
    loss = torch.nn.functional.binary_cross_entropy_with_logits(out[train_idx], anomaly_labels_tensor[train_idx])
    loss.backward()
    optimizer.step()

# Evaluate on the held-out nodes.
model.eval()
with torch.no_grad():
    out = model(graph_data.x, graph_data.edge_index).squeeze()
    probs = torch.sigmoid(out[test_idx])
    preds = (probs > 0.5).int().numpy()
    true = anomaly_labels_tensor[test_idx].int().numpy()

    # zero_division=0 reports F1 = 0 without a warning when there is no positive
    # label or prediction in the test set.
    f1 = f1_score(true, preds, zero_division=0)
    acc = accuracy_score(true, preds)
    # AUC is undefined when the test set contains a single class.
    auc = roc_auc_score(true, probs.numpy()) if len(np.unique(true)) > 1 else float('nan')

print(f"F1 Score: {f1:.4f}, Accuracy: {acc:.4f}, AUC-ROC: {auc:.4f}")

This module builds an AddGraph-style anomaly detection model using PyTorch Geometric (PyG). It processes microservice telemetry data (e.g., latency percentiles, throughput, success rate) collected over time.

Key steps:

Temporal Graph Construction: The system constructs dynamic graph snapshots using 1-minute time windows. Each node represents a microservice (source_workload), and edges represent calls between services.

Feature Engineering: Metrics like p50–p99 latency, success rate, and throughput are averaged and normalized for each node in each snapshot.

Anomaly Labeling: Nodes are labeled as anomalous if their p99_latency exceeds the 95th percentile in the global dataset during that window.

GCN Model: A 2-layer Graph Convolutional Network (GCN) is trained to detect anomalous nodes based on their temporal behavior and structural context.

Evaluation: The model is evaluated using F1 score, accuracy, and ROC AUC on the test snapshots.

In [None]:
# 1. Instalar dependencias necesarias
!pip install torch torchvision torchaudio torch-geometric pandas scikit-learn matplotlib tqdm --quiet


In [None]:

# 2. Importar librer√≠as
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from datetime import timedelta
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from tqdm import tqdm
import ast

# 3. Read the latency percentile table and parse its timestamps.
df = pd.read_csv("results sahra2/kiali_latency_percentiles.csv").assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'])
)

# 4. Columns that hold lists serialised as strings
def parse_and_average(col, frame=None):
    """Average the list stored as text in every cell of column ``col``.

    Each string cell such as "[1.0, 3.0]" is split on commas and averaged.
    NaN entries inside a list are ignored; ``ast.literal_eval`` cannot parse a
    bare ``nan`` and would abort the whole column on the first such cell.

    Args:
        col: name of the column to reduce.
        frame: DataFrame to read from; defaults to the module-level ``df``.

    Returns:
        A Series aligned with the frame's index. Non-string cells, strings that
        cannot be parsed, and empty or all-NaN lists yield NaN.
    """
    source = df if frame is None else frame

    def _cell_mean(cell):
        if not isinstance(cell, str):
            return np.nan
        tokens = [tok.strip() for tok in cell.strip().strip("[]()").split(",")]
        try:
            numbers = np.array([float(tok) for tok in tokens if tok], dtype=float)
        except ValueError:
            return np.nan
        numbers = numbers[~np.isnan(numbers)]
        return float(numbers.mean()) if numbers.size else np.nan

    return source[col].apply(_cell_mean)

list_columns = ["average_latency", "throughput", "success_rate", "istio_request_bytes"]
for col in list_columns:
    df[col + "_mean"] = parse_and_average(col)

# 5. Final feature set
features = [
    "p50_latency", "p90_latency", "p95_latency", "p99_latency",
    "average_latency_mean", "throughput_mean", "success_rate_mean", "istio_request_bytes_mean"
]

# Drop rows with a missing key or feature. The explicit copy makes the result an
# independent frame, so the column assignment below cannot trigger a
# SettingWithCopyWarning.
df = df.dropna(subset=["source_workload", "destination_workload", "timestamp"] + features).copy()

# 6. Standardise the features
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

# 7. Unique nodes (sources and destinations) and their integer index
nodes = pd.unique(df[["source_workload", "destination_workload"]].values.ravel())
node_map = {n: i for i, n in enumerate(nodes)}

# 8. One graph snapshot per 1-minute window
window_size = timedelta(minutes=1)
start = df["timestamp"].min()
end = df["timestamp"].max()

# The anomaly threshold is taken over the whole (standardised) table and does not
# depend on the window, so it is computed once instead of on every iteration.
threshold = df["p99_latency"].quantile(0.95)

snapshots = []
labels = []

while start + window_size <= end:
    window = df[(df["timestamp"] >= start) & (df["timestamp"] < start + window_size)]

    if len(window) > 0:
        # Edges: one per row of the window. Stacking into a single ndarray first
        # avoids building a tensor from a list of ndarrays, which is slow and
        # emits a warning.
        src = window["source_workload"].map(node_map).values
        dst = window["destination_workload"].map(node_map).values
        edge_index = torch.tensor(np.stack([src, dst]), dtype=torch.long)

        # Node features: mean per source workload; nodes without rows in this
        # window get 0.
        grouped = window.groupby("source_workload")[features].mean()
        grouped = grouped.reindex(nodes).fillna(0)
        x = torch.tensor(grouped.values, dtype=torch.float)

        # Labels: 1 when the node's mean p99 latency in the window is above the
        # threshold.
        p99_series = window.groupby("source_workload")["p99_latency"].mean()
        p99_series = p99_series.reindex(nodes).fillna(0)
        y = (p99_series > threshold).astype(int).values
        y = torch.tensor(y, dtype=torch.float)

        snapshots.append(Data(x=x, edge_index=edge_index))
        labels.append(y)

    start += window_size

print(f"Snapshots creados: {len(snapshots)}")

# 9. GCN model
class AddGraphGCN(torch.nn.Module):
    """Two-layer GCN that outputs one logit per node.

    Args:
        in_channels: number of input features per node.
        hidden_channels: output size of the first GCNConv layer.
    """

    def __init__(self, in_channels, hidden_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, 1)

    def forward(self, x, edge_index):
        """Return the logits with the trailing size-1 dimension removed."""
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        # squeeze(-1) removes only the channel dimension. A bare squeeze() would
        # also collapse the node dimension of a single-node graph into a 0-d
        # tensor, which no longer matches a per-node target.
        return x.squeeze(-1)

model = AddGraphGCN(in_channels=len(features), hidden_channels=32)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.BCEWithLogitsLoss()

# 10. Training: chronological 80/20 split of the snapshots
split = int(0.8 * len(snapshots))
train_snapshots = snapshots[:split]
train_labels = labels[:split]
test_snapshots = snapshots[split:]
test_labels = labels[split:]

for epoch in range(10):
    model.train()
    total_loss = 0
    # One optimisation step per training snapshot.
    for g, target in zip(train_snapshots, train_labels):
        optimizer.zero_grad()
        out = model(g.x, g.edge_index)
        loss = loss_fn(out, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} | Loss: {total_loss:.4f}")

# 11. Evaluation
model.eval()
all_preds = []
all_probs = []
all_true = []

with torch.no_grad():
    for g, true in zip(test_snapshots, test_labels):
        # reshape(-1) keeps the per-node vector even if the model returns a scalar.
        probs = torch.sigmoid(model(g.x, g.edge_index)).reshape(-1)
        all_probs.extend(probs.tolist())
        all_preds.extend((probs > 0.5).int().tolist())
        all_true.extend(true.int().reshape(-1).tolist())

f1 = f1_score(all_true, all_preds)
acc = accuracy_score(all_true, all_preds)
# ROC AUC is a ranking metric and must be computed from the predicted
# probabilities; feeding it the thresholded 0/1 predictions reduces the curve to
# a single operating point.
auc = roc_auc_score(all_true, all_probs) if len(np.unique(all_true)) > 1 else float('nan')
print(f"\nF1 Score: {f1:.4f} | Accuracy: {acc:.4f} | AUC-ROC: {auc:.4f}")


## TGN

In [10]:
# Instalar PyTorch (elige la versi√≥n seg√∫n tu sistema; esta es para CPU)
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

# Instalar PyTorch Geometric y Torch-Geometric-Temporal
!pip install torch-geometric
!pip install torch-geometric-temporal

# Instalar otras dependencias √∫tiles
!pip install pandas scikit-learn matplotlib tqdm




In [None]:
import torch
import pandas as pd
import numpy as np
from torch.nn import functional as F
from torch_geometric_temporal.nn import TGNMemory, TemporalConv
from torch_geometric_temporal.signal import StaticGraphTemporalSignal, temporal_signal_split
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# 1. Load the latency percentiles and parse the timestamp column
df = pd.read_csv("results sahra2/kiali_latency_percentiles.csv")
df = df.assign(timestamp=pd.to_datetime(df['timestamp']))

# 2. Flag anomalies: rows whose p99 latency exceeds the global 95th percentile
threshold = df["p99_latency"].quantile(q=0.95)
df["anomaly"] = df["p99_latency"].gt(threshold).astype(int)

# 3. Build one graph snapshot per minute
# Each snapshot is stored as a tuple (x, edge_index, edge_attr, y).
snapshots = []
scaler = StandardScaler()

for ts, group in df.groupby(df['timestamp'].dt.floor("min")):
    # Node set of this snapshot: every workload seen as a source or a destination.
    # The node -> index mapping is local to the snapshot.
    nodes = pd.concat([group["source_workload"], group["destination_workload"]]).dropna().unique()
    node_map = {n: i for i, n in enumerate(nodes)}

    # Keep the full rows of the valid edges (both endpoints known) so that the edge
    # attributes below are read from exactly the same rows as the edge index.
    edges = group.dropna(subset=["source_workload", "destination_workload"])
    if edges.empty:
        continue
    edge_idx = np.vstack([
        edges["source_workload"].map(node_map).to_numpy(),
        edges["destination_workload"].map(node_map).to_numpy(),
    ]).astype(np.int64)
    edge_index = torch.tensor(edge_idx, dtype=torch.long)

    # edge_attr: p99 latency of each edge, one row per column of edge_index.
    # (Previously the first k rows of `group` were used, which misaligned attributes
    # with edges whenever a row with a missing endpoint had been dropped.)
    edge_attr_values = edges["p99_latency"].fillna(0).to_numpy()
    edge_attr = torch.tensor(edge_attr_values.reshape(-1, 1), dtype=torch.float)

    # Node attributes: mean p99 latency of the requests a node emits; nodes that only
    # appear as destinations get 0. The scaler is re-fitted on every snapshot, so the
    # features are standardized within each minute.
    node_df = group.groupby("source_workload")["p99_latency"].mean().reindex(nodes).fillna(0).values
    x = torch.tensor(scaler.fit_transform(node_df.reshape(-1, 1)), dtype=torch.float)

    # Labels: 1 if any request emitted by the node in this minute was flagged anomalous
    y_df = group.groupby("source_workload")["anomaly"].max().reindex(nodes).fillna(0).values
    y = torch.tensor(y_df.astype(int), dtype=torch.long)

    snapshots.append((x, edge_index, edge_attr, y))

# 4. Build the temporal dataset
# The graph (node set and edges) changes from one minute to the next, so the dynamic
# signal class is required: StaticGraphTemporalSignal takes a single fixed
# `edge_index`/`edge_weight` pair and has no `edge_indices`/`edge_features` arguments.
# The signal classes expect NumPy arrays and convert them to tensors per snapshot.
from torch_geometric_temporal.signal import DynamicGraphTemporalSignal

dataset = DynamicGraphTemporalSignal(
    edge_indices=[s[1].numpy() for s in snapshots],
    edge_weights=[s[2].numpy() for s in snapshots],
    features=[s[0].numpy() for s in snapshots],
    targets=[s[3].numpy() for s in snapshots],
)

# 5. Split into train and test sets (first 80% of the snapshots for training)
train_dataset, test_dataset = temporal_signal_split(dataset, train_ratio=0.8)

# 6. Define the temporal model
class TGN(torch.nn.Module):
    """Temporal model: a ``TGNMemory`` module followed by a ``TemporalConv`` layer.

    Args:
        node_features: forwarded to ``TGNMemory`` as ``node_features``.
        edge_features: forwarded to ``TGNMemory`` as ``edge_features``.
        memory_dim: used as memory dimension, message dimension and as the input
            width of the ``TemporalConv`` layer.
        time_dim: forwarded to ``TGNMemory`` as ``time_dimension``.
        output_dim: output width of the ``TemporalConv`` layer.

    NOTE(review): the keyword arguments used below should be checked against the
    installed libraries. ``TGNMemory`` appears to be provided by ``torch_geometric.nn``
    (with a different constructor signature) rather than by
    ``torch_geometric_temporal.nn``, and ``TemporalConv`` appears to require a
    ``kernel_size`` argument -- confirm before relying on this class.
    """

    def __init__(self, node_features, edge_features, memory_dim, time_dim, output_dim):
        super().__init__()
        self.memory = TGNMemory(
            node_features=node_features,
            edge_features=edge_features,
            memory_dimension=memory_dim,
            time_dimension=time_dim,
            message_dimension=memory_dim,
        )
        self.temporal_conv = TemporalConv(in_channels=memory_dim, out_channels=output_dim)

    def forward(self, x, edge_index, edge_attr, time=None):
        """Pass the snapshot through the memory module, then through the temporal conv."""
        mem = self.memory(x, edge_index, edge_attr, time)
        return self.temporal_conv(mem, edge_index)

# 7. Instantiate the model
model = TGN(node_features=1, edge_features=1, memory_dim=32, time_dim=16, output_dim=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# 8. Training
for epoch in range(10):
    model.train()
    total_loss = 0
    for snapshot in train_dataset:
        optimizer.zero_grad()
        out = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
        # Flatten with reshape(-1) instead of squeeze(): squeeze() turns the output of
        # a single-node snapshot into a 0-d tensor, which no longer matches the 1-D
        # target and makes the loss raise. Assumes one logit per node -- TODO confirm.
        loss = F.binary_cross_entropy_with_logits(
            out.reshape(-1), snapshot.y.float().reshape(-1)
        )
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Sum of the per-snapshot losses for the epoch
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# 9. Evaluation
# Metrics are computed per test snapshot and then averaged.
model.eval()
f1s, accs, aucs = [], [], []

for snapshot in test_dataset:
    with torch.no_grad():
        out = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
        # reshape(-1) keeps a 1-D vector even for a single-node snapshot; squeeze()
        # would yield a 0-d tensor that the sklearn metrics reject.
        probs = torch.sigmoid(out.reshape(-1))
        preds = (probs > 0.5).int()
        y_true = snapshot.y.reshape(-1).cpu().numpy()
        y_pred = preds.cpu().numpy()
        y_prob = probs.cpu().numpy()

        f1s.append(f1_score(y_true, y_pred, zero_division=0))
        accs.append(accuracy_score(y_true, y_pred))
        # AUC is undefined when a snapshot contains a single class; skip those.
        if len(np.unique(y_true)) > 1:
            aucs.append(roc_auc_score(y_true, y_prob))

print(f"\nF1 Score: {np.mean(f1s):.4f}")
print(f"Accuracy: {np.mean(accs):.4f}")
print(f"AUC-ROC: {np.mean(aucs) if aucs else 'N/A'}")


## STGNN

In [12]:
import torch
from torch_geometric_temporal.nn.recurrent import DCRNN
from torch_geometric_temporal.signal import temporal_signal_split
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
import numpy as np
import pandas as pd

# Define the ST-GNN model
class STGNN(torch.nn.Module):
    """Spatio-temporal GNN: a DCRNN cell followed by a linear read-out layer.

    Args:
        node_features: number of input features per node.
        hidden_channels: width of the DCRNN hidden state.
        output_dim: number of outputs per node.
    """

    def __init__(self, node_features, hidden_channels, output_dim):
        super().__init__()
        # Diffusion Convolutional Recurrent Network cell, built with K=2
        self.recurrent = DCRNN(node_features, hidden_channels, K=2)
        self.linear = torch.nn.Linear(hidden_channels, output_dim)

    def forward(self, x, edge_index, edge_weight):
        """Return one output row per node for the given snapshot."""
        hidden = self.recurrent(x, edge_index, edge_weight)
        return self.linear(hidden)

# Load data and parse timestamps
data = pd.read_csv("results sahra2/kiali_latency_percentiles.csv")
data['timestamp'] = pd.to_datetime(data['timestamp'])

# Metrics used as node features
selected_columns = [
    "p50_latency", "p90_latency", "p95_latency", "p99_latency",
    "istio_request_bytes", "success_rate", "throughput", "average_latency"
]
# Coerce every metric to a numeric dtype; unparsable values become NaN
data[selected_columns] = data[selected_columns].apply(pd.to_numeric, errors='coerce')

# Average the metrics per (timestamp, source_workload) pair
grouped = data.groupby(['timestamp', 'source_workload'], as_index=False)[selected_columns].mean()

# Build node features, edges and labels for every temporal snapshot
from torch_geometric_temporal.signal import DynamicGraphTemporalSignal

# Global node index shared by all snapshots: every workload that appears as a source or
# a destination. Edge indices must be integers, and a fixed node set keeps the feature
# and label matrices the same size across snapshots.
all_nodes = pd.unique(
    pd.concat([data['source_workload'], data['destination_workload']]).dropna()
)
node_to_id = {name: idx for idx, name in enumerate(all_nodes)}

timestamps = grouped['timestamp'].unique()
edge_indices, edge_weights, node_features, labels = [], [], [], []

for timestamp in timestamps:
    # One row per node, in global node order; nodes that emitted no request at this
    # timestamp get NaN metrics.
    snapshot = (
        grouped[grouped['timestamp'] == timestamp]
        .set_index('source_workload')
        .reindex(all_nodes)
    )

    # Node features (missing metrics are filled with 0)
    node_features.append(
        snapshot[selected_columns].fillna(0.0).to_numpy(dtype=np.float32)
    )

    # Edges (calls between microservices), translated from names to node indices.
    # `success_rate` is selected together with the endpoints so that weights and edges
    # come from the same rows.
    edges = data.loc[
        data['timestamp'] == timestamp,
        ['source_workload', 'destination_workload', 'success_rate'],
    ].dropna(subset=['source_workload', 'destination_workload'])
    edge_index = np.vstack([
        edges['source_workload'].map(node_to_id).to_numpy(),
        edges['destination_workload'].map(node_to_id).to_numpy(),
    ]).astype(np.int64)
    edge_indices.append(edge_index)

    # Edge weights: success rate of each call (missing values become 0)
    edge_weight = edges['success_rate'].fillna(0.0).to_numpy(dtype=np.float32)
    edge_weights.append(edge_weight)

    # Anomaly labels: nodes whose p99 latency exceeds the snapshot's 95th percentile.
    # NaN rows are ignored by quantile() and compare as False, i.e. label 0.
    p99 = snapshot['p99_latency']
    anomaly_labels = (p99 > p99.quantile(0.95)).astype(np.float32)
    labels.append(anomaly_labels.to_numpy())

# Build the temporal dataset. The edge set differs per timestamp, so the dynamic signal
# is used; its arguments are (edge_indices, edge_weights, features, targets).
dataset = DynamicGraphTemporalSignal(edge_indices, edge_weights, node_features, labels)

# Split into train and test sets (first 80% of the snapshots for training)
train_dataset, test_dataset = temporal_signal_split(dataset, train_ratio=0.8)

# Train the model
model = STGNN(node_features=len(selected_columns), hidden_channels=32, output_dim=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(50):
    model.train()
    for snapshot in train_dataset:
        optimizer.zero_grad()
        # The temporal signal iterators store the edge weights under `edge_attr`;
        # `snapshot.edge_weight` is not set by them.
        # NOTE(review): DCRNN normalizes by weighted degree -- confirm that zero-valued
        # success_rate weights cannot cause a division by zero.
        out = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
        # Flatten both sides: the model emits (num_nodes, 1) while the targets are
        # (num_nodes,), and the loss requires identical shapes.
        loss = torch.nn.functional.binary_cross_entropy_with_logits(
            out.reshape(-1), snapshot.y.float().reshape(-1)
        )
        loss.backward()
        optimizer.step()

# Evaluate the model (metrics are computed per test snapshot, then averaged)
model.eval()
f1_scores, accuracies, aucs = [], [], []
for snapshot in test_dataset:
    with torch.no_grad():
        # Edge weights live under `edge_attr` on temporal-signal snapshots
        out = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
        probs = torch.sigmoid(out).reshape(-1)
        preds = (probs > 0.5).int().cpu().numpy()
        y_true = snapshot.y.reshape(-1).int().cpu().numpy()
        f1_scores.append(f1_score(y_true, preds, zero_division=0))
        accuracies.append(accuracy_score(y_true, preds))
        # roc_auc_score raises when only one class is present; skip such snapshots
        if len(np.unique(y_true)) > 1:
            aucs.append(roc_auc_score(y_true, probs.cpu().numpy()))

mean_auc = np.mean(aucs) if aucs else float('nan')
print(f"F1 Score: {np.mean(f1_scores):.4f}, Accuracy: {np.mean(accuracies):.4f}, AUC-ROC: {mean_auc:.4f}")

ModuleNotFoundError: No module named 'torch_geometric_temporal.nn.recurrent'

##  DCRNN +

In [None]:
import torch
from torch_geometric_temporal.nn.recurrent import DCRNN
from torch_geometric_temporal.signal import temporal_signal_split
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
import numpy as np
import pandas as pd

# Define the ST-GNN model
class STGNN(torch.nn.Module):
    """DCRNN cell followed by a linear layer that maps the hidden state to the output.

    Args:
        node_features: number of input features per node.
        hidden_channels: width of the DCRNN hidden state.
        output_dim: number of outputs per node.
    """

    def __init__(self, node_features, hidden_channels, output_dim):
        super().__init__()
        # Diffusion Convolutional Recurrent Network cell, built with K=2
        self.recurrent = DCRNN(node_features, hidden_channels, K=2)
        self.linear = torch.nn.Linear(hidden_channels, output_dim)

    def forward(self, x, edge_index, edge_weight):
        """Return one output row per node for the given snapshot."""
        return self.linear(self.recurrent(x, edge_index, edge_weight))

# Load data and parse timestamps
data = pd.read_csv("results sahra2/kiali_latency_percentiles.csv")
data['timestamp'] = pd.to_datetime(data['timestamp'])

# Metrics used as node features
selected_columns = [
    "p50_latency", "p90_latency", "p95_latency", "p99_latency",
    "istio_request_bytes", "success_rate", "throughput", "average_latency"
]
# Coerce every metric to a numeric dtype; unparsable values become NaN
data[selected_columns] = data[selected_columns].apply(pd.to_numeric, errors='coerce')

# Average the metrics per (timestamp, source_workload) pair
grouped = data.groupby(['timestamp', 'source_workload'], as_index=False)[selected_columns].mean()

# Build node features, edges and labels for every temporal snapshot
from torch_geometric_temporal.signal import DynamicGraphTemporalSignal

# Global node index shared by all snapshots: every workload that appears as a source or
# a destination. Edge indices must be integers, and a fixed node set keeps the feature
# and label matrices the same size across snapshots.
all_nodes = pd.unique(
    pd.concat([data['source_workload'], data['destination_workload']]).dropna()
)
node_to_id = {name: idx for idx, name in enumerate(all_nodes)}

timestamps = grouped['timestamp'].unique()
edge_indices, edge_weights, node_features, labels = [], [], [], []

for timestamp in timestamps:
    # One row per node, in global node order; nodes that emitted no request at this
    # timestamp get NaN metrics.
    snapshot = (
        grouped[grouped['timestamp'] == timestamp]
        .set_index('source_workload')
        .reindex(all_nodes)
    )

    # Node features (missing metrics are filled with 0)
    node_features.append(
        snapshot[selected_columns].fillna(0.0).to_numpy(dtype=np.float32)
    )

    # Edges (calls between microservices), translated from names to node indices.
    # `success_rate` is selected together with the endpoints so that weights and edges
    # come from the same rows.
    edges = data.loc[
        data['timestamp'] == timestamp,
        ['source_workload', 'destination_workload', 'success_rate'],
    ].dropna(subset=['source_workload', 'destination_workload'])
    edge_index = np.vstack([
        edges['source_workload'].map(node_to_id).to_numpy(),
        edges['destination_workload'].map(node_to_id).to_numpy(),
    ]).astype(np.int64)
    edge_indices.append(edge_index)

    # Edge weights: success rate of each call (missing values become 0)
    edge_weight = edges['success_rate'].fillna(0.0).to_numpy(dtype=np.float32)
    edge_weights.append(edge_weight)

    # Anomaly labels: a node is anomalous when its p99 OR its p95 latency exceeds the
    # snapshot's 95th percentile of that metric. NaN rows compare as False (label 0).
    anomaly_labels_p99 = snapshot['p99_latency'] > snapshot['p99_latency'].quantile(0.95)
    anomaly_labels_p95 = snapshot['p95_latency'] > snapshot['p95_latency'].quantile(0.95)
    combined_labels = (anomaly_labels_p99 | anomaly_labels_p95).astype(np.float32)
    labels.append(combined_labels.to_numpy())

# Build the temporal dataset. The edge set differs per timestamp, so the dynamic signal
# is used; its arguments are (edge_indices, edge_weights, features, targets).
dataset = DynamicGraphTemporalSignal(edge_indices, edge_weights, node_features, labels)

# Split into train and test sets (first 80% of the snapshots for training)
train_dataset, test_dataset = temporal_signal_split(dataset, train_ratio=0.8)

# Train the model
model = STGNN(node_features=len(selected_columns), hidden_channels=32, output_dim=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(50):
    model.train()
    for snapshot in train_dataset:
        optimizer.zero_grad()
        # The temporal signal iterators store the edge weights under `edge_attr`;
        # `snapshot.edge_weight` is not set by them.
        # NOTE(review): DCRNN normalizes by weighted degree -- confirm that zero-valued
        # success_rate weights cannot cause a division by zero.
        out = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
        # Flatten both sides: the model emits (num_nodes, 1) while the targets are
        # (num_nodes,), and the loss requires identical shapes.
        loss = torch.nn.functional.binary_cross_entropy_with_logits(
            out.reshape(-1), snapshot.y.float().reshape(-1)
        )
        loss.backward()
        optimizer.step()

# Evaluate the model (metrics are computed per test snapshot, then averaged)
model.eval()
f1_scores, accuracies, aucs = [], [], []
for snapshot in test_dataset:
    with torch.no_grad():
        # Edge weights live under `edge_attr` on temporal-signal snapshots
        out = model(snapshot.x, snapshot.edge_index, snapshot.edge_attr)
        probs = torch.sigmoid(out).reshape(-1)
        preds = (probs > 0.5).int().cpu().numpy()
        y_true = snapshot.y.reshape(-1).int().cpu().numpy()
        f1_scores.append(f1_score(y_true, preds, zero_division=0))
        accuracies.append(accuracy_score(y_true, preds))
        # roc_auc_score raises when only one class is present; skip such snapshots
        if len(np.unique(y_true)) > 1:
            aucs.append(roc_auc_score(y_true, probs.cpu().numpy()))

mean_auc = np.mean(aucs) if aucs else float('nan')
print(f"F1 Score: {np.mean(f1_scores):.4f}, Accuracy: {np.mean(accuracies):.4f}, AUC-ROC: {mean_auc:.4f}")

## KGROOT


In [None]:
# 1. Install dependencies
!pip install dgl torch pandas scikit-learn tqdm --quiet


In [None]:

# 2. Imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import dgl
import torch.nn.functional as F
from tqdm import tqdm

from dgl.nn import GraphConv
from sklearn.preprocessing import MinMaxScaler
from datetime import timedelta

# 3. Load and preprocess dataset
# NOTE(review): the first cell of the notebook reads this file from
# "data/istio_request_2.2.csv"; confirm which location is the intended one.
df = pd.read_csv("istio_request_2.2.csv")
df["timestamp"] = pd.to_datetime(df["timestamp"])
df = df.sort_values("timestamp")

# Calculate avg latency (total duration / number of requests).
# Rows with zero requests produce inf (x/0) or NaN (0/0); both are mapped to 0 so they
# cannot distort the quantile threshold or break the min-max scaling later on. The
# result is assigned back explicitly: `df[col].fillna(..., inplace=True)` acts on a
# temporary object and is not guaranteed to modify `df`.
df["avg_latency"] = (
    (df["istio_request_duration_milliseconds_sum"] / df["total_request"])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# Label relation type
def classify_relation(row, latency_thres):
    """Classify one request row into a relation type.

    High latency takes precedence over the HTTP status: a row whose ``avg_latency``
    exceeds ``latency_thres`` is ``"high_latency"`` regardless of its response code.

    Args:
        row: mapping-like object with ``"avg_latency"`` and ``"response_code"`` keys.
        latency_thres: latency value above which the row counts as high latency.

    Returns:
        One of ``"high_latency"``, ``"error_5xx"``, ``"error_4xx"`` or ``"success"``.
    """
    if row["avg_latency"] > latency_thres:
        return "high_latency"

    code = row["response_code"]
    if code >= 500:
        return "error_5xx"
    if code >= 400:
        return "error_4xx"
    return "success"

# Latency threshold: 95th percentile of the average latency over the whole dataset
lat_thres = df["avg_latency"].quantile(0.95)
df["relation"] = df.apply(classify_relation, axis=1, args=(lat_thres,))

# 4. Encode services and relations
# Services are numbered in order of first appearance across both endpoint columns.
all_services = pd.unique(df[["source_workload", "destination_workload"]].to_numpy().ravel())
service2id = dict(zip(all_services, range(len(all_services))))
relation2id = dict(success=0, error_4xx=1, error_5xx=2, high_latency=3)

df["src_id"] = df["source_workload"].map(service2id)
df["dst_id"] = df["destination_workload"].map(service2id)
df["rel_id"] = df["relation"].map(relation2id)

# Normalize latency to [0, 1] with min-max scaling
scaler = MinMaxScaler()
df["latency_norm"] = scaler.fit_transform(df[["avg_latency"]])

# 5. Create graph snapshots (historical and current)
# The timeline is cut into consecutive one-minute windows anchored at the first
# timestamp; empty windows are skipped.
snapshots = []
labels = []

window_size = timedelta(minutes=1)
start_time = df["timestamp"].min()
end_time = df["timestamp"].max()
timepoints = []

# Iterate while the window START is within the data range so that the most recent
# (possibly partial) window is included. The former condition
# `start_time + window_size < end_time` silently dropped the latest rows, so the
# "current" graph was never built from the newest data.
while start_time <= end_time:
    window_end = start_time + window_size
    window_df = df[(df["timestamp"] >= start_time) & (df["timestamp"] < window_end)]
    if len(window_df) > 0:
        snapshots.append(window_df.copy())
        timepoints.append(start_time)
    start_time = window_end

# At least one historical window plus the current one is required downstream.
if len(snapshots) < 2:
    raise ValueError(
        "Need at least two non-empty one-minute windows "
        f"(historical + current); got {len(snapshots)}."
    )

# Use last snapshot as current graph, previous as historical
historical_graphs = snapshots[:-1]
current_graph_df = snapshots[-1]

# 6. GCN model
class GCN(nn.Module):
    """Two-layer graph convolutional network with a single output per input row.

    Args:
        in_feats: width of the input features.
        hidden_feats: width of the hidden layer.
    """

    def __init__(self, in_feats, hidden_feats):
        super().__init__()
        self.conv1 = GraphConv(in_feats, hidden_feats)
        self.conv2 = GraphConv(hidden_feats, 1)

    def forward(self, g, feat):
        """Apply conv1 -> ReLU -> conv2 on graph ``g`` with features ``feat``."""
        hidden = F.relu(self.conv1(g, feat))
        return self.conv2(g, hidden)

# 7. Prepare historical training data
# One DGL graph per historical window; every edge carries a 2-column feature
# [normalized latency, relation id].
graphs = []
edge_labels = []

for snapshot in historical_graphs:
    g = dgl.graph((snapshot["src_id"], snapshot["dst_id"]), num_nodes=len(service2id))
    edge_feat = torch.tensor(snapshot["latency_norm"].to_numpy(), dtype=torch.float32).reshape(-1, 1)
    rel_feat = torch.tensor(snapshot["rel_id"].to_numpy(), dtype=torch.float32).reshape(-1, 1)
    g.edata["feat"] = torch.cat((edge_feat, rel_feat), dim=1)
    graphs.append(g)

# Create training graph as average over history
def average_edge_features(graphs):
    """Average the ``edata["feat"]`` matrices of several graphs row by row.

    Graphs with fewer edges are zero-padded at the bottom up to the largest edge count,
    so the padded rows pull the mean of the trailing rows towards zero. Rows are matched
    by position only.

    Args:
        graphs: non-empty sequence of graph-like objects exposing ``edata["feat"]``
            (2-D tensor, one row per edge) and ``num_edges()``.

    Returns:
        Tensor of shape ``(max_edges, feat_dim)`` holding the element-wise mean.

    Raises:
        RuntimeError: if ``graphs`` is empty (raised by ``torch.stack``).
    """
    # Computed once; it used to be re-evaluated for every graph inside the comprehension.
    max_edges = max((g.num_edges() for g in graphs), default=0)
    padded = [
        F.pad(g.edata["feat"], (0, 0, 0, max_edges - g.edata["feat"].size(0)))
        for g in graphs
    ]
    return torch.stack(padded).mean(0)

# Historical reference: position-wise mean of the edge features of all past windows
avg_feat = average_edge_features(graphs)

# Graph of the current (latest) window, with the same 2-column edge features
train_g = dgl.graph((current_graph_df["src_id"], current_graph_df["dst_id"]), num_nodes=len(service2id))
edge_feat = torch.tensor(current_graph_df["latency_norm"].to_numpy(), dtype=torch.float32).reshape(-1, 1)
rel_feat = torch.tensor(current_graph_df["rel_id"].to_numpy(), dtype=torch.float32).reshape(-1, 1)
train_g.edata["feat"] = torch.cat((edge_feat, rel_feat), dim=1)

# 8. Train GCN to reconstruct historical pattern
# The model receives the current window's edge features and is fitted, with an MSE loss,
# against the historical average features truncated to the prediction length.
# NOTE(review): GraphConv consumes one feature row per NODE, whereas
# train_g.edata["feat"] has one row per EDGE -- confirm this runs when the edge count
# differs from len(service2id).
# NOTE(review): `pred` has a single column while `avg_feat` has two, so the loss relies
# on broadcasting; rows are matched by position, not by (src, dst) pair -- confirm this
# is the intended target.
model = GCN(in_feats=2, hidden_feats=16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()

for epoch in range(20):
    model.train()
    pred = model(train_g, train_g.edata["feat"])
    loss = loss_fn(pred, avg_feat[:len(pred)])
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1} | Loss: {loss.item():.4f}")

# 9. Anomaly score: deviation from historical pattern
model.eval()
with torch.no_grad():
    pred = model(train_g, train_g.edata["feat"])
    # `pred` is (rows, 1) and the historical reference is (rows, 2): broadcast the
    # prediction against both feature columns and average the absolute deviation so
    # that each row gets exactly one score. (Squeezing both operands first produced
    # incompatible shapes (rows,) vs (rows, 2).) This mirrors the broadcasting used by
    # the MSE training loss.
    # NOTE(review): rows are matched by position and `avg_feat` is not extended when
    # the current window has more edges than any historical one -- confirm.
    scores = torch.abs(pred - avg_feat[:len(pred)]).mean(dim=1)
    # topk raises if k exceeds the number of available scores
    topk = torch.topk(scores, k=min(10, scores.numel()))
    print("\nTop-10 Anomalous Interactions:")
    for i in topk.indices.tolist():  # plain ints are valid positional indices for iloc
        src = current_graph_df.iloc[i]["source_workload"]
        dst = current_graph_df.iloc[i]["destination_workload"]
        r = current_graph_df.iloc[i]["relation"]
        print(f"{src} ‚Üí {dst} | relation: {r} | anomaly score: {scores[i].item():.4f}")


In [None]:
# Visualizaci√≥n con NetworkX y Matplotlib
import networkx as nx
import matplotlib.pyplot as plt

# Create the directed graph
G = nx.DiGraph()

# Add one node per service id
for node_id in service2id.values():
    G.add_node(node_id)

# Add edges with the anomaly score as an attribute.
# `scores` is ordered by row POSITION in current_graph_df, while the frame keeps the
# index labels of the original dataset, so the rows are enumerated positionally instead
# of using the iterrows() label as the index into `scores`.
# A DiGraph keeps a single edge per (src, dst) pair: repeated pairs overwrite the score.
for i, (src_name, dst_name) in enumerate(
    zip(current_graph_df["source_workload"], current_graph_df["destination_workload"])
):
    src = service2id[src_name]
    dst = service2id[dst_name]
    score = scores[i].item()
    G.add_edge(src, dst, score=score)

# Node positions (fixed seed for a reproducible layout)
pos = nx.spring_layout(G, seed=42)

# Edge colors and widths: edges scoring above the 90th percentile are highlighted
threshold = torch.quantile(scores, 0.90).item()
edge_colors = []
edge_widths = []

for u, v, s in G.edges(data='score'):
    is_anomalous = s > threshold
    edge_colors.append('red' if is_anomalous else 'gray')
    edge_widths.append(2.5 if is_anomalous else 0.8)

# Node labels: map each node id back to its service name
id2service = {idx: name for name, idx in service2id.items()}
node_labels = {node: id2service[node] for node in G.nodes()}

# Draw the graph
plt.figure(figsize=(14, 10))
nx.draw_networkx_nodes(G, pos, node_size=700, node_color='skyblue')
nx.draw_networkx_edges(G, pos, edge_color=edge_colors, width=edge_widths, arrows=True)
nx.draw_networkx_labels(G, pos, labels=node_labels, font_size=10)

plt.title("KGroot-based Anomaly Graph\n(Red edges are anomalous)", fontsize=14)
plt.axis("off")
plt.show()
