In [1]:
# Kubernetes LSTM Disaster Recovery System
# Predicting CPU and Memory Usage for Pod-level Monitoring

import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from datetime import datetime, timedelta
import requests
import json
import pickle
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
# Seeds the PyTorch and NumPy global RNGs; Python's built-in `random` module is not seeded here.
torch.manual_seed(42)
np.random.seed(42)

# Report the runtime so results can be tied to a library version and device.
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

PyTorch version: 2.6.0+cu124
CUDA available: False


In [None]:
# =============================================================================
# 1. DATA LOADING AND PREPROCESSING
# =============================================================================

class DataProcessor:
    """Load the Kubernetes metrics CSV and turn it into LSTM-ready windows.

    Intended call order: ``load_and_preprocess`` -> ``resample_to_5min`` ->
    ``prepare_sequences`` -> ``normalize_data``.

    NOTE(review): this notebook defines ``DataProcessor`` a second time in a
    later cell; on Restart & Run All the later definition shadows this one.
    Keep a single copy.
    """

    # Label attached to every window: windows are cut from the cluster-wide
    # resampled series, so there is no single pod to attribute them to.
    AGGREGATE_LABEL = "all_pods"

    def __init__(self, csv_path):
        """Remember the CSV location and create the (unfitted) scalers."""
        self.csv_path = csv_path
        self.scaler_features = StandardScaler()
        self.scaler_targets = MinMaxScaler()

    def load_and_preprocess(self):
        """Read the CSV, parse timestamps and derive the two target columns.

        Returns:
            DataFrame with a datetime ``timestamp`` column plus
            ``pod_cpu_percentage`` / ``pod_memory_percentage`` clipped to [0, 100].
        """
        print("Loading data from CSV...")
        df = pd.read_csv(self.csv_path)

        # The format string is day-first (e.g. "01/01/2023 00:00").
        df['timestamp'] = pd.to_datetime(df['timestamp'], format='%d/%m/%Y %H:%M')

        # Derive pod-level CPU and memory percentages
        print("Deriving pod-level CPU and memory percentages...")
        pod_cpu = df['cpu_allocation_efficiency'] * df['node_cpu_usage']
        pod_memory = df['memory_allocation_efficiency'] * df['node_memory_usage']

        # Clip values to reasonable ranges (0-100% for percentages)
        df['pod_cpu_percentage'] = np.clip(pod_cpu, 0, 100)
        df['pod_memory_percentage'] = np.clip(pod_memory, 0, 100)

        print(f"Data shape: {df.shape}")
        print(f"Unique pods: {df['pod_name'].nunique()}")
        print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")

        return df

    def resample_to_5min(self, df):
        """Aggregate all rows into 5-minute bins (mean of the numeric metrics).

        Args:
            df: frame with a datetime ``timestamp`` column (or a DatetimeIndex)
                and the numeric metric columns listed below.

        Returns:
            A new frame with one row per 5-minute bin, a ``timestamp`` column,
            the averaged metrics and, when present in the input, the first
            ``event_type`` seen in each bin. Bins containing any NaN are
            dropped. The input frame is not modified.
        """
        print("Resampling data from 1-minute to 5-minute intervals...")

        # Resample numeric columns
        numeric_cols = ['cpu_allocation_efficiency', 'memory_allocation_efficiency',
                        'disk_io', 'network_latency', 'node_temperature',
                        'node_cpu_usage', 'node_memory_usage', 'pod_lifetime_seconds',
                        'pod_cpu_percentage', 'pod_memory_percentage']

        # resample() only works on a datetime-like index, so promote the column.
        if 'timestamp' in df.columns:
            indexed = df.set_index('timestamp')
        else:
            indexed = df.rename_axis('timestamp')

        # '5min' is the non-deprecated spelling of the old '5T' alias.
        bins = indexed.sort_index().resample('5min')

        resampled = bins[numeric_cols].mean()
        if 'event_type' in indexed.columns:
            resampled['event_type'] = bins['event_type'].first()

        # reset_index() restores 'timestamp' as a column for prepare_sequences.
        return resampled.dropna().reset_index()

    def prepare_sequences(self, df, sequence_length=24, prediction_horizon=12):
        """
        Prepare sequences for LSTM training
        sequence_length: Number of past time steps to use (24 = 2 hours of 5-min data)
        prediction_horizon: Number of future steps to predict (12 = 1 hour)

        Returns ``(X, y, pod_names, timestamps)`` where ``X`` has shape
        ``(n, sequence_length, 8)``, ``y`` has shape
        ``(n, prediction_horizon, 2)`` and ``timestamps[k]`` is the time of the
        first predicted step of window ``k``. ``n`` is 0 when the frame is too
        short; the arrays keep their rank in that case.
        """
        print(f"Preparing sequences (seq_len={sequence_length}, pred_horizon={prediction_horizon})...")

        # Feature columns (excluding targets and identifiers)
        feature_cols = ['cpu_allocation_efficiency', 'memory_allocation_efficiency',
                        'disk_io', 'network_latency', 'node_temperature',
                        'node_cpu_usage', 'node_memory_usage', 'pod_lifetime_seconds']

        # Target columns
        target_cols = ['pod_cpu_percentage', 'pod_memory_percentage']

        # Accept the timestamp either as a column or as the index.
        if 'timestamp' in df.columns:
            series_df = df
        else:
            series_df = df.rename_axis('timestamp').reset_index()
        series_df = series_df.sort_values('timestamp')

        features = series_df[feature_cols].values
        targets = series_df[target_cols].values
        stamps = series_df['timestamp'].tolist()

        sequences_X, sequences_y, pod_names, timestamps = [], [], [], []

        # Create sequences
        n_windows = max(len(series_df) - sequence_length - prediction_horizon + 1, 0)
        for start in range(n_windows):
            horizon_start = start + sequence_length
            sequences_X.append(features[start:horizon_start])
            sequences_y.append(targets[horizon_start:horizon_start + prediction_horizon])
            pod_names.append(self.AGGREGATE_LABEL)
            timestamps.append(stamps[horizon_start])

        if sequences_X:
            sequences_X = np.stack(sequences_X)
            sequences_y = np.stack(sequences_y)
        else:
            # Keep the rank stable so callers can still read .shape[2] etc.
            sequences_X = np.empty((0, sequence_length, len(feature_cols)))
            sequences_y = np.empty((0, prediction_horizon, len(target_cols)))

        print(f"Created {len(sequences_X)} sequences")
        print(f"Input shape: {sequences_X.shape}")
        print(f"Output shape: {sequences_y.shape}")

        return sequences_X, sequences_y, np.array(pod_names), np.array(timestamps)

    def normalize_data(self, X_train, y_train, X_val=None, y_val=None):
        """Normalize features and targets.

        The scalers are fitted on the training arrays only and then applied to
        the validation arrays. Returns ``(X_train, y_train)`` scaled, or
        ``(X_train, y_train, X_val, y_val)`` when both validation arrays are given.
        """
        print("Normalizing data...")

        def scale(transform, batch):
            # Scalers expect 2-D input: flatten the time axis, then restore it.
            flat = batch.reshape(-1, batch.shape[-1])
            return transform(flat).reshape(batch.shape)

        # Fit scalers on training data
        X_train_scaled = scale(self.scaler_features.fit_transform, X_train)
        y_train_scaled = scale(self.scaler_targets.fit_transform, y_train)

        if X_val is not None and y_val is not None:
            X_val_scaled = scale(self.scaler_features.transform, X_val)
            y_val_scaled = scale(self.scaler_targets.transform, y_val)
            return X_train_scaled, y_train_scaled, X_val_scaled, y_val_scaled

        return X_train_scaled, y_train_scaled

In [18]:
# =============================================================================
# 1. DATA LOADING AND PREPROCESSING
# =============================================================================

class DataProcessor:
    """Turn the raw Kubernetes metrics CSV into scaled LSTM training windows.

    Steps, in order: ``load_and_preprocess`` reads the CSV and derives the
    targets, ``resample_to_5min`` bins the series, ``prepare_sequences`` cuts
    sliding windows, ``normalize_data`` fits/applies the scalers.
    """

    # Pod label used for windows cut from a frame that has no 'pod_name'
    # column (e.g. the cluster-wide output of resample_to_5min).
    AGGREGATE_LABEL = "all_pods"

    NUMERIC_COLS = ['cpu_allocation_efficiency', 'memory_allocation_efficiency',
                    'disk_io', 'network_latency', 'node_temperature',
                    'node_cpu_usage', 'node_memory_usage', 'pod_lifetime_seconds',
                    'pod_cpu_percentage', 'pod_memory_percentage']

    def __init__(self, csv_path):
        """Store the CSV path and create unfitted feature/target scalers."""
        self.csv_path = csv_path
        self.scaler_features = StandardScaler()
        self.scaler_targets = MinMaxScaler()

    def load_and_preprocess(self):
        """Load CSV and perform initial preprocessing.

        Parses ``timestamp`` with the day-first format ``%d/%m/%Y %H:%M`` and
        adds ``pod_cpu_percentage`` / ``pod_memory_percentage`` (allocation
        efficiency times node usage, clipped to [0, 100]).
        """
        print("Loading data from CSV...")
        df = pd.read_csv(self.csv_path)

        # Convert timestamp to datetime
        df['timestamp'] = pd.to_datetime(df['timestamp'], format='%d/%m/%Y %H:%M')

        # Derive pod-level CPU and memory percentages
        print("Deriving pod-level CPU and memory percentages...")
        for target, efficiency, usage in (
            ('pod_cpu_percentage', 'cpu_allocation_efficiency', 'node_cpu_usage'),
            ('pod_memory_percentage', 'memory_allocation_efficiency', 'node_memory_usage'),
        ):
            # Clip values to reasonable ranges (0-100% for percentages)
            df[target] = np.clip(df[efficiency] * df[usage], 0, 100)

        print(f"Data shape: {df.shape}")
        print(f"Unique pods: {df['pod_name'].nunique()}")
        print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")

        return df

    def resample_to_5min(self, df):
        """Resample 1-minute data to 5-minute intervals.

        All rows are pooled (no per-pod grouping): every numeric metric is
        averaged per 5-minute bin and, if the input has an ``event_type``
        column, the first value in each bin is kept. Bins with any NaN are
        dropped.

        Args:
            df: frame with a datetime ``timestamp`` column or a DatetimeIndex.

        Returns:
            New frame with a ``timestamp`` column and one row per bin; the
            input is left untouched.
        """
        print("Resampling data from 1-minute to 5-minute intervals...")

        # resample() needs the timestamps on the index, not in a column.
        by_time = df.set_index('timestamp') if 'timestamp' in df.columns else df.rename_axis('timestamp')
        # '5min' replaces the deprecated '5T' offset alias.
        five_min = by_time.sort_index().resample('5min')

        resampled = five_min[self.NUMERIC_COLS].mean()
        if 'event_type' in by_time.columns:
            resampled['event_type'] = five_min['event_type'].first()

        # Hand back 'timestamp' as a regular column: prepare_sequences reads it.
        return resampled.dropna().reset_index()

    def prepare_sequences(self, df, sequence_length=24, prediction_horizon=12):
        """
        Prepare sequences for LSTM training
        sequence_length: Number of past time steps to use (24 = 2 hours of 5-min data)
        prediction_horizon: Number of future steps to predict (12 = 1 hour)

        When ``df`` has a ``pod_name`` column, windows are cut per pod and pods
        with fewer than ``sequence_length + prediction_horizon`` rows are
        skipped. Otherwise the whole frame is treated as one series labelled
        ``AGGREGATE_LABEL``.

        Returns ``(X, y, pod_names, timestamps)``; ``X`` is
        ``(n, sequence_length, 8)``, ``y`` is ``(n, prediction_horizon, 2)``,
        and ``timestamps[k]`` is the time of window ``k``'s first target step.
        """
        print(f"Preparing sequences (seq_len={sequence_length}, pred_horizon={prediction_horizon})...")

        # Feature columns (excluding targets and identifiers)
        feature_cols = ['cpu_allocation_efficiency', 'memory_allocation_efficiency',
                        'disk_io', 'network_latency', 'node_temperature',
                        'node_cpu_usage', 'node_memory_usage', 'pod_lifetime_seconds']

        # Target columns
        target_cols = ['pod_cpu_percentage', 'pod_memory_percentage']

        frame = df if 'timestamp' in df.columns else df.rename_axis('timestamp').reset_index()

        if 'pod_name' in frame.columns:
            # sort=False keeps pods in first-appearance order; one pass over
            # the frame instead of one boolean scan per pod.
            series_groups = frame.groupby('pod_name', sort=False)
        else:
            series_groups = [(self.AGGREGATE_LABEL, frame)]

        min_rows = sequence_length + prediction_horizon
        sequences_X, sequences_y, pod_names, timestamps = [], [], [], []

        for pod_name, pod_df in series_groups:
            if len(pod_df) < min_rows:
                continue  # Skip pods with insufficient data

            pod_df = pod_df.sort_values('timestamp')
            features = pod_df[feature_cols].values
            targets = pod_df[target_cols].values
            stamps = pod_df['timestamp'].tolist()

            # Create sequences
            for start in range(len(pod_df) - min_rows + 1):
                first_target = start + sequence_length
                sequences_X.append(features[start:first_target])
                sequences_y.append(targets[first_target:first_target + prediction_horizon])
                pod_names.append(pod_name)
                timestamps.append(stamps[first_target])

        if sequences_X:
            sequences_X = np.stack(sequences_X)
            sequences_y = np.stack(sequences_y)
        else:
            # No usable series: return empty arrays that still have rank 3.
            sequences_X = np.empty((0, sequence_length, len(feature_cols)))
            sequences_y = np.empty((0, prediction_horizon, len(target_cols)))

        print(f"Created {len(sequences_X)} sequences")
        print(f"Input shape: {sequences_X.shape}")
        print(f"Output shape: {sequences_y.shape}")

        return sequences_X, sequences_y, np.array(pod_names), np.array(timestamps)

    @staticmethod
    def _scale(transform, batch):
        """Apply a 2-D scaler method to a 3-D batch and restore its shape."""
        flat = batch.reshape(-1, batch.shape[-1])
        return transform(flat).reshape(batch.shape)

    def normalize_data(self, X_train, y_train, X_val=None, y_val=None):
        """Normalize features and targets.

        Scalers are fitted on the training arrays only. Returns two arrays, or
        four when both ``X_val`` and ``y_val`` are supplied.
        """
        print("Normalizing data...")

        # Fit scalers on training data
        X_train_scaled = self._scale(self.scaler_features.fit_transform, X_train)
        y_train_scaled = self._scale(self.scaler_targets.fit_transform, y_train)

        if X_val is None or y_val is None:
            return X_train_scaled, y_train_scaled

        # Validation data reuses the statistics learned from training data.
        X_val_scaled = self._scale(self.scaler_features.transform, X_val)
        y_val_scaled = self._scale(self.scaler_targets.transform, y_val)
        return X_train_scaled, y_train_scaled, X_val_scaled, y_val_scaled

In [2]:
 def load_and_preprocess(csv_path):
     """Read the metrics CSV at ``csv_path`` and parse its timestamps.

     The ``timestamp`` column is converted with the day-first format
     ``%d/%m/%Y %H:%M``; every other column is returned as read.
     """
     print("Loading data from CSV...")
     raw = pd.read_csv(csv_path)

     # Swap the string column for its datetime version, keeping column order.
     parsed = pd.to_datetime(raw['timestamp'], format='%d/%m/%Y %H:%M')
     return raw.assign(timestamp=parsed)

In [None]:
# Exploratory read of the raw CSV straight through pandas; `timestamp` is
# converted to datetime in a separate cell further down.
# NOTE(review): this path differs from the one passed to DataProcessor in a
# later cell — confirm which location is the current one.
csv_path = "/data/kaggle/input/kubernetes_performance_metrics_dataset.csv"
df = pd.read_csv(csv_path)

df.head()

Unnamed: 0,timestamp,pod_name,namespace,cpu_allocation_efficiency,memory_allocation_efficiency,disk_io,network_latency,node_temperature,node_cpu_usage,node_memory_usage,event_type,event_message,scaling_event,pod_lifetime_seconds
0,01/01/2023 00:00,pod_0,dev,0.038162,0.949259,9.993579,13.722542,77.619073,93.177619,37.900532,Warning,Killed,False,119648
1,01/01/2023 00:00,pod_1,default,0.500763,0.048543,935.792442,55.493953,84.182245,61.442289,5.208161,Error,Failed,True,144516
2,01/01/2023 00:00,pod_2,kube-system,0.746726,0.447345,328.352359,173.910016,21.295244,55.819311,18.335802,Normal,Completed,True,68857
3,01/01/2023 00:00,pod_3,default,0.526692,0.870251,778.297708,67.395729,85.028829,78.968463,94.619689,Warning,OOMKilled,True,72080
4,01/01/2023 00:00,pod_4,prod,0.425342,0.885459,711.181295,91.72473,29.157695,52.718141,70.770594,Error,Killed,False,123016


In [6]:
# Parse the day-first timestamp strings into datetimes, overwriting the column on the global `df`.
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%d/%m/%Y %H:%M')

In [8]:
# Show the last rows to see where the recorded time range ends.
df.tail()

Unnamed: 0,timestamp,pod_name,namespace,cpu_allocation_efficiency,memory_allocation_efficiency,disk_io,network_latency,node_temperature,node_cpu_usage,node_memory_usage,event_type,event_message,scaling_event,pod_lifetime_seconds
14995,2023-01-01 04:09:00,pod_14995,kube-system,0.020767,0.697208,379.511285,147.03129,66.820729,23.68171,65.269283,Normal,Started,True,111871
14996,2023-01-01 04:09:00,pod_14996,default,0.02649,0.973705,93.014634,80.106794,21.358463,22.612871,48.674617,Normal,Started,False,64848
14997,2023-01-01 04:09:00,pod_14997,dev,0.321295,0.073787,112.686558,83.62158,70.648406,74.516728,76.802551,Error,Failed,True,126843
14998,2023-01-01 04:09:00,pod_14998,kube-system,0.087156,0.322506,804.890194,158.398994,41.005118,58.788146,53.529527,Warning,Started,False,137157
14999,2023-01-01 04:09:00,pod_14999,default,0.094542,0.052845,624.286513,74.228015,21.776262,14.834679,32.077468,Normal,Killed,True,112793


In [10]:
# Number of distinct pod names; compare with the row count to see how many rows each pod has.
df["pod_name"].nunique()

15000

In [None]:
# NOTE(review): a different dataset path than the exploratory cell earlier in
# the notebook — confirm which one is current.
csv_path = "/kaggle/input/kubernetes-resource-and-performancemetricsallocation/kubernetes_performance_metrics_dataset.csv"
processor = DataProcessor(csv_path)

# Load and preprocess data
# Rebinds the global `df` to the frame returned by the processor.
df = processor.load_and_preprocess()

# Resample to 5-minute intervals
df_resampled = processor.resample_to_5min(df)

Loading data from CSV...
Deriving pod-level CPU and memory percentages...
Data shape: (15000, 16)
Unique pods: 15000
Date range: 2023-01-01 00:00:00 to 2023-01-01 04:09:00
Resampling data from 1-minute to 5-minute intervals...
Resampled data shape: (15000, 13)


In [None]:
# First rows of the 5-minute resampled frame.
df_resampled.head()

In [None]:
# First rows of the preprocessed (not resampled) frame, for comparison.
df.head()

In [None]:
# The cell held a truncated identifier (`df_resa`), which raises NameError on
# Run All; show the end of the resampled frame instead.
df_resampled.tail()

In [None]:




# Prepare sequences
# Uses the method defaults for window length and forecast horizon.
X, y, pod_names, timestamps = processor.prepare_sequences(df_resampled)

# Train/validation split (80/20)
# Positional split without shuffling: the first 80% of windows train, the rest validate.
split_idx = int(0.8 * len(X))

X_train, X_val = X[:split_idx], X[split_idx:]
y_train, y_val = y[:split_idx], y[split_idx:]

# Normalize data
# Scalers are fitted on the training windows and only applied to the validation windows.
X_train_scaled, y_train_scaled, X_val_scaled, y_val_scaled = processor.normalize_data(
    X_train, y_train, X_val, y_val
)


In [None]:
# =============================================================================
# 2. LSTM MODEL ARCHITECTURE
# =============================================================================

class MultiOutputLSTM(nn.Module):
    """Stacked LSTM that predicts the whole forecast horizon in one shot.

    The last time step's hidden output is passed through dropout and a single
    linear layer producing ``output_size * prediction_horizon`` values, which
    are reshaped to ``(batch, prediction_horizon, output_size)``.

    Args:
        input_size: number of features per time step.
        hidden_size: LSTM hidden width.
        num_layers: number of stacked LSTM layers.
        output_size: number of predicted quantities per future step.
        prediction_horizon: number of future steps predicted.
        dropout: dropout probability (between LSTM layers when
            ``num_layers > 1``, and before the linear head).
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size, prediction_horizon, dropout=0.2):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.prediction_horizon = prediction_horizon
        self.output_size = output_size

        # LSTM layers
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # nn.LSTM only applies dropout between layers, so it is
            # meaningless (and warns) for a single layer.
            dropout=dropout if num_layers > 1 else 0
        )

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

        # Output layer that produces all future predictions at once
        self.fc = nn.Linear(hidden_size, output_size * prediction_horizon)

        # Initialize weights
        self._initialize_weights()

    def _initialize_weights(self):
        """Xavier for input weights, orthogonal for recurrent weights, zero biases.

        Matching is by parameter name, so ``fc.bias`` is zeroed too while
        ``fc.weight`` keeps PyTorch's default initialisation.
        """
        for name, param in self.named_parameters():
            if 'weight_ih' in name:
                nn.init.xavier_uniform_(param)
            elif 'weight_hh' in name:
                nn.init.orthogonal_(param)
            elif 'bias' in name:
                nn.init.zeros_(param)

    def forward(self, x):
        """Map ``x`` of shape ``(batch, seq_len, input_size)`` to
        ``(batch, prediction_horizon, output_size)``."""
        # No explicit (h0, c0): nn.LSTM defaults both to zeros that match the
        # input's dtype and device, so float64 / non-CPU inputs work too.
        lstm_out, _ = self.lstm(x)

        # Use the last output
        last_output = self.dropout(lstm_out[:, -1, :])

        # Generate predictions
        output = self.fc(last_output)

        # Reshape to (batch_size, prediction_horizon, output_size)
        return output.view(x.size(0), self.prediction_horizon, self.output_size)

# =============================================================================
# 3. TRAINING PIPELINE
# =============================================================================

class LSTMTrainer:
    """Train a model with Adam, LR-on-plateau scheduling and early stopping."""

    def __init__(self, model, device='cpu'):
        """Move ``model`` to ``device`` and start an empty loss history."""
        self.model = model.to(device)
        self.device = device
        self.training_history = {'train_loss': [], 'val_loss': []}

    def train_model(self, X_train, y_train, X_val, y_val, epochs=100, batch_size=32, lr=0.001,
                    checkpoint_path='best_model.pth'):
        """Train the LSTM model.

        Args:
            X_train, y_train, X_val, y_val: array-likes convertible to float32
                tensors; they are moved to the trainer's device in full.
            epochs: maximum number of epochs.
            batch_size: mini-batch size for the shuffled training loader.
            lr: initial Adam learning rate.
            checkpoint_path: file that receives the best weights each time the
                validation loss improves; ``None`` disables writing to disk.

        Returns:
            ``self.training_history`` (per-epoch train/val losses; appended to
            across calls).

        The weights with the lowest validation loss are restored at the end
        from an in-memory copy, so a missing or stale checkpoint file can never
        be loaded by mistake (e.g. with ``epochs=0`` or a NaN loss).
        """
        print(f"Training model on {self.device}...")

        # Convert to tensors
        X_train, y_train, X_val, y_val = (
            torch.as_tensor(data, dtype=torch.float32).to(self.device)
            for data in (X_train, y_train, X_val, y_val)
        )

        # Create data loaders
        train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        # Optimizer and loss function
        optimizer = optim.Adam(self.model.parameters(), lr=lr, weight_decay=1e-5)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10, factor=0.5)
        criterion = nn.MSELoss()

        best_val_loss = float('inf')
        best_state = None
        patience = 15
        patience_counter = 0

        for epoch in range(epochs):
            # Training
            self.model.train()
            train_loss = 0.0

            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                predictions = self.model(batch_X)
                loss = criterion(predictions, batch_y)
                loss.backward()

                # Gradient clipping
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)

                optimizer.step()
                train_loss += loss.item()

            # Validation
            self.model.eval()
            with torch.no_grad():
                val_predictions = self.model(X_val)
                val_loss = criterion(val_predictions, y_val).item()

            # Learning rate scheduling
            scheduler.step(val_loss)

            # Track history (max(..., 1) avoids dividing by zero on an empty loader)
            avg_train_loss = train_loss / max(len(train_loader), 1)
            self.training_history['train_loss'].append(avg_train_loss)
            self.training_history['val_loss'].append(val_loss)

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                # Snapshot the best weights; clone so later steps don't alter them.
                best_state = {
                    key: value.detach().clone()
                    for key, value in self.model.state_dict().items()
                }
                if checkpoint_path is not None:
                    torch.save(best_state, checkpoint_path)
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f"Early stopping at epoch {epoch+1}")
                    break

            if (epoch + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], Train Loss: {avg_train_loss:.6f}, Val Loss: {val_loss:.6f}')

        # Load best model (only if at least one epoch produced a usable loss)
        if best_state is not None:
            self.model.load_state_dict(best_state)
        print("Training completed!")

        return self.training_history

    def plot_training_history(self):
        """Plot training and validation loss"""
        plt.figure(figsize=(10, 6))
        plt.plot(self.training_history['train_loss'], label='Training Loss')
        plt.plot(self.training_history['val_loss'], label='Validation Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title('Training History')
        plt.legend()
        plt.grid(True)
        plt.show()

In [None]:
# =============================================================================
# 4. DISASTER PREDICTION AND ALERTING SYSTEM
# =============================================================================

class DisasterPredictor:
    """Forecast pod CPU/memory with a trained model and raise threshold alerts.

    Args:
        model: trained forecasting module; put into eval mode here.
        scaler_features: fitted scaler applied to model inputs.
        scaler_targets: fitted scaler whose ``inverse_transform`` maps model
            outputs back to the original target units.
        threshold: alert threshold as a fraction (0.7 means 70%).
        device: device the model runs on.
    """

    # Minutes represented by one prediction step.
    STEP_MINUTES = 5
    # Predicted percentage above which an alert is 'HIGH' instead of 'MEDIUM'.
    HIGH_SEVERITY_PCT = 90

    def __init__(self, model, scaler_features, scaler_targets, threshold=0.7, device='cpu'):
        self.model = model.to(device)
        self.scaler_features = scaler_features
        self.scaler_targets = scaler_targets
        self.threshold = threshold
        self.device = device
        self.model.eval()

    def predict_single_pod(self, pod_data):
        """
        Predict future CPU/Memory usage for a single pod
        pod_data: numpy array of shape (sequence_length, n_features)

        Returns an array of shape (prediction_horizon, n_targets) in the units
        the target scaler was fitted on.
        """
        # Normalize input
        pod_data_normalized = self.scaler_features.transform(pod_data)

        # Convert to tensor and add the batch axis
        X = torch.as_tensor(pod_data_normalized, dtype=torch.float32).unsqueeze(0).to(self.device)

        with torch.no_grad():
            # [0] removes only the batch axis; a blanket squeeze() would also
            # collapse a horizon or target axis of length 1.
            predictions = self.model(X)[0].cpu().numpy()

        # Denormalize predictions
        predictions_reshaped = predictions.reshape(-1, predictions.shape[-1])
        predictions_denormalized = self.scaler_targets.inverse_transform(predictions_reshaped)
        return predictions_denormalized.reshape(predictions.shape)

    def _first_breach_alert(self, metric, values, pod_name, current_time):
        """Return the alert dict for the first value above the threshold, or None."""
        threshold_pct = self.threshold * 100
        breach_steps = np.flatnonzero(values > threshold_pct)
        if breach_steps.size == 0:
            return None

        # Plain int/float: timedelta() rejects numpy integer scalars.
        first_breach = int(breach_steps[0])
        predicted_value = float(values[first_breach])
        # NOTE(review): step 0 is reported as 0 minutes away; if step 0 is the
        # first *future* interval this should be (first_breach + 1) — confirm.
        time_to_breach = first_breach * self.STEP_MINUTES

        return {
            'pod_name': pod_name,
            'metric': metric,
            'predicted_value': predicted_value,
            'threshold': threshold_pct,
            'time_until_breach_minutes': time_to_breach,
            'breach_time': current_time + timedelta(minutes=time_to_breach),
            'severity': 'HIGH' if predicted_value > self.HIGH_SEVERITY_PCT else 'MEDIUM'
        }

    def check_disaster_conditions(self, predictions, pod_name, current_time, values_are_fractions=None):
        """
        Check if predictions exceed disaster threshold
        Returns alert information if disaster conditions are met

        Args:
            predictions: array of shape (steps, 2); column 0 is CPU, column 1
                is memory. The caller's array is never modified.
            pod_name: label copied into each alert.
            current_time: datetime the breach offsets are added to.
            values_are_fractions: True if predictions are on a 0-1 scale,
                False if already 0-100. ``None`` (default) auto-detects:
                values are treated as fractions only when *every* value of
                both metrics is <= 1.0, so both columns always share units.

        Returns:
            List with at most one alert dict per metric (CPU first).
        """
        # Work on a float copy so scaling never leaks back into the caller's array.
        values = np.array(predictions, dtype=float)
        if values.size == 0:
            return []

        # Convert to percentage (0-1 to 0-100 if needed)
        # NOTE(review): auto-detection cannot tell genuine sub-1% usage from
        # fractions; pass values_are_fractions=False when units are known.
        if values_are_fractions is None:
            values_are_fractions = values.max() <= 1.0
        if values_are_fractions:
            values = values * 100

        alerts = []
        for metric, column in (('CPU', 0), ('Memory', 1)):
            alert = self._first_breach_alert(metric, values[:, column], pod_name, current_time)
            if alert is not None:
                alerts.append(alert)

        return alerts

    def send_slack_alert(self, alerts, webhook_url, timeout=10):
        """Send disaster alert to Slack.

        Posts one message per alert to ``webhook_url``. Failures are printed,
        not raised, so one bad request does not stop the remaining alerts.
        ``timeout`` (seconds) keeps an unresponsive endpoint from blocking
        the caller indefinitely.
        """
        if not alerts:
            return

        for alert in alerts:
            color = '#ff0000' if alert['severity'] == 'HIGH' else '#ff9900'

            message = {
                "attachments": [
                    {
                        "color": color,
                        "title": f"🚨 K8s Disaster Alert - {alert['severity']} Priority",
                        "fields": [
                            {
                                "title": "Pod Name",
                                "value": alert['pod_name'],
                                "short": True
                            },
                            {
                                "title": "Metric",
                                "value": alert['metric'],
                                "short": True
                            },
                            {
                                "title": "Predicted Value",
                                "value": f"{alert['predicted_value']:.1f}%",
                                "short": True
                            },
                            {
                                "title": "Threshold",
                                "value": f"{alert['threshold']:.0f}%",
                                "short": True
                            },
                            {
                                "title": "Time Until Breach",
                                "value": f"{alert['time_until_breach_minutes']} minutes",
                                "short": True
                            },
                            {
                                "title": "Expected Breach Time",
                                "value": alert['breach_time'].strftime('%Y-%m-%d %H:%M:%S'),
                                "short": True
                            }
                        ],
                        "footer": "K8s LSTM Disaster Recovery System",
                        "ts": int(datetime.now().timestamp())
                    }
                ]
            }

            try:
                response = requests.post(webhook_url, json=message, timeout=timeout)
                response.raise_for_status()
                print(f"Alert sent for {alert['pod_name']} - {alert['metric']}")
            except requests.exceptions.RequestException as e:
                print(f"Failed to send Slack alert: {e}")

# =============================================================================
# 5. MAIN EXECUTION PIPELINE
# =============================================================================

def main_pipeline(csv_path, slack_webhook_url=None, threshold=0.7):
    """Main execution pipeline for disaster recovery system.

    Loads and windows the data, trains the LSTM, reports validation error,
    runs one sample through the alerting path and saves the model artefacts
    (``k8s_lstm_model.pth``, ``feature_scaler.pkl``, ``target_scaler.pkl``,
    ``model_config.json``) into the working directory.

    Args:
        csv_path: path of the metrics CSV.
        slack_webhook_url: if given, alerts for the sample are posted there.
        threshold: alert threshold as a fraction; used for the predictor and
            written to the saved config.

    Returns:
        ``(model, processor, disaster_predictor)``.

    Raises:
        ValueError: if fewer than two sequences can be built, because no
            train/validation split is possible.
    """

    print("=" * 60)
    print("KUBERNETES LSTM DISASTER RECOVERY SYSTEM")
    print("=" * 60)

    # 1. Data Processing
    processor = DataProcessor(csv_path)

    # Load and preprocess data
    df = processor.load_and_preprocess()

    # Resample to 5-minute intervals
    df_resampled = processor.resample_to_5min(df)

    # Prepare sequences
    X, y, pod_names, timestamps = processor.prepare_sequences(df_resampled)

    # Train/validation split (80/20)
    split_idx = int(0.8 * len(X))
    if split_idx == 0:
        # Fail here with a clear message instead of deep inside the scaler.
        raise ValueError(
            f"Need at least 2 sequences for a train/validation split, got {len(X)}; "
            "the data is too short for the requested sequence length and horizon."
        )

    X_train, X_val = X[:split_idx], X[split_idx:]
    y_train, y_val = y[:split_idx], y[split_idx:]

    # Normalize data
    X_train_scaled, y_train_scaled, X_val_scaled, y_val_scaled = processor.normalize_data(
        X_train, y_train, X_val, y_val
    )

    print(f"Training samples: {len(X_train_scaled)}")
    print(f"Validation samples: {len(X_val_scaled)}")

    # 2. Model Training
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Model parameters: the data-dependent ones are read from the prepared
    # arrays so they cannot drift from what prepare_sequences produced.
    input_size = X_train_scaled.shape[2]  # Number of features
    hidden_size = 64
    num_layers = 2
    output_size = y_train_scaled.shape[2]  # CPU and Memory
    prediction_horizon = y_train_scaled.shape[1]  # Number of predicted steps

    # Initialize model
    model = MultiOutputLSTM(
        input_size=input_size,
        hidden_size=hidden_size,
        num_layers=num_layers,
        output_size=output_size,
        prediction_horizon=prediction_horizon,
        dropout=0.3
    )

    # Train model
    trainer = LSTMTrainer(model, device)
    trainer.train_model(
        X_train_scaled, y_train_scaled,
        X_val_scaled, y_val_scaled,
        epochs=100, batch_size=32, lr=0.001
    )

    # Plot training history
    trainer.plot_training_history()

    # 3. Model Evaluation
    print("\n" + "=" * 40)
    print("MODEL EVALUATION")
    print("=" * 40)

    # Make predictions on validation set
    model.eval()
    with torch.no_grad():
        X_val_tensor = torch.FloatTensor(X_val_scaled).to(device)
        val_predictions = model(X_val_tensor).cpu().numpy()

    # Denormalize predictions; the unscaled targets are already at hand, so
    # there is no need to round-trip them through the scaler.
    val_pred_denorm = processor.scaler_targets.inverse_transform(
        val_predictions.reshape(-1, val_predictions.shape[-1])
    )
    val_true_denorm = y_val.reshape(-1, y_val.shape[-1])

    # Calculate metrics
    mse_cpu = mean_squared_error(val_true_denorm[:, 0], val_pred_denorm[:, 0])
    mae_cpu = mean_absolute_error(val_true_denorm[:, 0], val_pred_denorm[:, 0])

    mse_memory = mean_squared_error(val_true_denorm[:, 1], val_pred_denorm[:, 1])
    mae_memory = mean_absolute_error(val_true_denorm[:, 1], val_pred_denorm[:, 1])

    print(f"CPU Prediction - MSE: {mse_cpu:.4f}, MAE: {mae_cpu:.4f}")
    print(f"Memory Prediction - MSE: {mse_memory:.4f}, MAE: {mae_memory:.4f}")

    # 4. Disaster Prediction System
    print("\n" + "=" * 40)
    print("DISASTER PREDICTION SYSTEM")
    print("=" * 40)

    disaster_predictor = DisasterPredictor(
        model=model,
        scaler_features=processor.scaler_features,
        scaler_targets=processor.scaler_targets,
        threshold=threshold,
        device=device
    )

    # Test disaster prediction on a sample
    if len(X_val) > 0:
        sample_idx = 0
        sample_pod = pod_names[split_idx + sample_idx]
        # Unscaled window: predict_single_pod applies the feature scaler itself.
        sample_data = X_val[sample_idx]

        print(f"Testing disaster prediction for pod: {sample_pod}")

        # Make prediction
        predictions = disaster_predictor.predict_single_pod(sample_data)

        # Check for disaster conditions
        current_time = datetime.now()
        alerts = disaster_predictor.check_disaster_conditions(
            predictions, sample_pod, current_time
        )

        if alerts:
            print(f"⚠️  DISASTER CONDITIONS DETECTED for {sample_pod}!")
            for alert in alerts:
                print(f"  - {alert['metric']}: {alert['predicted_value']:.1f}% "
                      f"(threshold: {alert['threshold']:.0f}%) "
                      f"in {alert['time_until_breach_minutes']} minutes")

            # Send Slack alert if webhook provided
            if slack_webhook_url:
                disaster_predictor.send_slack_alert(alerts, slack_webhook_url)
        else:
            print(f"✅ No disaster conditions detected for {sample_pod}")

    # 5. Save Model and Components
    print("\n" + "=" * 40)
    print("SAVING MODEL COMPONENTS")
    print("=" * 40)

    # Save model
    torch.save(model.state_dict(), 'k8s_lstm_model.pth')

    # Save scalers
    with open('feature_scaler.pkl', 'wb') as f:
        pickle.dump(processor.scaler_features, f)

    with open('target_scaler.pkl', 'wb') as f:
        pickle.dump(processor.scaler_targets, f)

    # Save model configuration (ints cast explicitly so json can serialise them)
    model_config = {
        'input_size': int(input_size),
        'hidden_size': hidden_size,
        'num_layers': num_layers,
        'output_size': int(output_size),
        'prediction_horizon': int(prediction_horizon),
        'threshold': threshold
    }

    with open('model_config.json', 'w') as f:
        json.dump(model_config, f)

    print("Model and components saved successfully!")

    return model, processor, disaster_predictor

# =============================================================================
# 6. DEPLOYMENT UTILITIES
# =============================================================================

def load_model_for_deployment(model_path='k8s_lstm_model.pth',
                             config_path='model_config.json',
                             feature_scaler_path='feature_scaler.pkl',
                             target_scaler_path='target_scaler.pkl'):
    """Rebuild a ready-to-use DisasterPredictor from the saved training artifacts.

    Args:
        model_path: File holding the model ``state_dict`` written by ``torch.save``.
        config_path: JSON file with the keys ``input_size``, ``hidden_size``,
            ``num_layers``, ``output_size``, ``prediction_horizon`` and ``threshold``.
        feature_scaler_path: Pickle file containing the fitted feature scaler.
        target_scaler_path: Pickle file containing the fitted target scaler.

    Returns:
        A ``DisasterPredictor`` wrapping the restored model (in eval mode, on
        CPU) together with both scalers and the configured threshold.

    Raises:
        OSError: If any artifact file cannot be opened.
        KeyError: If the configuration lacks one of the required keys.
    """

    def _load_pickle(path):
        # SECURITY: unpickling executes arbitrary code embedded in the file.
        # Only point this at scaler files produced by your own training run.
        with open(path, 'rb') as f:
            return pickle.load(f)

    # Load configuration
    with open(config_path, 'r') as f:
        config = json.load(f)

    # Load scalers
    feature_scaler = _load_pickle(feature_scaler_path)
    target_scaler = _load_pickle(target_scaler_path)

    # Re-create the architecture with the hyper-parameters used for training
    model = MultiOutputLSTM(
        input_size=config['input_size'],
        hidden_size=config['hidden_size'],
        num_layers=config['num_layers'],
        output_size=config['output_size'],
        prediction_horizon=config['prediction_horizon']
    )

    # The checkpoint is a plain state_dict, so restrict unpickling to tensors
    # and containers instead of allowing arbitrary objects.
    state_dict = torch.load(model_path, map_location='cpu', weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()

    # Bundle everything the monitoring loop needs into one object
    return DisasterPredictor(
        model=model,
        scaler_features=feature_scaler,
        scaler_targets=target_scaler,
        threshold=config['threshold']
    )

def process_new_data_batch(disaster_predictor, csv_path, slack_webhook_url=None,
                           sequence_length=24, timestamp_format=None):
    """Score the most recent window of every pod in a CSV and collect alerts.

    Args:
        disaster_predictor: Object exposing ``predict_single_pod(window)``,
            ``check_disaster_conditions(predictions, pod_name, current_time)``
            and ``send_slack_alert(alerts, webhook_url)``.
        csv_path: CSV file with ``timestamp``, ``pod_name`` and the feature
            columns listed in ``feature_cols`` below.
        slack_webhook_url: When given and at least one alert was produced, the
            alerts are forwarded to this webhook.
        sequence_length: Number of most recent rows per pod fed to the model.
            Pods with fewer rows are skipped. Defaults to 24 (two hours at a
            5-minute cadence).
        timestamp_format: Optional ``strftime`` pattern for the ``timestamp``
            column. Pass the same pattern used for training data (for example
            ``'%d/%m/%Y %H:%M'``) so that day-first dates are not misread and
            sorted in the wrong order. ``None`` lets pandas infer the format.

    Returns:
        A flat list with the alerts of all pods (possibly empty).
    """

    print(f"Processing new data batch: {csv_path}")

    # Load new data
    df = pd.read_csv(csv_path)
    df['timestamp'] = pd.to_datetime(df['timestamp'], format=timestamp_format)

    # Derive pod percentages
    # NOTE(review): these two columns are not part of feature_cols below, so
    # they do not reach the model from here — confirm whether they should.
    df['pod_cpu_percentage'] = df['cpu_allocation_efficiency'] * df['node_cpu_usage']
    df['pod_memory_percentage'] = df['memory_allocation_efficiency'] * df['node_memory_usage']

    # Feature columns
    # NOTE(review): 'pod_lifesycle_seconds' looks misspelled — verify it matches
    # the CSV header and the feature list used when fitting the scaler.
    feature_cols = ['cpu_allocation_efficiency', 'memory_allocation_efficiency',
                   'disk_io', 'network_latency', 'node_temperature',
                   'node_cpu_usage', 'node_memory_usage', 'pod_lifesycle_seconds']

    all_alerts = []

    # One reference time for the whole batch keeps breach times comparable
    # across pods.
    current_time = datetime.now()

    # Process each pod
    for pod_name in df['pod_name'].unique():
        pod_df = df[df['pod_name'] == pod_name].sort_values('timestamp')

        # The model needs a full input window; skip pods with too little history
        if len(pod_df) < sequence_length:
            continue

        # Most recent window, oldest row first
        latest_data = pod_df[feature_cols].tail(sequence_length).values

        # Best effort: a failure for one pod must not stop the others
        try:
            predictions = disaster_predictor.predict_single_pod(latest_data)
            alerts = disaster_predictor.check_disaster_conditions(
                predictions, pod_name, current_time
            )
            all_alerts.extend(alerts)
        except Exception as e:
            print(f"Error processing pod {pod_name}: {e}")

    # Send alerts
    if all_alerts and slack_webhook_url:
        disaster_predictor.send_slack_alert(all_alerts, slack_webhook_url)
        print(f"Sent {len(all_alerts)} alerts to Slack")

    return all_alerts

# =============================================================================
# 7. EXAMPLE USAGE AND TESTING
# =============================================================================

if __name__ == "__main__":
    # Example usage

In [None]:
# =============================================================================
# 4. DISASTER PREDICTION AND ALERTING SYSTEM
# =============================================================================

class DisasterPredictor:
    """Forecast cluster CPU/memory usage with a trained model and raise alerts.

    Wraps a sequence model together with the feature and target scalers and
    offers the steps of the alerting flow: ``predict_cluster_usage`` produces a
    forecast, ``check_disaster_conditions`` turns it into alert dictionaries,
    ``send_slack_alert`` posts them and ``generate_prediction_plot`` draws them.

    NOTE(review): the deployment helpers in this notebook call
    ``predict_single_pod`` and pass a pod name to ``check_disaster_conditions``;
    this cluster-level class provides neither, so they presumably depend on a
    different class of the same name. Confirm which definition is live after a
    clean Restart & Run All.
    """

    # Minutes between two consecutive prediction steps / historical samples.
    STEP_MINUTES = 5

    def __init__(self, model, scaler_features, scaler_targets, threshold=0.7, device='cpu'):
        """Store the model (moved to ``device``, eval mode) and its scalers.

        Args:
            model: Trained ``torch.nn.Module`` used for forecasting.
            scaler_features: Fitted scaler exposing ``transform``.
            scaler_targets: Fitted scaler exposing ``inverse_transform``.
            threshold: Alert threshold as a fraction (0.7 means 70%).
            device: Torch device the model and inputs are placed on.
        """
        self.model = model.to(device)
        self.scaler_features = scaler_features
        self.scaler_targets = scaler_targets
        self.threshold = threshold
        self.device = device
        self.model.eval()
    
    def predict_cluster_usage(self, cluster_data):
        """
        Predict future CPU/Memory usage for the cluster.

        Args:
            cluster_data: numpy array of shape (sequence_length, n_features).

        Returns:
            The model output for this single sequence, mapped back through
            ``scaler_targets.inverse_transform``, with the batch axis removed.
            The remaining axes are kept even when they have length 1.
        """
        # Normalize input
        cluster_data_normalized = self.scaler_features.transform(cluster_data)
        
        # Convert to tensor and add the batch axis the model expects
        X = torch.FloatTensor(cluster_data_normalized).unsqueeze(0).to(self.device)
        
        with torch.no_grad():
            # Drop only the batch axis added above. A bare squeeze() would also
            # remove a horizon or output axis of length 1 and hand callers an
            # array of the wrong rank.
            predictions = self.model(X).cpu().numpy()[0]
        
        # Denormalize predictions (the scaler works on 2-D input)
        predictions_reshaped = predictions.reshape(-1, predictions.shape[-1])
        predictions_denormalized = self.scaler_targets.inverse_transform(predictions_reshaped)
        predictions_final = predictions_denormalized.reshape(predictions.shape)
        
        return predictions_final
    
    def _threshold_alert(self, metric, series, current_time, critical_above, high_above):
        """Build the alert dict for one metric, or return None if it never breaches."""
        limit = self.threshold * 100
        breach_indices = np.where(series > limit)[0]
        if len(breach_indices) == 0:
            return None
        
        # Plain Python numbers: timedelta() rejects numpy integers, and plain
        # values keep the alert JSON-serializable.
        first_breach = int(breach_indices[0])
        time_to_breach = first_breach * self.STEP_MINUTES
        peak = float(series[breach_indices].max())
        
        if peak > critical_above:
            severity = 'CRITICAL'
        elif peak > high_above:
            severity = 'HIGH'
        else:
            severity = 'MEDIUM'
        
        return {
            'metric': metric,
            'predicted_value': float(series[first_breach]),
            'max_predicted_value': peak,
            'threshold': limit,
            'time_until_breach_minutes': time_to_breach,
            'breach_time': current_time + timedelta(minutes=time_to_breach),
            'severity': severity,
            'affected_timespan_minutes': len(breach_indices) * self.STEP_MINUTES
        }
    
    def check_disaster_conditions(self, predictions, current_time):
        """
        Check if predictions exceed the disaster threshold for the cluster.

        Args:
            predictions: Array-like of shape (steps, >=2); column 0 is CPU and
                column 1 is memory. The caller's array is never modified.
            current_time: ``datetime`` that step 0 of the forecast refers to.

        Returns:
            A list with at most one alert dict for CPU and one for memory.
            Empty when nothing breaches or when ``predictions`` is empty.
        """
        # np.array copies, so the scaling below cannot leak into the caller's data
        values = np.array(predictions, dtype=float)
        if values.size == 0:
            return []
        if values.ndim == 1:
            # A single prediction step arrives as a flat (n_outputs,) vector
            values = values.reshape(1, -1)
        
        cpu_predictions = values[:, 0]  # CPU percentage predictions
        memory_predictions = values[:, 1]  # Memory percentage predictions
        
        # Convert to percentage (0-1 to 0-100 if needed).
        # NOTE(review): the scale of both metrics is decided from the CPU
        # column alone, as before; forecasts that overshoot 1.0 are treated as
        # already being percentages.
        if cpu_predictions.max() <= 1.0:
            cpu_predictions = cpu_predictions * 100
            memory_predictions = memory_predictions * 100
        
        # Severity cut-offs differ per metric: (metric, series, critical, high)
        checks = [
            ('Cluster CPU', cpu_predictions, 90, 80),
            ('Cluster Memory', memory_predictions, 95, 85),
        ]
        
        alerts = []
        for metric, series, critical_above, high_above in checks:
            alert = self._threshold_alert(metric, series, current_time, critical_above, high_above)
            if alert is not None:
                alerts.append(alert)
        
        return alerts
    
    @staticmethod
    def _build_slack_message(alert):
        """Render one alert dict as a Slack attachment payload."""
        # Color coding based on severity
        color_map = {'CRITICAL': '#ff0000', 'HIGH': '#ff6600', 'MEDIUM': '#ff9900'}
        color = color_map.get(alert['severity'], '#ff9900')
        
        return {
            "attachments": [
                {
                    "color": color,
                    "title": f"🚨 K8s Cluster Disaster Alert - {alert['severity']} Priority",
                    "text": f"Cluster resource exhaustion predicted for *{alert['metric']}*",
                    "fields": [
                        {
                            "title": "Metric",
                            "value": alert['metric'],
                            "short": True
                        },
                        {
                            "title": "Severity",
                            "value": alert['severity'],
                            "short": True
                        },
                        {
                            "title": "Predicted Value at Breach",
                            "value": f"{alert['predicted_value']:.1f}%",
                            "short": True
                        },
                        {
                            "title": "Maximum Predicted Value",
                            "value": f"{alert['max_predicted_value']:.1f}%",
                            "short": True
                        },
                        {
                            "title": "Threshold",
                            "value": f"{alert['threshold']:.0f}%",
                            "short": True
                        },
                        {
                            "title": "Time Until Breach",
                            "value": f"{alert['time_until_breach_minutes']} minutes",
                            "short": True
                        },
                        {
                            "title": "Expected Breach Time",
                            "value": alert['breach_time'].strftime('%Y-%m-%d %H:%M:%S'),
                            "short": False
                        },
                        {
                            "title": "Duration Above Threshold",
                            "value": f"{alert['affected_timespan_minutes']} minutes",
                            "short": True
                        }
                    ],
                    "footer": "K8s LSTM Disaster Recovery System",
                    "ts": int(datetime.now().timestamp())
                }
            ]
        }
    
    def send_slack_alert(self, alerts, webhook_url, timeout=10):
        """Send disaster alerts to Slack, one message per alert.

        Args:
            alerts: Alert dicts as produced by ``check_disaster_conditions``.
            webhook_url: Slack incoming-webhook URL.
            timeout: Seconds to wait for Slack before giving up on a message.
                Without a timeout a stalled connection would block forever.

        Delivery is best effort: a failed request is reported on stdout and
        the remaining alerts are still attempted.
        """
        if not alerts:
            return
        
        for alert in alerts:
            message = self._build_slack_message(alert)
            
            try:
                response = requests.post(webhook_url, json=message, timeout=timeout)
                response.raise_for_status()
                print(f"Alert sent for {alert['metric']} - {alert['severity']}")
            except requests.exceptions.RequestException as e:
                print(f"Failed to send Slack alert: {e}")
    
    def generate_prediction_plot(self, historical_data, predictions, current_time):
        """Plot historical and predicted CPU/memory usage with the alert threshold.

        Args:
            historical_data: 2-D array; column 0 is CPU, column 1 is memory.
            predictions: 2-D array with the same column layout.
            current_time: ``datetime`` of the first prediction step; the
                history is laid out on the steps immediately before it.

        NOTE(review): both inputs are multiplied by 100 unconditionally, i.e.
        they are assumed to be fractions in 0-1 — confirm this matches what
        ``predict_cluster_usage`` returns after inverse scaling.
        """
        # 'min' is the supported minute alias; the old 'T' alias is deprecated
        step = f'{self.STEP_MINUTES}min'
        
        # Create time arrays
        historical_times = pd.date_range(
            start=current_time - timedelta(minutes=len(historical_data) * self.STEP_MINUTES),
            periods=len(historical_data),
            freq=step
        )
        prediction_times = pd.date_range(
            start=current_time,
            periods=len(predictions),
            freq=step
        )
        
        # (column index, metric label, line style of the forecast)
        panels = [
            (0, 'CPU', 'r--'),
            (1, 'Memory', 'g--'),
        ]
        
        fig, axes = plt.subplots(2, 1, figsize=(15, 8))
        for ax, (column, label, forecast_style) in zip(axes, panels):
            ax.plot(historical_times, historical_data[:, column] * 100, 'b-',
                    label=f'Historical {label}', linewidth=2)
            ax.plot(prediction_times, predictions[:, column] * 100, forecast_style,
                    label=f'Predicted {label}', linewidth=2)
            ax.axhline(y=self.threshold * 100, color='orange', linestyle=':',
                       label=f'Threshold ({self.threshold*100}%)')
            ax.set_ylabel(f'{label} Usage (%)')
            ax.set_title(f'Cluster {label} Usage Prediction')
            ax.legend()
            ax.grid(True, alpha=0.3)
        
        # Only the bottom panel carries the shared time label
        axes[-1].set_xlabel('Time')
        
        fig.tight_layout()
        plt.show()
