# In [None]:
"""
BatteryMind - Resource Usage Analysis Notebook

Comprehensive resource usage analysis for BatteryMind AI models including
CPU, GPU, memory, and disk usage patterns during training and inference.

This notebook provides:
- CPU utilization monitoring
- GPU utilization and memory tracking
- System memory usage analysis
- Disk I/O performance monitoring
- Network usage for distributed training
- Power consumption estimation
- Resource optimization recommendations

Author: BatteryMind Development Team
Version: 1.0.0
"""

import numpy as np
import pandas as pd
import time
import psutil
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple, Any
import threading
import queue
from concurrent.futures import ThreadPoolExecutor
import warnings
warnings.filterwarnings('ignore')

# GPU monitoring
# GPU_AVAILABLE is True only when the NVML bindings import AND NVML itself
# initialises. A machine can have pynvml installed without an NVIDIA driver,
# in which case nvmlInit() raises pynvml.NVMLError rather than ImportError.
try:
    import GPUtil
    import pynvml
except ImportError:
    GPU_AVAILABLE = False
    print("GPU monitoring not available")
else:
    try:
        pynvml.nvmlInit()
        GPU_AVAILABLE = True
    except pynvml.NVMLError as nvml_error:
        GPU_AVAILABLE = False
        print(f"GPU monitoring not available: {nvml_error}")

# System monitoring
import platform
import socket
from datetime import datetime, timedelta

# Model imports
import sys
sys.path.append('../../')
from transformers.battery_health_predictor.trainer import BatteryHealthTrainer
from transformers.degradation_forecaster.trainer import DegradationTrainer
from reinforcement_learning.training.rl_trainer import RLTrainer
from federated_learning.server.federated_server import FederatedServer

# Utility imports
from utils.data_utils import generate_training_data
from utils.model_utils import get_model_size
from utils.visualization import plot_resource_usage

# Banner plus a one-off snapshot of the host this analysis runs on.
print("BatteryMind Resource Usage Analysis Notebook")
print("="*50)

# System Information
print(f"System: {platform.system()} {platform.release()}")
print(f"Processor: {platform.processor()}")
print(f"Python Version: {platform.python_version()}")
print(f"CPU Cores: {psutil.cpu_count()}")
print(f"Total Memory: {psutil.virtual_memory().total / (1024**3):.2f} GB")

if GPU_AVAILABLE:
    gpu_count = pynvml.nvmlDeviceGetCount()
    print(f"GPU Count: {gpu_count}")
    for i in range(gpu_count):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        name = pynvml.nvmlDeviceGetName(handle)
        # Depending on the installed pynvml release the name is returned as
        # bytes or as str; only decode when it is actually bytes.
        if isinstance(name, bytes):
            name = name.decode('utf-8')
        print(f"GPU {i}: {name}")

print("="*50)

class ResourceMonitor:
    """
    Comprehensive resource monitoring system for BatteryMind AI training and inference.

    A daemon thread started by ``start_monitoring`` appends one dict per
    sample to ``resource_data`` every ``monitoring_interval`` seconds until
    ``stop_monitoring`` is called. ``get_resource_summary`` then aggregates
    the collected samples.
    """

    # Per-GPU column suffixes that get avg/max entries in the summary.
    _GPU_SUMMARY_SUFFIXES = ('utilization', 'memory_percent', 'temperature', 'power_usage')

    def __init__(self, monitoring_interval=1.0):
        """
        Args:
            monitoring_interval: Seconds to sleep between two samples.
        """
        self.monitoring_interval = monitoring_interval
        self.monitoring_active = False
        self.resource_data = []
        self.monitoring_thread = None

    def start_monitoring(self):
        """Start resource monitoring in a separate thread.

        Any samples from a previous run are discarded.
        """
        self.monitoring_active = True
        self.resource_data = []
        self.monitoring_thread = threading.Thread(target=self._monitor_resources)
        self.monitoring_thread.daemon = True
        self.monitoring_thread.start()
        print("Resource monitoring started")

    def stop_monitoring(self):
        """Stop resource monitoring and wait for the sampling thread to exit."""
        self.monitoring_active = False
        if self.monitoring_thread:
            self.monitoring_thread.join()
        print("Resource monitoring stopped")

    def _monitor_resources(self):
        """Monitor system resources continuously until monitoring is stopped."""
        # Reuse ONE Process object for the whole run: cpu_percent(interval=None)
        # reports usage since the previous call on the same object, so a fresh
        # object per sample would always yield the meaningless first-call 0.0.
        process = psutil.Process()

        # Prime both CPU counters so the samples below are measured against
        # the start of this run. The very first sample still covers only a
        # very short window.
        psutil.cpu_percent(interval=None)
        process.cpu_percent(interval=None)

        while self.monitoring_active:
            resource_entry = self._collect_sample(process)

            # GPU monitoring if available
            if GPU_AVAILABLE:
                try:
                    resource_entry.update(self._get_gpu_info())
                except Exception as e:
                    # Best effort: a GPU failure must not stop CPU/memory sampling.
                    print(f"GPU monitoring error: {e}")

            self.resource_data.append(resource_entry)
            time.sleep(self.monitoring_interval)

    def _collect_sample(self, process):
        """Take one snapshot of system-wide and per-process counters.

        Args:
            process: The ``psutil.Process`` to sample; it must be the same
                object across calls so its CPU percentage is meaningful.

        Returns:
            dict with one entry per metric. Disk and network values are the
            cumulative counters reported by psutil, not per-interval deltas.
        """
        timestamp = time.time()

        # CPU monitoring
        cpu_percent = psutil.cpu_percent(interval=None)
        cpu_freq = psutil.cpu_freq()

        # Memory monitoring
        memory = psutil.virtual_memory()
        swap = psutil.swap_memory()

        # Disk and network counters; the original code already treats these
        # (and cpu_freq) as possibly falsy, so keep the 0 fallbacks.
        disk_io = psutil.disk_io_counters()
        network_io = psutil.net_io_counters()

        # Process-specific monitoring
        process_info = process.as_dict(attrs=['pid', 'memory_info', 'cpu_percent', 'num_threads'])

        return {
            'timestamp': timestamp,
            'cpu_percent': cpu_percent,
            'cpu_freq_current': cpu_freq.current if cpu_freq else 0,
            'memory_total': memory.total,
            'memory_used': memory.used,
            'memory_percent': memory.percent,
            'swap_used': swap.used,
            'swap_percent': swap.percent,
            'disk_read_bytes': disk_io.read_bytes if disk_io else 0,
            'disk_write_bytes': disk_io.write_bytes if disk_io else 0,
            'network_sent_bytes': network_io.bytes_sent if network_io else 0,
            'network_recv_bytes': network_io.bytes_recv if network_io else 0,
            'process_memory_rss': process_info['memory_info'].rss,
            'process_memory_vms': process_info['memory_info'].vms,
            'process_cpu_percent': process_info['cpu_percent'],
            'process_num_threads': process_info['num_threads']
        }

    def _get_gpu_info(self):
        """Get GPU information and utilization.

        Returns:
            dict keyed ``gpu_<index>_<metric>``. A device whose queries fail
            is skipped (and reported) without affecting the other devices.
        """
        gpu_info = {}

        try:
            gpu_count = pynvml.nvmlDeviceGetCount()
        except Exception as e:
            print(f"Error getting GPU info: {e}")
            return gpu_info

        for i in range(gpu_count):
            try:
                handle = pynvml.nvmlDeviceGetHandleByIndex(i)
                utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
                memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
                power = pynvml.nvmlDeviceGetPowerUsage(handle)
            except Exception as e:
                print(f"Error getting GPU info: {e}")
                continue

            # Guard against a device that reports zero total memory.
            if memory_info.total:
                memory_percent = (memory_info.used / memory_info.total) * 100
            else:
                memory_percent = 0.0

            gpu_info[f'gpu_{i}_utilization'] = utilization.gpu
            gpu_info[f'gpu_{i}_memory_used'] = memory_info.used
            gpu_info[f'gpu_{i}_memory_total'] = memory_info.total
            gpu_info[f'gpu_{i}_memory_percent'] = memory_percent
            gpu_info[f'gpu_{i}_temperature'] = temp
            gpu_info[f'gpu_{i}_power_usage'] = power / 1000  # Convert to watts

        return gpu_info

    def get_resource_summary(self):
        """Get summary statistics of resource usage.

        Returns:
            dict of aggregated metrics, or ``{}`` when no samples were
            collected. Disk/network totals are the spread (max - min) of the
            cumulative counters over the run. ``avg_``/``max_`` entries are
            added for every recorded ``gpu_*`` column ending in one of
            ``_GPU_SUMMARY_SUFFIXES``.
        """
        if not self.resource_data:
            return {}

        df = pd.DataFrame(self.resource_data)
        bytes_per_gb = 1024**3

        def counter_delta_gb(column):
            # Spread of a cumulative byte counter over the run, in GB.
            return (df[column].max() - df[column].min()) / bytes_per_gb

        summary = {
            'duration_seconds': df['timestamp'].max() - df['timestamp'].min(),
            'avg_cpu_percent': df['cpu_percent'].mean(),
            'max_cpu_percent': df['cpu_percent'].max(),
            'avg_memory_percent': df['memory_percent'].mean(),
            'max_memory_percent': df['memory_percent'].max(),
            'avg_process_memory_gb': df['process_memory_rss'].mean() / bytes_per_gb,
            'max_process_memory_gb': df['process_memory_rss'].max() / bytes_per_gb,
            'total_disk_read_gb': counter_delta_gb('disk_read_bytes'),
            'total_disk_write_gb': counter_delta_gb('disk_write_bytes'),
            'total_network_sent_gb': counter_delta_gb('network_sent_bytes'),
            'total_network_recv_gb': counter_delta_gb('network_recv_bytes')
        }

        # GPU statistics: driven by the columns actually recorded, which only
        # exist when the sampler collected GPU data.
        for col in df.columns:
            if col.startswith('gpu_') and col.endswith(self._GPU_SUMMARY_SUFFIXES):
                summary[f'avg_{col}'] = df[col].mean()
                summary[f'max_{col}'] = df[col].max()

        return summary

class ModelResourceAnalyzer:
    """
    Analyze resource usage for different BatteryMind models.

    Each ``analyze_*`` method runs one workload while a ``ResourceMonitor``
    samples the machine, stores the resulting summary in
    ``analysis_results`` and returns it. The remaining methods compare,
    plot and report on whatever has been stored.
    """

    # Number of predict() calls timed per batch size during inference analysis.
    _INFERENCE_REPEATS = 100

    def __init__(self):
        self.monitor = ResourceMonitor()
        self.analysis_results = {}

    def _run_monitored(self, workload, description):
        """Run ``workload()`` while the resource monitor is sampling.

        Monitoring is always stopped, whether the workload succeeds or not.

        Args:
            workload: Zero-argument callable to execute.
            description: Text inserted into the error message on failure.

        Returns:
            ``(resource_summary, elapsed_seconds)`` on success, ``None`` when
            the workload raised (the error is printed, not propagated).
        """
        self.monitor.start_monitoring()
        try:
            start_time = time.time()
            workload()
            elapsed = time.time() - start_time
        except Exception as e:
            # Deliberately broad: a failing workload is reported and skipped
            # so the remaining analyses can still run.
            print(f"Error during {description} analysis: {e}")
            return None
        finally:
            self.monitor.stop_monitoring()

        # Summarise only after the sampling thread has been joined.
        return self.monitor.get_resource_summary(), elapsed

    def _record_training_run(self, result_key, description, workload, metadata):
        """Run a training workload, annotate its summary and store it.

        Args:
            result_key: Key under which the summary is stored in
                ``analysis_results``.
            description: Text used in the error message on failure.
            workload: Zero-argument callable performing the training.
            metadata: Extra entries merged into the summary.

        Returns:
            The annotated summary, or ``{}`` if the workload failed.
        """
        outcome = self._run_monitored(workload, description)
        if outcome is None:
            return {}

        resource_summary, elapsed = outcome
        resource_summary['training_time'] = elapsed
        resource_summary.update(metadata)
        self.analysis_results[result_key] = resource_summary
        return resource_summary

    def analyze_transformer_training(self, epochs=5, batch_size=32):
        """Analyze resource usage during transformer model training.

        Returns:
            The resource summary, or ``{}`` if training raised.
        """
        print("Analyzing transformer training resource usage...")

        trainer = BatteryHealthTrainer()
        train_data = generate_training_data(samples=1000, sequence_length=1000)

        return self._record_training_run(
            'transformer_training',
            'transformer training',
            lambda: trainer.train(train_data, epochs=epochs, batch_size=batch_size),
            {'model_type': 'transformer', 'epochs': epochs, 'batch_size': batch_size},
        )

    def analyze_rl_training(self, episodes=1000):
        """Analyze resource usage during RL agent training.

        Returns:
            The resource summary, or ``{}`` if training raised.
        """
        print("Analyzing RL training resource usage...")

        trainer = RLTrainer()

        return self._record_training_run(
            'rl_training',
            'RL training',
            lambda: trainer.train(total_episodes=episodes),
            {'model_type': 'rl_agent', 'episodes': episodes},
        )

    def analyze_federated_training(self, rounds=10, clients=5):
        """Analyze resource usage during federated learning.

        Returns:
            The resource summary, or ``{}`` if training raised.
        """
        print("Analyzing federated learning resource usage...")

        server = FederatedServer()

        return self._record_training_run(
            'federated_training',
            'federated training',
            lambda: server.run_federated_training(rounds=rounds, num_clients=clients),
            {'model_type': 'federated', 'rounds': rounds, 'clients': clients},
        )

    def analyze_inference_resource_usage(self, model_type, batch_sizes=(1, 8, 32, 128)):
        """Analyze resource usage during inference.

        Args:
            model_type: ``'transformer'`` or ``'rl_agent'``.
            batch_sizes: Batch sizes to test, one monitored run each.

        Returns:
            dict mapping ``'batch_<size>'`` to that run's summary; batch
            sizes whose run raised are omitted. ``{}`` for an unsupported
            ``model_type``.
        """
        print(f"Analyzing {model_type} inference resource usage...")

        # Load model based on type
        if model_type == 'transformer':
            from transformers.battery_health_predictor.predictor import BatteryHealthPredictor
            model = BatteryHealthPredictor.load_model(
                '../../model-artifacts/trained_models/transformer_v1.0/model.pkl'
            )
            input_shape = (1000, 10)
        elif model_type == 'rl_agent':
            from reinforcement_learning.agents.charging_agent import ChargingAgent
            model = ChargingAgent.load_model(
                '../../model-artifacts/trained_models/rl_agent_v1.0/policy_network.pt'
            )
            input_shape = (20,)
        else:
            print(f"Unsupported model type: {model_type}")
            return {}

        inference_results = {}

        for batch_size in batch_sizes:
            print(f"Testing batch size: {batch_size}")

            # Random input shaped (batch_size, *input_shape)
            test_data = np.random.randn(batch_size, *input_shape)

            def run_inference(batch=test_data):
                for _ in range(self._INFERENCE_REPEATS):
                    model.predict(batch)

            outcome = self._run_monitored(run_inference, 'inference')
            if outcome is None:
                continue

            resource_summary, elapsed = outcome
            resource_summary['inference_time'] = elapsed
            resource_summary['model_type'] = model_type
            resource_summary['batch_size'] = batch_size
            resource_summary['num_inferences'] = self._INFERENCE_REPEATS

            inference_results[f'batch_{batch_size}'] = resource_summary

        self.analysis_results[f'{model_type}_inference'] = inference_results
        return inference_results

    def _flat_results(self):
        """Yield the stored summaries that carry a top-level ``'model_type'``.

        Inference results are stored one level deeper (keyed by batch size)
        and are therefore not yielded.
        """
        for analysis_name, results in self.analysis_results.items():
            if isinstance(results, dict) and 'model_type' in results:
                yield analysis_name, results

    def compare_model_resource_usage(self):
        """Compare resource usage across different models.

        Returns:
            DataFrame with one row per stored summary that has a top-level
            ``'model_type'``; missing metrics default to 0. Nested inference
            results are not included.
        """
        print("Comparing resource usage across models...")

        metric_columns = [
            'avg_cpu_percent',
            'max_cpu_percent',
            'avg_memory_percent',
            'max_memory_percent',
            'avg_process_memory_gb',
            'max_process_memory_gb',
            'duration_seconds',
            'training_time',
        ]

        comparison_data = []
        for analysis_name, results in self._flat_results():
            row = {
                'analysis_type': analysis_name,
                'model_type': results['model_type'],
            }
            for metric in metric_columns:
                row[metric] = results.get(metric, 0)
            comparison_data.append(row)

        # Explicit columns keep the frame's schema stable when there are no rows.
        return pd.DataFrame(
            comparison_data,
            columns=['analysis_type', 'model_type'] + metric_columns,
        )

    def visualize_resource_usage(self):
        """Create visualizations of resource usage analysis.

        Draws a 2x2 grid of bar charts, saves it to
        ``resource_usage_analysis.png`` and shows it.
        """
        print("Creating resource usage visualizations...")

        # 'seaborn-v0_8' only exists on newer Matplotlib; older releases ship
        # the same style as 'seaborn'. Fall back to the default style otherwise.
        for style_name in ('seaborn-v0_8', 'seaborn'):
            if style_name in plt.style.available:
                plt.style.use(style_name)
                break

        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        panels = (
            (axes[0, 0], self._plot_cpu_usage_comparison,
             'CPU Usage Comparison', 'CPU Usage (%)'),
            (axes[0, 1], self._plot_memory_usage_comparison,
             'Memory Usage Comparison', 'Memory Usage (GB)'),
            (axes[1, 0], self._plot_training_time_comparison,
             'Training Time Comparison', 'Training Time (seconds)'),
            (axes[1, 1], self._plot_resource_efficiency,
             'Resource Efficiency', 'Efficiency Score'),
        )
        for ax, draw_panel, title, y_label in panels:
            draw_panel(ax)
            ax.set_title(title)
            ax.set_xlabel('Model Type')
            ax.set_ylabel(y_label)

        plt.tight_layout()
        plt.savefig('resource_usage_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()

    def _metric_by_model(self, metric, default=0, required=False):
        """Collect ``(model_types, values)`` for one metric.

        Args:
            metric: Key to read from each flat summary.
            default: Value used when the metric is missing.
            required: When true, summaries lacking the metric are skipped.
        """
        model_types = []
        values = []
        for _, results in self._flat_results():
            if required and metric not in results:
                continue
            model_types.append(results['model_type'])
            values.append(results.get(metric, default))
        return model_types, values

    @staticmethod
    def _draw_bars(ax, model_types, values):
        """Draw one bar per model type; leave the axes empty without data."""
        if not model_types:
            return
        ax.bar(model_types, values)
        # Restyle the existing tick labels rather than calling
        # set_xticklabels, which expects tick positions to be fixed first.
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    def _plot_cpu_usage_comparison(self, ax):
        """Plot CPU usage comparison."""
        self._draw_bars(ax, *self._metric_by_model('avg_cpu_percent'))

    def _plot_memory_usage_comparison(self, ax):
        """Plot memory usage comparison."""
        self._draw_bars(ax, *self._metric_by_model('avg_process_memory_gb'))

    def _plot_training_time_comparison(self, ax):
        """Plot training time comparison for summaries that have one."""
        self._draw_bars(ax, *self._metric_by_model('training_time', required=True))

    def _plot_resource_efficiency(self, ax):
        """Plot resource efficiency score."""
        model_types = []
        efficiency_scores = []

        for _, results in self._flat_results():
            # Each term is a cost normalised by a reference value
            # (100% CPU, 10 GB, 1 hour); missing metrics count as the reference.
            cpu_score = results.get('avg_cpu_percent', 100) / 100
            memory_score = results.get('avg_process_memory_gb', 10) / 10
            time_score = results.get('training_time', 3600) / 3600

            # Reciprocal of the summed costs, so a HIGHER score is better.
            efficiency_score = 1 / (cpu_score + memory_score + time_score)

            model_types.append(results['model_type'])
            efficiency_scores.append(efficiency_score)

        self._draw_bars(ax, model_types, efficiency_scores)

    def generate_resource_report(self):
        """Generate comprehensive resource usage report.

        Returns:
            dict with system information, a timestamp, the raw analysis
            results, the comparison rows and a list of recommendations.
        """
        print("Generating resource usage report...")

        report = {
            'system_info': {
                'platform': platform.system(),
                'processor': platform.processor(),
                'cpu_cores': psutil.cpu_count(),
                'total_memory_gb': psutil.virtual_memory().total / (1024**3),
                'python_version': platform.python_version(),
                'gpu_available': GPU_AVAILABLE
            },
            'analysis_timestamp': datetime.now().isoformat(),
            'analysis_results': self.analysis_results,
            'comparison_data': self.compare_model_resource_usage().to_dict('records')
        }

        report['recommendations'] = self._generate_recommendations()

        return report

    def _generate_recommendations(self):
        """Generate resource optimization recommendations.

        Thresholds are applied to the mean of each metric across all stored
        summaries that contain it at the top level.
        """
        recommendations = []
        stored = [r for r in self.analysis_results.values() if isinstance(r, dict)]

        # Analyze CPU usage
        cpu_usage = [r['avg_cpu_percent'] for r in stored if 'avg_cpu_percent' in r]
        if cpu_usage:
            avg_cpu = np.mean(cpu_usage)
            if avg_cpu > 80:
                recommendations.append("High CPU usage detected. Consider reducing batch size or using CPU optimization techniques.")
            elif avg_cpu < 20:
                recommendations.append("Low CPU usage detected. Consider increasing batch size for better resource utilization.")

        # Analyze memory usage as a fraction of total system memory
        memory_usage = [r['avg_process_memory_gb'] for r in stored if 'avg_process_memory_gb' in r]
        if memory_usage:
            avg_memory = np.mean(memory_usage)
            total_memory = psutil.virtual_memory().total / (1024**3)
            memory_ratio = avg_memory / total_memory

            if memory_ratio > 0.8:
                recommendations.append("High memory usage detected. Consider reducing model size or batch size.")
            elif memory_ratio < 0.2:
                recommendations.append("Low memory usage detected. Consider increasing model capacity or batch size.")

        # GPU recommendations
        if GPU_AVAILABLE:
            recommendations.append("GPU available. Consider using GPU acceleration for faster training.")

        return recommendations

# Drive the complete analysis pipeline: training runs, inference runs,
# comparison/plots, JSON report, then a console summary.
print("Initializing Resource Usage Analyzer...")
analyzer = ModelResourceAnalyzer()

# Step 1: transformer training
print("\n1. Analyzing Transformer Training...")
transformer_results = analyzer.analyze_transformer_training(
    epochs=3,
    batch_size=16,
)

# Step 2: RL agent training
print("\n2. Analyzing RL Training...")
rl_results = analyzer.analyze_rl_training(episodes=500)

# Step 3: federated learning
print("\n3. Analyzing Federated Learning...")
federated_results = analyzer.analyze_federated_training(rounds=5, clients=3)

# Step 4: inference for both model families, same batch sizes
print("\n4. Analyzing Inference Resource Usage...")
transformer_inference = analyzer.analyze_inference_resource_usage(
    'transformer', [1, 8, 32]
)
rl_inference = analyzer.analyze_inference_resource_usage(
    'rl_agent', [1, 8, 32]
)

# Step 5: tabular comparison and the 2x2 figure
print("\n5. Generating Comparisons and Visualizations...")
comparison_df = analyzer.compare_model_resource_usage()
analyzer.visualize_resource_usage()

# Step 6: full report
print("\n6. Generating Resource Usage Report...")
resource_report = analyzer.generate_resource_report()

# Persist the report; default=str stringifies anything json cannot encode
import json
with open('resource_usage_analysis_results.json', 'w') as f:
    json.dump(resource_report, f, indent=2, default=str)

print("\nResource usage analysis completed!")
print("Results saved to 'resource_usage_analysis_results.json'")
print("Visualizations saved to 'resource_usage_analysis.png'")

# Console summary: one section per metric, listing only the analyses whose
# top-level summary contains that metric
print("\nSUMMARY RESULTS:")
print("=" * 50)

print("\nCPU Usage Summary:")
for analysis_name, results in analyzer.analysis_results.items():
    if not isinstance(results, dict) or 'avg_cpu_percent' not in results:
        continue
    print(f"{analysis_name}: {results['avg_cpu_percent']:.1f}%")

print("\nMemory Usage Summary:")
for analysis_name, results in analyzer.analysis_results.items():
    if not isinstance(results, dict) or 'avg_process_memory_gb' not in results:
        continue
    print(f"{analysis_name}: {results['avg_process_memory_gb']:.2f} GB")

print("\nTraining Time Summary:")
for analysis_name, results in analyzer.analysis_results.items():
    if not isinstance(results, dict) or 'training_time' not in results:
        continue
    print(f"{analysis_name}: {results['training_time']:.1f} seconds")

print("\nRecommendations:")
for i, rec in enumerate(resource_report['recommendations'], start=1):
    print(f"{i}. {rec}")

print("\nResource Usage Analysis Completed Successfully!")
