# In [None]:
"""
BatteryMind - Degradation Pattern Analysis Notebook

Advanced analysis of battery degradation patterns, capacity fade mechanisms,
and lifecycle modeling. This notebook provides deep insights into battery
aging processes and degradation prediction capabilities.

Features:
- Capacity fade analysis and modeling
- Degradation mechanism identification
- Lifecycle prediction and forecasting
- Environmental impact on degradation
- Comparative analysis across battery types
- Physics-informed degradation modeling

Author: BatteryMind Development Team
Version: 1.0.0
"""

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Scientific computing and modeling
from scipy import stats, optimize
from scipy.signal import find_peaks, savgol_filter
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Time series analysis
import datetime as dt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Advanced analytics
from scipy.integrate import odeint
from scipy.stats import weibull_min, norm, lognorm

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# colour palette (the palette must come second so the style does not reset it).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Pandas display: lift the column-count and width limits when printing frames.
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

# Notebook banner.
print("BatteryMind - Degradation Pattern Analysis Notebook", "=" * 60, sep="\n")

# Read the three CSV inputs used throughout the notebook.
print("Loading degradation analysis datasets...")

# Paths are relative to the notebook's working directory; the order here
# matches the unpacking order below.
dataset_paths = (
    '../../training-data/synthetic_datasets/degradation_curves.csv',
    '../../training-data/synthetic_datasets/battery_telemetry.csv',
    '../../training-data/synthetic_datasets/environmental_data.csv',
)
degradation_curves, battery_telemetry, environmental_data = (
    pd.read_csv(csv_path) for csv_path in dataset_paths
)

# Report (rows, columns) of each loaded frame.
print(f"Degradation curves shape: {degradation_curves.shape}")
print(f"Battery telemetry shape: {battery_telemetry.shape}")
print(f"Environmental data shape: {environmental_data.shape}")

# Data Overview
print("\n" + "="*60)
print("DEGRADATION DATA OVERVIEW")
print("="*60)

print("\nDegradation Curves Dataset Info:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
degradation_curves.info()

print("\nDegradation Statistics:")
print(degradation_curves.describe())

# Cardinality of every object-typed column.
print("\nUnique Values:")
for col in degradation_curves.columns:
    if degradation_curves[col].dtype == 'object':
        print(f"{col}: {degradation_curves[col].nunique()} unique values")

# Capacity Fade Analysis
# Draws a 2x2 figure (individual curves, mean trend, per-step degradation
# rates, end-of-life histogram) and prints capacity-retention statistics.
print("\n" + "="*60)
print("CAPACITY FADE ANALYSIS")
print("="*60)

# Ensure we have the required columns
required_cols = ['cycle_count', 'capacity_retention', 'battery_id']
available_cols = [col for col in required_cols if col in degradation_curves.columns]
print(f"Available columns for analysis: {available_cols}")

# End-of-life is defined as 80% capacity retention. Both names are set up
# front so the summary at the end can test eol_cycles directly instead of
# probing locals() (which can also see stale values from an earlier run).
eol_threshold = 0.8
eol_cycles = []

if 'capacity_retention' in degradation_curves.columns and 'cycle_count' in degradation_curves.columns:
    has_battery_id = 'battery_id' in degradation_curves.columns

    # Overall capacity fade trends
    plt.figure(figsize=(15, 10))

    # Panel 1: individual battery degradation curves (first 10 batteries)
    plt.subplot(2, 2, 1)
    if has_battery_id:
        sample_batteries = degradation_curves['battery_id'].unique()[:10]
        for battery_id in sample_batteries:
            battery_data = degradation_curves[degradation_curves['battery_id'] == battery_id]
            # Sort by cycle so the line connects consecutive measurements
            # rather than zig-zagging through rows in file order.
            battery_data = battery_data.sort_values('cycle_count')
            plt.plot(battery_data['cycle_count'], battery_data['capacity_retention'],
                     alpha=0.7, linewidth=1)
    else:
        plt.scatter(degradation_curves['cycle_count'], degradation_curves['capacity_retention'],
                    alpha=0.5, s=1)

    plt.xlabel('Cycle Count')
    plt.ylabel('Capacity Retention')
    plt.title('Individual Battery Degradation Curves')
    plt.grid(True, alpha=0.3)

    # Panel 2: average degradation trend (mean +/- 1 std per cycle count)
    plt.subplot(2, 2, 2)
    cycle_groups = degradation_curves.groupby('cycle_count')['capacity_retention'].agg(['mean', 'std', 'count'])

    plt.plot(cycle_groups.index, cycle_groups['mean'], 'b-', linewidth=2, label='Mean')
    plt.fill_between(cycle_groups.index,
                     cycle_groups['mean'] - cycle_groups['std'],
                     cycle_groups['mean'] + cycle_groups['std'],
                     alpha=0.3, label='±1 STD')
    plt.xlabel('Cycle Count')
    plt.ylabel('Capacity Retention')
    plt.title('Average Degradation Trend')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Panel 3: distribution of step-wise degradation rates
    plt.subplot(2, 2, 3)
    if has_battery_id:
        degradation_rates = []
        for battery_id in degradation_curves['battery_id'].unique():
            battery_data = degradation_curves[degradation_curves['battery_id'] == battery_id].sort_values('cycle_count')
            if len(battery_data) > 1:
                # Capacity loss per cycle between consecutive measurements
                capacity_diff = battery_data['capacity_retention'].diff()
                cycle_diff = battery_data['cycle_count'].diff()
                rate = -capacity_diff / cycle_diff  # Negative because capacity decreases
                # Repeated cycle counts give a zero denominator (inf/NaN);
                # plt.hist cannot bin non-finite values, so drop them here.
                rate = rate[np.isfinite(rate)]
                degradation_rates.extend(rate.values)

        if degradation_rates:
            plt.hist(degradation_rates, bins=50, alpha=0.7, edgecolor='black')
        plt.xlabel('Degradation Rate (Capacity Loss per Cycle)')
        plt.ylabel('Frequency')
        plt.title('Degradation Rate Distribution')

    # Panel 4: end-of-life analysis
    plt.subplot(2, 2, 4)
    if has_battery_id:
        # First cycle at which each battery is at or below the EOL threshold.
        below_threshold = degradation_curves[degradation_curves['capacity_retention'] <= eol_threshold]
        eol_cycles = (
            below_threshold.groupby('battery_id', sort=False)['cycle_count']
            .min()
            .dropna()
            .tolist()
        )

        if eol_cycles:
            plt.hist(eol_cycles, bins=30, alpha=0.7, edgecolor='black')
            plt.xlabel('Cycles to End-of-Life (80% capacity)')
            plt.ylabel('Number of Batteries')
            plt.title('End-of-Life Distribution')
            plt.axvline(np.mean(eol_cycles), color='red', linestyle='--',
                        label=f'Mean: {np.mean(eol_cycles):.0f} cycles')
            plt.legend()

    plt.tight_layout()
    plt.show()

    # Degradation statistics
    print(f"Capacity Retention Statistics:")
    print(f"Mean: {degradation_curves['capacity_retention'].mean():.3f}")
    print(f"Min: {degradation_curves['capacity_retention'].min():.3f}")
    print(f"Max: {degradation_curves['capacity_retention'].max():.3f}")
    print(f"Standard Deviation: {degradation_curves['capacity_retention'].std():.3f}")

    if eol_cycles:
        print(f"\nEnd-of-Life Analysis (80% capacity threshold):")
        print(f"Mean cycles to EOL: {np.mean(eol_cycles):.0f}")
        print(f"Median cycles to EOL: {np.median(eol_cycles):.0f}")
        print(f"EOL range: {np.min(eol_cycles):.0f} - {np.max(eol_cycles):.0f} cycles")

# Degradation Mechanism Analysis
# Relates per-battery degradation rate to temperature: scatter + exponential
# fit, retention by temperature bin, temperature-coloured scatter, and an
# Arrhenius plot with an activation-energy estimate.
print("\n" + "="*60)
print("DEGRADATION MECHANISM ANALYSIS")
print("="*60)

# Per-battery {'temperature', 'degradation_rate'} records. Defined
# unconditionally so the Arrhenius panel can test it directly instead of
# probing locals().
battery_degradation = []

# Universal gas constant in J/(mol*K), used for the activation energy.
gas_constant = 8.314

# Upper bound on points drawn in the temperature-coloured scatter.
max_scatter_points = 1000

# Analyze different degradation mechanisms
if 'temperature' in degradation_curves.columns or 'temperature' in battery_telemetry.columns:
    # Temperature impact on degradation

    # Merge temperature data if needed (both frames must carry battery_id)
    if ('temperature' not in degradation_curves.columns
            and 'battery_id' in degradation_curves.columns
            and 'battery_id' in battery_telemetry.columns):
        # Calculate average temperature per battery
        temp_data = battery_telemetry.groupby('battery_id')['temperature'].mean().reset_index()
        degradation_curves = degradation_curves.merge(temp_data, on='battery_id', how='left')

    if 'temperature' in degradation_curves.columns:
        plt.figure(figsize=(15, 10))

        # Panel 1: temperature vs degradation rate
        plt.subplot(2, 2, 1)

        if 'battery_id' in degradation_curves.columns:
            for battery_id in degradation_curves['battery_id'].unique():
                battery_data = degradation_curves[degradation_curves['battery_id'] == battery_id]
                fit_points = battery_data[['cycle_count', 'capacity_retention']].dropna()
                # A line fit needs at least two distinct cycle counts;
                # linregress raises when all x values are identical.
                if fit_points['cycle_count'].nunique() < 2:
                    continue
                slope = stats.linregress(fit_points['cycle_count'].values,
                                         fit_points['capacity_retention'].values).slope
                avg_temp = battery_data['temperature'].mean()
                # Batteries without any temperature reading cannot be placed
                # on the temperature axis and would break the fits below.
                if pd.notna(avg_temp):
                    battery_degradation.append({'temperature': avg_temp, 'degradation_rate': -slope})

            if battery_degradation:
                deg_df = pd.DataFrame(battery_degradation)
                plt.scatter(deg_df['temperature'], deg_df['degradation_rate'], alpha=0.7)

                # Fit exponential relationship (Arrhenius-like)
                try:
                    # Start from a flat curve at the mean rate; the default
                    # guess (a=1, b=1) evaluates exp(T) and is far off scale.
                    rate_guess = [deg_df['degradation_rate'].mean(), 0.0]
                    popt, _ = optimize.curve_fit(lambda x, a, b: a * np.exp(b * x),
                                                 deg_df['temperature'].values,
                                                 deg_df['degradation_rate'].values,
                                                 p0=rate_guess)
                    temp_range = np.linspace(deg_df['temperature'].min(), deg_df['temperature'].max(), 100)
                    plt.plot(temp_range, popt[0] * np.exp(popt[1] * temp_range),
                             'r-', label=f'Exponential fit: {popt[0]:.2e} * exp({popt[1]:.3f} * T)')
                    plt.legend()
                except (RuntimeError, ValueError, TypeError) as fit_error:
                    # Best effort: the scatter is still useful without the curve,
                    # but say why the curve is missing instead of hiding it.
                    print(f"Exponential temperature fit skipped: {fit_error}")

                plt.xlabel('Average Temperature (°C)')
                plt.ylabel('Degradation Rate (per cycle)')
                plt.title('Temperature vs Degradation Rate')

        # Panel 2: mean capacity retention per temperature bin
        plt.subplot(2, 2, 2)
        temp_bins = pd.cut(degradation_curves['temperature'], bins=5)
        # observed=False keeps empty bins, matching the long-standing default.
        degradation_by_temp = degradation_curves.groupby(temp_bins, observed=False)['capacity_retention'].mean()

        temp_labels = [f"{interval.left:.1f}-{interval.right:.1f}°C" for interval in degradation_by_temp.index]
        plt.bar(range(len(temp_labels)), degradation_by_temp.values)
        plt.xlabel('Temperature Range')
        plt.ylabel('Average Capacity Retention')
        plt.title('Capacity Retention by Temperature Range')
        plt.xticks(range(len(temp_labels)), temp_labels, rotation=45)

        # Panel 3: cycle count vs capacity, coloured by temperature
        plt.subplot(2, 2, 3)
        # Down-sample only when there are more rows than requested; sampling
        # without replacement raises if n exceeds the number of rows.
        if len(degradation_curves) > max_scatter_points:
            sample_data = degradation_curves.sample(n=max_scatter_points, random_state=42)
        else:
            sample_data = degradation_curves

        scatter = plt.scatter(sample_data['cycle_count'], sample_data['capacity_retention'],
                              c=sample_data['temperature'], cmap='coolwarm', alpha=0.6)
        plt.colorbar(scatter, label='Temperature (°C)')
        plt.xlabel('Cycle Count')
        plt.ylabel('Capacity Retention')
        plt.title('Degradation vs Cycles (colored by temperature)')

        # Panel 4: Arrhenius plot for temperature dependence
        plt.subplot(2, 2, 4)
        if battery_degradation:
            deg_df = pd.DataFrame(battery_degradation)
            # ln(rate) is only defined for strictly positive rates.
            arrhenius_df = deg_df[deg_df['degradation_rate'] > 0]

            # Convert temperature to Kelvin and take reciprocal
            temp_kelvin = arrhenius_df['temperature'] + 273.15
            inv_temp = 1000 / temp_kelvin  # 1000/T for better scaling
            log_rate = np.log(arrhenius_df['degradation_rate'])

            plt.scatter(inv_temp, log_rate, alpha=0.7)
            plt.xlabel('1000/T (K⁻¹)')
            plt.ylabel('ln(Degradation Rate)')
            plt.title('Arrhenius Plot')

            # Linear fit for activation energy (needs two distinct temperatures)
            if inv_temp.nunique() > 1:
                try:
                    slope, intercept, r_value, _, _ = stats.linregress(inv_temp, log_rate)
                    plt.plot(inv_temp, slope * inv_temp + intercept, 'r-',
                             label=f'R² = {r_value**2:.3f}')
                    plt.legend()

                    # ln(k) = ln(A) - Ea / (R * T). The x axis is 1000/T, so
                    # the fitted slope equals -Ea / (1000 * R); the factor of
                    # 1000 must be restored to obtain Ea in J/mol.
                    activation_energy = -slope * 1000 * gas_constant
                    print(f"Estimated activation energy: {activation_energy:.1f} J/mol")
                except ValueError as fit_error:
                    print(f"Arrhenius fit skipped: {fit_error}")

        plt.tight_layout()
        plt.show()

# Degradation Modeling and Prediction
# Fits four capacity-vs-cycle models (linear, quadratic ridge, exponential
# decay, power law), compares them on a held-out split, and plots the results
# together with a random-forest feature importance panel.
print("\n" + "="*60)
print("DEGRADATION MODELING AND PREDICTION")
print("="*60)


def exponential_decay(x, a, b, c):
    """Exponential decay model: a * exp(-b * x) + c."""
    return a * np.exp(-b * x) + c


def power_law(x, a, b, c):
    """Power law model: a * (x + 1) ** -b + c (the +1 keeps x = 0 finite)."""
    return a * np.power(x + 1, -b) + c


# Row caps for the fits. Sampling happens only when the data is larger than
# the cap, because sampling without replacement raises if n exceeds the rows.
max_modeling_rows = 5000
max_rf_rows = 2000
# Smallest data set for which an 80/20 split and R² are still meaningful.
min_modeling_rows = 10

if 'capacity_retention' in degradation_curves.columns and 'cycle_count' in degradation_curves.columns:
    # Prepare data for modeling
    modeling_data = degradation_curves[['cycle_count', 'capacity_retention']].dropna()
    if len(modeling_data) > max_modeling_rows:
        # Sample for faster computation
        modeling_data = modeling_data.sample(n=max_modeling_rows, random_state=42)
else:
    modeling_data = None

if modeling_data is None or len(modeling_data) < min_modeling_rows:
    print("Insufficient data for degradation modeling")
else:
    X = modeling_data[['cycle_count']]
    y = modeling_data['capacity_retention']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Model 1: Linear degradation model
    linear_model = LinearRegression()
    linear_model.fit(X_train, y_train)
    y_pred_linear = linear_model.predict(X_test)

    # Model 2: Polynomial degradation model
    poly_features = PolynomialFeatures(degree=2)
    X_train_poly = poly_features.fit_transform(X_train)
    X_test_poly = poly_features.transform(X_test)

    poly_model = Ridge(alpha=1.0)
    poly_model.fit(X_train_poly, y_train)
    y_pred_poly = poly_model.predict(X_test_poly)

    # Data-scaled starting points for the non-linear fits. The curve_fit
    # default (all ones) puts b = 1, which makes exp(-b * x) underflow to zero
    # over hundreds of cycles and leaves the optimiser without a gradient.
    x_train_flat = X_train.values.flatten().astype(float)
    x_test_flat = X_test.values.flatten().astype(float)
    capacity_floor = float(y_train.min())
    capacity_span = max(float(y_train.max()) - capacity_floor, 1e-6)
    cycle_scale = max(float(np.abs(x_train_flat).max()), 1.0)

    # Model 3: Exponential decay model
    try:
        popt_exp, _ = optimize.curve_fit(exponential_decay, x_train_flat, y_train.values,
                                         p0=[capacity_span, 1.0 / cycle_scale, capacity_floor])
        y_pred_exp = exponential_decay(x_test_flat, *popt_exp)
        if not np.all(np.isfinite(y_pred_exp)):
            raise ValueError("non-finite predictions")
        exp_fit_ok = True
    except (RuntimeError, ValueError, TypeError) as fit_error:
        print(f"Exponential fit failed ({fit_error}); using linear predictions instead")
        y_pred_exp = y_pred_linear  # Fallback
        popt_exp = [1, 0, 0]
        exp_fit_ok = False

    # Model 4: Power law model
    try:
        popt_power, _ = optimize.curve_fit(power_law, x_train_flat, y_train.values,
                                           p0=[capacity_span, 0.5, capacity_floor])
        y_pred_power = power_law(x_test_flat, *popt_power)
        if not np.all(np.isfinite(y_pred_power)):
            raise ValueError("non-finite predictions")
        power_fit_ok = True
    except (RuntimeError, ValueError, TypeError) as fit_error:
        print(f"Power law fit failed ({fit_error}); using linear predictions instead")
        y_pred_power = y_pred_linear  # Fallback
        popt_power = [1, 0, 0]
        power_fit_ok = False

    # Evaluate models
    models = {
        'Linear': y_pred_linear,
        'Polynomial': y_pred_poly,
        'Exponential': y_pred_exp,
        'Power Law': y_pred_power
    }

    print("Model Performance Comparison:")
    print("-" * 40)

    model_scores = {}
    for name, predictions in models.items():
        mse = mean_squared_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        model_scores[name] = {'MSE': mse, 'R²': r2}
        print(f"{name:12} | MSE: {mse:.6f} | R²: {r2:.4f}")

    # Visualization of model predictions
    plt.figure(figsize=(15, 10))

    # Panel 1: predicted vs actual for every scored model
    plt.subplot(2, 2, 1)
    for name, predictions in models.items():
        plt.scatter(y_test, predictions, alpha=0.5, label=name)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', alpha=0.8)
    plt.xlabel('Actual Capacity Retention')
    plt.ylabel('Predicted Capacity Retention')
    plt.title('Model Predictions vs Actual')
    plt.legend()

    # Panel 2: residual analysis
    plt.subplot(2, 2, 2)
    residuals_linear = y_test - y_pred_linear
    residuals_poly = y_test - y_pred_poly

    plt.scatter(y_pred_linear, residuals_linear, alpha=0.5, label='Linear')
    plt.scatter(y_pred_poly, residuals_poly, alpha=0.5, label='Polynomial')
    plt.axhline(y=0, color='k', linestyle='--', alpha=0.8)
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title('Residual Analysis')
    plt.legend()

    # Panel 3: degradation curves with model fits
    plt.subplot(2, 2, 3)
    cycle_range = np.linspace(X.min().values[0], X.max().values[0], 200)
    # The sklearn models were fitted on a DataFrame, so predict on one with
    # the same column name rather than on a bare array.
    cycle_frame = pd.DataFrame({'cycle_count': cycle_range})

    # Plot sample of actual data (seeded so the figure is reproducible)
    sample_data = modeling_data.sample(n=min(1000, len(modeling_data)), random_state=42)
    plt.scatter(sample_data['cycle_count'], sample_data['capacity_retention'],
                alpha=0.3, s=1, color='gray', label='Data')

    # Plot model predictions
    plt.plot(cycle_range, linear_model.predict(cycle_frame),
             'r-', label='Linear', linewidth=2)

    cycle_range_poly = poly_features.transform(cycle_frame)
    plt.plot(cycle_range, poly_model.predict(cycle_range_poly),
             'g-', label='Polynomial', linewidth=2)

    # Only draw curves whose fit converged; the fallback parameters describe
    # a constant line and would be misleading on the chart.
    if exp_fit_ok:
        plt.plot(cycle_range, exponential_decay(cycle_range, *popt_exp),
                 'b-', label='Exponential', linewidth=2)
    if power_fit_ok:
        plt.plot(cycle_range, power_law(cycle_range, *popt_power),
                 'm-', label='Power Law', linewidth=2)

    plt.xlabel('Cycle Count')
    plt.ylabel('Capacity Retention')
    plt.title('Degradation Model Comparison')
    plt.legend()

    # Panel 4: feature importance (using Random Forest)
    plt.subplot(2, 2, 4)

    # Add more features if available
    feature_columns = ['cycle_count']
    if 'temperature' in degradation_curves.columns:
        feature_columns.append('temperature')
    if 'state_of_charge' in degradation_curves.columns:
        feature_columns.append('state_of_charge')

    rf_data = None
    if len(feature_columns) > 1:
        rf_data = degradation_curves[feature_columns + ['capacity_retention']].dropna()
        if len(rf_data) > max_rf_rows:
            rf_data = rf_data.sample(n=max_rf_rows, random_state=42)

    if rf_data is not None and len(rf_data) > 0:
        X_rf = rf_data[feature_columns]
        y_rf = rf_data['capacity_retention']

        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_rf, y_rf)

        feature_importance = rf_model.feature_importances_
        plt.bar(feature_columns, feature_importance)
        plt.xlabel('Features')
        plt.ylabel('Importance')
        plt.title('Feature Importance (Random Forest)')
        plt.xticks(rotation=45)
    else:
        plt.text(0.5, 0.5, 'Insufficient features\nfor importance analysis',
                 ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Feature Importance Analysis')

    plt.tight_layout()
    plt.show()

# Lifecycle Prediction and Forecasting
# For the first battery in the data: 80/20 chronological split, exponential
# smoothing forecast, physics-style curve fit, and extrapolation to the 80%
# end-of-life threshold.
print("\n" + "="*60)
print("LIFECYCLE PREDICTION AND FORECASTING")
print("="*60)


def physics_degradation_model(cycles, params):
    """
    Simplified physics-based degradation model.

    cycles: cycle count(s), scalar or array
    params: [initial_capacity, linear_fade_rate, sqrt_fade_rate]

    Returns initial_capacity - linear_fade_rate * cycles
            - sqrt_fade_rate * sqrt(cycles).
    """
    initial_cap, linear_rate, sqrt_rate = params
    return initial_cap - linear_rate * cycles - sqrt_rate * np.sqrt(cycles)


def _physics_curve(cycles, initial_cap, linear_rate, sqrt_rate):
    """Adapter for curve_fit, which passes each parameter positionally."""
    return physics_degradation_model(cycles, (initial_cap, linear_rate, sqrt_rate))


# Reset explicitly: the end-of-life step checks this value, and a leftover
# from an earlier notebook run must not be mistaken for the current fit.
popt_physics = None

if 'battery_id' in degradation_curves.columns:
    battery_ids = degradation_curves['battery_id'].dropna().unique()
else:
    battery_ids = []

if len(battery_ids) > 0:
    # Select a representative battery for detailed analysis
    sample_battery = battery_ids[0]

    battery_data = (
        degradation_curves[degradation_curves['battery_id'] == sample_battery]
        .dropna(subset=['cycle_count', 'capacity_retention'])
        .sort_values('cycle_count')
    )

    if len(battery_data) > 10:
        print(f"Analyzing battery {sample_battery} with {len(battery_data)} data points")

        # Prepare time series data
        ts_data = battery_data.set_index('cycle_count')['capacity_retention']

        # Split into train/test (first 80% of cycles for training)
        split_point = int(len(ts_data) * 0.8)
        train_data = ts_data[:split_point]
        test_data = ts_data[split_point:]

        plt.figure(figsize=(15, 10))

        # Panel 1: original time series
        plt.subplot(2, 2, 1)
        plt.plot(train_data.index, train_data.values, 'b-', label='Training Data', linewidth=2)
        plt.plot(test_data.index, test_data.values, 'g-', label='Test Data', linewidth=2)
        plt.xlabel('Cycle Count')
        plt.ylabel('Capacity Retention')
        plt.title('Battery Degradation Time Series')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # Panel 2: exponential smoothing forecast
        plt.subplot(2, 2, 2)
        try:
            exp_smooth_model = ExponentialSmoothing(train_data, trend='add', seasonal=None)
            exp_smooth_fit = exp_smooth_model.fit()

            # Forecast one step per held-out observation
            forecast_steps = len(test_data)
            forecast = exp_smooth_fit.forecast(steps=forecast_steps)
            # Use plain values so the forecast's own index cannot interfere
            # with plotting against the test-set cycle counts.
            forecast_values = np.asarray(forecast)

            plt.plot(train_data.index, train_data.values, 'b-', label='Training', linewidth=2)
            plt.plot(test_data.index, test_data.values, 'g-', label='Actual', linewidth=2)
            plt.plot(test_data.index, forecast_values, 'r--', label='Forecast', linewidth=2)

            # Calculate forecast accuracy
            forecast_mse = mean_squared_error(test_data.values, forecast_values)
            plt.title(f'Exponential Smoothing Forecast (MSE: {forecast_mse:.4f})')

        except Exception as e:
            # Best effort: show the failure inside the panel and carry on.
            plt.text(0.5, 0.5, f'Exponential smoothing failed:\n{str(e)[:50]}...',
                     ha='center', va='center', transform=plt.gca().transAxes)
            plt.title('Exponential Smoothing Forecast')

        plt.xlabel('Cycle Count')
        plt.ylabel('Capacity Retention')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # Panel 3: physics-based degradation model
        plt.subplot(2, 2, 3)

        try:
            cycles_train = train_data.index.values
            capacity_train = train_data.values

            # Initial parameter guess
            initial_guess = [1.0, 1e-5, 1e-3]

            # curve_fit calls f(x, p1, p2, p3); the model takes one params
            # sequence, so it is fitted through the positional adapter.
            popt_physics, _ = optimize.curve_fit(_physics_curve,
                                                 cycles_train, capacity_train,
                                                 p0=initial_guess)

            # Predict on test data
            cycles_test = test_data.index.values
            physics_forecast = physics_degradation_model(cycles_test, popt_physics)

            plt.plot(train_data.index, train_data.values, 'b-', label='Training', linewidth=2)
            plt.plot(test_data.index, test_data.values, 'g-', label='Actual', linewidth=2)
            plt.plot(test_data.index, physics_forecast, 'r--', label='Physics Model', linewidth=2)

            physics_mse = mean_squared_error(test_data.values, physics_forecast)
            plt.title(f'Physics-Based Model (MSE: {physics_mse:.4f})')

        except Exception as e:
            # Best effort: show the failure inside the panel and carry on.
            plt.text(0.5, 0.5, f'Physics model failed:\n{str(e)[:50]}...',
                     ha='center', va='center', transform=plt.gca().transAxes)
            plt.title('Physics-Based Model')

        plt.xlabel('Cycle Count')
        plt.ylabel('Capacity Retention')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # Panel 4: end-of-life prediction
        plt.subplot(2, 2, 4)

        # Extrapolate to predict end-of-life (80% capacity)
        eol_threshold = 0.8
        max_cycles = 5000  # Maximum cycles to predict

        extended_cycles = np.arange(0, max_cycles, 10)

        # Prefer the physics model when its fit succeeded
        if popt_physics is not None:
            extended_prediction = physics_degradation_model(extended_cycles, popt_physics)
            model_name = "Physics-Based"
        else:
            # Fallback to linear extrapolation
            slope, intercept, _, _, _ = stats.linregress(train_data.index, train_data.values)
            extended_prediction = slope * extended_cycles + intercept
            model_name = "Linear"

        plt.plot(extended_cycles, extended_prediction, 'b-', label=f'{model_name} Prediction', linewidth=2)
        plt.axhline(y=eol_threshold, color='r', linestyle='--', alpha=0.8, label='EOL Threshold (80%)')

        # Find EOL point: first extrapolated cycle at or below the threshold
        eol_indices = np.where(extended_prediction <= eol_threshold)[0]
        if len(eol_indices) > 0:
            eol_cycle = extended_cycles[eol_indices[0]]
            plt.axvline(x=eol_cycle, color='r', linestyle=':', alpha=0.8,
                        label=f'Predicted EOL: {eol_cycle:.0f} cycles')
            print(f"Predicted End-of-Life for battery {sample_battery}: {eol_cycle:.0f} cycles")
        else:
            print(f"Battery {sample_battery} is not predicted to reach end-of-life "
                  f"within {max_cycles} cycles")

        plt.xlabel('Cycle Count')
        plt.ylabel('Capacity Retention')
        plt.title('End-of-Life Prediction')
        plt.legend()
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

# Comparative Degradation Analysis
# Builds battery_summary: one record of degradation metrics per battery, for
# up to 20 batteries.
print("\n" + "="*60)
print("COMPARATIVE DEGRADATION ANALYSIS")
print("="*60)

# Always defined, so code that tests `battery_summary` afterwards does not
# raise NameError when the condition below is not met.
battery_summary = []

# Analyze degradation patterns across different conditions
if 'battery_id' in degradation_curves.columns and len(degradation_curves['battery_id'].unique()) > 5:
    has_temperature = 'temperature' in degradation_curves.columns

    for battery_id in degradation_curves['battery_id'].unique()[:20]:  # Limit for performance
        battery_data = degradation_curves[degradation_curves['battery_id'] == battery_id]

        # Too few points to describe a degradation trend
        if len(battery_data) <= 5:
            continue

        # Calculate degradation metrics. Initial/final capacity are taken as
        # the highest/lowest retention observed for the battery.
        initial_capacity = battery_data['capacity_retention'].max()
        final_capacity = battery_data['capacity_retention'].min()
        total_cycles = battery_data['cycle_count'].max() - battery_data['cycle_count'].min()

        # A zero cycle span would divide by zero below
        if not total_cycles > 0:
            continue

        avg_degradation_rate = (initial_capacity - final_capacity) / total_cycles

        # Additional metrics; 25.0 is the placeholder used when the data has
        # no temperature column at all.
        avg_temp = battery_data['temperature'].mean() if has_temperature else 25.0

        battery_summary.append({
            'battery_id': battery_id,
            'initial_capacity': initial_capacity,
            'final_capacity': final_capacity,
            'total_cycles': total_cycles,
            'degradation_rate': avg_degradation_rate,
            'avg_temperature': avg_temp
        })

if battery_summary:
    summary_df = pd.DataFrame(battery_summary)
    
    plt.figure(figsize=(15, 10))
    
    # Degradation rate distribution
    plt.subplot(2, 2, 1)
    plt.hist(summary_df['degradation_rate'], bins=20, alpha=0.7, edgecolor='black')
    plt.xlabel('Degradation Rate (per cycle)')
    plt.ylabel('Number of Batteries')
    plt.title('Degradation Rate Distribution')
    plt.axvline(summary_df['degradation_rate'].mean(), color='red', linestyle='--', 
               label=f'Mean: {summary_df["degradation_rate"].mean():.2e}')
    plt.legend()
    
    # Temperature vs degradation rate
    plt.subplot(2, 2, 2)
    plt.scatter(summary_df['avg_temperature'], summary_df['degradation_rate'], alpha=0.7)
    plt.xlabel('Average Temperature (°C)')
    plt.ylabel('Degradation Rate (per cycle)')
    plt.title('Temperature vs Degradation Rate')
    
    # Correlation analysis
    if len(summary_df) > 3:
        corr_coef = summary_df['avg_temperature'].corr(summary_df['degradation_rate'])
        plt.text(0.05, 0.95, f'Correlation: {corr_coef:.3f}', 
                transform=plt.gca().transAxes, verticalalignment='top')
    
    # Cycle life distribution
    plt.subplot(2, 2, 3)
    plt.hist(summary_df['total_cycles'], bins=20, alpha=0.7, edgecolor='black')
    plt.xlabel('Total Cycles Observed')
    plt.ylabel('Number of Batteries')
    plt.title('Cycle Life Distribution')
    
    # Capacity retention vs cycles
    plt.subplot(2, 2, 4)
    plt.scatter(summary_df['total_cycles'], summary_df['final_capacity'], alpha=0.7)
    plt.xlabel('Total Cycles')
    plt.ylabel('Final Capacity Retention')
    plt.title('Capacity Retention vs Cycle Count')
    
    plt.tight_layout()
    plt.show()
    
    # Statistical summary
    print("Battery Degradation Summary Statistics:")
    print("-" * 40)
    print(f"Number of batteries analyzed: {len(summary_df)}")
    print(f"Average degradation rate: {summary_df['degradation_rate'].mean():.2e} per cycle")
    print(f"Degradation rate std dev: {summary_df['degradation_rate'].std():.2e}")
    print(f"Average cycles observed: {summary_df['total_cycles'].mean():.0f}")
    print(f"Average final capacity: {summary_df['final_capacity'].mean():.3f}")
    
    # Identify best and worst performing batteries
    best_battery = summary_df.loc[summary_df['degradation_rate'].idxmin()]
    worst_battery = summary_df.loc[summary_df['degradation_rate'].idxmax()]
    
    print(f"\nBest performing battery: {best_battery['battery_id']}")
    print(f"  Degradation rate: {best_battery['degradation_rate']:.2e} per cycle")
    print(f"  Average temperature: {best_battery['avg_temperature']:.1f}°C")
    
    print(f"\nWorst performing battery: {worst_battery['battery_id']}")
    print(f"  Degradation rate: {worst_battery['degradation_rate']:.2e} per cycle")
    print(f"  Average temperature: {worst_battery['avg_temperature']:.1f}°C")
    
    # Advanced degradation pattern analysis
    print("\n" + "="*60)
    print("ADVANCED DEGRADATION PATTERN ANALYSIS")
    print("="*60)
    
    # Clustering analysis for degradation patterns
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA
    
    # ------------------------------------------------------------------
    # Cluster batteries by degradation behaviour.
    #
    # Standardises the available summary features, picks a cluster count
    # with an elbow heuristic on KMeans inertia, stores the label of each
    # battery in summary_df['cluster'], then plots and prints a per-cluster
    # summary. Columns that are missing from summary_df are skipped instead
    # of raising KeyError.
    # ------------------------------------------------------------------
    clustering_features = ['degradation_rate', 'avg_temperature', 'total_cycles',
                           'final_capacity', 'avg_voltage', 'avg_current']

    # Keep only features that exist AND hold at least one value: an all-NaN
    # column cannot be mean-imputed and would make KMeans fail on NaN input.
    available_features = [
        col for col in clustering_features
        if col in summary_df.columns and summary_df[col].notna().any()
    ]

    # KMeans with k=2 and the 2-component PCA below both need >= 2 rows.
    if len(available_features) >= 3 and len(summary_df) >= 2:
        # Mean-impute the remaining gaps so every battery keeps a row.
        X_cluster = summary_df[available_features].fillna(summary_df[available_features].mean())

        # Standardize features so no single unit dominates the distance metric
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_cluster)

        # Inertia for each candidate k (k must stay below the number of rows)
        k_range = range(2, min(8, len(summary_df)))
        inertias = [
            KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled).inertia_
            for k in k_range
        ]

        # Elbow = point of maximum discrete curvature (second difference) of
        # the inertia curve, i.e. where adding clusters stops paying off.
        # Taking the largest first difference instead would select the k just
        # before the steepest drop, which is almost always the smallest k.
        if len(inertias) >= 3:
            curvature = np.diff(inertias, n=2)
            # curvature[i] is centred on k_range[i + 1]
            optimal_k = k_range[int(np.argmax(curvature)) + 1]
        else:
            # Too few candidates to locate a bend; use the minimum useful k.
            optimal_k = 2

        # Perform clustering
        kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(X_scaled)
        summary_df['cluster'] = cluster_labels

        def _note_missing_column(column):
            """Mark the current subplot as empty because `column` is absent."""
            plt.text(0.5, 0.5, f"'{column}' not available",
                     ha='center', va='center', transform=plt.gca().transAxes)

        def _plot_cluster_means(position, column, ylabel, title):
            """Bar-plot the per-cluster mean of `column` in subplot `position`.

            Returns the per-cluster means, or None when `column` is missing.
            """
            plt.subplot(2, 3, position)
            plt.title(title)
            if column not in summary_df.columns:
                _note_missing_column(column)
                return None
            means = summary_df.groupby('cluster')[column].mean()
            plt.bar(range(len(means)), means.values, alpha=0.7)
            plt.xlabel('Cluster')
            plt.ylabel(ylabel)
            plt.xticks(range(len(means)), [f'Cluster {i}' for i in means.index])
            return means

        # Visualize clustering results
        plt.figure(figsize=(15, 12))

        # PCA is used only to project the scaled features to 2-D for plotting
        pca = PCA(n_components=2)
        X_pca = pca.fit_transform(X_scaled)

        plt.subplot(2, 3, 1)
        scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis', alpha=0.7)
        plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%} variance)')
        plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%} variance)')
        plt.title('Battery Degradation Clusters (PCA)')
        plt.colorbar(scatter, label='Cluster')

        # Cluster characteristics
        cluster_means = _plot_cluster_means(
            2, 'degradation_rate', 'Mean Degradation Rate', 'Degradation Rate by Cluster')

        # Temperature distribution by cluster
        plt.subplot(2, 3, 3)
        plt.title('Temperature Distribution by Cluster')
        if 'avg_temperature' in summary_df.columns:
            for cluster_id in sorted(summary_df['cluster'].unique()):
                cluster_data = summary_df[summary_df['cluster'] == cluster_id]
                plt.hist(cluster_data['avg_temperature'], alpha=0.5,
                         label=f'Cluster {cluster_id}', bins=10)
            plt.xlabel('Average Temperature (°C)')
            plt.ylabel('Frequency')
            plt.legend()
        else:
            _note_missing_column('avg_temperature')

        # Capacity retention by cluster
        cluster_capacity_means = _plot_cluster_means(
            4, 'final_capacity', 'Mean Final Capacity', 'Capacity Retention by Cluster')

        # Cycle life by cluster
        cluster_cycle_means = _plot_cluster_means(
            5, 'total_cycles', 'Mean Total Cycles', 'Cycle Life by Cluster')

        # Feature importance: absolute loading of each feature on PC1
        plt.subplot(2, 3, 6)
        feature_importance = np.abs(pca.components_[0])
        plt.bar(range(len(available_features)), feature_importance, alpha=0.7)
        plt.xlabel('Features')
        plt.ylabel('PCA Component 1 Weight')
        plt.title('Feature Importance (PC1)')
        plt.xticks(range(len(available_features)), available_features, rotation=45)

        plt.tight_layout()
        plt.show()

        # Print cluster analysis
        print(f"\nClustering Analysis Results:")
        print(f"Optimal number of clusters: {optimal_k}")
        print(f"Features used: {', '.join(available_features)}")
        print(f"PCA explained variance: PC1={pca.explained_variance_ratio_[0]:.2%}, PC2={pca.explained_variance_ratio_[1]:.2%}")

        # (column, label, number format, unit suffix) for the per-cluster report
        cluster_report_lines = [
            ('degradation_rate', 'Mean degradation rate', '.2e', ''),
            ('avg_temperature', 'Mean temperature', '.1f', '°C'),
            ('final_capacity', 'Mean final capacity', '.3f', ''),
            ('total_cycles', 'Mean cycles', '.0f', ''),
        ]

        for cluster_id in sorted(summary_df['cluster'].unique()):
            cluster_data = summary_df[summary_df['cluster'] == cluster_id]
            print(f"\nCluster {cluster_id} ({len(cluster_data)} batteries):")
            for column, label, number_format, unit in cluster_report_lines:
                # Report only statistics whose source column is present
                if column in cluster_data.columns:
                    print(f"  {label}: {cluster_data[column].mean():{number_format}}{unit}")
    else:
        print(f"\nSkipping clustering analysis: need at least 3 usable features and 2 batteries "
              f"(found {len(available_features)} features, {len(summary_df)} batteries).")
    
    # Time series analysis for degradation patterns
    # Prints a section banner, then (when battery_data has a 'timestamp'
    # column) aggregates readings per battery and calendar month and fits a
    # straight line to each battery's monthly mean capacity.
    print("\n" + "="*60)
    print("TIME SERIES DEGRADATION ANALYSIS")
    print("="*60)
    
    # Analyze degradation trends over time
    if 'timestamp' in battery_data.columns:
        # Convert timestamp to datetime if it's not already
        # (overwrites the 'timestamp' column of battery_data in place)
        battery_data['timestamp'] = pd.to_datetime(battery_data['timestamp'])
        
        # Group by time periods to analyze degradation trends
        # Adds a monthly-period 'month' column to battery_data itself, then
        # averages capacity, temperature, voltage and current per
        # (battery_id, month) pair.
        battery_data['month'] = battery_data['timestamp'].dt.to_period('M')
        monthly_degradation = battery_data.groupby(['battery_id', 'month']).agg({
            'capacity': 'mean',
            'temperature': 'mean',
            'voltage': 'mean',
            'current': 'mean'
        }).reset_index()
        
        # Calculate monthly degradation rates
        # One record per battery that has more than one monthly row;
        # batteries with a single month are left out of degradation_trends.
        degradation_trends = []
        for battery_id in monthly_degradation['battery_id'].unique():
            battery_monthly = monthly_degradation[monthly_degradation['battery_id'] == battery_id].sort_values('month')
            if len(battery_monthly) > 1:
                # Degree-1 least-squares fit; [0] is the slope. The x-axis is
                # the row position 0..n-1, so the slope is the capacity change
                # per observed month.
                # NOTE(review): if a battery has gaps between months this is
                # not a per-calendar-month rate - confirm that is intended.
                capacity_trend = np.polyfit(range(len(battery_monthly)), battery_monthly['capacity'], 1)[0]
                degradation_trends.append({
                    'battery_id': battery_id,
                    'monthly_degradation_rate': -capacity_trend,  # Sign flipped: falling capacity gives a positive rate
                    'months_observed': len(battery_monthly)
                })
        
        if degradation_trends:
            trends_df = pd.DataFrame(degradation_trends)

