# In [None]:
"""
BatteryMind - Battery Data Exploration Notebook

Comprehensive exploratory data analysis for battery telemetry data, sensor readings,
and performance metrics. This notebook provides insights into battery behavior patterns,
data quality assessment, and feature engineering opportunities.

Features:
- Multi-modal sensor data analysis
- Battery performance visualization
- Statistical analysis of battery parameters
- Data quality assessment and anomaly detection
- Feature correlation analysis
- Time-series pattern identification

Author: BatteryMind Development Team
Version: 1.0.0
"""

# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Scientific computing
from scipy import stats
from scipy.signal import find_peaks, savgol_filter
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest

# Time series analysis
import datetime as dt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller

# Plot appearance: matplotlib's seaborn-v0_8 style sheet plus the "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Display configuration: lift pandas' column, width and cell-width limits
for display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

print("BatteryMind - Battery Data Exploration Notebook")
print("=" * 50)

# Read the four synthetic CSV datasets; the paths are relative, so they
# resolve against the current working directory.
print("Loading battery telemetry data...")

battery_telemetry = pd.read_csv(
    '../../training-data/synthetic_datasets/battery_telemetry.csv'
)
degradation_curves = pd.read_csv(
    '../../training-data/synthetic_datasets/degradation_curves.csv'
)
environmental_data = pd.read_csv(
    '../../training-data/synthetic_datasets/environmental_data.csv'
)
usage_profiles = pd.read_csv(
    '../../training-data/synthetic_datasets/usage_profiles.csv'
)

# Report (rows, columns) for each frame that was just loaded
print(f"Battery telemetry data shape: {battery_telemetry.shape}")
print(f"Degradation curves data shape: {degradation_curves.shape}")
print(f"Environmental data shape: {environmental_data.shape}")
print(f"Usage profiles data shape: {usage_profiles.shape}")

# Data Overview and Basic Statistics
print("\n" + "="*50)
print("BATTERY TELEMETRY DATA OVERVIEW")
print("="*50)

print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
battery_telemetry.info()

print("\nBasic Statistics:")
print(battery_telemetry.describe())

print("\nMissing Values:")
print(battery_telemetry.isnull().sum())

print("\nData Types:")
print(battery_telemetry.dtypes)

# Convert timestamp column if exists: parse it to datetimes and make it the
# index so the later time-series plots use it as the x axis.
if 'timestamp' in battery_telemetry.columns:
    battery_telemetry['timestamp'] = pd.to_datetime(battery_telemetry['timestamp'])
    battery_telemetry.set_index('timestamp', inplace=True)

# Battery Parameter Distribution Analysis
print("\n" + "="*50)
print("BATTERY PARAMETER DISTRIBUTIONS")
print("="*50)

# Key battery parameters to analyze; only those present in the telemetry
# frame are kept (and reused by later sections as `available_params`).
battery_params = ['voltage', 'current', 'temperature', 'state_of_charge', 'state_of_health', 'capacity']
available_params = [param for param in battery_params if param in battery_telemetry.columns]

print(f"Available battery parameters: {available_params}")

# Create distribution plots on a fixed 2x3 grid (one panel per parameter)
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

# zip() stops at the shorter sequence, so at most six parameters are drawn.
for ax, param in zip(axes, available_params):
    pretty_name = param.replace("_", " ").title()

    # Histogram with KDE
    sns.histplot(battery_telemetry[param], kde=True, ax=ax)
    ax.set_title(f'{pretty_name} Distribution')
    ax.set_xlabel(pretty_name)
    ax.set_ylabel('Frequency')

    # Mark the mean with a dashed vertical line and show it in the legend
    mean_val = battery_telemetry[param].mean()
    ax.axvline(mean_val, color='red', linestyle='--', alpha=0.7, label=f'Mean: {mean_val:.2f}')
    ax.legend()

# Hide panels that received no parameter so the figure has no empty frames.
for ax in axes[len(available_params):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

# Battery Performance Over Time Analysis
print("\n" + "="*50)
print("BATTERY PERFORMANCE TIME SERIES ANALYSIS")
print("="*50)

# Time series plots for key parameters.
# The extra `available_params` check avoids plt.subplots(0, 1), which raises
# because a figure needs at least one row.
if 'battery_id' in battery_telemetry.columns and available_params:
    # Select a sample of batteries for detailed analysis (first five ids seen)
    sample_batteries = battery_telemetry['battery_id'].unique()[:5]

    # Filter the rows of each sampled battery once instead of once per parameter.
    sample_frames = [
        (battery_id, battery_telemetry[battery_telemetry['battery_id'] == battery_id])
        for battery_id in sample_batteries
    ]

    # squeeze=False always returns a 2-D array of axes, so a single parameter
    # needs no special-casing.
    fig, axes = plt.subplots(len(available_params), 1,
                             figsize=(15, 3*len(available_params)), squeeze=False)
    axes = axes.ravel()

    for ax, param in zip(axes, available_params):
        pretty_name = param.replace("_", " ").title()

        for battery_id, battery_data in sample_frames:
            if len(battery_data) > 0:
                ax.plot(battery_data.index, battery_data[param],
                        label=f'Battery {battery_id}', alpha=0.7)

        ax.set_title(f'{pretty_name} Over Time')
        ax.set_ylabel(pretty_name)
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Correlation Analysis
print("\n" + "="*50)
print("CORRELATION ANALYSIS")
print("="*50)

# Pairwise correlations across every numeric telemetry column
numeric_columns = battery_telemetry.select_dtypes(include=[np.number]).columns
correlation_matrix = battery_telemetry[numeric_columns].corr()

print("Correlation Matrix:")
print(correlation_matrix.round(3))

# Heatmap of the lower triangle only: the mask blanks the upper triangle
# together with the diagonal.
plt.figure(figsize=(12, 10))
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": .8},
)
plt.title('Battery Parameter Correlation Matrix')
plt.tight_layout()
plt.show()

# Collect every unordered column pair (row < col) whose absolute correlation
# exceeds the 0.7 threshold, in row-major order.
corr_labels = correlation_matrix.columns
n_labels = len(corr_labels)
strong_correlations = [
    {
        'param1': corr_labels[row],
        'param2': corr_labels[col],
        'correlation': correlation_matrix.iloc[row, col],
    }
    for row in range(n_labels)
    for col in range(row + 1, n_labels)
    if abs(correlation_matrix.iloc[row, col]) > 0.7
]

print(f"\nStrong Correlations (|r| > 0.7):")
for corr in strong_correlations:
    print(f"{corr['param1']} - {corr['param2']}: {corr['correlation']:.3f}")

# Battery Health Analysis
print("\n" + "="*50)
print("BATTERY HEALTH ANALYSIS")
print("="*50)

if 'state_of_health' in battery_telemetry.columns:
    soh = battery_telemetry['state_of_health']

    # SOH distribution by battery age/cycles
    if 'cycle_count' in battery_telemetry.columns:
        has_temperature = 'temperature' in battery_telemetry.columns

        # Scatter plot: SOH vs Cycle Count
        plt.figure(figsize=(12, 6))

        plt.subplot(1, 2, 1)
        if has_temperature:
            # Colour each reading by its temperature
            scatter = plt.scatter(battery_telemetry['cycle_count'], soh,
                                  c=battery_telemetry['temperature'],
                                  cmap='coolwarm', alpha=0.6)
        else:
            # A colormap has nothing to map with a constant colour, so it is
            # only passed in the temperature branch above.
            scatter = plt.scatter(battery_telemetry['cycle_count'], soh,
                                  c='blue', alpha=0.6)
        plt.xlabel('Cycle Count')
        plt.ylabel('State of Health')
        plt.title('SOH vs Cycle Count')
        if has_temperature:
            plt.colorbar(scatter, label='Temperature (°C)')

        # SOH histogram by health categories
        plt.subplot(1, 2, 2)
        # include_lowest=True keeps SOH == 0 in the first bin; the default
        # right-closed bins would otherwise turn it into NaN.
        soh_categories = pd.cut(soh,
                                bins=[0, 0.7, 0.8, 0.9, 1.0],
                                labels=['Poor', 'Fair', 'Good', 'Excellent'],
                                include_lowest=True)
        # sort_index() shows the bars in category order (Poor -> Excellent)
        # instead of value_counts' default most-frequent-first order.
        soh_categories.value_counts().sort_index().plot(kind='bar')
        plt.title('Battery Health Distribution')
        plt.xlabel('Health Category')
        plt.ylabel('Count')
        plt.xticks(rotation=45)

        plt.tight_layout()
        plt.show()

    # SOH statistics
    print(f"SOH Statistics:")
    print(f"Mean SOH: {soh.mean():.3f}")
    print(f"Median SOH: {soh.median():.3f}")
    print(f"SOH Standard Deviation: {soh.std():.3f}")

    # Each row is one reading, so counting rows does not count batteries.
    # Report distinct batteries when an id column exists; otherwise label the
    # figure as a reading count.
    low_soh = soh < 0.8
    if 'battery_id' in battery_telemetry.columns:
        low_soh_batteries = battery_telemetry.loc[low_soh, 'battery_id'].nunique()
        print(f"Batteries with SOH < 0.8: {low_soh_batteries}")
    else:
        print(f"Readings with SOH < 0.8: {low_soh.sum()}")

# Temperature Impact Analysis
print("\n" + "="*50)
print("TEMPERATURE IMPACT ANALYSIS")
print("="*50)

if 'temperature' in battery_telemetry.columns:
    temperature = battery_telemetry['temperature']

    # Temperature distribution
    plt.figure(figsize=(15, 5))

    plt.subplot(1, 3, 1)
    sns.histplot(temperature, bins=50, kde=True)
    plt.title('Temperature Distribution')
    plt.xlabel('Temperature (°C)')

    # Temperature vs Performance
    if 'state_of_health' in battery_telemetry.columns:
        plt.subplot(1, 3, 2)
        plt.scatter(temperature, battery_telemetry['state_of_health'], alpha=0.5)
        plt.xlabel('Temperature (°C)')
        plt.ylabel('State of Health')
        plt.title('Temperature vs SOH')

        # Add trend line. np.polyfit cannot handle NaN, so fit only the rows
        # where both values are present, and only when there are at least two
        # distinct temperatures to define a line.
        fit_data = battery_telemetry[['temperature', 'state_of_health']].dropna()
        if fit_data['temperature'].nunique() >= 2:
            z = np.polyfit(fit_data['temperature'], fit_data['state_of_health'], 1)
            p = np.poly1d(z)
            # A straight line is fully described by its two end points; this
            # avoids re-drawing it through every (unsorted) reading.
            temp_span = np.array([fit_data['temperature'].min(),
                                  fit_data['temperature'].max()])
            plt.plot(temp_span, p(temp_span), "r--", alpha=0.8)

    # Temperature extremes analysis
    plt.subplot(1, 3, 3)
    temp_ranges = pd.cut(temperature,
                         bins=[-np.inf, 0, 25, 40, np.inf],
                         labels=['Cold (<0°C)', 'Cool (0-25°C)', 'Normal (25-40°C)', 'Hot (>40°C)'])
    # sort_index() keeps the bars in Cold -> Hot order rather than ordering
    # them by frequency.
    temp_ranges.value_counts().sort_index().plot(kind='bar')
    plt.title('Temperature Range Distribution')
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

    # Temperature statistics
    print(f"Temperature Statistics:")
    print(f"Mean Temperature: {temperature.mean():.2f}°C")
    print(f"Temperature Range: {temperature.min():.2f}°C to {temperature.max():.2f}°C")
    print(f"Extreme Cold (<0°C): {(temperature < 0).sum()} readings")
    print(f"Extreme Hot (>50°C): {(temperature > 50).sum()} readings")

# Charging Pattern Analysis
print("\n" + "="*50)
print("CHARGING PATTERN ANALYSIS")
print("="*50)

if 'current' in battery_telemetry.columns and 'state_of_charge' in battery_telemetry.columns:
    # Identify charging vs discharging: positive current counts as charging,
    # everything else (including zero current) as discharging.
    battery_telemetry['charging'] = battery_telemetry['current'] > 0

    # Derive power BEFORE slicing. The boolean-indexed subsets below are
    # separate frames, so a column added to battery_telemetry afterwards
    # would be missing from them (KeyError on charging_data['power']).
    has_voltage = 'voltage' in battery_telemetry.columns
    if has_voltage:
        battery_telemetry['power'] = battery_telemetry['voltage'] * battery_telemetry['current']

    # Charging statistics
    charging_data = battery_telemetry[battery_telemetry['charging']]
    discharging_data = battery_telemetry[~battery_telemetry['charging']]

    print(f"Charging cycles: {len(charging_data)} ({len(charging_data)/len(battery_telemetry)*100:.1f}%)")
    print(f"Discharging cycles: {len(discharging_data)} ({len(discharging_data)/len(battery_telemetry)*100:.1f}%)")

    # Charging pattern visualization
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Current distribution during charging/discharging
    axes[0, 0].hist(charging_data['current'], bins=50, alpha=0.7, label='Charging', color='green')
    axes[0, 0].hist(discharging_data['current'], bins=50, alpha=0.7, label='Discharging', color='red')
    axes[0, 0].set_xlabel('Current (A)')
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].set_title('Current Distribution')
    axes[0, 0].legend()

    # SOC during charging/discharging
    axes[0, 1].scatter(charging_data['state_of_charge'], charging_data['current'],
                       alpha=0.5, color='green', label='Charging', s=1)
    axes[0, 1].scatter(discharging_data['state_of_charge'], discharging_data['current'],
                       alpha=0.5, color='red', label='Discharging', s=1)
    axes[0, 1].set_xlabel('State of Charge')
    axes[0, 1].set_ylabel('Current (A)')
    axes[0, 1].set_title('Current vs SOC')
    axes[0, 1].legend()

    # Charging efficiency analysis: needs both the derived power column and a
    # temperature column; otherwise the panel is hidden instead of failing.
    if has_voltage and 'temperature' in battery_telemetry.columns:
        axes[1, 0].scatter(charging_data['power'], charging_data['temperature'],
                           alpha=0.5, color='orange', s=1)
        axes[1, 0].set_xlabel('Power (W)')
        axes[1, 0].set_ylabel('Temperature (°C)')
        axes[1, 0].set_title('Power vs Temperature (Charging)')
    else:
        axes[1, 0].set_visible(False)

    # SOC distribution
    axes[1, 1].hist(battery_telemetry['state_of_charge'], bins=50, alpha=0.7)
    axes[1, 1].set_xlabel('State of Charge')
    axes[1, 1].set_ylabel('Frequency')
    axes[1, 1].set_title('SOC Distribution')

    plt.tight_layout()
    plt.show()

# Data Quality Assessment
print("\n" + "="*50)
print("DATA QUALITY ASSESSMENT")
print("="*50)

# Per-column null counts, and the same counts as a percentage of all rows
missing_data = battery_telemetry.isnull().sum()
missing_percentage = (missing_data / len(battery_telemetry)) * 100

print("Missing Data Summary:")
# Only columns with at least one missing value are listed
columns_with_gaps = missing_data[missing_data > 0]
for col, missing_count in columns_with_gaps.items():
    print(f"{col}: {missing_count} ({missing_percentage[col]:.2f}%)")

# Outlier detection using IQR method
def detect_outliers_iqr(data, column, multiplier=1.5):
    """Return the rows of *data* whose *column* value lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1 and Q3 are the 25th and 75th percentiles of the column and
    ``IQR = Q3 - Q1``.

    Args:
        data: DataFrame holding the column to inspect.
        column: Label of the column to test.
        multiplier: Width of the fences in units of IQR. Defaults to 1.5,
            the value that was previously hard-coded.

    Returns:
        A tuple ``(outliers, lower_bound, upper_bound)``: the rows strictly
        below the lower fence or strictly above the upper fence, followed by
        the two fence values. Rows with a missing value are never returned,
        because both comparisons are false for NaN.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)

    # Distance from each quartile to its fence
    spread = multiplier * (q3 - q1)
    lower_bound = q1 - spread
    upper_bound = q3 + spread

    outliers = data[(values < lower_bound) | (values > upper_bound)]
    return outliers, lower_bound, upper_bound

print("\nOutlier Analysis (IQR Method):")
for param in available_params:
    # Skip parameters that are no longer present in the frame
    if param not in battery_telemetry.columns:
        continue
    outliers, lower, upper = detect_outliers_iqr(battery_telemetry, param)
    print(f"{param}: {len(outliers)} outliers ({len(outliers)/len(battery_telemetry)*100:.2f}%)")
    print(f"  Valid range: {lower:.3f} to {upper:.3f}")

# Anomaly detection using Isolation Forest (only with more than one numeric column)
if len(numeric_columns) > 1:
    # Missing values are replaced with 0, then every column is standardised
    numeric_frame = battery_telemetry[numeric_columns].fillna(0)
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(numeric_frame)

    # Fit the forest and label each row; -1 marks an anomaly (see the count below)
    iso_forest = IsolationForest(contamination=0.1, random_state=42)
    anomalies = iso_forest.fit_predict(scaled_data)

    # Keep the labels on the frame and count the rows flagged as -1
    battery_telemetry['anomaly'] = anomalies
    anomaly_count = (anomalies == -1).sum()

    print(f"\nAnomaly Detection (Isolation Forest):")
    print(f"Detected anomalies: {anomaly_count} ({anomaly_count/len(battery_telemetry)*100:.2f}%)")

# Feature Engineering Opportunities
print("\n" + "="*50)
print("FEATURE ENGINEERING OPPORTUNITIES")
print("="*50)

# Time-based features
# isinstance() also recognises timezone-aware and non-nanosecond datetime
# indexes, which an exact dtype comparison with 'datetime64[ns]' rejects.
if isinstance(battery_telemetry.index, pd.DatetimeIndex):
    battery_telemetry['hour'] = battery_telemetry.index.hour
    battery_telemetry['day_of_week'] = battery_telemetry.index.dayofweek
    battery_telemetry['month'] = battery_telemetry.index.month

    print("Time-based features created:")
    print("- Hour of day")
    print("- Day of week")
    print("- Month")

# The frame can hold readings from several batteries. Row-to-row differences
# and rolling windows are therefore computed within each battery when an id
# column is available, so a value is never compared with another battery's row.
per_battery = 'battery_id' in battery_telemetry.columns

# Derived features
derived_features = []

if 'voltage' in battery_telemetry.columns and 'current' in battery_telemetry.columns:
    battery_telemetry['power'] = battery_telemetry['voltage'] * battery_telemetry['current']
    derived_features.append('power')

if 'state_of_charge' in battery_telemetry.columns:
    if per_battery:
        battery_telemetry['soc_change'] = (
            battery_telemetry.groupby('battery_id', sort=False)['state_of_charge'].diff()
        )
    else:
        battery_telemetry['soc_change'] = battery_telemetry['state_of_charge'].diff()
    derived_features.append('soc_change')

if 'temperature' in battery_telemetry.columns:
    if per_battery:
        battery_telemetry['temp_change'] = (
            battery_telemetry.groupby('battery_id', sort=False)['temperature'].diff()
        )
    else:
        battery_telemetry['temp_change'] = battery_telemetry['temperature'].diff()
    derived_features.append('temp_change')

print(f"\nDerived features created: {derived_features}")

# Rolling statistics
window_size = 10
if len(battery_telemetry) > window_size:
    for param in ['voltage', 'current', 'temperature']:
        if param not in battery_telemetry.columns:
            continue
        if per_battery:
            # transform() returns a result aligned with the original rows
            grouped = battery_telemetry.groupby('battery_id', sort=False)[param]
            rolling_mean = grouped.transform(lambda s: s.rolling(window=window_size).mean())
            rolling_std = grouped.transform(lambda s: s.rolling(window=window_size).std())
        else:
            windowed = battery_telemetry[param].rolling(window=window_size)
            rolling_mean = windowed.mean()
            rolling_std = windowed.std()
        battery_telemetry[f'{param}_rolling_mean'] = rolling_mean
        battery_telemetry[f'{param}_rolling_std'] = rolling_std

print(f"Rolling statistics (window={window_size}) created for available parameters")

# Principal Component Analysis
print("\n" + "="*50)
print("PRINCIPAL COMPONENT ANALYSIS")
print("="*50)

# Prepare data for PCA: numeric columns, missing values replaced with 0,
# then standardised.
pca_data = battery_telemetry[numeric_columns].fillna(0)
scaler = StandardScaler()
scaled_pca_data = scaler.fit_transform(pca_data)

# Apply PCA (all components kept)
pca = PCA()
pca_result = pca.fit_transform(scaled_pca_data)

# Plot explained variance
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1),
         pca.explained_variance_ratio_, 'bo-')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.title('PCA - Explained Variance by Component')
plt.grid(True)

plt.subplot(1, 2, 2)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, 'ro-')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA - Cumulative Explained Variance')
plt.axhline(y=0.95, color='k', linestyle='--', alpha=0.7, label='95% Variance')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

# Find number of components for 95% variance: argmax returns the position of
# the first True, i.e. the first component count reaching the threshold.
n_components_95 = np.argmax(cumulative_variance >= 0.95) + 1
print(f"Components needed for 95% variance: {n_components_95}")

# Report the first three components, or all of them when fewer exist;
# indexing position 2 unconditionally raises IndexError in that case.
n_top_components = min(3, len(cumulative_variance))
if n_top_components > 0:
    print(f"Total variance explained by first {n_top_components} components: "
          f"{cumulative_variance[n_top_components - 1]:.3f}")

# Clustering Analysis
print("\n" + "="*50)
print("CLUSTERING ANALYSIS")
print("="*50)

# K-means clustering on PCA results
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# Use first 3 PCA components; the slice simply yields fewer columns when the
# decomposition produced fewer than three.
clusters = kmeans.fit_predict(pca_result[:, :3])

battery_telemetry['cluster'] = clusters

# Number of principal components actually available for plotting
n_pcs = pca_result.shape[1]

# Visualize clusters
fig = plt.figure(figsize=(15, 5))

# 2D cluster visualization (needs at least two components)
ax1 = fig.add_subplot(131)
if n_pcs >= 2:
    scatter = ax1.scatter(pca_result[:, 0], pca_result[:, 1], c=clusters, cmap='viridis', alpha=0.6)
    ax1.set_xlabel('First Principal Component')
    ax1.set_ylabel('Second Principal Component')
    ax1.set_title('Battery Data Clusters (2D)')
    # Attach the colourbar to this panel explicitly rather than relying on
    # whichever axes happens to be current.
    fig.colorbar(scatter, ax=ax1)
else:
    ax1.text(0.5, 0.5, 'Fewer than 2 principal components',
             ha='center', va='center', transform=ax1.transAxes)
    ax1.set_axis_off()

# 3D cluster visualization (needs at least three components; indexing a
# missing third column would raise IndexError)
if n_pcs >= 3:
    ax2 = fig.add_subplot(132, projection='3d')
    ax2.scatter(pca_result[:, 0], pca_result[:, 1], pca_result[:, 2], c=clusters, cmap='viridis', alpha=0.6)
    ax2.set_xlabel('PC1')
    ax2.set_ylabel('PC2')
    ax2.set_zlabel('PC3')
    ax2.set_title('Battery Data Clusters (3D)')
else:
    ax2 = fig.add_subplot(132)
    ax2.text(0.5, 0.5, 'Fewer than 3 principal components',
             ha='center', va='center', transform=ax2.transAxes)
    ax2.set_axis_off()

# Cluster size distribution
ax3 = fig.add_subplot(133)
cluster_counts = pd.Series(clusters).value_counts().sort_index()
ax3.bar(cluster_counts.index, cluster_counts.values)
ax3.set_xlabel('Cluster')
ax3.set_ylabel('Number of Data Points')
ax3.set_title('Cluster Size Distribution')

plt.tight_layout()
plt.show()

# Cluster characteristics: mean of each key parameter within every cluster
print("Cluster Characteristics:")
for cluster_id in range(n_clusters):
    cluster_data = battery_telemetry[battery_telemetry['cluster'] == cluster_id]
    print(f"\nCluster {cluster_id} ({len(cluster_data)} points):")

    for param in ['voltage', 'current', 'temperature', 'state_of_charge', 'state_of_health']:
        if param in cluster_data.columns:
            mean_val = cluster_data[param].mean()
            print(f"  {param}: {mean_val:.3f}")

# Summary and Recommendations
print("\n" + "="*50)
print("SUMMARY AND RECOMMENDATIONS")
print("="*50)

# --- Findings computed from the frame and the earlier analysis results ---
print("Key Findings:")
print("1. Data Quality:")
print(f"   - Dataset contains {len(battery_telemetry)} records")
print(f"   - Missing data: {(battery_telemetry.isnull().sum().sum() / battery_telemetry.size * 100):.2f}%")
# anomaly_count only exists when the Isolation Forest step ran
print(f"   - Anomalies detected: {anomaly_count if 'anomaly_count' in locals() else 'N/A'}")

print("\n2. Battery Performance:")
summary_columns = battery_telemetry.columns
if 'state_of_health' in summary_columns:
    print(f"   - Average SOH: {battery_telemetry['state_of_health'].mean():.3f}")
    print(f"   - SOH range: {battery_telemetry['state_of_health'].min():.3f} - {battery_telemetry['state_of_health'].max():.3f}")

if 'temperature' in summary_columns:
    print(f"   - Temperature range: {battery_telemetry['temperature'].min():.1f}°C - {battery_telemetry['temperature'].max():.1f}°C")

print("\n3. Data Patterns:")
print(f"   - {n_clusters} distinct battery behavior clusters identified")
print(f"   - {n_components_95} PCA components explain 95% of variance")
print(f"   - Strong correlations found between {len(strong_correlations)} parameter pairs")

# --- Static recommendation text, emitted one line per print call ---
recommendation_lines = (
    "\nRecommendations for Model Development:",
    "1. Feature Engineering:",
    "   - Use rolling statistics for temporal patterns",
    "   - Include derived features (power, rate of change)",
    "   - Consider time-based features for seasonality",
    "\n2. Data Preprocessing:",
    "   - Address missing data through interpolation or imputation",
    "   - Apply outlier treatment for extreme values",
    "   - Normalize features for model training",
    "\n3. Model Strategy:",
    "   - Use clustering insights for stratified sampling",
    "   - Consider ensemble methods for different battery types",
    "   - Implement anomaly detection for safety monitoring",
    "\n4. Monitoring:",
    "   - Track temperature extremes for safety",
    "   - Monitor SOH degradation patterns",
    "   - Implement data drift detection",
    "\nExploration Complete!",
)
for recommendation_line in recommendation_lines:
    print(recommendation_line)
print("="*50)
