# EV Battery Degradation - Data Exploration

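This notebook explores the EV battery degradation dataset. It loads the raw data and then reviews summary statistics, missing values, per-battery degradation curves, feature correlations, and feature distributions.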


In [None]:
import sys
from pathlib import Path
# Append the parent of the current working directory to sys.path so that the
# project-local `src` import further down this cell can be resolved.
# NOTE(review): presumably that parent directory is the repository root that
# contains `src/` — this depends on where the notebook is launched; confirm.
sys.path.append(str(Path().absolute().parent))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data.data_loader import BatteryDataLoader

# Notebook-wide plotting defaults: seaborn "whitegrid" style and a 12x6
# default figure size (cells that pass their own figsize override the latter).
sns.set_style("whitegrid")
plt.rc('figure', figsize=(12, 6))


In [None]:
# Load data
# Build the project loader on the raw-data directory and read the dataset.
loader = BatteryDataLoader("../data/raw")
df = loader.load_data()

# Report the overall shape and the column names, then preview the first rows.
print("Dataset shape:", df.shape)
print("\nColumns:", df.columns.tolist())
df.head()


In [None]:
# Basic statistics
# Summary statistics from describe() with its default settings; the result is
# the cell's last expression so it is rendered as the cell output.
summary_stats = df.describe()
summary_stats


In [None]:
# Check for missing values
# Count the null entries in every column and print them under a heading.
missing_counts = df.isnull().sum()
print("Missing values:", missing_counts, sep="\n")


In [None]:
# Plot SOH degradation over cycles
# 2x2 overview figure:
#   [0, 0] SOH vs. cycle for the first three battery IDs
#   [0, 1] range vs. cycle for the same batteries
#   [1, 0] scatter of temperature against SOH (all rows)
#   [1, 1] scatter of voltage against capacity (all rows)
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# SOH over cycles / Range over cycles
# Each battery's rows are selected once and drawn on both top-row panels, so
# the two panels always show the same batteries with matching labels.
for battery_id in df['battery_id'].unique()[:3]:
    battery_data = df[df['battery_id'] == battery_id]
    battery_label = f'Battery {battery_id}'
    axes[0, 0].plot(battery_data['cycle'], battery_data['soh'], label=battery_label)
    axes[0, 1].plot(battery_data['cycle'], battery_data['range_km'], label=battery_label)

axes[0, 0].set_xlabel('Cycle')
axes[0, 0].set_ylabel('State of Health (SOH)')
axes[0, 0].set_title('Battery Degradation Over Cycles')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

axes[0, 1].set_xlabel('Cycle')
axes[0, 1].set_ylabel('Range (km)')
axes[0, 1].set_title('Range Estimation Over Cycles')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Temperature vs SOH
axes[1, 0].scatter(df['soh'], df['temperature'], alpha=0.5, s=10)
axes[1, 0].set_xlabel('SOH')
# The degree sign is written as an escape: the previous literal had been
# mis-decoded into mojibake ("Â°C"), and an ASCII-only source line cannot be
# corrupted that way again.
axes[1, 0].set_ylabel('Temperature (\u00b0C)')
axes[1, 0].set_title('Temperature vs Battery Health')
axes[1, 0].grid(True, alpha=0.3)

# Voltage vs Capacity
axes[1, 1].scatter(df['capacity'], df['voltage'], alpha=0.5, s=10)
axes[1, 1].set_xlabel('Capacity (Ah)')
axes[1, 1].set_ylabel('Voltage (V)')
axes[1, 1].set_title('Voltage vs Capacity')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Correlation matrix
# Pairwise correlations between the selected columns, drawn as an annotated
# heatmap with the colour scale centred on zero.
correlation_cols = [
    'voltage', 'current', 'temperature', 'capacity',
    'soh', 'range_km', 'degradation_rate',
]
corr_matrix = df[correlation_cols].corr()

heatmap_options = {
    'annot': True,       # write the coefficient in each cell
    'fmt': '.2f',        # two decimal places
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
}

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()


In [None]:
# Feature distributions
# One 50-bin histogram per feature, laid out on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()  # 1-D array of the six axes, in row-major order

features_to_plot = [
    'voltage', 'current', 'temperature',
    'capacity', 'soh', 'range_km',
]

# Pair each feature with its own panel.
for panel, feature in zip(axes, features_to_plot):
    panel.hist(df[feature], bins=50, alpha=0.7, edgecolor='black')
    panel.set_xlabel(feature)
    panel.set_ylabel('Frequency')
    panel.set_title(f'{feature} Distribution')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
