# Fleet Predictive Maintenance - Exploratory Data Analysis

This notebook explores the fleet telematics data and analyzes patterns related to component failures.

**Components Analyzed:**
- Engine failures
- Brake failures  
- Battery failures

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silence library warnings so they do not clutter the rendered notebook.
# NOTE(review): this hides *all* warnings, including deprecation notices —
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

# Set style (shared by every figure in this notebook)
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 11

# Later cells save figures under ../data/processed. Create the folder up front
# so plt.savefig does not raise FileNotFoundError on a fresh checkout.
Path('../data/processed').mkdir(parents=True, exist_ok=True)

print("Libraries loaded successfully!")

## 1. Load Data

In [None]:
# Load the raw telematics data, falling back to the project's synthetic data
# generator when the CSV has not been produced yet.
data_path = Path('../data/raw/fleet_telematics.csv')

if not data_path.exists():
    print("Data not found. Generating synthetic data...")
    import sys
    # Guard the insert so re-running this cell does not keep growing sys.path
    if '../src' not in sys.path:
        sys.path.insert(0, '../src')
    from data_generator import generate_fleet_data
    df = generate_fleet_data(n_vehicles=50, n_days=365)
else:
    df = pd.read_csv(data_path)
    # read_csv treats the literal text "None" as a missing value by default
    # (it is in pandas' default na_values list in recent versions), so the
    # no-failure label comes back as NaN. Every later `!= 'None'` comparison
    # would then be True for all rows and groupby would drop the class.
    # NOTE(review): this also maps any genuinely missing labels to 'None' —
    # confirm the raw file has no truly missing targets.
    df['failure_within_30_days'] = df['failure_within_30_days'].fillna('None')
    print(f"Data loaded from {data_path}")

# Parse timestamps so the .dt accessors used in the time-based cells work
df['timestamp'] = pd.to_datetime(df['timestamp'])
print(f"\nDataset shape: {df.shape}")
print(f"Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")
df.head()

## 2. Data Overview

In [None]:
# Headline numbers for the dataset: row count, fleet size and column count,
# followed by the dtype pandas assigned to every column.
overview_lines = [
    "Dataset Info:",
    f"  Total records: {len(df):,}",
    f"  Number of vehicles: {df['vehicle_id'].nunique()}",
    f"  Number of features: {len(df.columns)}",
    "\nColumn types:",
]
print("\n".join(overview_lines))
print(df.dtypes)

In [None]:
# DataFrame.describe() output, rounded to two decimals for readability.
# Left as the cell's last expression so the notebook renders it as a table.
df.describe().round(decimals=2)

## 3. Target Variable Analysis (Failure Types)

In [None]:
# Failure distribution: absolute counts (bar chart) next to class shares (pie)
failure_counts = df['failure_within_30_days'].value_counts()
colors = ['#2ecc71', '#e74c3c', '#3498db', '#f39c12']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Count plot
bars = axes[0].bar(failure_counts.index, failure_counts.values, color=colors)
axes[0].set_title('Distribution of Failure Types', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Failure Type')
axes[0].set_ylabel('Count')
# bar_label anchors each annotation a few points above its bar. A fixed
# data-unit offset (v + 200) is scale dependent: it floats far above the bars
# for small datasets and sits on top of them for very large ones.
axes[0].bar_label(bars, labels=[f'{count:,}' for count in failure_counts.values],
                  padding=3, fontweight='bold')
# Leave headroom so the label of the tallest bar is not clipped
axes[0].margins(y=0.1)

# Pie chart
axes[1].pie(failure_counts.values, labels=failure_counts.index, autopct='%1.1f%%',
           colors=colors, explode=[0.02]*len(failure_counts))
axes[1].set_title('Failure Type Proportions', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('../data/processed/failure_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nFailure Type Distribution:")
# Reuse the counts computed above instead of recomputing value_counts()
print(failure_counts)
# Any label other than the string 'None' counts as an upcoming failure
print(f"\nOverall failure rate: {(df['failure_within_30_days'] != 'None').mean():.1%}")

## 4. Feature Distributions

In [None]:
# Sensor channels compared across failure classes (also reused by later cells)
sensor_features = ['engine_temp', 'oil_pressure', 'battery_voltage', 'brake_pad_thickness']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

# Class labels in order of first appearance in the data
failure_types = df['failure_within_30_days'].unique()

for panel, sensor in zip(axes, sensor_features):
    pretty_name = sensor.replace('_', ' ').title()

    # Overlay one density-normalised histogram per failure class
    for label in failure_types:
        in_class = df['failure_within_30_days'] == label
        panel.hist(df.loc[in_class, sensor], bins=50, alpha=0.5, label=label, density=True)

    panel.set_title(f'{pretty_name} by Failure Type', fontweight='bold')
    panel.set_xlabel(pretty_name)
    panel.set_ylabel('Density')
    panel.legend()

plt.tight_layout()
plt.savefig('../data/processed/sensor_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# One box plot panel per sensor channel, grouped by failure class
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for panel, sensor in zip(axes, sensor_features):
    pretty_name = sensor.replace('_', ' ').title()
    df.boxplot(column=sensor, by='failure_within_30_days', ax=panel)
    # Override the title and axis labels that DataFrame.boxplot sets itself
    panel.set_title(pretty_name, fontweight='bold')
    panel.set_xlabel('Failure Type')
    panel.set_ylabel(pretty_name)

# Replace the figure-level title with a single descriptive one
plt.suptitle('Sensor Readings by Failure Type', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../data/processed/sensor_boxplots.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Correlation Analysis

In [None]:
# Pairwise correlation over every numeric column currently in df
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
corr_matrix = df[numeric_cols].corr()

# Hide the diagonal and upper triangle: the matrix is symmetric, so only the
# lower triangle carries information.
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    corr_matrix,
    mask=upper_triangle,
    annot=False,
    cmap='RdBu_r',
    center=0,
    square=True,
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold')
fig.tight_layout()
fig.savefig('../data/processed/correlation_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Annotated close-up of the correlation matrix, restricted to the sensor and
# maintenance columns listed below.
key_features = [
    'engine_temp',
    'oil_pressure',
    'battery_voltage',
    'brake_pad_thickness',
    'error_code_count',
    'days_since_maintenance',
]
key_corr = corr_matrix.loc[key_features, key_features]

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    key_corr,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    square=True,
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Key Feature Correlations', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

## 6. Time-Based Analysis

In [None]:
# Calendar-month columns derived from the reading timestamp
df['month'] = df['timestamp'].dt.month
df['month_name'] = df['timestamp'].dt.month_name()

# Rows = month number, columns = failure type, cells = number of flagged readings
flagged_rows = df[df['failure_within_30_days'] != 'None']
monthly_failures = (
    flagged_rows
    .groupby(['month', 'failure_within_30_days'])
    .size()
    .unstack(fill_value=0)
)

ax = monthly_failures.plot.bar(figsize=(14, 6), width=0.8)
ax.set_title('Monthly Failure Counts by Type', fontsize=14, fontweight='bold')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Failures')
ax.legend(title='Failure Type')
plt.xticks(rotation=0)  # keep the month numbers upright
plt.tight_layout()
plt.savefig('../data/processed/monthly_failures.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Daily failure rate trend: share of readings per calendar day that are
# flagged with an upcoming failure.
is_failure = df['failure_within_30_days'] != 'None'

# Bucket by calendar day (normalize() drops the time of day) so the series is
# daily even when timestamps carry a time component. A vectorised boolean mean
# also replaces the much slower groupby.apply(lambda ...).
daily_failure_rate = is_failure.groupby(df['timestamp'].dt.normalize()).mean()

# Window of 7 consecutive daily rows, computed once and reused for the line
# and the shaded area. The first six entries are NaN (default min_periods).
# NOTE(review): 7 rows equal 7 days only if no calendar day is missing.
rolling_rate = daily_failure_rate.rolling(7).mean()

plt.figure(figsize=(14, 5))
plt.plot(rolling_rate, color='#e74c3c', linewidth=2)
plt.fill_between(rolling_rate.index, rolling_rate, alpha=0.3, color='#e74c3c')
plt.title('7-Day Rolling Average Failure Rate', fontsize=14, fontweight='bold')
plt.xlabel('Date')
plt.ylabel('Failure Rate')
plt.tight_layout()
plt.show()

## 7. Vehicle-Level Analysis

In [None]:
# Failure count by vehicle: number of readings per vehicle that are flagged
# with an upcoming failure.
# Summing the boolean flag over *all* rows keeps vehicles with zero flagged
# readings in the result. Filtering first and calling .size() drops them,
# which biases the mean, the minimum and the histogram upward.
is_failure = df['failure_within_30_days'] != 'None'
vehicle_failures = (
    is_failure
    .groupby(df['vehicle_id'])
    .sum()
    .sort_values(ascending=False)
)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Top 15 vehicles by failures
vehicle_failures.head(15).plot(kind='bar', ax=axes[0], color='#e74c3c')
axes[0].set_title('Top 15 Vehicles by Failure Count', fontweight='bold')
axes[0].set_xlabel('Vehicle ID')
axes[0].set_ylabel('Number of Failure Predictions')
axes[0].tick_params(axis='x', rotation=45)

# Distribution of failures per vehicle
mean_failures = vehicle_failures.mean()
axes[1].hist(vehicle_failures.values, bins=20, color='#3498db', edgecolor='white')
axes[1].axvline(mean_failures, color='red', linestyle='--', label=f'Mean: {mean_failures:.1f}')
axes[1].set_title('Distribution of Failures per Vehicle', fontweight='bold')
axes[1].set_xlabel('Number of Failures')
axes[1].set_ylabel('Number of Vehicles')
axes[1].legend()

plt.tight_layout()
plt.savefig('../data/processed/vehicle_failures.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"\nVehicle failure statistics:")
print(f"  Mean failures per vehicle: {mean_failures:.1f}")
print(f"  Max failures: {vehicle_failures.max()}")
print(f"  Min failures: {vehicle_failures.min()}")

## 8. Feature Importance Preview

In [None]:
# Load feature importance if model has been trained
import joblib
from pathlib import Path

model_path = Path('../models')
models_file = model_path / 'all_models.joblib'
feature_names_file = model_path / 'feature_names.joblib'

# Both artifacts are needed for the plot; if either is missing, tell the user
# how to produce them instead of failing on the second load.
if not (models_file.exists() and feature_names_file.exists()):
    print("Model not trained yet. Run 'python main.py train' first to see feature importance.")
else:
    # NOTE(security): joblib.load unpickles arbitrary objects and can execute
    # code; only load artifacts produced by this project's own training run.
    models = joblib.load(models_file)
    feature_names = joblib.load(feature_names_file)

    # Use Random Forest for feature importance
    if 'Random Forest' not in models:
        # Previously this case produced no output at all
        print("No 'Random Forest' model found in all_models.joblib; skipping feature importance plot.")
    else:
        rf_model = models['Random Forest']
        importance = rf_model.feature_importances_

        # Ascending sort + tail(20) keeps the 20 largest values, ordered so
        # the most important feature ends up at the top of the barh chart.
        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importance
        }).sort_values('importance', ascending=True).tail(20)

        plt.figure(figsize=(10, 10))
        plt.barh(importance_df['feature'], importance_df['importance'], color='#2ecc71')
        plt.title('Top 20 Feature Importances (Random Forest)', fontsize=14, fontweight='bold')
        plt.xlabel('Importance')
        plt.tight_layout()
        plt.savefig('../data/processed/feature_importance.png', dpi=150, bbox_inches='tight')
        plt.show()

## 9. Key Insights Summary

### Failure Patterns Observed:

1. **Engine Failures**: Associated with high engine temperature (>230°F) and low oil pressure (<20 PSI)

2. **Brake Failures**: Correlated with low brake pad thickness (<3mm) and high hard brake events

3. **Battery Failures**: Linked to low voltage (<12V) and extreme ambient temperatures

### Class Imbalance:
- Majority of readings show no failure (expected for preventive maintenance)
- SMOTE applied during training to balance classes

### Seasonal Effects:
- Battery failures more common in extreme temperature months
- Engine issues may increase in summer months

In [None]:
# Mean and standard deviation of each sensor channel within every failure
# class, rounded to two decimals.
sensors_by_failure_type = df.groupby('failure_within_30_days')[sensor_features]
summary = sensors_by_failure_type.agg(['mean', 'std']).round(2)

print("\nSummary Statistics by Failure Type:")
summary