Connected to myenv (Python 3.12.8)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

In [None]:
# Make sure the top-level output folder for saved figures exists
# (exist_ok=True keeps re-runs of this cell from failing).
os.makedirs("figures", exist_ok=True)

# Load the cleaned idle30 dataset from the processed-data folder.
data_path = "../data/processed/"
idle = pd.read_csv(data_path + "idle30_clean.csv")

In [None]:
# ============================================
# Idle30 Dataset Correlation Matrix
# ============================================
# Show every column present in the loaded frame.
all_columns = idle.columns.tolist()
print(all_columns)

# Pearson correlation is only defined for numeric data. With pandas >= 2.0,
# DataFrame.corr() defaults to numeric_only=False and raises on columns that
# cannot be converted to float, so the feature list is restricted to
# numeric/boolean columns up front. When every column is numeric this is the
# same list as idle.columns.tolist().
features = idle.select_dtypes(include=["number", "bool"]).columns.tolist()

# Make any exclusion visible instead of silently dropping columns.
skipped_columns = [name for name in all_columns if name not in features]
if skipped_columns:
    print("Skipping non-numeric columns:", skipped_columns)
print("Features for correlation heatmap:", features)

print("\n" + "="*50)
print("CORRELATION WITH FEATURES")
print("="*50)
# Pairwise correlation between all selected features; reused by later cells.
corr = idle[features].corr()
print(corr)

In [None]:
# Plot the feature correlation heatmap and save it under figures/.
# `corr` was already computed from idle[features] in the previous cell, so it
# is reused here rather than recomputing the same matrix.
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr, cmap='YlGnBu', ax=ax)
# The dash in the title is written as the escape \u2013 (en dash) so it cannot
# be corrupted again if the file is re-saved with a different text encoding.
ax.set_title("Correlation Heatmap \u2013 Idle30 Dataset")
fig.savefig("figures/idle30_heatmap.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Count, for each feature, how many entries of its correlation column are NaN.
print("\nNaN values per column in correlation matrix:")
nan_per_column = corr[features].isna().sum()
print(nan_per_column)

# Features to leave out of the cleaned correlation analysis.
# NOTE(review): presumably these are the columns flagged by the NaN check
# above; the list is hard-coded, so confirm it against the printed counts.
columns_to_drop_idle = [
    "vehicle_speed",
    "fuel_tank",
    "pedal_d", "pedal_e",
    "fuel_air_commanded_equiv_ratio",
    "time_run_with_mil_on", "distance_traveled_with_mil_on",
    "warm_ups_since_codes_cleared",
]

In [None]:
# Keep every feature that is not on the drop list, preserving original order.
excluded = set(columns_to_drop_idle)
features_clean_idle = [name for name in features if name not in excluded]

# Correlation matrix restricted to the remaining features.
corr_idle_clean = idle[features_clean_idle].corr()

print("\nCleaned correlation matrix for idle:")
print(corr_idle_clean)

In [None]:
# The copy is not strictly needed here; it is kept for consistency.
corr_matrix_idle = corr_idle_clean.copy()

# Boolean mask for the strict upper triangle (k=1 skips the diagonal), so each
# feature pair shows up once and self-correlations are left out.
upper_mask = np.triu(np.ones(corr_matrix_idle.shape), k=1).astype(bool)
upper_tri_idle = corr_matrix_idle.where(upper_mask)

banner = "=" * 50

# Flatten to a (feature, feature) -> correlation Series, highest values first.
strongest_pos_idle = upper_tri_idle.stack().sort_values(ascending=False)
print("\n" + banner)
print("Strongest positive correlations (IDLE):")
print(banner)
print(strongest_pos_idle.head(10))

# Same pairs, lowest (most negative) values first.
strongest_neg_idle = upper_tri_idle.stack().sort_values()
print("\n" + banner)
print("Strongest negative correlations (IDLE):")
print(banner)
print(strongest_neg_idle.head(10))

In [None]:
# ===========================================
# Scatter plots for strongest correlations
# ===========================================

# One output folder per correlation sign; exist_ok=True makes re-runs safe.
for scatter_dir in (
    "figures/idle30_scatter/positive",
    "figures/idle30_scatter/negative",
):
    os.makedirs(scatter_dir, exist_ok=True)

In [None]:
# Scatter plot for each of the five highest-ranked feature pairs.
top5_pos = strongest_pos_idle.head(5)
print("\nCreating scatter plots for top 5 positive correlations...")
for rank, ((x_col, y_col), r) in enumerate(top5_pos.items(), start=1):
    # One figure per pair, titled with its rank and correlation value.
    fig, ax = plt.subplots(figsize=(8,6))
    sns.scatterplot(data=idle, x=x_col, y=y_col, ax=ax)
    ax.set_title(f"{rank}. {x_col} vs {y_col} (Corr={r:.3f})")
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    fig.tight_layout()
    # File name encodes the rank and both column names.
    fig.savefig(f"figures/idle30_scatter/positive/scatter_pos{rank}_{x_col}_{y_col}.png", dpi=300)
    plt.show()

In [None]:
# Scatter plot for each of the five lowest-ranked (most negative) feature pairs.
top5_neg = strongest_neg_idle.head(5)
print("Creating scatter plots for top 5 negative correlations...")
for rank, ((x_col, y_col), r) in enumerate(top5_neg.items(), start=1):
    # One figure per pair, titled with its rank and correlation value.
    fig, ax = plt.subplots(figsize=(8,6))
    sns.scatterplot(data=idle, x=x_col, y=y_col, ax=ax)
    ax.set_title(f"{rank}. {x_col} vs {y_col} (Corr={r:.3f})")
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    fig.tight_layout()
    # File name encodes the rank and both column names.
    fig.savefig(f"figures/idle30_scatter/negative/scatter_neg{rank}_{x_col}_{y_col}.png", dpi=300)
    plt.show()