In [189]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, classification_report, roc_curve, auc
from scipy.stats import truncnorm
import warnings
import pickle
import os

# Suppress every warning category so library chatter does not clutter cell output.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# any later cell — consider a narrower filter.
warnings.filterwarnings("ignore")

## Load the Dataset

In [190]:
# Workbook path, resolved relative to the notebook's working directory.
file_path = 'Multiple Classification - EV Battery Faults Dataset.xlsx'
# Read the workbook (pandas loads the first sheet by default) into `df`,
# the frame that every later cell transforms.
df = pd.read_excel(file_path)

## Step 1: Exploratory Data Analysis (EDA)

In [None]:
# Quick structural overview of the raw data.
# `DataFrame.info()` writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("Dataset Info:")
df.info()
print("\nFirst 5 Rows:")
print(df.head())

### Handling Negative SOC Values

In [192]:
# Treat negative SoC readings as missing, then impute them with the median of
# the valid readings. Comparisons against NaN are False, so values that were
# already missing stay missing until the fill.
soc_valid = df['SoC'].where(df['SoC'] >= 0)
# Assign the filled column back explicitly: `df['SoC'].fillna(..., inplace=True)`
# is chained assignment and does not update `df` under pandas Copy-on-Write.
df['SoC'] = soc_valid.fillna(soc_valid.median())

### Temperature unit conversion and range clipping

In [193]:
# Remove the 273.15 offset from the temperature readings
# (presumably Kelvin -> degrees Celsius — confirm against the data source).
df['Temperature'] = df['Temperature'].sub(273.15)

# Bound the recorded values: voltage to [48, 72] and SoC to [0, 100].
# NOTE(review): the later outlier-handling cell re-clips Voltage to the selected
# battery's range, so this 48-72 window is overridden there.
df['Voltage'] = df['Voltage'].clip(lower=48, upper=72)
df['SoC'] = df['SoC'].clip(lower=0, upper=100)

#### Battery specifications based on IEC 62133 and EN 15194 standards

In [194]:
# Battery specifications based on IEC 62133 and EN 15194 standards
# Keyed by pack label; `BATTERY_SPECS[battery_type]` is looked up later to pick
# the limits used for synthetic-data generation, clipping and risk scoring.
# NOTE(review): units appear to be volts for the *_voltage entries and amperes
# for the current ranges (the C-rate comments match a 10 Ah pack) — confirm.
BATTERY_SPECS = {
    '36V': {
        'min_voltage': 30,
        'nominal_voltage': 36,
        'max_voltage': 42,
        'capacity': 10,  # Ah
        'charge_current_range': (2, 5),  # (low, high); 0.2C to 0.5C at 10 Ah
        'discharge_current_range': (10, 20),  # (low, high); 1C to 2C at 10 Ah
    },
    '48V': {
        'min_voltage': 40,
        'nominal_voltage': 48,
        'max_voltage': 54.6,
        'capacity': 10,  # Ah
        'charge_current_range': (2, 5),  # (low, high)
        'discharge_current_range': (10, 20),  # (low, high)
    }
}

### Temperature limits from standards

In [None]:
# Temperature limits from standards
# One {'min', 'max'} pair per usage mode. In the cells shown in this notebook
# only the 'operating' pair is read (to set min_temp / max_temp).
# Values are on the same scale as df['Temperature'] after the 273.15 offset is
# removed (presumably degrees Celsius — confirm).
TEMP_LIMITS = {
    'operating': {'min': 0, 'max': 45},
    'charging': {'min': 0, 'max': 40},
    'storage': {'min': -20, 'max': 60}
}

### Select battery system (36V or 48V)

In [None]:
# Choose which entry of BATTERY_SPECS the rest of the notebook models.
battery_type = '36V'
specs = BATTERY_SPECS[battery_type]

# Operating envelope used for synthetic-data generation and for clipping.
min_voltage, nominal_voltage, max_voltage = (
    specs[key] for key in ('min_voltage', 'nominal_voltage', 'max_voltage')
)
min_current, max_current = specs['discharge_current_range']
min_temp, max_temp = (TEMP_LIMITS['operating'][bound] for bound in ('min', 'max'))

# SoC window: lower and upper bound applied to generated and clipped SoC values.
min_soc, max_soc = 10, 90

### Calculate Resistance

In [195]:

def calculate_resistance(voltage, temperature, soc):
    """Estimate resistance from temperature and state of charge.

    The base value is the module-level ``nominal_voltage`` divided by the lower
    bound of ``specs['discharge_current_range']``. It is scaled linearly by
    temperature (0.4 % per degree away from 25) and by depth of discharge
    (up to +10 % at 0 % SoC).

    Args:
        voltage: Accepted but not used by the calculation.
        temperature: Temperature value(s); 25 gives a factor of exactly 1.
        soc: State of charge in percent; 100 gives a factor of exactly 1.

    Returns:
        The base resistance multiplied by both correction factors. Works
        element-wise when the inputs are arrays or Series.
    """
    base = nominal_voltage / specs['discharge_current_range'][0]

    # Fraction of charge that has been used: 0 at full, 1 at empty.
    depth_of_discharge = (100 - soc) / 100

    thermal_scale = 1 + 0.004 * (temperature - 25)
    charge_scale = 1 + 0.1 * depth_of_discharge

    return base * thermal_scale * charge_scale

### Creating Sample Data

In [196]:
# Number of synthetic rows drawn for each generated column.
num_samples = 100_000

### Function to generate data following a truncated Gaussian distribution

In [197]:
# Function to generate data following a truncated Gaussian distribution
def generate_truncated_bell_curve_data(median, std, min_val, max_val, num_samples, random_state=None):
    """Draw samples from a normal distribution truncated to [min_val, max_val].

    Args:
        median: Centre (``loc``) of the underlying, untruncated normal. It is
            the median of the returned samples only when the truncation window
            is symmetric around it.
        std: Standard deviation (``scale``) of the underlying normal; must be > 0.
        min_val: Lower truncation bound, in data units.
        max_val: Upper truncation bound, in data units; must exceed ``min_val``.
        num_samples: Number of samples to draw.
        random_state: Optional seed or generator forwarded to
            ``truncnorm.rvs``. ``None`` (the default) draws from NumPy's global
            random state, which is the previous behaviour.

    Returns:
        A NumPy array of ``num_samples`` values within [min_val, max_val].

    Raises:
        ValueError: If ``std`` is not positive or the bounds are not ordered.
    """
    if std <= 0:
        raise ValueError(f"std must be positive, got {std}")
    if min_val >= max_val:
        raise ValueError(f"min_val ({min_val}) must be smaller than max_val ({max_val})")

    # truncnorm takes its bounds in standard-deviation units relative to loc.
    lower_bound = (min_val - median) / std
    upper_bound = (max_val - median) / std
    return truncnorm.rvs(
        lower_bound,
        upper_bound,
        loc=median,
        scale=std,
        size=num_samples,
        random_state=random_state,
    )

### Generate synthetic data for SoC, Temperature, and Voltage

In [198]:
# Seed NumPy's global random state, which `truncnorm.rvs` draws from when no
# explicit random_state is passed, so the synthetic data (and every metric
# derived from them) are identical on each Restart & Run All.
np.random.seed(42)

# One truncated-normal column per variable, centred on a typical value and
# bounded by the operating envelope of the selected battery.
synthetic_data = {
    'SoC': generate_truncated_bell_curve_data(50, 20, min_soc, max_soc, num_samples),
    'Temperature': generate_truncated_bell_curve_data(25, 10, min_temp, max_temp, num_samples),
    'Voltage': generate_truncated_bell_curve_data(nominal_voltage, 2, min_voltage, max_voltage, num_samples)
}

### Create synthetic DataFrame


In [199]:
# Create DataFrame and calculate parameters
synthetic_df = pd.DataFrame(synthetic_data)

# calculate_resistance uses only element-wise arithmetic, so whole columns are
# passed at once instead of calling it row by row with DataFrame.apply.
synthetic_df['Resistance'] = calculate_resistance(
    synthetic_df['Voltage'],
    synthetic_df['Temperature'],
    synthetic_df['SoC'],
)

### Calculate Current using Ohm's Law with each sample's resistance

In [218]:
# Ohm's law, I = V / R, evaluated with each sample's own resistance, then
# bounded to the selected battery's discharge-current range.
raw_current = synthetic_df['Voltage'].div(synthetic_df['Resistance'])
synthetic_df['Current'] = raw_current.clip(lower=min_current, upper=max_current)


In [None]:
def calculate_risk_factors(row):
    """Combine four weighted penalties into a single risk score in [0, 1].

    Reads 'Temperature', 'Voltage', 'Current' and 'SoC' from ``row`` and the
    voltage/current limits from the module-level ``specs``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the four fields.

    Returns:
        The summed penalties, clipped to the interval [0, 1].
    """
    temperature = row['Temperature']
    voltage = row['Voltage']
    current = row['Current']
    soc = row['SoC']

    total = 0

    # Temperature (weight 0.4): grows linearly, 0.4 per 5 degrees beyond 40 or below 5.
    if temperature > 40:
        total += 0.4 * ((temperature - 40) / 5)
    elif temperature < 5:
        total += 0.4 * ((5 - temperature) / 5)

    # Voltage (weight 0.3): flat penalty within 0.5 of the maximum or within 2 of the minimum.
    if voltage > specs['max_voltage'] - 0.5 or voltage < specs['min_voltage'] + 2:
        total += 0.3

    # Current (weight 0.2): ramps up over the top 10 % of the discharge range.
    current_knee = specs['discharge_current_range'][1] * 0.9
    if current > current_knee:
        total += 0.2 * ((current - current_knee) /
                        (specs['discharge_current_range'][1] * 0.1))

    # SoC (weight 0.1): flat penalty outside the 15-85 band.
    if soc < 15 or soc > 85:
        total += 0.1

    return np.clip(total, 0, 1)

#### Calculate Fire Risk for the synthetic data


In [219]:
# Calculate fire risk
# Applied row by row (axis=1): calculate_risk_factors reads Temperature,
# Voltage, Current and SoC from each row and returns a score clipped to [0, 1].
synthetic_df['Fire_Risk'] = synthetic_df.apply(calculate_risk_factors, axis=1)

### Combine original and synthetic datasets

In [None]:
# Combine original and synthetic datasets
# Rows are stacked and re-indexed 0..n-1. Columns present in only one of the
# two frames are filled with NaN for the other frame's rows.
# Note: this rebinds `df`, so re-running the cell appends the synthetic rows again.
df = pd.concat([df, synthetic_df], ignore_index=True)

print(f"Shape after adding synthetic data: {df.shape}")

### Outlier handling

In [221]:
# Clamp every row (recorded and synthetic) to the selected battery's operating envelope.
operating_bounds = {
    'Temperature': (min_temp, max_temp),
    'Voltage': (min_voltage, max_voltage),
    'SoC': (min_soc, max_soc),
}
for column, (lower, upper) in operating_bounds.items():
    df[column] = df[column].clip(lower=lower, upper=upper)

#### Remove Label column

In [None]:
# Remove the 'Label' column; no later cell shown in this notebook reads it.
# NOTE(review): presumably the fault-class label from the source workbook — confirm.
# Re-running this cell raises KeyError because the column is already gone.
df = df.drop(columns=['Label'])

### Export the combined dataset to a CSV file

In [None]:
# Export the combined dataset to a CSV file
# Best-effort: any failure (directory creation or write) is printed instead of
# raised, so the remaining cells still run without the exported file.
try:
    # Create the output directory on first use; no error if it already exists.
    os.makedirs("data", exist_ok=True)
    # index=False keeps the row index out of the file.
    df.to_csv("data/combined_dataset.csv", index=False)
    print("\nCombined dataset exported to data/combined_dataset.csv")
except Exception as e:
    print(f"\nError exporting combined dataset: {e}")

### Visualize distributions after outlier handling

In [None]:
# Visualize distributions after outlier handling: one stacked histogram
# (with KDE overlay) per variable.
plt.figure(figsize=(18, 6))

distribution_panels = [
    ('SoC', 'Distribution of State of Charge (SoC)'),
    ('Temperature', 'Distribution of Temperature'),
    ('Voltage', 'Distribution of Voltage'),
    ('Current', 'Distribution of Current'),
]
for position, (column, panel_title) in enumerate(distribution_panels, start=1):
    plt.subplot(len(distribution_panels), 1, position)
    sns.histplot(df[column], kde=True, bins=30)
    plt.title(panel_title)

plt.tight_layout()
plt.show()


#### Current vs Voltage scatterplot

In [None]:
# Use a dedicated figure so the scatter fills the whole canvas rather than a
# single slot of a 4x2 subplot grid.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['Voltage'], df['Current'], alpha=0.5)
ax.set_xlabel('Voltage')
ax.set_ylabel('Current')
ax.set_title('Current vs Voltage')

fig.tight_layout()
plt.show()

### Heatmap

In [None]:
# Pairwise correlation of the four raw measurements, drawn as an annotated heatmap.
correlation_columns = ['SoC', 'Temperature', 'Voltage', 'Current']
corr_matrix = df[correlation_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix (Outliers Handled)')
plt.show()

## Step 2: Define Fire Risk Thresholds

In [140]:
# Define Fire Risk Thresholds
# Voltage and current thresholds are derived from the selected battery's
# operating range (min_voltage/max_voltage, min_current/max_current). Fixed
# values such as 50/70 V or 31/35 A fall outside the range the 36V data are
# clipped to; that makes the Low_*/High_* flags constant and saturates the
# voltage and current risk terms for every row.
temp_threshold = 40  # High temperature threshold, slightly below the operating maximum
voltage_low_threshold = min_voltage + 2  # Low voltage threshold, slightly above min
voltage_high_threshold = max_voltage - 2  # High voltage threshold, slightly below max
soc_low_threshold = 60  # SoC below this counts as low
# NOTE(review): 95 is above the 90 % ceiling SoC is clipped to, so the High_SoC
# flag never fires — confirm whether this should track max_soc.
soc_high_threshold = 95  # SoC above this counts as high
current_low_threshold = min_current + 1  # Low current threshold, slightly above min
current_high_threshold = max_current - 1  # High current threshold, slightly below max

## Step 3: Analyze the Dataset and Add Fire Risk Column

### Feature engineering and fire-risk scoring functions

In [141]:
# Feature engineering for the fire-risk model
def create_features(df):
    """Build the model's feature matrix from the four raw measurements.

    Output columns, in order: the raw SoC / Temperature / Voltage / Current
    values, six pairwise products, four squared terms, and seven 0/1 flags
    comparing each measurement with the module-level ``*_threshold`` values.

    Args:
        df: Frame containing 'SoC', 'Temperature', 'Voltage' and 'Current'.

    Returns:
        A new DataFrame of features; ``df`` itself is not modified.
    """
    temperature, voltage, soc, current = (
        df[name] for name in ('Temperature', 'Voltage', 'SoC', 'Current')
    )

    features = df[['SoC', 'Temperature', 'Voltage', 'Current']].copy()

    # Continuous derived columns; dict order fixes the column order.
    derived_columns = {
        # Pairwise interactions
        'Temp_Voltage': temperature * voltage,
        'Temp_SoC': temperature * soc,
        'Voltage_SoC': voltage * soc,
        'Temp_Current': temperature * current,
        'Voltage_Current': voltage * current,
        'SoC_Current': soc * current,
        # Squared terms
        'Temp_Squared': temperature ** 2,
        'Voltage_Squared': voltage ** 2,
        'SoC_Squared': soc ** 2,
        'Current_Squared': current ** 2,
    }
    for name, values in derived_columns.items():
        features[name] = values

    # Boolean threshold tests, stored as 0/1 integers.
    threshold_flags = {
        'High_Temp': temperature > temp_threshold,
        'Low_Voltage': voltage < voltage_low_threshold,
        'High_Voltage': voltage > voltage_high_threshold,
        'Low_SoC': soc < soc_low_threshold,
        'High_SoC': soc > soc_high_threshold,
        'Low_Current': current < current_low_threshold,
        'High_Current': current > current_high_threshold,
    }
    for name, condition in threshold_flags.items():
        features[name] = condition.astype(int)

    return features

In [142]:
def calculate_fire_risk_score(df):
    """Compute a fire-risk score per row and store it on ``df``.

    Four exponential terms (temperature, voltage, SoC, current) are each
    capped, summed, and the sum is clipped to [0, 1]. Thresholds come from the
    module-level ``*_threshold`` values.

    Side effects:
        Adds or overwrites the 'Fire Risk Score' column of ``df`` and prints
        summary statistics plus a five-band distribution of the scores.

    Args:
        df: Frame containing 'Temperature', 'Voltage', 'SoC' and 'Current'.

    Returns:
        The 'Fire Risk Score' column of ``df``.
    """
    temperature = df['Temperature']
    voltage = df['Voltage']
    soc = df['SoC']
    current = df['Current']

    # (uncapped term, cap) pairs; two-sided terms add a low-side and a high-side exponential.
    capped_terms = [
        # Temperature: rises exponentially with distance above the threshold.
        (np.exp((temperature - temp_threshold) / 10) / np.exp(5.5), 0.4),
        # Voltage: penalises both under- and over-voltage.
        (np.exp((voltage_low_threshold - voltage) * 2) / np.exp(5)
         + np.exp((voltage - voltage_high_threshold) * 2) / np.exp(5), 0.3),
        # SoC: penalises both low and high charge, with different slopes.
        (np.exp((soc_low_threshold - soc) / 20) / np.exp(1)
         + np.exp((soc - soc_high_threshold) / 10) / np.exp(1), 0.3),
        # Current: penalises both low and high current.
        (np.exp((current_low_threshold - current) * 2) / np.exp(5)
         + np.exp((current - current_high_threshold) * 2) / np.exp(5), 0.3),
    ]

    risk_score = np.zeros(len(df))
    for uncapped, cap in capped_terms:
        risk_score += np.clip(uncapped, 0, cap)

    # The caps sum to more than 1, so bound the total.
    risk_score = np.clip(risk_score, 0, 1)

    df['Fire Risk Score'] = risk_score
    scores = df['Fire Risk Score']

    print("\nFire Risk Score Statistics:")
    print(scores.describe())

    print("\nRisk Score Distribution:")
    band_edges = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
    band_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
    banded = pd.cut(scores, bins=band_edges, labels=band_labels)
    print(banded.value_counts().sort_index())

    return scores

## Step 4: Train Model to Predict Fire Risk

In [None]:
# Build the feature matrix first, then the target. The scoring call also
# stores the target on `df` as the 'Fire Risk Score' column.
X = create_features(df)
y = calculate_fire_risk_score(df)

# Preview the raw inputs next to the computed target.
preview_columns = ['Temperature', 'Voltage', 'SoC', 'Current', 'Fire Risk Score']
print("\nSample data with Fire Risk Scores:")
print(df[preview_columns].head(10).round(3))

In [144]:
# Split the data
# 80 % train / 20 % test; the fixed random_state keeps the partition
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [145]:
# Scale the features
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
# Note: X_train / X_test are rebound to the scaler's output here (NumPy arrays
# by default, so column labels are dropped). Later cells index X_train
# positionally and take labels from X.columns. Re-running this cell would
# re-fit the scaler on already-scaled data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Model Training and Evaluation

### Define a Function to Evaluate Models

In [146]:
def evaluate_model(model, X_train, y_train, X_test, y_test, threshold=0.5):
    """Cross-validate a regressor, then report test-set regression and classification metrics.

    Steps:
        1. 5-fold shuffled cross-validation on the training data, collecting
           training-fold and validation-fold MSE.
        2. Refit on the full training data and report test MSE and R2.
        3. Binarise targets and predictions at ``threshold`` and print a
           confusion matrix and classification report.
        4. Plot the ROC curve of the raw predictions against the binarised
           targets (skipped when those targets contain a single class, for
           which the curve is undefined).

    Side effects:
        ``model`` is fitted in place (it ends up trained on the full training
        data); metrics are printed and a matplotlib figure may be shown.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features (array or DataFrame).
        y_train: Training targets (array or Series).
        X_test: Test features.
        y_test: Test targets.
        threshold: Cut-off that turns a score into the positive class.

    Returns:
        Tuple ``(train_mse, val_mse)``: the mean cross-validation MSE on the
        training folds and on the validation folds.
    """
    # Positional NumPy views so fold indices work for arrays, DataFrames and Series alike.
    X_train_values = np.asarray(X_train)
    y_train_values = np.asarray(y_train)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    train_scores = []
    val_scores = []
    for train_index, val_index in kf.split(X_train_values):
        X_train_fold, X_val_fold = X_train_values[train_index], X_train_values[val_index]
        y_train_fold, y_val_fold = y_train_values[train_index], y_train_values[val_index]
        model.fit(X_train_fold, y_train_fold)
        train_scores.append(mean_squared_error(y_train_fold, model.predict(X_train_fold)))
        val_scores.append(mean_squared_error(y_val_fold, model.predict(X_val_fold)))

    train_mse = np.mean(train_scores)
    val_mse = np.mean(val_scores)
    print(f"Cross-Validation Training MSE: {train_mse:.4f}")
    print(f"Cross-Validation Validation MSE: {val_mse:.4f}")

    # Final fit on all training data; this is the state the caller's model keeps.
    model.fit(X_train, y_train)
    y_pred = np.asarray(model.predict(X_test))
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"  Test MSE: {mse:.4f}")
    print(f"  Test R2: {r2:.4f}")

    # Use the same integer 0/1 encoding for both sides of the comparison.
    y_true_class = (np.asarray(y_test) >= threshold).astype(int)
    y_pred_class = (y_pred >= threshold).astype(int)
    cm = confusion_matrix(y_true_class, y_pred_class)
    print("  Confusion Matrix:\n", cm)
    print("  Classification Report:\n", classification_report(y_true_class, y_pred_class))

    if np.unique(y_true_class).size < 2:
        # With one class there are no positives or no negatives, so TPR/FPR are undefined.
        print("  ROC curve skipped: all test targets fall on one side of the threshold.")
    else:
        fpr, tpr, _ = roc_curve(y_true_class, y_pred)
        roc_auc = auc(fpr, tpr)
        plt.figure(figsize=(6, 4))
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC)')
        plt.legend(loc="lower right")
        plt.show()

    print("-" * 60)
    return train_mse, val_mse

## Hyperparameter Tuning with RandomizedSearchCV

In [None]:
# Candidate regressors, keyed by display name. The seeded estimators use
# random_state=42 so repeated runs train identically.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

# Search space per model; keys must match `models`. An empty dict means the
# model is fitted as configured, without a hyperparameter search.
param_distributions = {
    'Linear Regression': {},
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [10, 15, 20],
        'min_samples_split': [10, 20],
        'min_samples_leaf': [2, 4],
        'max_features': ['sqrt', 'log2']
    },
    'XGBoost': {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    },
    'Gradient Boosting': {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [2, 3, 4],
        'subsample': [0.6, 0.8, 1.0],
        'min_samples_split': [5, 10],
        'min_samples_leaf': [2, 4]
    }
}

# Set aside 20 % of the training data; the hyperparameter search below only
# sees the remaining 80 %.
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

best_models = {}
for name, model in models.items():
    print(f"Tuning {name}:")
    search_space = param_distributions[name]

    if not search_space:
        # Nothing to tune: fit the estimator as configured and move on.
        model.fit(X_train_split, y_train_split)
        best_models[name] = model
        continue

    # Sample up to 50 parameter combinations, scored by 5-fold CV on negative MSE.
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=search_space,
        n_iter=50,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        random_state=42
    )
    random_search.fit(X_train_split, y_train_split)
    print("Best Parameters:")
    print(random_search.best_params_)
    best_models[name] = random_search.best_estimator_

# Evaluate models and select the best one
best_model_name = None
best_model = None
best_mse = float('inf')
# Largest tolerated generalisation gap (validation MSE minus training MSE).
overfitting_threshold = 0.0005

model_results = []

for name, model in best_models.items():
    print(f"{name}:")
    # evaluate_model leaves `model` fitted on the full training data.
    train_mse, val_mse = evaluate_model(model, X_train, y_train, X_test, y_test)
    model_results.append((name, model, train_mse, val_mse))

model_results.sort(key=lambda x: x[3]) # Sort by validation MSE

for name, model, train_mse, val_mse in model_results:
    # Overfitting shows up as validation error exceeding training error,
    # so the gap is measured as val_mse - train_mse.
    if (val_mse - train_mse) > overfitting_threshold:
        print(f"{name} is overfitting. Considering a simpler model.")
    else:
        best_model_name = name
        best_model = model
        best_mse = val_mse
        print(f"Best Model: {best_model_name}")
        print(f"Best Validation MSE: {best_mse:.4f}")
        break # Select the first non-overfitting model
else:
    # If no model is found to be non-overfitting, select the best model based on validation MSE
    best_model_name, best_model, _, best_mse = model_results[0]
    print(f"No non-overfitting model found. Selecting best model based on validation MSE: {best_model_name}")
    print(f"Best Validation MSE: {best_mse:.4f}")

# Final evaluation on test set
y_pred_best = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred_best)
r2 = r2_score(y_test, y_pred_best)
print(f"Test MSE for Best Model: {mse:.4f}")
print(f"Test R2 for Best Model: {r2:.4f}")

In [None]:
# Create DataFrame with actual and predicted values
results_df = pd.DataFrame({
    'Actual_Risk': y_test,
    'Predicted_Risk': y_pred_best,
    'Difference': np.abs(y_test - y_pred_best)
})

# Add the (standardised) model inputs for analysis. X_test is a bare array
# after scaling, so it is given y_test's index explicitly; with a default
# 0..n-1 index, concat(axis=1) would align it against the shuffled test index
# and produce mismatched, NaN-padded rows.
test_features = pd.DataFrame(X_test, columns=X.columns, index=y_test.index)
results_df = pd.concat([results_df, test_features], axis=1)

# Add some useful statistics
print("\nPrediction Analysis:")
print("-" * 50)
print("\nSummary Statistics of Predictions:")
print(results_df[['Actual_Risk', 'Predicted_Risk', 'Difference']].describe())

# Show distribution of predictions
# Regression output can stray slightly outside [0, 1]; clip for binning (and
# include the lowest edge) so those rows are counted instead of dropped.
print("\nDistribution of Predictions:")
predicted_for_binning = results_df['Predicted_Risk'].clip(0, 1)
print(pd.cut(predicted_for_binning,
            bins=[0, 0.2, 0.4, 0.6, 0.8, 1.0],
            labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'],
            include_lowest=True).value_counts())

# Show some example predictions (seeded, and safe for fewer than 10 rows)
print("\nSample Predictions (10 random examples):")
sample_results = results_df.sample(min(10, len(results_df)), random_state=42)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
print(sample_results.round(3))

# Print cases with largest prediction errors
print("\nCases with Largest Prediction Errors:")
print(results_df.nlargest(5, 'Difference').round(3))

# Save results to CSV for further analysis.
# The target directory is created here because this cell runs before the
# model-export cell that would otherwise create it.
os.makedirs('model', exist_ok=True)
results_df.to_csv('model/prediction_analysis.csv', index=False)
print("\nDetailed results saved to 'model/prediction_analysis.csv'")

In [None]:
# Create visualization subplot: two panels side by side
fig, (ax_scatter, ax_hist) = plt.subplots(1, 2, figsize=(12, 5))

# Plot 1: Actual vs Predicted (the dashed diagonal marks a perfect prediction)
ax_scatter.scatter(y_test, y_pred_best, alpha=0.5)
ax_scatter.plot([0, 1], [0, 1], 'r--')
ax_scatter.set_xlabel('Actual Risk Score')
ax_scatter.set_ylabel('Predicted Risk Score')
ax_scatter.set_title('Actual vs Predicted Risk Scores')

# Plot 2: Distribution of predictions
sns.histplot(data=results_df, x='Predicted_Risk', bins=30, ax=ax_hist)
ax_hist.set_title('Distribution of Predicted Risk Scores')
ax_hist.set_xlabel('Risk Score')

fig.tight_layout()
plt.show()

# Feature importance plot
# Tree ensembles expose `feature_importances_`; linear models do not, so fall
# back to the absolute coefficients (comparable here because the inputs were
# standardised before fitting).
if hasattr(best_model, 'feature_importances_'):
    importances = best_model.feature_importances_
elif hasattr(best_model, 'coef_'):
    importances = np.abs(np.ravel(best_model.coef_))
else:
    importances = None

if importances is None:
    print(f"{type(best_model).__name__} exposes no feature importances; skipping plot.")
else:
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': importances
    }).sort_values('Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(data=feature_importance, x='Importance', y='Feature')
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.show()

In [None]:
# Create test cases
# Ten hand-written operating points, one value per column per case.
# NOTE(review): the training frame is clipped to the selected battery's
# operating envelope before feature creation; several values below (e.g.
# voltages of 50-70 and temperatures up to 80) lie outside that envelope for
# the '36V' pack, so the model is extrapolating for them — confirm intent.
test_cases = pd.DataFrame({
    'Temperature': [25, 50, 75, 30, 60, 40, 80, 35, 45, 70],  # Mix of normal and high temperatures
    'Voltage': [50, 50, 70, 60, 50, 60, 70, 50, 60, 50],  # Mix of normal, low and high voltages
    'SoC': [50, 15, 95, 60, 25, 75, 90, 40, 85, 10],  # Mix of normal, low and high SoC values
    'Current': [32, 31, 35, 33, 30, 34, 36, 32, 33, 31]  # Values from 30 to 36
})

# Create features for test cases using the same feature engineering
def create_test_features(df):
    """Build model features for ad-hoc test cases.

    Delegates to ``create_features`` so training and inference share a single
    implementation and cannot drift apart (column set, column order and
    thresholds are guaranteed to match what the scaler and model were fitted on).

    Args:
        df: Frame containing 'SoC', 'Temperature', 'Voltage' and 'Current'.

    Returns:
        The feature DataFrame produced by ``create_features(df)``.
    """
    return create_features(df)

# Process test cases
# Same pipeline as training: engineered features -> the already-fitted scaler
# (transform only, no re-fitting) -> the selected model.
X_test_cases = create_test_features(test_cases)
X_test_cases_scaled = scaler.transform(X_test_cases)
y_pred_test = best_model.predict(X_test_cases_scaled)

# Add predictions to test cases
# Stored as a new column so each prediction sits next to its inputs.
test_cases['Predicted_Risk'] = y_pred_test

# Map a numeric risk score to its named band
def get_risk_level(score):
    """Return the risk band name for ``score``.

    Bands use inclusive upper bounds: up to 0.2 'Very Low', up to 0.4 'Low',
    up to 0.6 'Medium', up to 0.8 'High'; anything else (including values
    that compare False against every bound) is 'Very High'.
    """
    inclusive_upper_bounds = (
        (0.2, 'Very Low'),
        (0.4, 'Low'),
        (0.6, 'Medium'),
        (0.8, 'High'),
    )
    for upper_bound, label in inclusive_upper_bounds:
        if score <= upper_bound:
            return label
    return 'Very High'

# Label every prediction with its risk band.
test_cases['Risk_Level'] = test_cases['Predicted_Risk'].map(get_risk_level)

# Display results without truncating or wrapping columns.
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)
print("\nTest Case Results:")
print("-" * 80)
print(test_cases.round(3))

# How many test cases landed in each band.
print("\nRisk Level Distribution:")
print(test_cases['Risk_Level'].value_counts())

In [None]:
# Visualize results: three panels in one row
fig, (ax_cases, ax_temp, ax_soc) = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: predicted score for each test case, in input order
ax_cases.bar(range(len(test_cases)), test_cases['Predicted_Risk'])
ax_cases.set_xlabel('Test Case')
ax_cases.set_ylabel('Predicted Risk Score')
ax_cases.set_title('Risk Scores by Test Case')

# Panel 2: predicted score against temperature
ax_temp.scatter(test_cases['Temperature'], test_cases['Predicted_Risk'])
ax_temp.set_xlabel('Temperature')
ax_temp.set_ylabel('Predicted Risk Score')
ax_temp.set_title('Risk Score vs Temperature')

# Panel 3: predicted score against state of charge
ax_soc.scatter(test_cases['SoC'], test_cases['Predicted_Risk'])
ax_soc.set_xlabel('State of Charge (SoC)')
ax_soc.set_ylabel('Predicted Risk Score')
ax_soc.set_title('Risk Score vs SoC')

fig.tight_layout()
plt.show()

# List the test cases whose predicted score exceeds 0.6, or say that there are none.
print("\nHigh Risk Cases (Risk Score > 0.6):")
print("-" * 80)
high_risk_cases = test_cases.loc[test_cases['Predicted_Risk'] > 0.6]
if high_risk_cases.empty:
    print("No high risk cases found in the test set")
else:
    print(high_risk_cases.round(3))

# Print correlation analysis
# Correlate the predicted score with every raw model input. Current is
# included alongside Temperature, Voltage and SoC because it is one of the
# four measurements the features are built from.
print("\nCorrelation with Risk Score:")
print("-" * 80)
correlation_inputs = ['Temperature', 'Voltage', 'SoC', 'Current', 'Predicted_Risk']
correlations = test_cases[correlation_inputs].corr()['Predicted_Risk'].sort_values(ascending=False)
print(correlations)

## Export Pickle File

In [128]:
# Create the 'model' directory if it doesn't exist
model_dir = "model"
os.makedirs(model_dir, exist_ok=True)

# Export the model
with open(os.path.join(model_dir, 'fire_risk_model.pkl'), 'wb') as file:
    pickle.dump(best_model, file)

# Export the fitted scaler as well: the model was trained on standardised
# features, so anything loading the model must apply the same transform to its
# inputs before calling predict().
with open(os.path.join(model_dir, 'feature_scaler.pkl'), 'wb') as file:
    pickle.dump(scaler, file)