# Machine Learning Analysis
## Predictive Modeling for Fuel Consumption and CO2 Emissions

This notebook implements various ML algorithms to predict:
- Fuel Consumption
- CO2 Emissions

Algorithms used:
- Linear Regression
- Random Forest
- Gradient Boosting
- Support Vector Regression (`SVR` is imported, but no SVR model is trained or evaluated in the cells below)


In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVR
warnings.filterwarnings('ignore')

sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline

df = pd.read_csv('../../data/FuelConsumption.csv')
df.columns = df.columns.str.strip()
print("Dataset loaded!")
print(f"Shape: {df.shape}")


## 1. Data Preprocessing


In [None]:
# Prepare data for modeling
# One LabelEncoder per categorical column. Each encoder keeps its own name
# so the fitted object remains individually accessible to later cells.
le_make, le_model, le_class, le_transmission, le_fuel = (LabelEncoder() for _ in range(5))

categorical_encoders = {
    'MAKE': le_make,
    'MODEL': le_model,
    'VEHICLE CLASS': le_class,
    'TRANSMISSION': le_transmission,
    'FUEL': le_fuel,
}

# Work on a copy so the raw frame `df` is left untouched.
df_ml = df.copy()
for column, encoder in categorical_encoders.items():
    # Adds "<column>_encoded" holding the integer code of each category.
    df_ml[f'{column}_encoded'] = encoder.fit_transform(df_ml[column])

# Select features (MODEL_encoded is created above but is not part of the feature set)
features = [
    'Year',
    'ENGINE SIZE',
    'CYLINDERS',
    'MAKE_encoded',
    'VEHICLE CLASS_encoded',
    'TRANSMISSION_encoded',
    'FUEL_encoded',
]
X = df_ml[features]
y_fuel = df_ml['FUEL CONSUMPTION']
y_co2 = df_ml['COEMISSIONS']

# Summary of what goes into the models, followed by a missing-value report.
print("Features selected:", features)
for label, frame in (('X', X), ('y_fuel', y_fuel), ('y_co2', y_co2)):
    print(f"{label} shape: {frame.shape}")

missing_in_X = X.isnull().sum().sum()
print(f"\nMissing values in X: {missing_in_X}")
print(f"Missing values in y_fuel: {y_fuel.isnull().sum()}")
print(f"Missing values in y_co2: {y_co2.isnull().sum()}")


In [None]:
# Split the features and BOTH targets in a single call. Every returned piece
# then shares the same row partition by construction, instead of relying on two
# separate calls happening to produce identical splits via a shared random_state.
(X_train_fuel, X_test_fuel,
 y_fuel_train, y_fuel_test,
 y_co2_train, y_co2_test) = train_test_split(
    X, y_fuel, y_co2, test_size=0.2, random_state=42
)

# The CO2 models use exactly the same feature rows as the fuel models.
# The *_co2 names are kept because later cells refer to them.
X_train_co2, X_test_co2 = X_train_fuel, X_test_fuel

# Scale features: fit the scaler once, on the training rows only, and reuse it.
# The original refit the same scaler object on the (identical) CO2 split, which
# was redundant and hid the dependence on both splits being equal.
scaler = StandardScaler()
X_train_fuel_scaled = scaler.fit_transform(X_train_fuel)
X_test_fuel_scaled = scaler.transform(X_test_fuel)
X_train_co2_scaled, X_test_co2_scaled = X_train_fuel_scaled, X_test_fuel_scaled

print("Data split completed!")
print(f"Training set size (fuel): {X_train_fuel.shape[0]}")
print(f"Test set size (fuel): {X_test_fuel.shape[0]}")
print(f"Training set size (CO2): {X_train_co2.shape[0]}")
print(f"Test set size (CO2): {X_test_co2.shape[0]}")


## 2. Model 1: Predict Fuel Consumption


In [None]:
# Linear Regression: fit on the scaled training features, predict the scaled test set.
lr_fuel = LinearRegression().fit(X_train_fuel_scaled, y_fuel_train)
y_fuel_pred_lr = lr_fuel.predict(X_test_fuel_scaled)

# Hold-out metrics on the test split.
lr_r2 = r2_score(y_fuel_test, y_fuel_pred_lr)
lr_rmse = np.sqrt(mean_squared_error(y_fuel_test, y_fuel_pred_lr))
lr_mae = mean_absolute_error(y_fuel_test, y_fuel_pred_lr)

print("Linear Regression - Fuel Consumption:")
print(f"  R2 Score: {lr_r2:.4f}")
print(f"  RMSE: {lr_rmse:.4f}")
print(f"  MAE: {lr_mae:.4f}")

# 5-fold cross-validated R2 on the training split; spread reported as 2 * std.
cv_scores = cross_val_score(
    estimator=lr_fuel,
    X=X_train_fuel_scaled,
    y=y_fuel_train,
    cv=5,
    scoring='r2',
)
cv_mean, cv_spread = cv_scores.mean(), cv_scores.std() * 2
print(f"  CV R2 Score (mean): {cv_mean:.4f} (+/- {cv_spread:.4f})")


In [None]:
# Random Forest (trained on unscaled features)
rf_fuel = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=10)
rf_fuel.fit(X_train_fuel, y_fuel_train)
y_fuel_pred_rf = rf_fuel.predict(X_test_fuel)

# Hold-out metrics on the test split.
print("Random Forest - Fuel Consumption:")
print(f"  R2 Score: {r2_score(y_fuel_test, y_fuel_pred_rf):.4f}")
print(f"  RMSE: {np.sqrt(mean_squared_error(y_fuel_test, y_fuel_pred_rf)):.4f}")
print(f"  MAE: {mean_absolute_error(y_fuel_test, y_fuel_pred_rf):.4f}")

# Cross-validation score (5-fold R2 on the training split; spread = 2 * std)
cv_scores = cross_val_score(rf_fuel, X_train_fuel, y_fuel_train, cv=5, scoring='r2')
print(f"  CV R2 Score (mean): {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Feature importance, most important feature first
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': rf_fuel.feature_importances_
}).sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)

# Visualize feature importance
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('../../outputs/figures', exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 6))
# `palette` without `hue` is deprecated in seaborn 0.13. Mapping hue to the same
# variable as y keeps one colour per bar; dodge=False keeps full-width bars.
sns.barplot(data=feature_importance, x='importance', y='feature',
            hue='feature', dodge=False, palette='viridis', ax=ax)
# Some seaborn versions add a (redundant) legend when hue is set; drop it if present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title('Random Forest Feature Importance - Fuel Consumption', fontsize=14, fontweight='bold')
ax.set_xlabel('Importance')
fig.tight_layout()
fig.savefig('../../outputs/figures/feature_importance_fuel.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Gradient Boosting (trained on unscaled features)
gb_fuel = GradientBoostingRegressor(
    n_estimators=100,
    random_state=42,
    max_depth=5,
    learning_rate=0.1,
)
gb_fuel.fit(X_train_fuel, y_fuel_train)
y_fuel_pred_gb = gb_fuel.predict(X_test_fuel)

# Hold-out metrics on the test split.
gb_r2 = r2_score(y_fuel_test, y_fuel_pred_gb)
gb_rmse = np.sqrt(mean_squared_error(y_fuel_test, y_fuel_pred_gb))
gb_mae = mean_absolute_error(y_fuel_test, y_fuel_pred_gb)

print("Gradient Boosting - Fuel Consumption:")
print(f"  R2 Score: {gb_r2:.4f}")
print(f"  RMSE: {gb_rmse:.4f}")
print(f"  MAE: {gb_mae:.4f}")

# 5-fold cross-validated R2 on the training split; spread reported as 2 * std.
cv_scores = cross_val_score(gb_fuel, X_train_fuel, y_fuel_train, cv=5, scoring='r2')
print(f"  CV R2 Score (mean): {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Compare models: one row per model, built from the test-set predictions
# produced in this cell and in the two preceding model cells.
print("\n=== Model Comparison for Fuel Consumption ===")
fuel_predictions = {
    'Linear Regression': y_fuel_pred_lr,
    'Random Forest': y_fuel_pred_rf,
    'Gradient Boosting': y_fuel_pred_gb,
}
models_comparison = pd.DataFrame({
    'Model': list(fuel_predictions),
    'R2 Score': [r2_score(y_fuel_test, pred) for pred in fuel_predictions.values()],
    'RMSE': [np.sqrt(mean_squared_error(y_fuel_test, pred)) for pred in fuel_predictions.values()],
    'MAE': [mean_absolute_error(y_fuel_test, pred) for pred in fuel_predictions.values()],
})
print(models_comparison.round(4))


## 3. Model 2: Predict CO2 Emissions


In [None]:
# Multiple models for CO2 emissions prediction
# Linear Regression (uses the scaled features)
lr_co2 = LinearRegression()
lr_co2.fit(X_train_co2_scaled, y_co2_train)
y_co2_pred_lr = lr_co2.predict(X_test_co2_scaled)

# Random Forest for CO2 (unscaled features)
rf_co2 = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=10)
rf_co2.fit(X_train_co2, y_co2_train)
y_co2_pred_rf = rf_co2.predict(X_test_co2)

# Gradient Boosting for CO2 (unscaled features)
gb_co2 = GradientBoostingRegressor(n_estimators=100, random_state=42, max_depth=5, learning_rate=0.1)
gb_co2.fit(X_train_co2, y_co2_train)
y_co2_pred_gb = gb_co2.predict(X_test_co2)

# Compute each test-set metric once per model; the same numbers feed both the
# per-model printout and the comparison table below.
co2_predictions = {
    'Linear Regression': y_co2_pred_lr,
    'Random Forest': y_co2_pred_rf,
    'Gradient Boosting': y_co2_pred_gb,
}
co2_metric_rows = [
    {
        'Model': model_name,
        'R2 Score': r2_score(y_co2_test, pred),
        'RMSE': np.sqrt(mean_squared_error(y_co2_test, pred)),
        'MAE': mean_absolute_error(y_co2_test, pred),
    }
    for model_name, pred in co2_predictions.items()
]

print("=== CO2 Emissions Prediction Results ===\n")
for row in co2_metric_rows:
    print(f"{row['Model']} - CO2 Emissions:")
    print(f"  R2 Score: {row['R2 Score']:.4f}")
    print(f"  RMSE: {row['RMSE']:.4f}")
    print(f"  MAE: {row['MAE']:.4f}\n")

# Compare models
models_comparison_co2 = pd.DataFrame(co2_metric_rows, columns=['Model', 'R2 Score', 'RMSE', 'MAE'])
print("=== Model Comparison for CO2 Emissions ===")
print(models_comparison_co2.round(4))

# Predictions kept under a generic name. This is fixed to the Random Forest output.
# NOTE(review): the claim that Random Forest performs best is not derived from the
# table above -- confirm against the printed comparison.
y_co2_pred = y_co2_pred_rf

# Feature importance for CO2, most important feature first
feature_importance_co2 = pd.DataFrame({
    'feature': features,
    'importance': rf_co2.feature_importances_
}).sort_values('importance', ascending=False)
print("\nFeature Importance for CO2 Emissions:")
print(feature_importance_co2)

# Visualize feature importance
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('../../outputs/figures', exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 6))
# `palette` without `hue` is deprecated in seaborn 0.13. Mapping hue to the same
# variable as y keeps one colour per bar; dodge=False keeps full-width bars.
sns.barplot(data=feature_importance_co2, x='importance', y='feature',
            hue='feature', dodge=False, palette='plasma', ax=ax)
# Some seaborn versions add a (redundant) legend when hue is set; drop it if present.
if ax.get_legend() is not None:
    ax.get_legend().remove()
ax.set_title('Random Forest Feature Importance - CO2 Emissions', fontsize=14, fontweight='bold')
ax.set_xlabel('Importance')
fig.tight_layout()
fig.savefig('../../outputs/figures/feature_importance_co2.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Visualization: Actual vs Predicted (top row) and residuals (bottom row)
# for the Random Forest models; left column = fuel consumption, right = CO2.

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('../../outputs/figures', exist_ok=True)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Residual = actual - predicted, on the test split.
residuals_fuel = y_fuel_test - y_fuel_pred_rf
residuals_co2 = y_co2_test - y_co2_pred_rf

# One entry per figure column:
# (column index, quantity label, unit label, actual, predicted, residuals, extra scatter kwargs)
panels = [
    (0, 'Fuel Consumption', '(L/100km)', y_fuel_test, y_fuel_pred_rf, residuals_fuel, {}),
    (1, 'CO2 Emissions', '(g/km)', y_co2_test, y_co2_pred_rf, residuals_co2, {'color': 'green'}),
]

for col, label, unit, actual, predicted, residuals, marker_style in panels:
    # Top row: actual vs predicted, with the y = x reference line spanning
    # the range of the actual values.
    ax_pred = axes[0, col]
    ax_pred.scatter(actual, predicted, alpha=0.6, s=50, **marker_style)
    low, high = actual.min(), actual.max()
    ax_pred.plot([low, high], [low, high], 'r--', lw=2, label='Perfect Prediction')
    ax_pred.set_xlabel(f'Actual {label} {unit}', fontsize=11)
    ax_pred.set_ylabel(f'Predicted {label} {unit}', fontsize=11)
    ax_pred.set_title(f'Random Forest: {label}\nR² = {r2_score(actual, predicted):.3f}',
                      fontsize=12, fontweight='bold')
    ax_pred.legend()
    ax_pred.grid(True, alpha=0.3)

    # Bottom row: residuals against predictions, with a zero reference line.
    ax_res = axes[1, col]
    ax_res.scatter(predicted, residuals, alpha=0.6, s=50, **marker_style)
    ax_res.axhline(y=0, color='r', linestyle='--', linewidth=2)
    ax_res.set_xlabel(f'Predicted {label} {unit}', fontsize=11)
    ax_res.set_ylabel('Residuals', fontsize=11)
    ax_res.set_title(f'Residual Plot: {label}', fontsize=12, fontweight='bold')
    ax_res.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../../outputs/figures/ml_predictions.png', dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Save models
import joblib
import os

# Create models directory if it doesn't exist
models_dir = '../../outputs/models'
os.makedirs(models_dir, exist_ok=True)

# Fitted estimators plus the feature scaler, keyed by output file name.
model_artifacts = {
    'random_forest_fuel.pkl': rf_fuel,
    'random_forest_co2.pkl': rf_co2,
    'gradient_boosting_fuel.pkl': gb_fuel,
    'gradient_boosting_co2.pkl': gb_co2,
    'linear_regression_fuel.pkl': lr_fuel,
    'linear_regression_co2.pkl': lr_co2,
    'scaler.pkl': scaler,
}

# Fitted label encoders, keyed by output file name.
encoder_artifacts = {
    'le_make.pkl': le_make,
    'le_model.pkl': le_model,
    'le_class.pkl': le_class,
    'le_transmission.pkl': le_transmission,
    'le_fuel.pkl': le_fuel,
}

# Write models first, then encoders, each to "<models_dir>/<file name>".
for file_name, artifact in {**model_artifacts, **encoder_artifacts}.items():
    joblib.dump(artifact, f'{models_dir}/{file_name}')

print("Models and encoders saved successfully!")
print("\nSaved files:")
for file_name in model_artifacts:
    print(f"  - {file_name}")
print(f"  - Label encoders ({len(encoder_artifacts)} files)")
