# Machine Learning Analysis
## Predictive Modeling for Fuel Consumption and CO2 Emissions

This notebook implements various ML algorithms to predict:
- Fuel Consumption
- CO2 Emissions

Algorithms used:
- Linear Regression
- Random Forest (the model trained in the code cells below)
- Gradient Boosting


In [None]:
# Load necessary libraries
library(caret)
library(randomForest)
library(e1071)

# Read the raw fuel-consumption table. Text columns are kept as character
# vectors, and whitespace around the header names is trimmed.
df <- read.csv(
  file = "../../data/FuelConsumption.csv",
  stringsAsFactors = FALSE
)
names(df) <- trimws(names(df))

# Label-encode the categorical columns: every distinct value is replaced by
# the numeric position of its factor level.
df[["MAKE_encoded"]] <- as.numeric(as.factor(df$MAKE))
df[["VEHICLE.CLASS_encoded"]] <- as.numeric(as.factor(df$VEHICLE.CLASS))
df[["TRANSMISSION_encoded"]] <- as.numeric(as.factor(df$TRANSMISSION))
df[["FUEL_encoded"]] <- as.numeric(as.factor(df$FUEL))

# Regression targets.
y_fuel <- df$FUEL.CONSUMPTION
y_co2 <- df$COEMISSIONS

# Predictor matrix: three numeric columns plus the four encoded ones.
features <- c(
  "Year",
  "ENGINE.SIZE",
  "CYLINDERS",
  "MAKE_encoded",
  "VEHICLE.CLASS_encoded",
  "TRANSMISSION_encoded",
  "FUEL_encoded"
)
X <- df[features]


## 1. Data Preprocessing and Splitting


In [None]:
# Partition the rows into training and test sets, once per target.
# The seed is set a single time, so the fuel partition is drawn first and the
# CO2 partition second from the same random stream.
set.seed(42)

# Fuel consumption: p = 0.8 asks caret to put about 80% of rows in training.
trainIndex_fuel <- createDataPartition(y = y_fuel, p = 0.8, list = FALSE)
y_fuel_train <- y_fuel[trainIndex_fuel]
y_fuel_test <- y_fuel[-trainIndex_fuel]
X_train_fuel <- X[trainIndex_fuel, ]
X_test_fuel <- X[-trainIndex_fuel, ]

# CO2 emissions: an independent partition with the same proportion.
trainIndex_co2 <- createDataPartition(y = y_co2, p = 0.8, list = FALSE)
y_co2_train <- y_co2[trainIndex_co2]
y_co2_test <- y_co2[-trainIndex_co2]
X_train_co2 <- X[trainIndex_co2, ]
X_test_co2 <- X[-trainIndex_co2, ]

# Report the sizes of the fuel split only.
cat("Data split completed!\n")
cat("Training set size:", dim(X_train_fuel)[1L], "\n")
cat("Test set size:", dim(X_test_fuel)[1L], "\n")


## 2. Model Training - Fuel Consumption


In [None]:
# Fit a 100-tree random forest on the fuel-consumption training split.
# mtry is set to the square root of the number of predictor columns.
# NOTE(review): that value is generally not a whole number; confirm how
# randomForest coerces a fractional mtry.
rf_fuel <- randomForest(
  x = X_train_fuel,
  y = y_fuel_train,
  ntree = 100,
  mtry = sqrt(ncol(X_train_fuel))
)

# Predict on the held-out fuel rows.
y_fuel_pred <- predict(rf_fuel, newdata = X_test_fuel)

# Hold-out metrics from caret. R2() is called with its default formula
# ("corr", the squared correlation between predictions and observations).
r2_fuel <- R2(pred = y_fuel_pred, obs = y_fuel_test)
rmse_fuel <- RMSE(pred = y_fuel_pred, obs = y_fuel_test)
mae_fuel <- MAE(pred = y_fuel_pred, obs = y_fuel_test)

# Print the metrics rounded to four decimals.
cat("Random Forest - Fuel Consumption:\n")
cat("  R2 Score:", round(r2_fuel, digits = 4), "\n")
cat("  RMSE:", round(rmse_fuel, digits = 4), "\n")
cat("  MAE:", round(mae_fuel, digits = 4), "\n")


## 3. Model Training - CO2 Emissions


In [None]:
# Fit a 100-tree random forest on the CO2-emissions training split.
# mtry is set to the square root of the number of predictor columns.
# NOTE(review): that value is generally not a whole number; confirm how
# randomForest coerces a fractional mtry.
rf_co2 <- randomForest(
  x = X_train_co2,
  y = y_co2_train,
  ntree = 100,
  mtry = sqrt(ncol(X_train_co2))
)

# Predict on the held-out CO2 rows.
y_co2_pred <- predict(rf_co2, newdata = X_test_co2)

# Hold-out metrics from caret. R2() is called with its default formula
# ("corr", the squared correlation between predictions and observations).
r2_co2 <- R2(pred = y_co2_pred, obs = y_co2_test)
rmse_co2 <- RMSE(pred = y_co2_pred, obs = y_co2_test)
mae_co2 <- MAE(pred = y_co2_pred, obs = y_co2_test)

# Print the metrics rounded to four decimals.
cat("Random Forest - CO2 Emissions:\n")
cat("  R2 Score:", round(r2_co2, digits = 4), "\n")
cat("  RMSE:", round(rmse_co2, digits = 4), "\n")
cat("  MAE:", round(mae_co2, digits = 4), "\n")

# Persist both fitted forests as RDS files under the models output directory.
# The directory (and any missing parent directories) is created if absent.
model_dir <- "../../outputs/models"
if (!dir.exists(model_dir)) {
  dir.create(model_dir, recursive = TRUE)
}
saveRDS(rf_fuel, file = file.path(model_dir, "random_forest_fuel.rds"))
saveRDS(rf_co2, file = file.path(model_dir, "random_forest_co2.rds"))

# The check mark is written as a Unicode escape (U+2713) so the status line
# prints correctly regardless of how the source file itself is encoded.
cat("\n\u2713 Models saved!\n")
