In [1]:
%load_ext rpy2.ipython

In [2]:
%%R 

# Loading necessary libraries
library(dplyr)       # For data manipulation
library(ggplot2)     # For data visualization
library(caret)       # For machine learning utilities
library(tibble)      # For data frame manipulation
library(pROC)        # For model performance evaluation
library(ROSE)        # For oversampling/undersampling imbalanced data
library(glmnet)      # For regularized regression models

#Source with my functions
source("CalculatedFieldSubroutines.R") 


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: lattice
Type 'citation("pROC")' for a citation.

Attaching package: ‘pROC’

The following objects are masked from ‘package:stats’:

    cov, smooth, var

Loaded ROSE 0.0-4

Loading required package: Matrix
Loaded glmnet 4.1-8
In (function (package, help, pos = 2, lib.loc = NULL, character.only = FALSE,  :
  libraries ‘/usr/local/lib/R/site-library’, ‘/usr/lib/R/site-library’ contain no packages


In [3]:
%%R

# latlongStdDev function to calculate the combined standard deviation of latitude and longitude
LatLonStdDev <- function(best_pose_df) {
    # Check if the required columns exist in the data
    if (!all(c("latitudeStdDev", "longitudeStdDev") %in% colnames(best_pose_df))) {
        stop("The input dataframe must contain 'latitudeStdDev' and 'longitudeStdDev' columns.")
    }
    
    # Calculate the Euclidean distance for latitude and longitude standard deviations
    best_pose_df$latlongStdDev <- sqrt(best_pose_df$latitudeStdDev^2 + best_pose_df$longitudeStdDev^2)
    
    # Return the modified dataframe
    return(best_pose_df)
}

In [4]:
%%R
# We initialize an empty data frame for training data
train_data <- data.frame()

# Loop through each GMID to load chassis data
for (gmID in red_route_gmIDs) {
    chassis_file_path <- file.path("data", gmID, "_apollo_canbus_chassis", paste0(gmID, "_apollo_canbus_chassis.csv"))
    
    if (file.exists(chassis_file_path)) {
        # Load the chassis data
        chassis_data <- read.csv(chassis_file_path)
        
        # Append the loaded data to train_data
        train_data <- rbind(train_data, chassis_data)
    } else {
        warning(paste("Chassis file not found for GMID:", gmID))
    }
}

Error in (function (expr, envir = parent.frame(), enclos = if (is.list(envir) ||  : 
  object 'red_route_gmIDs' not found


RInterpreterError: Failed to parse and evaluate line '# We initialize an empty data frame for training data\ntrain_data <- data.frame()\n\n# Loop through each GMID to load chassis data\nfor (gmID in red_route_gmIDs) {\n    chassis_file_path <- file.path("data", gmID, "_apollo_canbus_chassis", paste0(gmID, "_apollo_canbus_chassis.csv"))\n    \n    if (file.exists(chassis_file_path)) {\n        # Load the chassis data\n        chassis_data <- read.csv(chassis_file_path)\n        \n        # Append the loaded data to train_data\n        train_data <- rbind(train_data, chassis_data)\n    } else {\n        warning(paste("Chassis file not found for GMID:", gmID))\n    }\n}\n'.
R error message: "Error in (function (expr, envir = parent.frame(), enclos = if (is.list(envir) ||  : \n  object 'red_route_gmIDs' not found"

In [2]:
%%R

#install.packages("tibble")
#install.packages("dplyr")
#install.packages(ggplot2)
#install.packages("caret")
#install.packages("glmnet")

library(tibble)
library(dplyr)
library(ggplot2)
library(caret)
library(pROC)
library(ROSE)
library(glmnet)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Loading required package: lattice
Type 'citation("pROC")' for a citation.

Attaching package: ‘pROC’

The following objects are masked from ‘package:stats’:

    cov, smooth, var

Loaded ROSE 0.0-4

Loading required package: Matrix
Loaded glmnet 4.1-8
In (function (package, help, pos = 2, lib.loc = NULL, character.only = FALSE,  :
  libraries ‘/usr/local/lib/R/site-library’, ‘/usr/lib/R/site-library’ contain no packages


In [3]:
%%R

# Function to add a binary driving mode column
binary_driving_mode <- function(chassis_df) {
  chassis_df$BinaryDrivingMode <- sapply(chassis_df$drivingMode, function(drive_mode) {
    if (drive_mode %in% c("COMPLETE_MANUAL", "EMERGENCY_MODE")) {
      return(0)
    } else if (drive_mode == "COMPLETE_AUTO_DRIVE") {
      return(1)
    } else {
      stop(paste("Unknown driving mode:", drive_mode))
    }
  })
  return(chassis_df)
}

# Function to calculate ternary driving mode transitions
ternary_driving_mode_transition <- function(time_sorted_chassis_df) {
  binary_drive_mode <- time_sorted_chassis_df$BinaryDrivingMode
  transitions <- c(0, diff(binary_drive_mode))
  time_sorted_chassis_df$TernaryDrivingModeTransition <- transitions
  return(time_sorted_chassis_df)
}

# Function to calculate total standard deviation of latitude and longitude
lat_lon_total_std_dev <- function(best_pose_df) {
  best_pose_df$LatLonTotalStdDev <- sqrt(best_pose_df$latitudeStdDev^2 + best_pose_df$longitudeStdDev^2)
  return(best_pose_df)
}

# Function to match chassis times with best pose times
chassis_best_pose_matched_time <- function(chassis_df, best_pose_df) {
  chassis_df$ChassisBestPoseMatchedTime <- sapply(chassis_df$time, function(chassis_time) {
    closest_time <- best_pose_df$time[which.min(abs(best_pose_df$time - chassis_time))]
    return(closest_time)
  })
  return(chassis_df)
}

# Function to calculate progress along a reference route
progress_along_route <- function(best_pose_df, time_sorted_reference_best_pose_df) {
  reference_lat <- time_sorted_reference_best_pose_df$latitude
  reference_lon <- time_sorted_reference_best_pose_df$longitude
  reference_progress <- c(0)

  for (i in 1:(length(reference_lat) - 1)) {
    distance <- sqrt((reference_lat[i + 1] - reference_lat[i])^2 + (reference_lon[i + 1] - reference_lon[i])^2)
    reference_progress <- c(reference_progress, tail(reference_progress, 1) + distance)
  }

  reference_progress <- reference_progress / max(reference_progress)

  best_pose_df$ProgressAlongRoute <- sapply(1:nrow(best_pose_df), function(i) {
    distances <- (reference_lat - best_pose_df$latitude[i])^2 + (reference_lon - best_pose_df$longitude[i])^2
    closest_index <- which.min(distances)
    return(reference_progress[closest_index])
  })

  return(best_pose_df)
}

# Function to normalize time
normalized_time <- function(topic_df) {
  topic_df$NormalizedTime <- topic_df$time - min(topic_df$time)
  return(topic_df)
}

# Function to calculate delta time
delta_time <- function(time_sorted_topic_df) {
  delta_time <- c(0, diff(time_sorted_topic_df$time))
  time_sorted_topic_df$DeltaTime <- delta_time
  return(time_sorted_topic_df)
}

# Function to calculate cumulative distance
calculate_distance <- function(time_sorted_chassis_df) {
  delta_time_sec <- time_sorted_chassis_df$DeltaTime * 1e-9
  distance <- cumsum(delta_time_sec * time_sorted_chassis_df$speedMps)
  time_sorted_chassis_df$Distance <- distance
  return(time_sorted_chassis_df)
}

# Function to merge drive events with chassis data
merge_chassis_drive_event <- function(chassis_df, drive_event_df) {
  chassis_df$DriveEvent <- NA
  chassis_df$DriveEventType <- NA
  for (i in 1:nrow(drive_event_df)) {
    closest_index <- which.min(abs(chassis_df$time - drive_event_df$time[i]))
    chassis_df$DriveEvent[closest_index] <- drive_event_df$event[i]
    chassis_df$DriveEventType[closest_index] <- drive_event_df$type[i]
  }
  return(chassis_df)
}

# Function to calculate distance to the nearest disengagement
distance_to_nearest_disengagement <- function(time_sorted_chassis_df) {
  delta_time_sec <- c(0, diff(time_sorted_chassis_df$time)) * 1e-9
  cumulative_distance <- cumsum(delta_time_sec * time_sorted_chassis_df$speedMps)
  disengagement_indexes <- which(time_sorted_chassis_df$TernaryDrivingModeTransition == -1)
  disengagement_distances <- cumulative_distance[disengagement_indexes]

  if (length(disengagement_indexes) == 0) {
    time_sorted_chassis_df$DistanceToNearestDisengagement <- NA
    time_sorted_chassis_df$NearestDisengagementID <- NA
    return(time_sorted_chassis_df)
  }

  distances_to_disengagement <- sapply(1:length(cumulative_distance), function(i) {
    min(abs(cumulative_distance[i] - disengagement_distances))
  })

  nearest_disengagement_ids <- sapply(1:length(cumulative_distance), function(i) {
    closest_index <- which.min(abs(cumulative_distance[i] - disengagement_distances))
    paste(time_sorted_chassis_df$groupMetadataID[1], disengagement_indexes[closest_index], sep = "_")
  })

  time_sorted_chassis_df$DistanceToNearestDisengagement <- distances_to_disengagement
  time_sorted_chassis_df$NearestDisengagementID <- nearest_disengagement_ids
  return(time_sorted_chassis_df)
}

# Function to calculate acceleration based on chassis time
acceleration_chassistime <- function(time_sorted_chassis_df) {
  delta_time_sec <- c(0, diff(time_sorted_chassis_df$time)) * 1e-9
  speed_diff <- c(0, diff(time_sorted_chassis_df$speedMps))
  acceleration <- speed_diff / delta_time_sec
  acceleration[is.na(acceleration)] <- 0
  time_sorted_chassis_df$Acceleration <- acceleration
  return(time_sorted_chassis_df)
}

# Function to calculate acceleration for best pose time intervals
acceleration_bestposetime <- function(time_sorted_merged_chassisbestpose_df) {
  unique_best_pose_times <- unique(time_sorted_merged_chassisbestpose_df$time_y)
  acceleration_list <- numeric()
  for (best_pose_time in unique_best_pose_times) {
    subset_df <- time_sorted_merged_chassisbestpose_df[time_sorted_merged_chassisbestpose_df$time_y == best_pose_time, ]
    delta_time <- (max(subset_df$time_x) - min(subset_df$time_x)) * 1e-9
    delta_speed <- max(subset_df$speedMps) - min(subset_df$speedMps)
    acceleration <- ifelse(delta_time > 0, delta_speed / delta_time, 0)
    acceleration_list <- c(acceleration_list, rep(acceleration, nrow(subset_df)))
  }
  time_sorted_merged_chassisbestpose_df$Acceleration <- acceleration_list
  return(time_sorted_merged_chassisbestpose_df)
}


In [8]:
%%R
gmIDs <- c(# Use the predefined list of 72 Red route gmIDs
  "6d2ea45a-c839-11ee-a7fc-dd032dba19e8", "8dbbbf1c-f0ef-11ee-ba29-fb353e7798cd",
  "35518ec4-f153-11ee-ba88-fb353e7798cd", "be857244-efc0-11ee-b966-fb353e7798cd",
  "f41cbd44-eff8-11ee-b966-fb353e7798cd", "c0555ef0-f50f-11ee-8afa-cb629b0d53e6",
  "e7b934a8-ef1a-11ee-9385-ef789ffde1d3", "3151e9e2-eff3-11ee-b966-fb353e7798cd",
  "d3698592-ef9d-11ee-b966-fb353e7798cd", "9798fe24-f143-11ee-ba78-fb353e7798cd",
  "c9c6856c-d33c-11ee-b437-336917683bb8", "96f7a614-f549-11ee-8afa-cb629b0d53e6",
  "457dc5ee-f02a-11ee-b966-fb353e7798cd", "b82476fe-f1f3-11ee-baff-fb353e7798cd",
  "41b67a28-f52f-11ee-8afa-cb629b0d53e6", "8fa6fe80-c869-11ee-a7fc-dd032dba19e8",
  "fd1ab258-efa7-11ee-b966-fb353e7798cd", "3d2d29ec-ef95-11ee-b966-fb353e7798cd",
  "8437f77a-cab7-11ee-909c-e1dc60cf66f9", "fcc6fcd2-f013-11ee-b966-fb353e7798cd",
  "d12cd1c4-caec-11ee-909c-e1dc60cf66f9", "1b6aca0e-efdf-11ee-b966-fb353e7798cd",
  "94c53148-eeed-11ee-9385-ef789ffde1d3", "cf831f42-f353-11ee-bb4e-fb353e7798cd",
  "05c7c824-cab8-11ee-aa4d-1d66adf2f0c7", "7cbd932e-f244-11ee-bb3f-fb353e7798cd",
  "286e019a-f204-11ee-bb07-fb353e7798cd", "fe973c9c-f53c-11ee-8afa-cb629b0d53e6",
  "ecebb942-f162-11ee-ba97-fb353e7798cd", "2a61b8a8-f528-11ee-8afa-cb629b0d53e6",
  "d21965e6-f0fa-11ee-ba37-fb353e7798cd", "ce6465b6-f51b-11ee-8afa-cb629b0d53e6",
  "d24820c8-f197-11ee-babe-fb353e7798cd", "622bd2e8-f0e4-11ee-ba1f-fb353e7798cd",
  "2f95c748-f009-11ee-b966-fb353e7798cd", "fc211bb2-efca-11ee-b966-fb353e7798cd",
  "c338788a-d324-11ee-b437-336917683bb8", "171c50bc-f106-11ee-ba42-fb353e7798cd",
  "c25271be-f3a4-11ee-bb4e-fb353e7798cd", "5a4bccf4-effe-11ee-b966-fb353e7798cd",
  "1bbbfbae-c839-11ee-a7fc-dd032dba19e8", "f0eebb6a-f0dc-11ee-ba1e-fb353e7798cd",
  "72a03d4a-efe9-11ee-b966-fb353e7798cd", "853ef120-cad3-11ee-909c-e1dc60cf66f9",
  "8347b862-efad-11ee-b966-fb353e7798cd", "1ee938a2-f172-11ee-baa6-fb353e7798cd",
  "3344a3c0-f502-11ee-8afa-cb629b0d53e6", "51ef6da6-ca9f-11ee-909c-e1dc60cf66f9",
  "817d6848-efb6-11ee-b966-fb353e7798cd", "aa5dbcd2-ef10-11ee-9385-ef789ffde1d3",
  "dd72fdec-f0cf-11ee-ba0d-fb353e7798cd", "de933de8-f112-11ee-ba4d-fb353e7798cd",
  "211bdb36-f0da-11ee-ba1b-fb353e7798cd", "01e65360-efd4-11ee-b966-fb353e7798cd",
  "f43b6a70-f01e-11ee-b966-fb353e7798cd", "65cfbfd6-f396-11ee-bb4e-fb353e7798cd",
  "61b12e7a-f234-11ee-bb33-fb353e7798cd", "7fb7b9c0-c881-11ee-a7fc-dd032dba19e8",
  "88a68dd8-eef9-11ee-9385-ef789ffde1d3", "85b6e70e-ef7a-11ee-b966-fb353e7798cd",
  "868de15e-f3b3-11ee-bb4e-fb353e7798cd", "219f7eb8-ef87-11ee-b966-fb353e7798cd",
  "88dd6fbe-f224-11ee-bb21-fb353e7798cd", "f755cf60-f132-11ee-ba6d-fb353e7798cd",
  "84d96f18-f214-11ee-bb13-fb353e7798cd", "9189a2a8-f121-11ee-ba5b-fb353e7798cd",
  "43a1a35e-f362-11ee-bb4e-fb353e7798cd", "f711e68e-f0e1-11ee-ba1f-fb353e7798cd",
  "2462c9d0-eecd-11ee-9385-ef789ffde1d3"
)

# Split gmIDs into training and testing
train_gmIDs <- gmIDs[1:54]
test_gmIDs <- gmIDs[55:69]

# Print training and testing gmIDs
print("Training gmIDs:")
print(train_gmIDs)

print("Testing gmIDs:")
print(test_gmIDs)

# Now you can use these gmIDs to filter your data for training and testing sets.

    

[1] "Training gmIDs:"
 [1] "6d2ea45a-c839-11ee-a7fc-dd032dba19e8"
 [2] "8dbbbf1c-f0ef-11ee-ba29-fb353e7798cd"
 [3] "35518ec4-f153-11ee-ba88-fb353e7798cd"
 [4] "be857244-efc0-11ee-b966-fb353e7798cd"
 [5] "f41cbd44-eff8-11ee-b966-fb353e7798cd"
 [6] "c0555ef0-f50f-11ee-8afa-cb629b0d53e6"
 [7] "e7b934a8-ef1a-11ee-9385-ef789ffde1d3"
 [8] "3151e9e2-eff3-11ee-b966-fb353e7798cd"
 [9] "d3698592-ef9d-11ee-b966-fb353e7798cd"
[10] "9798fe24-f143-11ee-ba78-fb353e7798cd"
[11] "c9c6856c-d33c-11ee-b437-336917683bb8"
[12] "96f7a614-f549-11ee-8afa-cb629b0d53e6"
[13] "457dc5ee-f02a-11ee-b966-fb353e7798cd"
[14] "b82476fe-f1f3-11ee-baff-fb353e7798cd"
[15] "41b67a28-f52f-11ee-8afa-cb629b0d53e6"
[16] "8fa6fe80-c869-11ee-a7fc-dd032dba19e8"
[17] "fd1ab258-efa7-11ee-b966-fb353e7798cd"
[18] "3d2d29ec-ef95-11ee-b966-fb353e7798cd"
[19] "8437f77a-cab7-11ee-909c-e1dc60cf66f9"
[20] "fcc6fcd2-f013-11ee-b966-fb353e7798cd"
[21] "d12cd1c4-caec-11ee-909c-e1dc60cf66f9"
[22] "1b6aca0e-efdf-11ee-b966-fb353e7798cd"
[23] "94c5

In [14]:
%%R
# Ensure we have data loaded before proceeding
if (nrow(df_merged) == 0) {
  stop("No data was loaded. Check the directory and file structure.")
}

# Preprocess the merged dataset
df_merged <- df_merged %>%
  mutate(
    brakePercentage = ifelse(is.na(brakePercentage), mean(brakePercentage, na.rm = TRUE), brakePercentage),
    throttlePercentage = ifelse(is.na(throttlePercentage), mean(throttlePercentage, na.rm = TRUE), throttlePercentage),
    steeringPercentage = ifelse(is.na(steeringPercentage), mean(steeringPercentage, na.rm = TRUE), steeringPercentage)
  ) %>%
  na.omit()  # Remove rows with any remaining NA values

# Confirm how many gmIDs have valid data
valid_gmIDs <- unique(df_merged$gmID)
print(paste("Valid gmIDs with data:", length(valid_gmIDs)))

# Update training and testing splits with valid gmIDs
train_data <- df_merged %>% filter(gmID %in% train_gmIDs & gmID %in% valid_gmIDs)
test_data <- df_merged %>% filter(gmID %in% test_gmIDs & gmID %in% valid_gmIDs)

# Check for sufficient data in both training and testing sets
if (nrow(train_data) == 0 || nrow(test_data) == 0) {
  stop("Training or testing data is empty after filtering for valid gmIDs.")
}

# Train the logistic regression model
model <- glm(
  binaryDrivingMode ~ speedMps + throttlePercentage + brakePercentage + steeringPercentage + latlongStdDev,
  data = train_data,
  family = binomial
)

# Summarize the model
print(summary(model))

# Predict on the testing set
predictions <- predict(model, test_data, type = "response")
predicted_classes <- ifelse(predictions > 0.5, 1, 0)

# Calculate accuracy
accuracy <- mean(predicted_classes == test_data$binaryDrivingMode)
print(paste("Model Accuracy:", accuracy))

# Evaluate using confusion matrix
conf_matrix <- confusionMatrix(
  factor(predicted_classes),
  factor(test_data$binaryDrivingMode)
)
print(conf_matrix)

# Visualize the confusion matrix
conf_table <- as.data.frame(conf_matrix$table)
ggplot(data = conf_table, aes(x = Prediction, y = Reference, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), color = "white", size = 5) +
  scale_fill_gradient(low = "lightblue", high = "blue") +
  labs(title = "Confusion Matrix", x = "Predicted Classes", y = "Actual Classes") +
  theme_minimal()

# Optionally plot ROC curve
roc_obj <- roc(test_data$binaryDrivingMode, predictions)
plot(roc_obj, col = "blue", main = "ROC Curve")


Error in (function (expr, envir = parent.frame(), enclos = if (is.list(envir) ||  : 
  No data was loaded. Check the directory and file structure.


RInterpreterError: Failed to parse and evaluate line '# Ensure we have data loaded before proceeding\nif (nrow(df_merged) == 0) {\n  stop("No data was loaded. Check the directory and file structure.")\n}\n\n# Preprocess the merged dataset\ndf_merged <- df_merged %>%\n  mutate(\n    brakePercentage = ifelse(is.na(brakePercentage), mean(brakePercentage, na.rm = TRUE), brakePercentage),\n    throttlePercentage = ifelse(is.na(throttlePercentage), mean(throttlePercentage, na.rm = TRUE), throttlePercentage),\n    steeringPercentage = ifelse(is.na(steeringPercentage), mean(steeringPercentage, na.rm = TRUE), steeringPercentage)\n  ) %>%\n  na.omit()  # Remove rows with any remaining NA values\n\n# Confirm how many gmIDs have valid data\nvalid_gmIDs <- unique(df_merged$gmID)\nprint(paste("Valid gmIDs with data:", length(valid_gmIDs)))\n\n# Update training and testing splits with valid gmIDs\ntrain_data <- df_merged %>% filter(gmID %in% train_gmIDs & gmID %in% valid_gmIDs)\ntest_data <- df_merged %>% filter(gmID %in% test_gmIDs & gmID %in% valid_gmIDs)\n\n# Check for sufficient data in both training and testing sets\nif (nrow(train_data) == 0 || nrow(test_data) == 0) {\n  stop("Training or testing data is empty after filtering for valid gmIDs.")\n}\n\n# Train the logistic regression model\nmodel <- glm(\n  binaryDrivingMode ~ speedMps + throttlePercentage + brakePercentage + steeringPercentage + latlongStdDev,\n  data = train_data,\n  family = binomial\n)\n\n# Summarize the model\nprint(summary(model))\n\n# Predict on the testing set\npredictions <- predict(model, test_data, type = "response")\npredicted_classes <- ifelse(predictions > 0.5, 1, 0)\n\n# Calculate accuracy\naccuracy <- mean(predicted_classes == test_data$binaryDrivingMode)\nprint(paste("Model Accuracy:", accuracy))\n\n# Evaluate using confusion matrix\nconf_matrix <- confusionMatrix(\n  factor(predicted_classes),\n  factor(test_data$binaryDrivingMode)\n)\nprint(conf_matrix)\n\n# Visualize the confusion matrix\nconf_table <- as.data.frame(conf_matrix$table)\nggplot(data = conf_table, aes(x = Prediction, y = Reference, fill = Freq)) +\n  geom_tile() +\n  geom_text(aes(label = Freq), color = "white", size = 5) +\n  scale_fill_gradient(low = "lightblue", high = "blue") +\n  labs(title = "Confusion Matrix", x = "Predicted Classes", y = "Actual Classes") +\n  theme_minimal()\n\n# Optionally plot ROC curve\nroc_obj <- roc(test_data$binaryDrivingMode, predictions)\nplot(roc_obj, col = "blue", main = "ROC Curve")\n'.
R error message: 'Error in (function (expr, envir = parent.frame(), enclos = if (is.list(envir) ||  : \n  No data was loaded. Check the directory and file structure.'