# Mutation Impact and Pathogenicity Prediction (R)

This notebook demonstrates how to predict the functional impact of mutations using R and machine learning approaches.

## Objectives
1. Load and preprocess genomics mutation data
2. Encode DNA sequences for machine learning
3. Train multiple ML models
4. Evaluate and compare model performance
5. Make predictions on new mutations


## 1. Install and Load Required Libraries


In [None]:
# Install packages if needed (run once)
# install.packages(c("randomForest", "e1071", "xgboost", "caret", "pROC", "ggplot2", "dplyr", "tidyr"))

# Load libraries
library(randomForest)
library(e1071)
library(xgboost)
library(caret)
library(pROC)
library(ggplot2)
library(dplyr)

# Source custom functions
source("../src/data_loader.R")
source("../src/models.R")


## 2. Load and Explore Data


In [None]:
# Read the dataset through the project loader sourced earlier in the notebook
data <- load_genomics_data()

# Report the dimensions, then preview the leading rows
data_dim <- dim(data)
cat("Dataset shape:", data_dim[1L], "x", data_dim[2L], "\n")
cat("\nFirst few rows:\n")
head(data)

# Column-by-column structure of the dataset
cat("\nDataset summary:\n")
str(data)

# Tabulate the labels once; the same counts feed the percentage view below
label_counts <- table(data$Labels)

cat("\nClass distribution:\n")
label_counts

cat("\nClass distribution percentage:\n")
100 * prop.table(label_counts)


In [None]:
# Visualize class distribution
library(ggplot2)

# Human-readable class names, computed once and shared by both plots.
# NOTE(review): assumes Labels has exactly two distinct values and that the
# lower-sorting value is the benign class - confirm against the data loader.
plot_data <- data.frame(
  class_label = factor(data$Labels, labels = c("Benign", "Pathogenic"))
)

# Colors are keyed by class name, so each color stays attached to its class
# no matter how many bars end up being drawn (e.g. an extra NA bar). A
# constant fill vector of length 2 would error in that situation.
class_colors <- c(Benign = "skyblue", Pathogenic = "salmon")

# Bar plot: count of observations per class. The legend is suppressed because
# the x axis already names each class.
p1 <- ggplot(plot_data, aes(x = class_label, fill = class_label)) +
  geom_bar(show.legend = FALSE) +
  scale_fill_manual(values = class_colors) +
  labs(title = "Class Distribution", x = "Label", y = "Count") +
  theme_minimal()

# Pie chart: a single stacked bar wrapped into polar coordinates, so each
# slice's angle is proportional to its class count.
p2 <- ggplot(plot_data, aes(x = "", fill = class_label)) +
  geom_bar(width = 1) +
  coord_polar("y") +
  labs(title = "Class Distribution (Percentage)", fill = "Label") +
  theme_void()

print(p1)
print(p2)


## 3. Prepare Data for Machine Learning


In [None]:
# Ask prepare_data() for a split using the "onehot" encoding method,
# test_size 0.2 and random_state 42
data_split <- prepare_data(
  encoding_method = "onehot",
  test_size = 0.2,
  random_state = 42
)

# Unpack the training partition, then the test partition
X_train <- data_split$X_train
y_train <- data_split$y_train
X_test <- data_split$X_test
y_test <- data_split$y_test

# Print "<label> <rows> x <cols>" for a feature table
report_shape <- function(label, features) {
  cat(label, nrow(features), "x", ncol(features), "\n")
}

report_shape("Training set shape:", X_train)
report_shape("Test set shape:", X_test)

# Label counts in each partition
cat("\nTraining set class distribution:\n")
table(y_train)
cat("\nTest set class distribution:\n")
table(y_test)


## 4. Train Multiple Models


In [None]:
# Model types to train, in the order they are trained and reported
model_types <- c("random_forest", "xgboost", "svm", "logistic")

# Named lists keyed by model type: fitted models and their evaluation metrics
models <- list()
results <- list()

# Separator line printed above and below each model's heading
banner <- strrep("=", 60)

# Train and evaluate each model in turn
for (model_type in model_types) {
  cat("\n", banner, "\n", sep = "")
  cat("Training", toupper(model_type), "model...\n")
  cat(banner, "\n", sep = "")

  # Dispatch to the matching trainer. The final unnamed argument is the
  # default: an unrecognized type stops with an error instead of silently
  # reusing the model left over from the previous iteration.
  model <- switch(
    model_type,
    random_forest = train_random_forest(X_train, y_train),
    svm = train_svm(X_train, y_train),
    logistic = train_logistic_regression(X_train, y_train),
    xgboost = train_xgboost(X_train, y_train),
    stop("Unknown model type: ", model_type, call. = FALSE)
  )

  # Evaluate on the held-out test data
  metrics <- evaluate_model(model, X_test, y_test, model_type)

  # Store model and results under the model type's name
  models[[model_type]] <- model
  results[[model_type]] <- metrics
}


## 5. Compare Models


In [None]:
# Pull one scalar metric out of every entry in `results`.
# Returns a numeric vector named by model; a metric that is absent, not a
# single value, or NA becomes NA_real_ so every column stays a plain numeric
# vector instead of degrading into a list.
extract_metric <- function(metric_name) {
  vapply(
    results,
    function(metrics) {
      value <- metrics[[metric_name]]
      if (length(value) != 1L || is.na(value)) {
        NA_real_
      } else {
        as.numeric(value)
      }
    },
    numeric(1)
  )
}

# Create comparison data frame: one row per model, one column per metric.
# stringsAsFactors = FALSE keeps `model` a character column on R < 4.0 too.
comparison_df <- data.frame(
  model = names(results),
  accuracy = extract_metric("accuracy"),
  precision = extract_metric("precision"),
  recall = extract_metric("recall"),
  f1_score = extract_metric("f1_score"),
  roc_auc = extract_metric("roc_auc"),
  stringsAsFactors = FALSE
)

cat("Model Comparison:\n")

# round() on a whole data frame errors when any column is non-numeric (here
# the character `model` column), so round only the numeric columns of a
# display copy and leave comparison_df itself at full precision.
display_df <- comparison_df
is_metric <- vapply(display_df, is.numeric, logical(1))
display_df[is_metric] <- lapply(display_df[is_metric], round, digits = 4)
print(display_df)


In [None]:
# Visualize model comparison
library(ggplot2)
library(tidyr)

# Reshape to long format: one row per (model, metric) pair.
# pivot_longer() replaces the superseded gather().
comparison_long <- comparison_df %>%
  select(model, accuracy, precision, recall, f1_score) %>%
  pivot_longer(cols = -model, names_to = "metric", values_to = "value")

# Grouped bar chart: one cluster per model, one bar per metric.
# geom_col() draws bars whose heights are the values themselves.
ggplot(comparison_long, aes(x = model, y = value, fill = metric)) +
  geom_col(position = "dodge") +
  labs(title = "Model Comparison", x = "Model", y = "Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ylim(0, 1)


## 6. Train Ensemble Model


In [None]:
# Train ensemble model from three of the base model types
cat("\nTraining Ensemble Model...\n")
ensemble_models <- train_ensemble(
  X_train,
  y_train,
  model_types = c("random_forest", "xgboost", "svm")
)

# Predict using ensemble with the "voting" method
ensemble_pred <- predict_ensemble(ensemble_models, X_test, method = "voting")

# Evaluate ensemble.
# Both factors must share one level set: converting each vector on its own
# yields mismatched levels whenever the ensemble never predicts one of the
# classes, which confusionMatrix() rejects. The reference labels define the
# levels and the predictions are mapped onto them.
y_test_factor <- as.factor(y_test)
ensemble_pred_factor <- factor(ensemble_pred, levels = levels(y_test_factor))

# NOTE(review): with two classes confusionMatrix() treats the FIRST factor
# level as the positive class unless `positive` is given - confirm that this
# matches the class that Precision/Recall/F1 are meant to describe.
cm <- confusionMatrix(ensemble_pred_factor, y_test_factor)

banner <- strrep("=", 50)
cat("\n", banner, "\n", sep = "")
cat("ENSEMBLE MODEL RESULTS\n")
cat(banner, "\n", sep = "")
cat("Accuracy: ", sprintf("%.4f", cm$overall["Accuracy"]), "\n", sep = "")
cat("Precision:", sprintf("%.4f", cm$byClass["Precision"]), "\n", sep = "")
cat("Recall:   ", sprintf("%.4f", cm$byClass["Recall"]), "\n", sep = "")
cat("F1 Score: ", sprintf("%.4f", cm$byClass["F1"]), "\n", sep = "")
cat(banner, "\n", sep = "")


## 7. Save Best Model


In [None]:
# Select the model with the highest accuracy (first one wins on ties).
# which.max() returns a zero-length index when every accuracy is NA, so stop
# with a clear message instead of failing later on an empty lookup.
best_index <- which.max(comparison_df$accuracy)
if (length(best_index) == 0L) {
  stop("No model has a non-missing accuracy; nothing to save.", call. = FALSE)
}

# as.character() matters when the model column is a factor (the data.frame
# default before R 4.0): indexing a list with a factor uses its integer codes,
# which can select a different model than the one named.
best_model_type <- as.character(comparison_df$model[best_index])
best_model <- models[[best_model_type]]

# Create models directory if it doesn't exist
if (!dir.exists("../models")) {
  dir.create("../models", recursive = TRUE)
}

# Save model under a file name derived from the winning model type
model_path <- paste0("../models/", best_model_type, "_best.rds")
save_model_r(best_model, model_path)
cat("Best model (", best_model_type, ") saved to:", model_path, "\n")
