# Protein-Coding Potential Prediction (R)

This notebook implements machine learning models in R to predict whether genomic regions encode proteins (protein-coding) or non-coding RNAs.

## Objectives
1. Load and explore genomic sequence data
2. Extract features from DNA sequences
3. Train machine learning models
4. Evaluate model performance
5. Visualize results


In [None]:
# Load required libraries.
# NOTE: the attach order decides which package wins when two packages export
# the same function name, so keep the order below unchanged.
# Only ggplot2 and gridExtra are called directly in this notebook; the other
# packages are presumably needed by ../R/protein_coding_predictor.R, which is
# sourced later (TODO confirm against that script).
library(dplyr)
library(caret)
library(randomForest)
library(gbm)
library(e1071)
library(pROC)
library(ggplot2)
library(gridExtra)

# Set random seed so that steps drawing random numbers are repeatable across runs
set.seed(42)

cat("Libraries loaded successfully!\n")


## 2. Load and Explore Data


In [None]:
# Load data
# Read the labelled sequences from CSV. Text columns are kept as character
# (not factor) because later cells call nchar() on the sequence column.
data_path <- "../data/genomics_data.csv"
df <- read.csv(data_path, stringsAsFactors = FALSE)

# Fail early with a clear message if the columns used by the rest of the
# notebook are absent; otherwise df$Sequences / df$Labels would silently
# evaluate to NULL and only break in a later cell.
required_cols <- c("Sequences", "Labels")
missing_cols <- setdiff(required_cols, names(df))
if (length(missing_cols) > 0) {
  stop(
    "Missing required column(s) in ", data_path, ": ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

cat("Dataset shape:", nrow(df), "rows,", ncol(df), "columns\n")
cat("\nFirst few rows:\n")
head(df)


In [None]:
# Explore data: class balance, then per-sequence length and GC content.
cat("Class Distribution:\n")
table(df$Labels)
cat("\nClass Proportions:\n")
prop.table(table(df$Labels))

# Analyze sequence characteristics
df$Sequence_Length <- nchar(df$Sequences)

# GC content = (number of "G" + number of "C") / sequence length.
# Counting is vectorised: strip every character that is not G or C and measure
# what is left. Unlike the per-sequence sapply()/strsplit() approach this
# splits nothing twice, works on a zero-row data frame, and does not attach
# the full sequences as names to the result.
# Matching is case-sensitive (upper-case G/C only). An empty sequence yields
# NaN (0 / 0).
df$GC_Content <- nchar(gsub("[^GC]", "", df$Sequences)) / df$Sequence_Length

# Visualize
# NOTE(review): the legend labels are positional, so this assumes the sorted
# levels of Labels put the non-coding class first (e.g. 0 then 1) — confirm
# against the dataset.
p1 <- ggplot(df, aes(x = factor(Labels), y = GC_Content, fill = factor(Labels))) +
  geom_boxplot() +
  scale_fill_manual(
    values = c("skyblue", "lightcoral"),
    labels = c("Non-coding", "Protein-coding")
  ) +
  labs(title = "GC Content by Class", x = "Class", y = "GC Content") +
  theme_minimal()

p2 <- ggplot(df, aes(x = factor(Labels), fill = factor(Labels))) +
  geom_bar() +
  scale_fill_manual(
    values = c("skyblue", "lightcoral"),
    labels = c("Non-coding", "Protein-coding")
  ) +
  labs(title = "Class Distribution", x = "Class", y = "Count") +
  theme_minimal()

# Show both plots side by side
grid.arrange(p1, p2, ncol = 2)


## 3. Source Prediction Functions


In [None]:
# Evaluate the project's main R script so the functions it defines become
# available to the cells below. The path is relative to the working directory.
predictor_script <- "../R/protein_coding_predictor.R"
source(file = predictor_script)
cat("Functions loaded successfully!\n")


## 4. Extract Features and Train Model


In [None]:
# Turn the raw sequences into a feature table, then fit the classifier.
# extract_features() and train_model() come from the sourced predictor script.
cat("Extracting features from sequences...\n")
labels <- df$Labels
sequences <- df$Sequences
X <- extract_features(sequences)
y <- labels

# Report the dimensions of the extracted features (rows x columns)
cat("Feature matrix shape:", dim(X)[1], "x", dim(X)[2], "\n")
cat("Number of features:", dim(X)[2], "\n")

# Fit the model; the returned object is reused by the cells that follow
results <- train_model(
  X,
  y,
  model_type = "randomForest"
)


## 5. Visualize Results


In [None]:
# Plot results
# Make sure the output directory exists before plotting. plot_results() comes
# from the sourced script, so whether it creates the directory itself is not
# visible here; dir.create() is a no-op when the directory already exists.
plot_path <- "../results/performance_plots_R.png"
dir.create(dirname(plot_path), recursive = TRUE, showWarnings = FALSE)
plot_results(results, save_path = plot_path)


## 6. Make Predictions


In [None]:
# Example predictions
# Two hard-coded 50-base example sequences to run through the trained model.
test_sequences <- c(
  "GTCCACGACCGAACTCCCACCTTGACCGCAGAGGTACCACCAGAGCCCTG",
  "GAGTTTATATGGCGCGAGCCTAGTGGTTTTTGTACTTGTTTGTCGCGTCG"
)

# predict_sequences() is defined in the sourced predictor script.
# NOTE(review): the loop below assumes its result has `predictions` and
# `probabilities` elements aligned with `test_sequences` — confirm there.
pred_results <- predict_sequences(results$model, test_sequences,
                                  model_type = results$model_type)

# seq_along() instead of 1:length() so an empty input vector yields zero
# iterations rather than iterating over c(1, 0).
for (i in seq_along(test_sequences)) {
  # Show only the first 30 bases to keep the output compact
  cat("\nSequence", i, ":", substr(test_sequences[i], 1, 30), "...\n")
  cat("Prediction:", as.character(pred_results$predictions[i]), "\n")
  cat("Probability:", sprintf("%.4f", pred_results$probabilities[i]), "\n")
}


## 7. Save Model


In [None]:
# Save model
# saveRDS() does not create missing directories, so make sure the target
# directory exists first (a no-op when it is already there).
model_path <- "../models/protein_coding_predictor_R.rds"
dir.create(dirname(model_path), recursive = TRUE, showWarnings = FALSE)
saveRDS(results$model, model_path)
cat("Model saved successfully!\n")
