# Protein-Coding Potential Prediction

This notebook trains machine learning models to predict whether a genomic region is protein-coding (encodes a protein) or instead encodes a non-coding RNA.

## Objectives
1. Load and explore genomic sequence data
2. Extract features from DNA sequences
3. Train machine learning models
4. Evaluate model performance
5. Visualize results


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    roc_curve, auc
)
import sys
import os

# Add src directory to path
sys.path.append('../src')
from protein_coding_predictor import ProteinCodingPredictor, SequenceFeatureExtractor

# Plot appearance: seaborn "whitegrid" theme and a 12x8 default figure size
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Seed NumPy's global random number generator
np.random.seed(seed=42)

print("Libraries imported successfully!")


## 2. Load and Explore Data


In [None]:
# Read the genomics CSV into a DataFrame
data_path = '../data/genomics_data.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Print the DataFrame's shape, then leave a five-row preview as the final
# expression so the notebook renders it as the cell output
print(f"Dataset shape: {df.shape}")
print(f"\nFirst few rows:")
df.head(n=5)


In [None]:
# Run the full pipeline with a random-forest predictor: load the CSV, prepare
# the data, train, then write the performance plots and the model to disk.

# Create the output directories up front. On a fresh checkout ../results and
# ../models may not exist, and a failed save would only surface after training
# has already run. exist_ok=True makes this a no-op when they are present.
os.makedirs('../results', exist_ok=True)
os.makedirs('../models', exist_ok=True)

predictor = ProteinCodingPredictor(model_type='random_forest')
predictor.load_data('../data/genomics_data.csv')

# NOTE(review): presumably X is the feature matrix and y the class labels;
# confirm against ProteinCodingPredictor.prepare_data.
X, y = predictor.prepare_data()
results = predictor.train(X, y)

predictor.plot_results(results, save_path='../results/performance_plots.png')
predictor.save_model('../models/protein_coding_predictor.pkl')
