# Mutation Impact and Pathogenicity Prediction

This notebook demonstrates how to predict the functional impact of mutations (missense, nonsense, regulatory) using machine learning approaches.

## Objectives
1. Load and preprocess genomics mutation data
2. Encode DNA sequences for machine learning
3. Train multiple ML models
4. Evaluate and compare model performance
5. Make predictions on new mutations


## 1. Import Libraries


In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Add src directory to path
sys.path.append('../src')
from data_loader import GenomicsDataLoader
from models import MutationPredictor, ModelEnsemble

# Plot appearance for every figure in this notebook: a matplotlib style sheet
# plus seaborn's "husl" colour palette.
# NOTE(review): the 'seaborn-v0_8' style name only exists in newer matplotlib
# releases (3.6+, as far as I know) - confirm against the pinned version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# IPython magic: draw figures inline in the cell output. Kept as the last
# statement on purpose; the relative order of these three lines can influence
# the final rcParams, so do not reorder without checking the rendered plots.
%matplotlib inline


## 2. Load and Explore Data


In [None]:
# Initialize data loader (GenomicsDataLoader is imported from the local data_loader module)
loader = GenomicsDataLoader()

# Load data
# NOTE(review): everything below treats `data` as a pandas DataFrame with a
# 'Labels' column - confirm against GenomicsDataLoader.load_data().
data = loader.load_data()

# Display basic information
print("Dataset shape:", data.shape)
print("\nFirst few rows:")
print(data.head())
print("\nDataset info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() - that would emit a stray "None" line.
data.info()
print("\nClass distribution:")
print(data['Labels'].value_counts())
print("\nClass distribution percentage:")
# normalize=True yields fractions; scale to percentages for readability
print(data['Labels'].value_counts(normalize=True) * 100)


In [None]:
# Ask the loader for train/test partitions built with the 'onehot' encoding.
# NOTE(review): test_size=0.2 is presumably the fraction held out for testing and
# random_state the shuffling seed - confirm against GenomicsDataLoader.prepare_data().
split_options = {
    'encoding_method': 'onehot',
    'test_size': 0.2,
    'random_state': 42,
}
X_train, X_test, y_train, y_test = loader.prepare_data(**split_options)

# Dimensions of each partition
print(
    f"Training set shape: {X_train.shape}",
    f"Test set shape: {X_test.shape}",
    sep="\n",
)

# Per-class sample counts in each partition (np.bincount counts occurrences of
# each non-negative integer label)
print(
    f"\nTraining set class distribution: {np.bincount(y_train)}",
    f"Test set class distribution: {np.bincount(y_test)}",
    sep="\n",
)


## 3. Train Multiple Models


In [None]:
# Identifiers of the classifier families to fit, as passed to MutationPredictor
model_types = ['random_forest', 'gradient_boosting', 'logistic', 'svm', 'neural_network']

# Trained predictors and their evaluation metrics, both keyed by model type
models, results = {}, {}

for model_type in model_types:
    # Banner separating each model's output in the cell log
    print(
        f"\n{'='*60}",
        f"Training {model_type.upper()} model...",
        f"{'='*60}",
        sep="\n",
    )

    # Fit on the training partition, then score on the test partition
    model = MutationPredictor(model_type=model_type)
    model.train(X_train, y_train)
    metrics = model.evaluate(X_test, y_test)

    # Keep the predictor and its metrics under the same key for later comparison
    models[model_type], results[model_type] = model, metrics


## 4. Compare Models and Save Best Model


In [None]:
# Build the comparison table: after transposing there is one row per model type
# and one column per metric key returned by evaluate().
# (pandas is already imported in the notebook's first cell, so it is not
# re-imported here.)
comparison_df = pd.DataFrame(results).T
print("\nModel Comparison:")
print(comparison_df.round(4))

# Save the best model, where "best" means the highest value in the 'accuracy' column.
# The column is cast to float first: if the metric dicts mix value types, the
# transposed frame can end up with object dtype, on which idxmax may be rejected
# by some pandas versions.
# NOTE(review): this ranks models on the same test split used for reporting -
# confirm that is intended.
best_model_type = comparison_df['accuracy'].astype(float).idxmax()
best_model = models[best_model_type]
model_path = best_model.save_model()
print(f"\nBest model ({best_model_type}) saved to: {model_path}")
