# Transcription Factor Binding Prediction

This notebook analyzes and predicts transcription factor (TF) binding sites using several machine learning and deep learning models. Model training, evaluation, and plotting are delegated to the `TFBindingPredictor` class imported from `tf_binding_prediction`.

## Objectives
1. Load and explore genomics data
2. Preprocess DNA sequences
3. Train multiple models (CNN, LSTM, Random Forest, XGBoost, SVM)
4. Evaluate and compare model performance
5. Visualize results

## Dataset
- **Sequences**: DNA sequences, each 50 nucleotides long
- **Labels**: Binary labels stored in the `Labels` column (0 = no binding, 1 = TF binding)


In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report, roc_curve
)
import xgboost as xgb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks
import warnings
import os
import sys

# Add parent directory to path to import scripts
sys.path.append('../../scripts/python')
from tf_binding_prediction import TFBindingPredictor, DNASequenceEncoder

warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Set plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
%matplotlib inline


## 1. Data Loading and Exploration


In [None]:
# Load the raw CSV for a first look; `data_path` is reused when the predictor
# is constructed later in the notebook.
data_path = '../../data/genomics_data.csv'
df = pd.read_csv(data_path)

print(f"Data shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())
print("\nData info:")
# DataFrame.info() writes its report to stdout and returns None, so wrapping
# it in print() would emit a stray "None" line after the report.
df.info()

# Class balance, as absolute counts and as percentages.
print("\nLabel distribution:")
print(df['Labels'].value_counts())
print("\nLabel distribution (%):")
print(df['Labels'].value_counts(normalize=True) * 100)


In [None]:
# Initialize predictor and load data
# The predictor is pointed at the same CSV path that was read into `df` for
# exploration; every later cell works through this `predictor` object.
predictor = TFBindingPredictor(data_path=data_path)
# NOTE(review): load_data() presumably reads the file and prepares the
# train/test inputs inside the predictor -- confirm in tf_binding_prediction.
predictor.load_data()


## 2. Train Models and Evaluate


In [None]:
# Hyperparameters for each model family, gathered in one place for tuning.
neural_net_params = {'epochs': 50, 'batch_size': 32}
random_forest_params = {'n_estimators': 100, 'max_depth': 20}
xgboost_params = {'n_estimators': 100, 'max_depth': 6}

# Fit the two neural networks with shared settings, then the classical models.
predictor.train_cnn(**neural_net_params)
predictor.train_lstm(**neural_net_params)
predictor.train_random_forest(**random_forest_params)
predictor.train_xgboost(**xgboost_params)
predictor.train_svm()  # called with no arguments

# Evaluate every model through the predictor.
predictor.evaluate_all_models()


In [None]:
# Create comparison dataframe
# `predictor.results` is read as {model_name: {metric_key: value}}. The map
# below pairs each display column with the metric key this notebook relies on,
# so adding or renaming a metric is a one-line change.
metric_columns = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1',
    'ROC-AUC': 'roc_auc',
}

# One row per model, in the order the predictor stored them. Passing `columns`
# fixes the column order and keeps the headers even if no model was evaluated.
results_df = pd.DataFrame(
    [
        {'Model': model_name,
         **{column: scores[key] for column, key in metric_columns.items()}}
        for model_name, scores in predictor.results.items()
    ],
    columns=['Model', *metric_columns],
)

print("\nModel Comparison:")
print(results_df.to_string(index=False))


In [None]:
# Plot the training history of the CNN and the LSTM, in that order.
for network_name in ('cnn', 'lstm'):
    predictor.plot_training_history(network_name)

# Comparison plots, each produced by a single predictor call.
predictor.plot_confusion_matrices()
predictor.plot_roc_curves()

# Hand the output directory (relative to the notebook's working directory)
# to the predictor for saving.
models_dir = '../../models'
predictor.save_models(directory=models_dir)
print("\nAll models and results have been saved!")
