## 🧬 One-Hot Encoded CNN for TFBS Detection (Optuna-CNN-One-Hot)

This notebook focuses on building a CNN classifier for identifying transcription factor binding sites (TFBS) from raw DNA sequences using one-hot encoding. Each nucleotide (A, C, G, T) is represented as a binary vector, and sequences are transformed into 4×N matrices suitable for convolutional processing.

A CNN is constructed to learn spatial features over the one-hot matrix, with its architecture (layer count, filter width, dropout rate, etc.) optimized using Optuna. The objective is to maximize validation accuracy through automated trials, identifying the most performant configuration of the network.

The final model makes accurate binary predictions about whether a given DNA sequence contains a TFBS, working directly from the raw sequence rather than from handcrafted features.


In [None]:
import pandas as pd
import numpy as np
import torch
import optuna

import sys

sys.path.append("../utils")

from initialize_results_df import initialize_results_df
from load_sequence_data import load_sequence_data
from one_hot import one_hot_encode_sequences
from data_loader_one_hot import prepare_dataloaders
from optuna_cnn_one_hot_utils import *

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Dataset root and results workbook. Forward slashes are used because they
# resolve on Windows, Linux and macOS alike, whereas the escaped-backslash
# form only works on Windows.
data_dir = "../Data"
excel_dir = "../Outputs/excel_results.xlsx"

results_df, excel_df = initialize_results_df(data_dir, excel_dir)

# The hyperparameter search runs on the first dataset listed in results_df.
# .iloc[0] selects the first row by position, independent of index labels.
train_df = load_sequence_data(results_df["train_path"].iloc[0])
test_df = load_sequence_data(results_df["test_path"].iloc[0])

# One-hot encode, then swap the last two axes.
# NOTE(review): presumably (samples, length, 4) -> (samples, 4, length) so the
# four nucleotide channels sit where Conv1d expects them — confirm against
# one_hot_encode_sequences.
X = one_hot_encode_sequences(train_df["sequence"])
X = X.transpose(0, 2, 1)
y = train_df["label"]

X_test = one_hot_encode_sequences(test_df["sequence"])
X_test = X_test.transpose(0, 2, 1)
y_test = test_df["label"]

# Training / validation loaders built from the training data only.
train_loader, valid_loader = prepare_dataloaders(X, y)

In [None]:
# Search space handed to run_optuna_pipeline. Apart from "num_layers" and
# "lr", the trial log shows each entry being sampled once per layer
# (units_0, units_1, ...), so every layer can get its own configuration.
search_space = {
    "num_layers": {"type": "int", "low": 4, "high": 15},
    "lr": {"type": "float", "low": 1e-4, "high": 1e-2, "log": True},
    "units": {"type": "categorical", "choices": [32, 64, 128]},
    "kernel_size": {"type": "categorical", "choices": [3, 5]},
    "activation": {"type": "categorical", "choices": ["relu", "tanh"]},
    "pool_size": {"type": "categorical", "choices": [1, 2]},
    "dropout": {"type": "float", "low": 0.0, "high": 0.5},
    "dilation": {"type": "categorical", "choices": [1, 2]},
}


# Run the study. The device string comes from DEVICE ("cuda" or "cpu") instead
# of a hard-coded "cuda", so the cell also runs on machines without a GPU.
best_model, best_params, metrics, study = run_optuna_pipeline(
    train_loader,
    valid_loader,
    device=DEVICE.type,
    input_len=101,
    epochs=10,
    n_trials=15,
    save_path="../Models/CNN_OH.pt",
    search_space=search_space,
)

print(best_params)
print(metrics)

[I 2025-05-04 03:14:02,834] A new study created in memory with name: no-name-b055eb4e-d8a6-4031-b37e-06d8716c7da9
[I 2025-05-04 03:14:38,656] Trial 0 finished with value: 0.9688606492862704 and parameters: {'num_layers': 4, 'lr': 0.00960716954798707, 'units_0': 128, 'kernel_size_0': 3, 'activation_0': 'tanh', 'pool_size_0': 2, 'dropout_0': 0.24461575242767664, 'dilation_0': 1, 'units_1': 32, 'kernel_size_1': 3, 'activation_1': 'relu', 'pool_size_1': 1, 'dropout_1': 0.32524137844098155, 'dilation_1': 2, 'units_2': 32, 'kernel_size_2': 5, 'activation_2': 'tanh', 'pool_size_2': 2, 'dropout_2': 0.3502093073976296, 'dilation_2': 1, 'units_3': 32, 'kernel_size_3': 5, 'activation_3': 'relu', 'pool_size_3': 1, 'dropout_3': 0.3804891375690817, 'dilation_3': 1}. Best is trial 0 with value: 0.9688606492862704.
[I 2025-05-04 03:16:06,131] Trial 1 finished with value: 0.9753124320948446 and parameters: {'num_layers': 15, 'lr': 0.0023491455832738714, 'units_0': 64, 'kernel_size_0': 3, 'activation_0'

{'num_layers': 12, 'lr': 0.0016999759841156564, 'units_0': 64, 'kernel_size_0': 3, 'activation_0': 'relu', 'pool_size_0': 1, 'dropout_0': 0.1250627613377461, 'dilation_0': 2, 'units_1': 128, 'kernel_size_1': 5, 'activation_1': 'tanh', 'pool_size_1': 1, 'dropout_1': 0.12632194237159514, 'dilation_1': 1, 'units_2': 128, 'kernel_size_2': 3, 'activation_2': 'relu', 'pool_size_2': 1, 'dropout_2': 0.058325552369733163, 'dilation_2': 1, 'units_3': 128, 'kernel_size_3': 5, 'activation_3': 'relu', 'pool_size_3': 1, 'dropout_3': 0.1862313569969462, 'dilation_3': 1, 'units_4': 32, 'kernel_size_4': 5, 'activation_4': 'tanh', 'pool_size_4': 2, 'dropout_4': 0.4080685576288325, 'dilation_4': 2, 'units_5': 64, 'kernel_size_5': 5, 'activation_5': 'tanh', 'pool_size_5': 2, 'dropout_5': 0.15230852595425645, 'dilation_5': 1, 'units_6': 64, 'kernel_size_6': 5, 'activation_6': 'tanh', 'pool_size_6': 2, 'dropout_6': 0.03052722575414722, 'dilation_6': 2, 'units_7': 128, 'kernel_size_7': 5, 'activation_7': 'ta

In [None]:
# Persist the best hyperparameters found by the study as JSON so the same
# architecture can be rebuilt later from this file.
import json

best_hp_json = json.dumps(study.best_params)

with open("../Models/CNN_OH.json", "w") as hp_file:
    hp_file.write(best_hp_json)

In [None]:
# import optuna.visualization as vis

# # Optimization progress
# vis.plot_optimization_history(study).show()

# # Param importance (which hp mattered most?)
# vis.plot_param_importances(study).show()

# # Parallel coordinates (interactions between hparams)
# vis.plot_parallel_coordinate(study).show()

# LOOPING THROUGH FOLDERS

In [None]:
import pandas as pd
import numpy as np
import torch
import optuna
import json

import sys

sys.path.append("../utils")

from initialize_results_df import initialize_results_df
from load_sequence_data import load_sequence_data
from one_hot import one_hot_encode_sequences
from data_loader_one_hot import prepare_dataloaders
from optuna_cnn_one_hot_utils import *

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Read the tuned hyperparameters back from their JSON file.
with open("../Models/CNN_OH.json", "r") as hp_file:
    hp = json.loads(hp_file.read())

In [None]:
# Choose the compute device first so the checkpoint can be mapped onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the architecture from the saved hyperparameters.
input_len = 101  # sequence length; same value that was passed to the Optuna run
model = DynamicCNN(hp, input_len=input_len)

# Load the saved state_dict.
# - map_location lets a checkpoint written on a GPU load on a CPU-only machine.
# - weights_only=True restricts unpickling to tensors and plain containers,
#   which is all a state_dict needs.
state_dict = torch.load(
    "../Models/CNN_OH.pt", map_location=device, weights_only=True
)
model.load_state_dict(state_dict)

# Move the model to the selected device.
model.to(device)

print("✅ Model loaded and ready!")

✅ Model loaded and ready!


In [None]:
# Dataset root and the workbook that collects the per-folder metrics.
data_dir, excel_path = "../Data", "../Outputs/50_CNN_OH.xlsx"

# results_df lists the dataset folders; excel_df is the table the metrics go into.
results_df, excel_df = initialize_results_df(data_dir, excel_path)

In [None]:
# Fine-tuning settings, named so they are easy to find and adjust.
MAX_FOLDERS = 50  # number of dataset folders to fine-tune on
BATCH_SIZE = 32
FINE_TUNE_EPOCHS = 5

# Index labels of the folders that were processed. Used after the loop to
# select exactly those rows for export, instead of relying on the leaked loop
# variable being a valid positional count.
processed_idx = []

# The same model object is trained on every folder in turn, so fine-tuning
# accumulates across folders.
for idx, row in results_df.iloc[:MAX_FOLDERS].iterrows():
    train_path = row["train_path"]
    test_path = row["test_path"]
    folder_name = row["folder_name"]

    print(f"🔄 Processing {folder_name}")

    # --- Load training and test data ---
    train_df = load_sequence_data(train_path)
    test_df = load_sequence_data(test_path)

    # --- One-hot encode (last two axes swapped after encoding) ---
    X_train = one_hot_encode_sequences(train_df["sequence"])
    X_train = X_train.transpose(0, 2, 1)
    y_train = train_df["label"]

    X_test = one_hot_encode_sequences(test_df["sequence"])
    X_test = X_test.transpose(0, 2, 1)
    y_test = test_df["label"]

    # --- Prepare loaders ---
    train_loader, valid_loader = prepare_dataloaders(
        X_train, y_train, batch_size=BATCH_SIZE
    )
    # NOTE(review): prepare_dataloaders returns two loaders and only the first
    # is kept here. If it performs a train/validation split, the test metrics
    # below cover only part of the test set — confirm.
    test_loader, _ = prepare_dataloaders(X_test, y_test, batch_size=BATCH_SIZE)

    # --- Fine-tune model ---
    model = train_model(
        model, train_loader, valid_loader, device, hp, epochs=FINE_TUNE_EPOCHS
    )

    # --- Evaluate ---
    train_metrics = evaluate_model(model, train_loader, device)
    test_metrics = evaluate_model(model, test_loader, device)

    print(
        f"✅ {folder_name}: train_acc={train_metrics['accuracy']:.4f}, test_acc={test_metrics['accuracy']:.4f}"
    )

    # --- Store metrics ---
    # NOTE(review): the "pr-roc" column receives the ROC-AUC value; the column
    # name is left unchanged because it presumably matches the workbook's
    # existing columns — confirm the intended name.
    excel_df.at[idx, "folder_name"] = folder_name
    excel_df.at[idx, "train_accuracy"] = train_metrics["accuracy"]
    excel_df.at[idx, "test_accuracy"] = test_metrics["accuracy"]
    excel_df.at[idx, "pr-roc"] = test_metrics["roc_auc"]
    excel_df.at[idx, "pr-auc"] = test_metrics["average_precision"]

    processed_idx.append(idx)

if processed_idx:
    # ✅ Save the rows that were filled in, selected by index label.
    excel_df.loc[processed_idx].to_excel(excel_path, index=False)
    print(f"✅ Metrics saved to {excel_path}")

    # ✅ Save the final fine-tuned model
    torch.save(
        model.state_dict(), "../Models/50_CNN_OH.pt"
    )  #  <---  Fine tuned model after 50 folders.
    print("✅ Final fine-tuned model saved to Models/50_CNN_OH.pt")
else:
    # Nothing was iterated, so there are no metrics and no fine-tuned weights.
    print("⚠️ No folders found in results_df; nothing was saved.")

# USER INPUT

In [None]:
import pandas as pd
import numpy as np
import torch
import optuna
import json

import sys

sys.path.append("../utils")

from initialize_results_df import initialize_results_df
from load_sequence_data import load_sequence_data
from one_hot import one_hot_encode_sequences
from data_loader_one_hot import prepare_dataloaders
from optuna_cnn_one_hot_utils import *

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Hyperparameters (architecture) and fine-tuned weights saved by earlier sections.
hp_path = "../Models/CNN_OH.json"
model_path = "../Models/50_CNN_OH.pt"

with open(hp_path, "r") as f:
    hp = json.load(f)

device = "cuda" if torch.cuda.is_available() else "cpu"

# Rebuild the network, then load the weights onto the CPU first so the
# checkpoint also opens on machines without a GPU. weights_only=True restricts
# unpickling to tensors and plain containers, which is all a state_dict needs.
model = DynamicCNN(hp, input_len=101)
model.load_state_dict(
    torch.load(model_path, map_location="cpu", weights_only=True)
)
model.to(device)

# The network contains Dropout and BatchNorm layers, which behave differently
# in training mode; switch to evaluation mode before running predictions.
# eval() returns the module, so the cell still displays the architecture.
model.eval()

  model.load_state_dict(torch.load(model_path, map_location='cpu'))


DynamicCNN(
  (feature_extractor): Sequential(
    (0): Conv1d(4, 64, kernel_size=(3,), stride=(1,), padding=same, dilation=(2,))
    (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool1d(kernel_size=1, stride=1, padding=0, dilation=1, ceil_mode=False)
    (4): Dropout(p=0.1250627613377461, inplace=False)
    (5): Conv1d(64, 128, kernel_size=(5,), stride=(1,), padding=same)
    (6): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Tanh()
    (8): MaxPool1d(kernel_size=1, stride=1, padding=0, dilation=1, ceil_mode=False)
    (9): Dropout(p=0.12632194237159514, inplace=False)
    (10): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=same)
    (11): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU()
    (13): MaxPool1d(kernel_size=1, stride=1, padding=0, dilation=1, ceil_mode=False)
    (14): Dropout(p=0.058325552369733163, inpla

In [None]:
# Ask for a sequence and classify it with the loaded model. The device chosen
# when the model was loaded is reused instead of a hard-coded "cuda", so the
# prediction also runs on CPU-only machines.
sequence = input("Enter a DNA sequence (length = 101): ").strip()
label, confidence = predict_onehot_sequence(
    model, sequence, device=device, seq_len=101
)
print(f"Prediction: {label} (Confidence: {confidence}%)")

Prediction: TFBS (Confidence: 73.11%)
