In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import torch
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Dataset

In [None]:
# Directory holding the challenge CSV files. Keeping it in one place means a
# single edit is enough when the notebook runs outside the original session.
DATA_DIR = '/content/dataa'

# Raw sequence tables (train / test) and the training labels.
Xtr = pd.read_csv(f'{DATA_DIR}/Xtr.csv')
Ytr = pd.read_csv(f'{DATA_DIR}/Ytr.csv')
Xte = pd.read_csv(f'{DATA_DIR}/Xte.csv')

# Precomputed numeric feature matrices: whitespace-separated, no header row.
# The separator is a raw string so that `\s` reaches the regex engine verbatim
# instead of being parsed as an (invalid) Python string escape sequence.
Xte_mat = pd.read_csv(f'{DATA_DIR}/Xte_mat100.csv', sep=r'\s+', header=None)
Xtr_mat = pd.read_csv(f'{DATA_DIR}/Xtr_mat100.csv', sep=r'\s+', header=None)
Xtr.head()

Unnamed: 0,Id,seq
0,0,GAGGGGCTGGGGAGGGGGCTGGCCCAGAGGCACCAGACTCTGCAGA...
1,1,CGGCCTGGGGGCCACATGTGAGTGCTTACCTGTGTGGGGATGAGGG...
2,2,GACAACGCCGCTGTCAGCCGCCTTCGACTCACCTGGGAGGTGATGA...
3,3,GCCTCCCTTGGCACCACGGGAGACCAGTTTTGGAGGGGCGGGGCTG...
4,4,GCACTACTACACCCATTGCTGTAATAGTAAGTGCCGGTGCCTTCAC...


## Models: CNN+KMeans fusion and CNN-only

In [None]:
# One-hot encoding function
def one_hot_encode_seq(seq, maxlen=101):
    """One-hot encode a nucleotide string into a ``(4, maxlen)`` float32 array.

    Rows follow the order A, C, G, T and column ``i`` describes ``seq[i]``.

    Args:
        seq: string of nucleotide characters.
        maxlen: number of columns in the returned array.

    Returns:
        ``numpy.ndarray`` of shape ``(4, maxlen)`` and dtype ``float32``.
        Characters outside ``ACGT`` leave their column all-zero, sequences
        shorter than ``maxlen`` are zero-padded on the right, and sequences
        longer than ``maxlen`` are truncated to their first ``maxlen``
        characters.
    """
    mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    one_hot = np.zeros((4, maxlen), dtype=np.float32)
    # Slice to maxlen so an over-long sequence cannot index past the last column.
    for i, char in enumerate(seq[:maxlen]):
        row = mapping.get(char)
        if row is not None:
            one_hot[row, i] = 1.0
    return one_hot

# Encode every sequence with the default maxlen and stack the per-sequence
# matrices into one array per split (first axis = sample).
X_seq = np.stack(list(map(one_hot_encode_seq, Xtr["seq"])))
X_test_seq = np.stack(list(map(one_hot_encode_seq, Xte["seq"])))

# Labels and precomputed feature matrices as float32 tensors.
y_tensor = torch.tensor(Ytr["Bound"].to_numpy(), dtype=torch.float32)
X_cluster_tensor = torch.tensor(Xtr_mat.to_numpy(), dtype=torch.float32)
X_test_cluster_tensor = torch.tensor(Xte_mat.to_numpy(), dtype=torch.float32)

# One-hot encoded sequences as float32 tensors.
X_seq_tensor = torch.tensor(X_seq, dtype=torch.float32)
X_test_seq_tensor = torch.tensor(X_test_seq, dtype=torch.float32)

# Quick sanity report on the encoded shapes.
print("All data loaded and encoded.")
for shape_line in (
    f"Train sequences shape: {X_seq_tensor.shape}",
    f"Test sequences shape:  {X_test_seq_tensor.shape}",
):
    print(shape_line)

All data loaded and encoded.
Train sequences shape: torch.Size([2000, 4, 101])
Test sequences shape:  torch.Size([1000, 4, 101])


In [None]:
# Standardize the KMeans features and build the train/validation split

from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, TensorDataset, DataLoader

# Split FIRST so the scaler statistics come from the training rows only.
# Fitting the scaler on every row before splitting would let the validation
# rows influence the mean/variance used to preprocess the training data.
# The three arrays are split with one call, so they stay row-aligned.
X_seq_train, X_seq_val, X_clust_train_raw, X_clust_val_raw, y_train, y_val = train_test_split(
    X_seq_tensor, Xtr_mat.values, y_tensor, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_clust_train = torch.tensor(scaler.fit_transform(X_clust_train_raw), dtype=torch.float32)
X_clust_val = torch.tensor(scaler.transform(X_clust_val_raw), dtype=torch.float32)

# Full train matrix and test matrix, transformed with the same train-fitted scaler.
X_cluster = scaler.transform(Xtr_mat.values)
X_test_cluster = scaler.transform(Xte_mat.values)

X_cluster_tensor = torch.tensor(X_cluster, dtype=torch.float32)
X_test_cluster_tensor = torch.tensor(X_test_cluster, dtype=torch.float32)

# Dataset for CNN+KMeans
class DualInputDataset(Dataset):
    """Dataset pairing each sequence with its cluster features and its label.

    Indexing returns a ``(sequence, cluster_features, label)`` tuple taken from
    the same position of the three containers supplied at construction time.
    """

    def __init__(self, seq_tensor, clust_tensor, labels):
        # The containers are stored as given: no copy and no length validation.
        self.seq, self.clust, self.labels = seq_tensor, clust_tensor, labels

    def __len__(self):
        """Number of samples, as defined by the label container."""
        n_samples = len(self.labels)
        return n_samples

    def __getitem__(self, idx):
        """Return entry ``idx`` of every container, in (seq, clust, label) order."""
        fields = (self.seq, self.clust, self.labels)
        return tuple(field[idx] for field in fields)

# Wrap each split in a dataset, then batch it 32 samples at a time.
# Only the training loader reshuffles; validation keeps its original order.
train_dataset = DualInputDataset(
    seq_tensor=X_seq_train, clust_tensor=X_clust_train, labels=y_train)
val_dataset = DualInputDataset(
    seq_tensor=X_seq_val, clust_tensor=X_clust_val, labels=y_val)

train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

# CNN+KMeans model
class CNNKMeansFusion(nn.Module):
    """Two-branch binary classifier fusing a sequence CNN with cluster features.

    The CNN branch maps a 4-channel sequence tensor to 256 features (global max
    pooling over the length axis, so any sequence length is accepted). These
    are concatenated with the cluster-feature vector and passed through an MLP
    ending in a sigmoid.

    Args:
        n_cluster_features: width of the cluster-feature vector that is
            concatenated with the 256 CNN features. Defaults to 100, the value
            that used to be hard-coded.
    """

    def __init__(self, n_cluster_features=100):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv1d(4, 128, kernel_size=9, padding=4),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Conv1d(128, 256, kernel_size=15, padding=7),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),  # collapse the length axis to a single value per channel
            nn.Flatten()
        )
        self.mlp = nn.Sequential(
            # Input width = 256 pooled CNN channels + cluster features.
            nn.Linear(256 + n_cluster_features, 256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, x_seq, x_clust):
        """Return per-sample probabilities of shape ``(batch, 1)``.

        Args:
            x_seq: tensor of shape ``(batch, 4, length)``.
            x_clust: tensor of shape ``(batch, n_cluster_features)``.
        """
        x_cnn = self.cnn(x_seq)
        x_concat = torch.cat([x_cnn, x_clust], dim=1)
        return self.mlp(x_concat)

In [None]:
# CNN-only model
class CNNOnly(nn.Module):
    """Sequence-only binary classifier.

    Two 1-D convolutions over a 4-channel input, global max pooling over the
    length axis, then a small dense head ending in a sigmoid. Everything lives
    in a single ``nn.Sequential`` stored as ``self.net``.
    """

    def __init__(self):
        super().__init__()
        # Convolutional feature extractor -> 256 pooled features per sample.
        conv_stage = [
            nn.Conv1d(4, 128, kernel_size=7, padding=3),
            nn.ReLU(),
            nn.Conv1d(128, 256, kernel_size=11, padding=5),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),
            nn.Flatten(),
        ]
        # Dense head -> one probability per sample.
        dense_stage = [
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*conv_stage, *dense_stage)

    def forward(self, x):
        """Map a ``(batch, 4, length)`` tensor to ``(batch, 1)`` probabilities."""
        probabilities = self.net(x)
        return probabilities

# Seed PyTorch's global RNG so that weight initialisation, DataLoader
# shuffling and dropout repeat from run to run; without it the metrics printed
# by the training loop change on every execution.
torch.manual_seed(42)

model2 = CNNOnly()
# Plain binary cross-entropy on probabilities: the network already ends in a sigmoid.
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model2.parameters(), lr=0.003)
# Train the CNN-only model and report validation accuracy after every epoch
def compute_accuracy(preds, labels):
    """Fraction of samples whose thresholded prediction equals its label.

    Args:
        preds: tensor of predicted probabilities; values >= 0.5 count as class 1.
        labels: tensor of 0/1 targets holding the same number of elements.

    Returns:
        0-dim float tensor with the accuracy.
    """
    # Flatten both sides before comparing: an (N, 1) prediction tensor compared
    # with an (N,) label tensor would otherwise broadcast to (N, N) and inflate
    # the number of "correct" entries.
    preds = (preds.reshape(-1) >= 0.5).float()
    labels = labels.reshape(-1)
    correct = (preds == labels).float().sum()
    return correct / labels.numel()

for epoch in range(20):
    # --- Training ---
    model2.train()
    total_loss = 0.0
    for xb, _, yb in train_loader:
        # squeeze(1) removes only the output-unit axis. A bare squeeze() would
        # also remove the batch axis whenever a batch holds a single sample,
        # and BCELoss would then fail on the shape mismatch with yb.
        preds = model2(xb).squeeze(1)
        loss = loss_fn(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # --- Validation ---
    model2.eval()
    val_preds = []
    val_targets = []
    with torch.no_grad():
        for xb, _, yb in val_loader:
            # Keep predictions 1-D so torch.cat below works for any batch size.
            preds = model2(xb).squeeze(1)
            val_preds.append(preds)
            val_targets.append(yb)
    val_preds = torch.cat(val_preds)
    val_targets = torch.cat(val_targets)
    val_acc = compute_accuracy(val_preds, val_targets)

    # Mean training loss per batch and accuracy on the held-out split.
    print(f"Epoch {epoch+1:02d} - Loss: {total_loss/len(train_loader):.4f} - Val Accuracy: {val_acc:.4f}")

Epoch 01 - Loss: 0.6937 - Val Accuracy: 0.5450
Epoch 02 - Loss: 0.5972 - Val Accuracy: 0.6850
Epoch 03 - Loss: 0.4771 - Val Accuracy: 0.6575
Epoch 04 - Loss: 0.3728 - Val Accuracy: 0.6475
Epoch 05 - Loss: 0.2559 - Val Accuracy: 0.6725
Epoch 06 - Loss: 0.1198 - Val Accuracy: 0.7000
Epoch 07 - Loss: 0.0512 - Val Accuracy: 0.6700
Epoch 08 - Loss: 0.0454 - Val Accuracy: 0.6825
Epoch 09 - Loss: 0.0333 - Val Accuracy: 0.6650
Epoch 10 - Loss: 0.0057 - Val Accuracy: 0.6700
Epoch 11 - Loss: 0.0197 - Val Accuracy: 0.6825
Epoch 12 - Loss: 0.0162 - Val Accuracy: 0.7000
Epoch 13 - Loss: 0.0038 - Val Accuracy: 0.6950
Epoch 14 - Loss: 0.0019 - Val Accuracy: 0.6900
Epoch 15 - Loss: 0.0050 - Val Accuracy: 0.6850
Epoch 16 - Loss: 0.0130 - Val Accuracy: 0.6875
Epoch 17 - Loss: 0.0089 - Val Accuracy: 0.7000
Epoch 18 - Loss: 0.0745 - Val Accuracy: 0.6975
Epoch 19 - Loss: 0.0451 - Val Accuracy: 0.7000
Epoch 20 - Loss: 0.0744 - Val Accuracy: 0.6975


## Predictions

In [None]:
# predictions
# The fusion model must be trained before it can contribute to the ensemble:
# a freshly constructed CNNKMeansFusion only holds randomly initialised
# weights, and it carries most of the ensemble weight (alpha) below.
# NOTE(review): learning rate and epoch count are starting values -- tune them.
model1 = CNNKMeansFusion()
fusion_loss_fn = nn.BCELoss()
fusion_optimizer = torch.optim.Adam(model1.parameters(), lr=0.001)
best_fusion_val_acc = -1.0
for fusion_epoch in range(20):
    model1.train()
    fusion_total_loss = 0.0
    for seq_b, clust_b, y_b in train_loader:
        fusion_optimizer.zero_grad()
        fusion_loss = fusion_loss_fn(model1(seq_b, clust_b).squeeze(1), y_b)
        fusion_loss.backward()
        fusion_optimizer.step()
        fusion_total_loss += fusion_loss.item()

    # Validation accuracy decides which epoch's weights are kept on disk.
    model1.eval()
    n_correct, n_seen = 0, 0
    with torch.no_grad():
        for seq_b, clust_b, y_b in val_loader:
            hard_preds = (model1(seq_b, clust_b).squeeze(1) >= 0.5).float()
            n_correct += (hard_preds == y_b).sum().item()
            n_seen += y_b.numel()
    fusion_val_acc = n_correct / max(n_seen, 1)
    if fusion_val_acc > best_fusion_val_acc:
        best_fusion_val_acc = fusion_val_acc
        torch.save(model1.state_dict(), 'model.pth')

    print(f"Fusion epoch {fusion_epoch+1:02d} - Loss: {fusion_total_loss/max(len(train_loader), 1):.4f} - Val Accuracy: {fusion_val_acc:.4f}")

# Restore the checkpoint with the best validation accuracy.
model1.load_state_dict(torch.load('model.pth'))
model1.eval()
model2.eval()  # make sure dropout is disabled in the CNN-only model as well
with torch.no_grad():
    # squeeze(1) keeps the batch axis even if the test set holds one sample.
    preds1 = model1(X_test_seq_tensor, X_test_cluster_tensor).squeeze(1)
    preds2 = model2(X_test_seq_tensor).squeeze(1)
    # Weighted average of the two probability vectors; alpha weights the fusion model.
    alpha = 0.7
    avg_preds = (alpha * preds1 + (1 - alpha) * preds2).cpu().numpy()
    final_preds = (avg_preds >= 0.5).astype(int)

In [None]:
# Write the submission file (submission6.csv): a header line followed by one
# "<position>,<prediction>" row per entry of final_preds.
with open("submission6.csv", "w") as f:
    f.writelines([
        "Id,Bound\n",
        *(f"{i},{p}\n" for i, p in enumerate(final_preds)),
    ])