# Learning Methylated Sequence to Dyad Relationship
Train a small neural network to learn the mapping from methylated sequences (length 6 by default, written in a 6-letter alphabet encoded 0-5) to normalized dyad positions.

In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt

In [None]:
# Define a simple model
class DyadPredictor(nn.Module):
    """Small convolutional regressor from an integer-encoded sequence to one value in (0, 1).

    Pipeline: embedding -> Conv1d over the sequence axis -> flatten -> two-layer
    MLP -> sigmoid.

    Args:
        seq_len: Number of positions in every input sequence. The first linear
            layer is sized as ``hidden_dim * seq_len``, so inputs must have
            exactly this length.
        hidden_dim: Number of output channels of the convolution.
        vocab_size: Number of distinct symbols; inputs must hold integer codes
            in ``[0, vocab_size)``.
        embed_dim: Width of each symbol embedding (also the convolution's
            input channel count).
    """

    def __init__(self, seq_len=6, hidden_dim=32, vocab_size=6, embed_dim=8):
        super().__init__()
        # One learned vector per symbol of the alphabet
        self.embed = nn.Embedding(vocab_size, embed_dim)

        # kernel_size=3 with padding=1 keeps the sequence length unchanged
        self.conv = nn.Conv1d(embed_dim, hidden_dim, kernel_size=3, padding=1)
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * seq_len, 32),
            nn.ReLU(),
            nn.Linear(32, 1),  # Single output: dyad position (0-1 normalized)
        )

    def forward(self, x):
        """Return predictions of shape (batch, 1) for integer codes ``x`` of shape (batch, seq_len)."""
        x = self.embed(x)  # (batch, seq_len) -> (batch, seq_len, embed_dim)
        x = x.transpose(1, 2)  # Conv1d wants channels first: (batch, embed_dim, seq_len)
        x = self.conv(x)  # (batch, hidden_dim, seq_len)
        x = x.flatten(1)  # (batch, hidden_dim * seq_len)
        x = self.fc(x)  # (batch, 1)
        return torch.sigmoid(x)  # Squash to the 0-1 range of the normalized target

In [None]:
# Prepare training data
# Example: map 6-letter sequence to dyad position
# Replace with your actual data!

def generate_synthetic_data(n_samples=1000, seq_len=6, alphabet_size=6, seed=None):
    """Generate synthetic methylated sequences and corresponding dyads.

    Args:
        n_samples: Number of sequences to draw.
        seq_len: Number of letters per sequence.
        alphabet_size: Number of distinct letters; codes are drawn uniformly
            from ``0 .. alphabet_size - 1``.
        seed: Optional seed for a private random generator. When ``None`` the
            draw uses NumPy's global random state, so ``np.random.seed`` keeps
            working as before.

    Returns:
        A ``(sequences, dyads)`` tuple: an integer array of shape
        ``(n_samples, seq_len)`` and a float array of shape ``(n_samples,)``
        with values in ``[0, 1)``.
    """
    shape = (n_samples, seq_len)
    if seed is None:
        sequences = np.random.randint(0, alphabet_size, shape)
    else:
        sequences = np.random.default_rng(seed).integers(0, alphabet_size, shape)

    # Simple rule: dyad position is the letter sum scaled into [0, 1).
    # The largest possible sum is (alphabet_size - 1) * seq_len, so dividing by
    # alphabet_size * seq_len stays below 1 for every sequence length.
    # max(..., 1) avoids a 0/0 when seq_len is 0.
    scale = max(alphabet_size * seq_len, 1)
    dyads = sequences.sum(axis=1) / scale

    return sequences, dyads

# Draw the synthetic dataset, then wrap it as tensors for mini-batch training.
X, y = generate_synthetic_data()

# Integer codes feed the embedding layer; targets become a float column vector.
X_tensor = torch.tensor(X, dtype=torch.long)
y_tensor = torch.tensor(y, dtype=torch.float32).reshape(-1, 1)

dataset = TensorDataset(X_tensor, y_tensor)
train_loader = DataLoader(dataset=dataset, shuffle=True, batch_size=32)

In [None]:
# Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DyadPredictor().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

epochs = 50
losses = []  # mean per-sample training loss, one entry per epoch

# A freshly built module is already in training mode; stated explicitly so the
# loop stays correct if model construction is ever moved out of this cell.
model.train()

for epoch in range(epochs):
    epoch_loss = 0.0  # sum of per-sample losses seen this epoch
    n_seen = 0        # number of samples seen this epoch
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)

        optimizer.zero_grad()
        pred = model(batch_X)
        loss = criterion(pred, batch_y)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so a
        # short final batch does not count as much as a full one.
        batch_size = batch_X.size(0)
        epoch_loss += loss.item() * batch_size
        n_seen += batch_size

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError
    avg_loss = epoch_loss / max(n_seen, 1)
    losses.append(avg_loss)
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

# Plot the per-epoch loss curve
plt.figure(figsize=(8, 4))
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.show()

In [None]:
# Run one hand-written sequence through the trained network
model.eval()  # switch to inference mode before predicting
with torch.no_grad():  # no gradients needed for a forward-only pass
    test_seq = torch.tensor([[0, 1, 2, 3, 4, 5]], dtype=torch.long).to(device)
    pred = model(test_seq)

    # Bring both values back to the host before formatting them
    seq_as_array = test_seq.cpu().numpy()
    pred_value = pred.cpu().item()
    print(f"Test sequence: {seq_as_array}")
    print(f"Predicted dyad position (normalized): {pred_value:.4f}")

## To use with your data:
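1. Provide your `methylated_sequence` data: sequences over a 6-letter alphabet, integer-encoded 0-5 (length 6 by default)
2. Provide the corresponding `dyads` positions, normalized to the range 0-1 (the model ends in a sigmoid, so raw indices must be rescaled first)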
3. Replace `generate_synthetic_data()` with your actual data
4. Adjust model architecture if needed