In [4]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random

def one_hot_pad(sequences, max_len=1000, alphabet="ACGT-"):
    char_to_index = {c: i for i, c in enumerate(alphabet)}
    X = np.zeros((len(sequences), max_len, len(alphabet)), dtype=np.float32)
    for i, seq in enumerate(sequences):
        for j, c in enumerate(seq[:max_len]):
            X[i, j, char_to_index[c]] = 1.0
    return X

def generate_random_sequences(n_sequences=10000, max_len=1000, alphabet="ACGT-"):
    sequences = []
    for _ in range(n_sequences):
        L = random.randint(1, max_len)
        seq = ''.join(random.choices(alphabet, k=L))
        sequences.append(seq)
    return sequences
    
def one_hot_encode(sequences, alphabet="ACGT-"):
    char_to_index = {c: i for i, c in enumerate(alphabet)}
    max_len = max(len(seq) for seq in sequences)
    X = np.zeros((len(sequences), max_len, len(alphabet)), dtype=np.float32)
    for i, seq in enumerate(sequences):
        for j, c in enumerate(seq):
            X[i, j, char_to_index[c]] = 1.0
    return X

class ConvSeqAutoencoder(nn.Module):
    def __init__(self, seq_len=1000, alphabet_size=5, latent_dim=32):
        super().__init__()
        
        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv1d(alphabet_size, 32, kernel_size=9, stride=3, padding=4),
            nn.ReLU(),
            nn.Conv1d(32, 64, kernel_size=9, stride=3, padding=4),
            nn.ReLU(),
            nn.Conv1d(64, 128, kernel_size=9, stride=3, padding=4),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),   # komprimiert auf Länge 1
        )
        self.fc_mu = nn.Linear(128, latent_dim)
        
        # Decoder (vereinfacht)
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.Linear(128, seq_len * alphabet_size),
            nn.Sigmoid()
        )
        self.seq_len = seq_len
        self.alphabet_size = alphabet_size
        
    def forward(self, x):
        # Eingabe: (B, L, A)
        x = x.permute(0, 2, 1)       # -> (B, A, L)
        encoded = self.encoder(x).squeeze(-1)
        z = self.fc_mu(encoded)
        decoded = self.decoder(z)
        decoded = decoded.view(-1, self.seq_len, self.alphabet_size)
        return decoded, z
        

In [2]:
# Beispiel-Daten
seqs = ['TTGT', 'ATC-', 'TTGC', 'TTGC', '----']
X = one_hot_encode(seqs)
X = torch.tensor(X.reshape(len(seqs), -1))  # Flatten

model = SequenceAutoencoder(seq_len=X.shape[1] // 5, latent_dim=8)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

for epoch in range(100):
    optimizer.zero_grad()
    recon, z = model(X)
    loss = criterion(recon, X)
    loss.backward()
    optimizer.step()
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, loss={loss.item():.4f}")

with torch.no_grad():
    embeddings = model.encoder(X)
print(embeddings)
print(embeddings.shape)  # (Anzahl Sequenzen, latent_dim)


Epoch 0, loss=0.2492
Epoch 100, loss=0.0000
Epoch 200, loss=0.0000
Epoch 300, loss=0.0000
Epoch 400, loss=0.0000
tensor([[ 4.9679, -3.5180,  0.2629, -4.2421, -3.0753,  3.1379,  3.5786,  4.1005],
        [ 1.6593, -4.9951, -0.7777, -1.4683,  2.7091,  0.4280,  4.6903,  5.1249],
        [ 5.5396, -0.0456,  2.2221, -4.3101, -6.6020,  6.5573,  2.0966,  2.3053],
        [ 5.5396, -0.0456,  2.2221, -4.3101, -6.6020,  6.5573,  2.0966,  2.3053],
        [-0.7830, -4.2424, -5.3825,  0.0084,  5.2634, -4.0651,  3.3325,  6.6650]])
torch.Size([5, 8])


In [5]:
# Daten erzeugen
seqs = generate_random_sequences(n_sequences=20000, max_len=1000)
X = one_hot_pad(seqs)
X = torch.tensor(X)

# Modell, Optimizer, Loss
model = ConvSeqAutoencoder(seq_len=1000, latent_dim=32)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

# Training
batch_size = 64
n_epochs = 5

for epoch in range(n_epochs):
    perm = torch.randperm(X.size(0))
    total_loss = 0
    for i in range(0, X.size(0), batch_size):
        batch = X[perm[i:i+batch_size]]
        recon, z = model(batch)
        loss = criterion(recon, batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, loss={total_loss / (X.size(0)/batch_size):.4f}")


Epoch 1, loss=0.1052
Epoch 2, loss=0.0845
Epoch 3, loss=0.0814
Epoch 4, loss=0.0811
Epoch 5, loss=0.0810
Epoch 6, loss=0.0809
Epoch 7, loss=0.0808
Epoch 8, loss=0.0808
Epoch 9, loss=0.0807
Epoch 10, loss=0.0807


In [8]:
# Auswertung: Korrelation und Scatterplot
import numpy as np
import torch
from scipy.stats import pearsonr, spearmanr
import matplotlib.pyplot as plt
from itertools import combinations
from random import sample

def hamming_seq(a, b):
    # a,b als strings gleicher Länge (hier padded)
    return sum(x != y for x, y in zip(a, b))

def evaluate_embeddings(seqs, embeddings, n_pairs=1000):
    # seqs: list of strings (alle gleiche Länge)
    # embeddings: numpy array (N, D)
    N = len(seqs)
    pairs = [tuple(sample(range(N), 2)) for _ in range(n_pairs)]
    hammings = []
    edists = []
    for i, j in pairs:
        hamm = hamming_seq(seqs[i], seqs[j])
        ed = np.linalg.norm(embeddings[i] - embeddings[j])
        hammings.append(hamm)
        edists.append(ed)
    hammings = np.array(hammings)
    edists = np.array(edists)
    print("Pearson r:", pearsonr(hammings, edists)[0])
    print("Spearman rho:", spearmanr(hammings, edists)[0])
    # Scatter
    plt.figure(figsize=(6,4))
    plt.scatter(hammings, edists, s=8, alpha=0.6)
    plt.xlabel("Hamming-Distanz")
    plt.ylabel("Embedding (Euklid) Distanz")
    plt.title("Hamming vs. Embedding Distanz")
    plt.grid(True)
    plt.show()
    return hammings, edists

import torch.nn.functional as F

# Beispiel Triplet-Loss-Trainingsstep (pseudocode)
margin = 1.0
optimizer.zero_grad()
# x_a, x_p, x_n: Batches (B, L, A) tensors
_, z_a = model(x_a)
_, z_p = model(x_p)
_, z_n = model(x_n)
loss_triplet = F.triplet_margin_loss(z_a, z_p, z_n, margin=margin)
# optional: kombiniere mit Rekonstruktionsloss
loss_recon = criterion(model(x_a)[0], x_a)
loss = loss_recon + 0.5 * loss_triplet
loss.backward()
optimizer.step()


NameError: name 'x_a' is not defined