## 🧬 k-mer Embedding via Word2Vec for Genomic Sequences

This notebook demonstrates the use of Word2Vec to learn vector representations (embeddings) of k-mers from DNA sequences. K-mers are substrings of fixed length extracted from sequences using a sliding window, mimicking the way words are tokenized in NLP.

The skip-gram model of Word2Vec is used to train embeddings that capture the contextual similarity between k-mers based on their occurrence in biological sequences. The trained embedding vectors are then visualized using PCA to reveal patterns or clustering among similar k-mers.

These learned k-mer embeddings are biologically meaningful and can be used as input features in downstream classification models like CNNs or gradient-boosted trees. The notebook lays the foundation for transforming symbolic DNA sequences into dense, semantically rich numerical representations.


In [None]:
import sys

sys.path.append("../utils")
from k_mer_data_loader import prepare_kmer_loaders
from initialize_results_df import initialize_results_df
from load_sequence_data import load_sequence_data
from kmer_2_vec import (
    get_kmer_list,
    train_word2vec,
    build_vocab,
    build_embedding_matrix,
)
from gensim.models import Word2Vec
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import roc_auc_score
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from kmer_2_vec import SimpleCNN, train_and_evaluate


w2v_model = Word2Vec.load("../Models/kmer2vec_k_6_s_1.model")

In [14]:
# Training Word2Vec Model!

# file_path = "C:/Users/dhair/Desktop/Captstone-Proj/Data/Data_3/global_train.data"
# df = load_sequence_data(file_path)
# corpus = get_kmer_list(df['seq'].tolist(), k=6, stride=1)
# w2v_model = train_word2vec(corpus)
# w2v_model.save("../Outputs/kmer2vec_k_6_s_1.model")
# vocab = build_vocab(k=6)
# pretrained_embeddings = {kmer: w2v_model.wv[kmer] for kmer in w2v_model.wv.index_to_key}
# embedding_matrix = build_embedding_matrix(vocab, pretrained_embeddings, embedding_dim=128)

In [None]:
vocab = build_vocab(k=6)

pretrained_embeddings = {
    kmer: w2v_model.wv[kmer] for kmer in w2v_model.wv.index_to_key
}
embedding_matrix = build_embedding_matrix(
    vocab, pretrained_embeddings, embedding_dim=128
)

In [None]:
data_dir = "..\\Data"
excel_dir = "..\\Outputs\\excel_results.xlsx"

results_df, excel_df = initialize_results_df(data_dir, excel_dir)

train_df = load_sequence_data(results_df["train_path"][1])
test_df = load_sequence_data(results_df["test_path"][1])

train_loader, valid_loader, test_loader = prepare_kmer_loaders(
    train_df["sequence"].tolist(),
    train_df["label"].values,
    test_df["sequence"].tolist(),
    test_df["label"].values,
    vocab,
    k=6,
    stride=1,
    max_len=96,
    batch_size=32,
)

In [None]:
model = SimpleCNN(embedding_matrix, True).to(device)
trained_model, history = train_and_evaluate(
    model,
    train_loader,
    valid_loader,
    test_loader,
    device=device,
    epochs=10,
    lr=1e-3,
    weight_decay=1e-4,
)

🔄 Epoch 1/10 started...
📈 Epoch 1/10 | Train Loss: 0.7803 | Train Acc: 0.5466 | Val Loss: 0.6934 | Val Acc: 0.5522 | Val ROC-AUC: 0.6133
🔄 Epoch 2/10 started...
📈 Epoch 2/10 | Train Loss: 0.5885 | Train Acc: 0.6940 | Val Loss: 0.7072 | Val Acc: 0.5690 | Val ROC-AUC: 0.6378
🔄 Epoch 3/10 started...
📈 Epoch 3/10 | Train Loss: 0.4915 | Train Acc: 0.7738 | Val Loss: 0.6548 | Val Acc: 0.6381 | Val ROC-AUC: 0.6880
🔄 Epoch 4/10 started...
📈 Epoch 4/10 | Train Loss: 0.3994 | Train Acc: 0.8307 | Val Loss: 0.6623 | Val Acc: 0.6325 | Val ROC-AUC: 0.6893
🔄 Epoch 5/10 started...
📈 Epoch 5/10 | Train Loss: 0.3334 | Train Acc: 0.8708 | Val Loss: 0.6828 | Val Acc: 0.6343 | Val ROC-AUC: 0.6868
🔄 Epoch 6/10 started...
📈 Epoch 6/10 | Train Loss: 0.2586 | Train Acc: 0.9053 | Val Loss: 0.7409 | Val Acc: 0.6306 | Val ROC-AUC: 0.6667
🔄 Epoch 7/10 started...
📈 Epoch 7/10 | Train Loss: 0.2079 | Train Acc: 0.9328 | Val Loss: 0.7761 | Val Acc: 0.6381 | Val ROC-AUC: 0.6679
🔄 Epoch 8/10 started...
📈 Epoch 8/10 | Tr

# LOOPING THROUGH FOLDERS

In [None]:
import sys

sys.path.append("../utils")
from k_mer_data_loader import prepare_kmer_loaders
from initialize_results_df import initialize_results_df
from load_sequence_data import load_sequence_data
from kmer_2_vec import (
    get_kmer_list,
    train_word2vec,
    build_vocab,
    build_embedding_matrix,
)
from gensim.models import Word2Vec
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import roc_auc_score
import numpy as np
from kmer_2_vec import SimpleCNN, train_and_evaluate

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ Load pretrained Word2Vec
w2v_model = Word2Vec.load("../Models/kmer2vec_k_6_s_1.model")

# ✅ Build vocab
vocab = build_vocab(k=6)

# ✅ Build pretrained embeddings dict
pretrained_embeddings = {
    kmer: w2v_model.wv[kmer] for kmer in w2v_model.wv.index_to_key
}

# ✅ Build embedding matrix
embedding_matrix = build_embedding_matrix(
    vocab, pretrained_embeddings, embedding_dim=128
)

# ✅ Initialize model ONCE → embedding layer frozen
model = SimpleCNN(embedding_matrix, freeze_embed=True).to(device)

In [None]:
# Paths
data_dir = "../Data"
excel_path = "../Outputs/50_W2V.xlsx"

# Load dataframes
results_df, excel_df = initialize_results_df(data_dir, excel_path)

In [None]:
for idx, row in results_df.iloc[:50].iterrows():
    folder_name = row["folder_name"]
    train_path = row["train_path"]
    test_path = row["test_path"]

    print(f"🔄 Processing folder: {folder_name}")

    # Load dataframes
    train_df = load_sequence_data(train_path)
    test_df = load_sequence_data(test_path)

    # ✅ prepare_kmer_loaders handles tokenization + valid split
    train_loader, valid_loader, test_loader = prepare_kmer_loaders(
        train_df["sequence"].tolist(),
        train_df["label"].values,
        test_df["sequence"].tolist(),
        test_df["label"].values,
        vocab,
        k=6,
        stride=1,
        max_len=96,
        batch_size=32,
    )

    # Fine-tune model
    model, last_epoch, test_metrics = train_and_evaluate(
        model,
        train_loader,
        valid_loader,
        test_loader,
        device,
        epochs=5,
        lr=1e-3,
        weight_decay=1e-4,
    )

    # ✅ Log metrics
    excel_df.at[idx, "folder_name"] = folder_name
    excel_df.at[idx, "train_accuracy"] = last_epoch["train_acc"]
    excel_df.at[idx, "test_accuracy"] = test_metrics["test_acc"]
    excel_df.at[idx, "pr-roc"] = test_metrics["test_roc_auc"]
    excel_df.at[idx, "pr-auc"] = test_metrics["test_pr_auc"]

    print(
        f"✅ {folder_name}: train_acc={last_epoch['train_acc']:.4f}, test_acc={test_metrics['test_acc']:.4f}"
    )


# ✅ Save updated Excel
excel_df.iloc[: idx + 1].to_excel(excel_path, index=False)
print(f"✅ Metrics saved to {excel_path}")

# ✅ Save the final fine-tuned model
torch.save(
    model.state_dict(), "../Models/50_W2V.pt"
)  #  <---  Fine tuned model after 50 folders.
print("✅ Final fine-tuned model saved to Models/50_W2V.pt")

🔄 Processing folder: wgEncodeAwgTfbsBroadDnd41CtcfUniPk
🔄 Epoch 1/5 started...
📈 Epoch 1/5 | Train Loss: 0.3767 | Train Acc: 0.8403 | Val Loss: 0.2815 | Val Acc: 0.8902 | Val ROC-AUC: 0.9549
🔄 Epoch 2/5 started...
📈 Epoch 2/5 | Train Loss: 0.2691 | Train Acc: 0.8896 | Val Loss: 0.2594 | Val Acc: 0.9008 | Val ROC-AUC: 0.9605
🔄 Epoch 3/5 started...
📈 Epoch 3/5 | Train Loss: 0.2461 | Train Acc: 0.9000 | Val Loss: 0.2469 | Val Acc: 0.9053 | Val ROC-AUC: 0.9634
🔄 Epoch 4/5 started...
📈 Epoch 4/5 | Train Loss: 0.2301 | Train Acc: 0.9072 | Val Loss: 0.2536 | Val Acc: 0.9009 | Val ROC-AUC: 0.9640
🔄 Epoch 5/5 started...
📈 Epoch 5/5 | Train Loss: 0.2191 | Train Acc: 0.9115 | Val Loss: 0.2346 | Val Acc: 0.9093 | Val ROC-AUC: 0.9660
✅ Final Test | Loss: 0.2312 | Acc: 0.9080 | ROC-AUC: 0.9666
✅ wgEncodeAwgTfbsBroadDnd41CtcfUniPk: train_acc=0.9115, test_acc=0.9080
🔄 Processing folder: wgEncodeAwgTfbsBroadDnd41Ezh239875UniPk
🔄 Epoch 1/5 started...
📈 Epoch 1/5 | Train Loss: 0.8017 | Train Acc: 0.6012 

# USER INPUT 

In [None]:
import sys

sys.path.append("../utils")
from k_mer_data_loader import prepare_kmer_loaders
from initialize_results_df import initialize_results_df
from load_sequence_data import load_sequence_data
from kmer_2_vec import (
    get_kmer_list,
    train_word2vec,
    build_vocab,
    build_embedding_matrix,
)
from gensim.models import Word2Vec
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import roc_auc_score
import numpy as np
from kmer_2_vec import SimpleCNN, train_and_evaluate, predict_W2V_sequence

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
model_path = "../Models/50_W2V.pt"
w2v_path = "../Models/kmer2vec_k_6_s_1.model"

w2v_model = Word2Vec.load(w2v_path)
vocab = build_vocab(k=6)
pretrained_embeddings = {
    kmer: w2v_model.wv[kmer] for kmer in w2v_model.wv.index_to_key
}
embedding_matrix = build_embedding_matrix(
    vocab, pretrained_embeddings, embedding_dim=128
)
model = SimpleCNN(embedding_matrix, freeze_embed=True).to(device)

In [None]:
model = SimpleCNN(embedding_matrix, freeze_embed=True)
model.load_state_dict(torch.load(model_path, map_location=device))

seq = input("Enter a DNA sequence: ").strip()
label, conf = predict_W2V_sequence(
    model, w2v_model, vocab, seq, k=6, stride=1, max_len=96, device="cpu"
)
print(f"Prediction: {label} (Confidence: {conf}%)")

  model.load_state_dict(torch.load(model_path, map_location=device))


Prediction: TFBS (Confidence: 99.58%)
