## 🧬 TFBS Classification using XGBoost and k-mer Word2Vec Embeddings

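This notebook applies the XGBoost algorithm to classify DNA sequences as TFBS (transcription factor binding site) or non-TFBS using features derived from Word2Vec-encoded k-mers. Each DNA sequence is converted into a series of k-mers, which are then mapped to dense vectors using a pretrained Word2Vec model. These vectors are aggregated (e.g., via averaging) to form a fixed-length feature vector per sequence. **Note:** the code cells in this notebook build their features with a fixed k-mer vocabulary vectorizer (`build_kmer_vocab` / `build_vectorizer_from_vocab`, with TF-IDF weighting disabled) and do not load a Word2Vec model, so the Word2Vec description should be confirmed against the helper implementation.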

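XGBoost, a gradient-boosted tree-based ensemble model, is trained on these vectors for binary classification. The notebook includes performance evaluation metrics such as accuracy, precision, recall, F1-score, and ROC-AUC to assess model quality. The random search reports accuracy per trial, and the per-folder evaluation cells compute accuracy, ROC-AUC, and PR-AUC (average precision).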

This approach leverages the interpretability and speed of XGBoost, combined with the contextual power of distributed k-mer embeddings, to provide a high-performance baseline for TFBS prediction from DNA sequences.


In [None]:
import json
import sys

import pandas as pd
import xgboost as xgb

# Make the project's helper modules in ../utils importable.
sys.path.append("../utils")

from initialize_results_df import initialize_results_df

from load_sequence_data import load_sequence_data


# from xgb_kmer_utils import run_xgb_grid_search

from xgb_kmer_utils import (
    build_kmer_vocab,
    build_vectorizer_from_vocab,
    run_xgb_random_search,
    get_kmers_stride,
)

In [None]:
# Forward slashes resolve on Windows, Linux and macOS alike; the previous
# "..\\Data" form only worked on Windows and disagreed with the "../Data"
# style used by the per-folder section of this notebook.
data_dir = "../Data"
# Despite the `_dir` suffix this is the path of an Excel workbook file.
excel_dir = "../Outputs/excel_results.xlsx"

# results_df is indexed below for "train_path" / "test_path";
# excel_df is the second frame returned by the helper.
results_df, excel_df = initialize_results_df(data_dir, excel_dir)

In [None]:
# Resolve the train/test file locations stored under index label 0 of
# results_df, then load both splits used by the hyperparameter search.
train_path, test_path = (
    results_df[column][0] for column in ("train_path", "test_path")
)
train_df = load_sequence_data(train_path)
test_df = load_sequence_data(test_path)

In [None]:
# Search space handed to run_xgb_random_search.
xgb_param_grid = {
    "n_estimators": [100, 300, 500, 1000],
    "max_depth": [4, 6, 8, 10],
    "learning_rate": [0.1, 0.05, 0.01],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma": [0, 0.1, 1],
}

# Search settings, kept together so they are easy to tune.
k_values = [3, 5, 6]
stride_values = [1, 2]
use_tfidf = False
n_trials = 30  # random-search trials per k
final_results_path = "../Outputs/random_search_xgb_cv.xlsx"

# NOTE(review): test_df is passed into the search and the best configuration is
# later chosen by the "accuracy" column. If that accuracy is measured on
# test_df, hyperparameters are being tuned on the test set — confirm in
# run_xgb_random_search.

# One result DataFrame per k is collected here.
all_results = []

for k in k_values:
    print(f"✅ Processing k={k}")

    # Build vocab and vectorizer for this k
    vocab_dict = build_kmer_vocab(k)
    vectorizer = build_vectorizer_from_vocab(vocab_dict, use_tfidf=use_tfidf)

    # Use a dedicated name: `results_df` already holds the dataset index
    # (train_path / test_path) returned by initialize_results_df, and
    # overwriting it here would break a re-run of the data-loading cell.
    k_results_df = run_xgb_random_search(
        train_df=train_df,
        test_df=test_df,
        k_values=[k],  # only this k
        stride_values=stride_values,
        xgb_param_grid=xgb_param_grid,
        output_csv=f"xgb_random_results_temp_k_{k}.csv",  # temp CSV for backup
        vectorizer=vectorizer,
        n_trials=n_trials,
    )

    k_results_df["k"] = k  # Add k column (in case not already present)
    all_results.append(k_results_df)

    print(f"✅ Done k={k}\n")

# Combine all per-k results into a single DataFrame
final_df = pd.concat(all_results, ignore_index=True)

# Save to a single Excel file and report the path that was actually written
final_df.to_excel(final_results_path, index=False)

print(f"🎉 Saved ALL k + stride results to {final_results_path}")

✅ Processing k=3
✅ Trial 1/30: acc=0.4977
✅ Trial 2/30: acc=0.5207
✅ Trial 3/30: acc=0.5011
✅ Trial 4/30: acc=0.4984
✅ Trial 5/30: acc=0.5576
✅ Trial 6/30: acc=0.4815
✅ Trial 7/30: acc=0.4994
✅ Trial 8/30: acc=0.5382
✅ Trial 9/30: acc=0.5015
✅ Trial 10/30: acc=0.5025
✅ Trial 11/30: acc=0.5009
✅ Trial 12/30: acc=0.5262
✅ Trial 13/30: acc=0.5550
✅ Trial 14/30: acc=0.4940
✅ Trial 15/30: acc=0.4806
✅ Trial 16/30: acc=0.5090
✅ Trial 17/30: acc=0.5260
✅ Trial 18/30: acc=0.5119
✅ Trial 19/30: acc=0.5004
✅ Trial 20/30: acc=0.5128
✅ Trial 21/30: acc=0.5262
✅ Trial 22/30: acc=0.5090
✅ Trial 23/30: acc=0.5036
✅ Trial 24/30: acc=0.5345
✅ Trial 25/30: acc=0.5142
✅ Trial 26/30: acc=0.5587
✅ Trial 27/30: acc=0.4984
✅ Trial 28/30: acc=0.5009
✅ Trial 29/30: acc=0.5413
✅ Trial 30/30: acc=0.5068
🎯 Best Config:
    trial  k  stride  n_estimators  max_depth  learning_rate  subsample  \
25     26  3       1          1000          6           0.01        0.8   
4       5  3       1           100          6  

In [None]:
# Pick the search row with the highest "accuracy".
best = final_df.sort_values(by="accuracy", ascending=False).iloc[0]

# Extract the winning configuration, cast to plain Python types so it can be
# passed to XGBoost and serialised to JSON.
best_k = int(best["k"])
best_stride = int(best["stride"])
best_params = {
    "n_estimators": int(best["n_estimators"]),
    "max_depth": int(best["max_depth"]),
    "learning_rate": float(best["learning_rate"]),
    "subsample": float(best["subsample"]),
    "colsample_bytree": float(best["colsample_bytree"]),
    "gamma": float(best["gamma"]),
}

print(
    f"🎯 Best Config: k={best_k}, stride={best_stride}, params={best_params}"
)

# Build vocab + vectorizer for the best k
vocab_dict = build_kmer_vocab(best_k)
vectorizer = build_vectorizer_from_vocab(vocab_dict, use_tfidf=use_tfidf)

# Recreate the input features with the best k / stride
train_kmers = train_df["sequence"].apply(
    lambda seq: get_kmers_stride(seq, best_k, best_stride)
)
test_kmers = test_df["sequence"].apply(
    lambda seq: get_kmers_stride(seq, best_k, best_stride)
)

X_train = vectorizer.transform(train_kmers)
X_test = vectorizer.transform(test_kmers)

# Train the final model on GPU 0.
# The installed XGBoost warns that `gpu_hist` is deprecated and that `predictor`
# is not used (see the recorded warnings: 'E.g. tree_method = "hist",
# device = "cuda"'), so use the `hist` tree method with an explicit device.
model = xgb.XGBClassifier(tree_method="hist", device="cuda:0", **best_params)
model.fit(X_train, train_df["label"])

# Save the final model.
# NOTE(review): the recorded output shows a warning raised from save_model;
# check whether the ".model" extension is the cause before changing the path,
# since the per-folder section loads this exact file.
model_output_path = "../Models/xgboost-cv.model"
model.save_model(model_output_path)
print(f"✅ Final model saved to {model_output_path}")

# Save a JSON summary of the chosen configuration; the per-folder section
# reads "k", "stride" and "xgboost_params" back from this file.
summary = {
    "k": best_k,
    "stride": best_stride,
    "accuracy": round(float(best["accuracy"]), 4),
    "xgboost_params": best_params,
}

json_output_path = "../Models/xgboost-cv-summary.json"
with open(json_output_path, "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=4)
print(f"✅ Model summary saved to {json_output_path}")

🎯 Best Config: k=6, stride=1, params={'n_estimators': 500, 'max_depth': 10, 'learning_rate': 0.05, 'subsample': 0.8, 'colsample_bytree': 0.6, 'gamma': 0.1}



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "predictor" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Final model saved to ../Outputs/xgboost-cv.model
✅ Model summary saved to ../Outputs/xgboost-cv-summary.json



    E.g. tree_method = "hist", device = "cuda"

  self.get_booster().save_model(fname)
  self.get_booster().save_model(fname)


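## Looping through dataset folders

This section reloads the saved model and the best configuration from the JSON summary, then trains and evaluates on each dataset folder in turn.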

In [None]:
import sys


sys.path.append("../utils")

import xgboost as xgb
import pandas as pd

import json

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    average_precision_score,
)

from xgb_kmer_utils import (
    build_kmer_vocab,
    build_vectorizer_from_vocab,
    get_kmers_str,
)

from load_sequence_data import load_sequence_data

from initialize_results_df import initialize_results_df

In [None]:
# Paths
data_dir = "../Data"
excel_path = "../Outputs/50_XGBOOST_CV.xlsx"
model_path = "../Models/xgboost-cv.model"

# Load dataframes
results_df, excel_df = initialize_results_df(data_dir, excel_path)

# ✅ Load JSON best params
with open("../Models/xgboost-cv-summary.json", "r") as f:
    summary = json.load(f)

k = summary["k"]
stride = summary["stride"]
best_params = summary["xgboost_params"]

# ✅ Add refresh update params
best_params.update(
    {
        "process_type": "update",
        "updater": "refresh",
        "tree_method": "hist",  # CPU only
        "predictor": "cpu_predictor",
    }
)

In [None]:
# Rebuild the vocabulary/vectorizer for the k stored in the JSON summary.
vocab_dict = build_kmer_vocab(k)
vectorizer = build_vectorizer_from_vocab(vocab_dict, use_tfidf=False)

# Pass the refresh/update parameters prepared in the configuration cell.
# Previously the classifier was created with defaults, so `best_params`
# (including process_type="update" / updater="refresh") was never used.
# NOTE(review): with process_type="update", n_estimators must not exceed the
# number of trees in the loaded model — confirm against the saved model.
model = xgb.XGBClassifier(**best_params)

In [None]:
# ✅ Process first 50 folders
for idx, row in results_df.iloc[:5].iterrows():
    train_path = row["train_path"]
    test_path = row["test_path"]
    folder_name = row["folder_name"]

    print(f"✅ Processing folder: {folder_name}")

    model.load_model(model_path)

    # Load data
    train_df = load_sequence_data(train_path)
    test_df = load_sequence_data(test_path)

    # Convert sequences to k-mer strings
    train_kmers = train_df["sequence"].apply(
        lambda seq: get_kmers_str(seq, k, stride)
    )
    test_kmers = test_df["sequence"].apply(
        lambda seq: get_kmers_str(seq, k, stride)
    )

    # Transform using same vectorizer
    X_train = vectorizer.transform(train_kmers)
    X_test = vectorizer.transform(test_kmers)

    y_train = train_df["label"]
    y_test = test_df["label"]

    # ✅ Continue training model
    model.fit(X_train, y_train, xgb_model=model)

    # Evaluate train
    preds_train = model.predict(X_train)
    train_acc = accuracy_score(y_train, preds_train)
    train_proba = model.predict_proba(X_train)[:, 1]
    train_pr_auc = average_precision_score(y_train, train_proba)
    train_roc_auc = roc_auc_score(y_train, train_proba)

    # Evaluate test
    preds_test = model.predict(X_test)
    test_acc = accuracy_score(y_test, preds_test)
    test_proba = model.predict_proba(X_test)[:, 1]
    test_pr_auc = average_precision_score(y_test, test_proba)
    test_roc_auc = roc_auc_score(y_test, test_proba)

    # ✅ Save metrics to excel_df
    excel_df.at[idx, "train_accuracy"] = train_acc
    excel_df.at[idx, "test_accuracy"] = test_acc
    excel_df.at[idx, "pr-roc"] = test_roc_auc
    excel_df.at[idx, "pr-auc"] = test_pr_auc

    # ✅ Save updated model after each folder
    model.save_model(model_path)

    print(
        f"✅ {folder_name}: train_acc={train_acc:.4f}, test_acc={test_acc:.4f}"
    )

# ✅ Save updated Excel
excel_df.to_excel(excel_path, index=False)
print(f"✅ Metrics saved to {excel_path}")

✅ Processing folder: wgEncodeAwgTfbsBroadDnd41CtcfUniPk


  self.get_booster().save_model(fname)


✅ wgEncodeAwgTfbsBroadDnd41CtcfUniPk: train_acc=0.5023, test_acc=0.5009
✅ Processing folder: wgEncodeAwgTfbsBroadDnd41Ezh239875UniPk


  self.get_booster().save_model(fname)


✅ wgEncodeAwgTfbsBroadDnd41Ezh239875UniPk: train_acc=0.5049, test_acc=0.4814
✅ Processing folder: wgEncodeAwgTfbsBroadGm12878CtcfUniPk


  self.get_booster().save_model(fname)


✅ wgEncodeAwgTfbsBroadGm12878CtcfUniPk: train_acc=0.5018, test_acc=0.5020
✅ Processing folder: wgEncodeAwgTfbsBroadGm12878Ezh239875UniPk


  self.get_booster().save_model(fname)


✅ wgEncodeAwgTfbsBroadGm12878Ezh239875UniPk: train_acc=0.5020, test_acc=0.4919
✅ Processing folder: wgEncodeAwgTfbsBroadH1hescChd1a301218aUniPk
✅ wgEncodeAwgTfbsBroadH1hescChd1a301218aUniPk: train_acc=0.5014, test_acc=0.4941
✅ Metrics saved to ../Outputs/50_XGBOOST_CV.xlsx


  self.get_booster().save_model(fname)
