In [1]:
from pathlib import Path

import pandas as pd


def load_data(data_path: Path):
    """Read the train/validation/test CSV splits and separate features from labels.

    Args:
        data_path: Directory containing ``train.csv``, ``val.csv`` and
            ``test.csv``. Each file must have a ``label`` column.

    Returns:
        A 6-tuple ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``.
        Every ``X_*`` is the split's DataFrame without the ``label`` column and
        every ``y_*`` is that split's ``label`` column.
    """
    # The order of the file names fixes the order of the returned tuple.
    split_files = ("train.csv", "val.csv", "test.csv")
    frames = [pd.read_csv(data_path / file_name) for file_name in split_files]

    pieces = []
    for frame in frames:
        pieces.append(frame.drop(columns=["label"]))
        pieces.append(frame["label"])
    return tuple(pieces)

In [3]:
from catboost import CatBoostClassifier


def create_classifier(clf_name: str):
    """Build an unfitted CatBoost classifier for a named hyper-parameter preset.

    Args:
        clf_name: Preset name, either ``"CatBoost_v1"`` or ``"CatBoost_v2"``.

    Returns:
        A new ``CatBoostClassifier`` configured with the preset's parameters.

    Raises:
        ValueError: If ``clf_name`` is not one of the known presets.
    """
    # Settings shared by every preset. Early stopping
    # (early_stopping_rounds=100, use_best_model=True) is currently not enabled.
    shared_params = {
        "iterations": 3_000,
        "verbose": 50,
        "task_type": "GPU",
    }
    # Per-preset overrides.
    presets = {
        "CatBoost_v1": {"learning_rate": 0.02, "depth": 4},
        "CatBoost_v2": {"learning_rate": 0.05, "depth": 6},
    }

    # Fail with a clear message instead of falling through to an unbound local.
    if clf_name not in presets:
        known = ", ".join(sorted(presets))
        raise ValueError(
            f"Unknown classifier name {clf_name!r}; expected one of: {known}"
        )

    return CatBoostClassifier(**shared_params, **presets[clf_name])

In [4]:
def train_classifier(clf, X_train, y_train, X_valid, y_valid):
    """Fit ``clf`` on the training split and return the same object.

    Args:
        clf: Estimator exposing ``fit(X, y)``.
        X_train: Training features.
        y_train: Training labels.
        X_valid: Validation features (accepted but not used here).
        y_valid: Validation labels (accepted but not used here).

    Returns:
        The ``clf`` object that was passed in, after ``fit`` has been called.
    """
    # Only the training split is forwarded to ``fit``; the validation
    # arguments are part of the signature but play no role in training.
    clf.fit(X_train, y_train)
    return clf

In [5]:
def save(clf, model_path: Path):
    """Write a trained model to ``model_path`` with the model's own saver.

    Args:
        clf: Either an object with a ``save_model`` method, or an object whose
            ``booster_`` attribute has a ``save_model`` method.
        model_path: Destination file path.
    """
    # Hand the saver a plain string rather than a Path object.
    # NOTE(review): some CatBoost releases reportedly accept only ``str`` file
    # names; the string form is valid for either saver.
    target = str(model_path)
    if hasattr(clf, "save_model"):
        clf.save_model(target)
    else:
        clf.booster_.save_model(target)

In [6]:
# Root directory under which each dataset has its own sub-directory.
embeddings_dir = Path("../data/embeddings")

# Dataset name -> dataset directory; the directory is named after the dataset.
DATASETS = {
    name: embeddings_dir / name
    for name in (
        "enhancers",
        "promoter_all",
        "splice_sites_all",
        "H3K9me3",
        "H4K20me1",
    )
}

In [7]:
# Classifier preset names to train; each is passed to create_classifier.
MODELS = [
    "CatBoost_v1",
    "CatBoost_v2",
]

In [8]:
# Output root for the trained models; created here if it does not exist yet.
save_model_dir = Path("..") / "two_diff_models"
save_model_dir.mkdir(exist_ok=True, parents=True)

In [9]:
# Train every model preset on every dataset and save each fitted model under
# save_model_dir/<dataset_name>/.
for dataset_name, dataset_dir in DATASETS.items():
    dataset_save_dir = save_model_dir / dataset_name
    dataset_save_dir.mkdir(parents=True, exist_ok=True)

    # The splits depend only on the dataset, so read the CSVs once per dataset
    # instead of once per (dataset, model) pair.
    X_train, y_train, X_valid, y_valid, X_test, y_test = load_data(dataset_dir)

    for model in MODELS:
        print(f"Training {model} on {dataset_name}")
        clf = create_classifier(model)
        clf = train_classifier(clf, X_train, y_train, X_valid, y_valid)
        # NOTE(review): the file gets a ".pkl" suffix, but save() writes it via
        # the model's save_model method rather than pickle; confirm that
        # whatever loads these files expects that format.
        save_model_path = dataset_save_dir / f"{model.lower()}.pkl"
        save(clf, save_model_path)
        print(f"Model saved to {save_model_path}")

Training CatBoost_v1 on enhancers
0:	learn: 0.6895605	total: 262ms	remaining: 13m 5s
50:	learn: 0.6018932	total: 1.23s	remaining: 1m 10s
100:	learn: 0.5761458	total: 2.18s	remaining: 1m 2s
150:	learn: 0.5641591	total: 3.14s	remaining: 59.3s
200:	learn: 0.5565644	total: 4.14s	remaining: 57.7s
250:	learn: 0.5509354	total: 5.17s	remaining: 56.6s
300:	learn: 0.5463633	total: 6.2s	remaining: 55.6s
350:	learn: 0.5424165	total: 7.23s	remaining: 54.5s
400:	learn: 0.5390909	total: 8.26s	remaining: 53.5s
450:	learn: 0.5359318	total: 9.3s	remaining: 52.6s
500:	learn: 0.5331083	total: 10.3s	remaining: 51.6s
550:	learn: 0.5305533	total: 11.3s	remaining: 50.2s
600:	learn: 0.5279327	total: 12.4s	remaining: 49.5s
650:	learn: 0.5255133	total: 13.5s	remaining: 48.6s
700:	learn: 0.5231954	total: 14.5s	remaining: 47.6s
750:	learn: 0.5210215	total: 15.6s	remaining: 46.6s
800:	learn: 0.5188912	total: 16.6s	remaining: 45.6s
850:	learn: 0.5167658	total: 17.7s	remaining: 44.6s
900:	learn: 0.5148910	total: 18.7