In [1]:
from pathlib import Path
import pandas as pd

def load_data(data_path: Path):
    """Read the train/validation/test CSV splits and separate features from labels.

    Args:
        data_path: Directory containing ``train.csv``, ``val.csv`` and ``test.csv``.
            Each file must have a ``label`` column.

    Returns:
        A 6-tuple ``(X_train, y_train, X_valid, y_valid, X_test, y_test)`` where
        each ``X_*`` is the split's DataFrame without the ``label`` column and
        each ``y_*`` is that split's ``label`` column.
    """
    # Read every split first, then separate features from labels.
    frames = [
        pd.read_csv(data_path / file_name)
        for file_name in ("train.csv", "val.csv", "test.csv")
    ]

    parts = []
    for frame in frames:
        parts.append(frame.drop(columns=["label"]))
        parts.append(frame["label"])

    return tuple(parts)

In [37]:
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb


def create_classifier(clf_name: str):
    """Build an unfitted gradient-boosting classifier for the given library name.

    All three variants share the same budget: 3000 boosting rounds, a learning
    rate of 0.02 and a maximum tree depth of 4.

    Args:
        clf_name: One of ``"CatBoost"``, ``"XGBoost"`` or ``"LightGBM"``
            (case-sensitive).

    Returns:
        A new, unfitted classifier instance from the requested library.

    Raises:
        ValueError: If ``clf_name`` is not one of the supported names.
    """
    # Early stopping is not configured on any of the estimators; enabling it
    # would also require passing an evaluation set to ``fit``.
    if clf_name == "CatBoost":
        return CatBoostClassifier(
            iterations=3_000,
            learning_rate=0.02,
            depth=4,
            verbose=50,  # log the training loss every 50 iterations
        )
    if clf_name == "XGBoost":
        return xgb.XGBClassifier(
            n_estimators=3000,
            learning_rate=0.02,
            max_depth=4,
            verbosity=1,
        )
    if clf_name == "LightGBM":
        return lgb.LGBMClassifier(
            n_estimators=3000,
            learning_rate=0.02,
            max_depth=4,
            verbosity=1,
        )
    # Fail loudly on typos instead of falling through to an unbound local.
    raise ValueError(
        f"Unknown classifier name {clf_name!r}; "
        "expected one of 'CatBoost', 'XGBoost', 'LightGBM'"
    )

In [38]:
def train_classifier(clf, X_train, y_train, X_valid, y_valid):
    """Fit ``clf`` on the training split and hand the same object back.

    Args:
        clf: Estimator exposing ``fit(X, y)``.
        X_train: Training features.
        y_train: Training labels.
        X_valid: Validation features; accepted but not used by this function.
        y_valid: Validation labels; accepted but not used by this function.

    Returns:
        The ``clf`` object that was passed in, after ``fit`` has been called on it.
    """
    # Only the training split reaches ``fit``; the validation arguments are
    # part of the signature but are not forwarded.
    fit = clf.fit
    fit(X_train, y_train)
    return clf

In [44]:
def save(clf, model_path: Path):
    """Write a fitted model to ``model_path`` using the library's own ``save_model``.

    Args:
        clf: A fitted estimator. If it exposes ``save_model`` that method is
            used directly; otherwise its ``booster_`` attribute must expose it.
        model_path: Destination file path, passed through unchanged.
    """
    # Estimators without their own ``save_model`` delegate to the wrapped booster.
    saver = clf if hasattr(clf, "save_model") else clf.booster_
    saver.save_model(model_path)

In [53]:
# Directory holding one sub-folder of embeddings per dataset.
embeddings_dir = Path("../data/embeddings")

# Dataset name -> directory with that dataset's train/val/test CSV files.
# Every dataset lives in a sub-folder named after itself.
DATASETS = {
    name: embeddings_dir / name
    for name in (
        "enhancers",
        "promoter_all",
        "splice_sites_all",
        "H3K9me3",
        "H4K20me1",
    )
}

In [54]:
# Classifier names to train on every dataset; each must be a name that
# create_classifier() recognizes.
MODELS = ["CatBoost", "XGBoost", "LightGBM"]

In [55]:
# Root output directory for trained models; created (with any missing parents)
# if absent, and left untouched when it already exists.
save_model_dir = Path("../models")
save_model_dir.mkdir(exist_ok=True, parents=True)

In [56]:
# Train every classifier in MODELS on every dataset in DATASETS and write each
# fitted model to save_model_dir/<dataset_name>/<model name, lower-cased>.pkl.
for dataset_name, dataset_dir in DATASETS.items():
    dataset_save_dir = save_model_dir / dataset_name
    dataset_save_dir.mkdir(parents=True, exist_ok=True)
    # The splits depend only on the dataset, so read the CSVs once per dataset
    # rather than once per (dataset, model) pair.
    X_train, y_train, X_valid, y_valid, X_test, y_test = load_data(dataset_dir)
    for model in MODELS:
        print(f"Training {model} on {dataset_name}")
        clf = create_classifier(model)
        clf = train_classifier(clf, X_train, y_train, X_valid, y_valid)
        # NOTE(review): the ".pkl" suffix is kept so existing output paths stay
        # valid, but save() goes through each library's save_model(), so these
        # files are presumably not pickles -- consider a format-specific
        # extension once downstream loaders are updated.
        save_model_path = dataset_save_dir / f"{model.lower()}.pkl"
        save(clf, save_model_path)
        print(f"Model saved to {save_model_path}")

Training CatBoost on H3K9me3
0:	learn: 0.6925832	total: 6.58ms	remaining: 19.7s
50:	learn: 0.6744156	total: 271ms	remaining: 15.7s
100:	learn: 0.6643127	total: 527ms	remaining: 15.1s
150:	learn: 0.6573314	total: 783ms	remaining: 14.8s
200:	learn: 0.6518015	total: 1.02s	remaining: 14.3s
250:	learn: 0.6472314	total: 1.27s	remaining: 14s
300:	learn: 0.6434602	total: 1.55s	remaining: 13.9s
350:	learn: 0.6398268	total: 1.81s	remaining: 13.7s
400:	learn: 0.6366633	total: 2.07s	remaining: 13.4s
450:	learn: 0.6333908	total: 2.33s	remaining: 13.2s
500:	learn: 0.6300976	total: 2.59s	remaining: 12.9s
550:	learn: 0.6265084	total: 2.87s	remaining: 12.8s
600:	learn: 0.6226093	total: 3.16s	remaining: 12.6s
650:	learn: 0.6188943	total: 3.44s	remaining: 12.4s
700:	learn: 0.6152077	total: 3.71s	remaining: 12.2s
750:	learn: 0.6117055	total: 3.95s	remaining: 11.8s
800:	learn: 0.6081093	total: 4.2s	remaining: 11.5s
850:	learn: 0.6047708	total: 4.45s	remaining: 11.2s
900:	learn: 0.6015063	total: 4.72s	remai

  self.get_booster().save_model(fname)


Model saved to ../models/H3K9me3/xgboost.pkl
Training LightGBM on H3K9me3
[LightGBM] [Info] Number of positive: 10940, number of negative: 11010
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010322 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130560
[LightGBM] [Info] Number of data points in the train set: 21950, number of used features: 512
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498405 -> initscore=-0.006378
[LightGBM] [Info] Start training from score -0.006378
Model saved to ../models/H3K9me3/lightgbm.pkl
Training CatBoost on H4K20me1
0:	learn: 0.6860556	total: 6.92ms	remaining: 20.7s
50:	learn: 0.5183444	total: 268ms	remaining: 15.5s
100:	learn: 0.4828697	total: 529ms	remaining: 15.2s
150:	learn: 0.4694611	total: 787ms	remaining: 14.8s
200:	learn: 0.4614889	total: 1.03s	remaining: 14.4s
250:	learn: 0.4556386	total: 1.29s	remaining: 14.1s
300:	learn: 0.4507445	total: 1.54s	remainin

  self.get_booster().save_model(fname)


Model saved to ../models/H4K20me1/xgboost.pkl
Training LightGBM on H4K20me1
[LightGBM] [Info] Number of positive: 11993, number of negative: 12007
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010281 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 130560
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 512
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499708 -> initscore=-0.001167
[LightGBM] [Info] Start training from score -0.001167
Model saved to ../models/H4K20me1/lightgbm.pkl
