In [6]:
%reload_ext autoreload
%autoreload 2
import os
from pathlib import Path
import numpy as np
import pandas as pd
from aldiscore.prediction import utils
from aldiscore import ROOT, RSTATE
import matplotlib.pyplot as plt
import seaborn as sn
from collections import defaultdict
from sklearn.model_selection import train_test_split
import lightgbm as lgb

Ideas
- Remove all sources individually and check performance on those
- Remove all small/large datasets (seq_length, num_seqs)
- Remove DNA/AA data and check

In [7]:
# Directory that holds the feature/label files read by utils.load_features.
# The default is the original cluster path; set the ALDISCORE_DATA_DIR
# environment variable to run the notebook where the data lives elsewhere.
data_dir = Path(
    os.environ.get("ALDISCORE_DATA_DIR", "/hits/fast/cme/bodynems/data/paper")
)

# Model hyperparameters: the first row of the stored trial table without its
# "score" column, as a {name: value} dict that is later unpacked into
# lgb.LGBMRegressor(**params).
# NOTE(review): presumably row 0 is the best-scoring trial — confirm how
# trial_03.parquet is ordered.
param_df = pd.read_parquet(ROOT / "optuna" / "trial_03.parquet")
params = dict(param_df.drop("score", axis=1).iloc[0])

# Disabled workaround, kept for reference: cast whole-number floats back to int.
# for key, val in params.items():
#     if isinstance(params[key], float) and (float(int(val)) == val):
#         params[key] = int(val)

#### Study 1
Train on datasets with mean seq_length <= 500 and test on datasets with mean seq_length >= 1000, and vice versa (datasets in between are excluded).

Results
- A model trained on short sequences predicts long ones pretty well (with some degradation, of course).
- This holds even though, in our case, short sequences are twice as difficult on average (0.17 vs. 0.08).
- Most long sequences are sampled from TreeBase and are therefore easy on average.


In [8]:
# Load the model inputs for Study 1. The three size/type features are excluded
# from feat_df; "mean:seq_length" is read from drop_df further down
# (presumably drop_df carries the excluded columns — TODO confirm in
# utils.load_features).
loaded = utils.load_features(
    data_dir,
    label_scale="auto",
    exclude_features=["is_dna", "num_seqs", "seq_length"],
)
feat_df, drop_df, label_df = loaded

# Swap ":" for "." in the feature names (presumably because LightGBM rejects
# special characters in feature names — verify).
clean_feat_names = list(feat_df.columns.str.replace(":", "."))
feat_df.columns = clean_feat_names

# Keep only clearly short (<= 500) or clearly long (>= 1000) datasets; rows
# in between, and rows with a missing length, are dropped from all three
# frames so they stay aligned.
mean_len = drop_df["mean:seq_length"]
mask = mean_len.le(500) | mean_len.ge(1000)
feat_df, label_df, drop_df = feat_df[mask], label_df[mask], drop_df[mask]

Dropping 0 NaN rows...


In [9]:
# Study 1: bin the datasets by mean sequence length, then for every bin train
# on that bin and evaluate on all remaining binned rows (i.e. both directions
# when there are two bins).
thresholds = [0, 500, 10000]
# Forwarded unchanged to utils.compute_metrics; loop-invariant, so set once.
# NOTE(review): presumably a small offset for the relative-error metrics — confirm.
eps = 1e-2

groups = pd.cut(drop_df["mean:seq_length"], bins=thresholds)
print(groups.value_counts())

# pd.cut assigns NaN to values outside (thresholds[0], thresholds[-1]].
# `groups != group` is True for NaN, so such rows would otherwise end up in
# every test set without ever being trained on, and a NaN among the unique
# values would also break the sort below. Exclude them explicitly.
is_binned = groups.notna()
n_unbinned = int((~is_binned).sum())
if n_unbinned:
    print(f"Ignoring {n_unbinned} rows with mean:seq_length outside {thresholds}")

groups_unique = np.sort(groups[is_binned].unique())
group_map = dict(zip(list(map(str, groups_unique)), groups_unique))
perf_dfs = []

for key, group in group_map.items():
    train_mask = groups == group
    test_mask = is_binned & ~train_mask
    X_train = feat_df[train_mask]
    y_train = label_df[train_mask].iloc[:, 0]
    X_test = feat_df[test_mask]
    y_test = label_df[test_mask].iloc[:, 0]
    # Train on the current group only
    model = lgb.LGBMRegressor(**params)
    model = model.fit(X_train, y_train)
    # Metrics are computed on the *other* group(s); "n" is the training-set size.
    group_perf = utils.compute_metrics(model, X_test, y_test, eps)
    group_perf["n"] = len(y_train)
    perf_dfs.append(group_perf)

# One row per training group, labelled by that group's interval.
perf_df = pd.concat(perf_dfs, axis=0, ignore_index=True)
perf_df.index = list(group_map.keys())
perf_df

mean:seq_length
(0, 500]        8370
(500, 10000]    1907
Name: count, dtype: int64


Unnamed: 0,RMSE,RMSE_CV,MAE,MAPE,MAPE_P50,CORR,n
"(0, 500]",0.0535,0.6479,0.0394,0.9622,0.5062,0.8971,8370
"(500, 10000]",0.1144,0.66,0.0891,1.9903,0.7518,0.8708,1907


In [10]:
# Average label value within each sequence-length bin; observed=False keeps
# bins that contain no rows in the result.
print("Mean difficulty per group:")
mean_label_by_group = label_df.groupby(by=groups, observed=False).mean()
mean_label_by_group

Mean difficulty per group:


Unnamed: 0_level_0,mean
mean:seq_length,Unnamed: 1_level_1
"(0, 500]",0.173318
"(500, 10000]",0.082561
