In [1]:
%reload_ext autoreload
%autoreload 2
import os
from pathlib import Path
import numpy as np
import pandas as pd
from aldiscore.prediction import utils
from aldiscore import ROOT, RSTATE
import matplotlib.pyplot as plt
import seaborn as sn
from collections import defaultdict
from sklearn.model_selection import train_test_split
import lightgbm as lgb

Ideas
- Remove each data source individually and check performance on the removed one
- Remove all small/large datasets (by seq_length, num_seqs) and check performance on those
- Remove DNA/AA data and check performance on the removed data type

In [None]:
# Directory holding the feature/label data. The default is the original cluster
# location; set the ALDISCORE_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
data_dir = Path(
    os.environ.get("ALDISCORE_DATA_DIR", "/hits/fast/cme/bodynems/data/paper")
)

# Model hyperparameters are read from the stored Optuna trial table: the first
# row is used, minus the "score" column (a result, not a model parameter).
# NOTE(review): this assumes row 0 is the trial to use — confirm how the table is ordered.
param_df = pd.read_parquet(ROOT / "optuna" / "trial_03.parquet")
params = dict(param_df.drop("score", axis=1).iloc[0])
# Fixed value used for every model trained in this notebook, regardless of
# what the trial table contains.
params["n_estimators"] = 1500

#### Study 1
Train on seq_length <= 100, test on seq_length >= 500 (and vice versa)

Results
- Models trained on short sequences are able to predict uncertainty for long ones and vice versa (with some degradation of course).

In [3]:
# Reload everything from disk so this cell can be re-run on its own.
# is_dna, num_seqs and seq_length are excluded from the feature set.
excluded_features = ["is_dna", "num_seqs", "seq_length"]
feat_df, drop_df, label_df = utils.load_features(
    data_dir, label_scale="auto", exclude_features=excluded_features
)

# Keep only the two extremes of the length range: at most 100, or above 499.
max_len = drop_df["max.seq_length"]
mask = (max_len <= 100) | (max_len > 499)
feat_df, label_df, drop_df = feat_df[mask], label_df[mask], drop_df[mask]

Dropping 0 NaN rows...


In [4]:
# Assign every sample to a sequence-length bin. After the filtering in the
# previous cell only the two outer bins are expected to be populated.
thresholds = [0, 100, 499, 10000]
groups = pd.cut(drop_df["max.seq_length"], bins=thresholds)
print(groups.value_counts())

# pd.cut returns NaN for values outside the outer bin edges. Those rows belong
# to no group, and `groups != group` evaluates to True for them, so without
# this guard they would end up in every test split (and NaN among the unique
# groups would break the sort below). Exclude them explicitly.
binned = groups.notna()
if not binned.all():
    print(f"Ignoring {int((~binned).sum())} rows outside the bin range")

# Populated bins in ascending order, keyed by their string label.
groups_unique = np.sort(groups.dropna().unique())
group_map = {str(interval): interval for interval in groups_unique}

eps = 1e-2  # forwarded unchanged to utils.compute_metrics
y_all = label_df.iloc[:, 0]  # first label column is the regression target
perf_dfs = []

# Train on one bin and evaluate on all remaining binned samples, for each bin.
for key, group in group_map.items():
    train_mask = groups == group
    test_mask = binned & ~train_mask
    X_train, y_train = feat_df[train_mask], y_all[train_mask]
    X_test, y_test = feat_df[test_mask], y_all[test_mask]
    # Train the model
    model = lgb.LGBMRegressor(**params, random_state=0)
    model = model.fit(X_train, y_train)
    perf_df = utils.compute_metrics(model, X_test, y_test, eps)
    perf_df["n"] = len(y_train)  # size of the TRAINING split
    perf_dfs.append(perf_df)

# One row per model, labelled with the bin it was trained on; the metrics are
# measured on the other bin.
# NOTE(review): assumes compute_metrics returns exactly one row per call.
perf_df = pd.concat(perf_dfs, axis=0, ignore_index=True)
perf_df.index = list(group_map.keys())
perf_df

max.seq_length
(499, 10000]    3425
(0, 100]        3009
(100, 499]         0
Name: count, dtype: int64


Unnamed: 0,RMSE,RMSE_CV,MAE,MAPE,MAPE_P50,CORR,n
"(0, 100]",0.0764,0.6734,0.0466,0.4913,0.392,0.9096,3009
"(499, 10000]",0.1047,0.832,0.0874,3.4741,1.9677,0.915,3425


In [5]:
# Average label value within each populated sequence-length bin, rounded for display.
print("Mean difficulty per group:")
group_means = label_df.groupby(groups, observed=True).mean()
group_means.round(2)

Mean difficulty per group:


Unnamed: 0_level_0,mean
max.seq_length,Unnamed: 1_level_1
"(0, 100]",0.13
"(499, 10000]",0.11


#### Study 2
Train on num_seqs <= 10, test on num_seqs >= 50 (and vice versa)

Results
- Generalization error is similar to what we saw with respect to sequence length.
- Only considering a single split, so take results with a grain of salt.
- RMSE is higher when training on the larger datasets because the smaller ones (the test set in that case) are harder on average, and RMSE is an absolute measure of error.


In [6]:
# Reload everything from disk: the frames were filtered for the previous study.
# is_dna, num_seqs and seq_length are excluded from the feature set.
excluded_features = ["is_dna", "num_seqs", "seq_length"]
feat_df, drop_df, label_df = utils.load_features(
    data_dir, label_scale="auto", exclude_features=excluded_features
)

# Keep only the two extremes of the dataset-size range: at most 10 sequences,
# or more than 49.
n_seqs = drop_df["num_seqs"]
mask = (n_seqs <= 10) | (n_seqs > 49)
feat_df, label_df, drop_df = feat_df[mask], label_df[mask], drop_df[mask]

Dropping 0 NaN rows...


In [7]:
# Assign every sample to a dataset-size bin (number of sequences). After the
# filtering in the previous cell only the two outer bins are expected to be
# populated.
thresholds = [0, 10, 49, 300]
groups = pd.cut(drop_df["num_seqs"], bins=thresholds)
print(groups.value_counts())

# pd.cut returns NaN for values outside the outer bin edges (e.g. more than 300
# sequences). Those rows belong to no group, and `groups != group` evaluates to
# True for them, so without this guard they would end up in every test split
# (and NaN among the unique groups would break the sort below). Exclude them
# explicitly.
binned = groups.notna()
if not binned.all():
    print(f"Ignoring {int((~binned).sum())} rows outside the bin range")

# Populated bins in ascending order, keyed by their string label.
groups_unique = np.sort(groups.dropna().unique())
group_map = {str(interval): interval for interval in groups_unique}

eps = 1e-2  # forwarded unchanged to utils.compute_metrics
y_all = label_df.iloc[:, 0]  # first label column is the regression target
perf_dfs = []

# Train on one bin and evaluate on all remaining binned samples, for each bin.
for key, group in group_map.items():
    train_mask = groups == group
    test_mask = binned & ~train_mask
    X_train, y_train = feat_df[train_mask], y_all[train_mask]
    X_test, y_test = feat_df[test_mask], y_all[test_mask]
    # Train the model
    model = lgb.LGBMRegressor(**params, random_state=0)
    model = model.fit(X_train, y_train)
    perf_df = utils.compute_metrics(model, X_test, y_test, eps)
    perf_df["n"] = len(y_train)  # size of the TRAINING split
    perf_dfs.append(perf_df)

# One row per model, labelled with the bin it was trained on; the metrics are
# measured on the other bin.
# NOTE(review): assumes compute_metrics returns exactly one row per call.
perf_df = pd.concat(perf_dfs, axis=0, ignore_index=True)
perf_df.index = list(group_map.keys())
perf_df

num_seqs
(0, 10]      5617
(49, 300]    3349
(10, 49]        0
Name: count, dtype: int64


Unnamed: 0,RMSE,RMSE_CV,MAE,MAPE,MAPE_P50,CORR,n
"(0, 10]",0.0584,0.5029,0.0413,0.5572,0.3571,0.9091,5617
"(49, 300]",0.0947,0.5325,0.0687,1.8214,0.3875,0.9132,3349


In [8]:
# Average label value within each populated dataset-size bin, rounded for display.
print("Mean difficulty per group:")
group_means = label_df.groupby(groups, observed=True).mean()
group_means.round(2)

Mean difficulty per group:


Unnamed: 0_level_0,mean
num_seqs,Unnamed: 1_level_1
"(0, 10]",0.18
"(49, 300]",0.12
