In [3]:
%reload_ext autoreload
%autoreload 2
import os
from pathlib import Path
import numpy as np
import pandas as pd
from aldiscore.prediction import utils
from aldiscore import ROOT, RSTATE
from aldiscore.constants.constants import STAT_SEP

In [37]:
# Load the feature, drop and label frames through utils.load_features.
# The directory defaults to the original cluster location but can be redirected
# with the ALDISCORE_DATA_DIR environment variable, so the notebook can also run
# on machines where that absolute path does not exist.
data_dir = Path(
    os.environ.get("ALDISCORE_DATA_DIR", "/hits/fast/cme/bodynems/data/paper")
)
feat_df, drop_df, label_df = utils.load_features(
    data_dir,
    exclude_features=["is_dna", "num_seqs", "seq_length"],
)
# Swap every ":" in the feature names for STAT_SEP. regex=False states the
# literal replacement explicitly instead of relying on the pandas default,
# which differs between pandas versions.
clean_feat_names = feat_df.columns.str.replace(":", STAT_SEP, regex=False).to_list()
feat_df.columns = clean_feat_names

# Quick shape check of the three frames (rows should line up).
print(feat_df.shape)
print(drop_df.shape)
print(label_df.shape)

Dropping 0 NaN rows...
(11431, 469)
(11431, 21)
(11431, 1)


In [41]:
# Show every feature column whose name contains "ratio".
ratio_mask = feat_df.columns.str.contains("ratio")
feat_df.columns[ratio_mask]

Index(['min.psa_score_ratio', 'max.psa_score_ratio', 'mean.psa_score_ratio',
       'std.psa_score_ratio', 'p1.psa_score_ratio', 'p5.psa_score_ratio',
       'p10.psa_score_ratio', 'p20.psa_score_ratio', 'p30.psa_score_ratio',
       'p40.psa_score_ratio', 'p50.psa_score_ratio', 'p60.psa_score_ratio',
       'p70.psa_score_ratio', 'p80.psa_score_ratio', 'p90.psa_score_ratio',
       'p95.psa_score_ratio', 'p99.psa_score_ratio', 'iqr.psa_score_ratio'],
      dtype='object')

In [None]:
from sklearn.model_selection import train_test_split

# Split the row labels 80/10/10: first hold out 20% of the rows, then cut that
# hold-out set in half to obtain the test and validation index lists.
all_idxs = feat_df.index.to_list()
train_idxs, holdout_idxs = train_test_split(
    all_idxs, test_size=0.2, random_state=RSTATE
)
test_idxs, valid_idxs = train_test_split(
    holdout_idxs, test_size=0.5, random_state=RSTATE
)
print(len(train_idxs), len(test_idxs), len(valid_idxs))

# Feature matrices for each partition.
X_train, X_test, X_valid = (
    feat_df.loc[idxs] for idxs in (train_idxs, test_idxs, valid_idxs)
)
# Targets: the first (only) column of label_df, selected with the same row labels.
y_train, y_test, y_valid = (
    label_df.loc[idxs].iloc[:, 0] for idxs in (train_idxs, test_idxs, valid_idxs)
)

In [1]:
from aldiscore.prediction.predictor import DifficultyPredictor

# Take the model held by the "latest" predictor and read its per-feature
# "gain" importances (one value per model input feature).
predictor = DifficultyPredictor("latest")
model = predictor.model
imps = model.feature_importance("gain")
imps

array([1.76741837e+01, 9.05170046e-01, 3.90400951e+00, 7.21766775e-01,
       2.85076636e+00, 3.86365895e-01, 1.89703203e+00, 1.17691104e+00,
       1.95932649e+00, 1.74592337e+00, 2.98750874e+00, 2.84242599e+00,
       1.49776667e+00, 1.32534720e+00, 1.22621959e+00, 4.88946049e-01,
       6.22372606e+00, 1.84989891e+00, 1.80195625e+00, 2.95652074e-01,
       1.34479599e+00, 2.59816165e-01, 1.43629467e+00, 3.49476777e-01,
       8.17355616e-01, 5.52849846e-01, 1.58785373e+00, 2.51189344e-01,
       2.35340762e+00, 3.08336830e-01, 1.30970047e+00, 3.51171937e-01,
       1.34123452e+00, 5.01085110e-01, 1.65315017e+00, 3.31618284e-01,
       2.43797917e+00, 7.65375974e-01, 1.74084122e+00, 8.52230917e-01,
       1.07656568e+00, 1.44448557e+00, 1.73289180e+00, 9.55300525e-01,
       2.34454238e+00, 2.07488100e+00, 2.50125267e+00, 1.26879574e+00,
       1.68873220e+00, 7.03280715e-01, 1.83535895e+00, 3.55606598e-01,
       1.68818849e+00, 6.90179662e-01, 1.22929783e+00, 4.05984957e-01,
      

In [5]:
# Display the cleaned feature names (":" replaced by STAT_SEP) in column order.
clean_feat_names

['lower_bound_gap_percentage',
 'min.js_char',
 'max.js_char',
 'mean.js_char',
 'std.js_char',
 'p1.js_char',
 'p5.js_char',
 'p10.js_char',
 'p20.js_char',
 'p30.js_char',
 'p40.js_char',
 'p50.js_char',
 'p60.js_char',
 'p70.js_char',
 'p80.js_char',
 'p90.js_char',
 'p95.js_char',
 'p99.js_char',
 'iqr.js_char',
 'min.js_hpoly_count',
 'max.js_hpoly_count',
 'mean.js_hpoly_count',
 'std.js_hpoly_count',
 'p1.js_hpoly_count',
 'p5.js_hpoly_count',
 'p10.js_hpoly_count',
 'p20.js_hpoly_count',
 'p30.js_hpoly_count',
 'p40.js_hpoly_count',
 'p50.js_hpoly_count',
 'p60.js_hpoly_count',
 'p70.js_hpoly_count',
 'p80.js_hpoly_count',
 'p90.js_hpoly_count',
 'p95.js_hpoly_count',
 'p99.js_hpoly_count',
 'iqr.js_hpoly_count',
 'min.js_hpoly_len',
 'max.js_hpoly_len',
 'mean.js_hpoly_len',
 'std.js_hpoly_len',
 'p1.js_hpoly_len',
 'p5.js_hpoly_len',
 'p10.js_hpoly_len',
 'p20.js_hpoly_len',
 'p30.js_hpoly_len',
 'p40.js_hpoly_len',
 'p50.js_hpoly_len',
 'p60.js_hpoly_len',
 'p70.js_hpoly_len

In [42]:
# Pair every feature name with the gain importance reported by the model.
# The two sequences are matched purely by position, so stop when their lengths
# differ instead of silently producing NaN-padded, misaligned rows.
assert len(clean_feat_names) == len(imps), (
    f"{len(clean_feat_names)} feature names vs. {len(imps)} importances"
)
imp_df = pd.DataFrame({"name": clean_feat_names, "gain": imps})

# The group is whatever follows the last "." in the feature name; names
# without a "." (e.g. "lower_bound_gap_percentage") form their own group.
imp_df["group"] = imp_df["name"].str.split(".").str[-1]
print(set(imp_df.group))

# Some groups still end in a statistic suffix (e.g. "tc_base_p50"). Strip it so
# all statistics of one feature family collapse into a single group. The
# pattern is anchored at the end of the string, so only a true trailing suffix
# is removed. "count", "len" and "len_logdiff" are not stripped, so e.g.
# "js_hpoly_count" and "js_hpoly_len" remain separate groups.
stat_suffixes = ["mean", "min", "max", "iqr", "p50", "std"]
suffix_pattern = "_(?:" + "|".join(stat_suffixes) + ")$"
imp_df["group"] = imp_df["group"].str.replace(suffix_pattern, "", regex=True)
groups = set(imp_df.group)
groups

{'frst_entropy', 'js_hpoly_count', 'lower_bound_gap_percentage', 'js_char', 'tc_base_p50', 'frst_serial-correlation', 'tc_base_min', '5mer_ent', 'psa_score_ratio', 'psa_gap_len_p50', 'tc_base_std', 'tc_base_mean', '3mer_ent', '3mer_js', 'frst_mean', '9mer_js', 'frst_inv-chi-square', '7mer_js', 'psa_gap_len_std', '9mer_ent', '7mer_ent', '5mer_js', 'psa_gap_len_iqr', 'psa_gap_len_logdiff', 'psa_gap_len_mean', 'tc_base_max', 'js_hpoly_len'}


{'3mer_ent',
 '3mer_js',
 '5mer_ent',
 '5mer_js',
 '7mer_ent',
 '7mer_js',
 '9mer_ent',
 '9mer_js',
 'frst',
 'frst_entropy',
 'frst_inv-chi-square',
 'frst_serial-correlation',
 'js_char',
 'js_hpoly_count',
 'js_hpoly_len',
 'lower_bound_gap_percentage',
 'psa_gap_len',
 'psa_gap_len_logdiff',
 'psa_score_ratio',
 'tc_base'}

In [43]:
# Largest gain of any single feature within each group, strongest group first.
group_max_gain = imp_df.groupby("group").max(numeric_only=True)
group_max_gain.sort_values(by="gain", ascending=False)

Unnamed: 0_level_0,gain
group,Unnamed: 1_level_1
tc_base,598.06831
psa_score_ratio,80.090993
7mer_js,52.22956
9mer_ent,30.016441
5mer_js,24.973104
lower_bound_gap_percentage,17.674184
psa_gap_len,16.433802
7mer_ent,10.609474
9mer_js,10.58922
5mer_ent,6.76551


In [None]:
# Feature classes:
# - tc_base : Transitive consistency of sequence triplets
# - psa_score_ratio : Alignment score scaled by the minimum sequence length
# - kmer_js : Pairwise Jensen-Shannon divergence of kmer distributions
# - kmer_ent : Entropy of kmer distributions
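# - lbgp (lower_bound_gap_percentage) : Lower bound on the percentage of gaps
# - psa_gap (psa_gap_len, psa_gap_len_logdiff) : TODO describe; presumably gap lengths in the pairwise sequence alignments (confirm)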