In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports all loaded
# modules before each cell runs, so edits to local packages are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2
import os
from pathlib import Path
import numpy as np
import pandas as pd
from aldiscore.prediction import utils
from aldiscore import ROOT, RSTATE
from aldiscore.constants.constants import STAT_SEP

In [8]:
# Directory handed to `utils.load_features`. The default is the original
# cluster location; set the ALDISCORE_DATA_DIR environment variable to run the
# notebook against a copy of the data on another machine.
data_dir = Path(
    os.environ.get("ALDISCORE_DATA_DIR", "/hits/fast/cme/bodynems/data/paper")
)
feat_df, drop_df, label_df = utils.load_features(
    data_dir,
    exclude_features=["is_dna", "num_seqs", "seq_length"],
)

# Report the shape of each returned frame (features, dropped columns, labels).
print(feat_df.shape)
print(drop_df.shape)
print(label_df.shape)

Dropping 0 NaN rows...
(11431, 487)
(11431, 21)
(11431, 1)


In [9]:
# List every feature column whose name contains "ratio".
is_ratio_col = feat_df.columns.str.contains("ratio")
feat_df.columns[is_ratio_col]

Index(['min.psa_score_ratio', 'max.psa_score_ratio', 'mean.psa_score_ratio',
       'std.psa_score_ratio', 'p1.psa_score_ratio', 'p5.psa_score_ratio',
       'p10.psa_score_ratio', 'p20.psa_score_ratio', 'p30.psa_score_ratio',
       'p40.psa_score_ratio', 'p50.psa_score_ratio', 'p60.psa_score_ratio',
       'p70.psa_score_ratio', 'p80.psa_score_ratio', 'p90.psa_score_ratio',
       'p95.psa_score_ratio', 'p99.psa_score_ratio', 'iqr.psa_score_ratio',
       'min.psa_gap_ratio', 'max.psa_gap_ratio', 'mean.psa_gap_ratio',
       'std.psa_gap_ratio', 'p1.psa_gap_ratio', 'p5.psa_gap_ratio',
       'p10.psa_gap_ratio', 'p20.psa_gap_ratio', 'p30.psa_gap_ratio',
       'p40.psa_gap_ratio', 'p50.psa_gap_ratio', 'p60.psa_gap_ratio',
       'p70.psa_gap_ratio', 'p80.psa_gap_ratio', 'p90.psa_gap_ratio',
       'p95.psa_gap_ratio', 'p99.psa_gap_ratio', 'iqr.psa_gap_ratio',
       'min.psa_stretch_ratio', 'max.psa_stretch_ratio',
       'mean.psa_stretch_ratio', 'std.psa_stretch_ratio',
       'p1.

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples, then split that hold-out set evenly into test
# and validation indices. Both splits use the same seed (RSTATE).
all_idxs = feat_df.index.to_list()
train_idxs, holdout_idxs = train_test_split(
    all_idxs, test_size=0.2, random_state=RSTATE
)
test_idxs, valid_idxs = train_test_split(
    holdout_idxs, test_size=0.5, random_state=RSTATE
)
print(len(train_idxs), len(test_idxs), len(valid_idxs))

# The label is the first (and, per the loaded shape, only) column of label_df.
labels = label_df.iloc[:, 0]
X_train, y_train = feat_df.loc[train_idxs], labels.loc[train_idxs]
X_test, y_test = feat_df.loc[test_idxs], labels.loc[test_idxs]
X_valid, y_valid = feat_df.loc[valid_idxs], labels.loc[valid_idxs]

9144 1143 1144


In [11]:
from aldiscore.prediction.predictor import DifficultyPredictor

# Load the "latest" predictor and pull out its underlying model object.
model = DifficultyPredictor("latest").model

# Restrict the feature frame to the model's features, in the model's order.
model_features = model.feature_name()
feat_df = feat_df.loc[:, model_features]

# Gain-based importance, one value per model feature.
imps = model.feature_importance("gain")
imps

array([8.04534223e+00, 1.32410544e+00, 2.91905373e+00, 4.41585143e+00,
       2.77631243e+00, 1.48028450e+00, 2.48661195e+00, 2.69275846e+00,
       9.92216755e-01, 2.29292718e+00, 3.08267916e+00, 5.19993021e+00,
       1.35914026e+00, 1.96757552e+00, 1.31220019e+00, 2.21476764e+00,
       2.99080263e+00, 2.38292828e+00, 1.54994804e+00, 6.66119392e-01,
       1.86736148e+00, 1.23487106e+00, 7.56729780e-01, 6.65402227e-01,
       5.42992173e-01, 1.73968777e+00, 8.64870856e-01, 8.86101298e-01,
       1.36270865e+00, 1.06751265e+00, 1.37879700e+00, 1.54732912e+00,
       1.05967826e+00, 2.16808726e+00, 1.33269557e+00, 1.06390969e+00,
       1.77544471e+00, 7.51473835e-01, 1.06831909e+00, 1.23074635e+00,
       1.02535617e+00, 1.36056692e+00, 2.36909120e+00, 1.84759438e+00,
       1.44800012e+00, 1.69931730e+00, 1.30125230e+00, 8.55565760e-01,
       1.32342933e+00, 7.49930657e-01, 1.15022684e+00, 1.44723788e+00,
       9.87387262e-01, 1.01165321e+00, 2.73960940e+00, 1.78342388e+00,
      

In [13]:
# Sub-statistic suffixes that are stripped from the end of a feature name to
# obtain its group (e.g. "psa_gap_len_mean" -> "psa_gap_len").
# "count", "len" and "len_logdiff" are deliberately NOT stripped.
SUBSTAT_SUFFIXES = ["mean", "min", "max", "iqr", "p50", "std"]

# One row per model feature: its name and its gain importance.
imp_df = pd.DataFrame({"name": model.feature_name(), "gain": imps})

# Feature names have the form "<stat>.<feature>" (e.g. "p1.psa_gap_len_mean");
# the group starts out as everything after the last ".".
# NOTE(review): STAT_SEP is imported but unused; presumably it is this
# separator. Confirm before swapping it in.
imp_df["group"] = imp_df["name"].str.rsplit(".", n=1).str[-1]
print(set(imp_df["group"]))

# Remove a single trailing "_<suffix>". The pattern is anchored at the end so
# that a suffix occurring in the middle of a name never truncates it.
suffix_pattern = "_(?:" + "|".join(SUBSTAT_SUFFIXES) + ")$"
imp_df["group"] = imp_df["group"].str.replace(suffix_pattern, "", regex=True)

groups = set(imp_df["group"])
groups

{'psa_gap_len_mean', 'js_hpoly_len', 'js_hpoly_count', '11mer_js', 'psa_gap_len_iqr', 'tc_base_min', 'psa_stretch_ratio', 'lbgp', 'psa_score_ratio', 'tc_base_std', '7mer_ent', '9mer_ent', '7mer_js', 'tc_base_mean', 'tc_base_p50', '11mer_ent', 'psa_gap_len_std', 'psa_gap_ratio', 'js_char', '9mer_js', '5mer_ent', 'tc_base_max', '5mer_js', 'psa_gap_len_p50'}


{'11mer_ent',
 '11mer_js',
 '5mer_ent',
 '5mer_js',
 '7mer_ent',
 '7mer_js',
 '9mer_ent',
 '9mer_js',
 'js_char',
 'js_hpoly_count',
 'js_hpoly_len',
 'lbgp',
 'psa_gap_len',
 'psa_gap_ratio',
 'psa_score_ratio',
 'psa_stretch_ratio',
 'tc_base'}

In [None]:
# Top 20 gap-length features, ordered by decreasing gain.
is_gap_len = imp_df["name"].str.contains("gap_len")
gap_len_imp = imp_df[is_gap_len]
gap_len_imp.sort_values(by="gain", ascending=False).head(20)

Unnamed: 0,name,gain,group
203,p1.psa_gap_len_mean,11.134367,psa_gap_len
201,mean.psa_gap_len_mean,10.507347,psa_gap_len
240,p5.psa_gap_len_std,8.906027,psa_gap_len
210,p60.psa_gap_len_mean,7.464849,psa_gap_len
241,p10.psa_gap_len_std,7.07862,psa_gap_len
236,max.psa_gap_len_std,6.558598,psa_gap_len
239,p1.psa_gap_len_std,6.442121,psa_gap_len
211,p70.psa_gap_len_mean,6.2028,psa_gap_len
212,p80.psa_gap_len_mean,6.015574,psa_gap_len
209,p50.psa_gap_len_mean,5.79002,psa_gap_len


In [None]:
T_GAIN = "gain (%)"
gain_cols = [T_GAIN]

# Per feature group: summed gain and number of features in the group,
# ordered from most to least important.
stat_df = (
    imp_df.groupby("group")["gain"]
    .agg(["sum", "count"])
    .rename(columns={"sum": T_GAIN, "count": "feature count"})
    .sort_values(T_GAIN, ascending=False)
)

# Convert the summed gain into a percentage of the total gain.
gain_share = stat_df[gain_cols] / stat_df[gain_cols].sum(axis=0)
stat_df[gain_cols] = gain_share.round(4) * 100
stat_df

Unnamed: 0_level_0,importance by gain (%),feature count
group,Unnamed: 1_level_1,Unnamed: 2_level_1
tc_base,71.14,90
psa_score_ratio,6.2,18
psa_gap_ratio,4.52,18
11mer_js,4.36,18
psa_gap_len,3.81,72
9mer_js,2.88,18
5mer_js,1.04,18
9mer_ent,0.99,18
11mer_ent,0.94,18
js_char,0.8,18


In [70]:
# Translate internal group identifiers into the labels used in the table.
name_map = {"tc_base": "psa_tc"}
# k-mer groups: "<k>mer_<kind>" -> "<k>-mer_<kind>"
name_map.update(
    {
        f"{k}mer_{kind}": f"{k}-mer_{kind}"
        for k in (5, 7, 9, 11)
        for kind in ("ent", "js")
    }
)
name_map.update(
    js_char="char_js",
    ent_char="char_ent",
    js_hpoly_len="hpoly_js_len",
    js_hpoly_count="hpoly_js_count",
)
stat_df = stat_df.rename(index=name_map)

In [79]:
# Human-readable description for each feature group, keyed by the group label
# as it appears in stat_df's index (e.g. "psa_tc", "11-mer_js").
KMER_JS_DESC = "Pairwise Jensen-Shannon divergence of k-mer distributions."
KMER_ENT_DESC = "Shannon entropy of k-mer distributions."
GROUP_DESCRIPTIONS = {
    "psa_tc": "Transitive consistency of PSA triplets.",
    "psa_score_ratio": "Alignment score scaled by the minimum sequence length.",
    "psa_gap_ratio": "Number of gaps divided by total number of characters.",
    "psa_gap_len": "Features based on the lengths of gap regions.",
    "psa_stretch_ratio": "Ratio between max. sequence length and alignment length.",
    "char_js": "Pairwise Jensen-Shannon divergence of character distributions.",
    "hpoly_js_len": (
        "Pairwise Jensen-Shannon divergence of homopolymer length distributions."
    ),
    "hpoly_js_count": (
        "Pairwise Jensen-Shannon divergence of homopolymer count distributions."
    ),
    "lbgp": "Lower bound on the gap percentage (1 - mean_len/max_len).",
    **{f"{k}-mer_js": KMER_JS_DESC for k in (5, 7, 9, 11)},
    **{f"{k}-mer_ent": KMER_ENT_DESC for k in (5, 9, 11, 7)},
}

# Look descriptions up per existing index label instead of assigning with
# `.at`: `.at` on a label that is not in the index would silently append a new
# all-NaN row to the table. Groups without an entry get an empty description.
stat_df["description"] = [
    GROUP_DESCRIPTIONS.get(group, "") for group in stat_df.index
]
print(stat_df.to_latex(escape=True, float_format="%.2f"))

\begin{tabular}{lrrl}
\toprule
 & importance by gain (\%) & feature count & description \\
group &  &  &  \\
\midrule
psa\_tc & 71.14 & 90 & Transitive consistency of PSA triplets. \\
psa\_score\_ratio & 6.20 & 18 & Alignment score scaled by the minimum sequence length. \\
psa\_gap\_ratio & 4.52 & 18 & Number of gaps divided by total number of characters. \\
11-mer\_js & 4.36 & 18 & Pairwise Jensen-Shannon divergence of k-mer distributions. \\
psa\_gap\_len & 3.81 & 72 & Features based on the lengths of gap regions. \\
9-mer\_js & 2.88 & 18 & Pairwise Jensen-Shannon divergence of k-mer distributions. \\
5-mer\_js & 1.04 & 18 & Pairwise Jensen-Shannon divergence of k-mer distributions. \\
9-mer\_ent & 0.99 & 18 & Shannon entropy of k-mer distributions. \\
11-mer\_ent & 0.94 & 18 & Shannon entropy of k-mer distributions. \\
char\_js & 0.80 & 18 & Pairwise Jensen-Shannon divergence of character distributions. \\
psa\_stretch\_ratio & 0.77 & 18 & Ratio between max. sequence length and alig

In [None]:
# Feature classes:
# - tc_base : Transitive consistency of PSA triplets
# - psa_score_ratio : Alignment score scaled by the minimum sequence length
# - kmer_js : Pairwise Jensen-Shannon divergence of k-mer distributions
# - kmer_ent : Shannon entropy of k-mer distributions
# - lbgp : Lower bound on the percentage of gaps
# - psa_gap : Gap-based PSA features (gap ratio and lengths of gap regions)