In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
import os
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

%matplotlib inline

In [None]:
import sys

sys.path.append("./")
sys.path.append("../code/")
sys.path.append("./code/")

import time

In [None]:
from typing import List, Tuple, Union

In [None]:
from loguru import logger

# Data Loader

In [None]:
from bps_numerical.preprocessing import merge_gene_phenotype, standardize_gene_data

In [None]:
# Directory holding the gene-expression and phenotype CSVs. Set the
# BPS_DATA_DIR environment variable to point the notebook at another location;
# the fallback is the directory this notebook was originally written against.
DATA_DIR = Path(
    os.environ.get(
        "BPS_DATA_DIR",
        "/Users/nishparadox/dev/uah/nasa-impact/gene-experiments/data/OneDrive_1_3-21-2022",
    )
)

# Kept as plain strings, as before, for the loader calls further down.
CSV_GENE = str(DATA_DIR / "gen.csv")

CSV_PHENOTYPE = str(DATA_DIR / "meta.csv")

In [None]:
# df_merged = merge_gene_phenotype(standardize_gene_data(CSV_GENE), CSV_PHENOTYPE)

In [None]:
df_genes = standardize_gene_data(CSV_GENE)

In [None]:
df_genes.head()

In [None]:
samples = df_genes.pop("Sample")

In [None]:
df_genes = df_genes.astype(float)

In [None]:
df_genes.iloc[0].dtype

# Feature Selection

In [None]:
from bps_numerical.clustering import CorrelationClusterer
from bps_numerical.feature_selection import FirstFeatureSelector, KRandomizedFeatureSelector

In [None]:
clusterer = CorrelationClusterer(
    list(df_genes.columns),
    cutoff_threshold=0.3,
    debug=False
)

In [None]:
fs = FirstFeatureSelector(clusterer=clusterer)
# fs = KRandomizedFeatureSelector(clusterer=clusterer, k_features=2)

In [None]:
cols_genes = fs.select_features(df_genes)

In [None]:
# cols_genes = list(df_genes.columns)

In [None]:
len(cols_genes)

In [None]:
cols_genes[:10]

In [None]:
# len(df_genes.columns)

# Data Prep

In [None]:
df_merged = merge_gene_phenotype(
    pd.concat([samples, df_genes[cols_genes]], axis=1),
    CSV_PHENOTYPE,
    "Sample",
)

In [None]:
df_merged.shape

In [None]:
df_merged.head()

In [None]:
len(cols_genes)

# Ranker

To further narrow down the gene space, we can train N independent classifiers for a single
phenotype, rank the genes from those runs, and then use only the top-ranked genes when training that phenotype's classifier later.

In [None]:
from bps_numerical.classification.feature_scorers import GeneRanker

In [None]:
ranker = GeneRanker(
    cols_genes, 
    phenotype="condition",
    n_runs = 5,
    debug=True,
)

In [None]:
features = ranker.get_features(df_merged, test_size=0.2, top_k=500, ignore_zeros=True, normalize=True)

In [None]:
len(features)

In [None]:
# Keep only the gene names from the (gene, score) pairs, in their given order.
# A generator is used instead of `zip(*features)` so that an empty `features`
# yields an empty tuple rather than raising, and so that the notebook-level
# name `_` (IPython's "last output" variable) is not overwritten.
fts = tuple(gene for gene, _score in features)
fts

# Trainer

In [None]:
from bps_numerical.classification import SinglePhenotypeClassifier, MultiPhenotypeIsolatedClassifier

In [None]:
# single phenotype

# model = xgboost.XGBClassifier()
clf = SinglePhenotypeClassifier(
    cols_genes=cols_genes,
    phenotype="condition",
#     model = model
)

In [None]:
# tracker_single = clf.train(df_merged)

In [None]:
# tracker_single

# Multiple phenotypes

In [None]:
# Build one SinglePhenotypeClassifier per phenotype name, in a fixed order, and
# bind each to its own notebook-level name for use in later cells.
(
    clf_condition,
    clf_strain,
    clf_gender,
    clf_mission,
    clf_animal_return,
) = [
    SinglePhenotypeClassifier(cols_genes, phenotype_name)
    for phenotype_name in ("condition", "strain", "gender", "mission", "animalreturn")
]

# Hand all five classifiers to a single multi-phenotype trainer.
trainer = MultiPhenotypeIsolatedClassifier(
    classifiers=[clf_condition, clf_strain, clf_gender, clf_mission, clf_animal_return],
    cols_genes=cols_genes,
    debug=True,
)

In [None]:
tracker_multi = trainer.train(df_merged)

In [None]:
tracker_multi

In [None]:
tracker_multi["animalreturn"]

# Feature Scorer

In [None]:
from bps_numerical.classification.feature_scorers import PhenotypeFeatureScorer

In [None]:
# For each classifier held by the trainer, print its phenotype and the number
# of features its scorer returns for top_k=500 with ignore_zeros=True.
# NOTE: the loop variable rebinds the notebook-level name `clf` (assigned
# earlier to a SinglePhenotypeClassifier) and leaves it bound to the last
# element of `trainer.classifiers` once the loop finishes.
for clf in trainer.classifiers:
    print(clf.phenotype, len(PhenotypeFeatureScorer(clf).get_features(top_k=500, ignore_zeros=True)))

In [None]:
# Score the last classifier held by the trainer. This is the object the bare
# name `clf` ends up bound to after the preceding loop over
# `trainer.classifiers`; referencing it explicitly removes the dependence on a
# leaked loop variable. Assumes `trainer.classifiers` supports indexing
# (a list is passed in at construction) -- TODO confirm.
PhenotypeFeatureScorer(trainer.classifiers[-1]).get_features(top_k=500, ignore_zeros=True, normalize=True)

In [None]:
len(PhenotypeFeatureScorer(trainer).get_features(top_k=500, ignore_zeros=False))

In [None]:
# if we ignore 0-score features
len(PhenotypeFeatureScorer(*trainer.classifiers).get_features(top_k=500, ignore_zeros=True))

In [None]:
list(map(lambda f: f[0], PhenotypeFeatureScorer(clf_condition, clf_strain).get_features(top_k=500, ignore_zeros=True, normalize=True)))

### Permutation: feature scores for every combination of two or more phenotypes

In [None]:
import itertools

In [None]:
def compute_permuted_scores(*classifiers, ignore_zeros: bool = True, top_k: int = 500):
    """
    Score features jointly for every combination of two or more classifiers.

    Args:
        *classifiers: Classifier objects. Each must be hashable and expose a
            ``phenotype`` attribute, which is used to label the combination.
            Repeated objects are considered only once.
        ignore_zeros: Forwarded to ``PhenotypeFeatureScorer.get_features``.
        top_k: Forwarded to ``PhenotypeFeatureScorer.get_features``.

    Returns:
        A dict mapping a tuple of phenotype labels (in the order the
        classifiers were passed) to the value ``get_features`` returns for
        that combination, always requested with ``normalize=True``.
        Fewer than two distinct classifiers gives an empty dict.
    """
    # De-duplicate while keeping the caller's order, so labels are deterministic.
    unique_classifiers = list(dict.fromkeys(classifiers))

    res = {}
    for size in range(2, len(unique_classifiers) + 1):
        for objs in itertools.combinations(unique_classifiers, size):
            labels = tuple(clf.phenotype for clf in objs)
            res[labels] = PhenotypeFeatureScorer(*objs).get_features(
                top_k=top_k,
                ignore_zeros=ignore_zeros,  # honour the caller's choice
                normalize=True,
            )
    return res

In [None]:
permuted_ =  compute_permuted_scores(*trainer.classifiers, top_k=1000)

In [None]:
dict(map(lambda p: (p[0], (len(p[1]), p[1])), permuted_.items()))

In [None]:
dict(map(lambda p: (p[0], len(p[1])), permuted_.items()))

# Plot Top features

In [None]:
def plot_features(features: List[Tuple[str, float]], view_slicer:int = 75):
    """
    Show a horizontal plotly bar chart of the leading entries of ``features``.

    Args:
        features: ``(gene, importance)`` pairs, e.g. the output of
            ``PhenotypeFeatureScorer.get_features``. Entries are plotted in
            the order given.
        view_slicer: Maximum number of leading entries of ``features`` to plot.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    df_top_k = pd.DataFrame(features[:view_slicer], columns=["gene", "importance"])
    fig = px.bar(
        df_top_k,
        x="importance",
        y="gene",
        # Report the number of bars actually drawn: ``features`` may contain
        # fewer than ``view_slicer`` entries.
        title=f"{len(df_top_k)} features",
        orientation="h",
        height=1600,
        width=1000,
    )
    # Small tick labels so that many gene names fit on the y axis.
    fig.update_layout(yaxis=dict(tickfont=dict(size=7)))
    fig.show()

In [None]:
plot_features(
    PhenotypeFeatureScorer(clf_mission, clf_strain).get_features(top_k=500, ignore_zeros=True, normalize=True)
)