In [10]:
from conservation_scores_dataset import (
    get_available_conservation_score_versions,
    get_available_conservation_scores
)
from tqdm.auto import tqdm
from conservation_scores_dataset import load_conservation_scores
from epigenomic_dataset import load_epigenomes
from tsnecuda import TSNE
from sklearn.impute import KNNImputer
import pandas as pd
from cache_decorator import Cache

In [2]:
# Genome assembly and dataset shared by every embedding computed below.
assembly = "hg38"
dataset = "fantom"
# Regulatory region types to embed, one sweep per type.
regions = [
    "enhancers",
    "promoters",
]
# Window sizes are the powers of two from 2**6 (64) up to 2**10 (1024).
window_sizes = [2 ** exponent for exponent in range(6, 11)]

In [12]:
@Cache(
    cache_path=[
        "cache/tsne_embeddings/{assembly}/{dataset}/{conservation_score}/{conservation_score_version}/{region}/{window_size}.csv.xz",
        "cache/filtered_labels/{assembly}/{dataset}/{conservation_score}/{conservation_score_version}/{region}/{window_size}.csv.xz",
    ],
    args_to_ignore=["y"]
)
def compute_embedding(
    assembly: str,
    dataset: str,
    conservation_score: str,
    conservation_score_version: str,
    region: str,
    window_size: int,
    y: pd.DataFrame
):
    """Return the t-SNE embedding of the conservation scores and the matching labels.

    The conservation scores are loaded with `load_conservation_scores`, the
    rows containing exclusively NaN values are dropped (from both the scores
    and the labels), the remaining NaN values are imputed with a default
    `KNNImputer` and the result is embedded with a default `TSNE`.

    The two returned objects are stored by the `Cache` decorator at the two
    paths listed in `cache_path`; `y` is listed in `args_to_ignore`, so it is
    presumably excluded from the cache key (verify against the
    `cache_decorator` documentation).

    Parameters
    ----------
    assembly: str
        Genomic assembly, forwarded to `load_conservation_scores`.
    dataset: str
        Dataset name, forwarded to `load_conservation_scores`.
    conservation_score: str
        Conservation score to load.
    conservation_score_version: str
        Version of the conservation score to load.
    region: str
        Region type, forwarded to `load_conservation_scores`.
    window_size: int
        Window size, forwarded to `load_conservation_scores`.
    y: pd.DataFrame
        Labels of the regions. Assumed to share the index of the loaded
        conservation scores -- TODO confirm against `load_epigenomes`.

    Returns
    -------
    Tuple with the t-SNE embedding (a DataFrame indexed like the filtered
    conservation scores) and the labels restricted to the same rows.
    """
    # Retrieve the conservation scores
    X = load_conservation_scores(
        assembly=assembly,
        dataset=dataset,
        conservation_scores=conservation_score,
        conservation_score_version=conservation_score_version,
        region=region,
        window_size=window_size
    )
    # Identify rows with exclusively NaN values
    all_nan_values_mask = X.isna().all(axis=1)
    # Drop the rows with only NaN values
    X_without_nan_rows = X[~all_nan_values_mask]
    # Filter the labels (not the scores) with the same mask, so that they stay
    # aligned row by row with the embedding. The boolean Series is aligned on
    # the index: pandas raises if `y` and `X` do not share it.
    y_without_nan_rows = y[~all_nan_values_mask]
    # Impute the remaining NaN values on the filtered rows only, so that the
    # number of embedded rows matches the number of filtered labels.
    imputed_values = KNNImputer().fit_transform(X_without_nan_rows.values)
    # Compute the TSNE embedding. It is wrapped in a DataFrame to keep the
    # region index next to the coordinates.
    # NOTE(review): assumes the `.csv.xz` cache backend stores DataFrames and
    # cannot serialise a bare numpy array -- confirm.
    tsne_embedding = pd.DataFrame(
        TSNE().fit_transform(imputed_values),
        index=X_without_nan_rows.index
    )
    # Return the computed embedding and the filtered labels
    return (tsne_embedding, y_without_nan_rows)

In [13]:
# Sweep over every combination of region, window size, conservation score and
# conservation score version, computing the t-SNE embedding of each one.
for region in tqdm(
    regions,
    leave=False,
    desc="Region"
):
    for window_size in tqdm(
        window_sizes,
        leave=False,
        desc="Window size"
    ):
        # The labels depend only on the region and the window size, so they
        # are loaded once here and reused for every conservation score.
        # Only the second returned element is used (presumably the labels).
        _, y = load_epigenomes(
            assembly=assembly,
            dataset=dataset,
            region=region,
            window_size=window_size,
        )
        for conservation_score in tqdm(
            get_available_conservation_scores(),
            leave=False,
            desc="Conservation score"
        ):
            for conservation_score_version in tqdm(
                get_available_conservation_score_versions(),
                leave=False,
                desc="Conservation score versions"
            ):
                # `compute_embedding` is wrapped by `Cache`: calling it here
                # is what populates the on-disk cache for this combination.
                tsne_embedding, y_without_nan_rows = compute_embedding(
                    assembly,
                    dataset,
                    conservation_score,
                    conservation_score_version,
                    region,
                    window_size,
                    y
                )

Region:   0%|          | 0/2 [00:00<?, ?it/s]

Window size:   0%|          | 0/5 [00:00<?, ?it/s]

Conservation score:   0%|          | 0/2 [00:00<?, ?it/s]

Conservation score versions:   0%|          | 0/6 [00:00<?, ?it/s]

ValueError: could not convert string to float: '0.10.1'