In [52]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

import rna_utils as ru

# Input files, given relative to the notebook's working directory.
DIST_PATH = "../../data/rnadist_f_all.h5"
META_PATH = "../../data/sources/ArchiveII_with_prob_and_motiv.csv"

# Maximum number of sequences kept per family; also embedded in output names.
threshold = 100

# NOTE(review): absolute, machine-specific output locations — consider making
# them relative to a configurable data directory.
SAVE_PATH_ORIG = "/home/gkulemeyer/Documents/Repos/RNA-analysis/DataAnalysis/data/filtered_dataset/"

# Per-threshold sub-directory under SAVE_PATH_ORIG (which already ends in "/").
SAVE_PATH_FILTERED = f"{SAVE_PATH_ORIG}samples/n_{threshold}"

In [53]:
# NOTE(review): `dist_PATH` differs from the config constant `DIST_PATH` only by
# case and is not read by any cell visible here — confirm a later cell needs it.
dist_PATH = f"../../data/samples_filtered{threshold}.h5"
# Load from DIST_PATH / META_PATH via the project helper `ru.load_and_align`;
# presumably both results are aligned on the sequence id — TODO confirm.
dist, meta = ru.load_and_align(corr_path=DIST_PATH, meta_path=META_PATH)
# Preview the first two metadata rows.
meta.head(2)

Unnamed: 0_level_0,sequence,structure,base_pairs,len,motivos,fam,pseudo_probe,stem
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
16s_A.fulgidus_domain2,UUUAUUGGGCCUAAAGCGUCCGUAGCCGGGCUGGUAAGUCCUCCGG...,.......(((<<...(.((((.(.(((.(((((((.((((((((((...,"[[8, 329], [9, 328], [10, 327], [11, 312], [12...",359,EEEEEEESSSBBBBBSMSSSSISBSSSMSSSSSSSBSSSSSSSSSS...,16s,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, ..."
16s_A.fulgidus_domain3,AAGGAAUUGGCGGGGGAGCACUACAACGGGUGGAGCCUGCGGUUUA...,.......(((((.(((((((..((..((((((.((((((((((......,"[[8, 487], [9, 486], [10, 484], [11, 483], [12...",488,EEEEEEESSSSSBSSSSSSSIISSMMSSSSSSISSSSSSSSSSMMM...,16s,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, ..."


In [54]:
import numpy as np

# Cap every family at `threshold` sequences; families at or below the cap are
# kept whole. The draw is seeded so that the subset — and every split exported
# from it — is identical on each Restart & Run All. The seed matches the
# random_state that filter_ids passes to train_test_split.
SAMPLE_SEED = 42

meta_ids = []
for fam in meta.fam.unique():
    meta_fam = meta[meta["fam"] == fam]
    if len(meta_fam) > threshold:
        meta_fam = meta_fam.sample(threshold, random_state=SAMPLE_SEED)
    meta_ids.append(meta_fam.index)

# Flatten the per-family id lists and pull the matching metadata rows.
all_meta_ids = np.concatenate(meta_ids)
filtered = meta.loc[all_meta_ids]
filtered.shape

(690, 8)

In [55]:
# Partition table for one leave-one-family-out fold: train / valid are drawn
# from the subsampled metadata, test is the complete held-out family from the
# full metadata. Saved downstream as id / fold / partition.
def filter_ids(full, filtered, fam, split_type):
    """Build the train / valid / test assignment for the fold that holds out `fam`.

    Parameters
    ----------
    full : pandas.DataFrame
        Complete metadata with a ``fam`` column. Its index must become an
        ``id`` column on ``reset_index()`` (i.e. the index is named ``id``).
    filtered : pandas.DataFrame
        Subsampled metadata with the same layout as `full`; the source of the
        train and valid rows.
    fam : str
        Family to hold out. Must be a key of the internal family-to-number map.
    split_type : str
        Label written unchanged into the ``split_type`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``id`` with columns ``split_type``, ``fold_name``,
        ``fold_number`` and ``partition``. Rows are ordered train, valid, test.

    Raises
    ------
    KeyError
        If `fam` has no entry in the family-to-number map.
    """
    save_cols = ["split_type", "fold_name", "fold_number", "partition"]
    cols = ["id", "fold_name", "partition"]
    fam_num = {
        "grp1": 0,
        "tmRNA": 1,
        "tRNA": 2,
        "5s": 3,
        "srp": 4,
        "telomerase": 5,
        "RNaseP": 6,
        "16s": 7,
        "23s": 8,
    }
    # Resolve the fold number up front so an unknown family fails before any
    # splitting work is done.
    fold_number = fam_num[fam]

    # Train / valid: every subsampled row that is NOT in the held-out family,
    # split 80/20 and stratified by family. reset_index() already returns a new
    # frame, so the caller's data is never modified.
    candidates = filtered.reset_index()
    candidates = candidates[candidates["fam"] != fam]
    train, valid = train_test_split(
        candidates, test_size=0.2, random_state=42, stratify=candidates["fam"]
    )
    # .assign returns new frames instead of writing columns onto slices.
    train = train.assign(fold_name=fam, partition="train")
    valid = valid.assign(fold_name=fam, partition="valid")

    # Test: all rows of the held-out family, taken from the full metadata.
    held_out = full.reset_index()
    held_out = held_out.loc[held_out["fam"] == fam, ["id", "fam"]].assign(
        fold_name=fam, partition="test"
    )

    # Stack the three partitions and attach the fold-level labels.
    df = pd.concat([train[cols], valid[cols], held_out[cols]], ignore_index=True)
    df = df.assign(split_type=split_type, fold_number=fold_number).set_index("id")
    return df[save_cols]

In [56]:
# One partition table per held-out family, stacked into a single frame.
df_filter_list = []
for f in meta.fam.unique():
    df_filter_list.append(filter_ids(meta, filtered, f, f"ff-samples_{threshold}"))
dfs_f = pd.concat(df_filter_list)

# Make sure the target directory exists, then write the table. os.path.join
# avoids the doubled "/" that plain concatenation produced, because
# SAVE_PATH_ORIG already ends with a separator.
os.makedirs(SAVE_PATH_ORIG, exist_ok=True)
dfs_f.to_csv(
    os.path.join(SAVE_PATH_ORIG, f"ArchiveII_samples_{threshold}.csv"), index=True
)

# Last expression of the cell, so a preview of the table is actually displayed.
dfs_f.head()

In [57]:


# Export one CSV per (held-out family, partition) with the sequence data of the
# ids assigned to it. The target directory name depends on `threshold`, so it is
# created here; otherwise every new threshold value would make to_csv fail.
os.makedirs(SAVE_PATH_FILTERED, exist_ok=True)

for f in meta.fam.unique():
    for partition in ["train", "valid", "test"]:
        print(f, partition)

        # Ids assigned to this fold / partition in the partition table.
        # `@` references pass the values as variables instead of quoting them
        # into the query string.
        ids = dfs_f.query("fold_name == @f and partition == @partition").index

        # Nothing assigned to this combination: report it and move on.
        if len(ids) == 0:
            print("No hay datos para esta combinación.")
            continue

        # Metadata rows for those ids, restricted to the exported columns.
        subset = meta.loc[
            meta.index.intersection(ids),
            ["sequence", "structure", "base_pairs", "len"],
        ]

        # Skip the write when none of the ids is present in `meta`.
        if not subset.empty:
            subset.to_csv(
                os.path.join(SAVE_PATH_FILTERED, f"{partition}_{f}.csv"),
                index=True,
            )

16s train
16s valid
16s test
23s train
23s valid
23s test
5s train
5s valid
5s test
RNaseP train
RNaseP valid
RNaseP test
grp1 train
grp1 valid
grp1 test
srp train
srp valid
srp test
tRNA train
tRNA valid
tRNA test
telomerase train
telomerase valid
telomerase test
tmRNA train
tmRNA valid
tmRNA test
