In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

**Sampling data**

In [None]:
# Load the combined limma_df CSV (its file name lists chr1 through chr22).
df = pd.read_csv(
    filepath_or_buffer="../data/limma_df_chr1_chr22_chr2_chr21_chr3_chr20_chr4_chr19_chr5_chr18_chr6_chr17_chr7_chr16_chr8_chr15_chr9_chr14_chr10_chr13_chr11_chr12.csv"
)

In [None]:
# Draw a reproducible random sample of 10000 rows (seed 123) from the loaded
# table and save it as a smaller CSV, omitting the index column.
df_subsample = df.sample(random_state=123, n=10000)
df_subsample.to_csv(path_or_buf="../data/data_test.csv", index=False)

**LBDfc**

In [9]:
# Load the LBDfc matrix (first CSV column becomes the row index), keep a
# reproducible random sample of 10000 rows (seed 123), and transpose so the
# original columns become rows.
# NOTE(review): the subsample is flagged as "for testing" -- remove the
# `.sample(...)` step when the full matrix is wanted.
df = (
    pd.read_csv("../data/LBDfcFilteredMyNorm.csv", index_col=0)
    .sample(n=10000, random_state=123)
    .transpose()
)

# Drop a row labelled "Unnamed: 0" if one is present after the transpose.
if df.index.isin(["Unnamed: 0"]).any():
    df = df.drop(index="Unnamed: 0")

def extraer_categoria(nombre):
    """Return the group label encoded as the prefix of a sample name.

    The prefixes are tried in order, so "PDD" is tested before the shorter
    "PD" that it also starts with.

    Args:
        nombre: Sample name (string).

    Returns:
        One of "CTRL", "PDD", "PD", "DLB", or None when no prefix matches.
    """
    for prefijo in ("CTRL", "PDD", "PD", "DLB"):
        if nombre.startswith(prefijo):
            return prefijo
    return None

# Label every row with the group derived from its index name.
df["Categoria"] = df.index.to_series().map(extraer_categoria)

# Warn when at least one row name matched none of the known prefixes.
if df["Categoria"].isna().any():
    print("Algunos nombres de fila no pudieron clasificarse en una categoría.")

# Target is the label column; features are every other column.
y = df["Categoria"]
X = df.drop(columns="Categoria")

# 70/30 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123, stratify=y
)

# Show which rows ended up in each split.
print("Índices del conjunto de entrenamiento:")
print(X_train.index.tolist())
print("\nÍndices del conjunto de prueba:")
print(X_test.index.tolist())

# Transpose each split back before saving, so the split rows become columns.
X_train = X_train.T
X_test = X_test.T

X_train.to_csv("../data/LBDfcFilteredMyNorm_train.csv")
X_test.to_csv("../data/LBDfcFilteredMyNorm_test.csv")

# Sample sheet: write one sheet per split, with its rows in the same order as
# the columns of the corresponding matrix.

df_sheet = pd.read_csv("../data/LBDfcSamplesheet.csv")

train_indices = X_train.columns.tolist()
test_indices = X_test.columns.tolist()

# Keep only the sheet rows that belong to the split, then order them like the
# matrix columns. Reindexing against an Index object makes the result adopt
# that Index's name instead of keeping "Sample_Name", so reindex against the
# plain list and pin the axis name explicitly; reset_index() then emits a
# "Sample_Name" column rather than a generic "index" column.
df_sheet_train = (
    df_sheet[df_sheet["Sample_Name"].isin(train_indices)]
    .set_index("Sample_Name")
    .reindex(train_indices)
    .rename_axis("Sample_Name")
    .reset_index()
)

df_sheet_test = (
    df_sheet[df_sheet["Sample_Name"].isin(test_indices)]
    .set_index("Sample_Name")
    .reindex(test_indices)
    .rename_axis("Sample_Name")
    .reset_index()
)

df_sheet_train.to_csv("../data/LBDfcSamplesheet_train.csv", index=False)
df_sheet_test.to_csv("../data/LBDfcSamplesheet_test.csv", index=False)

Índices del conjunto de entrenamiento:
['PDD38', 'CTRL61', 'PDD23', 'PDD21', 'PD29', 'PD5', 'CTRL88', 'PD41', 'PDD39', 'PD11', 'PDD26', 'PDD25', 'PD1', 'CTRL24', 'PD55', 'CTRL15', 'CTRL63', 'PDD60', 'PDD51', 'PDD43', 'PD14', 'PDD5', 'PD37', 'CTRL80', 'PDD74', 'CTRL62', 'PD3', 'CTRL39', 'CTRL97', 'PD47', 'DLB17', 'DLB2', 'CTRL35', 'PD8', 'CTRL51', 'CTRL77', 'CTRL68', 'PDD24', 'CTRL19', 'PDD13', 'CTRL43', 'DLB11', 'PDD58', 'CTRL42', 'PDD53', 'PD40', 'PDD44', 'PD6', 'CTRL1', 'PD35', 'PDD41', 'PDD70', 'PD59', 'PDD28', 'CTRL78', 'CTRL64', 'PD48', 'PD21', 'CTRL95', 'CTRL57', 'PDD30', 'PDD22', 'DLB15', 'PD50', 'CTRL29', 'PDD54', 'PDD27', 'DLB24', 'PD23', 'PD30', 'PDD15', 'CTRL50', 'CTRL93', 'PDD35', 'CTRL73', 'PDD6', 'CTRL13', 'PDD73', 'CTRL81', 'CTRL33', 'PDD49', 'PD27', 'PDD4', 'PD58', 'CTRL85', 'PD52', 'PD60', 'PD61', 'CTRL11', 'CTRL40', 'DLB35', 'PD9', 'CTRL79', 'PDD61', 'PD56', 'CTRL55', 'CTRL37', 'CTRL82', 'CTRL32', 'PD2', 'PD54', 'CTRL48', 'PD49', 'PDD69', 'PD45', 'CTRL12', 'PD33', 'PD

**MSAcrbl**

In [None]:
# Load the MSAcrbl matrix (first CSV column becomes the row index) and
# transpose it so the original columns become rows.
df = pd.read_csv("../data/MSAcrbl/MSAcrblMyNorm.csv", index_col=0).transpose()

# Drop a row labelled "Unnamed: 0" if one is present after the transpose.
if df.index.isin(["Unnamed: 0"]).any():
    df = df.drop(index="Unnamed: 0")

def extraer_categoria(nombre):
    """Return the group label encoded as the prefix of a sample name.

    The more specific "MSA_OPCA" and "MSA_SND" prefixes are tested before the
    generic "MSA" prefix they share.

    Args:
        nombre: Sample name (string).

    Returns:
        One of "CTRL", "MSA_OPCA", "MSA_SND", "MSA", or None when no prefix
        matches.
    """
    prefijos = ("CTRL", "MSA_OPCA", "MSA_SND", "MSA")
    return next((p for p in prefijos if nombre.startswith(p)), None)

# Label every row with the group derived from its index name.
df["Categoria"] = df.index.to_series().map(extraer_categoria)

# Warn when at least one row name matched none of the known prefixes.
if df["Categoria"].isna().any():
    print("Algunos nombres de fila no pudieron clasificarse en una categoría.")

# Target is the label column; features are every other column.
y = df["Categoria"]
X = df.drop(columns="Categoria")

# 70/30 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123, stratify=y
)

# Show which rows ended up in each split.
print("Índices del conjunto de entrenamiento:")
print(X_train.index.tolist())
print("\nÍndices del conjunto de prueba:")
print(X_test.index.tolist())

# Transpose each split back and save it, so the split rows become columns.
X_train = X_train.transpose()
X_train.to_csv("../data/MSAcrbl/MSAcrblMyNorm_train.csv")

X_test = X_test.transpose()
X_test.to_csv("../data/MSAcrbl/MSAcrblMyNorm_test.csv")

# Sample sheet: write one sheet per split, with its rows in the same order as
# the columns of the corresponding matrix.

df_sheet = pd.read_csv("../data/MSAcrblSamplesheet.csv")

train_indices = X_train.columns.tolist()
test_indices = X_test.columns.tolist()

# Keep only the sheet rows that belong to the split, then order them like the
# matrix columns. Reindexing against an Index object makes the result adopt
# that Index's name instead of keeping "Sample_Name", so reindex against the
# plain list and pin the axis name explicitly; reset_index() then emits a
# "Sample_Name" column rather than a generic "index" column.
df_sheet_train = (
    df_sheet[df_sheet["Sample_Name"].isin(train_indices)]
    .set_index("Sample_Name")
    .reindex(train_indices)
    .rename_axis("Sample_Name")
    .reset_index()
)

df_sheet_test = (
    df_sheet[df_sheet["Sample_Name"].isin(test_indices)]
    .set_index("Sample_Name")
    .reindex(test_indices)
    .rename_axis("Sample_Name")
    .reset_index()
)

df_sheet_train.to_csv("../data/MSAcrblSamplesheet_train.csv", index=False)
df_sheet_test.to_csv("../data/MSAcrblSamplesheet_test.csv", index=False)

**MSApfc**

In [None]:
# Load the MSApfc matrix (first CSV column becomes the row index) and
# transpose it so the original columns become rows.
df = pd.read_csv("../data/MSApfc/MSApfcMyNorm.csv", index_col=0).transpose()

# Drop a row labelled "Unnamed: 0" if one is present after the transpose.
if df.index.isin(["Unnamed: 0"]).any():
    df = df.drop(index="Unnamed: 0")

def extraer_categoria(nombre):
    """Return the group label encoded as the prefix of a sample name.

    Surrounding whitespace is stripped before the prefix is tested.

    Args:
        nombre: Sample name (string).

    Returns:
        "CTRL", "MSA", or None when neither prefix matches.
    """
    limpio = nombre.strip()
    if limpio.startswith("CTRL"):
        return "CTRL"
    return "MSA" if limpio.startswith("MSA") else None

# Label every row with the group derived from its index name.
df["Categoria"] = df.index.to_series().map(extraer_categoria)

# Warn when at least one row name matched none of the known prefixes.
if df["Categoria"].isna().any():
    print("Algunos nombres de fila no pudieron clasificarse en una categoría.")

# Target is the label column; features are every other column.
y = df["Categoria"]
X = df.drop(columns="Categoria")

# 70/30 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123, stratify=y
)

# Show which rows ended up in each split.
print("Índices del conjunto de entrenamiento:")
print(X_train.index.tolist())
print("\nÍndices del conjunto de prueba:")
print(X_test.index.tolist())

# Transpose each split back and save it, so the split rows become columns.
X_train = X_train.transpose()
X_train.to_csv("../data/MSApfc/MSApfcMyNorm_train.csv")

X_test = X_test.transpose()
X_test.to_csv("../data/MSApfc/MSApfcMyNorm_test.csv")

# Sample sheet: write one sheet per split, with its rows in the same order as
# the columns of the corresponding matrix.

df_sheet = pd.read_csv("../data/MSApfcSamplesheet.csv")

train_indices = X_train.columns.tolist()
test_indices = X_test.columns.tolist()

# Keep only the sheet rows that belong to the split, then order them like the
# matrix columns. Reindexing against an Index object makes the result adopt
# that Index's name instead of keeping "Sample_Name", so reindex against the
# plain list and pin the axis name explicitly; reset_index() then emits a
# "Sample_Name" column rather than a generic "index" column.
df_sheet_train = (
    df_sheet[df_sheet["Sample_Name"].isin(train_indices)]
    .set_index("Sample_Name")
    .reindex(train_indices)
    .rename_axis("Sample_Name")
    .reset_index()
)

df_sheet_test = (
    df_sheet[df_sheet["Sample_Name"].isin(test_indices)]
    .set_index("Sample_Name")
    .reindex(test_indices)
    .rename_axis("Sample_Name")
    .reset_index()
)

df_sheet_train.to_csv("../data/MSApfcSamplesheet_train.csv", index=False)
df_sheet_test.to_csv("../data/MSApfcSamplesheet_test.csv", index=False)

**Sao dataset**

In [None]:
# Load the MsaPdPsp matrix (first CSV column becomes the row index) and
# transpose it so the original columns become rows.
df = pd.read_csv(
    "../data/MsaPdPsp/MsaPdPspMyNormBMIQExOutliersFinal.csv", index_col=0
).transpose()

# Drop a row labelled "Unnamed: 0" if one is present after the transpose.
if df.index.isin(["Unnamed: 0"]).any():
    df = df.drop(index="Unnamed: 0")

def extraer_categoria(nombre):
    """Return the group label encoded as the prefix of a sample name.

    Args:
        nombre: Sample name (string).

    Returns:
        The first of "CTRL", "MSA", "PD", "PSP" that the name starts with, or
        None when none of them matches.
    """
    coincidencias = [
        prefijo
        for prefijo in ("CTRL", "MSA", "PD", "PSP")
        if nombre.startswith(prefijo)
    ]
    return coincidencias[0] if coincidencias else None

# Label every row with the group derived from its index name.
df["Categoria"] = df.index.to_series().map(extraer_categoria)

# Warn when at least one row name matched none of the known prefixes.
if df["Categoria"].isna().any():
    print("Algunos nombres de fila no pudieron clasificarse en una categoría.")

# Target is the label column; features are every other column.
y = df["Categoria"]
X = df.drop(columns="Categoria")

# 70/30 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123, stratify=y
)

# Show which rows ended up in each split.
print("Índices del conjunto de entrenamiento:")
print(X_train.index.tolist())
print("\nÍndices del conjunto de prueba:")
print(X_test.index.tolist())

# Transpose each split back and save it, so the split rows become columns.
X_train = X_train.transpose()
X_train.to_csv("../data/MsaPdPsp/MsaPdPspMyNormBMIQExOutliersFinal_train.csv")

X_test = X_test.transpose()
X_test.to_csv("../data/MsaPdPsp/MsaPdPspMyNormBMIQExOutliersFinal_test.csv")

# Sample sheet: write one sheet per split, with its rows in the same order as
# the columns of the corresponding matrix.

df_sheet = pd.read_csv("../data/MsaPdPspSamplesheet.csv")

train_indices = X_train.columns.tolist()
test_indices = X_test.columns.tolist()

# Keep only the sheet rows that belong to the split, then order them like the
# matrix columns. Reindexing against an Index object makes the result adopt
# that Index's name instead of keeping "Sample_Name", so reindex against the
# plain list and pin the axis name explicitly; reset_index() then emits a
# "Sample_Name" column rather than a generic "index" column.
df_sheet_train = (
    df_sheet[df_sheet["Sample_Name"].isin(train_indices)]
    .set_index("Sample_Name")
    .reindex(train_indices)
    .rename_axis("Sample_Name")
    .reset_index()
)

df_sheet_test = (
    df_sheet[df_sheet["Sample_Name"].isin(test_indices)]
    .set_index("Sample_Name")
    .reindex(test_indices)
    .rename_axis("Sample_Name")
    .reset_index()
)

# Write next to the input sheet: the original used "./data/", which does not
# match the "../data/" directory the sheet was read from.
df_sheet_train.to_csv("../data/MsaPdPspSamplesheet_train.csv", index=False)
df_sheet_test.to_csv("../data/MsaPdPspSamplesheet_test.csv", index=False)