In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

# MDN

In [2]:
# Load the MDN correspondence table (sheet "Summary") and keep only the
# identifier / annotation columns used further down.
# Raw string: "\d" and "\c" are not valid escape sequences, so the plain
# literal only worked by accident and raises a SyntaxWarning on Python 3.12+.
corres_mdn = pd.read_excel(r"..\dataset\corres_MDN.xlsx", sheet_name="Summary")
corres_mdn = (
    corres_mdn[
        [
            "patient_ID2",
            "sample_ID",
            "ID_Nucleic_Acid",
            "ID_scan",
            "histological_aspect",
            "sous_type_visuel",
            "stroma_visual_aspect",
        ]
    ]
    # Rows without a nucleic-acid identifier cannot be merged on that key later.
    .dropna(subset=["ID_Nucleic_Acid"])
    .rename(columns={"patient_ID2": "patient_ID"})
)
# regex=False: ".svs" has to be removed literally. Before pandas 2.0 the
# pattern was interpreted as a regular expression by default, in which case
# the dot matches any character.
corres_mdn["ID_scan"] = corres_mdn["ID_scan"].str.replace(".svs", "", regex=False)

In [3]:
# Build a table with one row per *.svs file found in the MDN folder.
mdn_path = Path(r"C:\Users\inserm\Documents\histo_sign\dataset\MDN")
mdn_paths = [svs_file for svs_file in mdn_path.glob("*.svs")]
mdn_df = pd.DataFrame(mdn_paths, columns=["path_svs"])
# The scan identifier is the file name without its extension.
mdn_df["ID_scan"] = mdn_df["path_svs"].map(lambda svs_file: svs_file.stem)


def get_xml(path):
    """Return the ``.xml`` file sitting next to ``path``, or ``None``.

    The candidate has the same parent directory and the same stem as
    ``path`` with an ``.xml`` extension. It is returned only if it exists
    on disk; otherwise the result is ``None``.
    """
    candidate = path.parent / f"{path.stem}.xml"
    return candidate if candidate.exists() else None


# Pair every scan with its sibling .xml file (None when there is none).
mdn_df["path_xml"] = mdn_df["path_svs"].map(get_xml)
mdn_df

Unnamed: 0,path_svs,ID_scan,path_xml
0,C:\Users\inserm\Documents\histo_sign\dataset\M...,12AG00001-14_MDNF01_HES,C:\Users\inserm\Documents\histo_sign\dataset\M...
1,C:\Users\inserm\Documents\histo_sign\dataset\M...,12AG00001-17_MDNF01_HES,C:\Users\inserm\Documents\histo_sign\dataset\M...
2,C:\Users\inserm\Documents\histo_sign\dataset\M...,12AG000483-19_MDNF01_HES,C:\Users\inserm\Documents\histo_sign\dataset\M...
3,C:\Users\inserm\Documents\histo_sign\dataset\M...,12AG000483-26_MDNF01_HES,C:\Users\inserm\Documents\histo_sign\dataset\M...
4,C:\Users\inserm\Documents\histo_sign\dataset\M...,12AG00255-17_MDNF01_HES,C:\Users\inserm\Documents\histo_sign\dataset\M...
...,...,...,...
257,C:\Users\inserm\Documents\histo_sign\dataset\M...,551514-10_MDNF01_HES,C:\Users\inserm\Documents\histo_sign\dataset\M...
258,C:\Users\inserm\Documents\histo_sign\dataset\M...,551743-14_MDNF01_HES,C:\Users\inserm\Documents\histo_sign\dataset\M...
259,C:\Users\inserm\Documents\histo_sign\dataset\M...,551743-15_MDNF01_HES,C:\Users\inserm\Documents\histo_sign\dataset\M...
260,C:\Users\inserm\Documents\histo_sign\dataset\M...,552138-25_MDNF01_HES,C:\Users\inserm\Documents\histo_sign\dataset\M...


In [4]:
# Inner join on ID_scan: only rows whose scan file was found on disk are kept.
corres_mdn = corres_mdn.merge(mdn_df, on="ID_scan", how="inner")

In [5]:
# Sanity check after the merge: total rows versus the number of distinct
# samples, patients and scans.
print(
    f" {corres_mdn.shape[0]} rows, {corres_mdn.sample_ID.nunique()} samples, {corres_mdn.patient_ID.nunique()} patients, {corres_mdn.ID_scan.nunique()} scans"
)

 405 rows, 405 samples, 95 patients, 238 scans


In [6]:
# Load the MDN signature table (normUQ file).
# Raw string: "\d" and "\s" are invalid escape sequences in a plain literal
# (SyntaxWarning on Python 3.12+). The separator stays a real tab character.
sign_mdn_uq = pd.read_csv(r"..\dataset\signature_pacpaintMDN_normUQ.tsv", sep="\t")
# Copy the row labels into an ID_Nucleic_Acid column so they can serve as the
# merge key, then fall back to a plain positional index.
# NOTE(review): this presumes the TSV row labels are nucleic-acid IDs - confirm.
sign_mdn_uq["ID_Nucleic_Acid"] = sign_mdn_uq.index
sign_mdn_uq = sign_mdn_uq.reset_index(drop=True)

In [7]:
# Attach the UQ signature columns to the annotated scans. The inner join keeps
# only rows whose ID_Nucleic_Acid is present in both tables.
mdn_summary_uq = pd.merge(corres_mdn, sign_mdn_uq, on="ID_Nucleic_Acid", how="inner")
# Raw string: "\d" and "\m" are invalid escape sequences in a plain literal.
mdn_summary_uq.to_csv(r"..\dataset\mdn_summary_uq.csv", index=False)

In [8]:
# Load the MDN signature table (normVST file) and the companion "basalB" file.
# Raw strings: "\d" and "\s" are invalid escape sequences in a plain literal
# (SyntaxWarning on Python 3.12+). The separator stays a real tab character.
sign_mdn_vst = pd.read_csv(r"..\dataset\signature_pacpaintMDN_normVST.tsv", sep="\t")
sign_mdn_vst_2 = pd.read_csv(r"..\dataset\signature_pacpaintMDN_normVST_basalB.tsv", sep="\t")
# Overwrite the PDAC_CSY20_BasallikeB column with the values of the second file.
# DataFrame.update works in place, aligns on the index, uses only the non-NA
# values of the new column and never adds a column: if sign_mdn_vst has no
# PDAC_CSY20_BasallikeB column this line silently does nothing.
sign_mdn_vst.update(sign_mdn_vst_2["PDAC_CSY20_BasallikeB"])
# Copy the row labels into an ID_Nucleic_Acid column to use as the merge key,
# then fall back to a plain positional index.
sign_mdn_vst["ID_Nucleic_Acid"] = sign_mdn_vst.index
sign_mdn_vst = sign_mdn_vst.reset_index(drop=True)

In [9]:
# Attach the VST signature columns to the annotated scans. The inner join keeps
# only rows whose ID_Nucleic_Acid is present in both tables.
mdn_summary_vst = pd.merge(corres_mdn, sign_mdn_vst, on="ID_Nucleic_Acid", how="inner")
# Raw string: "\d" and "\m" are invalid escape sequences in a plain literal.
mdn_summary_vst.to_csv(r"..\dataset\mdn_summary_vst.csv", index=False)

In [10]:
# Write the column names of the signature table to a text file, one per line.
# NOTE(review): the variable is called *_uq but is built from the VST table -
# confirm that both tables are expected to share the same columns.
col_mdn_uq = sign_mdn_vst.columns.to_numpy()
np.savetxt(
    Path(r"C:\Users\inserm\Documents\histo_sign\dataset\col_names.txt"),
    col_mdn_uq,
    encoding="utf-8",
    fmt="%s",
)

In [11]:
# Display the merged MDN / UQ summary table for visual inspection.
mdn_summary_uq

Unnamed: 0,patient_ID,sample_ID,ID_Nucleic_Acid,ID_scan,histological_aspect,sous_type_visuel,stroma_visual_aspect,path_svs,path_xml,IMMU_Tcellatlas_CD4.CD8_c4_Tstr,...,PDAC_CSY20_Sig9,PDAC_CSY20_Sig12,Exocrine,Endocrine,Classic,StromaActiv,Basal,StromaActivInflam,Immune,StromaInactive
0,BPDAC_001,BPDAC_001_18_L1,BPDAC_001_18_L1_S21,543933-18_MDNF01_HES,gland_forming,classic,active,C:\Users\inserm\Documents\histo_sign\dataset\M...,C:\Users\inserm\Documents\histo_sign\dataset\M...,-0.251528,...,-0.267548,-0.234046,1.651200,1.246714,0.795403,-0.627427,-0.114389,-0.807231,-0.260096,-0.085382
1,BPDAC_001,BPDAC_001_23_L1,BPDAC_001_23_L1_S22,543933-23_MDNF01_HES,gland_forming,classic,inactive,C:\Users\inserm\Documents\histo_sign\dataset\M...,C:\Users\inserm\Documents\histo_sign\dataset\M...,-0.197341,...,-0.384039,-0.387991,1.312321,0.329441,-0.953591,0.811548,0.141361,-0.832222,0.120554,0.493352
2,BPDAC_001,BPDAC_001_24_L1,BPDAC_001_24_L1_S23,543933-24_MDNF01_HES,gland_forming,classic,inactive,C:\Users\inserm\Documents\histo_sign\dataset\M...,C:\Users\inserm\Documents\histo_sign\dataset\M...,-0.227345,...,-0.240596,-0.254863,0.176609,1.295504,1.066609,-0.995978,-0.332557,-0.657534,-0.914294,-0.978189
3,BPDAC_001,BPDAC_001_24_L2,BPDAC_001_24_L2_S24,543933-24_MDNF01_HES,non_gland_forming,basal,inactive,C:\Users\inserm\Documents\histo_sign\dataset\M...,C:\Users\inserm\Documents\histo_sign\dataset\M...,0.029996,...,-0.393131,-0.407793,1.397862,-0.098630,-0.919093,-1.913689,-1.401714,-0.477149,-1.437690,0.641191
4,BPDAC_002,BPDAC_002_14_L1,BPDAC_002_14_L1_S26,544085-14_MDNF01_HES,gland_forming,classic,active,C:\Users\inserm\Documents\histo_sign\dataset\M...,C:\Users\inserm\Documents\histo_sign\dataset\M...,-0.208567,...,-0.286690,-0.396827,6.645748,0.891678,-0.079652,-1.090691,-2.615928,-0.079042,-2.268747,0.153177
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400,BPDAC_100,BPDAC_100_23_L1,X0423_016,14AG03361-23_MDNF01_HES,,,,C:\Users\inserm\Documents\histo_sign\dataset\M...,C:\Users\inserm\Documents\histo_sign\dataset\M...,-0.231280,...,0.213328,0.562602,-0.439336,0.169651,-1.216667,-2.471169,0.376327,-1.653971,0.952652,-1.131990
401,BPDAC_101,BPDAC_101_16_L1,X0423_020,14AG03394-16_MDNF01_HES,non_gland_forming,,,C:\Users\inserm\Documents\histo_sign\dataset\M...,C:\Users\inserm\Documents\histo_sign\dataset\M...,-0.239232,...,-0.378726,-0.393438,-0.850817,-0.625973,-2.339338,0.570531,1.572889,0.656808,-0.566951,-1.666654
402,BPDAC_101,BPDAC_101_21_L1,X0423_021,14AG03394-21_MDNF01_HES,non_gland_forming,,,C:\Users\inserm\Documents\histo_sign\dataset\M...,C:\Users\inserm\Documents\histo_sign\dataset\M...,-0.253645,...,-0.393514,-0.287920,-0.195845,-0.278156,-1.256822,1.297781,1.113797,-0.176676,-0.423811,-0.177933
403,BPDAC_101,BPDAC_101_21_L2,X0423_022,14AG03394-21_MDNF01_HES,non_gland_forming,,,C:\Users\inserm\Documents\histo_sign\dataset\M...,C:\Users\inserm\Documents\histo_sign\dataset\M...,-0.196312,...,0.065607,-0.233023,-1.230810,-0.588177,-2.731031,1.134563,1.992289,0.407219,-0.800610,-1.504987


# Prodige 24

In [12]:
# Load the Prodige 24 correspondence table and keep the scan / nucleic-acid pair.
# Raw string: "\d" and "\c" are invalid escape sequences in a plain literal
# (SyntaxWarning on Python 3.12+).
corres_p24 = pd.read_excel(r"..\dataset\corres_p24.xlsx", sheet_name="Correspondance_Table")
corres_p24 = corres_p24[["Scan_HES", "ID_Nucleic_Acid"]]
# Drop the rows flagged as having no scan or no RNA. The explicit copy avoids
# assigning new columns on a filtered view (SettingWithCopyWarning).
corres_p24 = corres_p24[
    (corres_p24["Scan_HES"] != "no_scan") & (corres_p24["ID_Nucleic_Acid"] != "no_rna")
].copy()
# patient_ID is whatever precedes the first "-" (and then the first "_") of
# the scan name. The vectorised .str form propagates missing values instead of
# raising on them.
corres_p24["patient_ID"] = (
    corres_p24["Scan_HES"].str.split("-").str[0].str.split("_").str[0]
)
# This scan name has a "-" inside the patient part, so the rule above would
# yield "BJN"; set the identifier by hand.
corres_p24.loc[corres_p24["Scan_HES"] == "BJN-165_HES", "patient_ID"] = "BJN165"

In [13]:
# Build a table with one row per *.svs file found in the PRODIGE_24 folder.
p24_path = Path(r"C:\Users\inserm\Documents\histo_sign\dataset\PRODIGE_24")
p24_paths = [svs_file for svs_file in p24_path.glob("*.svs")]
p24_df = pd.DataFrame(p24_paths, columns=["path_svs"])
# ID_scan is the file name without its extension; merge_id is the part of
# that name before the first underscore.
p24_df["ID_scan"] = p24_df["path_svs"].map(lambda svs_file: svs_file.stem)
p24_df["merge_id"] = p24_df["ID_scan"].map(lambda scan_id: scan_id.split("_")[0])

In [14]:
# Match each correspondence row to the scan file(s) whose merge_id equals its
# Scan_HES value, then discard the helper key.
corres_p24 = corres_p24.merge(
    p24_df, left_on="Scan_HES", right_on="merge_id", how="inner"
).drop(columns=["merge_id"])
# Scan_HES now takes the scan identifier, with any ".svs" substring removed.
corres_p24["Scan_HES"] = corres_p24["ID_scan"].map(lambda scan_id: scan_id.replace(".svs", ""))

In [15]:
# Display the merged Prodige 24 correspondence table for visual inspection.
corres_p24

Unnamed: 0,Scan_HES,ID_Nucleic_Acid,patient_ID,path_svs,ID_scan
0,148405-D_HES,MOSA226R1,148405,C:\Users\inserm\Documents\histo_sign\dataset\P...,148405-D_HES
1,148405-U_HES,MOSA226R1,148405,C:\Users\inserm\Documents\histo_sign\dataset\P...,148405-U_HES
2,155267-B6_HES,MOSA227R1,155267,C:\Users\inserm\Documents\histo_sign\dataset\P...,155267-B6_HES
3,155318-4L_HES,MOSA228R1,155318,C:\Users\inserm\Documents\histo_sign\dataset\P...,155318-4L_HES
4,156185-D1_HES,MOSA229R1,156185,C:\Users\inserm\Documents\histo_sign\dataset\P...,156185-D1_HES
...,...,...,...,...,...
354,15AG09870-29_HES,MOSA351R1,15AG09870,C:\Users\inserm\Documents\histo_sign\dataset\P...,15AG09870-29_HES
355,16AG00518-16_HES,MOSA352R1,16AG00518,C:\Users\inserm\Documents\histo_sign\dataset\P...,16AG00518-16_HES
356,16AG01316-12_HES,MOSA353R1,16AG01316,C:\Users\inserm\Documents\histo_sign\dataset\P...,16AG01316-12_HES
357,16AG03059-23_HES,MOSA355R1,16AG03059,C:\Users\inserm\Documents\histo_sign\dataset\P...,16AG03059-23_HES


In [16]:
# Sanity check after the merge. Note that the "samples" figure printed here is
# the number of distinct ID_Nucleic_Acid values (no sample_ID column exists yet).
print(
    f" {corres_p24.shape[0]} rows, {corres_p24.ID_Nucleic_Acid.nunique()} samples, {corres_p24.patient_ID.nunique()} patients, {corres_p24.ID_scan.nunique()} scans"
)

 359 rows, 349 samples, 355 patients, 359 scans


In [17]:
# The count check above reports as many distinct scans as rows, so the scan
# identifier is reused as the per-row sample_ID.
corres_p24["sample_ID"] = corres_p24.ID_scan

In [18]:
# Load the Prodige 24 signature table (normUQ file).
# Raw string: "\d" and "\s" are invalid escape sequences in a plain literal
# (SyntaxWarning on Python 3.12+). The separator stays a real tab character.
sign_p24_uq = pd.read_csv(r"..\dataset\signature_pacpaintP24_normUQ.tsv", sep="\t")
# Copy the row labels into an ID_Nucleic_Acid column so they can serve as the
# merge key, then fall back to a plain positional index.
# NOTE(review): this presumes the TSV row labels are nucleic-acid IDs - confirm.
sign_p24_uq["ID_Nucleic_Acid"] = sign_p24_uq.index
sign_p24_uq = sign_p24_uq.reset_index(drop=True)

In [19]:
# Attach the UQ signature columns to the Prodige 24 scans. The inner join keeps
# only rows whose ID_Nucleic_Acid is present in both tables.
p24_summary_uq = pd.merge(corres_p24, sign_p24_uq, on="ID_Nucleic_Acid", how="inner")
# Raw string: "\d" and "\p" are invalid escape sequences in a plain literal.
p24_summary_uq.to_csv(r"..\dataset\p24_summary_uq.csv", index=False)

In [20]:
# Load the Prodige 24 signature table (normVST file) and the companion
# "basalB" file.
# Raw strings: "\d" and "\s" are invalid escape sequences in a plain literal
# (SyntaxWarning on Python 3.12+). The separator stays a real tab character.
sign_p24_vst = pd.read_csv(r"..\dataset\signature_pacpaintP24_normVST.tsv", sep="\t")
sign_p24_vst_2 = pd.read_csv(r"..\dataset\signature_pacpaintP24_normVST_basalB.tsv", sep="\t")
# Overwrite the PDAC_CSY20_BasallikeB column with the values of the second file.
# DataFrame.update works in place, aligns on the index, uses only the non-NA
# values of the new column and never adds a column: if sign_p24_vst has no
# PDAC_CSY20_BasallikeB column this line silently does nothing.
sign_p24_vst.update(sign_p24_vst_2["PDAC_CSY20_BasallikeB"])
# Copy the row labels into an ID_Nucleic_Acid column to use as the merge key,
# then fall back to a plain positional index.
sign_p24_vst["ID_Nucleic_Acid"] = sign_p24_vst.index
sign_p24_vst = sign_p24_vst.reset_index(drop=True)

In [21]:
# Attach the VST signature columns to the Prodige 24 scans. The inner join
# keeps only rows whose ID_Nucleic_Acid is present in both tables.
p24_summary_vst = pd.merge(corres_p24, sign_p24_vst, on="ID_Nucleic_Acid", how="inner")
# Raw string: "\d" and "\p" are invalid escape sequences in a plain literal.
p24_summary_vst.to_csv(r"..\dataset\p24_summary_vst.csv", index=False)

# PANC multicentrique

In [22]:
# Load the PANC summary table and turn the stored path strings back into Path
# objects so that their existence can be checked.
panc_summary = pd.read_csv(r"C:\Users\inserm\Documents\histo_sign\dataset\panc_summary_vst.csv")
panc_summary["path_svs"] = panc_summary["path_svs"].apply(Path)
panc_summary["path_xml"] = panc_summary["path_xml"].apply(Path)
assert panc_summary["path_svs"].apply(lambda p: p.exists()).all(), "some path_svs files are missing"
assert panc_summary["path_xml"].apply(lambda p: p.exists()).all(), "some path_xml files are missing"

# sample_ID is the scan file name without its extension; rows are ordered by it.
panc_summary["sample_ID"] = panc_summary["path_svs"].apply(lambda p: p.stem)
panc_summary = panc_summary.sort_values("sample_ID").reset_index(drop=True)
# patient_ID is whatever precedes the first "-" (and then the first "_") of sample_ID.
panc_summary["patient_ID"] = panc_summary["sample_ID"].apply(
    lambda s: s.split("-")[0].split("_")[0]
)

# Put the two identifier columns first. They are selected by name rather than
# by position: this notebook later writes the table back to the very file read
# above, so on a re-run the identifiers are already the leading columns and
# "move the last two columns to the front" would shift two unrelated columns
# instead, rotating the layout on every execution.
id_cols = ["sample_ID", "patient_ID"]
col_names = id_cols + [col for col in panc_summary.columns if col not in id_cols]
panc_summary = panc_summary[col_names]

In [23]:
# Sanity check: total rows versus the number of distinct samples and patients.
print(
    f" {panc_summary.shape[0]} rows, {panc_summary.sample_ID.nunique()} samples, {panc_summary.patient_ID.nunique()} patients"
)

 732 rows, 732 samples, 354 patients


In [24]:
# Write the table back to disk. This overwrites the same CSV file that was
# read at the start of this section, so the input is modified by every run.
panc_summary.to_csv(r"C:\Users\inserm\Documents\histo_sign\dataset\panc_summary_vst.csv", index=False)

In [25]:
# Keep only the rows whose cohort is "DISC" and save them to their own CSV.
is_disc = panc_summary["cohort"] == "DISC"
disc_summary = panc_summary.loc[is_disc]
disc_summary.to_csv(
    r"C:\Users\inserm\Documents\histo_sign\dataset\disc_summary_vst.csv",
    index=False,
)