In [1]:
import numpy as np
import pandas as pd
import scanpy as sc

---
### Read in key dataframe with sample information

In [15]:
# Sample key table: the first CSV column becomes the index, which the loops
# below iterate over and use to build "datasets/{}_master.h5ad" file names.
sample_key = pd.read_csv(
    filepath_or_buffer="../../resources/visium_sample_key.csv",
    index_col=0,
)

In [16]:
# Preview the first rows of the sample key table.
sample_key.head()

Unnamed: 0,sample_key,sample_key_short,block_name,patient_name,tumor_type,tumor_loc,gender,ethnicity,tumor_grade,tumor_stage,...,ws_flip,CNV_group,TMA,cooccurrence_dist,CIN Status,Polyclonal,Evolution,Immune Excluded,Vital Status,topup
8578_4_SR00001,8578_AS_4,8578_4,SR00001,SR00001,NL,,,,NL,NL,...,,grouped,,,CIN+,F,,F,,
7319_4_HTA11_01938,7319_AS_4,7319_4,HTA11_01938,HTA11_01938,TA/TVA,Cecum,F,W,G1,AD,...,,grouped,,,HM,F,,F,,
8270_1_HTA11_07862,8270_AS_1,8270_1,HTA11_07862,HTA11_07862,TA/TVA,Sigmoid,M,B,G1,AD,...,,HTA11_07862,,,CIN+,F,,F,,
8270_2_HTA11_10711,8270_AS_2,8270_2,HTA11_10711,HTA11_10711,TA/TVA,Cecum,F,W,G1,AD,...,,HTA11_10711,,,CIN+,F,,F,,
6723_4_WD86055,6723_KL_4,6723_4,WD86055,PAT71397,TA/TVA,Cecum,M,B,G1,AD,...,F,PAT71397,,,CIN+,F,,F,,


In [4]:
# markers in COLON MAP PCA data
# NOTE: this panel and `eliotmarkers` spell the CD4 channel "CD4_", whereas
# `channels_26sep22` uses "CD4"; both spellings are kept as distinct names.
allmarkers = [
    "ACTININ", "BCATENIN", "CD3D", "CD4_", "CD8",
    "CD11B", "CD20", "CD45", "CD68", "CGA",
    "COLLAGEN", "COX2", "DAPI", "ERBB2", "FOXP3",
    "GAMMAACTIN", "HLAA", "LYSOZYME", "MUC2", "NAKATPASE",
    "OLFM4", "SOX9", "PANCK", "PCNA", "PEGFR",
    "PSTAT3", "SMA", "SNA", "VIMENTIN",
]

# second marker panel (29 names)
eliotmarkers = [
    "VIMENTIN", "SOX9", "SMA", "PSTAT3", "PEGFR",
    "PCNA", "PANCK", "OLFM4", "NAKATPASE", "MUC5AC",
    "MUC2", "LYSOZYME", "HLAA", "GAMMAACTIN", "FOXP3",
    "ERBB2", "COLLAGEN", "CGA", "CDX2", "CD68",
    "CD45", "CD20", "CD11B", "CD8", "CD4_",
    "CD3D", "BCATENIN", "AQP5", "COX2",
]

# third channel list (30 names)
channels_26sep22 = [
    "CEACAM5", "DPEP1", "CD27", "AQP5", "VIMENTIN",
    "SOX9", "SMA", "PSTAT3", "PEGFR", "PCNA",
    "PANCK", "OLFM4", "NAKATPASE", "MUC5AC", "MUC2",
    "LYSOZYME", "HLAA", "GAMMAACTIN", "FOXP3", "COLLAGEN",
    "CGA", "CDX2", "CD68", "CD45", "CD20",
    "CD11B", "CD8", "CD4", "CD3D", "BCATENIN",
]

In [5]:
# Union of every marker name that appears in any of the three panels.
# Sorted so the list order (and therefore any column order derived from it)
# is identical on every kernel restart; iterating a raw set of strings is not
# reproducible across Python processes because of string hash randomization.
MxIF_cols = sorted(set(channels_26sep22) | set(allmarkers) | set(eliotmarkers))

In [6]:
# Number of distinct marker names collected in MxIF_cols.
len(MxIF_cols)

36

---

In [None]:
# Build one "master" AnnData per top-up sample (sample_key.topup == "T"):
# trim .obs/.var, attach sample-level metadata, then left-merge optional
# per-spot tables (pathology annotations, refNMF usages + MILWRM domains,
# MxIF intensities, LCM ROI masks) and write datasets/<sample>_master.h5ad.

# .obs column name -> sample_key column whose value is broadcast to every spot
obs_metadata_cols = {
    "Sample": "sample_key_short",
    "Patient": "patient_name",
    "Block ID": "block_name",
    "Tumor Type": "tumor_type",
    "Tumor Location": "tumor_loc",
    "Tumor Stage": "tumor_stage",
    "Tumor Grade": "tumor_grade",
    "Gender": "gender",
    "Ethnicity": "ethnicity",
    "Evolution": "Evolution",
}
mxif_milwrm_cols = ["refNMF-MxIF MILWRM Domain", "refNMF-MxIF MILWRM Confidence Score"]

for s in sample_key.loc[sample_key.topup=="T",:].index:
    print("Starting {}:".format(s), end="\n\t")
    a = sc.read(sample_key.loc[s, "trimmed_adata"])  # read in anndata

    # select obs and var columns
    a.obs = a.obs[["in_tissue", "array_row", "array_col"]].copy()
    a.var = a.var[["gene_ids", "feature_types", "genome", "n_cells", "Morans_I", "Morans_I_p_val", "Morans_I_adj_p_val"]].copy()

    # add sample-level metadata
    for obs_col, key_col in obs_metadata_cols.items():
        a.obs[obs_col] = sample_key.loc[s, key_col]
    # for these tumor types the tumor type itself is stored as the "CIN Status"
    tumor_type = sample_key.loc[s, "tumor_type"]
    if tumor_type in ["SSL/HP","TA/TVA","NL"]:
        a.obs["CIN Status"] = tumor_type
    else:
        a.obs["CIN Status"] = sample_key.loc[s, "CIN Status"]

    # get manual pathology annotations (optional; best effort)
    try:
        path = pd.read_csv("datasets/{}/QC/{}_pathology_annotation.csv".format(sample_key.loc[s, "project"], sample_key.loc[s, "sample_key_short"]), index_col=0)
        a.obs = a.obs.merge(path, left_index=True, right_index=True, how="left")
        print("Manual pathology annotations", end=", ")
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C / kernel interrupt
        # still stops the loop instead of being reported as a missing file.
        print("No manual path. annotations found for {}".format(s), end=", ")

    # get VUMC refNMF fractions
    print("VUMC refNMF fractions & MILWRM domains", end=", ")
    tmp = sc.read(sample_key.loc[s, "MILWRM_VUMCrefNMF30_adata"])
    old_cols = list(tmp.obs.columns[tmp.obs.columns.str.startswith("usage_")])
    # "usage_<k>" -> "<k>_VUMCrefNMF30" (text after the last underscore + suffix)
    new_cols = [x.split("_")[-1] + "_VUMCrefNMF30" for x in old_cols]
    tmp.obs = tmp.obs.rename(columns=dict(zip(old_cols, new_cols)))
    a.obs = a.obs.merge(tmp.obs[new_cols + ["MILWRM Domain", "MILWRM Confidence Score"]], left_index=True, right_index=True, how="left")

    # get MxIF MILWRM (optional; best effort)
    try:
        # add marker intensities and MILWRM domains; spots without an MxIF row
        # are kept (left merge) rather than filtered out
        obs = pd.read_csv(sample_key.loc[s, "MILWRM_VUMCrefNMF30_MxIF_obs"], index_col=0)
        # add _MxIF label to protein stain markers in .obs
        obs = obs.rename(columns={m: m + "_MxIF" for m in MxIF_cols})
        # keep MxIF_cols order so the resulting column order is deterministic
        new_cols = [m + "_MxIF" for m in MxIF_cols if m + "_MxIF" in obs.columns]
        a.obs = a.obs.merge(
            obs[new_cols + mxif_milwrm_cols],
            left_index=True,
            right_index=True,
            how="left",
        )
        print("MxIF + refNMF MILWRM domains", end=", ")
    except Exception:
        print("No MxIF MILWRM detected for {}".format(s), end=", ")

    # get LCM masks (optional; best effort)
    try:
        lcm_path = "../exome/resources/{}_ROIs_{}.csv".format(sample_key.loc[s, "block_name"], sample_key.loc[s, "sample_key_short"])
        mask = pd.read_csv(lcm_path, index_col="Barcode")
        print("Read LCM masks from {}".format(lcm_path))
        # merge LCM masks with .obs
        # NOTE(review): unlike the other merges this one is an inner join; if the
        # mask lacks some barcodes the assignment to a.obs fails and is reported
        # below as "No LCM mask(s) found" -- confirm whether how="left" is wanted.
        a.obs = a.obs.merge(mask, left_index=True, right_index=True)
        a.obs["LCM_ROI"] = a.obs["LCM_ROI"].astype(str)  # coerce to string for subsetting later
    except Exception:
        print("No LCM mask(s) found for {}".format(s))

    # save to master anndata object
    print("\tSaving to datasets/{}_master.h5ad".format(s), end="\n\n")
    a.write("datasets/{}_master.h5ad".format(s), compression="gzip")

---
## Re-map `LCM_ROI` values for PAT71397

In [None]:
# Block-specific LCM ROI labels -> patient-level ROI numbering for PAT71397
# (two blocks, WD86055 and WD86056, are renumbered into ROI1..ROI8).
mapper = {
    "WD86055_ROI4":"ROI1",
    "WD86055_ROI2":"ROI2",
    "WD86055_ROI3":"ROI3",
    "WD86055_ROI1":"ROI4",
    "WD86056_ROI2":"ROI5",
    "WD86056_ROI1":"ROI6",
    "WD86056_ROI4":"ROI7",
    "WD86056_ROI3":"ROI8",
}

for s in sample_key.loc[sample_key.patient_name == "PAT71397", :].index:
    print("Starting {}:".format(s), end="\n\t")
    a = sc.read("datasets/{}_master.h5ad".format(s))

    try:
        print(a.obs.LCM_ROI.value_counts())
        # Assign the result back instead of calling replace(..., inplace=True) on
        # the Series returned by attribute access: that chained in-place call acts
        # on a temporary and is not guaranteed to update a.obs (pandas
        # Copy-on-Write). Cast to str first so the replacement does not depend on
        # how a categorical column handles values outside its categories.
        a.obs["LCM_ROI"] = a.obs["LCM_ROI"].astype(str).replace(mapper)
        print(a.obs.LCM_ROI.value_counts())
        # save to master anndata object
        print("\tSaving to datasets/{}_master.h5ad".format(s), end="\n\n")
        a.write("datasets/{}_master.h5ad".format(s), compression="gzip")
    except Exception:
        # best effort (e.g. sample has no LCM_ROI column); `except Exception`
        # rather than a bare except so a kernel interrupt is not swallowed
        print("Error on {}!!!".format(s))

---
## Re-map `LCM_ROI` values for the rest of the patients

In [None]:
# For every sample of project 7003_AS, strip the "<block>_" prefix from
# LCM_ROI (e.g. "<block>_ROI2" -> "ROI2") and overwrite the master file.
for s in sample_key.loc[sample_key.project.isin(["7003_AS"]), :].index:
    print("Starting {}:".format(s), end="\n\t")
    a = sc.read("datasets/{}_master.h5ad".format(s))

    try:
        print(a.obs.LCM_ROI.value_counts())
        roi = a.obs["LCM_ROI"].astype(str)
        # Only rewrite labels that still carry a prefix. Values without "_"
        # (already-remapped labels, or a re-run of this cell) are left untouched;
        # an unconditional .str[1] would turn them into NaN.
        has_prefix = roi.str.contains("_", regex=False)
        a.obs["LCM_ROI"] = roi.where(~has_prefix, roi.str.split("_").str[1])
        print(a.obs.LCM_ROI.value_counts())
        # save to master anndata object
        print("\tSaving to datasets/{}_master.h5ad".format(s), end="\n\n")
        # The original format string here was "datasets/}_master.h5ad", which
        # raises ValueError inside str.format; the bare except then reported
        # "Error on ..." and the file was never written.
        a.write("datasets/{}_master.h5ad".format(s), compression="gzip")
    except Exception:
        # best effort (e.g. sample has no LCM_ROI column); `except Exception`
        # rather than a bare except so a kernel interrupt is not swallowed
        print("Error on {}!!!".format(s))

---
## Read in new MILWRM data for all patients and add to master AnnData

In [None]:
# For every sample: reload its master AnnData, remove previously merged
# refNMF / MxIF / MILWRM columns, merge in the current versions, and save.
milwrm_cols = ["MILWRM Domain", "MILWRM Confidence Score"]
mxif_milwrm_cols = ["refNMF-MxIF MILWRM Domain", "refNMF-MxIF MILWRM Confidence Score"]

for s in sample_key.index:
    print("Starting {}:".format(s), end="\n\t")
    a = sc.read("datasets/{}_master.h5ad".format(s))

    # drop repeated column names, keeping the first occurrence
    # (the original `a.obs.drop(columns=...)` discarded its result, i.e. did nothing)
    a.obs = a.obs.loc[:, ~a.obs.columns.duplicated()].copy()
    # drop refNMF, MxIF, MILWRM columns
    stale_cols = [
        c for c in a.obs.columns
        if str(c).endswith("_VUMCrefNMF30")
        or str(c).endswith("_MxIF")
        or c in milwrm_cols + mxif_milwrm_cols
    ]
    a.obs = a.obs.drop(columns=stale_cols)

    # get VUMC refNMF fractions
    print("VUMC refNMF fractions & MILWRM domains", end=", ")
    tmp = sc.read(sample_key.loc[s, "MILWRM_VUMCrefNMF30_adata"])
    old_cols = list(tmp.obs.columns[tmp.obs.columns.str.startswith("usage_")])
    # "usage_<k>" -> "<k>_VUMCrefNMF30" (text after the last underscore + suffix)
    new_cols = [x.split("_")[-1] + "_VUMCrefNMF30" for x in old_cols]
    tmp.obs = tmp.obs.rename(columns=dict(zip(old_cols, new_cols)))
    a.obs = a.obs.merge(tmp.obs[new_cols + milwrm_cols], left_index=True, right_index=True, how="left")

    # get MxIF MILWRM (optional; best effort)
    try:
        # add marker intensities and MILWRM domains; spots without an MxIF row
        # are kept (left merge) rather than filtered out
        obs = pd.read_csv(sample_key.loc[s, "MILWRM_VUMCrefNMF30_MxIF_obs"], index_col=0)
        # add _MxIF label to protein stain markers and use the display names
        # for the MILWRM domain / confidence columns
        rename_map = {m: m + "_MxIF" for m in MxIF_cols}
        rename_map["VUMCrefNMF30_MxIF_MILWRM_domain"] = "refNMF-MxIF MILWRM Domain"
        rename_map["VUMCrefNMF30_MxIF_MILWRM_confidence_score"] = "refNMF-MxIF MILWRM Confidence Score"
        obs = obs.rename(columns=rename_map)
        # keep MxIF_cols order so the resulting column order is deterministic
        present_mxif = [m + "_MxIF" for m in MxIF_cols if m + "_MxIF" in obs.columns]
        a.obs = a.obs.merge(
            obs[present_mxif + mxif_milwrm_cols],
            left_index=True,
            right_index=True,
            how="left",
        )
        print("MxIF + refNMF MILWRM domains", end=", ")
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C / kernel interrupt
        # still stops the loop instead of being reported as missing MxIF data.
        print("No MxIF MILWRM detected for {}".format(s), end=", ")

    # save to master anndata object
    print("\tSaving to datasets/{}_master.h5ad".format(s), end="\n\n")
    a.write("datasets/{}_master.h5ad".format(s), compression="gzip")

---
## Define pseudotime ordering by `LCM_ROI`

In [20]:
# Per-ROI variant counts. Barcodes arrive as "<block>_<ROI>"; they are first
# renumbered via `mapper`, then the block prefix is replaced by the patient
# name from sample_key, and finally split into Patient / ROI columns.
muts = pd.read_csv("../exome_genome/resources/LCM_variants_per_sample.csv", index_col=0)
mapper = {
    "WD86055_ROI4":"WD86055_ROI1",
    "WD86055_ROI2":"WD86055_ROI2",
    "WD86055_ROI3":"WD86055_ROI3",
    "WD86055_ROI1":"WD86055_ROI4",
    "WD86056_ROI2":"WD86055_ROI5",
    "WD86056_ROI1":"WD86055_ROI6",
    "WD86056_ROI4":"WD86055_ROI7",
    "WD86056_ROI3":"WD86055_ROI8",
    "WD33475_ROI4":"WD33475_ROI3",
    "WD33475_ROI5":"WD33475_ROI4",
}
# assign back rather than chained `.replace(..., inplace=True)`, which acts on
# a temporary Series and is not guaranteed to update `muts`
muts["Tumor_Sample_Barcode"] = muts["Tumor_Sample_Barcode"].replace(mapper)

# block_name -> patient_name, taking the first sample_key row for each block.
# Built once instead of scanning sample_key for every row of `muts`, and avoids
# `Series[0]`, whose positional fallback on a string-labelled index is deprecated.
block_to_patient = (
    sample_key.drop_duplicates(subset="block_name")
    .set_index("block_name")["patient_name"]
    .to_dict()
)
# a block missing from sample_key raises KeyError naming that block
muts["Tumor_Sample_Barcode"] = [
    block_to_patient[x.split("_")[0]] + "_" + x.split("_")[1]
    for x in muts["Tumor_Sample_Barcode"]
]
# split on the LAST underscore so a patient name that itself contains "_"
# still yields exactly two columns
muts[["Patient", "ROI"]] = muts["Tumor_Sample_Barcode"].str.rsplit("_", n=1, expand=True)

In [21]:
# Attach tumor_type (MSS / MSI-H rows of sample_key only) to each ROI by
# patient, collapse the repeated rows the join produces, then drop the
# now-redundant join key.
muts = (
    muts.merge(
        sample_key.loc[
            sample_key.tumor_type.isin(["MSS","MSI-H"]),
            ["patient_name","tumor_type"],
        ],
        left_on="Patient",
        right_on="patient_name",
        how="left",
    )
    .drop_duplicates()
    .drop(columns=["patient_name"])
)

In [22]:
# Display the per-ROI variant table.
muts

Unnamed: 0,Tumor_Sample_Barcode,Variants,Patient,ROI,tumor_type
0,PAT15211_ROI2,5303,PAT15211,ROI2,MSS
1,PAT15211_ROI3,4830,PAT15211,ROI3,MSS
2,SG00001_ROI2,2139,SG00001,ROI2,MSI-H
4,PAT00222_ROI1,2111,PAT00222,ROI1,MSI-H
5,PAT00222_ROI3,2075,PAT00222,ROI3,MSI-H
...,...,...,...,...,...
146,PAT06439_ROI2,25,PAT06439,ROI2,MSS
147,SG00001_ROI3,20,SG00001,ROI3,MSI-H
149,PAT40364_ROI5,12,PAT40364,ROI5,MSI-H
150,PAT40364_ROI3,11,PAT40364,ROI3,MSI-H


In [23]:
# Variant counts from the pseudobulk file; the first CSV column is the index.
muts2 = pd.read_csv(
    filepath_or_buffer="../exome_genome/resources/pseudobulk_variants_per_sample.csv",
    index_col=0,
)

In [24]:
# Display the pseudobulk variant table.
muts2

Unnamed: 0,Tumor_Sample_Barcode,Variants
1,PAT15211,6804
2,SG00001,3975
3,SG00002,3270
4,MAP.01938_polyp,2805
5,SG00003,2796
6,PAT00222,2613
7,PAT73458,2431
8,PAT40364,2210
9,PAT01586,1907
10,PAT54273,1470


---

In [25]:
# Annotate each top-up sample's master AnnData with a "Variants" column:
#   * samples with an LCM_ROI column get the per-ROI count from `muts`,
#     matched on (Patient, LCM_ROI);
#   * samples without one get a single count from `muts2`, matched on
#     block_name, when such a row exists.
# Samples without LCM_ROI whose block name starts with "WD" or "S" are not saved.
for s in sample_key.loc[sample_key.topup=="T",:].index:
    a = sc.read("datasets/{}_master.h5ad".format(s))
    print("Read adata from datasets/{}_master.h5ad".format(s))

    has_lcm_roi = "LCM_ROI" in a.obs.columns
    block_name = sample_key.loc[s, "block_name"]

    if has_lcm_roi:
        a.obs["LCM_ROI"] = a.obs["LCM_ROI"].astype(str)

        # add number of detected Variants
        # - drop a "Variants" column left by a previous run so re-running this
        #   cell does not produce Variants_x / Variants_y
        # - name the index "index" before reset_index so set_index("index")
        #   works whether or not the obs index already had a name
        a.obs = (
            a.obs.drop(columns="Variants", errors="ignore")
            .rename_axis("index")
            .reset_index()
            .merge(
                muts[["Patient","ROI","Variants"]],
                left_on=["Patient","LCM_ROI"],
                right_on=["Patient","ROI"],
                how="left",
            )
            .drop(columns="ROI")
            .set_index("index")
        )
    else:
        # Without an LCM_ROI column there is nothing to match per-ROI counts on.
        # (The original attempted that merge anyway; it could only fail and
        # print this message.)
        print("No LCM variants found")
        bulk = muts2.loc[muts2.Tumor_Sample_Barcode == block_name, "Variants"]
        if bulk.empty:
            print("No bulk variants found")
        else:
            a.obs["Variants"] = bulk.iloc[0]

    # str() guards against a missing (NaN) block_name, which has no .startswith
    if not has_lcm_roi and str(block_name).startswith(("WD", "S")):
        print("skipping")
    else:
        # save to master anndata object
        print("\tSaving to datasets/{}_master.h5ad".format(s), end="\n\n")
        a.write("datasets/{}_master.h5ad".format(s), compression="gzip")

Read adata from datasets/8899_5_SG00003_master.h5ad
	Saving to datasets/8899_5_SG00003_master.h5ad

Read adata from datasets/8899_6_SG00003_master.h5ad
	Saving to datasets/8899_6_SG00003_master.h5ad

Read adata from datasets/8899_7_SG00004_master.h5ad
	Saving to datasets/8899_7_SG00004_master.h5ad

Read adata from datasets/8899_8_SG00004_master.h5ad
	Saving to datasets/8899_8_SG00004_master.h5ad

Read adata from datasets/8899_1_SG00001_master.h5ad
	Saving to datasets/8899_1_SG00001_master.h5ad

Read adata from datasets/8899_2_SG00001_master.h5ad
	Saving to datasets/8899_2_SG00001_master.h5ad

Read adata from datasets/8899_3_SG00002_master.h5ad
	Saving to datasets/8899_3_SG00002_master.h5ad

Read adata from datasets/8899_4_SG00002_master.h5ad
	Saving to datasets/8899_4_SG00002_master.h5ad

