In [None]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from tqdm.auto import tqdm
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Print the path of the Python interpreter that backs this kernel.
import sys
print(sys.executable)


In [None]:
! pip show leidenalg

In [None]:
#python -m pip install stlearn igraph

In [None]:
## Load the metadata table and the count table (both tab-separated, with a header row)
# Metadata: indexed by its 'BC' column.
batch = pd.read_csv(
    "/QRISdata/Q1851/Prakrithi/Melanoma_scRNAseq/MPSs_count_metadata.txt",
    sep="\t",
    header=0,
    index_col='BC',
)
# Counts: the first column becomes the index, then the table is transposed so
# that its original columns become the rows handed to AnnData in the next cell.
rna_counts = pd.read_csv(
    "/QRISdata/Q1851/Prakrithi/Melanoma_scRNAseq/MPSs_count_mat.txt",
    sep="\t",
    header=0,
    index_col=0,
).T

In [None]:
## Build the AnnData object from the count table
rna_adata = sc.AnnData(rna_counts)

## Attach the first three metadata columns as per-cell annotations (.obs)
# Assigning a DataFrame to `.obs` is positional: rows are not matched on their
# labels. Reorder the metadata to the count-matrix row order first so that every
# cell receives its own annotations.
cell_meta = batch.iloc[:, [0, 1, 2]].copy()
if not cell_meta.index.equals(rna_counts.index):
    same_labels = (
        cell_meta.index.is_unique
        and set(cell_meta.index) == set(rna_counts.index)
    )
    if same_labels:
        # Same barcodes, different order: align on labels.
        cell_meta = cell_meta.loc[rna_counts.index]
    else:
        # Labels differ between the two files; fall back to the positional
        # assignment but make the mismatch visible.
        print(
            "WARNING: metadata barcodes do not match the count-matrix barcodes; "
            "metadata rows are attached by position."
        )
rna_adata.obs = cell_meta

## Make row and column names unique (cell barcodes live in .obs, genes/features in .var)
rna_adata.var_names_make_unique()
rna_adata.obs_names_make_unique()

In [None]:
# Display the AnnData summary as the cell output.
rna_adata

In [None]:
#! pip install -U scikit-image on terminal
import skimage

In [None]:
# Doublet testing: run Scrublet with "orig.ident" passed as the batch key.
# Later cells read the `predicted_doublet` and `doublet_score` columns of .obs.
sc.pp.scrublet(rna_adata, batch_key="orig.ident")

In [None]:
# Show the .obs rows whose `predicted_doublet` flag is True.
doublet_mask = rna_adata.obs['predicted_doublet'].eq(True)
rna_adata.obs.loc[doublet_mask]


In [None]:
# Write the per-cell annotation table, index included.
# Note: the content is tab-separated (sep="\t") although the file name ends in .csv.
rna_adata.obs.to_csv('/QRISdata/Q1851/Prakrithi/Melanoma_scRNAseq/rna_adata_obs.csv', sep="\t",index=True)

In [None]:
# Violin plot of the `doublet_score` column.
sc.pl.violin(rna_adata, ['doublet_score'])


In [None]:
## QC metrics per cell: genes detected, total counts and mitochondrial percentage
# Genes whose name starts with 'MT-' are flagged as mitochondrial.
rna_adata.var['mt'] = rna_adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(
    rna_adata,
    qc_vars=['mt'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

# One violin plot per metric. Scanpy prepends "violin" to the `save` string
# to build the name of the saved figure.
for qc_metric, file_suffix in (
    ('n_genes_by_counts', '_n_genes'),
    ('total_counts', '_total_counts'),
    ('pct_counts_mt', '_mito_pct'),
):
    sc.pl.violin(rna_adata, [qc_metric], save=file_suffix, jitter=0.4)

In [None]:
# Keep cells under the upper QC limits: fewer than 7000 detected genes and
# fewer than 20000 total counts. Only cells are filtered here; no gene filter
# and no mitochondrial cut-off is applied.
keep_cells = (
    (rna_adata.obs.n_genes_by_counts < 7000)
    & (rna_adata.obs.total_counts < 20000)
)
# .copy() turns the subset into an independent AnnData. Without it the result
# is a view, and the in-place normalisation in the next cell would have to
# copy it implicitly (with a warning).
rna_adata = rna_adata[keep_cells, :].copy()

In [None]:
## Normalizing the counts
# Scale every cell to a total of 1e4, then apply log1p.
# Neither call is assigned to a name: rna_adata itself is updated.
sc.pp.normalize_total(rna_adata, target_sum=1e4)
sc.pp.log1p(rna_adata)

In [None]:
## Identify the highly variable genes
sc.pp.highly_variable_genes(rna_adata, min_mean=0.0125, max_mean=3, min_disp=0.25)

# Keep the full gene set (log-normalised, before subsetting and scaling) in .raw
rna_adata.raw = rna_adata

# Restrict to the highly variable genes. .copy() makes the subset an
# independent AnnData so that the in-place scaling below does not operate on a
# view of the previous object.
rna_adata = rna_adata[:, rna_adata.var.highly_variable].copy()
sc.pp.scale(rna_adata, max_value=10)

In [None]:
# Lower dimension embedding - PCA
# Compute 200 components with the ARPACK solver, then plot the variance ratio of
# all of them on a log scale; scanpy builds the saved file name from `save`.
n_components = 200
sc.tl.pca(rna_adata, n_comps=n_components, svd_solver='arpack')
sc.pl.pca_variance_ratio(rna_adata, n_pcs=n_components, log=True, save='200pc')

In [None]:
# Lower dimension embedding - UMAP and clustering
# Neighbour graph with n_neighbors=30 and n_pcs=75, then UMAP and Leiden
# clustering at resolution 0.4; the clusters are drawn on the UMAP.
sc.pp.neighbors(rna_adata, n_neighbors=30, n_pcs=75)
sc.tl.umap(rna_adata)
sc.tl.leiden(rna_adata, resolution=0.4)
sc.pl.umap(rna_adata, color=['leiden'] , legend_loc = 'best')

In [None]:
# NOTE(review): these two calls repeat the last two lines of the previous cell
# with the same arguments — confirm whether this cell is still needed.
sc.tl.leiden(rna_adata, resolution=0.4)
sc.pl.umap(rna_adata, color=['leiden'] , legend_loc = 'best')

In [None]:
def one_col_lgd(umap):
    """Attach a frameless, single-column legend to the right of a plot.

    Parameters
    ----------
    umap
        Object exposing ``legend(...)``; in this notebook it is the value
        returned by ``sc.pl.umap(..., show=False)``.

    Returns
    -------
    The legend object, so callers can pass it to
    ``savefig(bbox_extra_artists=...)``.
    """
    legend = umap.legend(bbox_to_anchor=[1.00, 0.5],
                         loc='center left', ncol=1, prop={'size': 6})
    legend.get_frame().set_linewidth(0.0)
    # Matplotlib renamed ``Legend.legendHandles`` to ``legend_handles`` and
    # later removed the old name; prefer the new attribute and fall back to
    # the old one on older releases.
    handles = getattr(legend, 'legend_handles', None)
    if handles is None:
        handles = legend.legendHandles
    # Give every legend marker the same size.
    for handle in handles:
        handle.set_sizes([25.0])
    return legend

In [None]:
## Clustering without batch correction
# Each panel below is drawn without being shown, gets a single-column legend on
# the right, is resized to 5 x 5 inches and saved at 400 dpi.

# by sample
donor_umap = sc.pl.umap(
    rna_adata,
    color=['orig.ident'],
    title='Donor',
    palette=sns.color_palette("husl", 24),
    legend_fontsize=6,
    frameon=True,
    show=False,
)
lgd = one_col_lgd(donor_umap)
fig = donor_umap.get_figure()
fig.set_size_inches(5, 5)
fig.savefig(
    '/QRISdata/Q1851/Prakrithi/Melanoma_scRNAseq/umap_lgd_sample75pc_n',
    dpi=400,
    bbox_inches='tight',
    bbox_extra_artists=(lgd,),
)

# by cluster
leiden_umap = sc.pl.umap(
    rna_adata,
    color=['leiden'],
    title='Leiden',
    palette=sns.color_palette("husl", 24),
    legend_fontsize=6,
    frameon=True,
    show=False,
)
lgd = one_col_lgd(leiden_umap)
fig = leiden_umap.get_figure()
fig.set_size_inches(5, 5)
fig.savefig(
    '/QRISdata/Q1851/Prakrithi/Melanoma_scRNAseq/umap_lgd_leiden75pc_n',
    dpi=400,
    bbox_inches='tight',
    bbox_extra_artists=(lgd,),
)

In [None]:
## Batch correction
# Harmony reads obsm['X_pca'] and writes obsm['X_pca_harmony']; this cell then
# replaces 'X_pca' with the corrected values. Running the cell a second time
# would therefore feed already-corrected PCs back into Harmony. The guard skips
# the integration when 'X_pca' already equals the stored Harmony output, which
# makes the cell safe to re-run; a fresh PCA changes 'X_pca' and re-enables it.
pcs_already_corrected = (
    'X_pca_harmony' in rna_adata.obsm
    and np.array_equal(rna_adata.obsm['X_pca'], rna_adata.obsm['X_pca_harmony'])
)
if not pcs_already_corrected:
    sc.external.pp.harmony_integrate(rna_adata, 'orig.ident')
    rna_adata.obsm['X_pca'] = rna_adata.obsm['X_pca_harmony']

# Rebuild the neighbour graph, UMAP and Leiden clusters on the corrected PCs.
sc.pp.neighbors(rna_adata, n_neighbors=30, n_pcs=75)
sc.tl.umap(rna_adata)
sc.tl.leiden(rna_adata, resolution=0.4)

In [None]:
### Clustering after batch correction
# Same figure recipe for both panels: draw without showing, single-column
# legend on the right, 5 x 5 inches, saved at 400 dpi.

# by sample (uses scanpy's default colours; no palette is passed)
donor_umap = sc.pl.umap(
    rna_adata,
    color=['orig.ident'],
    title='Donor',
    legend_fontsize=6,
    frameon=True,
    show=False,
)
lgd = one_col_lgd(donor_umap)
fig = donor_umap.get_figure()
fig.set_size_inches(5, 5)
fig.savefig(
    '/QRISdata/Q1851/Prakrithi/Melanoma_scRNAseq/umap_lgd_harmony_sample75pcs_n',
    dpi=400,
    bbox_inches='tight',
    bbox_extra_artists=(lgd,),
)

# by cluster
leiden_umap = sc.pl.umap(
    rna_adata,
    color=['leiden'],
    title='Leiden',
    palette=sns.color_palette("husl", 24),
    legend_fontsize=6,
    frameon=True,
    show=False,
)
lgd = one_col_lgd(leiden_umap)
fig = leiden_umap.get_figure()
fig.set_size_inches(5, 5)
fig.savefig(
    '/QRISdata/Q1851/Prakrithi/Melanoma_scRNAseq/umap_lgd_harmony_leiden75pcs_n',
    dpi=400,
    bbox_inches='tight',
    bbox_extra_artists=(lgd,),
)

In [None]:
# UMAP coloured by the `leiden` column, with cluster labels drawn on the data.
sc.pl.umap(rna_adata, color="leiden", legend_loc='on data')

In [None]:
# UMAP coloured by the `doublet_score` column.
sc.pl.umap(rna_adata, color=['doublet_score'] , legend_loc = 'best')

In [None]:
# Display the AnnData summary as the cell output.
rna_adata

In [None]:
# UMAP coloured by the `leiden` column, with cluster labels drawn on the data.
# NOTE(review): same call as two cells above — confirm whether both are needed.
sc.pl.umap(rna_adata, color="leiden", legend_loc='on data')

In [None]:
# Leiden clustering at three resolutions; each result is stored under its own
# .obs key and the three are plotted side by side.
resolution_by_key = {
    "leiden_res0_25": 0.25,
    "leiden_res0_5": 0.5,
    "leiden_res1": 1.0,
}
for obs_key, leiden_resolution in resolution_by_key.items():
    sc.tl.leiden(rna_adata, key_added=obs_key, resolution=leiden_resolution)

sc.pl.umap(rna_adata, color=list(resolution_by_key), legend_loc="on data")

In [None]:
# `.raw` was already set in the highly-variable-gene cell, before the object was
# subset to those genes and scaled. Assigning it again here would replace that
# full-gene copy with the scaled, HVG-only matrix, so `.raw` is only set when
# it is missing.
if rna_adata.raw is None:
    rna_adata.raw = rna_adata
rna_adata = rna_adata[:, rna_adata.var["highly_variable"]].copy()

In [None]:
# Rank genes per Leiden cluster with the Wilcoxon method and show the top five
# genes of every cluster as a dot plot.
cluster_key = "leiden"
sc.tl.rank_genes_groups(rna_adata, groupby=cluster_key, method="wilcoxon")
sc.pl.rank_genes_groups_dotplot(
    rna_adata,
    groupby=cluster_key,
    n_genes=5,
    standard_scale="var",
)

In [None]:
# 6, 14 are doublets with keratinocytes?? Express KRT14

# Broad (Level1) label for each Leiden cluster id. Cluster "22" has no label yet.
new_cell_dict = {
    "0": "Melanocytes",
    "1": "KC", #
    "2": "Fibroblast",
    "3": "Immune", #T
    "4": "KC", #Basal
    "5": "Immune", #DC
    "6": "KC", #
    "7": "Immune", #T
    "8": "Endothelial cell", #
    "9": "KC", #
    "10": "KC", #
    "11": "KC", #
    "12": "Melanocytes", #
    "13": "KC", #Basal
    "14": "Fibroblast",
    "15": "Immune", #DC
    "16": "Immune", #DC
    "17": "Immune", #NK
    "18": "Immune", #LC (Immature DC)
    "19": "Endothelial cell",
    "20": "Immune", #DC
    "21": "KC", #stem/hair/stress
    "22": ""
}

# Finer (Level2) label for each Leiden cluster id; "#?" marks uncertain calls.
# (This dictionary used to be defined twice with identical content.)
new_cell_dict2 = {
    "0": "Melanocytes",
    "1": "KC Differentiating",
    "2": "Fibroblast",
    "3": "Treg",
    "4": "KC Basal",
    "5": "Macrophage",
    "6": "KC Cornified", #?
    "7": "T cell",
    "8": "Endothelial cell",
    "9": "KC Differentiating", #?
    "10": "KC nail",
    "11": "KC stem/hair/stress",
    "12": "Melanocytes",
    "13": "KC Basal",
    "14": "Fibroblast",
    "15": "DC",  #?
    "16": "DC", #pDC
    "17": "NK",
    "18": "LC",
    "19": "Endothelial cell",
    "20": "DC", #?
    "21": "KC stem/hair/stress",
    "22": ""
}

# Translate every cell's cluster id into its labels. A cluster id that is not
# a key of the dictionary raises KeyError.
rna_adata.obs["Level1"] = [new_cell_dict[x] for x in rna_adata.obs["leiden"]]
rna_adata.obs["Level2"] = [new_cell_dict2[x] for x in rna_adata.obs["leiden"]]

#mdata = idata[idata.obs["Level1"].isin(["Imm_Myeloid"])].copy()
#tdata = idata[idata.obs["Level1"].isin(["Imm_T cell"])].copy()

In [None]:
# UMAP panels coloured by the two annotation levels.
sc.pl.umap(rna_adata, color=["Level1", "Level2"])

In [None]:
#from tools import cmp

# One UMAP panel per gene, in the order listed.
umap_genes = ["CD3E", "KRT10", "KRT6A", "KRT17", "KRT15", "KRT2", "LOR", "IVL"]
sc.pl.umap(rna_adata, color=umap_genes)

In [None]:
# One UMAP panel per gene, in the order listed.
umap_genes_2 = [
    "RRM2", "HELLS", "UHRF1", "ASS1", "COL17A1", "POSTN",
    "KRT19", "GJB2", "KRT6A", "KRT16", "CCND1", "DEFB1",
    "CALML3", "CALML5", "ZNF750", "SPINK5", "CAPN3",
]
sc.pl.umap(rna_adata, color=umap_genes_2)

In [None]:
# Immune subclusters
# Marker genes grouped by cell type / state, shown per Leiden cluster.
# "PMEL" was listed twice in the Melanocyte group; it is now listed once.
immune_markers = {
    "General": ["ITGAM"],
    "Melanocyte": ["MITF", "PMEL", "TYR", "DCT", "MLANA", "S100A1"],
    "LC": ["CD207", "EPCAM", "ITGAX"],
    "Monocyte": ["CCR2", "CD14", "S100A8", "S100A9", "CX3CR1"],
    "DC1": ["XCR1", "CLEC9A"],
    "DC2": ["CLEC10A", "THBD"],
    "DC3": ["CSF3R", "CSF2RA"],
    "mRegDC": ["LAMP3", "CCR7"],
    "pDC": ["IL3RA", "KIT", "IRF8"],
    "Mac": ["C1QA", "TREM2", "SIGLEC1", "CD68", "LYVE1", "ARG1", "CD163"],
    "prolif": ["UBE2C", "NUSAP1", "MKI67"],
    "IFN": ["ISG15", "IFI27", "STAT1"],
    "contamination": ["CD3D"],
}
sc.pl.dotplot(
    rna_adata,
    immune_markers,
    standard_scale="var",
    color_map="Blues",
    groupby="leiden",
) #"leiden_res0_25"

In [None]:
# Not as useful compared to the umap gene expression graphs

# Immune and keratinocyte (KC) marker genes, shown per Leiden cluster.
kc_markers = {
    "Immune cell": [
        "CD3D", "CD3E", "CD4", "CD8A", "CCR7",
        "SELL", "CD27", "ITGAM", "S100A8",
    ],
    "prolif": ["MKI67"],
    "IFN": ["ISG15", "IFI27", "STAT1"],
    "KC Basal": ["KRT15", "C1orf56"],
    "KC Differentiating": ["KRT10"],
    "KC Cornified": ["LOR", "IVL"],
    "KC Granular": ["KRT2"],
    "KC stem/hair/stress": ["KRT16", "KRT17", "KRT6A"],
    "KC nail/nail": ["KRT6B", "KRT79"],
}
sc.pl.dotplot(
    rna_adata,
    kc_markers,
    groupby="leiden",
    standard_scale="var",
    color_map="Blues",
)

In [None]:
# T cell / NK / stromal marker genes, shown per Leiden cluster.
lymphoid_markers = {
    "T cell": ["CD3D", "CD3E", "CD4", "CD8A", "CCR7", "SELL", "CD27"],
    "Trm": ["CD69", "CRTAM", "S1PR1"],
    "Treg": ["FOXP3", "PDCD1", "CTLA4", "IL2RA"],
    "NK": ["NCR1", "GZMB", "GNLY", "XCL2"],
    "ILC": ["IL7R"],
    "prolif": ["MKI67"],
    "IFN": ["ISG15", "IFI27", "STAT1"],
    "Contamination": ["ITGAM", "S100A8"],
    "Endothelial": ["CLDN5", "PLVAP", "SPARCL1"],
    "Fibroblast": ["PDGFRA", "PDGFRB", "DCN"],
}
sc.pl.dotplot(
    rna_adata,
    lymphoid_markers,
    groupby="leiden",
    standard_scale="var",
    color_map="Blues",
)

# Cluster verification

In [None]:
# Combined marker panel, shown per Level2 label to check the annotation.
# "PMEL" was listed twice in the Melanocyte group; it is now listed once.
verification_markers = {
    "Immune cell": [
        "CD3D", "CD3E", "CD4", "CD8A", "CCR7",
        "SELL", "CD27", "ITGAM", "S100A8",
    ],
    "prolif": ["MKI67"],
    "IFN": ["ISG15", "IFI27", "STAT1"],
    "KC Basal": ["KRT15", "C1orf56"],
    "KC Differentiating": ["KRT10"],
    "KC Cornified": ["LOR", "IVL"],
    "KC Granular": ["KRT2"],
    "KC stem/hair/stress": ["KRT16", "KRT17", "KRT6A"],
    "KC nail/nail": ["KRT6B", "KRT79"],
    "General": ["ITGAM"],
    "Melanocyte": ["MITF", "PMEL", "TYR", "DCT", "MLANA", "S100A1"],
    "LC": ["CD207", "EPCAM", "ITGAX"],
    "Monocyte": ["CCR2", "CD14", "S100A8", "S100A9", "CX3CR1"],
    "DC1": ["XCR1", "CLEC9A"],
    "DC2": ["CLEC10A", "THBD"],
    "DC3": ["CSF3R", "CSF2RA"],
    "mRegDC": ["LAMP3", "CCR7"],
    "pDC": ["IL3RA", "KIT", "IRF8"],
    "Mac": ["C1QA", "TREM2", "SIGLEC1", "CD68", "LYVE1", "ARG1", "CD163"],
    "Endothelial": ["CLDN5", "PLVAP", "SPARCL1"],
    "Fibroblast": ["PDGFRA", "PDGFRB", "DCN"],
}
sc.pl.dotplot(
    rna_adata,
    verification_markers,
    standard_scale="var",
    color_map="Blues",
    groupby="Level2",
)

## res1 clusters

In [None]:
# Immune subclusters
# Marker genes grouped by cell type / state, shown per `leiden_res1` cluster.
# "PMEL" was listed twice in the Melanocyte group; it is now listed once.
immune_markers = {
    "General": ["ITGAM"],
    "Melanocyte": ["MITF", "PMEL", "TYR", "DCT", "MLANA", "S100A1"],
    "LC": ["CD207", "EPCAM", "ITGAX"],
    "Monocyte": ["CCR2", "CD14", "S100A8", "S100A9", "CX3CR1"],
    "DC1": ["XCR1", "CLEC9A"],
    "DC2": ["CLEC10A", "THBD"],
    "DC3": ["CSF3R", "CSF2RA"],
    "mRegDC": ["LAMP3", "CCR7"],
    "pDC": ["IL3RA", "KIT", "IRF8"],
    "Mac": ["C1QA", "TREM2", "SIGLEC1", "CD68", "LYVE1", "ARG1", "CD163"],
    "prolif": ["UBE2C", "NUSAP1", "MKI67"],
    "IFN": ["ISG15", "IFI27", "STAT1"],
    "contamination": ["CD3D"],
}
sc.pl.dotplot(
    rna_adata,
    immune_markers,
    standard_scale="var",
    color_map="Blues",
    groupby="leiden_res1",
)

In [None]:
# Not as useful compared to the umap gene expression graphs

# Immune and keratinocyte (KC) marker genes, shown per `leiden_res1` cluster.
kc_markers_res1 = {
    "Immune cell": [
        "CD3D", "CD3E", "CD4", "CD8A", "CCR7",
        "SELL", "CD27", "ITGAM", "S100A8",
    ],
    "prolif": ["MKI67"],
    "IFN": ["ISG15", "IFI27", "STAT1"],
    "KC Basal": ["KRT15", "C1orf56"],
    "KC Differentiating": ["KRT10"],
    "KC Cornified": ["LOR", "IVL"],
    "KC Granular": ["KRT2"],
    "KC stem/hair/stress": ["KRT16", "KRT17", "KRT6A"],
    "KC nail/nail": ["KRT6B", "KRT79"],
    "KC beta-catenin+": ["C1orf56"],
}
sc.pl.dotplot(
    rna_adata,
    kc_markers_res1,
    groupby="leiden_res1",
    standard_scale="var",
    color_map="Blues",
)

In [None]:
# T cell / NK / stromal marker genes, shown per `leiden_res1` cluster.
lymphoid_markers_res1 = {
    "T cell": ["CD3D", "CD3E", "CD4", "CD8A", "CCR7", "SELL", "CD27"],
    "Trm": ["CD69", "CRTAM", "S1PR1"],
    "Treg": ["FOXP3", "PDCD1", "CTLA4", "IL2RA"],
    "NK": ["NCR1", "GZMB", "GNLY", "XCL2"],
    "ILC": ["IL7R"],
    "prolif": ["MKI67"],
    "IFN": ["ISG15", "IFI27", "STAT1"],
    "Contamination": ["ITGAM", "S100A8"],
    "Endothelial": ["CLDN5", "PLVAP", "SPARCL1"],
    "Fibroblast": ["PDGFRA", "PDGFRB", "DCN"],
}
sc.pl.dotplot(
    rna_adata,
    lymphoid_markers_res1,
    groupby="leiden_res1",
    standard_scale="var",
    color_map="Blues",
)

In [None]:

# Broad label for each `leiden_res1` cluster id ("0" to "32").
# An empty string means the cluster has not been labelled; the trailing
# comments record the finer call behind a broad label.
new_cell_dict1 = {
    "0": "KC", #Differentiating
    "1": "Melanocytes",
    "2": "KC",
    "3": "Fibroblast",
    "4": "Immune",  #T
    "5": "Endothelial cell",
    "6": "Melanocytes",
    "7": "Immune",  #Mac
    "8": "Immune", #T
    "9": "Fibroblast",
    "10": "KC",
    "11": "Immune", #IFN T
    "12": "KC",
    "13": "KC",
    "14": "Immune", #Treg
    "15": "Melanocytes",
    "16": "KC",
    "17": "KC",
    "18": "KC",
    "19": "Melanocytes",
    "20": "KC",
    "21": "KC",
    "22": "Immune", #DC
    "23": "",
    "24": "Immune", #NK
    "25": "Fibroblast",
    "26": "Immune", # T
    "27": "Endothelial cell",
    "28": "Immune", #NK
    "29": "",
    "30": "Immune", #IFN T
    "31": "",
    "32": "",
}

# Finer label for each `leiden_res1` cluster id; "#?" marks uncertain calls and
# an empty string means no fine label has been assigned.
new_cell_dict1_2 = {
    "0": "KC Differentiating",
    "1": "Melanocytes",
    "2": "KC Basal",
    "3": "Fibroblast",
    "4": "T cell",
    "5": "Endothelial cell",
    "6": "Melanocytes",
    "7": "Macrophage",
    "8": "CD8+ T cell",
    "9": "Fibroblast",
    "10": "KC Granular",
    "11": "",
    "12": "KC Differentiating", #?
    "13": "KC Cornified",
    "14": "Treg", #Treg
    "15": "Melanocytes",
    "16": "KC stem/hair/stress",
    "17": "KC nail",
    "18": "", #?
    "19": "Melanocytes",
    "20": "KC Basal",
    "21": "KC stem/hair/stress",
    "22": "DC", #DC
    "23": "",
    "24": "NK",
    "25": "Fibroblast",
    "26": "",
    "27": "Endothelial cell",
    "28": "NK",
    "29": "",
    "30": "",
    "31": "",
    "32": "",
}

# Translate every cell's `leiden_res1` cluster id into its broad and fine labels,
# then show both on the UMAP. An id missing from a dictionary raises KeyError.
res1_clusters = rna_adata.obs["leiden_res1"]
rna_adata.obs["Level1_res1"] = [new_cell_dict1[cluster] for cluster in res1_clusters]
rna_adata.obs["Level2_res1"] = [new_cell_dict1_2[cluster] for cluster in res1_clusters]

sc.pl.umap(rna_adata, color=["Level1_res1", "Level2_res1"])

## Subset immune cells

In [None]:
# Subset Immune Cells
# .copy() makes the subset an independent AnnData; without it `idata` is a view
# of rna_adata and the `var["mt"]` assignment below would modify a view.
idata = rna_adata[rna_adata.obs.Level1_res1 == 'Immune'].copy()

# Flag mitochondrial genes (names starting with "MT-") and compute QC metrics.
idata.var["mt"] = idata.var_names.str.startswith("MT-")
sc.pp.calculate_qc_metrics(
    idata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True
)
# NOTE(review): rna_adata was already normalised, log-transformed and scaled in
# earlier cells, so these calls process transformed values a second time —
# confirm this is intended; raw counts are not kept anywhere in this notebook.
sc.pp.normalize_total(idata, target_sum=1e4)
sc.pp.log1p(idata)
sc.pp.highly_variable_genes(idata, min_mean=0.0125, max_mean=3, min_disp=0.5)


In [None]:
# 6, 14 are doublets with keratinocytes?? Express KRT14

# NOTE(review): this re-binds `new_cell_dict` and overwrites the "Level1" column
# written by the earlier annotation cell. Only cluster "11" keeps a label here;
# every other cluster becomes an empty string. Confirm this is intended before
# the object is saved further down.
new_cell_dict = {
    "0": "",
    "1": "", #
    "2": "",
    "3": "", #T
    "4": "", #Basal
    "5": "", #DC
    "6": "", #
    "7": "", #T
    "8": "", #
    "9": "", #
    "10": "", #
    "11": "Melanocytes", #
    "12": "", #
    "13": "", #Basal
    "14": "",
    "15": "", #DC
    "16": "",
    "17": "", #
    "18": "",
    "19": "",
    "20": "", #DC
    "21": "", #stem/hair/stress
    "22": ""
}
# The assignment used to be executed twice in this cell with the same result;
# once is enough.
rna_adata.obs["Level1"] = [new_cell_dict[x] for x in rna_adata.obs["leiden"]]
#rna_adata.obs["Level2"] = [new_cell_dict2[x] for x in rna_adata.obs["leiden"]]

#mdata = idata[idata.obs["Level1"].isin(["Imm_Myeloid"])].copy()
#tdata = idata[idata.obs["Level1"].isin(["Imm_T cell"])].copy()

In [None]:
# Display the `leiden_res1` cluster assignment of every cell.
rna_adata.obs["leiden_res1"]

In [None]:
# Save the annotated object to an .h5ad file.
rna_adata.write_h5ad("/QRISdata/Q1851/Prakrithi/Melanoma_scRNAseq/Mel_3samples_75pcs.h5ad")

In [None]:
# Reload the object from the .h5ad file written in the previous cell,
# replacing the in-memory rna_adata.
import anndata
rna_adata=anndata.read_h5ad("/QRISdata/Q1851/Prakrithi/Melanoma_scRNAseq/Mel_3samples_75pcs.h5ad")

In [None]:
# Display the matrix stored in `.raw` of the reloaded object.
rna_adata.raw.X