In [None]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from tqdm.auto import tqdm
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.cluster import AgglomerativeClustering
import anndata

In [None]:
import sys
print(sys.executable)


In [None]:
## Load the per-cell metadata (indexed by its 'BC' column) and the count table
metadata_path = "/QRISdata/Q1851/Prakrithi/Melanoma_scRNAseq/MPSs_count_metadata.txt"
counts_path = "/QRISdata/Q1851/Prakrithi/Melanoma_scRNAseq/MPSs_count_mat.txt"

batch = pd.read_csv(metadata_path, sep="\t", header=0, index_col='BC')

# The count table is transposed after reading, so the file's columns become rows
rna_counts = pd.read_csv(counts_path, sep="\t", header=0, index_col=0).T

In [None]:
## Build the AnnData object from the count table (rows -> .obs, columns -> .var)
rna_adata = sc.AnnData(rna_counts)

## Attach the first three metadata columns as per-cell annotations.
# The metadata is attached row by row, so its rows must be in the same order as the rows
# of the count matrix. Realign by barcode when the two tables hold the same cells in a
# different order.
obs_meta = batch.iloc[:, [0, 1, 2]]
if not obs_meta.index.equals(rna_adata.obs_names):
    same_cells = (
        obs_meta.index.is_unique
        and rna_adata.obs_names.is_unique
        and set(obs_meta.index) == set(rna_adata.obs_names)
    )
    if same_cells:
        obs_meta = obs_meta.loc[rna_adata.obs_names]
    else:
        # Barcodes cannot be matched by name: keep the previous positional behaviour,
        # but make the situation visible.
        print("WARNING: metadata barcodes do not match the count-matrix barcodes; "
              "metadata is being attached by row position.")
rna_adata.obs = obs_meta
#rna_adata.obs['sample'] = batch['orig.ident']

## Make gene (.var) and cell barcode (.obs) names unique
rna_adata.var_names_make_unique()
rna_adata.obs_names_make_unique()

In [None]:
rna_adata

In [None]:
#! pip install -U scikit-image on terminal
import skimage

In [None]:
#Doublet testing
# Runs scanpy's Scrublet wrapper with `orig.ident` as the batch key, then shows the
# `doublet_score` values as a violin plot.
# NOTE(review): no cell in the visible notebook filters on the doublet calls — confirm
# that this is intentional.
sc.pp.scrublet(rna_adata, batch_key="orig.ident")
sc.pl.violin(rna_adata, ['doublet_score'])


In [None]:
## Per-cell QC: flag genes whose name starts with 'MT-' and compute the QC metrics
is_mito = rna_adata.var_names.str.startswith('MT-')
rna_adata.var['mt'] = is_mito
sc.pp.calculate_qc_metrics(
    rna_adata,
    qc_vars=['mt'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

# Violin plots of genes detected, total counts and mitochondrial percentage per cell
qc_keys = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
sc.pl.violin(rna_adata, qc_keys, jitter=0.4)


In [None]:
# Remove cells with very high gene or count totals. Only these two upper bounds are
# applied here: there is no lower bound, mitochondrial cut-off or gene-level filter.
keep_cells = (rna_adata.obs.n_genes_by_counts < 7000) & (rna_adata.obs.total_counts < 20000)
# `.copy()` turns the filtered view into a standalone object, so the in-place
# normalisation that follows does not operate on a view of the unfiltered data.
rna_adata = rna_adata[keep_cells, :].copy()

In [None]:
rna_adata

In [None]:
## Normalizing the counts
# Scale every cell to a total of 1e4 counts, then apply log1p. Neither call's return value
# is kept, so both are relied on to modify `rna_adata` in place.
sc.pp.normalize_total(rna_adata, target_sum=1e4)
sc.pp.log1p(rna_adata)

In [None]:
rna_adata

In [None]:
rna_adata.X

In [None]:
## Identifying the highly variable genes
sc.pp.highly_variable_genes(rna_adata, min_mean=0.0125, max_mean=3, min_disp=0.25)

# Freeze the log-normalised matrix with the full gene set in `.raw` before subsetting and
# scaling. Marker genes that are not highly variable then remain available for plotting
# and differential expression.
rna_adata.raw = rna_adata

# Keep only the highly variable genes; `.copy()` makes the subset a standalone object so
# the in-place scaling below does not run on a view.
rna_adata = rna_adata[:, rna_adata.var.highly_variable].copy()
sc.pp.scale(rna_adata, max_value=10)

In [None]:
rna_adata

In [None]:
# Lower dimension embedding - PCA: compute 200 components, then plot their variance ratios
n_components = 200
sc.tl.pca(rna_adata, n_comps=n_components, svd_solver='arpack')
# `save` makes scanpy write the figure to disk; scanpy generates the filename automatically
sc.pl.pca_variance_ratio(rna_adata, n_pcs=n_components, log=True, save='200pc')

In [None]:
# Lower dimension embedding - UMAP and clustering
# Neighbour graph from 75 PCs with 30 neighbours, followed by UMAP and Leiden clustering
# at resolution 1; the last line colours the UMAP by Leiden cluster.
sc.pp.neighbors(rna_adata, n_neighbors=30, n_pcs=75)
sc.tl.umap(rna_adata)
sc.tl.leiden(rna_adata, resolution=1)
sc.pl.umap(rna_adata, color=['leiden'] , legend_loc = 'best')

In [None]:
def one_col_lgd(umap):
    """Attach a single-column legend to the right of a plot and return it.

    Parameters
    ----------
    umap
        Object exposing ``legend(...)``; this notebook passes the value returned by
        ``sc.pl.umap(..., show=False)``.

    Returns
    -------
    The legend object returned by ``umap.legend``, with its frame line width set to 0
    and every legend handle resized to 25.
    """
    legend = umap.legend(bbox_to_anchor=[1.00, 0.5],
                         loc='center left', ncol=1, prop={'size': 6})
    legend.get_frame().set_linewidth(0.0)
    # Matplotlib renamed `Legend.legendHandles` to `legend_handles` and later removed the
    # old name, so use whichever attribute this version provides.
    handles = getattr(legend, 'legend_handles', None)
    if handles is None:
        handles = legend.legendHandles
    for handle in handles:
        handle.set_sizes([25.0])
    return legend

## Clustering without batch correction
# by donor: 24 colours requested from seaborn's "tab20" palette
donor_palette = sns.color_palette("tab20", 24)
donor_umap = sc.pl.umap(
    rna_adata,
    color=['orig.ident'],
    palette=donor_palette,
    title='Donor',
    frameon=True,
    legend_fontsize=6,
    show=False,
)
lgd = one_col_lgd(donor_umap)
fig = donor_umap.get_figure()
fig.set_size_inches(5, 5)

# by cluster: 24 colours from the "husl" palette
cluster_palette = sns.color_palette("husl", 24)
leiden_umap = sc.pl.umap(
    rna_adata,
    color=['leiden'],
    palette=cluster_palette,
    title='Leiden',
    frameon=True,
    legend_fontsize=6,
    show=False,
)
lgd = one_col_lgd(leiden_umap)
fig = leiden_umap.get_figure()
fig.set_size_inches(5, 5)


In [None]:
## Batch correction
# Harmony stores the corrected embedding in `obsm['X_pca_harmony']`.
sc.external.pp.harmony_integrate(rna_adata, 'orig.ident')
# Build the neighbour graph directly on the corrected embedding instead of overwriting
# `obsm['X_pca']`. The uncorrected PCA stays available, and re-running this cell feeds
# Harmony the original PCs again rather than already-corrected ones.
sc.pp.neighbors(rna_adata, n_neighbors=30, n_pcs=75, use_rep='X_pca_harmony')
sc.tl.umap(rna_adata)
sc.tl.leiden(rna_adata, resolution=1)

In [None]:
### Clustering after batch correction
# by sample (no explicit palette is passed for this plot)
donor_umap = sc.pl.umap(
    rna_adata,
    color=['orig.ident'],
    title='Donor',
    frameon=True,
    legend_fontsize=6,
    show=False,
)
lgd = one_col_lgd(donor_umap)
fig = donor_umap.get_figure()
fig.set_size_inches(5, 5)

# by cluster: 24 colours requested from seaborn's "tab20" palette
cluster_palette = sns.color_palette("tab20", 24)
leiden_umap = sc.pl.umap(
    rna_adata,
    color=['leiden'],
    palette=cluster_palette,
    title='Leiden',
    frameon=True,
    legend_fontsize=6,
    show=False,
)
lgd = one_col_lgd(leiden_umap)
fig = leiden_umap.get_figure()
fig.set_size_inches(5, 5)


In [None]:
sc.pl.umap(rna_adata, color="leiden", legend_loc='on data')

In [None]:
# Keep a frozen copy of the expression matrix in `.raw` for marker analysis and plotting.
# It is only set when no `.raw` exists yet, so a `.raw` that is already stored is not
# replaced by the current, further-processed matrix.
if rna_adata.raw is None:
    rna_adata.raw = rna_adata
rna_adata = rna_adata[:, rna_adata.var["highly_variable"]].copy()

In [None]:
# Rank genes for each Leiden cluster with a Wilcoxon test, then show the top 5 genes per
# cluster in a dot plot (standard_scale="var").
sc.tl.rank_genes_groups(rna_adata, groupby="leiden", method="wilcoxon")
sc.pl.rank_genes_groups_dotplot(
    rna_adata, groupby="leiden", standard_scale="var", n_genes=5
)

In [None]:
# Immune subclusters: marker genes per candidate cell type, shown per Leiden cluster.
# Each gene is listed once within its group ("PMEL" was previously listed twice under
# "Melanocyte", which repeated the same gene in the plot).
immune_marker_genes = {
    "General": ["ITGAM"],
    "Melanocyte": ["MITF", "PMEL", "TYR", "DCT", "MLANA", "S100A1"],
    "LC": ["CD207", "EPCAM", "ITGAX"],
    "Monocyte": ["CCR2", "CD14", "S100A8", "S100A9", "CX3CR1"],
    "DC1": ["XCR1", "CLEC9A"],
    "DC2": ["CLEC10A", "THBD"],
    "DC3": ["CSF3R", "CSF2RA"],
    "mRegDC": ["LAMP3", "CCR7"],
    "pDC": ["IL3RA", "KIT", "IRF8"],
    "Mac": ["C1QA", "TREM2", "SIGLEC1", "CD68", "LYVE1", "ARG1", "CD163"],
    "prolif": ["UBE2C", "NUSAP1", "MKI67"],
    "IFN": ["ISG15", "IFI27", "STAT1"],
    "contamination": ["CD3D"],
    "sweat gland": ["KRT7", "KRT19", "AQP5", "SCGB2A2", "DCD", "SCNN1A"],
}

sc.pl.dotplot(
    rna_adata,
    immune_marker_genes,
    standard_scale="var",
    color_map="Blues",
    groupby="leiden",
) #"leiden_res0_25"

In [None]:
# Not as useful compared to the umap gene expression graphs

# Marker panel: immune, proliferation, interferon, keratinocyte (KC) and sweat gland genes
kc_marker_genes = {
    "Immune cell": ["CD3D", "CD3E", "CD4", "CD8A", "CCR7", "SELL", "CD27", "ITGAM", "S100A8"],
    "prolif": ["MKI67"],
    "IFN": ["ISG15", "IFI27", "STAT1"],
    "KC Basal": ["KRT15", "C1orf56"],
    "KC Differentiating": ["KRT10"],
    "KC Cornified": ["LOR", "IVL"],
    "KC Granular": ["KRT2"],
    "KC stem/hair/stress": ["KRT16", "KRT17", "KRT6A"],
    "KC nail/nail": ["KRT6B", "KRT79"],
    "sweat gland": ["KRT7", "KRT19", "AQP5", "SCGB2A2", "DCD", "SCNN1A"],
}

# Dot plot of the panel per Leiden cluster
sc.pl.dotplot(
    rna_adata,
    kc_marker_genes,
    groupby="leiden",
    standard_scale="var",
    color_map="Blues",
)

In [None]:
# Marker panel: T cell / NK / ILC subsets plus endothelial, fibroblast and B cell genes
lymphoid_stromal_marker_genes = {
    "T cell": ["CD3D", "CD3E", "CD4", "CD8A", "CCR7", "SELL", "CD27"],
    "Trm": ["CD69", "CRTAM", "S1PR1"],
    "Treg": ["FOXP3", "PDCD1", "CTLA4", "IL2RA"],
    "NK": ["NCR1", "GZMB", "GNLY", "XCL2"],
    "ILC": ["IL7R"],
    "prolif": ["MKI67"],
    "IFN": ["ISG15", "IFI27", "STAT1"],
    "Contamination": ["ITGAM", "S100A8"],
    "Endothelial": ["CLDN5", "PLVAP", "SPARCL1"],
    "Fibroblast": ["PDGFRA", "PDGFRB", "DCN"],
    "B cell": ["CD79A", "CD22"],
}

# Dot plot of the panel per Leiden cluster
sc.pl.dotplot(
    rna_adata,
    lymphoid_stromal_marker_genes,
    groupby="leiden",
    standard_scale="var",
    color_map="Blues",
)

In [None]:
# 6, 14 are doublets with keratinocytes?? Express KRT14

# Cluster label -> coarse (Level1) annotation. Keys are strings; "6,0" and "6,1" are the
# two sub-clusters of cluster 6.
# NOTE(review): several trailing comments below (e.g. "#T", "#Basal", "#DC") disagree with
# the value on the same line — they look like leftovers from an earlier annotation; confirm
# or remove.
new_cell_dict = {
    "0": "KC",
    "1": "Melanocytes", #
    "2": "KC",
    "3": "Fibroblast", #T
    "4": "Immune", #Basal
    "5": "Endothelial cell", #DC
    "6,0": "Melanocytes", #
    "6,1": "Other", #
    "7": "Immune", #T
    "8": "Immune", #
    "9": "Fibroblast", #
    "10": "KC", #
    "11": "Immune", #
    "12": "KC", #
    "13": "Immune", #Basal
    "14": "KC",
    "15": "Melanocytes", #DC
    "16": "KC", #DC
    "17": "KC", #NK
    "18": "KC", #LC (Immature DC)
    "19": "Melanocytes",
    "20": "Sweat gland related", #DC
    "21": "KC", #stem/hair/stress
    "22": "Immune", #???
    "23":"Immune",
    "24":"Immune",
    "25":"Immune",
    "26":"Immune",
    "27":"Endothelial cell",
    "28":"Immune",
    "29":"Immune",
    "30":"Immune",
    "31":"Immune",
    "32":"Immune",
}

# Cluster label -> finer (Level2) annotation, using the same keys as `new_cell_dict`.
new_cell_dict2 = {
    "0": "KC Differentiating",
    "1": "Melanocytes", #
    "2": "KC Basal",
    "3": "Fibroblast", #T
    "4": "T cell", #Basal
    "5": "Endothelial cell", #DC
    "6,0": "Melanocytes", #
    "6,1": "Schwann Cells", #
    "7": "Macrophage", #T
    "8": "CD8+ Tcell", #
    "9": "Fibroblast", #
    "10": "KC Granular", #
    "11": "DC", #
    "12": "KC Differentiating", #
    "13": "Treg", #Basal
    "14": "KC Cornified",
    "15": "Melanocytes", #DC
    "16": "KC stem/hair/stress", #DC
    "17": "KC Nail", #NK
    "18": "KC Basal", #LC (Immature DC)
    "19": "Melanocytes",
    "20": "Sweat gland related", #DC
    "21": "KC Basal", #stem/hair/stress
    "22": "DC", #???
    "23":"Mast Cells",
    "24":"NK",
    "25":"Pericytes",
    "26":"LC",
    "27":"Endothelial cell",
    "28":"pDC",
    "29":"Pericytes",
    "30":"DC",
    "31":"B cell",
    "32":"Pericytes",
}


# Translate each refined Leiden label (`leiden_R`) into its Level1 / Level2 annotation.
# `leiden_R` is produced by the sub-clustering cell (sc.tl.leiden with `restrict_to`), so
# fail early with a clear message when that cell has not been run yet.
if "leiden_R" not in rna_adata.obs:
    raise KeyError(
        "rna_adata.obs has no 'leiden_R' column; run the sub-clustering cell "
        "(sc.tl.leiden with restrict_to) before annotating."
    )

cluster_labels = rna_adata.obs["leiden_R"].astype(str)
for level_name, label_map in (("Level1", new_cell_dict), ("Level2", new_cell_dict2)):
    # Report every cluster without an annotation at once instead of a bare KeyError
    unmapped = sorted(set(cluster_labels) - set(label_map))
    if unmapped:
        raise KeyError(f"{level_name}: no annotation for leiden_R cluster(s) {unmapped}")
    rna_adata.obs[level_name] = cluster_labels.map(label_map)
#rna_adata.obs["Level2"] = [new_cell_dict2[x] for x in rna_adata.obs["leiden"]]

#mdata = idata[idata.obs["Level1"].isin(["Imm_Myeloid"])].copy()
#tdata = idata[idata.obs["Level1"].isin(["Imm_T cell"])].copy()

sc.pl.umap(
    rna_adata,
    color=["Level1","Level2"],
)

In [None]:
# For every Leiden cluster, list the distinct Level3 labels found among its cells.
# NOTE(review): in the visible notebook `obs['Level3']` is first assigned in a later cell,
# so this cell depends on state from an earlier session — confirm the intended cell order.
rna_adata.obs.groupby('leiden')['Level3'].apply(lambda x: list(set(x))).reset_index()

In [None]:
# Derive 'original_Level2_with_Melanoma' from 'original_Level2': start from a copy and
# register 'Melanoma' as an additional category
level2_with_melanoma = rna_adata.obs['original_Level2'].copy()
level2_with_melanoma = level2_with_melanoma.cat.add_categories(['Melanoma'])
rna_adata.obs['original_Level2_with_Melanoma'] = level2_with_melanoma

# Cells from sample 'MPS13' that are labelled 'Melanocytes' are relabelled 'Melanoma'
from_mps13 = rna_adata.obs['orig.ident'] == 'MPS13'
is_melanocyte = rna_adata.obs['original_Level2_with_Melanoma'] == 'Melanocytes'
rna_adata.obs.loc[from_mps13 & is_melanocyte, 'original_Level2_with_Melanoma'] = 'Melanoma'

# UMAP coloured by the updated labels
sc.pl.umap(rna_adata, color=['original_Level2_with_Melanoma'])


In [None]:
sc.pl.umap(rna_adata, color=['original_Level2_with_Melanoma'],legend_loc='on data',legend_fontsize=6  )


In [None]:
rna_adata.obs['copykat_pred']

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Copy the CopyKAT and inferCNV prediction columns over from `rna_adata_OLD`.
# NOTE(review): `rna_adata_OLD` is not created anywhere in the visible notebook — it has to
# exist in the kernel already; confirm where it is loaded.
rna_adata.obs['copykat_pred']=rna_adata_OLD.obs['copykat_pred']
rna_adata.obs['infercnv_pred']=rna_adata_OLD.obs['infercnv_pred']


# Standalone copy of the cells whose 'original_Level2' label is 'Melanocytes'
mel_only = rna_adata[rna_adata.obs['original_Level2'] == 'Melanocytes'].copy()

# Ensure `mel_only` is your AnnData object

def handle_categorical(column):
    """Return `column` with missing values replaced by the string 'nan'.

    A column that already has the pandas ``category`` dtype keeps it: 'nan' is added as
    a category when absent and then used as the fill value. Any other column is filled
    first and the filled values are wrapped in a ``pd.Categorical``.
    """
    if column.dtype.name != 'category':
        # Plain column: fill, then build the categorical from the filled values
        return pd.Categorical(column.fillna('nan'))

    # Categorical column: the fill value must be a known category before filling
    known_categories = column.cat.categories
    if 'nan' not in known_categories:
        column = column.cat.add_categories(['nan'])
    return column.fillna('nan')

# Handle categorical columns: replace missing predictions in `mel_only` with the string 'nan'
mel_only.obs['copykat_pred'] = handle_categorical(mel_only.obs['copykat_pred'])
mel_only.obs['infercnv_pred'] = handle_categorical(mel_only.obs['infercnv_pred'])

# Create a combined clustering column focusing on "Aneuploid" clusters and NA values
def categorize_clusters(row):
    """Classify one row by which prediction column(s) flag it as aneuploid.

    Reads ``row['infercnv_pred']`` (compared with 'Aneuploid') and ``row['copykat_pred']``
    (compared with 'aneuploid') and returns one of 'overlap', 'infercnv_only',
    'copykat_only' or 'none'.
    """
    infercnv_hit = row['infercnv_pred'] == 'Aneuploid'
    copykat_hit = row['copykat_pred'] == 'aneuploid'

    if infercnv_hit:
        return 'overlap' if copykat_hit else 'infercnv_only'
    return 'copykat_only' if copykat_hit else 'none'

# One label per cell describing which tool(s) called it aneuploid
mel_only.obs['combined_clusters'] = mel_only.obs.apply(categorize_clusters, axis=1)

# Define the color map
colors = {
    'infercnv_only': 'blue',
    'copykat_only': 'orange',
    'overlap': 'red',
    'none': 'lightgrey'
}

# Sort the data to ensure 'none' (grey) is at the back, blue/orange in the middle, and red at the front
mel_only.obs['sort_order'] = mel_only.obs['combined_clusters'].map({
    'none': 0,             # Grey: plotted first (back)
    'infercnv_only': 2,     # Blue: middle
    'copykat_only': 3,      # Orange: middle
    'overlap': 4            # Red: plotted last (front)
})

# Reorder the cells by draw order. `.copy()` makes the reordered object standalone, so the
# plotting call and later `.obs` assignments do not write into a view.
mel_only = mel_only[mel_only.obs.sort_values('sort_order').index].copy()

# Plot UMAP with custom colors and point size, ensuring grey is in the background
sc.pl.umap(mel_only, color='combined_clusters', title='UMAP of Combined Clusters', size=30, palette=colors)


In [None]:
# Sort the cells by sample so that MPS42 is drawn first (back) and MPS13 last (front).
# The sample ID lives in `orig.ident`. The previous code mapped these sample IDs against
# the cell-type column, so every sort key was NaN and the sort had no effect.
sample_draw_order = {
    'MPS42': 0,   # plotted first (back)
    'MPS43': 2,   # middle
    'MPS13': 3,   # plotted last (front)
}
mel_only.obs['sort_order_SAMPLE'] = mel_only.obs['orig.ident'].astype(str).map(sample_draw_order)

# Reorder by draw order; `.copy()` keeps the result a standalone object rather than a view
mel_only = mel_only[mel_only.obs.sort_values('sort_order_SAMPLE').index].copy()

# UMAP coloured by the melanocyte / melanoma label
sc.pl.umap(mel_only, color='original_Level2_with_Melanoma')


In [None]:
mel_only.obs['original_Level2_with_Melanoma'].value_counts()

In [None]:
mel_only.obs['combined_clusters'].value_counts()

In [None]:
# Copy the prediction columns from `rna_adata_OLD` again; the same two assignments appear
# in an earlier cell of this notebook.
# NOTE(review): `rna_adata_OLD` is not created anywhere in the visible notebook.
rna_adata.obs['copykat_pred']=rna_adata_OLD.obs['copykat_pred']
rna_adata.obs['infercnv_pred']=rna_adata_OLD.obs['infercnv_pred']


#mel_only = rna_adata[rna_adata.obs['original_Level2'] == 'Melanocytes'].copy()

# Ensure `mel_only` is your AnnData object

def handle_categorical(column):
    """Return `column` with missing values replaced by the string 'nan'.

    A function with this name is also defined in an earlier cell of this notebook; this
    definition shadows it. A column with the pandas ``category`` dtype keeps that dtype
    ('nan' is added as a category when absent); any other column is filled and wrapped in
    a ``pd.Categorical``.
    """
    if column.dtype.name != 'category':
        # Plain column: fill, then build the categorical from the filled values
        return pd.Categorical(column.fillna('nan'))

    # Categorical column: the fill value must be a known category before filling
    known_categories = column.cat.categories
    if 'nan' not in known_categories:
        column = column.cat.add_categories(['nan'])
    return column.fillna('nan')

# Handle categorical columns: replace missing predictions in `rna_adata` with the string 'nan'
rna_adata.obs['copykat_pred'] = handle_categorical(rna_adata.obs['copykat_pred'])
rna_adata.obs['infercnv_pred'] = handle_categorical(rna_adata.obs['infercnv_pred'])

# Create a combined clustering column focusing on "Aneuploid" clusters and NA values
def categorize_clusters(row):
    """Classify one row by which prediction column(s) flag it as aneuploid.

    A function with this name is also defined in an earlier cell of this notebook; this
    definition shadows it. Reads ``row['infercnv_pred']`` (compared with 'Aneuploid') and
    ``row['copykat_pred']`` (compared with 'aneuploid') and returns one of 'overlap',
    'infercnv_only', 'copykat_only' or 'none'.
    """
    infercnv_hit = row['infercnv_pred'] == 'Aneuploid'
    copykat_hit = row['copykat_pred'] == 'aneuploid'

    if infercnv_hit:
        return 'overlap' if copykat_hit else 'infercnv_only'
    return 'copykat_only' if copykat_hit else 'none'

# One label per cell describing which tool(s) called it aneuploid
rna_adata.obs['Aneuploid_combined_clusters'] = rna_adata.obs.apply(categorize_clusters, axis=1)

# Define the color map
colors = {
    'infercnv_only': 'blue',
    'copykat_only': 'orange',
    'overlap': 'red',
    'none': 'lightgrey'
}

# Sort the data to ensure 'none' (grey) is at the back, blue/orange in the middle, and red at the front
rna_adata.obs['sort_order'] = rna_adata.obs['Aneuploid_combined_clusters'].map({
    'none': 0,             # Grey: plotted first (back)
    'infercnv_only': 2,     # Blue: middle
    'copykat_only': 3,      # Orange: middle
    'overlap': 4            # Red: plotted last (front)
})

# Reorder the cells by draw order. `.copy()` makes the reordered object standalone, so the
# plotting call and later cells do not modify a view. Note that this permanently changes
# the row order of `rna_adata`.
rna_adata = rna_adata[rna_adata.obs.sort_values('sort_order').index].copy()

# Plot UMAP with custom colors and point size, ensuring grey is in the background
sc.pl.umap(rna_adata, color='Aneuploid_combined_clusters', title='UMAP of Combined Clusters', size=10, palette=colors)


In [None]:
sc.pl.umap(rna_adata_OLD, color=['Level2_res1'], legend_loc='on data',legend_fontsize=6)

In [None]:
sc.pl.umap(rna_adata_OLD, color=['Level1','Level2'])

In [None]:
rna_adata.obs['original_Level1']=rna_adata_OLD.obs['Level1']
sc.pl.umap(rna_adata, color=['Level1'])

In [None]:
# Level3 starts out as the Level2 variant that carries the extra 'Melanoma' label;
# the plot shows the three annotation levels.
rna_adata.obs['Level3']=rna_adata.obs['original_Level2_with_Melanoma']
sc.pl.umap(rna_adata, color=['Level1','Level2','Level3'])

In [None]:
import scanpy as sc
import matplotlib.pyplot as plt

# Cells without a Level3 label are labelled "Schwann Cell". The category is only added
# when it is missing, and the column is converted to categorical first, so this cell can
# be re-run without raising.
level3 = rna_adata.obs['Level3'].astype('category')
if 'Schwann Cell' not in level3.cat.categories:
    level3 = level3.cat.add_categories(['Schwann Cell'])
level3 = level3.fillna('Schwann Cell')
# Capitalise every occurrence of "cell" (case-insensitive match). `.str.replace` returns
# plain strings, so the result is converted back to a categorical column.
rna_adata.obs['Level3'] = level3.str.replace('cell', 'Cell', case=False).astype('category')

# Colour per Level3 label.
# NOTE(review): "KC Cornified" and "KC Granular" share the same colour, as do "KC Cancer"
# and "Melanoma" — confirm this is intended.
color_dict = {
    "DC": "#5F9EA0",
    "Endothelial Cell": "#FFA500",
    "Fibroblast": "#458B00",
    "KC Basal": "#FF6A6A",
    "KC Cornified": "#8B3A62",
    "KC Granular": "#8B3A62",
    "KC Differentiating": "#AB82FF",
    "KC Cancer": "#000000",
    "KC Hair": "#FF0000",
    "LC": "#0000CD",
    "Macrophage": "#EEEE00",
    "Melanocytes": "#8B4513",
    "Melanoma": "#000000",
    "NK": "#9ACD32",
    "T Cell": "#1874CD",
    "Treg": "#00B2EE",
    "pDC": "#8A2BE2",
    "Mast Cell": "#ab2952",
    "mRegDC":"#809693", 'CD8+ T Cell': "#3f6573","B Cell":"#ffff00","Pericytes":"#dba465","Sweat gland related":"#6f9c57","nan":"grey",
    'Schwann Cell':'#737475'
}

# UMAP coloured by Level3 with the custom palette. `show=False` keeps the figure open so
# that `tight_layout()` is applied to the UMAP itself before it is displayed.
level3_ax = sc.pl.umap(
    rna_adata,
    color='Level3',
    palette=color_dict,
    show=False,
)

# Optimize layout
plt.tight_layout()
plt.show()


In [None]:
# Merge / rename labels in Level1 and Level2.
# The renaming goes through plain objects and back to categorical: `Series.replace` on a
# categorical column is deprecated in pandas, and several renames here merge labels or
# introduce labels that are not categories yet.
level1_renames = {'Sweat gland related': 'Other'}
# Replace "KC Nail" and "KC stem/hair/stress" with "KC Hair", fold pDC into DC, fold the
# T cell subsets into "T Cell", and use singular names for mast and Schwann cells
level2_renames = {
    'KC Nail': 'KC Hair',
    'KC stem/hair/stress': 'KC Hair',
    'pDC': 'DC',
    'Treg' : 'T Cell',
    'T cell' : 'T Cell',
    'CD8+ Tcell': 'T Cell',
    'Mast Cells':'Mast Cell',
    'Schwann Cells':'Schwann Cell'
}

for level_name, renames in (('Level1', level1_renames), ('Level2', level2_renames)):
    rna_adata.obs[level_name] = (
        rna_adata.obs[level_name]
        .astype(object)
        .map(lambda label, renames=renames: renames.get(label, label))
        .astype('category')
    )

# UMAP coloured by the updated Level1 and Level2 labels
sc.pl.umap(
    rna_adata,
    color=['Level1','Level2'],
    show=True,
)

In [None]:
# Colour per Level2 label.
# NOTE(review): "KC Cornified" and "KC Granular" share the same colour — confirm intended.
level2_color_dict = {
    "DC": "#5F9EA0",
    "Endothelial cell": "#FFA500",
    "Fibroblast": "#458B00",
    "KC Basal": "#FF6A6A",
    "KC Cornified": "#8B3A62",
    "KC Granular": "#8B3A62",
    "KC Differentiating": "#AB82FF",
    "KC Cancer": "#000000",
    "KC Hair": "#FF0000",
    "LC": "#0000CD",
    "Macrophage": "#EEEE00",
    "Melanocytes": "#8B4513",
    "Melanoma": "#000000",
    "NK": "#9ACD32",
    "T Cell": "#1874CD",
    "Mast Cell": "#ab2952",
"B cell":"#ffff00","Pericytes":"#dba465","Sweat gland related":"#6f9c57",   'Schwann Cell':'#737475'
}

# UMAP coloured by Level2 with the custom palette. `show=False` keeps the figure open so
# that `tight_layout()` is applied to the UMAP itself before it is displayed.
level2_ax = sc.pl.umap(
    rna_adata,
    color='Level2',
    palette=level2_color_dict,
    show=False,
)

# Optimize layout
plt.tight_layout()
plt.show()


In [None]:
rna_adata.obs['Level1'].unique().tolist()

In [None]:
# Colour per Level1 label
level1_color_dict = {
    "Endothelial cell": "#FFA500",
    "Fibroblast": "#458B00",
    "KC": "#FF6A6A",
    "Immune": "#1874CD",
    "Melanocytes": "#8B4513",
    "Other":'#737475'
}

# UMAP coloured by Level1 with the custom palette. `show=False` keeps the figure open so
# that `tight_layout()` is applied to the UMAP itself before it is displayed.
level1_ax = sc.pl.umap(
    rna_adata,
    color='Level1',
    palette=level1_color_dict,
    show=False,
)

# Optimize layout
plt.tight_layout()
plt.show()


In [None]:
rna_adata

In [None]:
# (disabled) obs columns that could be dropped as well:
#columns_to_remove = [    'original_Level2', 'original_Level2_with_Melanoma','new_Level2','original_Level1']
#rna_adata.obs.drop(columns=columns_to_remove, inplace=True)

# Colour entries in `.uns` to delete
keys_to_remove = [
    'original_Level1_colors',
    'original_Level2_colors',
    'original_Level2_with_Melanoma_colors',
]

# Delete only the entries that actually exist
present_keys = [key for key in keys_to_remove if key in rna_adata.uns]
for stale_key in present_keys:
    del rna_adata.uns[stale_key]
rna_adata

In [None]:
rna_adata.write('skin_atlas/Melanoma_sc_reproduced_final_oct16.h5ad')


In [None]:
mel_only.obs['leiden'].unique().tolist()

In [None]:
rna_adata.obs['mel_leiden'] = mel_only.obs['leiden'].reindex(rna_adata.obs.index)
rna_adata

In [None]:
sc.pl.umap(rna_adata, color='mel_leiden', size=30)


In [None]:
# Count cells per (leiden_R, Level1, Level2, Level3) combination.
# `observed=True` limits the result to combinations that actually occur, instead of
# building every possible combination of categories and filtering the zeros afterwards.
tally = (
    rna_adata.obs
    .groupby(['leiden_R', 'Level1', 'Level2', 'Level3'], observed=True)
    .size()
    .reset_index(name='count')
)

# Keep only counts greater than 0 (guards the case where the keys are not categorical)
tally_positive = tally[tally['count'] > 0]

# Display the tally with counts greater than 0
print(tally_positive)


In [None]:
sc.pl.umap(rna_adata, color='leiden_R', size=5, legend_loc="on data")


In [None]:
sc.pl.umap(
    rna_adata,  # Your AnnData object
    color=['Level1','Level2','Level3'],  # Assuming 'Level3' is the categorical variable you want to color by
    show=True,  # Set to False if you don't want to display the plot immediately
)

In [None]:
rna_adata.write('skin_atlas/Melanoma_sc_reproduced_final_oct31.h5ad')
mel_only.write('skin_atlas/Melanoma_only_sc_reproduced_final_oct31.h5ad')
rna_adata.obs.to_csv('skin_atlas/Melanoma_sc_reproduced_final_oct31_metadata.txt',sep="\t")

In [None]:
import anndata
#rna_adata=anndata.read_h5ad("skin_atlas/Melanoma_sc_reproduced_final.h5ad")
rna_adata=anndata.read_h5ad("skin_atlas/Melanoma_sc_reproduced_final_oct31.h5ad")


In [None]:
# Set Level1 to 'Other' for every cell whose Level2 label is 'Pericytes'
rna_adata.obs.loc[rna_adata.obs['Level2'] == 'Pericytes', 'Level1'] = 'Other'


In [None]:
# Count cells per (leiden_R, Level2, Level3) combination.
# `observed=True` limits the result to combinations that actually occur, instead of
# building every possible combination of categories and filtering the zeros afterwards.
tally = (
    rna_adata.obs
    .groupby(['leiden_R', 'Level2', 'Level3'], observed=True)
    .size()
    .reset_index(name='count')
)

# Keep only counts greater than 0 (guards the case where the keys are not categorical)
tally_positive = tally[tally['count'] > 0]

# Display the tally with counts greater than 0
print(tally_positive)


In [None]:
# Count cells per (Level1, Level2, Level3) combination.
# This uses `rna_adata`, the object loaded and edited in the preceding cells; the name
# `adata` is only assigned in a later cell of this notebook.
# `observed=True` limits the result to combinations that actually occur.
tally = (
    rna_adata.obs
    .groupby(['Level1', 'Level2', 'Level3'], observed=True)
    .size()
    .reset_index(name='count')
)

# Keep only counts greater than 0 (guards the case where the keys are not categorical)
tally_positive = tally[tally['count'] > 0]

# Display the tally with counts greater than 0
print(tally_positive)


In [None]:
# Print the cells whose Level2 is 'KC Differentiating' while their Level3 is 'Mast cell'.
# Uses `rna_adata` (the object being annotated here); `adata` is only assigned later.
print(rna_adata.obs[(rna_adata.obs['Level2'] == 'KC Differentiating') & (rna_adata.obs['Level3'] == 'Mast cell')])

# Mark the specific cell. Check the barcode first: assigning through `.loc` with an
# unknown label would append a new row to `.obs` instead of failing.
target_barcode = 'MPS42_CGATGTTGTAGTAGGCACTACTCA-1'
if target_barcode not in rna_adata.obs_names:
    raise KeyError(f"barcode {target_barcode!r} not found in rna_adata.obs_names")
rna_adata.obs['highlight'] = 'Other'
rna_adata.obs.loc[target_barcode, 'highlight'] = 'Target Cell'

# Plot the UMAP, highlighting the specified cell
sc.pl.umap(rna_adata, color='highlight', palette=['lightgrey', 'red'], size=10, title='Highlighted Cell on UMAP')

In [None]:
# Print the cells whose Level2 is 'KC Differentiating' while their Level3 is 'Fibroblast'.
# Uses `rna_adata` (the object being annotated here); `adata` is only assigned later.
print(rna_adata.obs[(rna_adata.obs['Level2'] == 'KC Differentiating') & (rna_adata.obs['Level3'] == 'Fibroblast')])

# Mark the specific cell. Check the barcode first: assigning through `.loc` with an
# unknown label would append a new row to `.obs` instead of failing.
target_barcode = 'MPS42_TTGATCAGTCAGGAACACTACTCA-1'
if target_barcode not in rna_adata.obs_names:
    raise KeyError(f"barcode {target_barcode!r} not found in rna_adata.obs_names")
rna_adata.obs['highlight'] = 'Other'
rna_adata.obs.loc[target_barcode, 'highlight'] = 'Target Cell'

# Plot the UMAP, highlighting the specified cell
sc.pl.umap(rna_adata, color='highlight', palette=['lightgrey', 'red'], size=10, title='Highlighted Cell on UMAP')

In [None]:
# Print the cells whose Level2 is 'T cell' while their Level3 is 'Treg'.
# Uses `rna_adata` (the object being annotated here); `adata` is only assigned later.
print(rna_adata.obs[(rna_adata.obs['Level2'] == 'T cell') & (rna_adata.obs['Level3'] == 'Treg')])

# Mark the specific cell. Check the barcode first: assigning through `.loc` with an
# unknown label would append a new row to `.obs` instead of failing.
target_barcode = 'MPS13_AATTTGGAGGCAAGCCAGAGGCAA-1'
if target_barcode not in rna_adata.obs_names:
    raise KeyError(f"barcode {target_barcode!r} not found in rna_adata.obs_names")
rna_adata.obs['highlight'] = 'Other'
rna_adata.obs.loc[target_barcode, 'highlight'] = 'Target Cell'

# Plot the UMAP, highlighting the specified cell
sc.pl.umap(rna_adata, color='highlight', palette=['lightgrey', 'red'], size=10, title='Highlighted Cell on UMAP')

In [None]:
# Update Level3 for whole clusters (single-cell overrides kept for reference)
#rna_adata.obs.loc['MPS42_CGATGTTGTAGTAGGCACTACTCA-1', 'Level3'] = 'KC Differentiating'
#rna_adata.obs.loc['MPS42_TTGATCAGTCAGGAACACTACTCA-1', 'Level3'] = 'KC Differentiating'
level3_by_cluster = {"13": 'Treg', "4": 'T Cell', "24": 'NK'}

# A categorical column rejects values that are not among its categories, so register any
# missing label before assigning.
if rna_adata.obs['Level3'].dtype.name == 'category':
    missing_labels = [
        label for label in level3_by_cluster.values()
        if label not in rna_adata.obs['Level3'].cat.categories
    ]
    if missing_labels:
        rna_adata.obs['Level3'] = rna_adata.obs['Level3'].cat.add_categories(missing_labels)

for cluster_id, label in level3_by_cluster.items():
    rna_adata.obs.loc[rna_adata.obs['leiden_R'] == cluster_id, 'Level3'] = label


In [None]:
sc.pl.umap(
    rna_adata,  # Your AnnData object
    color=['Level1','Level2','Level3'],  # Assuming 'Level3' is the categorical variable you want to color by
    show=True,  # Set to False if you don't want to display the plot immediately
)

In [None]:
rna_adata.write('skin_atlas/Melanoma_sc_reproduced_final_Nov1.h5ad')
#mel_only.write('skin_atlas/Melanoma_only_sc_reproduced_final_Nov31.h5ad')
rna_adata.obs.to_csv('skin_atlas/Melanoma_sc_reproduced_final_Nov1_metadata.txt',sep="\t")

In [None]:
rna_adata.obs['Level3'].unique()

## Without harmony

In [None]:
import anndata
mel=anndata.read_h5ad('skin_atlas/Melanoma_sc_reproduced_final_Nov1.h5ad')

In [None]:
mel

In [None]:
mel.X


In [None]:
rna_adata.X

In [None]:
# Store the current matrix as `.raw`. `Raw.X` is a read-only property in anndata, so the
# matrix cannot be assigned to `rna_adata.raw.X` directly; assigning the AnnData object to
# `.raw` is the supported way to do it.
rna_adata.raw = rna_adata
rna_adata.raw.X

In [None]:
# Replace `mel.raw` with the matrix currently held in `rna_adata.X`.
# The matrix is attached row by row, so select the rows of `rna_adata` by `mel`'s barcodes
# first. This keeps the two objects aligned even if their row order differs, and raises if
# a barcode is missing.
# NOTE(review): `rna_adata.X` is normalised and scaled earlier in this notebook — confirm
# that it really holds the unprocessed counts at this point.
raw_source = rna_adata[mel.obs_names].copy()
mel.raw = sc.AnnData(X=raw_source.X, var=raw_source.var, obs=raw_source.obs)
mel.raw.X

In [None]:
# Report the shapes of mel.raw, mel and rna_adata (with a fallback text when mel.raw is unset)
raw_dims = "mel.raw is not set" if mel.raw is None else mel.raw.shape
dimension_report = (
    ("Dimensions of mel.raw.X:", raw_dims),
    ("Dimensions of mel.X:", mel.shape),
    ("Dimensions of rna_adata.X:", rna_adata.shape),
)
for label, dims in dimension_report:
    print(label, dims)


In [None]:
#mel.write('/QRISdata/Q2051/SCC_Paper/resources/data/frozen_objects_Nov2024_PP/Melanoma_sc_reproduced_final_Nov6.h5ad')
# Reload the frozen object from disk. `read_h5ad` is the reader used elsewhere in this
# notebook; `anndata.read` is a deprecated alias of it.
# Note: with the write above commented out, this replaces the in-memory `mel`, including
# the `.raw` assigned in the previous cells.
mel = anndata.read_h5ad('/QRISdata/Q2051/SCC_Paper/resources/data/frozen_objects_Nov2024_PP/Melanoma_sc_reproduced_final_Nov6.h5ad')

In [None]:
sc.pl.umap(
    mel,  # Your AnnData object
    color=['Level3'],  # Assuming 'Level3' is the categorical variable you want to color by
    show=True,  # Set to False if you don't want to display the plot immediately
)

In [None]:
mel

In [None]:
rna_adata.obs['Level3']=mel.obs['Level3']
rna_adata.uns['Level3_colors']=mel.uns['Level3_colors']

In [None]:
sc.pl.umap(
    rna_adata,  # Your AnnData object
    color=['Level3'],  # Assuming 'Level3' is the categorical variable you want to color by
    show=True,  # Set to False if you don't want to display the plot immediately
)

In [None]:
import pandas as pd

# Define the desired order of 'Level3' categories
level3_order = [
    'KC Basal',
    'KC Differentiating',
    'KC Cornified',
    'KC Granular',
    'KC Hair',
    'Mast Cell',
    'LC',
    'pDC',
    'mRegDC',
    'DC',
    'Macrophage',
    'NK',
    'T cell',
    'Treg',
    'CD8+ T Cell',
    'B Cell',
    'Melanocytes',
    'Melanoma',
    'Endothelial cell',
    'Fibroblast',
    'Pericytes',
    'Sweat gland related',
    'Schwann Cell',
]

# `pd.Categorical` turns every value that is not in `categories` into NaN without any
# warning. Labels present in the data but missing from `level3_order` (for example a
# different capitalisation) are therefore appended at the end and reported, instead of
# silently losing those cells.
present_labels = set(rna_adata.obs['Level3'].dropna().unique())
unlisted_labels = sorted(present_labels - set(level3_order))
if unlisted_labels:
    print(f"WARNING: Level3 labels missing from level3_order (appended at the end): {unlisted_labels}")

# Set 'Level3' as a categorical variable with the defined order
rna_adata.obs['Level3'] = pd.Categorical(
    rna_adata.obs['Level3'],
    categories=level3_order + unlisted_labels,
    ordered=True
)

# Marker genes in plotting order; `dict.fromkeys` drops repeated genes ('KRT2' is listed
# twice) while keeping the first position of each.
level3_marker_genes = list(dict.fromkeys([
    'KRT15','COL7A1','COL17A1','KRT2','KRT10','DMKN','SBSN','SPINK5',
    'LOR', 'KRT2', 'KLK11',
    'ACSL1','HDC','VWA5A','MS4A2',
    'SLC18A2','FCGBP','CD207','WDFY4','IRF8','SPI1','CD68','MS4A6A','IGKC','CORO1A','AKNA','ACAP1','CD3E','CD8A','CD79A','CD22','FOXP3','DCT','MLANA','TYRP1','PMEL',
    'LY6E','SERPINE2','CDH5','PECAM1','EGFL7','COL6A1','COL6A2','DCN','KRT79','KRT7','KRT19','SCGB2A2','S100B',
]))

# Now plot the dotplot with the custom order
sc.pl.dotplot(
    rna_adata,
    {"Category1": level3_marker_genes},
    standard_scale="var",
    color_map="Reds",
    groupby="Level3", figsize=(18, 6)
)


In [None]:
rna_adata.write('skin_atlas/Melanoma_sc_reproduced_final_Nov4_without_harmony.h5ad')


In [None]:

# Gene sets to score; each key is used as the name of the resulting score
gene_sets = dict(
    melano=["GDF15","PLAB","L1CAM","SEMA3B","HEY1","NES","NTRK3","KNSL5","CITED1","SPP1","CSTB","CDH3","PSEN2","PMEL","MLANA"],
    hub_genes=["FLG", "DSG1", "DSG3", "IVL", "EGFR"],  #https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7596746/
)
# Aliases: "GDF15"="PLAB", MLANA=MART1

# Compute one score per gene set, stored under the set's name
for score_name in gene_sets:
    sc.tl.score_genes(rna_adata, gene_list=gene_sets[score_name], score_name=score_name)

# UMAPs coloured by each score
sc.pl.umap(rna_adata, color=list(gene_sets), size=10, vmax=0.2)


In [None]:
import scanpy as sc
import pandas as pd
adata = anndata.read_h5ad("skin_atlas/Melanoma_sc_reproduced_final_Nov1.h5ad")

# Step 1: rank genes for every 'Level3' group with a t-test, keeping 100 genes per group
sc.tl.rank_genes_groups(adata, groupby='Level3', method='t-test', n_genes=100)

# Step 2: collect, per group, the first 100 ranked gene names
ranked_names = adata.uns['rank_genes_groups']['names']
result_dict = {group: ranked_names[group][:100] for group in ranked_names.dtype.names}

# One column per group
markers_df = pd.DataFrame({group: pd.Series(genes) for group, genes in result_dict.items()})

# Step 3: write the table as tab-separated text without the row index
markers_df.to_csv('skin_atlas/Mel_top_100_markers_by_cell_type.txt', index=False, sep="\t")


In [None]:
# Save the AnnData object to a file
rna_adata.write('skin_atlas/Melanoma_sc_reproduced_final_oct16.h5ad')


In [None]:
rna_adata.obs.to_csv('skin_atlas/Melanoma_sc_reproduced_final_oct16_metadata.txt',sep="\t")

In [None]:
import anndata
#rna_adata=anndata.read_h5ad("skin_atlas/Melanoma_sc_reproduced_final.h5ad")
rna_adata=anndata.read_h5ad("skin_atlas/Melanoma_sc_reproduced_final_oct16.h5ad")


In [None]:
rna_adata

In [None]:
sc.pl.umap(rna_adata, color='leiden', size=30)


In [None]:
# Re-cluster only Leiden cluster "6" at resolution 0.2; the plot below reads the refined
# labels from `leiden_R`.
# NOTE(review): cells earlier in this notebook already read `obs['leiden_R']`, so they
# depend on this cell having been run first — consider moving it above them.
sc.tl.leiden(rna_adata, resolution=0.2, restrict_to=("leiden", ["6"]))
#sc.tl.leiden(idata, resolution=0.2, restrict_to=("leiden_R", ["4"]))
#sc.tl.leiden(idata, resolution=0.2, restrict_to=("leiden_R", ["6"]))
sc.pl.umap(rna_adata, color="leiden_R", legend_loc="on data", legend_fontoutline=2)

In [None]:
# Marker-gene dot plot across the refined Leiden clusters ('leiden_R').
# Each dictionary key labels one bracketed group of genes on the plot.
#
# A Python dict literal silently keeps only the LAST value of a repeated key, so
# every panel name is listed exactly once here. Previously "prolif" and "IFN"
# were repeated, and the shorter later "prolif" list (["MKI67"]) overrode the
# first one, dropping UBE2C and NUSAP1 from the figure. The duplicated PMEL entry
# inside "Melanocyte" is also removed. "contamination" and "Contamination" are
# distinct keys and are both kept.
sc.pl.dotplot(
    rna_adata,
    {
        "General": ["ITGAM"],
        "Melanocyte": ["MITF", "PMEL", "TYR", "DCT", "MLANA", "S100A1"],
        "LC": ["CD207", "EPCAM", "ITGAX"],
        "Monocyte": ["CCR2", "CD14", "S100A8", "S100A9", "CX3CR1"],
        "DC1": ["XCR1", "CLEC9A"],
        "DC2": ["CLEC10A", "THBD"],
        "DC3": ["CSF3R", "CSF2RA"],
        "mRegDC": ["LAMP3", "CCR7"],
        "pDC": ["IL3RA", "KIT", "IRF8"],
        "Mac": [
            "C1QA",
            "TREM2",
            "SIGLEC1",
            "CD68",
            "LYVE1",
            "ARG1",
            "CD163",
        ],
        "prolif": ["UBE2C", "NUSAP1", "MKI67"],
        "IFN": ["ISG15", "IFI27", "STAT1"],
        "contamination": ["CD3D"],
        "sweat gland": ["KRT7", "KRT19", "AQP5", "SCGB2A2", "DCD", "SCNN1A"],
        "T cell": [
            "CD3D",
            "CD3E",
            "CD4",
            "CD8A",
            "CCR7",
            "SELL",
            "CD27",
        ],
        "Trm": ["CD69", "CRTAM", "S1PR1"],
        "Treg": ["FOXP3", "PDCD1", "CTLA4", "IL2RA"],
        "NK": ["NCR1", "GZMB", "GNLY", "XCL2"],
        "ILC": ["IL7R"],
        "Contamination": ["ITGAM", "S100A8"],
        "Endothelial": ["CLDN5", "PLVAP", "SPARCL1"],
        "Fibroblast": ["PDGFRA", "PDGFRB", "DCN"],
        "B cell": ["CD79A", "CD22"],
        "Immune cell": [
            "CD3D",
            "CD3E",
            "CD4",
            "CD8A",
            "CCR7",
            "SELL",
            "CD27",
            "ITGAM",
            "S100A8",
        ],
        "KC Basal": ["KRT15", "C1orf56"],
        "KC Differentiating": ["KRT10"],
        "KC Cornified": ["LOR", "IVL"],
        "KC Granular": ["KRT2"],
        "KC stem/hair/stress": ["KRT16", "KRT17", "KRT6A"],
        "KC nail/nail": ["KRT6B", "KRT79"],
        "Neuronal cells": ["S100A1", "S100B"],
    },
    standard_scale="var",  # rescale every gene to [0, 1] across groups
    color_map="Blues",
    groupby="leiden_R",
) #"leiden_res0_25"

In [None]:
rna_adata.obs['Level3'].unique().tolist()

In [None]:
# Dictionary mapping the original 'Level3' labels to their display names.
ident_mapping = {
    'KC_Cornified': 'KC Cornified',
    'Imm_DC':'DC',
    'KC_Differentiating': 'KC Differentiating',
    'KC_Basal':'KC Basal',
    'Imm_Fibroblast': 'Fibroblast',
    'KC_Granular':'KC Granular',
    'Imm_Tcell':'T cell',
    'KC_stem/hair/stress': 'KC Hair',
    'Imm_CD8+ T cell': 'CD8+ T cell',
    'Imm_Macrophage':'Macrophage',
    'Imm_Treg':'Treg',
    'Imm_Endothelial cell':'Endothelial cell' ,
    'Imm_NK': 'NK',
    'Imm_Pericytes':'Pericytes',
    'Imm_Bcells': 'B cell',
    'Imm_Mast Cells': 'Mast cell',
    'Imm_mRegDC':'mRegDC',
    'Imm_LC':'LC',
    'Imm_pDC':'pDC',
    'Sweat gland related':'Sweat gland related',
    'nan':'nan',
    'Melanocytes':'Melanocytes',
    'Melanoma':'Melanoma',
}

# Rename the labels in place. A plain `.map(ident_mapping)` turns every label that
# is not a key of the dictionary into NaN, which (a) silently drops unexpected
# labels and (b) wipes most of the column if this cell is run a second time,
# because the already-renamed values are no longer keys. Falling back to the
# current label keeps the cell idempotent and loses nothing.
level3_labels = rna_adata.obs['Level3'].astype(object)
rna_adata.obs['Level3'] = (
    level3_labels.map(ident_mapping)
    .fillna(level3_labels)
    .astype('category')
)

# Surface labels that are neither an old nor a new name so they can be reviewed.
unmapped_labels = (
    set(level3_labels.dropna().unique())
    - set(ident_mapping)
    - set(ident_mapping.values())
)
if unmapped_labels:
    print(f"Level3 labels left unchanged (not in ident_mapping): {sorted(map(str, unmapped_labels))}")


In [None]:
import pandas as pd

# Display order of the 'Level3' cell types on the dot plot (top to bottom).
level3_order = [
    # keratinocytes
    'KC Basal', 'KC Differentiating', 'KC Cornified', 'KC Granular', 'KC Hair',
    # immune compartment
    'Mast cell', 'LC', 'pDC', 'mRegDC', 'DC', 'Macrophage',
    'NK', 'T cell', 'Treg', 'CD8+ T cell', 'B cell',
    # pigment cells
    'Melanocytes', 'Melanoma',
    # stroma and appendages
    'Endothelial cell', 'Fibroblast', 'Pericytes', 'Sweat gland related',
    'nan',
]

# Recast 'Level3' as an ordered categorical using exactly that category order.
# Labels that are not listed in `level3_order` become missing values.
level3_dtype = pd.CategoricalDtype(categories=level3_order, ordered=True)
rna_adata.obs['Level3'] = rna_adata.obs['Level3'].astype(level3_dtype)

# Marker genes in plotting order (left to right).
ordered_marker_genes = [
    'KRT15', 'COL7A1', 'COL17A1', 'KRT2', 'KRT10', 'DMKN', 'SBSN', 'SPINK5',
    'LOR', 'KRT2', 'KLK11',
    'ACSL1', 'HDC', 'VWA5A', 'MS4A2',
    'SLC18A2', 'FCGBP', 'CD207', 'WDFY4', 'IRF8', 'SPI1', 'CD68', 'MS4A6A',
    'IGKC', 'CORO1A', 'AKNA', 'ACAP1', 'CD3E', 'CD8A', 'CD79A', 'CD22', 'FOXP3',
    'DCT', 'MLANA', 'TYRP1', 'PMEL',
    'LY6E', 'SERPINE2', 'CDH5', 'PECAM1', 'EGFL7', 'COL6A1', 'COL6A2', 'DCN',
    'KRT79', 'KRT7', 'KRT19', 'SCGB2A2',
]

# Dot plot grouped by the re-ordered 'Level3' column, genes scaled to [0, 1].
sc.pl.dotplot(
    rna_adata,
    {"Category1": ordered_marker_genes},
    groupby="Level3",
    standard_scale="var",
    color_map="Reds",
    figsize=(18, 6),
)


In [None]:
from matplotlib import rcParams

# Wilcoxon rank-sum marker test for every 'Level3' group, keeping 200 genes per group.
sc.tl.rank_genes_groups(rna_adata, 'Level3', method='wilcoxon', n_genes=200)
# Filtered copy of the ranking; it is read back below under 'rank_genes_groups_filtered'.
sc.tl.filter_rank_genes_groups(rna_adata)
# Small square panels with a grid for the per-group ranking plots.
rcParams['figure.figsize'] = 4,4
rcParams['axes.grid'] = True
sc.pl.rank_genes_groups(rna_adata, key='rank_genes_groups_filtered', ncols=3)



In [None]:
sc.pl.rank_genes_groups_dotplot(rna_adata, n_genes=4)


In [None]:
# Group names must match the renamed 'Level3' labels ('KC Cornified', not
# 'KC_Cornified'): the ranking above was computed after the rename, so the
# underscore names no longer exist among the ranked groups.
axs = sc.pl.rank_genes_groups_dotplot(rna_adata, n_genes=15, groups=['KC Cornified', 'KC Granular'])


In [None]:
# Pairwise Wilcoxon test: 'KC Cornified' against 'KC Granular' as the reference.
# The labels use the renamed 'Level3' values (spaces, not underscores); the old
# underscore names are no longer categories of the column after the rename cell.
sc.tl.rank_genes_groups(rna_adata, 'Level3', method='wilcoxon', groups=['KC Cornified'], reference='KC Granular')
sc.pl.rank_genes_groups(rna_adata, groups=['KC Cornified'], n_genes=20)

In [None]:
# Ranked gene names for the cornified keratinocytes; the field name is the renamed
# 'Level3' label ('KC Cornified'), matching the groups the ranking was computed on.
rna_adata.uns['rank_genes_groups']['names']['KC Cornified'].tolist()

In [None]:
# Reverse comparison: 'KC Granular' against 'KC Cornified' as the reference.
# Uses the renamed 'Level3' labels (spaces); the underscore names no longer exist
# in the column after the rename cell.
sc.tl.rank_genes_groups(rna_adata, 'Level3', method='wilcoxon', reference='KC Cornified', groups=['KC Granular'])
sc.pl.rank_genes_groups(rna_adata, groups=['KC Granular'], n_genes=20)
rna_adata.uns['rank_genes_groups']['names']['KC Granular'].tolist()

In [None]:
rna_adata

In [None]:
# NOTE(review): in top-to-bottom order `mel_only` is first assigned in the next cell,
# so this inspection presumably relies on earlier interactive state — confirm that it
# survives Restart & Run All, or move it below the cell that creates `mel_only`.
mel_only.obs['copykat_pred']

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#rna_adata.obs['copykat_pred']=rna_adata.obs['copykat_pred']
#rna_adata.obs['infercnv_pred']=rna_adata_OLD.obs['infercnv_pred']


# Subset to the cells labelled 'Melanocytes' in obs['Level2']; .copy() detaches the
# subset from `rna_adata` so the steps below modify an independent object.
mel_only = rna_adata[rna_adata.obs['Level2'] == 'Melanocytes'].copy() #original_Level2
# Lower dimension embedding - UMAP and clustering
# Neighbour graph on the first 75 PCs with 30 neighbours, then UMAP and Leiden
# clustering (resolution 1) computed on this subset only.
sc.pp.neighbors(mel_only, n_neighbors=30, n_pcs=75)
sc.tl.umap(mel_only)
sc.tl.leiden(mel_only, resolution=1)
#sc.pl.umap(mel_only, color=['leiden'] , legend_loc = 'best')


# Ensure `mel_only` is your AnnData object

def handle_categorical(column):
    """Replace missing values in ``column`` with the string ``'nan'``.

    Parameters
    ----------
    column : pandas.Series
        Column to clean; may or may not already have categorical dtype.

    Returns
    -------
    pandas.Series or pandas.Categorical
        For a categorical input, the same Series with ``'nan'`` registered as a
        category (if it was not one already) and used to fill the gaps. For any
        other dtype, a new ``pandas.Categorical`` built from the filled values.
    """
    # Non-categorical input: fill first, then build the categorical in one go.
    if column.dtype.name != 'category':
        return pd.Categorical(column.fillna('nan'))

    # Categorical input: the fill value has to be a known category before fillna.
    if 'nan' not in column.cat.categories:
        column = column.cat.add_categories(['nan'])
    return column.fillna('nan')

# Handle categorical columns: replace missing CNV calls with the literal 'nan'
# label in both prediction columns so they can be compared as plain labels below.
mel_only.obs['copykat_pred'] = handle_categorical(mel_only.obs['copykat_pred'])
mel_only.obs['infercnv_pred'] = handle_categorical(mel_only.obs['infercnv_pred'])

# Create a combined clustering column focusing on "Aneuploid" clusters and NA values
def categorize_clusters(row):
    """Summarise the agreement of the two CNV callers for one cell.

    Parameters
    ----------
    row : mapping
        Must provide ``'infercnv_pred'`` and ``'copykat_pred'``. A cell counts as
        flagged when ``infercnv_pred == 'Aneuploid'`` and/or
        ``copykat_pred == 'aneuploid'`` (note the different capitalisation).

    Returns
    -------
    str
        ``'overlap'`` if both callers flag the cell, ``'infercnv_only'`` or
        ``'copykat_only'`` if exactly one does, ``'none'`` otherwise.
    """
    flagged_by_infercnv = bool(row['infercnv_pred'] == 'Aneuploid')
    flagged_by_copykat = bool(row['copykat_pred'] == 'aneuploid')

    # Lookup keyed on (inferCNV flag, CopyKAT flag).
    label_for = {
        (True, True): 'overlap',
        (True, False): 'infercnv_only',
        (False, True): 'copykat_only',
        (False, False): 'none',
    }
    return label_for[(flagged_by_infercnv, flagged_by_copykat)]

# Label every melanocyte with the agreement between the two CNV callers.
mel_only.obs['combined_clusters'] = mel_only.obs.apply(categorize_clusters, axis=1)

# Colour assigned to each agreement class on the UMAP.
colors = {
    'infercnv_only': 'blue',
    'copykat_only': 'orange',
    'overlap': 'red',
    'none': 'lightgrey'
}

# Points are drawn in row order, so rank the classes such that 'none' (grey) is
# drawn first and 'overlap' (red) last and ends up on top.
mel_only.obs['sort_order'] = mel_only.obs['combined_clusters'].map({
    'none': 0,             # Grey: plotted first (back)
    'infercnv_only': 2,     # Blue: middle
    'copykat_only': 3,      # Orange: middle
    'overlap': 4            # Red: plotted last (front)
})

# Reorder the cells by that rank.
# - kind='stable' keeps the existing order of cells within a class, so the result
#   is deterministic from run to run.
# - .copy() materialises the reordered object. Indexing an AnnData returns a view
#   of the parent, and later cells write new obs columns into `mel_only`
#   (module scores, status labels), which should not happen on a view.
mel_only = mel_only[mel_only.obs.sort_values('sort_order', kind='stable').index].copy()

# Plot UMAP with custom colors and point size, ensuring grey is in the background
sc.pl.umap(mel_only, color='combined_clusters', title='UMAP of Combined Clusters', size=30, palette=colors)


In [None]:
sc.pl.umap(mel_only, color=['leiden','orig.ident','combined_clusters'] , legend_loc = 'best')

In [None]:
sc.pl.umap(mel_only, color=['leiden'] , legend_loc = 'on data')

In [None]:
print(rna_adata)
sc.pl.umap(rna_adata, color=['Level1'] )

In [None]:
# Marker-gene dot plot across the Leiden clusters of the melanocyte subset.
# Each dictionary key labels one bracketed group of genes on the plot.
#
# A Python dict literal silently keeps only the LAST value of a repeated key, so
# every panel name is listed exactly once here. Previously "prolif" and "IFN"
# were repeated, and the shorter later "prolif" list (["MKI67"]) overrode the
# first one, dropping UBE2C and NUSAP1 from the figure. The duplicated PMEL entry
# inside "Melanocyte" is also removed. "contamination" and "Contamination" are
# distinct keys and are both kept.
sc.pl.dotplot(
    mel_only,
    {
        "General": ["ITGAM"],
        "Melanocyte": ["MITF", "PMEL", "TYR", "DCT", "MLANA", "S100A1"],
        "LC": ["CD207", "EPCAM", "ITGAX"],
        "Monocyte": ["CCR2", "CD14", "S100A8", "S100A9", "CX3CR1"],
        "DC1": ["XCR1", "CLEC9A"],
        "DC2": ["CLEC10A", "THBD"],
        "DC3": ["CSF3R", "CSF2RA"],
        "mRegDC": ["LAMP3", "CCR7"],
        "pDC": ["IL3RA", "KIT", "IRF8"],
        "Mac": [
            "C1QA",
            "TREM2",
            "SIGLEC1",
            "CD68",
            "LYVE1",
            "ARG1",
            "CD163",
        ],
        "prolif": ["UBE2C", "NUSAP1", "MKI67"],
        "IFN": ["ISG15", "IFI27", "STAT1"],
        "contamination": ["CD3D"],
        "sweat gland": ["KRT7", "KRT19", "AQP5", "SCGB2A2", "DCD", "SCNN1A"],
        "T cell": [
            "CD3D",
            "CD3E",
            "CD4",
            "CD8A",
            "CCR7",
            "SELL",
            "CD27",
        ],
        "Trm": ["CD69", "CRTAM", "S1PR1"],
        "Treg": ["FOXP3", "PDCD1", "CTLA4", "IL2RA"],
        "NK": ["NCR1", "GZMB", "GNLY", "XCL2"],
        "ILC": ["IL7R"],
        "Contamination": ["ITGAM", "S100A8"],
        "Endothelial": ["CLDN5", "PLVAP", "SPARCL1"],
        "Fibroblast": ["PDGFRA", "PDGFRB", "DCN"],
        "B cell": ["CD79A", "CD22"],
        "Immune cell": [
            "CD3D",
            "CD3E",
            "CD4",
            "CD8A",
            "CCR7",
            "SELL",
            "CD27",
            "ITGAM",
            "S100A8",
        ],
        "KC Basal": ["KRT15", "C1orf56"],
        "KC Differentiating": ["KRT10"],
        "KC Cornified": ["LOR", "IVL"],
        "KC Granular": ["KRT2"],
        "KC stem/hair/stress": ["KRT16", "KRT17", "KRT6A"],
        "KC nail/nail": ["KRT6B", "KRT79"],
    },
    standard_scale="var",  # rescale every gene to [0, 1] across groups
    color_map="Blues",
    groupby="leiden",
) #"leiden_res0_25"

In [None]:
import scanpy as sc
#mel_only = rna_adata[rna_adata.obs['original_Level2'] == 'Melanocytes'].copy()

# Load your data (replace with your actual data loading)

# Define your gene sets
# Gene panels to score; the key doubles as the name of the resulting score.
gene_sets = {
    'melano': [
        "GDF15", "PLAB", "L1CAM", "SEMA3B", "HEY1", "NES", "NTRK3", "KNSL5",
        "CITED1", "SPP1", "CSTB", "CDH3", "PSEN2", "PMEL", "MLANA",
    ],  # "GDF15"="PLAB", MLANA=MART1
    # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7596746/
    'hub_genes': ["FLG", "DSG1", "DSG3", "IVL", "EGFR"],
}

# Score every panel on the melanocyte subset.
for score_name, panel in gene_sets.items():
    sc.tl.score_genes(mel_only, gene_list=panel, score_name=score_name)

# Show both scores on the UMAP with the colour scale capped at 0.2.
sc.pl.umap(
    mel_only,
    color=['melano', 'hub_genes'],
    size=30,
    vmax=0.2,
)


In [None]:
# Keep only the cells whose 'hub_genes' score exceeds 0.1
filtered_df = mel_only.obs[mel_only.obs['hub_genes'] > 0.1]

# Mean 'hub_genes' score of those cells within each Leiden cluster
print(filtered_df.groupby('leiden')['hub_genes'].mean())


# Median 'hub_genes' score of those cells within each Leiden cluster
print(filtered_df.groupby('leiden')['hub_genes'].median())

In [None]:
sc.pl.umap(mel_only, color=['leiden'] , legend_loc = 'on data')

In [None]:
# Cell barcodes whose 'leiden_R' label is exactly the string '6,1'
# (a single label, not "6 or 1"; presumably a sub-cluster of cluster 6 — confirm).
indices_61 = rna_adata.obs.index[rna_adata.obs['leiden_R'].isin(['6,1'])].tolist()

# Cell barcodes of the melanocyte subset whose 'leiden' label is '10'
# (the variable is named indices_0, but the filter is on cluster '10').
indices_0 = mel_only.obs.index[mel_only.obs['leiden'].isin(['10'])].tolist()

# Show the results
print("Indices for leiden_R == '6,1':", indices_61)
print("Indices for leiden == '10':", indices_0)


In [None]:
#print(rna_adata)
sc.pl.dotplot(rna_adata, {"QC":['n_genes_by_counts','total_counts','pct_counts_mt']},groupby='leiden_R',swap_axes=True,  # Swap x and y axes
    show=True)


In [None]:
from matplotlib import rcParams

# adata_KC_Basal = idata[idata.obs['cell_type_PP'] == "KC Basal"].copy()
# Wilcoxon marker test for every 'leiden_R' cluster, keeping 200 genes per cluster.
sc.tl.rank_genes_groups(rna_adata, 'leiden_R', method='wilcoxon', n_genes=200)
sc.tl.filter_rank_genes_groups(rna_adata)
rcParams['figure.figsize'] = 4,4
rcParams['axes.grid'] = True
# Compare the top 20 ranked genes of the two groups labelled '6,0' and '6,1'.
axs = sc.pl.rank_genes_groups_dotplot(rna_adata, n_genes=20, groups=['6,0', '6,1'])


In [None]:
import scanpy as sc
import numpy as np

# Per-cell QC summaries computed directly from the matrix currently stored in .X.
#
# `.A1` only exists on np.matrix (what a scipy sparse matrix returns from .sum()),
# so the previous `X.sum(axis=1).A1` raised AttributeError whenever .X was a dense
# ndarray. np.asarray(...).ravel() yields a flat 1-D array for dense arrays,
# sparse matrices and sparse arrays alike.
# NOTE(review): these sums are over whatever .X holds at this point; if .X has been
# normalised/scaled they are not raw UMI counts, and 'total_counts' overwrites the
# column written earlier by sc.pp.calculate_qc_metrics — confirm this is intended.

# Sum of .X per cell
rna_adata.obs['total_counts'] = np.asarray(rna_adata.X.sum(axis=1)).ravel()

# Number of genes with a value > 0 per cell
rna_adata.obs['n_genes'] = np.asarray((rna_adata.X > 0).sum(axis=1)).ravel()

# Share of the per-cell total coming from mitochondrial genes (names starting with 'MT-')
mt_genes = rna_adata.var.index.str.startswith('MT-')
mt_totals = np.asarray(rna_adata.X[:, mt_genes].sum(axis=1)).ravel()
rna_adata.obs['percent_mt'] = mt_totals / rna_adata.obs['total_counts'] * 100

sc.pl.umap(rna_adata, color=['n_genes'] , legend_loc = 'on data')

In [None]:
filtered_df['leiden'].value_counts()

In [None]:
# Keep only the cells whose 'combined_clusters' label is "overlap"
filtered_df2 = mel_only.obs[mel_only.obs['combined_clusters'] == "overlap"]

#print(filtered_df2.groupby('leiden')['combined_clusters'].mean())
#print(filtered_df2.groupby('leiden')['combined_clusters'].median())

# Number of "overlap" cells in each Leiden cluster
filtered_df2['leiden'].value_counts()

In [None]:
rna_adata.obs['Level3'].value_counts()

In [None]:
import anndata
rna_adata_OLD=anndata.read_h5ad("/QRISdata/Q1851/Prakrithi/Melanoma_scRNAseq/Mel_3samples_75pcs.h5ad")

In [None]:
rna_adata_OLD

In [None]:
#rna_adata.obs['original_Level2']=rna_adata_OLD.obs['Level2_res1']
rna_adata.obs['original_Level2']=rna_adata_OLD.obs['Level2_res1']


In [None]:
## Rebuild the analysis from the count table into a second object (rna_adata2)
rna_adata2 = sc.AnnData(rna_counts)
# First three metadata columns become the per-cell annotations
rna_adata2.obs = batch.iloc[:, [0, 1, 2]]
## Making rows and column names unique (Cell barcode information is found in .obs and gene (feature) information in .var)
rna_adata2.var_names_make_unique()
rna_adata2.obs_names_make_unique()

## QC metrics: genes per cell, total counts and percentage of mitochondrial counts
rna_adata2.var['mt'] = rna_adata2.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(rna_adata2, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

# Filter cells on upper bounds only: fewer than 7000 detected genes and fewer than
# 20000 total counts. Subsetting an AnnData returns a *view*; .copy() materialises
# it so the in-place preprocessing below does not operate on a view of the
# unfiltered object.
keep_cells = (rna_adata2.obs.n_genes_by_counts < 7000) & (rna_adata2.obs.total_counts < 20000)
rna_adata2 = rna_adata2[keep_cells, :].copy()

## Normalizing the counts (10,000 per cell) and log-transforming
sc.pp.normalize_total(rna_adata2, target_sum=1e4)
sc.pp.log1p(rna_adata2)
## Identifying the highly variable genes
sc.pp.highly_variable_genes(rna_adata2, min_mean=0.0125, max_mean=3, min_disp=0.25)
# This saves the original set of genes (log-normalised values) before subsetting
rna_adata2.raw = rna_adata2

# Keep only the highly variable genes; again copy so sc.pp.scale modifies real data
# rather than a view.
rna_adata2 = rna_adata2[:, rna_adata2.var.highly_variable].copy()
sc.pp.scale(rna_adata2, max_value=10)
# Lower dimension embedding - PCA
sc.tl.pca(rna_adata2, svd_solver='arpack', n_comps=200)

## Batch correction across samples ('orig.ident'); the corrected embedding replaces
## X_pca so that the neighbour graph below is built on the Harmony-adjusted PCs.
sc.external.pp.harmony_integrate(rna_adata2, 'orig.ident')
rna_adata2.obsm['X_pca'] = rna_adata2.obsm['X_pca_harmony']
sc.pp.neighbors(rna_adata2, n_neighbors=30, n_pcs=75)
sc.tl.leiden(rna_adata2, resolution=1)
sc.tl.umap(rna_adata2)

In [None]:
sc.pl.umap(rna_adata2, color='leiden')

### Import the old frozen AnnData object

In [None]:
import anndata
rna_adata_OLD=anndata.read_h5ad("/QRISdata/Q2051/SCC_Paper/resources/data/melanoma_sc.h5ad")


In [None]:



sc.pl.dotplot(
    rna_adata_OLD,
    {
        "B cell": [
            "MS4A1",
            "CD79A",
            "CD79B","CD22","CD40"]
    },
    standard_scale="var",
    color_map="Blues",
    groupby="leiden_res1",
)

In [None]:
sc.pl.umap(rna_adata_OLD, color= ["CD79A","MS4A1","CD22","KRT2",])

In [None]:
sc.pl.umap(rna_adata_OLD, color= ["Level2_res1",])

In [None]:
sc.pl.umap(rna_adata_OLD, color= ["orig.ident"])

In [None]:
rna_adata_OLD

# Melanoma annotations 

In [None]:
mel_only=anndata.read_h5ad('/QRISdata/Q1851/Prakrithi/skin_atlas/reanalysis_nov2024/Melanocytes_only_nov22.h5ad')

In [None]:
# Display the melanocyte-only object loaded in the previous cell. The bare name
# `mel` is not assigned anywhere in the surrounding cells and would raise NameError
# on a fresh kernel.
mel_only

In [None]:
# Samples present in the melanocyte subset. The original `.unique` lacked the call
# parentheses and therefore did nothing.
print(mel_only.obs['orig.ident'].unique().tolist())

# Add a 'status' column: cells from sample 'MPS13' are 'Malignant', all other
# samples are 'Benign'. Stored as a categorical because split_umap reads
# `.cat.categories` from the column it splits by.
mel_only.obs['status'] = pd.Categorical(
    np.where(mel_only.obs['orig.ident'] == 'MPS13', 'Malignant', 'Benign')
)
mel_only.obs['status'].unique()


In [None]:
melonly_raw=mel_only.raw

### with raw counts

In [None]:
# NOTE(review): this log-transforms `rna_adata`, whereas every other statement in this
# cell works on `mel_only`. Confirm that the transform is intended here and that
# rna_adata.X has not already been log-transformed earlier in the notebook.
sc.pp.log1p(rna_adata)
# Wilcoxon DE between the 'status' groups of the melanocyte subset, 200 genes per group.
sc.tl.rank_genes_groups(mel_only, 'status', method='wilcoxon', n_genes=200)
sc.tl.filter_rank_genes_groups(mel_only)
rcParams['figure.figsize'] = 4,4
rcParams['axes.grid'] = True
#sc.pl.rank_genes_groups(adata_KC_Basal, key='rank_genes_groups_filtered', ncols=3)

# Top 20 ranked genes per status group, then the first 100 ranked gene names of each.
axs = sc.pl.rank_genes_groups_dotplot(mel_only, n_genes=20, groups=['Malignant', 'Benign'])
print("Malignant: ",mel_only.uns['rank_genes_groups']['names']['Malignant'].tolist()[:100])
print("Benign: ",mel_only.uns['rank_genes_groups']['names']['Benign'].tolist()[:100])

In [None]:
## Helper: UMAP panels split by a categorical obs column
def split_umap(adata, split_by, ncol=2, nrow=None, **kwargs):
    """Draw one UMAP panel per category of ``adata.obs[split_by]``.

    Parameters
    ----------
    adata : AnnData
        Object with a computed UMAP embedding.
    split_by : str
        Name of the ``adata.obs`` column to split on. Non-categorical columns are
        converted to categorical for the purpose of the split (``adata`` itself is
        not modified).
    ncol : int, default 2
        Number of panel columns.
    nrow : int, optional
        Number of panel rows; by default just enough rows to fit every category.
    **kwargs
        Forwarded unchanged to ``sc.pl.umap`` (e.g. ``color``, ``size``).

    Raises
    ------
    ValueError
        If an explicit ``nrow`` leaves fewer panels than there are categories.
    """
    groups = adata.obs[split_by]
    if groups.dtype.name != 'category':
        # `.cat` only exists on categorical columns (e.g. not on plain strings).
        groups = groups.astype('category')
    categories = groups.cat.categories

    if nrow is None:
        nrow = int(np.ceil(len(categories) / ncol))
    nrow = max(nrow, 1)  # always create at least one row of axes

    # squeeze=False always returns a 2-D array of axes, so flatten() also works
    # for a 1x1 grid (where plt.subplots would otherwise return a bare Axes).
    fig, axs = plt.subplots(nrow, ncol, figsize=(5*ncol, 4*nrow), squeeze=False)
    axs = axs.flatten()
    if len(categories) > len(axs):
        raise ValueError(
            f"{len(categories)} categories do not fit in a {nrow}x{ncol} grid"
        )

    for ax, cat in zip(axs, categories):
        sc.pl.umap(adata[groups == cat], ax=ax, show=False, title=cat, **kwargs)

    # Hide the panels that did not receive a category instead of showing empty frames.
    for ax in axs[len(categories):]:
        ax.set_visible(False)
    plt.tight_layout()

# Gene sets to score per cell. NOTE: in this cell 'melano' is the first 100 gene names
# ranked for the 'Malignant' group, read from mel_only.uns['rank_genes_groups'] — it is
# not the literature marker list used under the same key earlier in the notebook.
gene_sets = {
    'melano': mel_only.uns['rank_genes_groups']['names']['Malignant'].tolist()[:100],
    'hub_genes':["FLG", "DSG1", "DSG3", "IVL", "EGFR"] #https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7596746/
} #"GDF15"="PLAB", MLANA=MART1

# Calculate and add module scores (one score per gene set, named after its key)
for name, genes in gene_sets.items():
    sc.tl.score_genes(mel_only, gene_list=genes, score_name=name)

# Inspect the results
#print(mel.obs.head())
#idata.obs['cancer'] = ['cancer' if x == 'KC Cancer' else 'Others' for x in scc.obs['Level3_Cancer']]
#idata.obs['cancer'] = idata.obs['cancer'].astype('category')

# Plot the module scores, one UMAP panel per 'status' category
split_umap(mel_only, color=['melano'], size=30,split_by='status')
split_umap(mel_only, color=['hub_genes'], size=30,split_by='status')


#split_umap(scc, color = ['InferCNV_and_CopyKAT_aneuploid'], split_by='cancer_status',legend_loc = "right margin")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distribution of the 'melano' module score across the melanocyte subset.
# (The variable keeps its historical name; it holds the 'melano' score.)
all_cancer_values = mel_only.obs['melano']

# Histogram of the score
plt.figure(figsize=(8, 5))
sns.histplot(all_cancer_values, kde=False, bins=30, color='blue', edgecolor='black')

# Titles and labels name the quantity that is actually plotted
plt.title("Histogram of 'melano' module score", fontsize=14)
plt.xlabel('melano mod score', fontsize=12)
plt.ylabel('Frequency', fontsize=12)

plt.tight_layout()
plt.show()

# Upper-tail percentiles of the score
percentile_80 = np.percentile(all_cancer_values, 80)
percentile_90 = np.percentile(all_cancer_values, 90)
percentile_95 = np.percentile(all_cancer_values, 95)
percentile_99 = np.percentile(all_cancer_values, 99)
percentile_995 = np.percentile(all_cancer_values, 99.5)

# Print each value with its own percentile label (the 80th percentile was
# previously printed under a "90th" label).
for label, value in (
    ("80th", percentile_80),
    ("90th", percentile_90),
    ("95th", percentile_95),
    ("99th", percentile_99),
    ("99.5th", percentile_995),
):
    print(f"{label} Percentile of 'melano' score: {value}")

In [None]:
# A cell is called 'Cancer' only when both CNV callers agree ('overlap') AND its
# 'melano' module score exceeds a cut-off; everything else is 'Normal'.
# NOTE(review): the cut-offs are hard-coded numbers, presumably copied from a
# percentile printout — re-check them whenever the 'melano' score is recomputed.
is_overlap = mel_only.obs['Aneuploid_combined_clusters'] == 'overlap'
melano_score = mel_only.obs['melano']

condition80p = is_overlap & (melano_score > 0.66)
condition90p = is_overlap & (melano_score > 1.28)
condition95p = is_overlap & (melano_score > 1.85)
condition99p = is_overlap & (melano_score > 5.13)
condition99_5p = is_overlap & (melano_score > 7.32)

# One label column per cut-off: 'Cancer' where the condition holds, else 'Normal'
conditions = {
    '80p': condition80p,
    '90p': condition90p,
    '95p': condition95p,
    '99p': condition99p,
    '99_5p': condition99_5p,
}
for suffix, cond in conditions.items():
    mel_only.obs[f'Melano_cancer_DE_{suffix}'] = np.where(cond, 'Cancer', 'Normal')

# Define a color map
color_map = {'Cancer': 'red', 'Normal': 'lightgrey'}

# Plot UMAP
sc.pl.umap(mel_only, color=['Melano_cancer_DE_80p','Melano_cancer_DE_90p','Melano_cancer_DE_95p','Melano_cancer_DE_99p'], palette=color_map, size=30)

# Number of cells called 'Cancer' at each cut-off
cancer_count80p = int(condition80p.sum())
cancer_count90p = int(condition90p.sum())
cancer_count95p = int(condition95p.sum())
cancer_count99p = int(condition99p.sum())
cancer_count99_5p = int(condition99_5p.sum())

# The first line was previously labelled "90th percentile" although it reports the 80p count.
print(f"Number of cells marked as 'Cancer' 80th percentile: {cancer_count80p}")
print(f"Number of cells marked as 'Cancer' 90th percentile: {cancer_count90p}")
print(f"Number of cells marked as 'Cancer' 95th percentile: {cancer_count95p}")
print(f"Number of cells marked as 'Cancer' 99th percentile: {cancer_count99p}")
print(f"Number of cells marked as 'Cancer' 99.5th percentile: {cancer_count99_5p}")



In [None]:
from matplotlib import rcParams

#mel_only = mel[mel.obs['Level2'] == 'Melanocytes']

# Drop the .raw slot before re-running the DE test.
# NOTE(review): presumably so that rank_genes_groups works on .X instead of .raw — confirm.
mel_only.raw=None
# Wilcoxon DE between the 'status' groups, 200 genes per group.
sc.tl.rank_genes_groups(mel_only, 'status', method='wilcoxon', n_genes=200)
sc.tl.filter_rank_genes_groups(mel_only)
rcParams['figure.figsize'] = 4,4
rcParams['axes.grid'] = True
#sc.pl.rank_genes_groups(adata_KC_Basal, key='rank_genes_groups_filtered', ncols=3)

# Top 20 ranked genes for each status group
axs = sc.pl.rank_genes_groups_dotplot(mel_only, n_genes=20, groups=['Malignant', 'Benign'])


In [None]:
print("Malignant: ",mel_only.uns['rank_genes_groups']['names']['Malignant'].tolist()[:100])
print("Benign: ",mel_only.uns['rank_genes_groups']['names']['Benign'].tolist()[:100])

In [None]:
## Helper: UMAP panels split by a categorical obs column
def split_umap(adata, split_by, ncol=2, nrow=None, **kwargs):
    """Draw one UMAP panel per category of ``adata.obs[split_by]``.

    Parameters
    ----------
    adata : AnnData
        Object with a computed UMAP embedding.
    split_by : str
        Name of the ``adata.obs`` column to split on. Non-categorical columns are
        converted to categorical for the purpose of the split (``adata`` itself is
        not modified).
    ncol : int, default 2
        Number of panel columns.
    nrow : int, optional
        Number of panel rows; by default just enough rows to fit every category.
    **kwargs
        Forwarded unchanged to ``sc.pl.umap`` (e.g. ``color``, ``size``).

    Raises
    ------
    ValueError
        If an explicit ``nrow`` leaves fewer panels than there are categories.
    """
    groups = adata.obs[split_by]
    if groups.dtype.name != 'category':
        # `.cat` only exists on categorical columns (e.g. not on plain strings).
        groups = groups.astype('category')
    categories = groups.cat.categories

    if nrow is None:
        nrow = int(np.ceil(len(categories) / ncol))
    nrow = max(nrow, 1)  # always create at least one row of axes

    # squeeze=False always returns a 2-D array of axes, so flatten() also works
    # for a 1x1 grid (where plt.subplots would otherwise return a bare Axes).
    fig, axs = plt.subplots(nrow, ncol, figsize=(5*ncol, 4*nrow), squeeze=False)
    axs = axs.flatten()
    if len(categories) > len(axs):
        raise ValueError(
            f"{len(categories)} categories do not fit in a {nrow}x{ncol} grid"
        )

    for ax, cat in zip(axs, categories):
        sc.pl.umap(adata[groups == cat], ax=ax, show=False, title=cat, **kwargs)

    # Hide the panels that did not receive a category instead of showing empty frames.
    for ax in axs[len(categories):]:
        ax.set_visible(False)
    plt.tight_layout()

gene_sets = {
    'melano': mel_only.uns['rank_genes_groups']['names']['Malignant'].tolist()[:100],
    'hub_genes':["FLG", "DSG1", "DSG3", "IVL", "EGFR"], #https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7596746/
    #'edgeR':['MMP16','COL9A3','TIMP3','CNIH3','KCNN4','TBC1D10A','PPIB','XBP1','COL6A3','SRPX','ECM1','ALDH1A3','NES','SH3BGRL3','CCN3','PDIA3','SHTN1','SSR4','AQP1','BST2','FGFR1','TIMP1','LAP3','P4HB','CTHRC1','COL4A2','LGMN','TRAM1','CKAP4','WIPF1','AFAP1L2','CALU','IFI6','FN1','CAVIN3','PDIA6','DUSP6','MYDGF','CD276','MT2A','PKM','BCAP31','FKBP10','CTSK','PLEC','MIA','COL1A2','QDPR','LOXL4','PRNP','COL6A1','COL9A1','SLC5A3','METRN','ACSS2','IFI44','STAT1','SNED1','FGFBP2','IMPAD1','LY6E','FXYD5','KRT2','CC2D1A','AP1S2','IFI44L','IFI27','AKAP6','OAS3','S100A16','RCN1','SLC9A3','EMILIN1','COL6A2','CYTL1','PCOLCE','SOD3','MCAM','RNF125','BMP8B','OAF','SH2B3','DBNDD1','ITM2B','LY96','CD22','ANGPT1','DUSP5','MS4A2','OLFML3','SCML4','ITGB3','MPZ','ISLR','LOXL3','CD68','FCER1G','ITGA10','ITIH5','COL9A2','BAALC','THBS1','PLAUR','AGT','PRRX1','TNC','SLC20A1','WARS','MGP','ANO4','ECE1','LIMK1','S100A4','IER3','A2M','CD109','MX1','OAS1','CAPG','TGFBI','FABP5','HIST1H1B','COL4A1','IGFBP4','ISG15','S100A6'],
    'edgeR':['CTHRC1','TIMP1','LOXL4','HIST1H1B','PPIB','PDIA6','FGFBP2','TIMP3','COL1A2','MYDGF','SOD3','LGMN','SCAMP2','EGR1','USP18','TGFBI','HAPLN3','A2M','IGFBP2','LIMK1','FGFR1','TMED2','MGP','MMP16','NRGN','ITGB3','BAALC','AGMO','LIF','ALDH1A1','FOXM1','PCOLCE','ISLR','LAYN','HJURP','MAT1A','ASPM','OLFML3','C2','FCER1G','FMOD','AQP1','KIF23','GGH','KDELR3','QDPR','SSR4','SH2B3','ITGA10','PLAUR','EMILIN1','S100A16','OAF','COL4A2','SELENOM','MEOX2','FABP5','NUSAP1','FREM2','AGT','MDK','EMILIN2','PLEC','CKAP4','WIPF1','COL6A1','COL6A2','TRAM1','CAVIN3','P4HB','COL11A1','ACSS2','AFAP1L2','PRG2','SLC5A3','RNF125','MICALL2','SHTN1','SND1','CYP7B1','AKAP6','COL4A1','CSF2RA','ADGRG6','XBP1','EGR3','SRGN','RELL1','FGL2','PTPRE','MLLT11','ECE1','ANGPTL7','SORL1','COL9A3','SLC20A1','COL22A1','COL9A2','LRP8','DUSP5','MPZ','TTR','RCN1','DBNDD1','BMP8B','LOXL3','RFX8','TIAM1','HSD3B7','STRA6','ORC6','RAB7B','CABP4','DNAH9','CD22','ITGA7','AXL','EBF3','DIO2','GALNT5','FGF1']
} #"GDF15"="PLAB", MLANA=MART1

# Score every gene set on the melanocyte subset; the score is stored under the
# gene set's key.
for score_name, gene_list in gene_sets.items():
    sc.tl.score_genes(mel_only, gene_list=gene_list, score_name=score_name)

# One split UMAP (a panel per 'status' category) for each module score.
for score_name in ('melano', 'hub_genes', 'edgeR'):
    split_umap(mel_only, color=[score_name], size=30, split_by='status')


#split_umap(scc, color = ['InferCNV_and_CopyKAT_aneuploid'], split_by='cancer_status',legend_loc = "right margin")


In [None]:
sc.pl.umap(mel_only, color=['Aneuploid_combined_clusters'], size=30)


In [None]:
sc.pl.umap(mel_only, color=['mel_leiden'], size=30,legend_loc="on data")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 'melano' module score of every cell in the melanocyte subset.
# (The variable keeps its historical name; it holds the 'melano' score.)
all_cancer_values = mel_only.obs['melano']

# Histogram of the score
plt.figure(figsize=(8, 5))
sns.histplot(all_cancer_values, kde=False, bins=30, color='blue', edgecolor='black')

# The title names the quantity actually plotted (it previously read 'all_Cancer',
# a column that is not used in this cell).
plt.title("Histogram of 'melano' module score", fontsize=14)
plt.xlabel('melano mod score', fontsize=12)
plt.ylabel('Frequency', fontsize=12)

# Show the plot
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
# 'melano' module score of every cell in the melanocyte subset.
# (The variable keeps its historical name; it holds the 'melano' score.)
all_cancer_values = mel_only.obs['melano']

# Upper-tail percentiles of the score
percentile_90 = np.percentile(all_cancer_values, 90)
percentile_95 = np.percentile(all_cancer_values, 95)
percentile_99 = np.percentile(all_cancer_values, 99)
percentile_995 = np.percentile(all_cancer_values, 99.5)

# Print the results, labelled with the score they describe (the labels previously
# referred to 'all_Cancer', which is not the column read above).
for label, value in (
    ("90th", percentile_90),
    ("95th", percentile_95),
    ("99th", percentile_99),
    ("99.5th", percentile_995),
):
    print(f"{label} Percentile of 'melano' score: {value}")

In [None]:
# A cell is called 'Cancer' when it is in the 'overlap' aneuploid class and its
# 'melano' score is above the cut-off; otherwise it is 'Normal'.
in_overlap = mel_only.obs['Aneuploid_combined_clusters'] == 'overlap'
score = mel_only.obs['melano']

condition90p = in_overlap & (score > 1.37)
condition95p = in_overlap & (score > 1.64)
condition99p = in_overlap & (score > 2.08)
condition99_5p = in_overlap & (score > 2.14)

# Write one 'Cancer'/'Normal' label column per cut-off.
label_columns = {
    'Melano_cancer_DE_90p': condition90p,
    'Melano_cancer_DE_95p': condition95p,
    'Melano_cancer_DE_99p': condition99p,
    'Melano_cancer_DE_99_5p': condition99_5p,
}
for column_name, mask in label_columns.items():
    mel_only.obs[column_name] = np.where(mask, 'Cancer', 'Normal')

# Colours for the two labels
color_map = {'Cancer': 'red', 'Normal': 'lightgrey'}

# UMAP, one panel per cut-off
sc.pl.umap(
    mel_only,
    color=list(label_columns),
    palette=color_map,
    size=30,
)

# Count the cells labelled 'Cancer' in each column.
cancer_count90p = int((mel_only.obs['Melano_cancer_DE_90p'] == 'Cancer').sum())
cancer_count95p = int((mel_only.obs['Melano_cancer_DE_95p'] == 'Cancer').sum())
cancer_count99p = int((mel_only.obs['Melano_cancer_DE_99p'] == 'Cancer').sum())
cancer_count99_5p = int((mel_only.obs['Melano_cancer_DE_99_5p'] == 'Cancer').sum())

print(f"Number of cells marked as 'Cancer' 90th percentile: {cancer_count90p}")
print(f"Number of cells marked as 'Cancer' 95th percentile: {cancer_count95p}")
print(f"Number of cells marked as 'Cancer' 99th percentile: {cancer_count99p}")
print(f"Number of cells marked as 'Cancer' 99.5th percentile: {cancer_count99_5p}")



In [None]:
#edgeR results

In [None]:
# 'edgeR' module score of every cell in the melanocyte subset.
# (The variable keeps its historical name; it holds the 'edgeR' score.)
all_cancer_values = mel_only.obs['edgeR']

# Histogram of the score
plt.figure(figsize=(8, 5))
sns.histplot(all_cancer_values, kde=False, bins=30, color='blue', edgecolor='black')

# The x-axis previously read 'melano mod score' although the edgeR score is plotted.
plt.title('Histogram of edgeR', fontsize=14)
plt.xlabel('edgeR mod score', fontsize=12)
plt.ylabel('Frequency', fontsize=12)

# Show the plot
plt.tight_layout()
plt.show()

# Upper-tail percentiles of the score
percentile_90 = np.percentile(all_cancer_values, 90)
percentile_95 = np.percentile(all_cancer_values, 95)
percentile_99 = np.percentile(all_cancer_values, 99)
percentile_995 = np.percentile(all_cancer_values, 99.5)

# Print the results, labelled with the score they describe (the labels previously
# referred to 'all_Cancer', which is not the column read above).
for label, value in (
    ("90th", percentile_90),
    ("95th", percentile_95),
    ("99th", percentile_99),
    ("99.5th", percentile_995),
):
    print(f"{label} Percentile of 'edgeR' score: {value}")

In [None]:
# A cell is called 'Cancer' when it is in the aneuploid 'overlap' group AND its
# edgeR module score exceeds a fixed cut-off; one call is made per cut-off.
is_overlap = mel_only.obs['Aneuploid_combined_clusters'] == 'overlap'
edger_score = mel_only.obs['edgeR']

condition90p = is_overlap & (edger_score > 1.02)
condition95p = is_overlap & (edger_score > 1.36)
condition99p = is_overlap & (edger_score > 1.76)
condition99_5p = is_overlap & (edger_score > 1.9)

# Store each call as a 'Cancer' / 'Normal' label column in obs.
mel_only.obs['edgeR_cancer_DE_90p'] = np.where(condition90p, 'Cancer', 'Normal')
mel_only.obs['edgeR_cancer_DE_95p'] = np.where(condition95p, 'Cancer', 'Normal')
mel_only.obs['edgeR_cancer_DE_99p'] = np.where(condition99p, 'Cancer', 'Normal')
mel_only.obs['edgeR_cancer_DE_99_5p'] = np.where(condition99_5p, 'Cancer', 'Normal')

# 'Cancer' cells in red, everything else in light grey.
color_map = {'Cancer': 'red', 'Normal': 'lightgrey'}

sc.pl.umap(
    mel_only,
    color=['edgeR_cancer_DE_90p', 'edgeR_cancer_DE_95p', 'edgeR_cancer_DE_99p', 'edgeR_cancer_DE_99_5p'],
    palette=color_map,
    size=30,
)

# Number of cells labelled 'Cancer' under each cut-off.
cancer_count90p = int((mel_only.obs['edgeR_cancer_DE_90p'] == 'Cancer').sum())
cancer_count95p = int((mel_only.obs['edgeR_cancer_DE_95p'] == 'Cancer').sum())
cancer_count99p = int((mel_only.obs['edgeR_cancer_DE_99p'] == 'Cancer').sum())
cancer_count99_5p = int((mel_only.obs['edgeR_cancer_DE_99_5p'] == 'Cancer').sum())

print(f"Number of cells marked as 'Cancer' 90th percentile: {cancer_count90p}")
print(f"Number of cells marked as 'Cancer' 95th percentile: {cancer_count95p}")
print(f"Number of cells marked as 'Cancer' 99th percentile: {cancer_count99p}")
print(f"Number of cells marked as 'Cancer' 99.5th percentile: {cancer_count99_5p}")



In [None]:
import pickle

# Serialise the in-memory `mel_only` object to disk with pickle.
# NOTE(review): a pickle is only safe to load from a trusted location and may
# not load across library versions; other cells in this notebook persist
# objects with `.write(...)` to .h5ad instead.
with open('/QRISdata/Q1851/Prakrithi/skin_atlas/reanalysis_nov2024/Melanocytes_only_nov22_with_modscore_noraw.pkl', 'wb') as f:
    pickle.dump(mel_only, f)


In [None]:
# edgeR res

In [None]:

## SCC genes from literature
def split_umap(adata, split_by, ncol=2, nrow=None, **kwargs):
    """Draw one UMAP panel per category of ``adata.obs[split_by]``.

    Parameters
    ----------
    adata
        AnnData object passed to ``sc.pl.umap``.
    split_by
        Name of the obs column to split on. The column is converted to
        categorical dtype in place.
    ncol
        Number of panel columns in the figure grid.
    nrow
        Number of panel rows; when None it is derived from the number of
        categories and ``ncol``.
    **kwargs
        Forwarded unchanged to ``sc.pl.umap`` (e.g. ``color``, ``size``, ``vmax``).
    """
    adata.obs[split_by] = adata.obs[split_by].astype('category')
    categories = adata.obs[split_by].cat.categories
    if nrow is None:
        nrow = int(np.ceil(len(categories) / ncol))
    # squeeze=False always returns a 2-D array of Axes, so a 1x1 grid
    # (single category with ncol=1) no longer breaks on .flatten().
    fig, axs = plt.subplots(nrow, ncol, figsize=(5*ncol, 4*nrow), squeeze=False)
    axs = axs.flatten()
    for i, cat in enumerate(categories):
        ax = axs[i]
        sc.pl.umap(adata[adata.obs[split_by] == cat], ax=ax, show=False, title=cat, **kwargs)
    # Hide grid cells that have no category, instead of showing empty frames.
    for unused_ax in axs[len(categories):]:
        unused_ax.set_visible(False)
    plt.tight_layout()

# Gene sets to score: key = name of the obs column that receives the module
# score, value = list of gene symbols.
gene_sets = {
  #  'melano': mel_only.uns['rank_genes_groups']['names']['Malignant'].tolist()[:100],
    'hub_genes':["FLG", "DSG1", "DSG3", "IVL", "EGFR"], #https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7596746/
    #'edgeR':['MMP16','COL9A3','TIMP3','CNIH3','KCNN4','TBC1D10A','PPIB','XBP1','COL6A3','SRPX','ECM1','ALDH1A3','NES','SH3BGRL3','CCN3','PDIA3','SHTN1','SSR4','AQP1','BST2','FGFR1','TIMP1','LAP3','P4HB','CTHRC1','COL4A2','LGMN','TRAM1','CKAP4','WIPF1','AFAP1L2','CALU','IFI6','FN1','CAVIN3','PDIA6','DUSP6','MYDGF','CD276','MT2A','PKM','BCAP31','FKBP10','CTSK','PLEC','MIA','COL1A2','QDPR','LOXL4','PRNP','COL6A1','COL9A1','SLC5A3','METRN','ACSS2','IFI44','STAT1','SNED1','FGFBP2','IMPAD1','LY6E','FXYD5','KRT2','CC2D1A','AP1S2','IFI44L','IFI27','AKAP6','OAS3','S100A16','RCN1','SLC9A3','EMILIN1','COL6A2','CYTL1','PCOLCE','SOD3','MCAM','RNF125','BMP8B','OAF','SH2B3','DBNDD1','ITM2B','LY96','CD22','ANGPT1','DUSP5','MS4A2','OLFML3','SCML4','ITGB3','MPZ','ISLR','LOXL3','CD68','FCER1G','ITGA10','ITIH5','COL9A2','BAALC','THBS1','PLAUR','AGT','PRRX1','TNC','SLC20A1','WARS','MGP','ANO4','ECE1','LIMK1','S100A4','IER3','A2M','CD109','MX1','OAS1','CAPG','TGFBI','FABP5','HIST1H1B','COL4A1','IGFBP4','ISG15','S100A6'],
    'edgeR':['SASS6','PARP14','CMPK2','DDX60L','IFI44L','IFI44','ACAP3','STAT1','FGF1','ART3','FYB1','RFX8','PARP9','LBH','NRN1','AGTRAP','FKBP7','SIPA1L2','TTL','RGS1','COLEC11','TGFBI','S100A4','EMILIN1','FN1','QDPR','FLG','WIPF1','LAP3','CTNNBIP1','SH3BGRL3','COL6A3','SDF4','PDIA6','RHOC','CTSK','FGFBP2','ANGPTL7','BMP8B','AGT','SOD3','OLFML3','FCER1G','COL9A2','FHL2','COL11A1','NID1','SERPINE2','IFI6','S100A6','NES','SPON2','CNIH3','PRRX1','ITGA1','SHC1','SLC9A3','MPZ','LOXL3','ITGA10','SLC20A1','PLPP3','SNED1','NIPAL3','CITED4','TMEM163','RELL1','S100A16','ADAMTS2','COL5A2','PDLIM4','TMEM158','ASPM','HJURP','CTSS','EGR1','TRIP13','ADCY3','DTL','MTHFD2','RAB7B','STIL','LRP8','HERC6','DDX60','LRP2','SLC7A11','S100A2','ISG15','CYTL1','UCHL1','ECM1','CAPG','GBA','FMOD','IGFBP2','GALNT2','KIF3C','BASP1','ERRFI1','GALNT5','ECE1','MLLT11','NENF','CYBRD1','AGA']
} #"GDF15"="PLAB", MLANA=MART1
# Compute one module score per gene set and store it in mel_only.obs[<name>]
for name, genes in gene_sets.items():
    sc.tl.score_genes(mel_only, gene_list=genes, score_name=name)

# Inspect the results
#print(mel.obs.head())
#idata.obs['cancer'] = ['cancer' if x == 'KC Cancer' else 'Others' for x in scc.obs['Level3_Cancer']]
#idata.obs['cancer'] = idata.obs['cancer'].astype('category')

# Plot each module score on the UMAP, one panel per category of obs['status']
#split_umap(mel_only, color=['melano'], size=30,split_by='status')
split_umap(mel_only, color=['hub_genes'], size=30,split_by='status')
split_umap(mel_only, color=['edgeR'], size=30,split_by='status')


#split_umap(scc, color = ['InferCNV_and_CopyKAT_aneuploid'], split_by='cancer_status',legend_loc = "right margin")


In [None]:
def split_umap(adata, split_by, ncol=2, nrow=None, vmax=None, **kwargs):
    """Draw one UMAP panel per category of ``adata.obs[split_by]``.

    Parameters
    ----------
    adata
        AnnData object passed to ``sc.pl.umap``.
    split_by
        Name of the obs column to split on. The column is converted to
        categorical dtype in place.
    ncol
        Number of panel columns in the figure grid.
    nrow
        Number of panel rows; when None it is derived from the number of
        categories and ``ncol``.
    vmax
        Passed to ``sc.pl.umap`` for every panel so that all panels use the
        same upper colour limit.
    **kwargs
        Forwarded unchanged to ``sc.pl.umap`` (e.g. ``color``, ``size``).
    """
    adata.obs[split_by] = adata.obs[split_by].astype('category')
    categories = adata.obs[split_by].cat.categories
    if nrow is None:
        nrow = int(np.ceil(len(categories) / ncol))

    # squeeze=False always returns a 2-D array of Axes, so a 1x1 grid
    # (single category with ncol=1) no longer breaks on .flatten().
    fig, axs = plt.subplots(nrow, ncol, figsize=(5*ncol, 4*nrow), squeeze=False)
    axs = axs.flatten()

    for i, cat in enumerate(categories):
        ax = axs[i]
        sc.pl.umap(adata[adata.obs[split_by] == cat], ax=ax, show=False, title=cat, vmax=vmax, **kwargs)

    # Hide grid cells that have no category, instead of showing empty frames.
    for unused_ax in axs[len(categories):]:
        unused_ax.set_visible(False)

    plt.tight_layout()

split_umap(mel_only, color=['edgeR'], size=30, split_by='status', vmax=0.5)


In [None]:
# 'mel_leiden' cluster labels on the UMAP, one panel per category of obs['status']
split_umap(mel_only, color=['mel_leiden'], size=30, split_by='status')


In [None]:

# Differential expression: 'mel_leiden' clusters 9 and 10 (labelled 'MPS13')
# versus all other cells of mel_only.
# NOTE(review): adata_mps13 is a view and is not used again in this cell.
adata_mps13 = mel_only[mel_only.obs['mel_leiden'].isin(['9', '10'])]

# Label cells as 'MPS13' (clusters 9/10) or 'Others'.
# NOTE(review): this overwrites the existing 'orig.ident' column, which the
# top of the notebook uses as the per-sample batch key - confirm the original
# values are not needed afterwards, or store the labels in a new column.
mel_only.obs['orig.ident'] = mel_only.obs['mel_leiden'].apply(lambda x: 'MPS13' if x in ['9', '10'] else 'Others')

# t-test DE between the two labels; results are stored in uns['rank_genes_mps13']
sc.tl.rank_genes_groups(mel_only, groupby='orig.ident', method='t-test', key_added='rank_genes_mps13')

# Top 10 ranked genes per group
sc.pl.rank_genes_groups(mel_only, groupby='orig.ident', n_genes=10, key='rank_genes_mps13')


In [None]:
# Dot plot of the top 10 genes per group from the DE stored under 'rank_genes_mps13'
sc.pl.rank_genes_groups_dotplot(mel_only, groupby='orig.ident', n_genes=10, key='rank_genes_mps13')


In [None]:
import scanpy as sc

# Label clusters 9 and 10 as 'Cluster_9_10' and every other cluster as 'Other_clusters'
mel_only.obs['group'] = mel_only.obs['mel_leiden'].apply(lambda x: 'Cluster_9_10' if x in ['9', '10'] else 'Other_clusters')

# Sanity check: expected to list 'Cluster_9_10' and 'Other_clusters'
print(mel_only.obs['group'].unique())  # This should show 'Cluster_9_10' and 'Other_clusters'

# t-test DE between the two labels; results are stored in uns['rank_genes_cluster']
sc.tl.rank_genes_groups(mel_only, groupby='group', groups=['Cluster_9_10', 'Other_clusters'], method='t-test', key_added='rank_genes_cluster')

# Top 10 ranked genes per group
sc.pl.rank_genes_groups(mel_only, groupby='group', n_genes=10, key='rank_genes_cluster')


In [None]:
# Dot plot of the top 30 genes per group from the DE stored under 'rank_genes_cluster'
sc.pl.rank_genes_groups_dotplot(mel_only, groupby='group', n_genes=30, key='rank_genes_cluster')


In [None]:
# First 100 ranked gene names for group '9' from the default 'rank_genes_groups' slot.
# NOTE(review): the two DE cells directly above write to other keys
# (key_added=...), so this reads whatever DE last wrote the default slot -
# confirm it is the per-'mel_leiden' result.
print("c9: ",mel_only.uns['rank_genes_groups']['names']['9'].tolist()[:100])

In [None]:
import scanpy as sc

# NOTE(review): this cell is not idempotent - each run applies log1p and
# normalize_total again to data that stays in the kernel.
# NOTE(review): log1p is applied to mel_only.raw.X, while normalize_total below
# acts on mel_only.X, so the two steps touch different matrices; the usual
# order is normalize first, then log1p. Confirm what .raw and .X hold here.
sc.pp.log1p(mel_only.raw.X)  # Log-transform raw counts

# Per-cell normalisation of mel_only.X to a total of 1e4
sc.pp.normalize_total(mel_only, target_sum=1e4)

# Wilcoxon DE per 'mel_leiden' cluster, keeping the top 200 genes per cluster
sc.tl.rank_genes_groups(mel_only, 'mel_leiden', method='wilcoxon', n_genes=200)

# Filter the ranked genes (scanpy default thresholds)
sc.tl.filter_rank_genes_groups(mel_only)

# Global matplotlib settings; these stay in effect for later cells
from matplotlib import rcParams
rcParams['figure.figsize'] = 4, 4
rcParams['axes.grid'] = True

# Dot plot of the top 10 genes for the selected clusters
axs = sc.pl.rank_genes_groups_dotplot(mel_only, n_genes=10, groups=['9', '10', '0', '1', '2', '3', '4'])


In [None]:
# Label 'mel_leiden' clusters 9 and 10 as 'Malignant' and all others as 'Benign'
mel_only.obs['exclusive_clust'] = np.where(mel_only.obs['mel_leiden'].isin(['9', '10']), 'Malignant', 'Benign')

#mel_only = mel[mel.obs['Level2'] == 'Melanocytes']

#mel_only.raw=None
# Wilcoxon DE between the two labels (top 200 genes each); this overwrites the
# default uns['rank_genes_groups'] slot.
sc.tl.rank_genes_groups(mel_only, 'exclusive_clust', method='wilcoxon', n_genes=200)
sc.tl.filter_rank_genes_groups(mel_only)
# rcParams is imported in an earlier cell (from matplotlib import rcParams)
rcParams['figure.figsize'] = 4,4
rcParams['axes.grid'] = True
#sc.pl.rank_genes_groups(adata_KC_Basal, key='rank_genes_groups_filtered', ncols=3)

# Dot plot of the top 20 genes per label
axs = sc.pl.rank_genes_groups_dotplot(mel_only, n_genes=20, groups=['Malignant', 'Benign'])


In [None]:
#mel_only = mel[mel.obs['Level2'] == 'Melanocytes']

#mel_only.raw=None
# Wilcoxon DE grouped by 'orig.ident' (top 200 genes per group); this
# overwrites the default uns['rank_genes_groups'] slot.
# NOTE(review): an earlier cell replaces 'orig.ident' with 'MPS13'/'Others'
# labels derived from 'mel_leiden' - confirm which version of the column is
# intended here.
sc.tl.rank_genes_groups(mel_only, 'orig.ident', method='wilcoxon', n_genes=200)
sc.tl.filter_rank_genes_groups(mel_only)
# rcParams is imported in an earlier cell (from matplotlib import rcParams)
rcParams['figure.figsize'] = 4,4
rcParams['axes.grid'] = True
#sc.pl.rank_genes_groups(adata_KC_Basal, key='rank_genes_groups_filtered', ncols=3)

# Dot plot of the top 20 genes for the 'MPS13' group
axs = sc.pl.rank_genes_groups_dotplot(mel_only, n_genes=20, groups=['MPS13'])


In [None]:
#### edgeR random pseudo

In [None]:
def split_umap(adata, split_by, ncol=2, nrow=None, vmax=None, **kwargs):
    """Draw one UMAP panel per category of ``adata.obs[split_by]``.

    Parameters
    ----------
    adata
        AnnData object passed to ``sc.pl.umap``.
    split_by
        Name of the obs column to split on. The column is converted to
        categorical dtype in place.
    ncol
        Number of panel columns in the figure grid.
    nrow
        Number of panel rows; when None it is derived from the number of
        categories and ``ncol``.
    vmax
        Passed to ``sc.pl.umap`` for every panel so that all panels use the
        same upper colour limit.
    **kwargs
        Forwarded unchanged to ``sc.pl.umap`` (e.g. ``color``, ``size``).
    """
    adata.obs[split_by] = adata.obs[split_by].astype('category')
    categories = adata.obs[split_by].cat.categories
    if nrow is None:
        nrow = int(np.ceil(len(categories) / ncol))

    # squeeze=False always returns a 2-D array of Axes, so a 1x1 grid
    # (single category with ncol=1) no longer breaks on .flatten().
    fig, axs = plt.subplots(nrow, ncol, figsize=(5*ncol, 4*nrow), squeeze=False)
    axs = axs.flatten()

    for i, cat in enumerate(categories):
        ax = axs[i]
        sc.pl.umap(adata[adata.obs[split_by] == cat], ax=ax, show=False, title=cat, vmax=vmax, **kwargs)

    # Hide grid cells that have no category, instead of showing empty frames.
    for unused_ax in axs[len(categories):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    
# Gene sets to score: key = name of the obs column that receives the module
# score, value = list of gene symbols.
gene_sets = {
    #'melano': mel_only.uns['rank_genes_groups']['names']['Malignant'].tolist()[:100],
    # First 100 ranked genes of group 'MPS13' from the default DE slot; requires
    # that the last DE written to uns['rank_genes_groups'] has an 'MPS13' group.
    'MPS13': mel_only.uns['rank_genes_groups']['names']['MPS13'].tolist()[:100],
    'hub_genes':["FLG", "DSG1", "DSG3", "IVL", "EGFR"], #https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7596746/
    #'edgeR':['MMP16','COL9A3','TIMP3','CNIH3','KCNN4','TBC1D10A','PPIB','XBP1','COL6A3','SRPX','ECM1','ALDH1A3','NES','SH3BGRL3','CCN3','PDIA3','SHTN1','SSR4','AQP1','BST2','FGFR1','TIMP1','LAP3','P4HB','CTHRC1','COL4A2','LGMN','TRAM1','CKAP4','WIPF1','AFAP1L2','CALU','IFI6','FN1','CAVIN3','PDIA6','DUSP6','MYDGF','CD276','MT2A','PKM','BCAP31','FKBP10','CTSK','PLEC','MIA','COL1A2','QDPR','LOXL4','PRNP','COL6A1','COL9A1','SLC5A3','METRN','ACSS2','IFI44','STAT1','SNED1','FGFBP2','IMPAD1','LY6E','FXYD5','KRT2','CC2D1A','AP1S2','IFI44L','IFI27','AKAP6','OAS3','S100A16','RCN1','SLC9A3','EMILIN1','COL6A2','CYTL1','PCOLCE','SOD3','MCAM','RNF125','BMP8B','OAF','SH2B3','DBNDD1','ITM2B','LY96','CD22','ANGPT1','DUSP5','MS4A2','OLFML3','SCML4','ITGB3','MPZ','ISLR','LOXL3','CD68','FCER1G','ITGA10','ITIH5','COL9A2','BAALC','THBS1','PLAUR','AGT','PRRX1','TNC','SLC20A1','WARS','MGP','ANO4','ECE1','LIMK1','S100A4','IER3','A2M','CD109','MX1','OAS1','CAPG','TGFBI','FABP5','HIST1H1B','COL4A1','IGFBP4','ISG15','S100A6'],
    'edgeR':['COL5A2','DTL','ART3','DDX60L','EGR1','UCHL1','ADAMTS2','RFX8','ANGPTL7','NRN1','RELL1','AGT','CMPK2','LAP3','NENF','COL6A3','FLG','PDIA6','SOD3','BMP8B','SASS6','OLFML3','FCER1G','LOXL3','ITGA10','MPZ','EMILIN1','SLC9A3','IFI6','SLC20A1','CAPG','S100A6','PRRX1','S100A4','SERPINE2','ECE1','FN1','CYTL1','CTSK','QDPR','ITGA1','ECM1','TGFBI','COL9A2','LRP8','FYB1','CTNNBIP1','IGFBP2','TTL','ASPM','MTHFD2','MLLT11','KIF3C','HJURP','SIPA1L2','TRIP13','GALNT2','SHC1','PARP9','SH3BGRL3','AGA','TMEM158','FMOD','FHL2','GALNT5','BASP1','HERC6','DDX60','COL11A1','CYBRD1','ADCY3','FGFBP2','SDF4','GBA','STAT1','WIPF1','SPON2','SNED1','IFI44L','IFI44','NID1','PARP14','RAB7B','FGF1','S100A16','LBH','ISG15','PDLIM4','ACAP3','AGTRAP','ERRFI1','STIL','CTSS','RGS1','COLEC11','LRP2','SLC7A11','PLPP3','S100A2','CITED4'],
    'seurat_FM':['MGP','ITIH5','S100A4','COL6A2','COL6A1','TIMP1','CTHRC1','LOXL3','LAMA4','ITGA10','COL1A2','PLEC','TIMP3','EMILIN1','MPZ','ISLR','SLC9A3','FCER1G','TNC','LGMN','ALDH1A3','SOD3','IER3','OLFML3','AQP1','OAF','FGFBP2','PCOLCE','ITGB3','LOXL4','SLC20A1','TGFBI','DUSP6','CC2D1A','PRNP','ACSS2','MYDGF','A2M','PPIB','AGT','FN1','AKAP6','CD109','PDIA3','MMP16','FGL2','PLAUR','IGFBP2','WFDC1','COL9A2','BMP8B','SCML4','RNF125','RCN1','AGMO','ANGPTL7','FREM2','FUCA2','MEOX2','SHTN1','LY6K','CST7','ANO4','PLPP3','SLC5A3','ANGPT1','FMOD','COL15A1','DUSP5','STRA6','TRAF1','COL11A2','LIF','THBS1','ECRG4','LRP1B','SP140','CSPG5','ADGRL3','EYA4','PLP2','FHL2','SERPING1','MMP15','CTSG','TTR','KDELR3','FKBP14','NOLC1','GASK1B','HERC5','RITA1','TC2N','NPW','BRCA1','SORL1','EME1','YIPF2','IL2RG','ITGA7','CD52','MATN3','CFB','CITED4','CSF2RA','SERTAD1','DLGAP1','ANKH','CPED1','EGFL7','CABP4','TRPM3','LUM','OSBP2','FECH','TMEM143']
} #"GDF15"="PLAB", MLANA=MART1
# Compute one module score per gene set and store it in mel_only.obs[<name>]
for name, genes in gene_sets.items():
    sc.tl.score_genes(mel_only, gene_list=genes, score_name=name)

# Inspect the results
#print(mel.obs.head())
#idata.obs['cancer'] = ['cancer' if x == 'KC Cancer' else 'Others' for x in scc.obs['Level3_Cancer']]
#idata.obs['cancer'] = idata.obs['cancer'].astype('category')

# Plot each module score on the UMAP, one panel per category of obs['status'].
# NOTE(review): 'melano' is not scored in this cell (its entry above is
# commented out), so that plot relies on an obs['melano'] column computed earlier.
#split_umap(mel_only, color=['melano'], size=30,split_by='status')
split_umap(mel_only, color=['seurat_FM'], size=30,split_by='status',vmax=1)
split_umap(mel_only, color=['edgeR'], size=30,split_by='status',vmax=5)
split_umap(mel_only, color=['melano'], size=30,split_by='status',vmax=5)
split_umap(mel_only, color=['MPS13'], size=30,split_by='status',vmax=5)


#split_umap(scc, color = ['InferCNV_and_CopyKAT_aneuploid'], split_by='cancer_status',legend_loc = "right margin")


In [None]:

# Step 1: restrict rna_adata to the cells present in mel_only.
# .copy() turns the subset into a standalone AnnData; without it the obs
# assignments below would be writes onto a view of rna_adata.
rna_data_subset = rna_adata[rna_adata.obs.index.isin(mel_only.obs.index), :].copy()

# Step 2: carry the annotations made on mel_only over to the subset, matched
# by cell barcode (obs index).
for col in ('Level2', 'status', 'mel_leiden'):
    rna_data_subset.obs[col] = mel_only.obs.loc[rna_data_subset.obs.index, col]

# Now rna_data_subset will have the 'Level2' column from mel_only for matching cells


In [None]:
#mel_only = mel[mel.obs['Level2'] == 'Melanocytes']

#rna_data_subset.raw=None
# Wilcoxon DE between the categories of obs['status'] on the subset taken from
# rna_adata (top 200 genes per group)
sc.tl.rank_genes_groups(rna_data_subset, 'status', method='wilcoxon', n_genes=200)
sc.tl.filter_rank_genes_groups(rna_data_subset)
# rcParams is imported in an earlier cell (from matplotlib import rcParams)
rcParams['figure.figsize'] = 4,4
rcParams['axes.grid'] = True
#sc.pl.rank_genes_groups(adata_KC_Basal, key='rank_genes_groups_filtered', ncols=3)

# Dot plot of the top 20 genes per status
axs = sc.pl.rank_genes_groups_dotplot(rna_data_subset, n_genes=20, groups=['Malignant','Benign'])


In [None]:

# Module score built from the first 100 ranked 'Malignant' genes of the DE run
# on rna_data_subset; the score itself is computed on mel_only.
gene_sets = {
    'melano_raw': rna_data_subset.uns['rank_genes_groups']['names']['Malignant'].tolist()[:100],
} #"GDF15"="PLAB", MLANA=MART1
# Compute the module score and store it in mel_only.obs['melano_raw']
for name, genes in gene_sets.items():
    sc.tl.score_genes(mel_only, gene_list=genes, score_name=name)

# Inspect the results
#print(mel.obs.head())
#idata.obs['cancer'] = ['cancer' if x == 'KC Cancer' else 'Others' for x in scc.obs['Level3_Cancer']]
#idata.obs['cancer'] = idata.obs['cancer'].astype('category')


# UMAP of the score, one panel per status, colour scale capped at 2
split_umap(mel_only, color=['melano_raw'], size=30,split_by='status',vmax=2)


#split_umap(scc, color = ['InferCNV_and_CopyKAT_aneuploid'], split_by='cancer_status',legend_loc = "right margin")


In [None]:
# Wilcoxon DE per 'mel_leiden' cluster (overwrites the default
# uns['rank_genes_groups'] slot), a dendrogram of the clusters, then a dot plot
# of the top 5 genes per cluster with expression scaled per gene.
sc.tl.rank_genes_groups(mel_only, groupby="mel_leiden", method="wilcoxon")
sc.tl.dendrogram(mel_only, groupby="mel_leiden")
sc.pl.rank_genes_groups_dotplot(
    mel_only, groupby="mel_leiden", standard_scale="var", n_genes=5
)

In [None]:
# Same per-cluster Wilcoxon DE and dot plot on `mel_noraw`.
# NOTE(review): `mel_noraw` is not defined in the visible cells - presumably
# the melanocyte object without .raw; confirm where it is created.
sc.tl.rank_genes_groups(mel_noraw, groupby="mel_leiden", method="wilcoxon")
sc.pl.rank_genes_groups_dotplot(
    mel_noraw, groupby="mel_leiden", standard_scale="var", n_genes=5
)

In [None]:
    
# Module score built from the first 100 ranked genes of cluster '9' in the DE
# stored on mel_noraw.
gene_sets = {
    'melano_clus9': mel_noraw.uns['rank_genes_groups']['names']['9'].tolist()[:100],
} #"GDF15"="PLAB", MLANA=MART1
# Compute the module score and store it in mel_noraw.obs['melano_clus9']
for name, genes in gene_sets.items():
    sc.tl.score_genes(mel_noraw, gene_list=genes, score_name=name)

# Inspect the results
#print(mel.obs.head())
#idata.obs['cancer'] = ['cancer' if x == 'KC Cancer' else 'Others' for x in scc.obs['Level3_Cancer']]
#idata.obs['cancer'] = idata.obs['cancer'].astype('category')

# UMAP of the score, one panel per status, colour scale capped at 2
#split_umap(mel_only, color=['melano'], size=30,split_by='status')
split_umap(mel_noraw, color=['melano_clus9'], size=30,split_by='status', vmax=2)


#split_umap(scc, color = ['InferCNV_and_CopyKAT_aneuploid'], split_by='cancer_status',legend_loc = "right margin")


# Final Clean object

In [None]:
# Fixed colour for each 'Level3' label.
color_dict = {
    "DC": "#5F9EA0",
    "Endothelial cell": "#FFA500",
    "Fibroblast": "#458B00",
    "KC Basal": "#FF6A6A",
    "KC Cornified": "#8B3A62",
    "KC Granular": "#bf6290",
    "KC Differentiating": "#AB82FF",
    "KC Cancer": "#000000",
    "KC Hair": "#FF0000",
    "LC": "#0000CD",
    "Macrophage": "#EEEE00",
    "Melanocytes": "#8B4513",
    "Melanoma": "#000000",
    "NK": "#9ACD32",
    "T cell": "#1874CD",
    "Treg": "#00B2EE",
    "pDC": "#8A2BE2",
    "Mast cell": "#ab2952",
    "mRegDC":"#809693", 'CD8+ T cell': "#7a4900","B cell":"#ffaa92","Pericytes":"#dba465","Sweat gland related":"#6f9c57","nan":"grey"
}

# UMAP coloured by 'Level3' with the fixed palette. show=False keeps the figure
# open so that tight_layout() below acts on this figure; with show=True the
# figure was already rendered and tight_layout()/show() operated on a new,
# empty figure.
sc.pl.umap(
    rna_adata,
    color='Level3',
    palette=color_dict,
    show=False,
)

plt.tight_layout()
plt.show()


In [None]:
sc.pl.umap(
    rna_adata,  # AnnData object to plot
    color='Level3',  # colour cells by the 'Level3' annotation
    legend_loc='on data',  # draw the category labels on the embedding itself
)

In [None]:
# Persist the annotated object as .h5ad.
# NOTE(review): the path is relative, so the file lands under the kernel's
# current working directory.
rna_adata.write('skin_atlas/Melanoma_sc_reproduced_final_oct23.h5ad')


In [None]:
sc.pl.umap(
    rna_adata,  # AnnData object to plot
    color=["Level1","Level2",'Level3'],  # one panel per annotation level
    legend_loc='on data',  # draw the category labels on the embedding itself
)

In [None]:
rna_adata

In [None]:
# Load the frozen final object. anndata.read is a deprecated alias;
# read_h5ad is the supported reader for .h5ad files.
rna_adata = anndata.read_h5ad('/QRISdata/Q2051/SCC_Paper/resources/data/frozen_objects_Nov2024_PP/Melanoma_final_object_Nov10.h5ad')


In [None]:
rna_adata

In [None]:
sc.pl.umap(
    rna_adata,  # AnnData object to plot
    color='Level1',  # colour cells by the 'Level1' annotation
    show=True,  # render the figure immediately
)


In [None]:
# Load the Nov 6 version of the object. anndata.read is a deprecated alias;
# read_h5ad is the supported reader for .h5ad files.
mel = anndata.read_h5ad("/QRISdata/Q2051/SCC_Paper/resources/data/frozen_objects_Nov2024_PP/old_versions/Melanoma_sc_reproduced_final_Nov6.h5ad")
mel

In [None]:
# Distinct 'Level2' labels. .tolist() is available whether unique() returns a
# Categorical or a NumPy array, and matches the Level3 cell below.
mel.obs['Level2'].unique().tolist()

In [None]:
# Harmonise label casing ('... cell' -> '... Cell') so the labels match the
# palette keys below. regex=False makes the literal replacement explicit.
mel.obs['Level2'] = mel.obs['Level2'].str.replace('cell', 'Cell', regex=False)


# Fixed colour for each 'Level2' label.
level2_color_dict = {
    "DC": "#5F9EA0",
    "Endothelial Cell": "#FFA500",
    "Fibroblast": "#458B00",
    "KC Basal": "#FF6A6A",
    "KC Cornified": "#8B3A62",
    "KC Granular": "#7d4a63",
    "KC Differentiating": "#AB82FF",
 #   "KC Cancer": "#000000",
    "KC Hair": "#FF0000",
    "LC": "#0000CD",
    "Macrophage": "#EEEE00",
    "Melanocytes": "#8B4513",
    "Melanoma": "#000000",
    "NK": "#9ACD32",
    "T Cell": "#1874CD",
    "Mast Cell": "#ab2952",
"B Cell":"#ffaa92","Pericytes":"#dba465","Sweat gland related":"#6f9c57",   'Schwann Cell':'#737475'
}

# UMAP coloured by 'Level2' with the fixed palette. show=False keeps the figure
# open so that tight_layout() below acts on this figure; with show=True the
# figure was already rendered and tight_layout()/show() operated on a new,
# empty figure.
sc.pl.umap(
    mel,
    color='Level2',
    palette=level2_color_dict,
    show=False,
)

plt.tight_layout()
plt.show()


In [None]:
mel.obs['Level3'].unique().tolist()

In [None]:

# Fixed colour for each 'Level3' label (capitalised 'Cell' spelling).
color_dict = {
    "DC": "#5F9EA0",
    "Endothelial Cell": "#FFA500",
    "Fibroblast": "#458B00",
    "KC Basal": "#FF6A6A",
    "KC Cornified": "#8B3A62",
    "KC Granular": "#7d4a63",
    "KC Differentiating": "#AB82FF",
    "KC Hair": "#FF0000",
    "LC": "#0000CD",
    "Macrophage": "#EEEE00",
    "Melanocytes": "#8B4513",
    "Melanoma": "#000000",
    "NK": "#9ACD32",
    "T Cell": "#1874CD",
    "Treg": "#00B2EE",
    "pDC": "#8A2BE2",
    "Mast Cell": "#ab2952",
    "mRegDC":"#809693",
    'CD8+ T Cell': "#3f6573",
    "B Cell":"#ffaa92",
    "Pericytes":"#dba465",
    "Sweat gland related":"#6f9c57",
    "nan":"grey",
    'Schwann Cell':'#737475'
}

# UMAP of `mel` coloured by 'Level3' using the palette above
sc.pl.umap(
    mel,  # AnnData object to plot
    color='Level3',  # colour cells by the 'Level3' annotation
    palette=color_dict,  # label -> colour mapping defined above
    show=True,  # render the figure immediately
)


In [None]:
sc.pl.umap(
    mel,  # AnnData object to plot
    color=['Level1','Level2','Level3'], # one panel per annotation level
    show=True,  # render the figure immediately
)


In [None]:
# Write `mel` to the frozen-objects directory; this is the same file path that
# other cells of this notebook load back as `rna_adata`.
mel.write('/QRISdata/Q2051/SCC_Paper/resources/data/frozen_objects_Nov2024_PP/Melanoma_final_object_Nov10.h5ad')


In [None]:
sc.pl.umap(
    mel,  # AnnData object to plot
    color=['leiden'], # colour cells by the 'leiden' cluster labels
    show=True, legend_loc="on data"  # render immediately, labels drawn on the embedding
)


In [None]:
# Re-cluster only the cells of 'leiden' cluster 7 at resolution 0.2; the plot
# below reads the refined labels from obs['leiden_R'].
sc.tl.leiden(mel, resolution=0.2, restrict_to=("leiden", ["7"]))

#sc.tl.leiden(idata, resolution=0.2, restrict_to=("leiden_R", ["4"]))
#sc.tl.leiden(idata, resolution=0.2, restrict_to=("leiden_R", ["6"]))
sc.pl.umap(mel, color="leiden_R", legend_loc="on data", legend_fontoutline=2)

In [None]:
# Standalone copy of the cells labelled 'Macrophage' at Level2, plotted by the
# refined 'leiden_R' labels
imm_only = mel[mel.obs['Level2'] == 'Macrophage'].copy()
sc.pl.umap(imm_only, color="leiden_R", legend_loc="on data", legend_fontoutline=2)

In [None]:
# Marker panel for the immune subclusters: display group name -> gene symbols.
immune_markers = {
    "General": ["ITGAM"],
    "Melanocyte": ["MITF","PMEL","TYR","DCT","MLANA","PMEL","S100A1"],
    "LC": ["CD207", "EPCAM", "ITGAX"],
    "Monocyte": ["CCR2", "CD14", "S100A8", "S100A9", "CX3CR1"],
    "DC1": ["XCR1", "CLEC9A"],
    "DC2": ["CLEC10A", "THBD"],
    "DC3": ["CSF3R", "CSF2RA"],
    "mRegDC": ["LAMP3", "CCR7"],
    "pDC": ["IL3RA", "KIT", "IRF8"],
    "Mac": ["C1QA", "TREM2", "SIGLEC1", "CD68", "LYVE1", "ARG1", "CD163"],
    "prolif": ["UBE2C","NUSAP1","MKI67"],
    "IFN": ["ISG15", "IFI27", "STAT1"],
}

# Dot plot of the markers per 'leiden_R' cluster, expression scaled per gene.
sc.pl.dotplot(
    imm_only,
    immune_markers,
    groupby="leiden_R",  # alternative grouping used earlier: "leiden_res0_25"
    standard_scale="var",
    color_map="Blues",
)

In [None]:
# Re-cluster only the cells of 'leiden' cluster 7 within imm_only at
# resolution 0.2; the plot below reads the refined labels from obs['leiden_R'].
sc.tl.leiden(imm_only, resolution=0.2, restrict_to=("leiden", ["7"]))

#sc.tl.leiden(idata, resolution=0.2, restrict_to=("leiden_R", ["4"]))
#sc.tl.leiden(idata, resolution=0.2, restrict_to=("leiden_R", ["6"]))
sc.pl.umap(imm_only, color="leiden_R", legend_loc="on data", legend_fontoutline=2)

In [None]:
# Immune subclusters, plotted after removing .raw from imm_only.
imm_only.raw = None

# Marker panel: display group name -> gene symbols.
immune_markers = {
    "General": ["ITGAM"],
    "Melanocyte": ["MITF","PMEL","TYR","DCT","MLANA","PMEL","S100A1"],
    "LC": ["CD207", "EPCAM", "ITGAX"],
    "Monocyte": ["CCR2", "CD14", "S100A8", "S100A9", "CX3CR1"],
    "DC1": ["XCR1", "CLEC9A"],
    "DC2": ["CLEC10A", "THBD"],
    "DC3": ["CSF3R", "CSF2RA"],
    "mRegDC": ["LAMP3", "CCR7"],
    "pDC": ["IL3RA", "KIT", "IRF8"],
    "Mac": ["C1QA", "TREM2", "SIGLEC1", "CD68", "LYVE1", "ARG1", "CD163"],
    "prolif": ["UBE2C","NUSAP1","MKI67"],
    "IFN": ["ISG15", "IFI27", "STAT1"],
}

# Dot plot of the markers per 'leiden_R' cluster, expression scaled per gene.
sc.pl.dotplot(
    imm_only,
    immune_markers,
    groupby="leiden_R",  # alternative grouping used earlier: "leiden_res0_25"
    standard_scale="var",
    color_map="Blues",
)

In [None]:
mel.X

# Paper figs

In [None]:
# Load the frozen final object for the paper figures. anndata.read is a
# deprecated alias; read_h5ad is the supported reader for .h5ad files.
rna_adata = anndata.read_h5ad('/QRISdata/Q2051/SCC_Paper/resources/data/frozen_objects_Nov2024_PP/Melanoma_final_object_Nov10.h5ad')


In [None]:
# Save the Level3 UMAP as an 8x8 inch, 300 dpi PDF.
level3_umap_rc = {"figure.figsize": (8, 8), "figure.dpi": (300)}
level3_umap_pdf = "/scratch/project/stseq/Prakrithi/skin_atlas/reanalysis_figs/mel/Level3.pdf"
with plt.rc_context(level3_umap_rc):
    # show=False keeps the figure open so it can be written to disk
    sc.pl.umap(rna_adata, color="Level3", legend_fontoutline=2, show=False)
    plt.savefig(level3_umap_pdf, bbox_inches="tight")

In [None]:
# Save the Level2 UMAP as an 8x8 inch, 300 dpi PDF.
level2_umap_rc = {"figure.figsize": (8, 8), "figure.dpi": (300)}
level2_umap_pdf = "/scratch/project/stseq/Prakrithi/skin_atlas/reanalysis_figs/mel/Level2.pdf"
with plt.rc_context(level2_umap_rc):
    # show=False keeps the figure open so it can be written to disk
    sc.pl.umap(rna_adata, color="Level2", legend_fontoutline=2, show=False)
    plt.savefig(level2_umap_pdf, bbox_inches="tight")

In [None]:
rna_adata.obs['Level3'].unique().tolist()

In [None]:
# Marker dot plot per 'Level3' label, with the labels shown in a fixed order.
rna_adata.raw = None

# Display order of the 'Level3' categories
level3_order = [
    'KC Basal',
    'KC Differentiating',
    'KC Cornified',
    'KC Granular',
    'KC Hair',
    'Mast Cell',
    'LC',
    'pDC',
    'mRegDC',
    'DC',
    'Macrophage',
    'NK',
    'T Cell',
    'Treg',
    'CD8+ T Cell',
    'B Cell',
    'Melanocytes',
    'Melanoma',
    'Endothelial Cell',
    'Fibroblast',
    'Pericytes',
    'Sweat gland related',
]

# pd.Categorical turns every value that is not in `categories` into NaN, so any
# label missing from level3_order would silently vanish from the plot (and from
# rna_adata.obs['Level3']). Report such labels instead of dropping them quietly.
level3_labels = rna_adata.obs['Level3'].dropna().astype(str).unique()
unlisted_level3 = sorted(set(level3_labels) - set(level3_order))
if unlisted_level3:
    print(f"Warning: Level3 labels missing from level3_order become NaN and are left out of the dot plot: {unlisted_level3}")

# Impose the display order on 'Level3'
rna_adata.obs['Level3'] = pd.Categorical(
    rna_adata.obs['Level3'],
    categories=level3_order,
    ordered=True
)

with plt.rc_context({"figure.figsize": (8, 8), "figure.dpi": (300)}):
    # Dot plot in the custom order; show=False keeps the figure open for savefig
    sc.pl.dotplot(
        rna_adata,
        { "Category1": ['KRT15','COL7A1','COL17A1','KRT2','KRT10','DMKN','SBSN','SPINK5',
                        'LOR', 'KRT2', 'KLK11',
                        'ACSL1','HDC','VWA5A','MS4A2',
          'SLC18A2','FCGBP','CD207','WDFY4','IRF8','SPI1','CD68','MS4A6A','IGKC','CORO1A','AKNA','ACAP1','CD3E','CD8A','CD79A','CD22','FOXP3','DCT','MLANA','TYRP1','PMEL',
          'LY6E','SERPINE2','CDH5','PECAM1','EGFL7','COL6A1','COL6A2','DCN','KRT79','KRT7','KRT19','SCGB2A2']},
        standard_scale="var",
        color_map="Reds",
        groupby="Level3", figsize=(18, 6), show=False
    )
    plt.savefig("/scratch/project/stseq/Prakrithi/skin_atlas/reanalysis_figs/mel/Level3_dotplot.pdf", bbox_inches="tight")
#

In [None]:
rna_adata.obs['Level2'].unique().tolist()

In [None]:
# Marker dot plot per 'Level2' label, with the labels shown in a fixed order.
rna_adata.raw = None

# Display order of the 'Level2' categories. Kept in its own variable so the
# Level3 order list is not overwritten.
level2_order = [
    'KC Basal',
    'KC Differentiating',
    'KC Cornified',
    'KC Granular',
    'KC Hair',
    'Mast Cell',
    'LC',
    'DC',
    'Macrophage',
    'NK',
    'T Cell',
    'B Cell',
    'Melanocytes',
    'Endothelial Cell',
    'Fibroblast',
    'Pericytes',
    'Sweat gland related',
]

# pd.Categorical turns every value that is not in `categories` into NaN, so any
# label missing from level2_order would silently vanish from the plot (and from
# rna_adata.obs['Level2']). Report such labels instead of dropping them quietly.
level2_labels = rna_adata.obs['Level2'].dropna().astype(str).unique()
unlisted_level2 = sorted(set(level2_labels) - set(level2_order))
if unlisted_level2:
    print(f"Warning: Level2 labels missing from level2_order become NaN and are left out of the dot plot: {unlisted_level2}")

# Impose the display order on 'Level2'
rna_adata.obs['Level2'] = pd.Categorical(
    rna_adata.obs['Level2'],
    categories=level2_order,
    ordered=True
)

with plt.rc_context({"figure.figsize": (8, 8), "figure.dpi": (300)}):
    # Dot plot in the custom order; show=False keeps the figure open for savefig
    sc.pl.dotplot(
        rna_adata,
        { "Category1": ['KRT15','COL7A1','COL17A1','KRT2','KRT10','DMKN','SBSN','SPINK5',
                        'LOR', 'KRT2', 'KLK11',
                        'ACSL1','HDC','VWA5A','MS4A2',
          'SLC18A2','FCGBP','CD207','WDFY4','IRF8','SPI1','CD68','MS4A6A','IGKC','CORO1A','AKNA','ACAP1','CD3E','CD8A','CD79A','CD22','FOXP3','DCT','MLANA','TYRP1','PMEL',
          'LY6E','SERPINE2','CDH5','PECAM1','EGFL7','COL6A1','COL6A2','DCN','KRT79','KRT7','KRT19','SCGB2A2']},
        standard_scale="var",
        color_map="Reds",
        groupby="Level2", figsize=(18, 6), show=False
    )
    plt.savefig("/scratch/project/stseq/Prakrithi/skin_atlas/reanalysis_figs/mel/Level2_dotplot.pdf", bbox_inches="tight")
#

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#rna_adata.obs['copykat_pred']=rna_adata.obs['copykat_pred']
#rna_adata.obs['infercnv_pred']=rna_adata_OLD.obs['infercnv_pred']


# Standalone copy of the cells labelled 'Melanocytes' at Level2
mel_only = rna_adata[rna_adata.obs['Level2'] == 'Melanocytes'].copy() #original_Level2
# Neighbour graph (30 neighbours, 75 PCs), UMAP embedding and Leiden clustering
# (resolution 1) computed on the melanocyte subset only
sc.pp.neighbors(mel_only, n_neighbors=30, n_pcs=75)
sc.tl.umap(mel_only)
sc.tl.leiden(mel_only, resolution=1)
#sc.pl.umap(mel_only, color=['leiden'] , legend_loc = 'best')


# Ensure `mel_only` is your AnnData object

def handle_categorical(column):
    """Return ``column`` as categorical data with missing values labelled 'nan'.

    A column that is already categorical gets a 'nan' category added (if it
    does not have one) and its missing values filled with it; the result is a
    Series. Any other column has its missing values filled with 'nan' and is
    returned as a ``pd.Categorical``.
    """
    if column.dtype.name != 'category':
        # Non-categorical input: fill first, then convert
        return pd.Categorical(column.fillna('nan'))

    # Categorical input: 'nan' must exist as a category before it can be filled in
    if 'nan' not in column.cat.categories:
        column = column.cat.add_categories(['nan'])
    return column.fillna('nan')

# Make both prediction columns categorical, with missing values labelled 'nan'
mel_only.obs['copykat_pred'] = handle_categorical(mel_only.obs['copykat_pred'])
mel_only.obs['infercnv_pred'] = handle_categorical(mel_only.obs['infercnv_pred'])

# Create a combined clustering column focusing on "Aneuploid" clusters and NA values
def categorize_clusters(row):
    """Classify one cell by which of the two predictions mark it aneuploid.

    ``row`` must provide 'infercnv_pred' and 'copykat_pred'. Returns 'overlap'
    when infercnv_pred is 'Aneuploid' and copykat_pred is 'aneuploid',
    'infercnv_only' or 'copykat_only' when only one of them matches, and
    'none' otherwise. The comparison is case-sensitive.
    """
    infercnv_hit = row['infercnv_pred'] == 'Aneuploid'
    copykat_hit = row['copykat_pred'] == 'aneuploid'

    if infercnv_hit and copykat_hit:
        return 'overlap'
    if infercnv_hit:
        return 'infercnv_only'
    if copykat_hit:
        return 'copykat_only'
    return 'none'

# Per-cell label: which of the two predictions mark the cell as aneuploid
mel_only.obs['combined_clusters'] = mel_only.obs.apply(categorize_clusters, axis=1)

# Colour for each combined label
colors = {
    'infercnv_only': 'blue',
    'copykat_only': 'orange',
    'overlap': 'red',
    'none': 'lightgrey'
}

# Sort key for the cell order: 'none' first and 'overlap' last, so that the
# red cells end up on top of the grey ones.
mel_only.obs['sort_order'] = mel_only.obs['combined_clusters'].map({
    'none': 0,             # Grey: plotted first (back)
    'infercnv_only': 2,     # Blue: middle
    'copykat_only': 3,      # Orange: middle
    'overlap': 4            # Red: plotted last (front)
})

# Reorder the cells by the sort key. .copy() turns the reordered selection into
# a standalone AnnData; without it mel_only would remain a view, and the later
# cells that add obs columns to it would be writing onto a view.
mel_only = mel_only[mel_only.obs.sort_values('sort_order').index].copy()

# UMAP with the custom colours and point size
sc.pl.umap(mel_only, color='combined_clusters', title='UMAP of Combined Clusters', size=30, palette=colors,show=True)
#sc.pl.umap(mel_only, color=['leiden','orig.ident','combined_clusters'] , legend_loc = 'best')

In [None]:
# Re-draw the combined-cluster UMAP at 5x4 in / 300 dpi and save it as a PDF.
with plt.rc_context(rc={"figure.dpi": 300, "figure.figsize": (5, 4)}):
    sc.pl.umap(
        mel_only,
        color='combined_clusters',
        palette=colors,
        size=30,
        title='UMAP of Combined Clusters',
        show=False,  # keep the figure open so plt.savefig can write it
    )
    plt.savefig(
        "/scratch/project/stseq/Prakrithi/skin_atlas/reanalysis_figs/mel/Aneuploid.pdf",
        bbox_inches="tight",
    )

In [None]:
# UMAP coloured by Leiden cluster, with cluster labels drawn on the data; saved as a PDF.
with plt.rc_context(rc={"figure.dpi": 300, "figure.figsize": (5, 4)}):
    sc.pl.umap(
        mel_only,
        color='leiden',
        legend_loc="on data",
        size=30,
        title='mel subclusters',
        show=False,  # keep the figure open so plt.savefig can write it
    )
    plt.savefig(
        "/scratch/project/stseq/Prakrithi/skin_atlas/reanalysis_figs/mel/mel_subclusters.pdf",
        bbox_inches="tight",
    )

In [None]:
# Gene sets to score; each key becomes the name of the score column.
# Alias symbols are listed next to the primary one: "GDF15"="PLAB", MLANA=MART1.
gene_sets = dict(
    melano=[
        "GDF15", "PLAB", "L1CAM", "SEMA3B", "HEY1", "NES", "NTRK3", "KNSL5",
        "CITED1", "SPP1", "CSTB", "CDH3", "PSEN2", "PMEL", "MLANA",
    ],
    # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7596746/
    hub_genes=["FLG", "DSG1", "DSG3", "IVL", "EGFR"],
)

# One module score per gene set, stored under the gene set's name.
for name, genes in gene_sets.items():
    sc.tl.score_genes(mel_only, score_name=name, gene_list=genes)

# Only the 'hub_genes' score is plotted here; the colour scale is capped at 0.2.
with plt.rc_context(rc={"figure.dpi": 300, "figure.figsize": (5, 4)}):
    sc.pl.umap(
        mel_only,
        color=['hub_genes'],
        vmax=0.2,
        size=30,
        show=False,  # keep the figure open so plt.savefig can write it
    )
    plt.savefig(
        "/scratch/project/stseq/Prakrithi/skin_atlas/reanalysis_figs/mel/mod_score_literature.pdf",
        bbox_inches="tight",
    )

In [None]:
with plt.rc_context({"figure.figsize": (5, 4), "figure.dpi": (300)}):
    # Colour the melanocyte-only UMAP by 'orig.ident' and save it as a PDF.
    # The previous vmax=0.2 was left over from the module-score plot: a
    # colour-scale cap has nothing to act on when colouring by sample label
    # (assumes 'orig.ident' holds categorical sample IDs, as the output file
    # name suggests - confirm).
    sc.pl.umap(mel_only, color=['orig.ident'], size=30, show=False)
    plt.savefig("/QRISdata/Q2051/SCC_Paper/resources/data/reanalysis_figs/mel/sample_IDs_melanocytes_only.pdf", bbox_inches="tight")



In [None]:
with plt.rc_context({"figure.figsize": (5, 4), "figure.dpi": (300)}):
    # Colour the UMAP of the full dataset (rna_adata) by 'orig.ident' and save
    # it as a PDF. The previous vmax=0.2 was left over from the module-score
    # plot: a colour-scale cap has nothing to act on when colouring by sample
    # label (assumes 'orig.ident' holds categorical sample IDs, as the output
    # file name suggests - confirm).
    sc.pl.umap(rna_adata, color=['orig.ident'], size=10, show=False)
    plt.savefig("/QRISdata/Q2051/SCC_Paper/resources/data/reanalysis_figs/mel/sample_IDs_whole.pdf", bbox_inches="tight")



In [None]:
# Persist the melanocyte-only object (with the labels and scores added above) as .h5ad.
mel_only.write_h5ad(
    "/QRISdata/Q1851/Prakrithi/skin_atlas/reanalysis_nov2024/Melanocytes_only_nov22.h5ad"
)