In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import os
import warnings
import anndata
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt


# Suppress all future warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

sc.logging.print_header()

# Steps
- [KC subcluster annotations](#section1)
- [KC final figs](#section2)
- [Prep for label transfer whole obj](#section3)
- [Whole object Paper figures](#section4)
- [Splitting SCC/BCC](#section5)
- [Immune sanity check final](#section6)
- [KC Cancer](#section8)

# Report the working directory and step up one level when the kernel was
# started from inside a path containing 'notebooks'.
cwd_before = os.getcwd()
print("Previous working directory: " + cwd_before)
if 'notebooks' in cwd_before:
    os.chdir("..")
print("Current working directory: " + os.getcwd())

In [None]:
adata = sc.read_h5ad("SCC_KCreanalysis_RNA_PP_sep2024.h5ad") #Prakrithi
adata

In [None]:
# Convert the expression matrix to a dense matrix and print it.
# NOTE(review): .todense() is applied to the whole matrix here, whereas the
# other inspection cells in this notebook only densify a small slice
# (e.g. scc.X[:30, :30]); consider slicing first if the full dense copy is
# not needed further down.
dense_matrix = adata.X.todense()
print(dense_matrix)


In [None]:
scc = sc.read_h5ad('/QRISdata/Q2051/SCC_Paper/resources/data/SCC_final_object_Nov4.h5ad')


In [None]:
print(scc.X[:30, :30].todense())


In [None]:
adata.X[:15, :5].toarray()  # View the first 15 rows and 5 columns
#adata.var

In [None]:
adata.obs.index

In [None]:
import numpy as np

# The embedding is stored under obsm['UMAP'] (presumably a DataFrame with
# 'umap_1' / 'umap_2' columns); expose the two coordinate columns as a plain
# array under obsm['X_umap'], the key used by sc.pl.umap.
umap_df = adata.obsm['UMAP']
umap_array = umap_df.loc[:, ['umap_1', 'umap_2']].to_numpy()
adata.obsm['X_umap'] = umap_array

# Display the stored coordinates
adata.obsm['X_umap']

In [None]:
sc.pl.umap(adata, color=["ident","Level1_unnamed"])

In [None]:
sc.pl.umap(adata, color=["Level2_unnamed"])

In [None]:
adata.obs

# <a id='section1'></a> KC subcluster annotations

In [None]:
# Subset to keratinocytes. `.copy()` turns the subset into an independent
# AnnData instead of a view of `adata`: the following cells write to
# idata.var / idata.X in place, which on a view would trigger an implicit
# copy (with an ImplicitModificationWarning).
idata = adata[adata.obs["Level1_unnamed"] == 'KC'].copy()
idata

In [None]:
# Flag mitochondrial genes (names starting with "MT-") and compute QC metrics
# for them; results are stored on idata in place.
idata.var["mt"] = idata.var_names.str.startswith("MT-")
sc.pp.calculate_qc_metrics(
    idata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True
)
# Normalise every cell to a total of 1e4, log1p-transform, then flag the
# highly variable genes (written to idata.var["highly_variable"]).
sc.pp.normalize_total(idata, target_sum=1e4)
sc.pp.log1p(idata)
sc.pp.highly_variable_genes(idata, min_mean=0.0125, max_mean=3, min_disp=0.5)

In [None]:
# Keep the normalised, log-transformed data for all genes in .raw, then
# restrict the working object to the highly variable genes.
idata.raw = idata
idata = idata[:, idata.var["highly_variable"]].copy()

In [None]:
# Scale genes (values clipped at 10), run PCA, then run Harmony with
# "sample_ID_corrected" as the batch key.
sc.pp.scale(idata, max_value=10)
sc.tl.pca(idata, svd_solver="arpack")
sc.external.pp.harmony_integrate(idata, "sample_ID_corrected")

# Neighbour graph on the Harmony-corrected representation, followed by Leiden
# clustering and a UMAP embedding coloured by cluster.
sc.pp.neighbors(idata, n_neighbors=10, n_pcs=50, use_rep="X_pca_harmony")
sc.tl.leiden(idata)
sc.tl.umap(idata, min_dist=0.3, random_state=0)
sc.pl.umap(idata, color=["leiden"])

In [None]:
# Marker-gene dot plot per Leiden cluster (less informative than the per-gene
# UMAP expression plots).
kc_marker_genes = {
    "KC Basal": ["KRT15", "KRT5", "KRT14", "CTNNB1", "C1orf56"],
    "KC Differentiating": ["KRT1", "KRT10"],
    "KC Cornified": ["LOR", "IVL"],
    "KC Granular": ["KRT2", "FLG"],
    "KC stem/hair/stress": ["KRT16", "KRT17", "KRT6A"],
    "KC nail/nail": ["KRT6B", "KRT79"],
    "sebocytes": ['FASN', 'PPARG', 'MUC1'],
    "adipocytes": ['FABP4', 'RBP4'],
    "eccrine gland cells": ['KRT7', 'AQP5'],
    "IFN": ["ISG15", "IFI27", "STAT1"],
    "cancer": ['TP63', 'HEY1'],
}
sc.pl.dotplot(
    idata,
    kc_marker_genes,
    groupby="leiden",
    standard_scale="var",
    color_map="Blues",
)

In [None]:
# First-pass annotation: Leiden cluster id -> keratinocyte label, written
# grouped by label and then inverted into the lookup table.
clusters_by_label = {
    "KC Basal": ["1", "2", "11"],
    "KC Differentiating": ["0", "3", "7", "9"],
    "KC Hair": ["5", "6", "8", "10", "18"],
    "KC Cornified": ["4"],
    "KC IFN": ["14"],
    "NA": ["12", "13", "15", "16", "17"],
}
new_cell_dict = {
    cluster: label
    for label, clusters in clusters_by_label.items()
    for cluster in clusters
}
# A cluster id missing from the table raises KeyError rather than being skipped.
idata.obs["cell_type_PP"] = [new_cell_dict[cluster] for cluster in idata.obs["leiden"]]
sc.pl.umap(idata, color=["cell_type_PP"])

In [None]:
# Sub-cluster Leiden cluster "10", then cluster "11" of the refined labelling.
# The refined labels are the 'leiden_R' column plotted below (sub-clusters are
# named like "10,0", as used by the annotation tables that follow).
sc.tl.leiden(idata, resolution=0.2, restrict_to=("leiden", ["10"]))
sc.tl.leiden(idata, resolution=0.2, restrict_to=("leiden_R", ["11"]))

# Further splits of clusters "4" and "6" were tried and left disabled.
#sc.tl.leiden(idata, resolution=0.2, restrict_to=("leiden_R", ["4"]))
#sc.tl.leiden(idata, resolution=0.2, restrict_to=("leiden_R", ["6"]))
sc.pl.umap(idata, color="leiden_R", legend_loc="on data", legend_fontoutline=2)

In [None]:
# Marker-gene dot plot per refined cluster ('leiden_R'); less informative than
# the per-gene UMAP expression plots.
kc_marker_genes = {
    "KC Basal": ["KRT15", "KRT5", "KRT14", "CTNNB1", "C1orf56"],
    "KC Differentiating": ["KRT1", "KRT10"],
    "KC Cornified": ["LOR", "IVL"],
    "KC Granular": ["KRT2", "FLG"],
    "KC stem/hair/stress": ["KRT16", "KRT17", "KRT6A"],
    "KC nail/nail": ["KRT6B", "KRT79"],
    "sebocytes": ['FASN', 'PPARG', 'MUC1'],
    "adipocytes": ['FABP4', 'RBP4'],
    "eccrine gland cells": ['KRT7', 'AQP5'],
    "IFN": ["ISG15", "IFI27", "STAT1"],
    "STEM": ['TP63', 'HEY1'],
}
sc.pl.dotplot(
    idata,
    kc_marker_genes,
    groupby="leiden_R",
    standard_scale="var",
    color_map="Blues",
)

In [None]:
# Second-pass annotation on the refined clusters ('leiden_R'), grouped by
# label and inverted into the cluster -> label lookup table.
clusters_by_label = {
    "KC Basal": ["1", "2", "11,0"],
    "KC Differentiating": ["0", "3", "7", "9", "11,1", "11,2", "12"],
    "KC Hair": ["5", "6", "8", "10,0", "10,1"],
    "KC Cornified": ["4"],
    "KC IFN": ["14"],
    "NA": ["13", "15", "16", "17", "18"],
}
new_cell_dict = {
    cluster: label
    for label, clusters in clusters_by_label.items()
    for cluster in clusters
}
# Overwrites the first-pass 'cell_type_PP'; unknown cluster ids raise KeyError.
idata.obs["cell_type_PP"] = [new_cell_dict[cluster] for cluster in idata.obs["leiden_R"]]
sc.pl.umap(idata, color=["cell_type_PP"])

In [None]:
import pandas as pd

# Load the per-cell metadata table (first CSV column is the index)
meta_df2 = pd.read_csv("/QRISdata/Q2051/SCC_Paper/resources/data/scc_bcc_sc_metadata.csv", sep=',', index_col=0)

# Restrict the table to the index labels it shares with idata.obs
common_bcs = idata.obs.index.intersection(meta_df2.index)
meta_df2_subset = meta_df2.loc[common_bcs]

# Copy the annotation columns across; assignment aligns on the index, so cells
# absent from the table end up with missing values.
for meta_col in ('cell_type_js', 'Level2', 'Level2_Cancer'):
    idata.obs[meta_col] = meta_df2_subset[meta_col]

sc.pl.umap(idata, color=['cell_type_js','Level2_Cancer','cell_type_PP'], ncols=3)

In [None]:
idata.write('KC_reanalysis_15Oct.h5ad')


In [None]:
# Marker-gene dot plot per refined cluster, with CD34 / SOX9 added to the
# stem/hair/stress panel (KRT19, NES and LGR5 were tried and left out).
kc_marker_genes = {
    "KC Basal": ["KRT15", "KRT5", "KRT14", "CTNNB1", "C1orf56"],
    "KC Differentiating": ["KRT1", "KRT10"],
    "KC Cornified": ["LOR", "IVL"],
    "KC Granular": ["KRT2", "FLG"],
    "KC stem/hair/stress": ["KRT16", "KRT17", "KRT6A", "CD34", "SOX9"],
    "KC nail/nail": ["KRT6B", "KRT79"],
    "sebocytes": ['FASN', 'PPARG', 'MUC1'],
    "adipocytes": ['FABP4', 'RBP4'],
    "eccrine gland cells": ['KRT7', 'AQP5'],
    "IFN": ["ISG15", "IFI27", "STAT1"],
    "STEM": ['TP63', 'HEY1'],
}
sc.pl.dotplot(
    idata,
    kc_marker_genes,
    groupby="leiden_R",
    standard_scale="var",
    color_map="Blues",
)

In [None]:
# Same marker panel, grouped by the current annotation ('cell_type_PP')
# instead of by cluster.
kc_marker_genes = {
    "KC Basal": ["KRT15", "KRT5", "KRT14", "CTNNB1", "C1orf56"],
    "KC Differentiating": ["KRT1", "KRT10"],
    "KC Cornified": ["LOR", "IVL"],
    "KC Granular": ["KRT2", "FLG"],
    "KC stem/hair/stress": ["KRT16", "KRT17", "KRT6A", "CD34", "SOX9"],
    "KC nail/nail": ["KRT6B", "KRT79"],
    "sebocytes": ['FASN', 'PPARG', 'MUC1'],
    "adipocytes": ['FABP4', 'RBP4'],
    "eccrine gland cells": ['KRT7', 'AQP5'],
    "IFN": ["ISG15", "IFI27", "STAT1"],
    "STEM": ['TP63', 'HEY1'],
}
sc.pl.dotplot(
    idata,
    kc_marker_genes,
    groupby="cell_type_PP",
    standard_scale="var",
    color_map="Blues",
)

In [None]:
# Alternative annotation ('cell_type_PP2'): clusters 5 and 6 are labelled
# basal here instead of hair.
clusters_by_label = {
    "KC Basal": ["1", "2", "5", "6", "11,0"],
    "KC Differentiating": ["0", "3", "7", "9", "11,1", "11,2", "12"],
    "KC Hair": ["8", "10,0", "10,1"],
    "KC Cornified": ["4"],
    "KC IFN": ["14"],
    "NA": ["13", "15", "16", "17", "18"],
}
new_cell_dict = {
    cluster: label
    for label, clusters in clusters_by_label.items()
    for cluster in clusters
}
idata.obs["cell_type_PP2"] = [new_cell_dict[cluster] for cluster in idata.obs["leiden_R"]]

# Marker panel grouped by the alternative annotation.
kc_marker_genes = {
    "KC Basal": ["KRT15", "KRT5", "KRT14", "CTNNB1", "C1orf56"],
    "KC Differentiating": ["KRT1", "KRT10"],
    "KC Cornified": ["LOR", "IVL"],
    "KC Granular": ["KRT2", "FLG"],
    "KC stem/hair/stress": ["KRT16", "KRT17", "KRT6A", "CD34", "SOX9"],
    "KC nail/nail": ["KRT6B", "KRT79"],
    "sebocytes": ['FASN', 'PPARG', 'MUC1'],
    "adipocytes": ['FABP4', 'RBP4'],
    "eccrine gland cells": ['KRT7', 'AQP5'],
    "IFN": ["ISG15", "IFI27", "STAT1"],
    "STEM": ['TP63', 'HEY1'],
}
sc.pl.dotplot(
    idata,
    kc_marker_genes,
    groupby="cell_type_PP2",
    standard_scale="var",
    color_map="Blues",
)

In [None]:
from matplotlib import rcParams

# Rank marker genes per refined cluster (Wilcoxon, top 200) and filter them;
# the filtered ranking is stored under 'rank_genes_groups_filtered'.
sc.tl.rank_genes_groups(idata, 'leiden_R', method='wilcoxon', n_genes=200)
sc.tl.filter_rank_genes_groups(idata)

# Global matplotlib settings: applied to this plot and every later one.
rcParams.update({'figure.figsize': (4, 4), 'axes.grid': True})
sc.pl.rank_genes_groups(idata, key='rank_genes_groups_filtered', ncols=3)

In [None]:
axs = sc.pl.rank_genes_groups_dotplot(idata, n_genes=20, groups=['5', '6','8','10,0','10,1'])

In [None]:
# Top 20 ranked genes for the clusters under review
for cluster in ('5', '6', '8', '10,0', '10,1'):
    print(f'C{cluster}:', idata.uns['rank_genes_groups']['names'][cluster][:20])


In [None]:
idata.obs['sample_ID_corrected'].unique()

In [None]:
idata.obs['diagnosis_corrected'].unique()

In [None]:
idata

In [None]:
def split_umap(adata, split_by, ncol=2, nrow=None, **kwargs):
    """Draw one UMAP panel per category of ``adata.obs[split_by]``.

    Parameters
    ----------
    adata
        Annotated data object with a UMAP embedding.
    split_by
        Name of a categorical column in ``adata.obs``; one panel is drawn per
        category, in category order.
    ncol
        Number of panel columns.
    nrow
        Number of panel rows; derived from the category count when ``None``.
    **kwargs
        Passed through to ``sc.pl.umap`` (e.g. ``color``, ``legend_loc``).
    """
    categories = adata.obs[split_by].cat.categories
    if nrow is None:
        nrow = int(np.ceil(len(categories) / ncol))
    # squeeze=False always yields a 2-D array of axes, so a 1x1 grid no longer
    # returns a bare Axes object that has no .flatten().
    fig, axs = plt.subplots(nrow, ncol, figsize=(5*ncol, 4*nrow), squeeze=False)
    axs = axs.flatten()
    for i, cat in enumerate(categories):
        ax = axs[i]
        sc.pl.umap(adata[adata.obs[split_by] == cat], ax=ax, show=False, title=cat, **kwargs)
    # Hide grid cells that have no category instead of showing empty frames.
    for unused_ax in axs[len(categories):]:
        unused_ax.set_visible(False)
    plt.tight_layout()
split_umap(idata, color = ['Level2_Cancer'], split_by='diagnosis_corrected',legend_loc = "on data")


In [None]:
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt

def split_umap(adata, split_by, ncol=2, nrow=None, size=None, **kwargs):
    """Draw one UMAP panel per category of ``adata.obs[split_by]`` and show it.

    Parameters
    ----------
    adata
        Annotated data object with a UMAP embedding.
    split_by
        Name of a categorical column in ``adata.obs``; one panel is drawn per
        category, in category order.
    ncol
        Number of panel columns.
    nrow
        Number of panel rows; derived from the category count when ``None``.
    size
        Point size forwarded to ``sc.pl.umap``.
    **kwargs
        Passed through to ``sc.pl.umap`` (e.g. ``color``, ``legend_loc``).
    """
    categories = adata.obs[split_by].cat.categories

    # Number of rows needed to fit every category
    if nrow is None:
        nrow = int(np.ceil(len(categories) / ncol))

    # squeeze=False always yields a 2-D array of axes, so a 1x1 grid no longer
    # returns a bare Axes object that has no .flatten().
    fig, axs = plt.subplots(nrow, ncol, figsize=(5 * ncol, 4 * nrow), squeeze=False)
    axs = axs.flatten()

    for i, cat in enumerate(categories):
        ax = axs[i]
        # Cells belonging to the current category only
        subset = adata[adata.obs[split_by] == cat]
        sc.pl.umap(subset, ax=ax, show=False, title=cat, size=size, **kwargs)

    # Hide grid cells that have no category instead of showing empty frames.
    for unused_ax in axs[len(categories):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Example usage
split_umap(idata, split_by='diagnosis_corrected', ncol=4, size=20, color=['leiden_R'], legend_loc="on data")


In [None]:
split_umap(idata, split_by='diagnosis_corrected', ncol=4, size=20, color=['Level2_Cancer'], legend_loc="on data")


In [None]:
def categorize_clusters(row):
    """Return 'Cancer' when the row's 'Level2_Cancer' is 'KC Cancer', else 'none'."""
    return 'Cancer' if row['Level2_Cancer'] == 'KC Cancer' else 'none'

# Two-colour palette for the cancer flag
colors = {
    'Cancer': 'red',
    'none': 'lightgrey'
}

# Row-wise flag ('Cancer' / 'none') and the matching colour per cell
idata.obs['cancer'] = idata.obs.apply(categorize_clusters, axis=1)
idata.obs['cancer_color'] = idata.obs['cancer'].map(colors)

# One UMAP panel per diagnosis, coloured by the flag
# (single-panel alternative: sc.pl.umap(idata, color='cancer', size=10, palette=colors))
split_umap(idata, split_by='diagnosis_corrected', ncol=4, size=20, color=['cancer'], legend_loc="best", palette=colors)


In [None]:
idata

In [None]:
sc.pl.umap(idata, color=["leiden_R","cell_type_PP"], legend_loc="on data")

In [None]:
# Final KC annotation: clusters 5, 6 and 8 are labelled dysplastic (atypical).
clusters_by_label = {
    "KC Basal": ["1", "2", "11,0"],
    "KC Differentiating": ["0", "3", "7", "9", "11,1", "11,2", "12"],
    "KC Dysplastic": ["5", "6", "8"],
    "KC Hair": ["10,0", "10,1"],
    "KC Cornified": ["4"],
    "KC IFN": ["14"],
    "NA": ["13", "15", "16", "17", "18"],
}
new_cell_dict = {
    cluster: label
    for label, clusters in clusters_by_label.items()
    for cluster in clusters
}

idata.obs["cell_type_PP_final"] = [new_cell_dict[cluster] for cluster in idata.obs["leiden_R"]]
sc.pl.umap(idata, color=["cell_type_PP_final"])

# Final object: immune + KC annotations

In [None]:
imm=pd.read_csv("/QRISdata/Q2051/SCC_Paper/resources/data/allintegratedClustered_w_JS_imm_and_kc_labels.csv", index_col=0)
imm

In [None]:
# Look up each cell's label in the 'cell_type_js' column of `imm`; cells
# missing from `imm` get a missing value.
adata.obs['Level3_final'] = adata.obs.index.map(imm['cell_type_js'])

# Cells present in both adata.obs and idata.obs
matching_indices = adata.obs.index.intersection(idata.obs.index)

# Overwrite 'Level3_final' for those cells with the KC annotation from idata
adata.obs.loc[matching_indices, 'Level3_final'] = idata.obs.loc[matching_indices, 'cell_type_PP_final']
sc.pl.umap(adata, color=["Level3_final"])

# Same annotation on the keratinocyte compartment only
KC = adata[adata.obs["Level1_unnamed"] == 'KC']
sc.pl.umap(KC, color=["Level3_final"])

In [None]:
Immune = adata[adata.obs["Level1_unnamed"] == 'immune']
sc.pl.umap(Immune, color=["Level3_final"])

In [None]:
adata

In [None]:
print(adata.obs['Level3_final'].unique().tolist())


In [None]:
# Collapse the fine-grained 'Level3_final' labels into coarser 'Level2_Final'
# groups. Every value found in 'Level3_final' must be a key of this table:
# the lookup below raises KeyError for any label (or missing value) not listed.
new_cell_dict = {
   'KC Basal':"KC Basal", 'KC Differentiating':'KC Differentiating', 'KC Cornified':'KC Cornified',
    'Imm_LC':"Imm_LC", 'melanocyte_0':"Melanocytes", 'Ambiguous':"Ambiguous", 'KC Dysplastic':'KC Dysplastic',
    'Imm_CD8Tem':"Imm_T cell", 'KC Hair':'KC Hair', 'melanocyte_2':"Melanocytes", 'Imm_Treg':"Imm_T cell", 'NA':"Ambiguous", 'Imm_CD4Tcm':"Imm_T cell",
    'Imm_CX3CR1+ Mono/Mac':"Immune_Mono/Mac", 
    'Imm_mRegDC':"Imm_DC", 'Imm_CD14+ Mono':"Imm_Monocytes", 'Imm_CD16- NK':"Imm_NK", 'melanocyte_1':"Melanocytes", 'Endothelial_0':"Endothelial",
    'Imm_Plasma':"Imm_Plasma", 'Imm_LC KI67+':"Imm_LC",
    'Imm_CD16+ NK':"Imm_NK", 'Imm_NKT':"Imm_NKT", 'fibroblast_2':"Fibroblast", 'Imm_TREM2+ Mac':"Imm_Macrophage", 'Imm_B cell':"Imm_B cell", 
    'fibroblast_1':"Fibroblast", 
    'Endothelial_1':"Endothelial", 'Endothelial_2':"Endothelial", 'fibroblast_0':"Fibroblast", 'Imm_CD169+ Mac':"Imm_Macrophage", 'KC IFN':'KC IFN',
    'Imm_IFN+ Mac':"Imm_Macrophage", 'Imm_PD-1+CTLA4+ CD8Tcm':"Imm_T cell", 'Imm_PD-1+ CD8Tem':"Imm_T cell", 'Endothelial_3':"Endothelial", 'Imm_DC1/DC2':"Imm_DC",
    'fibroblast_3':"Fibroblast"
    
}

adata.obs["Level2_Final"] = [new_cell_dict[x] for x in adata.obs["Level3_final"]]
sc.pl.umap(adata, color=["Level2_Final"])

In [None]:
print(adata.obs['Level2_Final'].unique().tolist())


In [None]:
# Collapse 'Level2_Final' into broad lineages ('Level1_Final'): written grouped
# by lineage, then inverted into the Level2 -> Level1 lookup table.
level1_members = {
    "KC": ['KC Basal', 'KC Differentiating', 'KC Cornified', 'KC Dysplastic', 'KC Hair', 'KC IFN'],
    "Immune": [
        'Imm_LC', 'Imm_T cell', 'Immune_Mono/Mac', 'Imm_DC', 'Imm_Monocytes',
        'Imm_NK', 'Imm_Plasma', 'Imm_NKT', 'Imm_Macrophage', 'Imm_B cell',
    ],
    "Melanocytes": ['Melanocytes'],
    "Ambiguous": ['Ambiguous'],
    "Endothelial": ['Endothelial'],
    "Fibroblast": ['Fibroblast'],
}
new_cell_dict = {
    level2: level1
    for level1, members in level1_members.items()
    for level2 in members
}

adata.obs["Level1_Final"] = [new_cell_dict[level2] for level2 in adata.obs["Level2_Final"]]
sc.pl.umap(adata, color=["Level1_Final"])

In [None]:
print(scc.obs['Level2_Cancer'].unique().tolist())

In [None]:
# NOTE(review): `cancer` is not assigned in any cell above this one, so this
# cell raises NameError on a fresh kernel. It is presumably an AnnData object
# carrying a 'Level2_Cancer' column (possibly derived from `scc`) — define it
# explicitly before this cell.
# Step 1: Start 'Level2_Cancer' from the 'Level2_Final' labels, and cast both it and cancer's 'Level2_Cancer' to string to avoid category mismatches
adata.obs['Level2_Cancer'] = adata.obs['Level2_Final'].astype(str)
cancer.obs['Level2_Cancer'] = cancer.obs['Level2_Cancer'].astype(str)

# Step 2: Get the intersection of indices between adata and cancer, and update only matching cells
matching_indices = adata.obs.index.intersection(cancer.obs.index)
adata.obs.loc[matching_indices, 'Level2_Cancer'] = cancer.obs.loc[matching_indices, 'Level2_Cancer']

# Step 3: Convert 'Level2_Cancer' back to categorical, using the combined categories from both data frames
combined_categories = sorted(set(adata.obs['Level2_Cancer']).union(cancer.obs['Level2_Cancer']))
adata.obs['Level2_Cancer'] = pd.Categorical(adata.obs['Level2_Cancer'], categories=combined_categories)

# Step 4: Plot the UMAP with the updated 'Level2_Cancer' annotations
sc.pl.umap(adata, color=["Level2_Cancer"])


In [None]:
# Step 1: Start 'Level3_Cancer' from the 'Level3_final' labels (as strings), then
# overwrite the cells shared with `cancer` using cancer's 'Level2_Cancer' labels.
# Relies on `matching_indices` and `cancer` as left by the previous cell.
adata.obs['Level3_Cancer'] = adata.obs['Level3_final'].astype(str)
adata.obs.loc[matching_indices, 'Level3_Cancer'] = cancer.obs.loc[matching_indices, 'Level2_Cancer']

# Step 2: Convert 'Level3_Cancer' to categorical, using the combined categories from both data frames
combined_categories = sorted(set(adata.obs['Level3_Cancer']).union(cancer.obs['Level2_Cancer']))
adata.obs['Level3_Cancer'] = pd.Categorical(adata.obs['Level3_Cancer'], categories=combined_categories)

# Step 3: Plot the UMAP with the updated 'Level3_Cancer' annotations
sc.pl.umap(adata, color=["Level3_Cancer"])


In [None]:
adata.obs

In [None]:
adata

In [None]:
scc

In [None]:
adata.obs['cancer_status'].unique()

In [None]:
# Load the integrated object whose obs columns 'Copykat_aneuploid' and
# 'InferCNV_Aneuploid' are copied into `adata` below.
rna_adata=anndata.read_h5ad("/QRISdata/Q2051/SCC_Paper/resources/data/allintegratedClustered_Seurat_JS_3_withCNV.h5ad")

def handle_categorical(column):
    """Return ``column`` as a categorical Series with missing values set to 'nan'.

    Parameters
    ----------
    column : pandas.Series
        Categorical or non-categorical column, possibly containing missing values.

    Returns
    -------
    pandas.Series
        Categorical Series on the same index as ``column`` in which every
        missing value is replaced by the string 'nan'.
    """
    if column.dtype.name == 'category':
        # 'nan' must be a declared category before it can be used as a fill value
        if 'nan' not in column.cat.categories:
            column = column.cat.add_categories(['nan'])
        return column.fillna('nan')
    # Wrap the Categorical in a Series on the original index so both branches
    # return the same type (a bare pd.Categorical carries no index).
    return pd.Series(pd.Categorical(column.fillna('nan')), index=column.index)

# Bring the two aneuploidy calls over from rna_adata (aligned on the obs index)
cnv_columns = ('Copykat_aneuploid', 'InferCNV_Aneuploid')
for cnv_col in cnv_columns:
    adata.obs[cnv_col] = rna_adata.obs[cnv_col]

# Categorical copies in which missing calls are labelled 'nan'
for pred_col, cnv_col in zip(('copykat_pred', 'infercnv_pred'), cnv_columns):
    adata.obs[pred_col] = handle_categorical(adata.obs[cnv_col])

# Combine the two aneuploidy calls into a single label per cell
def categorize_clusters(row):
    """Classify a row by which method(s) called it 'Aneuploid'.

    Reads ``row['infercnv_pred']`` and ``row['copykat_pred']`` and returns
    'overlap', 'infercnv_only', 'copykat_only' or 'none'.
    """
    infercnv_hit = row['infercnv_pred'] == 'Aneuploid'
    copykat_hit = row['copykat_pred'] == 'Aneuploid'

    if infercnv_hit and copykat_hit:
        return 'overlap'
    if infercnv_hit:
        return 'infercnv_only'
    if copykat_hit:
        return 'copykat_only'
    return 'none'

# Palette for the combined aneuploidy label
colors = {
    'infercnv_only': 'blue',
    'copykat_only': 'orange',
    'overlap': 'red',
    'none': 'lightgrey'
}

# Row-wise combined label and the matching colour per cell
adata.obs['Aneuploid_combined_clusters'] = adata.obs.apply(categorize_clusters, axis=1)
adata.obs['Aneuploid_combined_clusters_color'] = adata.obs['Aneuploid_combined_clusters'].map(colors)

# UMAP with small points and the custom palette
sc.pl.umap(
    adata,
    color='Aneuploid_combined_clusters',
    title='UMAP of Combined Clusters',
    size=10,
    palette=colors,
)


In [None]:
# Carry the KC clustering ('leiden' and the refined 'leiden_R') over to the
# full object, provided both columns exist in idata.obs
if 'leiden_R' in idata.obs.columns and 'leiden' in idata.obs.columns:
    # Only the rows of adata.obs that belong to idata are filled in
    adata.obs.loc[idata.obs.index, 'KC_leiden_R'] = idata.obs['leiden_R']
    adata.obs.loc[idata.obs.index, 'KC_leiden'] = idata.obs['leiden']

    # Verify the update
    print(adata.obs[['KC_leiden_R', 'KC_leiden']].head())
else:
    # The message names the columns that are actually checked above
    print("Columns 'leiden_R' and/or 'leiden' not found in idata.obs")


In [None]:
adata

In [None]:
# Intermediate annotation columns that are not kept in the final object.
# Not re-runnable: a second run raises KeyError because the columns are gone.
columns_to_remove = [
    'Level1_unnamed',
    'Level2_unnamed',
    'Level1.5_unnamed',
    'copykat_pred',
    'infercnv_pred',
]  # sort_order
adata.obs = adata.obs.drop(columns_to_remove, axis=1)

# Display the object to check the columns are gone
adata

In [None]:
adata.obs['integrated_snn_res.0.4'].unique()

In [None]:
# Drop the '_corrected' suffix from the sample / diagnosis / status columns.
# NOTE(review): an earlier cell already reads adata.obs['cancer_status'];
# if that column only exists after this rename, the cells are out of order —
# confirm against the loaded object.
columns_to_rename = {
    'sample_ID_corrected': 'sample_ID',
    'diagnosis_corrected': 'diagnosis',
    'cancer_status_corrected':'cancer_status'
}
adata.obs = adata.obs.rename(columns_to_rename, axis="columns")

# Display the object to check the new names
adata

In [None]:
adata.obs['2CNV_mod0.5'].unique()

In [None]:
def handle_categorical(column):
    """Return ``column`` as a categorical Series with missing values set to 'nan'.

    Parameters
    ----------
    column : pandas.Series
        Categorical or non-categorical column, possibly containing missing values.

    Returns
    -------
    pandas.Series
        Categorical Series on the same index as ``column`` in which every
        missing value is replaced by the string 'nan'.
    """
    if column.dtype.name == 'category':
        # 'nan' must be a declared category before it can be used as a fill value
        if 'nan' not in column.cat.categories:
            column = column.cat.add_categories(['nan'])
        return column.fillna('nan')
    # Wrap the Categorical in a Series on the original index so both branches
    # return the same type (a bare pd.Categorical carries no index).
    return pd.Series(pd.Categorical(column.fillna('nan')), index=column.index)

# Copy the '2CNV_mod0.5' column from `scc` (aligned on the obs index)
adata.obs['2CNV_mod0.5']=scc.obs['2CNV_mod0.5']



# Make it categorical, with missing values labelled 'nan'
adata.obs['2CNV_mod0.5'] = handle_categorical(adata.obs['2CNV_mod0.5'])

# Reduce the '2CNV_mod0.5' call to a two-level cancer flag
def categorize_clusters(row):
    """Return 'Cancer' when the row's '2CNV_mod0.5' is 'Cancer', else 'none'."""
    return 'Cancer' if row['2CNV_mod0.5'] == 'Cancer' else 'none'

# Row-wise 'Cancer' / 'none' flag derived from '2CNV_mod0.5'
adata.obs['KC_cancer_2CNV_mod0.5'] = adata.obs.apply(categorize_clusters, axis=1)

# Palette for the cancer flag
colors_cancer = {
    'Cancer': 'red',
    'none': 'lightgrey'
}

# Colour per cell. This must use `colors_cancer`: the previously used `colors`
# is the aneuploidy palette, which has no 'Cancer' key, so every cancer cell
# was mapped to a missing colour.
adata.obs['KC_cancer_2CNV_mod0.5_color'] = adata.obs['KC_cancer_2CNV_mod0.5'].map(colors_cancer)

# Plot the UMAP with reduced spot size and custom colors
sc.pl.umap(adata, color='KC_cancer_2CNV_mod0.5', title='KC Cancer 2CNV+mod0.5', size=10, palette=colors_cancer)


In [None]:
adata.write('SCC_final_object_28Oct.h5ad')


# <a id='section2'></a> KC final figs

In [None]:
adata=anndata.read_h5ad('SCC_final_object_28Oct.h5ad')
idata=anndata.read_h5ad("KC_reanalysis_15Oct.h5ad")

In [None]:
# Keep the obs index in a regular column: later cells call
# obs.reset_index(drop=True) on the KC subset, which discards the index.
idata.obs['BC']=idata.obs.index

In [None]:
adata

In [None]:
idata

In [None]:
sc.pl.umap(idata, color=['cell_type_PP','leiden_R'], show=True, legend_loc="on data")


In [None]:
# Select the KC-labelled cells of `idata` by name and pull the matching cells
# out of `rna_adata`. A boolean mask built from idata.obs cannot be applied to
# rna_adata directly: it is a different object, so the mask neither has the
# right length nor lines up with its cells.
# NOTE(review): `rna_adata` is only loaded in an earlier section; after a kernel
# restart at the "KC final figs" section it has to be re-read first.
kc_barcodes = idata.obs_names[idata.obs['cell_type_PP'].str.startswith('KC', na=False).to_numpy()]
KC_only = rna_adata[rna_adata.obs_names.isin(kc_barcodes)].copy()


In [None]:
# Ensure 'Imm_Treg' is in the categories for 'Level2_Cancer'
if 'Imm_Treg' not in adata.obs['Level2_Cancer'].cat.categories:
    adata.obs['Level2_Cancer'] = adata.obs['Level2_Cancer'].cat.add_categories(['Imm_Treg'])
# Set 'Level2_Cancer' to 'Imm_Treg' for every cell whose 'Level3_Cancer' is 'Imm_Treg'
adata.obs.loc[adata.obs['Level3_Cancer'] == 'Imm_Treg', 'Level2_Cancer'] = 'Imm_Treg'

# Ensure 'Imm_T cell' is in the categories for 'Level2_Cancer'
if 'Imm_T cell' not in adata.obs['Level2_Cancer'].cat.categories:
    adata.obs['Level2_Cancer'] = adata.obs['Level2_Cancer'].cat.add_categories(['Imm_T cell'])
# Replace 'Imm_NKT' with 'Imm_T cell' in 'Level2_Cancer'
adata.obs['Level2_Cancer'] = adata.obs['Level2_Cancer'].replace('Imm_NKT', 'Imm_T cell')

# Relabel the 'NA' placeholder as 'Ambiguous' at both annotation levels
adata.obs['Level2_Cancer'] = adata.obs['Level2_Cancer'].replace('NA', 'Ambiguous')
adata.obs['Level3_Cancer'] = adata.obs['Level3_Cancer'].replace('NA', 'Ambiguous')


In [None]:
# Ensure 'Imm_Treg' is in the categories for 'Level2_Final'
if 'Imm_Treg' not in adata.obs['Level2_Final'].cat.categories:
    adata.obs['Level2_Final'] = adata.obs['Level2_Final'].cat.add_categories(['Imm_Treg'])
# Set 'Level2_Final' to 'Imm_Treg' for every cell whose fine-grained
# 'Level3_final' label is 'Imm_Treg'. The mask has to come from the Level3
# column: testing 'Level2_Final' against the category that was just added can
# never match, which made this step a no-op.
adata.obs.loc[adata.obs['Level3_final'] == 'Imm_Treg', 'Level2_Final'] = 'Imm_Treg'

# Ensure 'Imm_T cell' is in the categories for 'Level2_Final'
if 'Imm_T cell' not in adata.obs['Level2_Final'].cat.categories:
    adata.obs['Level2_Final'] = adata.obs['Level2_Final'].cat.add_categories(['Imm_T cell'])
# Replace 'Imm_NKT' with 'Imm_T cell' in 'Level2_Final'
adata.obs['Level2_Final'] = adata.obs['Level2_Final'].replace('Imm_NKT', 'Imm_T cell')

# Relabel the 'NA' placeholder as 'Ambiguous' (the identical second call was redundant).
# NOTE(review): the matching 'Level2_Cancer' clean-up also relabels the Level3
# column; if 'Level3_final' should be cleaned the same way, add it here — confirm.
adata.obs['Level2_Final'] = adata.obs['Level2_Final'].replace('NA', 'Ambiguous')


In [None]:
#sc.pl.umap(adata, color='Level2_Cancer', size=10)
adata.obs['Level2_Cancer'].unique().tolist()

In [None]:
# Palette for the combined aneuploidy label
colors = {
    'infercnv_only': 'blue',
    'copykat_only': 'orange',
    'overlap': 'red',
    'none': 'lightgrey'
}

# Pull the combined label from `adata` (aligned on idata's index), then keep
# only the cells whose 'cell_type_PP' starts with 'KC'
aneuploid_labels = adata.obs['Aneuploid_combined_clusters'].reindex(idata.obs.index)
idata.obs['Aneuploid_combined_clusters'] = aneuploid_labels
idata = idata[idata.obs['cell_type_PP'].str.startswith('KC')].copy()

# Colour per cell
idata.obs['Aneuploid_combined_clusters_color'] = idata.obs['Aneuploid_combined_clusters'].map(colors)

# Small-point UMAP, saved as a PDF
with plt.rc_context({"figure.figsize": (5, 4), "figure.dpi": 300}):
    sc.pl.umap(
        idata,
        color='Aneuploid_combined_clusters',
        size=4,
        palette=colors,
        show=False,
    )
    plt.savefig("/scratch/project/stseq/Prakrithi/skin_atlas/reanalysis_figs/scc/KC_Aneuploid.pdf", bbox_inches="tight")
# Plot the UMAP with reduced spot size and custom colors



In [None]:
import matplotlib.pyplot as plt
import scanpy as sc

# Draw both cell groups on one axis so the aneuploid calls sit on top
fig, ax = plt.subplots(figsize=(5, 5))  # Adjust figure size as needed

# Background layer: 'none' cells in grey; foreground layer: everything else
is_none = idata.obs['Aneuploid_combined_clusters'] == 'none'
layers = (
    (is_none, {'none': 'lightgrey'}),
    (~is_none, colors),
)
for layer_mask, layer_palette in layers:
    sc.pl.umap(
        idata[layer_mask],
        color='Aneuploid_combined_clusters',
        size=6,
        palette=layer_palette,
        ax=ax,
        show=False
    )

plt.savefig("/scratch/project/stseq/Prakrithi/skin_atlas/reanalysis_figs/scc/KC_Aneuploid_reordered.pdf", format="pdf")
# Display the combined plot
plt.show()


In [None]:
# Palette for the combined aneuploidy label
colors = {
    'infercnv_only': 'blue',
    'copykat_only': 'orange',
    'overlap': 'red',
    'none': 'lightgrey'
}

# Refresh the combined label from `adata`, aligned on idata's index
aneuploid_labels = adata.obs['Aneuploid_combined_clusters'].reindex(idata.obs.index)
idata.obs['Aneuploid_combined_clusters'] = aneuploid_labels

# Colour per cell
idata.obs['Aneuploid_combined_clusters_color'] = idata.obs['Aneuploid_combined_clusters'].map(colors)

# UMAP with small points and the custom palette
sc.pl.umap(
    idata,
    color='Aneuploid_combined_clusters',
    size=10,
    palette=colors,
)


In [None]:
adata

In [None]:
# Copy the final annotation columns from the full object onto the KC object,
# aligned on idata's index (cells absent from adata get missing values)
for annotation in ('Level2_Cancer', 'Level3_Cancer', 'Level2_Final', 'Level3_final'):
    idata.obs[annotation] = adata.obs[annotation].reindex(idata.obs.index)

sc.pl.umap(idata, color='Level3_final', size=10)


In [None]:
sc.pl.umap(idata, color='Level2_Final', size=10)


In [None]:
# Check if Level3_Cancer is in idata.obs
if 'Level3_Cancer' in idata.obs.columns:
    # Subset idata to keep only cells where Level3_Cancer starts with 'KC'
    idata_subset = idata[idata.obs['Level3_Cancer'].str.startswith('KC')]
else:
    raise KeyError("Level3_Cancer not found in idata.obs")
# Materialise the subset as an independent object (the line above is a view)
idata_subset = idata_subset.copy()
# NOTE(review): this replaces the obs index with 0..n-1 in place, so the
# original index labels are no longer the obs index of idata_subset; they
# survive only in the 'BC' column added earlier. Confirm this is intended
# before the object is written to disk.
idata_subset.obs.reset_index(drop=True, inplace=True)


# Colour per keratinocyte label ('KC Cancer' is left out for this figure)
kc_color_dict = {
   # "pDC"= "#8A2BE2",
  "KC Basal" : "#FF6A6A",
  "KC Cornified" : "#8B3A62",
  "KC Differentiating" : "#ab82ff",
  "KC Hair" : "#FF0000",
 # "KC Cancer" : "black",
 'KC Granular':'#008941',
'KC IFN' : '#7b4f4b',
 "KC Dysplastic": "#dac0eb", 'Aneuploid':'grey'
}


# Render at 300 dpi and save; the cells are selected on 'Level3_Cancer' but
# coloured by 'Level3_final'.
with plt.rc_context({"figure.figsize": (4, 4), "figure.dpi": (300)}):
    sc.pl.umap(
    idata_subset,  # KC-only subset built above
    color='Level3_final',  # categorical column used for colouring
    palette=kc_color_dict,  # label -> colour
    show=False, size=10 # keep the figure open so it can be saved below
)
    plt.savefig("/scratch/project/stseq/Prakrithi/skin_atlas/reanalysis_figs/scc/KC_subclusters_types.pdf", bbox_inches="tight")


In [None]:
# Keep only the cells whose 'Level3_Cancer' label starts with 'KC'
if 'Level3_Cancer' not in idata.obs.columns:
    raise KeyError("Level3_Cancer not found in idata.obs")
idata_subset = idata[idata.obs['Level3_Cancer'].str.startswith('KC')].copy()
# Replaces the obs index with 0..n-1 in place; the original labels survive
# only in the 'BC' column added earlier.
idata_subset.obs.reset_index(drop=True, inplace=True)

# Colour per keratinocyte label
kc_color_dict = {
    "KC Basal": "#FF6A6A",
    "KC Cornified": "#8B3A62",
    "KC Differentiating": "#ab82ff",
    "KC Hair": "#FF0000",
    "KC Cancer": "black",
    'KC Granular': '#008941',
    'KC IFN': '#7b4f4b',
    "KC Dysplastic": "#dac0eb",
    'Aneuploid': 'grey',
}

# show=False keeps the figure open, so tight_layout() and show() below act on
# this plot; with show=True the plot was already displayed and the two calls
# operated on a new, empty figure.
sc.pl.umap(
    idata_subset,
    color='Level3_Cancer',
    palette=kc_color_dict,
    show=False,
    size=10,
)

# Optimize layout
plt.tight_layout()
plt.show()


In [None]:
# Keep only the cells whose 'Level3_final' label starts with 'KC'
if 'Level3_final' not in idata.obs.columns:
    # The message names the column that is actually checked
    raise KeyError("Level3_final not found in idata.obs")
idata_subset = idata[idata.obs['Level3_final'].str.startswith('KC')].copy()
# Replaces the obs index with 0..n-1 in place; the original labels survive
# only in the 'BC' column added earlier.
idata_subset.obs.reset_index(drop=True, inplace=True)

# Colour per keratinocyte label
kc_color_dict = {
    "KC Basal": "#FF6A6A",
    "KC Cornified": "#8B3A62",
    "KC Differentiating": "#ab82ff",
    "KC Hair": "#FF0000",
    "KC Cancer": "black",
    'KC Granular': '#008941',
    'KC IFN': '#7b4f4b',
    "KC Dysplastic": "#dac0eb",
    'Aneuploid': 'grey',
}

# show=False keeps the figure open, so tight_layout() and show() below act on
# this plot; with show=True the plot was already displayed and the two calls
# operated on a new, empty figure.
sc.pl.umap(
    idata_subset,
    color='Level3_final',
    palette=kc_color_dict,
    show=False,
    size=10,
)

# Optimize layout
plt.tight_layout()
plt.show()


In [None]:
sc.pl.umap(idata_subset, color='Aneuploid_combined_clusters', size=10, palette=colors)


In [None]:
sc.pl.umap(idata_subset, color='Level3_Cancer', size=10)


In [None]:
idata_subset

In [None]:
# KC subset coloured by sample, rendered at 300 dpi and saved as a PDF
kc_samples_rc = {"figure.figsize": (4, 4), "figure.dpi": 300}
with plt.rc_context(kc_samples_rc):
    # show=False keeps the figure open so it can be saved
    sc.pl.umap(idata_subset, color='sample_ID_corrected', show=False, size=10)
    plt.savefig("/scratch/project/stseq/Prakrithi/skin_atlas/reanalysis_figs/scc/KC_samples.pdf", bbox_inches="tight")


In [None]:
idata_subset.write('/QRISdata/Q4386/skin_atlas/SCC_BCC/SCC_KC_only_final_object_21Nov.h5ad')


In [None]:
idata_subset.write('SCC_KC_only_final_object_21Nov.h5ad')


In [None]:
#idata_subset.obs.index=idata_subset.obs['BC']
idata_subset.obs

In [None]:
# Step 1: (Re)initialise adata's 'Level2_Cancer' from 'Level2_Final' and cast both label
# columns to plain strings so the assignment below is not blocked by category mismatches.
# NOTE(review): any existing adata.obs['Level2_Cancer'] is overwritten here.
adata.obs['Level2_Cancer'] = adata.obs['Level2_Final'].astype(str)
cancer.obs['Level2_Cancer'] = cancer.obs['Level2_Cancer'].astype(str)

# Step 2: For cells present in both objects (matched on the .obs index), take the label from `cancer`
matching_indices = adata.obs.index.intersection(cancer.obs.index)
adata.obs.loc[matching_indices, 'Level2_Cancer'] = cancer.obs.loc[matching_indices, 'Level2_Cancer']

# Step 3: Convert 'Level2_Cancer' back to categorical, using the sorted union of labels from both objects
combined_categories = sorted(set(adata.obs['Level2_Cancer']).union(cancer.obs['Level2_Cancer']))
adata.obs['Level2_Cancer'] = pd.Categorical(adata.obs['Level2_Cancer'], categories=combined_categories)

# Step 4: Plot the UMAP with the updated 'Level2_Cancer' annotations
sc.pl.umap(adata, color=["Level2_Cancer"])


In [None]:
sc.pl.umap(adata, color='Level3_Cancer', size=10)


In [None]:
adata.obs['Level2_Cancer'].unique().tolist()

In [None]:
import pandas as pd

# Preferred display order of the 'Level2_Cancer' groups in the dot plot.
level2_order = [
    'Imm_NK',
    'Imm_T cell',
    'Imm_Treg',
    'Imm_LC',
    'Imm_DC',
    'Imm_Macrophage',
    'Melanocytes',
    'Endothelial',
    'Fibroblast',
    'KC Cornified',
    'KC Differentiating',
    'KC Basal',
    'KC Cancer',
    'KC Hair',
]

# pd.Categorical with a fixed category list turns every value outside that list into NaN.
# Because the result is written back to adata.obs, labels missing from level2_order would
# be lost for all later cells. Append them after the curated order instead of dropping them.
observed_labels = adata.obs['Level2_Cancer'].dropna().astype(str).unique()
unlisted_labels = sorted(set(observed_labels) - set(level2_order))
if unlisted_labels:
    print("Level2_Cancer labels not in level2_order (appended at the end):", unlisted_labels)

# Set 'Level2_Cancer' as an ordered categorical: curated order first, then any remaining labels.
adata.obs['Level2_Cancer'] = pd.Categorical(
    adata.obs['Level2_Cancer'],
    categories=level2_order + unlisted_labels,
    ordered=True,
)

# Dot plot of marker genes, one row per 'Level2_Cancer' group in the order set above.
# NOTE(review): 'KRT15' appears twice in the marker list — confirm whether that is intended.
sc.pl.dotplot(
    adata,
    { "Category1": ['CTSW','KLRB1','NKG7','GNLY','CD52','CD3E','IL32','FOXP3','CD207','CST3','LYZ','BASP1','CD83','CD74','TYROBP',
                   'CD68','AIF1','FCER1G','DCT','MLANA','TYRP1','PMEL','GNG11','IGFBP7','RAMP2','PECAM1','EGFL7','COL1A1','COL1A2','COL6A2',
                   'DCN','PLAC9','SBSN','KRT2','DSC1','KRT15','KRT10','KRTDAP','PKP1','KRT14','KRT5','KRT15','IFI27','S100A8','KRT6A']},
    standard_scale="var",
    color_map="Reds",
    groupby="Level2_Cancer", figsize=(18, 6)
)


# Level3 colors

In [None]:
import matplotlib.pyplot as plt
# Define color list based on cell types
level3_color_dict = {
    'Endothelial cell':"#FFA500", 
     "Fibroblast": "#458B00", 
      "Imm_DC" : "#5F9EA0",
  "Imm_LC" : "#0000CD",
  "Imm_Macrophage" : "#EEEE00",
  "Imm_NK": "#9ACD32",
  "Imm_T cell": "#1874CD",
#  "Imm_Treg" : "#00B2EE",
   # "pDC"= "#8A2BE2",
  "KC Basal" : "#FF6A6A",
  "KC Cornified" : "#8B3A62",
  "KC Differentiating" : "#ab82ff",
  "KC Hair" : "#FF0000",
  "Melanocyte" : "#8B4513",
  "KC Cancer" : "black",  # Darker shade of "saddlebrown"
# 'KC Granular':'#008941',
'Imm_CD8Tem':'#7a4900',
'Imm_Treg':'#8fb0ff',
'Imm_CD4Tcm' : '#997d87',
'Imm_CX3CR1+ Mono/Mac' : '#5a0007',
'Imm_mRegDC' : '#809693',
'Imm_CD14+ Mono' : '#6a3a4c',
'Imm_CD16- NK' : '#1b4400',
'Imm_Plasma' : '#4a3b53',
'Imm_LC KI67+' : '#ff2f80',
'Imm_CD16+ NK' : '#61615a',
'Imm_NKT' : '#ba0900',
'Imm_TREM2+ Mac' : '#00c2a0',
'Imm_B cell' : '#ffaa92',
'Imm_CD169+ Mac': '#000035',
'KC IFN' : '#7b4f4b',
'Imm_IFN+ Mac' : '#a1c299',
'Imm_PD-1+CTLA4+ CD8Tcm' : '#300018',
'Imm_PD-1+ CD8Tem' : '#0aa6d8',
'Imm_DC1/DC2' : '#00846f', 'fibroblast_0':'#ddefff',
'fibroblast_3':'#372101',
'fibroblast_2':'#6b7900',
'fibroblast_1':'#ff90c9','Endothelial_0':'#3b5dff',
'Endothelial_1':'#b903aa',
'Endothelial_2':'#d16100',
    'Endothelial_3':'#013349',
'melanocyte_0':'#a30059',
'melanocyte_1':'#4fc601',
'melanocyte_2':'#004d43', "Ambiguous":"grey", "KC Dysplastic": "#dac0eb"
}

# Whole-object UMAP coloured by 'Level3_Cancer' with the custom palette.
# show=False keeps the figure open so that tight_layout() below is applied to this plot;
# with show=True the plot is rendered first and pyplot then lays out a new, empty figure.
sc.pl.umap(
    adata,
    color='Level3_Cancer',
    palette=level3_color_dict,
    show=False,
)

# Optimize layout, then render.
plt.tight_layout()
plt.show()


In [None]:
adata

In [None]:
# For each label below, copy it from 'Level2_Final' into 'Level2_Cancer'.
# The category is registered first because assigning a value that is not an existing
# category of a categorical column raises an error.
for label in ('Endothelial', 'KC Dysplastic', 'KC IFN'):
    if label not in adata.obs['Level2_Cancer'].cat.categories:
        adata.obs['Level2_Cancer'] = adata.obs['Level2_Cancer'].cat.add_categories([label])
    adata.obs.loc[adata.obs['Level2_Final'] == label, 'Level2_Cancer'] = label

# Show the labels now present in 'Level2_Cancer'.
adata.obs['Level2_Cancer'].unique().tolist()

In [None]:
sc.pl.umap(
    adata,  # Your AnnData object
    color='Level2_Final',  # Assuming 'Level3' is the categorical variable you want to color by
    show=True,  # Set to False if you don't want to display the plot immediately
)

In [None]:
sc.pl.umap(
    adata,  # Your AnnData object
    color=['Level2_Cancer'],  # Assuming 'Level3' is the categorical variable you want to color by
    show=True,  # Set to False if you don't want to display the plot immediately
)

In [None]:
import matplotlib.pyplot as plt
# Colour per Level3 cell type.
# "Imm_Treg" used to be listed twice ("#00B2EE", then '#8fb0ff'). A dict literal keeps
# only the last value, so the earlier entry was dead. It is removed here and the
# effective colour ('#8fb0ff') is unchanged.
level3_color_dict = {
    "Endothelial": "#FFA500",
    "Fibroblast": "#458B00",
    "Imm_DC": "#5F9EA0",
    "Imm_LC": "#0000CD",
    "Imm_Macrophage": "#EEEE00",
    "Imm_NK": "#9ACD32",
    "Imm_T cell": "#1874CD",
    # "pDC": "#8A2BE2",
    "KC Basal": "#FF6A6A",
    "KC Cornified": "#8B3A62",
    "KC Differentiating": "#ab82ff",
    "KC Hair": "#FF0000",
    "Melanocyte": "#8B4513",
    "KC Cancer": "black",
    "KC Granular": "#008941",
    "Imm_CD8Tem": "#7a4900",
    "Imm_Treg": "#8fb0ff",
    "Imm_CD4Tcm": "#997d87",
    "Imm_CX3CR1+ Mono/Mac": "#5a0007",
    "Imm_mRegDC": "#809693",
    "Imm_CD14+ Mono": "#6a3a4c",
    "Imm_CD16- NK": "#1b4400",
    "Imm_Plasma": "#4a3b53",
    "Imm_LC KI67+": "#ff2f80",
    "Imm_CD16+ NK": "#61615a",
    "Imm_NKT": "#ba0900",
    "Imm_TREM2+ Mac": "#00c2a0",
    "Imm_B cell": "#ffaa92",
    "Imm_CD169+ Mac": "#000035",
    "KC IFN": "#7b4f4b",
}

# Whole-object UMAP coloured by 'Level3_Cancer' with the custom palette.
# show=False keeps the figure open so that tight_layout() below is applied to this plot;
# with show=True the plot is rendered first and pyplot then lays out a new, empty figure.
sc.pl.umap(
    adata,
    color='Level3_Cancer',
    palette=level3_color_dict,
    show=False,
)

# Optimize layout, then render.
plt.tight_layout()
plt.show()


In [None]:
# Copy the CNV-based cancer call from the whole object onto the KC object.
# reindex() aligns on the .obs index; cells missing from adata get NaN.
idata.obs['KC_cancer_2CNV_mod0.5'] = adata.obs['KC_cancer_2CNV_mod0.5'].reindex(idata.obs.index)

# Colour for each value of the cancer call.
cancer_colors = {
    'Cancer': 'red',
    'none': 'lightgrey'
}

# Per-cell colour column. This previously mapped through `colors`, a different palette
# defined elsewhere in the notebook, instead of the `cancer_colors` dict defined above.
idata.obs['KC_cancer_2CNV_mod0.5_color'] = idata.obs['KC_cancer_2CNV_mod0.5'].map(cancer_colors)

# Plot the UMAP with reduced spot size and the cancer palette.
sc.pl.umap(idata, color='KC_cancer_2CNV_mod0.5', size=10, palette=cancer_colors)


In [None]:
# Intermediate CNV columns that should not be kept on the KC subset.
columns_to_remove = ['2CNV_mod0.5', 'KC_cancer_2CNV_mod0.5', 'KC_cancer_2CNV_mod0.5_color']

# errors="ignore" makes the cell safe to re-run: columns that are already gone (or were
# never present) are skipped instead of raising KeyError.
idata_subset.obs.drop(columns=columns_to_remove, inplace=True, errors="ignore")


In [None]:
# NOTE(review): the previous cell removes 'KC_cancer_2CNV_mod0.5' from idata_subset.obs, so
# this plot presumably only works if that cell was not run (or the column was re-added) — confirm.
sc.pl.umap(idata_subset, color='KC_cancer_2CNV_mod0.5', size=10, palette=cancer_colors)


In [None]:

sc.pl.umap(idata_subset, color='Aneuploid_combined_clusters', size=10, palette=colors)


In [None]:
sc.pl.umap(adata, color='KC_cancer_2CNV_mod0.5', title='KC Cancer 2CNV+mod0.5', size=10)


In [None]:
adata.write('SCC_final_object_31Oct.h5ad')
idata_subset.write('SCC_final_object_31Oct_KCs.h5ad')
adata.obs.to_csv('SCC_final_object_31Oct_metadata.txt',sep="\t")

In [None]:
# Earlier, now-disabled attempts in this cell re-labelled 'Monocytes', 'KC Dysplastic' and
# 'Imm_CX3CR1+ Mono/Mac' cells in Level2_Cancer from their Level3_Cancer labels.
#
# Register 'Imm_Plasma' as a Level2_Cancer category if it is missing: assigning a value
# that is not an existing category of a categorical column raises an error.
if 'Imm_Plasma' not in adata.obs['Level2_Cancer'].cat.categories:
    adata.obs['Level2_Cancer'] = adata.obs['Level2_Cancer'].cat.add_categories(['Imm_Plasma'])
# Propagate the label: cells that are 'Imm_Plasma' at Level3 become 'Imm_Plasma' at Level2.
adata.obs.loc[adata.obs['Level3_Cancer'] == 'Imm_Plasma', 'Level2_Cancer'] = 'Imm_Plasma'

sc.pl.umap(
    adata,  # whole-object AnnData
    color=['Level2_Cancer','Level3_Cancer'],  # one panel per annotation level
    show=True,  # render immediately
)

In [None]:
# Count cells for every (Level2_Cancer, Level3_Cancer) pair that occurs in the data.
# observed=True restricts the result to combinations that are actually present. Without it,
# grouping categorical columns builds the full cross-product of categories with zero counts.
tally = (
    adata.obs
    .groupby(['Level2_Cancer', 'Level3_Cancer'], observed=True)
    .size()
    .reset_index(name='count')
)

# Kept for downstream cells that use this name; also guards the non-categorical case.
tally_positive = tally[tally['count'] > 0]

# Display the non-empty combinations.
print(tally_positive)


In [None]:
adata

In [None]:
# Final Level2 colors

In [None]:
adata.obs['Level2_Cancer'].unique().tolist()

In [None]:
import matplotlib.pyplot as plt
# Define color list based on cell types
# Update Level2_Cancer where Level3_Cancer is Imm_Treg
adata.obs.loc[adata.obs['Level3_Cancer'] == 'Imm_Treg', 'Level2_Cancer'] = 'Imm_T cell'


level2_color_dict = {
    'Endothelial':"#FFA500", 
     "Fibroblast": "#458B00", 
      "Imm_DC" : "#5F9EA0",
  "Imm_LC" : "#0000CD",
  "Imm_Macrophage" : "#EEEE00",
  "Imm_NK": "#9ACD32",
  "Imm_T cell": "#1874CD",
  "KC Basal" : "#FF6A6A",
  "KC Cornified" : "#8B3A62",
  "KC Differentiating" : "#ab82ff",
  "KC Hair" : "#FF0000",
  "Melanocytes" : "#8B4513",
  "KC Cancer" : "black",  # Darker shade of "saddlebrown"
 'KC Granular':'#008941',
'Imm_Plasma' : '#4a3b53',
'Imm_B cell' : '#ffaa92',
'KC IFN' : '#7b4f4b', "Imm_Treg" : "#00B2EE","Monocytes":"#6a3a4c","KC Dysplastic": "#dac0eb","Ambiguous":"grey"

}

# Whole-object UMAP coloured by 'Level2_Cancer' with the custom palette.
# show=False keeps the figure open so that tight_layout() below is applied to this plot;
# with show=True the plot is rendered first and pyplot then lays out a new, empty figure.
sc.pl.umap(
    adata,
    color='Level2_Cancer',
    palette=level2_color_dict,
    show=False,
)

# Optimize layout, then render.
plt.tight_layout()
plt.show()


In [None]:
sc.pl.umap(
    adata,  # Your AnnData object
    color=['Level1_Final'],  # Assuming 'Level3' is the categorical variable you want to color by
    show=True,  # Set to False if you don't want to display the plot immediately
)

In [None]:
sc.pl.umap(
    adata,  # Your AnnData object
    color=['Level1_Final','Level2_Cancer','Level3_Cancer'],  # Assuming 'Level3' is the categorical variable you want to color by
    show=True,  # Set to False if you don't want to display the plot immediately
)

In [None]:
# Copy selected Level3 labels up to Level2 so that both annotation levels agree.
# Each label is registered as a category first: assigning a value that is not an existing
# category of a categorical column raises an error ('KC Cancer' was previously unguarded).
for label in ('Imm_B cell', 'KC Cancer', 'KC IFN'):
    if label not in adata.obs['Level2_Cancer'].cat.categories:
        adata.obs['Level2_Cancer'] = adata.obs['Level2_Cancer'].cat.add_categories([label])
    adata.obs.loc[adata.obs['Level3_Cancer'] == label, 'Level2_Cancer'] = label

# Count cells per (Level1, Level2, Level3) combination. observed=True keeps only the
# combinations present in the data rather than the full cross-product of categories.
tally = (
    adata.obs
    .groupby(['Level1_Final', 'Level2_Cancer', 'Level3_Cancer'], observed=True)
    .size()
    .reset_index(name='count')
)
# Kept for downstream cells that use this name; also guards the non-categorical case.
tally_positive = tally[tally['count'] > 0]
# Display the non-empty combinations.
print(tally_positive)


In [None]:
#adata.obs[adata.obs['Level1_Final'] == 'Ambiguous']
# Filter columns that start with 'Level'
level_columns = [col for col in adata.obs.columns if col.startswith('Level')]

# Display rows where 'Level1_Final' is 'Ambiguous' and only the selected columns
adata.obs.loc[adata.obs['Level1_Final'] == 'Ambiguous', level_columns]


In [None]:

# Remove specified columns from adata.obs
#obs_to_remove = ['Level3_final', 'Level2_Final']
#adata.obs = adata.obs.drop(columns=obs_to_remove)

# Drop stored colour palettes of annotation columns that are no longer used.
uns_to_remove = ['Level1_unnamed_colors', 'Level3_final_colors', 'Level2_Final_colors']
for key in uns_to_remove:
    # Only touch keys that are actually stored.
    if key not in adata.uns:
        continue
    del adata.uns[key]


adata

In [None]:
adata.write('SCC_final_object_Nov1.h5ad')
#idata_subset.write('SCC_final_object_Nov1_KCs.h5ad')
adata.obs.to_csv('SCC_final_object_Nov1_metadata.txt',sep="\t")

In [None]:
# Read the cell-cycle gene list, one gene per line. The context manager closes the file
# handle, which the previous bare open() never did.
with open('data/regev_lab_cell_cycle_genes.txt') as gene_file:
    cell_cycle_genes = [line.strip() for line in gene_file]

# The first 43 entries are used as S-phase genes, the remainder as G2/M genes.
s_genes = cell_cycle_genes[:43]
g2m_genes = cell_cycle_genes[43:]

# Genes from the list that are present in this dataset.
# NOTE(review): s_genes and g2m_genes are sliced before this filter, so they may contain
# genes missing from idata.var_names — confirm this is intended.
cell_cycle_genes = [x for x in cell_cycle_genes if x in idata.var_names]

In [None]:
# Score S and G2/M programmes per cell. The UMAP below colours by the "phase" column,
# which this call is expected to add to idata.obs.
sc.tl.score_genes_cell_cycle(idata, s_genes=s_genes, g2m_genes=g2m_genes)

# `color_map=cmp()` was removed: `cmp` is not a Python 3 builtin and is not defined in the
# visible part of this notebook. scanpy documents color_map as applying to continuous
# variables, so it should not affect a categorical column such as "phase".
sc.pl.umap(
    idata,
    color="phase",
)

In [None]:

sc.pl.umap(
    idata,
    color="phase",
)

In [None]:
sc.pl.umap(idata, color=['sample_ID_corrected'], ncols=3)

In [None]:
# Group by 'cancer' and 'sample_ID_corrected', then count the values
grouped_counts = idata.obs.groupby(['cancer', 'sample_ID_corrected']).size()

# Convert the result to a DataFrame for easier viewing if needed
grouped_counts = grouped_counts.reset_index(name='count')

# Display the grouped counts
print(grouped_counts)

# IEC vs SCC

In [None]:
# NOTE(review): from here on `adata` is an independent copy of `idata`; the object that
# `adata` referred to in earlier cells is no longer reachable under this name.
adata=idata.copy()

In [None]:
# Check the distribution of cell types across diagnoses
cell_type_distribution = adata.obs.groupby(['diagnosis_corrected', 'cell_type_PP']).size()
print(cell_type_distribution)


In [None]:
# Filter for IEC and SCC diagnoses
filtered_adata = adata[adata.obs['diagnosis_corrected'].isin(['IEC', 'SCC']), :]

# Check the filtered data shape
print("Filtered data shape:", filtered_adata.shape)

# Verify available cell types in the filtered data
print("Available cell types in filtered data:", filtered_adata.obs['cell_type_PP'].unique())


In [None]:
import pandas as pd
import numpy as np
import scanpy as sc

# Differential expression between IEC and SCC within a single keratinocyte subtype.
#
# The previous version ranked genes for "KC Basal" against the other cell types
# (groupby='cell_type_PP') and then copied the same statistics into both the *_IEC and
# *_SCC columns. Each table entry is a scalar, so the `isinstance(x, np.ndarray)` branch
# never fired and no per-diagnosis result existed. Here the test is run per diagnosis.
cell_type = "KC Basal"
diagnoses = ["IEC", "SCC"]

# Restrict to the chosen cell type and to the diagnoses being compared, then copy so the
# DE results are stored on an independent object rather than on a view.
# Note that `adata.uns['rank_genes_groups']` is therefore no longer written by this cell.
de_mask = (adata.obs['cell_type_PP'] == cell_type) & adata.obs['diagnosis_corrected'].isin(diagnoses)
adata_de = adata[de_mask].copy()

# Each diagnosis is ranked against the remaining cells of the subset (default reference).
sc.tl.rank_genes_groups(adata_de, groupby='diagnosis_corrected', groups=diagnoses, method='t-test')

# Raw results, kept under the same variable name as before.
results = adata_de.uns['rank_genes_groups']

# One tidy table per diagnosis, indexed by gene, with the diagnosis as a column suffix.
per_diagnosis = []
for diagnosis in diagnoses:
    de_table = (
        sc.get.rank_genes_groups_df(adata_de, group=diagnosis)
        .rename(columns={'names': 'gene'})
        .set_index('gene')[['logfoldchanges', 'pvals', 'pvals_adj']]
        .add_suffix(f'_{diagnosis}')
    )
    per_diagnosis.append(de_table)

# Align the per-diagnosis tables on gene name (assumes gene names are unique).
specific_results = pd.concat(per_diagnosis, axis=1).rename_axis('gene').reset_index()
specific_results.insert(1, 'cell_type', cell_type)

# Same column layout as before, so downstream cells keep working.
specific_results = specific_results[[
    'gene', 'cell_type',
    'logfoldchanges_IEC', 'logfoldchanges_SCC',
    'pvals_IEC', 'pvals_SCC',
    'pvals_adj_IEC', 'pvals_adj_SCC',
]]

# Check data types
print(specific_results.dtypes)

# Print the resulting table
print(specific_results)


In [None]:
import scanpy as sc
import pandas as pd

# Requires `specific_results` from the previous cell, with columns 'gene',
# 'pvals_adj_IEC' and 'pvals_adj_SCC'.

# The 10 genes with the smallest adjusted p-value in the IEC column
top_genes_IEC = specific_results.nsmallest(10, 'pvals_adj_IEC')['gene'].tolist()

# The 10 genes with the smallest adjusted p-value in the SCC column
top_genes_SCC = specific_results.nsmallest(10, 'pvals_adj_SCC')['gene'].tolist()

# One labelled gene group per diagnosis for the dot plot
genes_to_plot = {
    "IEC": top_genes_IEC,
    "SCC": top_genes_SCC,
}

# Dot plot of the selected genes.
# NOTE(review): rows are grouped by 'cell_type_PP', not by diagnosis — confirm this is intended.
sc.pl.dotplot(
    adata,  # object the expression values are read from
    genes_to_plot,
    standard_scale="var",  # scale each gene (variable) across groups
    color_map="Blues",     # colour map of the dots
    groupby="cell_type_PP"  # one row per cell type
)


In [None]:
# A bare expression that is not the last statement of a cell produces no output.
idata
# Number of cells and genes before .raw is removed.
print(idata.n_obs, idata.n_vars)
# Remove the stored .raw layer from idata.
del idata.raw #.to_adata()


In [None]:
idata.obs['cell_type_PP'].unique()

In [None]:
# Boolean masks over genes: mitochondrial (MT-) and ribosomal (RPS*/RPL*) by name prefix.
mito_genes = idata.var_names.str.startswith('MT-')
ribo = idata.var_names.str.startswith(("RPS", "RPL"))

# A gene is removed if it matches either mask; everything else is kept.
remove = mito_genes | ribo
keep = ~remove

# Gene-filtered view of idata.
# NOTE(review): the differential-expression cells below read from `idata`, not from this
# filtered `adata` — confirm which object is meant to be used.
adata = idata[:, keep]

print(adata.n_obs, adata.n_vars)

In [None]:
idata = idata[idata.obs['diagnosis_corrected'].isin(["IEC", "SCC"])].copy()

In [None]:
# Differential expression between diagnoses within "KC Basal" cells (Wilcoxon, top 200 genes).
adata_KC_Basal = idata[idata.obs['cell_type_PP'] == "KC Basal"].copy()
sc.tl.rank_genes_groups(adata_KC_Basal, 'diagnosis_corrected', method='wilcoxon', n_genes=200)
# NOTE(review): the dot plot below is not given key='rank_genes_groups_filtered', so it
# presumably shows the unfiltered ranking — confirm.
sc.tl.filter_rank_genes_groups(adata_KC_Basal)

# plt.rcParams is the same global settings object as matplotlib.rcParams. Using it removes
# the dependency on `from matplotlib import rcParams`, which only appears in a later cell
# and would make this cell fail on a fresh top-to-bottom run.
plt.rcParams['figure.figsize'] = 4,4
plt.rcParams['axes.grid'] = True

axs = sc.pl.rank_genes_groups_dotplot(adata_KC_Basal, n_genes=20, groups=['IEC', 'SCC'])


In [None]:
print("IEC: ",adata_KC_Basal.uns['rank_genes_groups']['names']['IEC'].tolist()[:100])
print("SCC: ",adata_KC_Basal.uns['rank_genes_groups']['names']['SCC'].tolist()[:100])

In [None]:
# Differential expression between diagnoses within "KC Differentiating" cells.
from matplotlib import rcParams

is_kc_diff = idata.obs['cell_type_PP'] == "KC Differentiating"
adata_KC_Diff = idata[is_kc_diff].copy()

# Wilcoxon ranking (top 200 genes per diagnosis), followed by scanpy's default filter.
sc.tl.rank_genes_groups(adata_KC_Diff, 'diagnosis_corrected', method='wilcoxon', n_genes=200)
sc.tl.filter_rank_genes_groups(adata_KC_Diff)

# Global matplotlib settings: small square figures with a grid.
rcParams.update({'figure.figsize': (4, 4), 'axes.grid': True})

axs = sc.pl.rank_genes_groups_dotplot(
    adata_KC_Diff,
    n_genes=20,
    groups=['IEC', 'SCC'],
)


In [None]:
print("IEC: ",adata_KC_Diff.uns['rank_genes_groups']['names']['IEC'].tolist()[:100])
print("SCC: ",adata_KC_Diff.uns['rank_genes_groups']['names']['SCC'].tolist()[:100])

In [None]:
# (adata_KC_Basal.var_names.str.startswith('MT-')).sum()
# sc.tl.filter_rank_genes_groups(adata_KC_Basal)
# adata_KC_Basal.uns["rank_genes_groups_filtered"]["names"]
sc.get.rank_genes_groups_df(adata_KC_Basal, group='SCC')

In [None]:
# Differential expression between diagnoses within "KC Cornified" cells.
is_kc_cornified = idata.obs['cell_type_PP'] == "KC Cornified"
adata_KC_cornified = idata[is_kc_cornified].copy()

# Wilcoxon ranking (top 200 genes per diagnosis), followed by scanpy's default filter.
sc.tl.rank_genes_groups(adata_KC_cornified, 'diagnosis_corrected', method='wilcoxon', n_genes=200)
sc.tl.filter_rank_genes_groups(adata_KC_cornified)

# Global matplotlib settings: small square figures with a grid.
rcParams.update({'figure.figsize': (4, 4), 'axes.grid': True})

axs = sc.pl.rank_genes_groups_dotplot(
    adata_KC_cornified,
    n_genes=20,
    groups=['IEC', 'SCC'],
)


In [None]:
print("IEC: ",adata_KC_cornified.uns['rank_genes_groups']['names']['IEC'].tolist()[:100])
print("SCC: ",adata_KC_cornified.uns['rank_genes_groups']['names']['SCC'].tolist()[:100])

In [None]:
# Differential expression between diagnoses within "KC Hair" cells.
is_kc_hair = idata.obs['cell_type_PP'] == "KC Hair"
adata_KC_hair = idata[is_kc_hair].copy()

# Wilcoxon ranking (top 200 genes per diagnosis), followed by scanpy's default filter.
sc.tl.rank_genes_groups(adata_KC_hair, 'diagnosis_corrected', method='wilcoxon', n_genes=200)
sc.tl.filter_rank_genes_groups(adata_KC_hair)

# Global matplotlib settings: small square figures with a grid.
rcParams.update({'figure.figsize': (4, 4), 'axes.grid': True})

axs = sc.pl.rank_genes_groups_dotplot(
    adata_KC_hair,
    n_genes=20,
    groups=['IEC', 'SCC'],
)


In [None]:
print("IEC: ",adata_KC_hair.uns['rank_genes_groups']['names']['IEC'].tolist()[:100])
print("SCC: ",adata_KC_hair.uns['rank_genes_groups']['names']['SCC'].tolist()[:100])

In [None]:
# Differential expression between diagnoses within "KC Cancer" cells.
# Unlike the other subtypes, these cells are selected via 'Level2_Cancer'.
is_kc_cancer = idata.obs['Level2_Cancer'] == "KC Cancer"
adata_KC_cancer = idata[is_kc_cancer].copy()

# Wilcoxon ranking (top 200 genes per diagnosis), followed by scanpy's default filter.
sc.tl.rank_genes_groups(adata_KC_cancer, 'diagnosis_corrected', method='wilcoxon', n_genes=200)
sc.tl.filter_rank_genes_groups(adata_KC_cancer)

# Global matplotlib settings: small square figures with a grid.
rcParams.update({'figure.figsize': (4, 4), 'axes.grid': True})

axs = sc.pl.rank_genes_groups_dotplot(
    adata_KC_cancer,
    n_genes=20,
    groups=['IEC', 'SCC'],
)


In [None]:
print("IEC: ",adata_KC_cancer.uns['rank_genes_groups']['names']['IEC'].tolist()[:100])
print("SCC: ",adata_KC_cancer.uns['rank_genes_groups']['names']['SCC'].tolist()[:100])

In [None]:
# Collect the five top-ranked genes per diagnosis from each per-subtype DE result.
def _top5(de_adata, diagnosis):
    """Return the first five ranked gene names stored for `diagnosis` in `de_adata`."""
    ranked_names = de_adata.uns['rank_genes_groups']['names'][diagnosis]
    return ranked_names.tolist()[:5]


IEC_Cancer, SCC_Cancer = _top5(adata_KC_cancer, 'IEC'), _top5(adata_KC_cancer, 'SCC')
IEC_Basal, SCC_Basal = _top5(adata_KC_Basal, 'IEC'), _top5(adata_KC_Basal, 'SCC')
IEC_corn, SCC_corn = _top5(adata_KC_cornified, 'IEC'), _top5(adata_KC_cornified, 'SCC')
IEC_diff, SCC_diff = _top5(adata_KC_Diff, 'IEC'), _top5(adata_KC_Diff, 'SCC')
IEC_hair, SCC_hair = _top5(adata_KC_hair, 'IEC'), _top5(adata_KC_hair, 'SCC')

# One labelled gene group per keratinocyte subtype: IEC genes first, then SCC genes.
genes_by_subtype = {
    "Cancer": IEC_Cancer + SCC_Cancer,
    "Basal": IEC_Basal + SCC_Basal,
    "Differentiating": IEC_diff + SCC_diff,
    "Cornified": IEC_corn + SCC_corn,
    "Hair": IEC_hair + SCC_hair,
}

# Dot plot over all cells in idata, one row per diagnosis.
sc.pl.dotplot(
    idata,
    genes_by_subtype,
    groupby='diagnosis_corrected',
    show=True,
)


In [None]:
import scanpy as sc
from matplotlib import rcParams

# IEC-vs-SCC differential expression for every cell type in `idata`, followed by one dot
# plot per cell type showing that cell type's own top genes.
diagnoses = ['IEC', 'SCC']

# Unique cell types; missing labels are dropped because they cannot be selected with `==`.
cell_types = idata.obs['cell_type_PP'].dropna().unique()

# cell type -> rank_genes_groups result
results = {}

for cell_type in cell_types:
    print(f'Analyzing cell type: {cell_type}')

    # Subset the data for the current cell type
    adata_subset = idata[idata.obs['cell_type_PP'] == cell_type].copy()

    # Both diagnoses need at least two cells: the test cannot be computed for a group
    # that is absent or has a single cell.
    group_sizes = adata_subset.obs['diagnosis_corrected'].value_counts()
    if any(group_sizes.get(diagnosis, 0) < 2 for diagnosis in diagnoses):
        print(f'Skipping {cell_type}: fewer than 2 cells in IEC or SCC')
        continue

    # Run differential expression analysis for IEC vs SCC
    sc.tl.rank_genes_groups(adata_subset, groupby='diagnosis_corrected', method='wilcoxon', groups=diagnoses, n_genes=200)

    # Store results for the current cell type
    results[cell_type] = adata_subset.uns['rank_genes_groups']

# Top 5 genes per diagnosis, kept per cell type. Previously the plotting loop reused
# whatever top_genes_IEC / top_genes_SCC held after the last iteration of this loop, so
# every dot plot showed the same genes.
top_genes_by_cell_type = {}
for cell_type, result in results.items():
    top_genes_IEC = result['names']['IEC'][:5]  # Top 5 genes for IEC
    top_genes_SCC = result['names']['SCC'][:5]  # Top 5 genes for SCC
    top_genes_by_cell_type[cell_type] = list(top_genes_IEC) + list(top_genes_SCC)
    print(f'Top genes for {cell_type} - IEC: {top_genes_IEC}, SCC: {top_genes_SCC}')

# One dot plot per cell type, titled with the cell type its genes were derived from.
for cell_type, top_genes in top_genes_by_cell_type.items():
    sc.pl.dotplot(
        idata,
        var_names=top_genes,
        groupby='diagnosis_corrected',
        title=str(cell_type),
        show=True
    )


In [None]:
sc.tl.rank_genes_groups(adata_KC_Basal, 'diagnosis_corrected', method='wilcoxon', groups =['SCC'], reference='IEC')
sc.pl.rank_genes_groups(adata_KC_Basal, groups =['SCC'], n_genes=20)
adata_KC_Basal.uns['rank_genes_groups']['names']['SCC'].tolist()

# <a id='section3'></a> Prep whole object for label transfer

In [None]:
# Load the frozen whole-object SCC dataset. read_h5ad is the explicit reader already used
# elsewhere in this notebook; anndata.read is a legacy alias for it.
scc = anndata.read_h5ad("/QRISdata/Q2051/SCC_Paper/resources/data/frozen_objects_Nov2024_PP/old_versions/SCC_final_object_Nov4.h5ad")

In [None]:
scc.obs['Level2'].unique().to_list()

In [None]:
# Harmonise Level2 labels: capitalise 'cell', drop the 'Imm_' prefix, and expand
# 'Endothelial' to 'Endothelial Cell'.
scc.obs['Level2'] = (
    scc.obs['Level2']
    .str.replace('cell', 'Cell', regex=False)
    .str.replace('Imm_', '', regex=False)
    # The negative lookahead skips labels that already read 'Endothelial Cell'. Re-running
    # the cell (or an input of 'Endothelial cell') therefore no longer yields
    # 'Endothelial Cell Cell'.
    .str.replace(r'Endothelial(?! Cell)', 'Endothelial Cell', regex=True)
)


level2_color_dict = {
    'Endothelial Cell':"#FFA500", 
    "Fibroblast": "#458B00", 
    "DC" : "#5F9EA0",
    "LC" : "#0000CD",
    "Macrophage" : "#EEEE00",
    "NK": "#9ACD32",
    "T Cell": "#1874CD",
    "KC Basal" : "#FF6A6A",
    "KC Cornified" : "#8B3A62",
    "KC Differentiating" : "#ab82ff",
    "KC Hair" : "#FF0000",
    "Melanocytes" : "#8B4513",
   'Plasma' : '#4a3b53',
   'B Cell' : '#ffaa92',
   'KC IFN' : '#7b4f4b', 
   "Treg" : "#00B2EE",
   "Monocytes":"#6a3a4c",
   "KC Dysplastic": "#dac0eb",
   "Ambiguous":"grey"
}

sc.pl.umap(
    scc,  # Your AnnData object
    color='Level2',  # Assuming 'Level3' is the categorical variable you want to color by
    palette=level2_color_dict,  # Use your color dictionary
    show=True,  # Set to False if you don't want to display the plot immediately
)


In [None]:
scc.obs['Level3'] = scc.obs['Level3'].str.replace('cell', 'Cell')
# Colour per Level3 cell type (labels use the capitalised 'Cell' spelling).
# "Imm_Treg" used to be listed twice ("#00B2EE", then '#8fb0ff'). A dict literal keeps
# only the last value, so the earlier entry was dead. It is removed here and the
# effective colour ('#8fb0ff') is unchanged.
level3_color_dict = {
    "Endothelial Cell": "#FFA500",
    "Fibroblast": "#458B00",
    "Imm_DC": "#5F9EA0",
    "Imm_LC": "#0000CD",
    "Imm_Macrophage": "#EEEE00",
    "Imm_NK": "#9ACD32",
    "Imm_T Cell": "#1874CD",
    # "pDC": "#8A2BE2",
    "KC Basal": "#FF6A6A",
    "KC Cornified": "#8B3A62",
    "KC Differentiating": "#ab82ff",
    "KC Hair": "#FF0000",
    "Melanocyte": "#8B4513",
    "KC Cancer": "black",
    "KC Granular": "#008941",
    "Imm_CD8Tem": "#7a4900",
    "Imm_Treg": "#8fb0ff",
    "Imm_CD4Tcm": "#997d87",
    "Imm_CX3CR1+ Mono/Mac": "#5a0007",
    "Imm_mRegDC": "#809693",
    "Imm_CD14+ Mono": "#6a3a4c",
    "Imm_CD16- NK": "#1b4400",
    "Imm_Plasma": "#4a3b53",
    "Imm_LC KI67+": "#ff2f80",
    "Imm_CD16+ NK": "#61615a",
    "Imm_NKT": "#ba0900",
    "Imm_TREM2+ Mac": "#00c2a0",
    "Imm_B Cell": "#ffaa92",
    "Imm_CD169+ Mac": "#000035",
    "KC IFN": "#7b4f4b",
    "Imm_IFN+ Mac": "#a1c299",
    "Imm_PD-1+CTLA4+ CD8Tcm": "#300018",
    "Imm_PD-1+ CD8Tem": "#0aa6d8",
    "Imm_DC1/DC2": "#00846f",
    "fibroblast_0": "#ddefff",
    "fibroblast_3": "#372101",
    "fibroblast_2": "#6b7900",
    "fibroblast_1": "#ff90c9",
    "Endothelial_0": "#3b5dff",
    "Endothelial_1": "#b903aa",
    "Endothelial_2": "#d16100",
    "Endothelial_3": "#013349",
    "melanocyte_0": "#a30059",
    "melanocyte_1": "#4fc601",
    "melanocyte_2": "#004d43",
    "Ambiguous": "grey",
    "KC Dysplastic": "#dac0eb",
}

sc.pl.umap(
    scc,  # Your AnnData object
    color='Level3',  # Assuming 'Level3' is the categorical variable you want to color by
    palette=level3_color_dict,  # Use your color dictionary
    show=True,  # Set to False if you don't want to display the plot immediately
)


In [None]:
sc.pl.umap(
    scc,  # Your AnnData object
    color='Level3',  # Assuming 'Level3' is the categorical variable you want to color by
    show=True,  # Set to False if you don't want to display the plot immediately
)

In [None]:
#scc.obs['Level2'] = scc.obs['Level2'].str.replace('cell', 'Cell').str.replace('Imm_', '').str.replace('Endothelial', 'Endothelial Cell')
scc.obs['Level3'].unique().tolist()

In [None]:
scc.write('/QRISdata/Q2051/SCC_Paper/resources/data/frozen_objects_Nov2024_PP/SCC_final_object_Nov10.h5ad')


In [None]:

sc.pl.umap(
    scc,  # Your AnnData object
    color='sample_ID',  # Assuming 'Level3' is the categorical variable you want to color by
    show=True,  # Set to False if you don't want to display the plot immediately
)

In [None]:
def split_umap(adata, split_by, ncol=2, nrow=None, **kwargs):
    """Draw one UMAP panel per value of ``adata.obs[split_by]`` on a shared grid.

    Parameters
    ----------
    adata
        Object with a computed UMAP embedding.
    split_by
        Name of the ``.obs`` column whose values define the panels. A non-categorical
        column is converted to categorical on the fly.
    ncol
        Number of panel columns.
    nrow
        Number of panel rows. Defaults to as many as needed for all panels.
    **kwargs
        Passed through to ``sc.pl.umap`` (for example ``color``, ``size``).

    Raises
    ------
    ValueError
        If an explicit ``nrow`` gives a grid with fewer cells than there are panels.
    """
    groups = adata.obs[split_by]
    # astype('category') leaves an already-categorical column (and its order) unchanged.
    all_categories = groups.astype('category').cat.categories
    # Skip categories without cells; there is nothing to draw for them.
    categories = [cat for cat in all_categories if (groups == cat).any()]

    if nrow is None:
        # max(1, ...) keeps the grid valid when there are no categories at all.
        nrow = max(1, int(np.ceil(len(categories) / ncol)))
    if nrow * ncol < len(categories):
        raise ValueError(
            f"Grid of {nrow}x{ncol} is too small for {len(categories)} categories of '{split_by}'"
        )

    # squeeze=False always returns a 2-D array of axes, so flatten() also works for a
    # 1x1 grid (where plt.subplots would otherwise return a single Axes object).
    fig, axs = plt.subplots(nrow, ncol, figsize=(5*ncol, 4*nrow), squeeze=False)
    axs = axs.flatten()
    for ax, cat in zip(axs, categories):
        sc.pl.umap(adata[groups == cat], ax=ax, show=False, title=str(cat), **kwargs)
    # Hide grid cells that did not receive a panel.
    for ax in axs[len(categories):]:
        ax.axis('off')
    plt.tight_layout()
split_umap(scc, color = ['Level2'], split_by='sample_ID',legend_loc = "on data")


In [None]:
# Checking the row names of your data
print(idata_subset.obs)


# <a id='section8'></a>  KC Cancer

In [None]:
idata=anndata.read_h5ad("/QRISdata/Q4386/skin_atlas/SCC_BCC/SCC_KC_only_final_object_21Nov.h5ad")
idata #Level3_final

In [None]:
idata.obs['UMAP_color'].unique()

In [None]:
from matplotlib import rcParams

# Differential expression by cancer status within "KC Dysplastic" cells.
# .copy() makes this an independent object, consistent with the other DE cells, so the
# DE results are not written into a view of `idata`.
dysplastic_adata = idata[idata.obs['Level3_final'] == 'KC Dysplastic'].copy()
sc.tl.rank_genes_groups(dysplastic_adata, 'cancer_status_corrected', method='wilcoxon', n_genes=200)
# NOTE(review): the dot plot below is not given key='rank_genes_groups_filtered', so it
# presumably shows the unfiltered ranking — confirm.
sc.tl.filter_rank_genes_groups(dysplastic_adata)
rcParams['figure.figsize'] = 4,4
rcParams['axes.grid'] = True

axs = sc.pl.rank_genes_groups_dotplot(dysplastic_adata, n_genes=20, groups=['Cancer', 'Normal'])


In [None]:
axs = sc.pl.rank_genes_groups_dotplot(dysplastic_adata, n_genes=100, groups=['Cancer', 'Normal'])


In [None]:
print("Cancer: ",dysplastic_adata.uns['rank_genes_groups']['names']['Cancer'].tolist()[:100])
print("Normal: ",dysplastic_adata.uns['rank_genes_groups']['names']['Normal'].tolist()[:100])

In [None]:
# Clear the stored .raw layer.
idata.raw = None

# Boolean masks over genes: mitochondrial (MT-) and ribosomal (RPS*/RPL*) by name prefix.
mito_genes = idata.var_names.str.startswith('MT-')
ribo = idata.var_names.str.startswith(("RPS", "RPL"))

# A gene is removed if it matches either mask; everything else is kept.
remove = mito_genes | ribo
keep = ~remove

# Gene-filtered view of idata.
adata = idata[:, keep]

# NOTE(review): this prints the dimensions of the unfiltered `idata`, not of the filtered
# `adata` built above — confirm which one is intended.
print(idata.n_obs, idata.n_vars)

In [None]:
from matplotlib import rcParams

sc.tl.rank_genes_groups(idata, 'cancer_status_corrected', method='wilcoxon', n_genes=200)
sc.tl.filter_rank_genes_groups(idata)
rcParams['figure.figsize'] = 4,4
rcParams['axes.grid'] = True
#sc.pl.rank_genes_groups(adata_KC_Basal, key='rank_genes_groups_filtered', ncols=3)

axs = sc.pl.rank_genes_groups_dotplot(idata, n_genes=20, groups=['Cancer', 'Normal'])

print("Cancer: ",idata.uns['rank_genes_groups']['names']['Cancer'].tolist()[:100])
print("Normal: ",idata.uns['rank_genes_groups']['names']['Normal'].tolist()[:100])

In [None]:
# genes with raw counts as input

In [None]:
## SCC genes from literature
def split_umap(adata, split_by, ncol=2, nrow=None, **kwargs):
    """Draw one UMAP panel per category of ``adata.obs[split_by]``.

    Parameters
    ----------
    adata
        Object passed to ``sc.pl.umap``; it is subset to the cells of each category in turn.
    split_by
        Name of a categorical column in ``adata.obs`` (its ``.cat.categories`` are used).
    ncol
        Number of panel columns in the figure grid.
    nrow
        Number of panel rows; derived from the number of categories when ``None``.
    **kwargs
        Forwarded unchanged to ``sc.pl.umap`` (e.g. ``color``, ``size``, ``legend_loc``).

    Returns
    -------
    None. The figure is left as the current matplotlib figure.
    """
    categories = adata.obs[split_by].cat.categories
    if nrow is None:
        nrow = int(np.ceil(len(categories) / ncol))
    # squeeze=False always returns a 2-D array of Axes, so flatten() also works
    # when the grid collapses to a single panel (plain subplots() returns a bare Axes then).
    fig, axs = plt.subplots(nrow, ncol, figsize=(5*ncol, 4*nrow), squeeze=False)
    axs = axs.flatten()
    for i, cat in enumerate(categories):
        ax = axs[i]
        sc.pl.umap(adata[adata.obs[split_by] == cat], ax=ax, show=False, title=cat, **kwargs)
    # Blank out grid cells that have no category (e.g. 3 categories on a 2x2 grid)
    for spare_ax in axs[len(categories):]:
        spare_ax.axis("off")
    plt.tight_layout()


import scanpy as sc

# Gene sets to score on idata: the top-100 'Cancer' genes from the dysplastic-only ranking,
# the top-100 'Cancer' genes from the ranking stored on idata, and a fixed TSK gene list.
gene_sets = dict(
    Dys_Cancer=dysplastic_adata.uns['rank_genes_groups']['names']['Cancer'].tolist()[:100],
    all_Cancer=idata.uns['rank_genes_groups']['names']['Cancer'].tolist()[:100],
    TSK=[
        'PTHLH','FEZ1','INHBA','MMP1','SERPINE2','IGFBP6','TNC','ITGA5','SPINK6','TMSB10','KRT18',
        'FST','ODC1','ITGA6','SORL1','EMP3','EREG','TIMP3','LGALS1','TNFRSF12A','ATP1B1','MBOAT2',
        'F3','ANXA1','KRT17','PHLDA1','LTBP1','CAPN2','IGFBP2','COL17A1','AREG','ARPC1B','GLIPR1',
        'S100A6','CAV1','ADIRF','RBP1','UPP1','SDC1','GJB6','SLC7A8','RAB32','FXYD5','FSCN1',
    ],
)

# One module score per gene set, written to idata.obs under the set's name
for name, genes in gene_sets.items():
    sc.tl.score_genes(idata, gene_list=genes, score_name=name)

# One split UMAP figure per score, panels split by cancer status
for name in gene_sets:
    split_umap(idata, split_by='cancer_status_corrected', color=[name], size=5)

#split_umap(scc, color = ['InferCNV_and_CopyKAT_aneuploid'], split_by='cancer_status',legend_loc = "right margin")


In [None]:
idata.obs['KC_cancer_all_DE'] 

In [None]:
# Candidate calls: a cell must be in the 'overlap' aneuploid group AND have an
# all_Cancer module score above the cut-off (1, 1.25, 0.5 or 0.984).
condition = (idata.obs['Aneuploid_combined_clusters'] == 'overlap') & (idata.obs['all_Cancer'] > 1)
condition25 = (idata.obs['Aneuploid_combined_clusters'] == 'overlap') & (idata.obs['all_Cancer'] > 1.25)
condition05 = (idata.obs['Aneuploid_combined_clusters'] == 'overlap') & (idata.obs['all_Cancer'] > 0.5)
condition98 = (idata.obs['Aneuploid_combined_clusters'] == 'overlap') & (idata.obs['all_Cancer'] > 0.984)

# Label column written for each cut-off, in the order they are plotted
label_masks = [
    ('KC_cancer_all_DE', condition),
    ('KC_cancer_all_DE_mod125', condition25),
    ('KC_cancer_all_DE_mod05', condition05),
    ('KC_cancer_all_DE_mod98', condition98),
]
for column, mask in label_masks:
    # 'Cancer' where the condition holds, 'Normal' everywhere else
    idata.obs[column] = ['Cancer' if hit else 'Normal' for hit in mask]

# Colour per label
color_map = {'Cancer': 'red', 'Normal': 'grey'}

# One UMAP panel per cut-off
sc.pl.umap(idata, color=[column for column, _ in label_masks], palette=color_map, size=5)


# Plot UMAP with the new color assignment
#sc.pl.umap(idata, color='UMAP_color', title='UMAP Highlighting Overlap & Cancer > 1.5', frameon=False)


In [None]:
# Number of cells labelled 'Cancer' under each module-score cut-off
cancer_count = int((idata.obs['KC_cancer_all_DE'] == 'Cancer').sum())
print(f"Number of cells marked as 'Cancer' mod>1: {cancer_count}")

cancer_count25 = int((idata.obs['KC_cancer_all_DE_mod125'] == 'Cancer').sum())
cancer_count05 = int((idata.obs['KC_cancer_all_DE_mod05'] == 'Cancer').sum())
cancer_count98 = int((idata.obs['KC_cancer_all_DE_mod98'] == 'Cancer').sum())

print(f"Number of cells marked as 'Cancer' mod>1.25: {cancer_count25}")
print(f"Number of cells marked as 'Cancer' mod>0.5: {cancer_count05}")
print(f"Number of cells marked as 'Cancer' mod>0.98: {cancer_count98}")



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Extract the 'all_Cancer' data from adata.obs
all_cancer_values = idata.obs['all_Cancer']

# Plot a histogram using seaborn for better visualization
plt.figure(figsize=(8, 5))
sns.histplot(all_cancer_values, kde=False, bins=30, color='blue', edgecolor='black')

# Add titles and labels
plt.title('Histogram of all_Cancer', fontsize=14)
plt.xlabel('all_Cancer', fontsize=12)
plt.ylabel('Frequency', fontsize=12)

# Show the plot
plt.tight_layout()
plt.show()


In [None]:
from scipy.stats import median_abs_deviation

# Assuming `adata.obs['all_Cancer']` is a pandas Series or numpy array
data = idata.obs['all_Cancer'].values  # Extracting values as a numpy array
mad_value = median_abs_deviation(data, nan_policy='omit')  # Calculate MAD, ignoring NaNs
print(f"MAD for all_Cancer: {mad_value}")


In [None]:
from scipy.stats import median_abs_deviation
import numpy as np

# all_Cancer module score per cell. Read it from `idata`, the object the score was
# written to by sc.tl.score_genes and the one every neighbouring cell uses.
data = idata.obs['all_Cancer'].values

# NaN-aware median, consistent with nan_policy='omit' in the MAD call below
median_value = np.nanmedian(data)

# Upper tail: scores strictly above the median (NaNs compare False and drop out)
upper_tail_data = data[data > median_value]

# Median absolute deviation of the upper tail
mad_upper_tail = median_abs_deviation(upper_tail_data, nan_policy='omit')

# Three times that MAD
three_mad_upper_tail = 3 * mad_upper_tail

print(f"3MAD for the upper tail of all_Cancer: {three_mad_upper_tail}")


In [None]:
import numpy as np
# all_Cancer module score for every cell in idata
all_cancer_values = idata.obs['all_Cancer']
# 90th, 95th, 99th and 99.5th percentiles of the score, computed in one call
percentile_90, percentile_95, percentile_99, percentile_995 = np.percentile(
    all_cancer_values, [90, 95, 99, 99.5]
)

# Report each percentile
print(f"90th Percentile of 'all_Cancer': {percentile_90}")
print(f"95th Percentile of 'all_Cancer': {percentile_95}")
print(f"99th Percentile of 'all_Cancer': {percentile_99}")
print(f"99.5th Percentile of 'all_Cancer': {percentile_995}")

In [None]:
# Define the condition
condition90p = (idata.obs['Aneuploid_combined_clusters'] == 'overlap') & (idata.obs['all_Cancer'] > 0.53)
condition95p = (idata.obs['Aneuploid_combined_clusters'] == 'overlap') & (idata.obs['all_Cancer'] > 0.72)
condition99p = (idata.obs['Aneuploid_combined_clusters'] == 'overlap') & (idata.obs['all_Cancer'] > 0.98)
condition99_5p = (idata.obs['Aneuploid_combined_clusters'] == 'overlap') & (idata.obs['all_Cancer'] > 1.06)

# Assign colors: 'red' for cells meeting the condition, 'grey' otherwise
idata.obs['KC_cancer_all_DE_90p'] = ['Cancer' if cond else 'Normal' for cond in condition90p]
idata.obs['KC_cancer_all_DE_95p'] = ['Cancer' if cond else 'Normal' for cond in condition95p]
idata.obs['KC_cancer_all_DE_99p'] = ['Cancer' if cond else 'Normal' for cond in condition99p]
idata.obs['KC_cancer_all_DE_99_5p'] = ['Cancer' if cond else 'Normal' for cond in condition99_5p]

# Define a color map
color_map = {'Cancer': 'red', 'Normal': 'lightgrey'}

# Plot UMAP
sc.pl.umap(idata, color=['KC_cancer_all_DE_90p','KC_cancer_all_DE_95p','KC_cancer_all_DE_99p','KC_cancer_all_DE_99_5p'], palette=color_map, size=5)


# Assuming `df` is your DataFrame
cancer_count90p = idata.obs[idata.obs['KC_cancer_all_DE_90p'] == 'Cancer'].shape[0]
cancer_count95p=idata.obs[idata.obs['KC_cancer_all_DE_95p'] == 'Cancer'].shape[0]
cancer_count99p=idata.obs[idata.obs['KC_cancer_all_DE_99p'] == 'Cancer'].shape[0]
cancer_count99_5p=idata.obs[idata.obs['KC_cancer_all_DE_99_5p'] == 'Cancer'].shape[0]

print(f"Number of cells marked as 'Cancer' 90th percentile: {cancer_count90p}")
print(f"Number of cells marked as 'Cancer' 95th percentile: {cancer_count95p}")
print(f"Number of cells marked as 'Cancer' 99th percentile: {cancer_count99p}")
print(f"Number of cells marked as 'Cancer' 99.5th percentile: {cancer_count99_5p}")



In [None]:
# Count cells for each combination of Level3_Cancer, KC_cancer_all_DE and all_Cancer.
# NOTE(review): 'all_Cancer' is compared against numeric thresholds elsewhere in this
# notebook, so grouping on it presumably yields about one group per distinct score value;
# if the label columns are categorical the default groupby may also list unobserved
# combinations with count 0 - confirm this is the table that was intended.
grouped = idata.obs.groupby(['Level3_Cancer', 'KC_cancer_all_DE', 'all_Cancer']).size().reset_index(name='count')
print(grouped)

In [None]:
# Print the first few rows of the grouped data
# Print the last few rows to see the end of the grouped data
print(grouped.tail(100))


In [None]:
idata.obs['KC_cancer_all_DE'].value_counts()

In [None]:
#idata.obs[idata.obs['KC_cancer_all_DE'] == 'Cancer']

OLD_CANCER_DF=idata.obs[idata.obs['Level3_Cancer'] == 'KC Cancer']
print(OLD_CANCER_DF['KC_cancer_all_DE'].value_counts())
print(OLD_CANCER_DF['KC_cancer_all_DE_mod98'].value_counts())

In [None]:
#idata.obs.head()
idata.obs.to_csv('/QRISdata/Q4386/skin_atlas/SCC_BCC/KConly_with_mod_score_nov21_meta.txt',sep="\t", index=True)


In [None]:
#idata.var.index = idata.var.index.astype(str)
#idata.obs.index = idata.obs.index.astype(str)

idata.obs.head()


In [None]:
TSK=['CACAGTAGTCAGCTAT-3','CGTGTAACAAGACACG-3','GCTCTGTCATTGAGCT-3','GTCGTAAAGATAGTCA-3','TACTTACGTGATGCCC-3','TAAGCGTTCGCTTGTC-5','ACGGGTCTCGGAATCT-10','AGCGGTCCAAACGTGG-10','AGCTCCTAGGGCACTA-10','AGTAGTCCAAGCTGGA-10','ATCACGAGTCGCTTCT-10','CACACAATCCTAAGTG-10','CACCACTTCCACGTGG-10','CACCTTGAGTGTTTGC-10','CATCAGAGTCGAATCT-10','CCAGCGAAGTCGTACT-10','CCATTCGTCAATCACG-10','CCTAAAGAGAGCTGGT-10','CGCGGTAGTGCGGTAA-10','CGCTTCAGTAAATGAC-10','CGGAGTCTCAACCAAC-10','CGTCACTGTGCCTGGT-10','CGTGTCTGTGAGGCTA-10','CTAGCCTCACCGATAT-10','CTAGTGAGTAGGCTGA-10','CTCGTCACAGGTCTCG-10','CTTAGGATCCACGCAG-10','CTTCTCTGTCGACTGC-10','GCGACCAGTTTGACTG-10','GGCTGGTAGGATGTAT-10','GGGCACTCAATTCCTT-10','GGGTCTGTCCTCATTA-10','GTACTTTGTCTCCACT-10','TAAACCGAGTAGTGCG-10','TAAGAGACATGCAACT-10','TACAGTGCATTTCAGG-10','TGCGCAGCACCCTATC-10','TGGTTCCGTACAAGTA-10','TGTTCCGTCACCTCGT-10','TTGCGTCAGATACACA-10','TTTATGCCACTCGACG-10']

      
# Assuming TSK is a list of cell names
#mask = scc.obs.index.isin(TSK)

# Subset the AnnData object
#scc_subset = scc[mask].copy()
# Plot UMAP
#sc.pl.umap(scc_subset, size=30, vmax=0.5)

# Create a new column 'TSK' initialized with empty strings
idata.obs['TSK'] = 'none'

# Convert TSK list to a set for faster lookup
TSK_set = set(TSK)

# Update the 'TSK' column based on whether the index is in TSK
idata.obs.loc[idata.obs.index.isin(TSK_set), 'TSK'] = 'TSK'


import matplotlib.pyplot as plt
import scanpy as sc

# Create a figure and axis to plot both sets of cells on the same plot
#fig, ax = plt.subplots()
fig, ax = plt.subplots(figsize=(5, 5))  # Adjust figure size as needed

# Define the color map
colors = {
    'TSK': 'red',
    'none': 'lightgrey'
}

# Plot 'none' cells first (background layer)
sc.pl.umap(
    idata[idata.obs['TSK'] == 'none'],
    color='TSK',
    size=1,
    palette={'none': 'lightgrey'},
    ax=ax,
    show=False
)

# Overlay other cells with specific colors (foreground layer)
sc.pl.umap(
    idata[idata.obs['TSK'] != 'none'],
    color='TSK',
    size=4,
    palette=colors,
    ax=ax,
    show=False
)
# Display the combined plot


# Optionally, you can check the result
#sc.pl.umap(idata, color=['TSK'], size=10, vmax=0.5)


In [None]:
# Define the condition
conditionTSK = (idata.obs['Aneuploid_combined_clusters'] == 'overlap') & (idata.obs['TSK'] > 0.8)
conditionTSK6 = (idata.obs['Aneuploid_combined_clusters'] == 'overlap') & (idata.obs['TSK'] > 0.6)
# Assign colors: 'red' for cells meeting the condition, 'grey' otherwise
idata.obs['TSK_cancermod'] = ['Cancer' if cond else 'Normal' for cond in conditionTSK]
idata.obs['TSK_cancermod6'] = ['Cancer' if cond else 'Normal' for cond in conditionTSK6]

# Define a color map
color_map = {'Cancer': 'red', 'Normal': 'grey'}

# Plot UMAP
sc.pl.umap(idata, color=['TSK_cancermod','TSK_cancermod6'], palette=color_map, size=5)


# Plot UMAP with the new color assignment
#sc.pl.umap(idata, color='UMAP_color', title='UMAP Highlighting Overlap & Cancer > 1.5', frameon=False)


In [None]:
TSK_df=idata.obs[idata.obs['TSK'] == 'TSK']
print(TSK_df['KC_cancer_all_DE_99p'].value_counts())
print(TSK_df['Level3_Cancer'].value_counts())
print(TSK_df['KC_cancer_all_DE_95p'].value_counts())


In [None]:
len(TSK)

In [None]:
idata.obs['TSK'].unique()

In [None]:
def split_umap(adata, split_by, ncol=2, nrow=None, **kwargs):
    """Draw one UMAP panel per category of ``adata.obs[split_by]``.

    Parameters
    ----------
    adata
        Object passed to ``sc.pl.umap``; it is subset to the cells of each category in turn.
    split_by
        Name of a categorical column in ``adata.obs`` (its ``.cat.categories`` are used).
    ncol
        Number of panel columns in the figure grid.
    nrow
        Number of panel rows; derived from the number of categories when ``None``.
    **kwargs
        Forwarded unchanged to ``sc.pl.umap`` (e.g. ``color``, ``size``, ``legend_loc``).

    Returns
    -------
    None. The figure is left as the current matplotlib figure.
    """
    categories = adata.obs[split_by].cat.categories
    if nrow is None:
        nrow = int(np.ceil(len(categories) / ncol))
    # squeeze=False always returns a 2-D array of Axes, so flatten() also works
    # when the grid collapses to a single panel (plain subplots() returns a bare Axes then).
    fig, axs = plt.subplots(nrow, ncol, figsize=(5*ncol, 4*nrow), squeeze=False)
    axs = axs.flatten()
    for i, cat in enumerate(categories):
        ax = axs[i]
        sc.pl.umap(adata[adata.obs[split_by] == cat], ax=ax, show=False, title=cat, **kwargs)
    # Blank out grid cells that have no category (e.g. 3 categories on a 2x2 grid)
    for spare_ax in axs[len(categories):]:
        spare_ax.axis("off")
    plt.tight_layout()
split_umap(idata, color = ['KC_cancer_all_DE_99p'], split_by='diagnosis_corrected',legend_loc = "on data")


In [None]:
split_umap(idata, color = ['KC_cancer_all_DE_95p'], split_by='diagnosis_corrected')


In [None]:
sc.pl.dotplot(
    scc,
    { 'TSK':["MMP10","PTHLH","FEZ1","IL24","KCNMA1","INHBA","MAGEA4","NT5E","LAMC2","SLITRK6","MMP1","SERPINE1","SERPINE2","IGFBP6","TNC","ITGA5","ECM1","SPINK6","TMSB10","KRT18","FST","ODC1","LAMB3","ITGA6","ACTB","ITGB1","SORL1","PFN1","EMP3","S100A10","EREG","LAMA3","OCIAD2","CTSV","ANXA3","S100A2","MET","CD99","TMSB4X","TIMP3","TPM4","NEFM","SCG5","SH3BGRL3","PLAU","PKM","CD63","LGALS1","BMP1","TNFRSF12A","ATP1B1","CFL1","MBOAT2","F3","TNFRSF21","CLIC1","CAP1","PDPN","SERINC2","ANXA1","BSG","DSG2","RHOC","KRT17","PDLIM7","PHLDA1","GLO1","LTBP1","TAGLN2","CD151","CAPN2","COL5A2","IGFBP2","P4HA2","COL17A1","PLEK2","AREG","ARPC1B","GLIPR1","S100A6","PLOD3","YWHAZ","CAV1","ADIRF","TGFBI","RBP1","FSTL3","C16orf74","UPP1","TNNT1","ANXA5","SDC1","PRDX5","MYL12A","GJB6","SLC7A8","RAB32","FXYD5","FSCN1"]},
    standard_scale="var",
    color_map="Reds",
    groupby="Level2", figsize=(18, 6) ,show=False
)

In [None]:
sc.pl.dotplot(
    idata,
    { 'TSK':['PTHLH','FEZ1','INHBA','MMP1','SERPINE2','IGFBP6','TNC','ITGA5','SPINK6','TMSB10','KRT18','FST','ODC1','ITGA6','SORL1','EMP3','EREG','TIMP3','LGALS1','TNFRSF12A','ATP1B1','MBOAT2','F3','ANXA1','KRT17','PHLDA1','LTBP1','CAPN2','IGFBP2','COL17A1','AREG','ARPC1B','GLIPR1','S100A6','CAV1','ADIRF','RBP1','UPP1','SDC1','GJB6','SLC7A8','RAB32','FXYD5','FSCN1']},
        #'TSK':["MMP10","PTHLH","FEZ1","IL24","KCNMA1","INHBA","MAGEA4","NT5E","LAMC2","SLITRK6","MMP1","SERPINE1","SERPINE2","IGFBP6","TNC","ITGA5","ECM1","SPINK6","TMSB10","KRT18","FST","ODC1","LAMB3","ITGA6","ACTB","ITGB1","SORL1","PFN1","EMP3","S100A10","EREG","LAMA3","OCIAD2","CTSV","ANXA3","S100A2","MET","CD99","TMSB4X","TIMP3","TPM4","NEFM","SCG5","SH3BGRL3","PLAU","PKM","CD63","LGALS1","BMP1","TNFRSF12A","ATP1B1","CFL1","MBOAT2","F3","TNFRSF21","CLIC1","CAP1","PDPN","SERINC2","ANXA1","BSG","DSG2","RHOC","KRT17","PDLIM7","PHLDA1","GLO1","LTBP1","TAGLN2","CD151","CAPN2","COL5A2","IGFBP2","P4HA2","COL17A1","PLEK2","AREG","ARPC1B","GLIPR1","S100A6","PLOD3","YWHAZ","CAV1","ADIRF","TGFBI","RBP1","FSTL3","C16orf74","UPP1","TNNT1","ANXA5","SDC1","PRDX5","MYL12A","GJB6","SLC7A8","RAB32","FXYD5","FSCN1"]},
    standard_scale="var",
    color_map="Reds",
    groupby="KC_cancer_all_DE_95p", figsize=(10, 1) ,show=False
)

In [None]:
# Cells cancer in 95p but normal in 99p
# Filter barcodes that are 'Cancer' in 'KC_cancer_all_DE_95p'
cancer_95p = idata.obs[idata.obs['KC_cancer_all_DE_95p'] == 'Cancer'].index

# Filter barcodes that are 'Normal' in 'KC_cancer_all_DE_99p'
normal_99p = idata.obs[idata.obs['KC_cancer_all_DE_99p'] == 'Normal'].index

# Find the intersection of the two conditions
result_barcodes = cancer_95p.intersection(normal_99p)

# Subset the AnnData object to include only the barcodes in the result
idata_subset = idata[result_barcodes, :]

# Display
sc.pl.dotplot(
    idata_subset,
    { 'TSK':['PTHLH','FEZ1','INHBA','MMP1','SERPINE2','IGFBP6','TNC','ITGA5','SPINK6','TMSB10','KRT18','FST','ODC1','ITGA6','SORL1','EMP3','EREG','TIMP3','LGALS1','TNFRSF12A','ATP1B1','MBOAT2','F3','ANXA1','KRT17','PHLDA1','LTBP1','CAPN2','IGFBP2','COL17A1','AREG','ARPC1B','GLIPR1','S100A6','CAV1','ADIRF','RBP1','UPP1','SDC1','GJB6','SLC7A8','RAB32','FXYD5','FSCN1']},
        #'TSK':["MMP10","PTHLH","FEZ1","IL24","KCNMA1","INHBA","MAGEA4","NT5E","LAMC2","SLITRK6","MMP1","SERPINE1","SERPINE2","IGFBP6","TNC","ITGA5","ECM1","SPINK6","TMSB10","KRT18","FST","ODC1","LAMB3","ITGA6","ACTB","ITGB1","SORL1","PFN1","EMP3","S100A10","EREG","LAMA3","OCIAD2","CTSV","ANXA3","S100A2","MET","CD99","TMSB4X","TIMP3","TPM4","NEFM","SCG5","SH3BGRL3","PLAU","PKM","CD63","LGALS1","BMP1","TNFRSF12A","ATP1B1","CFL1","MBOAT2","F3","TNFRSF21","CLIC1","CAP1","PDPN","SERINC2","ANXA1","BSG","DSG2","RHOC","KRT17","PDLIM7","PHLDA1","GLO1","LTBP1","TAGLN2","CD151","CAPN2","COL5A2","IGFBP2","P4HA2","COL17A1","PLEK2","AREG","ARPC1B","GLIPR1","S100A6","PLOD3","YWHAZ","CAV1","ADIRF","TGFBI","RBP1","FSTL3","C16orf74","UPP1","TNNT1","ANXA5","SDC1","PRDX5","MYL12A","GJB6","SLC7A8","RAB32","FXYD5","FSCN1"]},
    standard_scale="var",
    color_map="Reds",
    groupby=["KC_cancer_all_DE_95p"], figsize=(10, 1) ,show=False
)

sc.pl.dotplot(
    idata_subset,
    { 'TSK':['PTHLH','FEZ1','INHBA','MMP1','SERPINE2','IGFBP6','TNC','ITGA5','SPINK6','TMSB10','KRT18','FST','ODC1','ITGA6','SORL1','EMP3','EREG','TIMP3','LGALS1','TNFRSF12A','ATP1B1','MBOAT2','F3','ANXA1','KRT17','PHLDA1','LTBP1','CAPN2','IGFBP2','COL17A1','AREG','ARPC1B','GLIPR1','S100A6','CAV1','ADIRF','RBP1','UPP1','SDC1','GJB6','SLC7A8','RAB32','FXYD5','FSCN1']},
        #'TSK':["MMP10","PTHLH","FEZ1","IL24","KCNMA1","INHBA","MAGEA4","NT5E","LAMC2","SLITRK6","MMP1","SERPINE1","SERPINE2","IGFBP6","TNC","ITGA5","ECM1","SPINK6","TMSB10","KRT18","FST","ODC1","LAMB3","ITGA6","ACTB","ITGB1","SORL1","PFN1","EMP3","S100A10","EREG","LAMA3","OCIAD2","CTSV","ANXA3","S100A2","MET","CD99","TMSB4X","TIMP3","TPM4","NEFM","SCG5","SH3BGRL3","PLAU","PKM","CD63","LGALS1","BMP1","TNFRSF12A","ATP1B1","CFL1","MBOAT2","F3","TNFRSF21","CLIC1","CAP1","PDPN","SERINC2","ANXA1","BSG","DSG2","RHOC","KRT17","PDLIM7","PHLDA1","GLO1","LTBP1","TAGLN2","CD151","CAPN2","COL5A2","IGFBP2","P4HA2","COL17A1","PLEK2","AREG","ARPC1B","GLIPR1","S100A6","PLOD3","YWHAZ","CAV1","ADIRF","TGFBI","RBP1","FSTL3","C16orf74","UPP1","TNNT1","ANXA5","SDC1","PRDX5","MYL12A","GJB6","SLC7A8","RAB32","FXYD5","FSCN1"]},
    standard_scale="var",
    color_map="Reds",
    groupby=["KC_cancer_all_DE_99p"], figsize=(10, 1) ,show=False
)

In [None]:
sc.pl.dotplot(
    idata,
    { 'TSK':['PTHLH','FEZ1','INHBA','MMP1','SERPINE2','IGFBP6','TNC','ITGA5','SPINK6','TMSB10','KRT18','FST','ODC1','ITGA6','SORL1','EMP3','EREG','TIMP3','LGALS1','TNFRSF12A','ATP1B1','MBOAT2','F3','ANXA1','KRT17','PHLDA1','LTBP1','CAPN2','IGFBP2','COL17A1','AREG','ARPC1B','GLIPR1','S100A6','CAV1','ADIRF','RBP1','UPP1','SDC1','GJB6','SLC7A8','RAB32','FXYD5','FSCN1']},
        #'TSK':["MMP10","PTHLH","FEZ1","IL24","KCNMA1","INHBA","MAGEA4","NT5E","LAMC2","SLITRK6","MMP1","SERPINE1","SERPINE2","IGFBP6","TNC","ITGA5","ECM1","SPINK6","TMSB10","KRT18","FST","ODC1","LAMB3","ITGA6","ACTB","ITGB1","SORL1","PFN1","EMP3","S100A10","EREG","LAMA3","OCIAD2","CTSV","ANXA3","S100A2","MET","CD99","TMSB4X","TIMP3","TPM4","NEFM","SCG5","SH3BGRL3","PLAU","PKM","CD63","LGALS1","BMP1","TNFRSF12A","ATP1B1","CFL1","MBOAT2","F3","TNFRSF21","CLIC1","CAP1","PDPN","SERINC2","ANXA1","BSG","DSG2","RHOC","KRT17","PDLIM7","PHLDA1","GLO1","LTBP1","TAGLN2","CD151","CAPN2","COL5A2","IGFBP2","P4HA2","COL17A1","PLEK2","AREG","ARPC1B","GLIPR1","S100A6","PLOD3","YWHAZ","CAV1","ADIRF","TGFBI","RBP1","FSTL3","C16orf74","UPP1","TNNT1","ANXA5","SDC1","PRDX5","MYL12A","GJB6","SLC7A8","RAB32","FXYD5","FSCN1"]},
    standard_scale="var",
    color_map="Reds",
    groupby="KC_cancer_all_DE_99p", figsize=(10, 1) ,show=False
)

### with raw data

In [None]:
from matplotlib import rcParams

sc.tl.rank_genes_groups(idata, 'cancer_status_corrected', method='wilcoxon', n_genes=200)
sc.tl.filter_rank_genes_groups(idata)
rcParams['figure.figsize'] = 4,4
rcParams['axes.grid'] = True
#sc.pl.rank_genes_groups(adata_KC_Basal, key='rank_genes_groups_filtered', ncols=3)

axs = sc.pl.rank_genes_groups_dotplot(idata, n_genes=20, groups=['Cancer', 'Normal'])

print("Cancer: ",idata.uns['rank_genes_groups']['names']['Cancer'].tolist()[:100])
print("Normal: ",idata.uns['rank_genes_groups']['names']['Normal'].tolist()[:100])

In [None]:
## SCC genes from literature
def split_umap(adata, split_by, ncol=2, nrow=None, **kwargs):
    """Draw one UMAP panel per category of ``adata.obs[split_by]``.

    Parameters
    ----------
    adata
        Object passed to ``sc.pl.umap``; it is subset to the cells of each category in turn.
    split_by
        Name of a categorical column in ``adata.obs`` (its ``.cat.categories`` are used).
    ncol
        Number of panel columns in the figure grid.
    nrow
        Number of panel rows; derived from the number of categories when ``None``.
    **kwargs
        Forwarded unchanged to ``sc.pl.umap`` (e.g. ``color``, ``size``, ``legend_loc``).

    Returns
    -------
    None. The figure is left as the current matplotlib figure.
    """
    categories = adata.obs[split_by].cat.categories
    if nrow is None:
        nrow = int(np.ceil(len(categories) / ncol))
    # squeeze=False always returns a 2-D array of Axes, so flatten() also works
    # when the grid collapses to a single panel (plain subplots() returns a bare Axes then).
    fig, axs = plt.subplots(nrow, ncol, figsize=(5*ncol, 4*nrow), squeeze=False)
    axs = axs.flatten()
    for i, cat in enumerate(categories):
        ax = axs[i]
        sc.pl.umap(adata[adata.obs[split_by] == cat], ax=ax, show=False, title=cat, **kwargs)
    # Blank out grid cells that have no category (e.g. 3 categories on a 2x2 grid)
    for spare_ax in axs[len(categories):]:
        spare_ax.axis("off")
    plt.tight_layout()


import scanpy as sc

# Define your gene sets
gene_sets = {
 #   'Dys_Cancer': i.uns['rank_genes_groups']['names']['Cancer'].tolist()[:100],
    'all_Cancer': idata.uns['rank_genes_groups']['names']['Cancer'].tolist()[:100],
    'TSK':['PTHLH','FEZ1','INHBA','MMP1','SERPINE2','IGFBP6','TNC','ITGA5','SPINK6','TMSB10','KRT18','FST','ODC1','ITGA6','SORL1','EMP3','EREG','TIMP3','LGALS1','TNFRSF12A','ATP1B1','MBOAT2','F3','ANXA1','KRT17','PHLDA1','LTBP1','CAPN2','IGFBP2','COL17A1','AREG','ARPC1B','GLIPR1','S100A6','CAV1','ADIRF','RBP1','UPP1','SDC1','GJB6','SLC7A8','RAB32','FXYD5','FSCN1']
} #"GDF15"="PLAB", MLANA=MART1

# Calculate and add module scores
for name, genes in gene_sets.items():
    sc.tl.score_genes(idata, gene_list=genes, score_name=name)

# Inspect the results
#print(mel.obs.head())
#idata.obs['cancer'] = ['cancer' if x == 'KC Cancer' else 'Others' for x in scc.obs['Level3_Cancer']]
#idata.obs['cancer'] = idata.obs['cancer'].astype('category')

# Plot the module scores
split_umap(idata, color=['all_Cancer'], size=5,split_by='cancer_status_corrected')
split_umap(idata, color=['TSK'], size=5,split_by='cancer_status_corrected')

#split_umap(scc, color = ['InferCNV_and_CopyKAT_aneuploid'], split_by='cancer_status',legend_loc = "right margin")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Extract the 'all_Cancer' data from adata.obs
all_cancer_values = idata.obs['all_Cancer']

# Plot a histogram using seaborn for better visualization
plt.figure(figsize=(8, 5))
sns.histplot(all_cancer_values, kde=False, bins=30, color='blue', edgecolor='black')

# Add titles and labels
plt.title('Histogram of all_Cancer', fontsize=14)
plt.xlabel('all_Cancer', fontsize=12)
plt.ylabel('Frequency', fontsize=12)

# Show the plot
plt.tight_layout()
plt.show()
import numpy as np
# Extract the 'all_Cancer' data from adata.obs
all_cancer_values = idata.obs['all_Cancer']
# Calculate the 90th percentile
percentile_90 = np.percentile(all_cancer_values, 90)
percentile_95 = np.percentile(all_cancer_values, 95)
percentile_99 = np.percentile(all_cancer_values, 99)
percentile_995 = np.percentile(all_cancer_values, 99.5)


# Print the result
print(f"90th Percentile of 'all_Cancer': {percentile_90}")
print(f"95th Percentile of 'all_Cancer': {percentile_95}")
print(f"99th Percentile of 'all_Cancer': {percentile_99}")
print(f"99.5th Percentile of 'all_Cancer': {percentile_995}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# TSK module score per cell, read from idata.obs
TSK_values = idata.obs['TSK']

# Histogram of the TSK score (this cell reports on TSK, not all_Cancer)
plt.figure(figsize=(8, 5))
sns.histplot(TSK_values, kde=False, bins=30, color='blue', edgecolor='black')

# Titles and labels name the variable that is actually plotted
plt.title('Histogram of TSK', fontsize=14)
plt.xlabel('TSK', fontsize=12)
plt.ylabel('Frequency', fontsize=12)

# Show the plot
plt.tight_layout()
plt.show()
import numpy as np
# 90th, 95th, 99th and 99.5th percentiles of the TSK score
TSKpercentile_90 = np.percentile(TSK_values, 90)
TSKpercentile_95 = np.percentile(TSK_values, 95)
TSKpercentile_99 = np.percentile(TSK_values, 99)
TSKpercentile_995 = np.percentile(TSK_values, 99.5)


# Print the result
print(f"90th Percentile of 'TSK': {TSKpercentile_90}")
print(f"95th Percentile of 'TSK': {TSKpercentile_95}")
print(f"99th Percentile of 'TSK': {TSKpercentile_99}")
print(f"99.5th Percentile of 'TSK': {TSKpercentile_995}")

In [None]:
# Define the condition
condition90p = (idata.obs['Aneuploid_combined_clusters'] == 'overlap') & (idata.obs['all_Cancer'] > 1.24)
condition95p = (idata.obs['Aneuploid_combined_clusters'] == 'overlap') & (idata.obs['all_Cancer'] > 1.41)
condition99p = (idata.obs['Aneuploid_combined_clusters'] == 'overlap') & (idata.obs['all_Cancer'] > 1.61)
condition99_5p = (idata.obs['Aneuploid_combined_clusters'] == 'overlap') & (idata.obs['all_Cancer'] > 1.65)

# Assign colors: 'red' for cells meeting the condition, 'grey' otherwise
idata.obs['KC_cancer_all_DE_90p'] = ['Cancer' if cond else 'Normal' for cond in condition90p]
idata.obs['KC_cancer_all_DE_95p'] = ['Cancer' if cond else 'Normal' for cond in condition95p]
idata.obs['KC_cancer_all_DE_99p'] = ['Cancer' if cond else 'Normal' for cond in condition99p]
idata.obs['KC_cancer_all_DE_99_5p'] = ['Cancer' if cond else 'Normal' for cond in condition99_5p]

# Define a color map
color_map = {'Cancer': 'red', 'Normal': 'lightgrey'}

# Plot UMAP
sc.pl.umap(idata, color=['KC_cancer_all_DE_90p','KC_cancer_all_DE_95p','KC_cancer_all_DE_99p','KC_cancer_all_DE_99_5p'], palette=color_map, size=5)


# Assuming `df` is your DataFrame
cancer_count90p = idata.obs[idata.obs['KC_cancer_all_DE_90p'] == 'Cancer'].shape[0]
cancer_count95p=idata.obs[idata.obs['KC_cancer_all_DE_95p'] == 'Cancer'].shape[0]
cancer_count99p=idata.obs[idata.obs['KC_cancer_all_DE_99p'] == 'Cancer'].shape[0]
cancer_count99_5p=idata.obs[idata.obs['KC_cancer_all_DE_99_5p'] == 'Cancer'].shape[0]

print(f"Number of cells marked as 'Cancer' 90th percentile: {cancer_count90p}")
print(f"Number of cells marked as 'Cancer' 95th percentile: {cancer_count95p}")
print(f"Number of cells marked as 'Cancer' 99th percentile: {cancer_count99p}")
print(f"Number of cells marked as 'Cancer' 99.5th percentile: {cancer_count99_5p}")







TSKcondition90p = (idata.obs['Aneuploid_combined_clusters'] == 'overlap') & (idata.obs['TSK'] > 0.24)
TSKcondition95p = (idata.obs['Aneuploid_combined_clusters'] == 'overlap') & (idata.obs['TSK'] > 0.31)
TSKcondition99p = (idata.obs['Aneuploid_combined_clusters'] == 'overlap') & (idata.obs['TSK'] > 0.47)
TSKcondition99_5p = (idata.obs['Aneuploid_combined_clusters'] == 'overlap') & (idata.obs['TSK'] > 0.56)

# Assign colors: 'red' for cells meeting the condition, 'grey' otherwise
idata.obs['TSKKC_cancer_all_DE_90p'] = ['Cancer' if cond else 'Normal' for cond in TSKcondition90p]
idata.obs['TSKKC_cancer_all_DE_95p'] = ['Cancer' if cond else 'Normal' for cond in TSKcondition95p]
idata.obs['TSKKC_cancer_all_DE_99p'] = ['Cancer' if cond else 'Normal' for cond in TSKcondition99p]
idata.obs['TSKKC_cancer_all_DE_99_5p'] = ['Cancer' if cond else 'Normal' for cond in TSKcondition99_5p]

# Define a color map
color_map = {'Cancer': 'red', 'Normal': 'lightgrey'}

# Plot UMAP
sc.pl.umap(idata, color=['TSKKC_cancer_all_DE_90p','TSKKC_cancer_all_DE_95p','TSKKC_cancer_all_DE_99p','TSKKC_cancer_all_DE_99_5p'], palette=color_map, size=5)


# Assuming `df` is your DataFrame
TSKcancer_count90p = idata.obs[idata.obs['TSKKC_cancer_all_DE_90p'] == 'Cancer'].shape[0]
TSKcancer_count95p=idata.obs[idata.obs['TSKKC_cancer_all_DE_95p'] == 'Cancer'].shape[0]
TSKcancer_count99p=idata.obs[idata.obs['TSKKC_cancer_all_DE_99p'] == 'Cancer'].shape[0]
TSKcancer_count99_5p=idata.obs[idata.obs['TSKKC_cancer_all_DE_99_5p'] == 'Cancer'].shape[0]

print(f"Number of cells marked as 'Cancer' 90th percentile: {TSKcancer_count90p}")
print(f"Number of cells marked as 'Cancer' 95th percentile: {TSKcancer_count95p}")
print(f"Number of cells marked as 'Cancer' 99th percentile: {TSKcancer_count99p}")
print(f"Number of cells marked as 'Cancer' 99.5th percentile: {TSKcancer_count99_5p}")



In [None]:
TSK_df=idata.obs[idata.obs['TSKKC_cancer_all_DE_90p'] == 'Cancer']
print(TSK_df['KC_cancer_all_DE_99p'].value_counts())
print(TSK_df['KC_cancer_all_DE_95p'].value_counts())


In [None]:
# Keep a reference to idata.raw (not used again within this cell)
raw=idata.raw

# Desired display order of the 'sample_ID_corrected' values
SAMPLE_order = [
 'P5_SCC_BCC',
 'P1_SCC',
 'P2_SCC1',
 'P2_SCC2',
 'P3_IEC',
 'P4_SCC',
 'P4_N','P5_N', 'P3_N', 'P2_N', 'P1_N']

# Re-encode 'sample_ID_corrected' as an ordered categorical following SAMPLE_order.
# NOTE(review): pd.Categorical turns any value missing from `categories` into NaN -
# verify SAMPLE_order lists every sample ID present in idata.obs.
idata.obs['sample_ID_corrected'] = pd.Categorical(
    idata.obs['sample_ID_corrected'], 
    categories=SAMPLE_order, 
    ordered=True
)
#idata.raw=None
# Dotplot of the first 100 'Cancer' genes from idata.uns['rank_genes_groups'], one row per sample
sc.pl.dotplot(
    idata,
    {'pyDEraw':idata.uns['rank_genes_groups']['names']['Cancer'].tolist()[:100]},
    color_map="Reds",
    groupby="sample_ID_corrected", figsize=(20, 3) ,show=False
)

# Same genes, grouped by 'cancer_status_corrected' instead of by sample
sc.pl.dotplot(
    idata,
    {'pyDEraw':idata.uns['rank_genes_groups']['names']['Cancer'].tolist()[:100]},
    color_map="Reds",
    groupby="cancer_status_corrected", figsize=(20, 3) ,show=False
)

In [None]:
raw=idata.raw

idata_no_raw=idata.copy()
idata_no_raw.raw=None

sc.pl.dotplot(
    idata_no_raw,
    {'pyDEraw':['IFI27','KRT6A','FABP5','KRT16','S100A7','TMSB10','TYMP','IFITM3','KRT6B','CALML3','ENO1','DBI','TXNDC17','KRT6C','PSME2','S100A6','LY6E','CSTB','GJB2','CHCHD10','SNHG25','IFI6','CD74','SLIRP','ATOX1','WDR66','TNFSF10','KRT17','S100A13','CTSC','IFI16','DSC2','CRABP2','TPD52L1','GJB6']},
    color_map="Reds",
    groupby="cancer_status_corrected", figsize=(20, 3) ,show=False
)

sc.pl.dotplot(
    idata_no_raw,
    {'pyDEraw':['IFI27','KRT6A','FABP5','KRT16','S100A7','TMSB10','TYMP','IFITM3','KRT6B','CALML3','ENO1','DBI','TXNDC17','KRT6C','PSME2','S100A6','LY6E','CSTB','GJB2','CHCHD10','SNHG25','IFI6','CD74','SLIRP','ATOX1','WDR66','TNFSF10','KRT17','S100A13','CTSC','IFI16','DSC2','CRABP2','TPD52L1','GJB6']},
    color_map="Reds",
    groupby="sample_ID_corrected", figsize=(20, 3) ,show=False
)

In [None]:
split_umap(idata, color = ['KC_cancer_all_DE_99p'], split_by='diagnosis_corrected',legend_loc = "on data")

### edgeR genes

In [None]:
# Handle on the .raw attribute; idata.raw itself is left in place.
raw = idata.raw

# 'DE2' gene panel for this edgeR section, drawn per cancer status with
# per-gene (standard_scale="var") scaling.
edger_de2_panel = {'DE2': ['RAB5IF','IFI44L','BST2','PITX1','SAMD9','IRF7','TMEM132A','HENMT1','CCL21','USP18','CTPS1','EEF1AKMT4','TIMM13','CKMT1B','COX7B','TPI1','RASIP1','MPZL1','CAPNS1','FCF1','CD3D','LYZ','EIF4EBP1','AKR1B10','CDKN2A','CSRP2','SRM','TMSB10','CCL5','NETO2','XAF1','EPSTI1','SNX10','OASL','CLEC7A','IFI27','APOL3','IFI6','ISG15','MX1','HERC6','OAS1','MYLK','S100A9','KRT6C','CTSB','CXCR4','U62317.1','LINC01006','SLC43A2','TNFRSF21','IFITM3','APOL1','TYMP','IFIT3','IFI44','IFIT1','OAS2','CBX3','PSME2','ATP5MF','PSMA7','ENO1','AGTRAP','FABP6','KRT13','STAT1','GBP1','PARP14','SAMD9L','GTF3C6','ISG20','DDX60','DTX3L','APOL6','PARP9','OAS3','GNG11','SNHG25','MLKL','EIF2S2','OAF','MINOS1','SLIRP','NQO1','PYCR1','POLR3G','GCLM','HLA-DPA1','CTSC','G6PC3','STEAP3','TRIM22','CD52','LTB','ACP2','RTL6','SLC7A8','LGALS3BP','FBLIM1','HLA-DMA']}

sc.pl.dotplot(
    idata,
    edger_de2_panel,
    groupby="cancer_status_corrected",
    standard_scale="var",
    color_map="Reds",
    figsize=(20, 3),
    show=False,
)

In [None]:
# Distinct values currently present in idata.obs['sample_ID_corrected'].
idata.obs['sample_ID_corrected'].unique().tolist()

In [None]:
# Handle on the .raw attribute; idata.raw itself is left in place.
raw = idata.raw

# Row order for the per-sample dot plot.
SAMPLE_order = [
    'P5_SCC_BCC', 'P1_SCC', 'P2_SCC1', 'P2_SCC2', 'P3_IEC', 'P4_SCC',
    'P4_N', 'P5_N', 'P3_N', 'P2_N', 'P1_N',
]

# Recode sample_ID_corrected as an ordered categorical following SAMPLE_order.
# Any value not listed in SAMPLE_order becomes NaN.
sample_dtype = pd.CategoricalDtype(categories=SAMPLE_order, ordered=True)
idata.obs['sample_ID_corrected'] = idata.obs['sample_ID_corrected'].astype(sample_dtype)

# 'DE2' gene panel, drawn per sample with per-gene (standard_scale="var") scaling.
edger_de2_panel = {'DE2': ['RAB5IF','IFI44L','BST2','PITX1','SAMD9','IRF7','TMEM132A','HENMT1','CCL21','USP18','CTPS1','EEF1AKMT4','TIMM13','CKMT1B','COX7B','TPI1','RASIP1','MPZL1','CAPNS1','FCF1','CD3D','LYZ','EIF4EBP1','AKR1B10','CDKN2A','CSRP2','SRM','TMSB10','CCL5','NETO2','XAF1','EPSTI1','SNX10','OASL','CLEC7A','IFI27','APOL3','IFI6','ISG15','MX1','HERC6','OAS1','MYLK','S100A9','KRT6C','CTSB','CXCR4','U62317.1','LINC01006','SLC43A2','TNFRSF21','IFITM3','APOL1','TYMP','IFIT3','IFI44','IFIT1','OAS2','CBX3','PSME2','ATP5MF','PSMA7','ENO1','AGTRAP','FABP6','KRT13','STAT1','GBP1','PARP14','SAMD9L','GTF3C6','ISG20','DDX60','DTX3L','APOL6','PARP9','OAS3','GNG11','SNHG25','MLKL','EIF2S2','OAF','MINOS1','SLIRP','NQO1','PYCR1','POLR3G','GCLM','HLA-DPA1','CTSC','G6PC3','STEAP3','TRIM22','CD52','LTB','ACP2','RTL6','SLC7A8','LGALS3BP','FBLIM1','HLA-DMA']}

sc.pl.dotplot(
    idata,
    edger_de2_panel,
    groupby="sample_ID_corrected",
    standard_scale="var",
    color_map="Reds",
    figsize=(20, 3),
    show=False,
)

In [None]:
## not raw
idatacp=idata
idatacp.raw=None
sc.pl.dotplot(
    idatacp,
    { #'DE':['BST2','CCL4','PLA2G16','CSRP2','CFH'],
    'DE2':['IFI44L','BST2','SAMD9','TMEM132A','HENMT1','CCL21','CD3D','LYZ','EIF4EBP1','AKR1B10','CDKN2A','CSRP2','SRM','TMSB10','CCL5','NETO2','EPSTI1','SNX10','OASL','CLEC7A','IFI27','APOL3','IFI6','ISG15','MX1','MYLK','KRT6C','CTSB','CXCR4','IFITM3','TYMP','IFIT3','IFI44','IFIT1','PSME2','ENO1','FABP6','KRT13','GBP1','ISG20','GNG11','SNHG25','OAF','SLIRP','HLA-DPA1','CTSC','TRIM22','CD52','LTB','SLC7A8','HLA-DMA']},
        #'TSK':["MMP10","PTHLH","FEZ1","IL24","KCNMA1","INHBA","MAGEA4","NT5E","LAMC2","SLITRK6","MMP1","SERPINE1","SERPINE2","IGFBP6","TNC","ITGA5","ECM1","SPINK6","TMSB10","KRT18","FST","ODC1","LAMB3","ITGA6","ACTB","ITGB1","SORL1","PFN1","EMP3","S100A10","EREG","LAMA3","OCIAD2","CTSV","ANXA3","S100A2","MET","CD99","TMSB4X","TIMP3","TPM4","NEFM","SCG5","SH3BGRL3","PLAU","PKM","CD63","LGALS1","BMP1","TNFRSF12A","ATP1B1","CFL1","MBOAT2","F3","TNFRSF21","CLIC1","CAP1","PDPN","SERINC2","ANXA1","BSG","DSG2","RHOC","KRT17","PDLIM7","PHLDA1","GLO1","LTBP1","TAGLN2","CD151","CAPN2","COL5A2","IGFBP2","P4HA2","COL17A1","PLEK2","AREG","ARPC1B","GLIPR1","S100A6","PLOD3","YWHAZ","CAV1","ADIRF","TGFBI","RBP1","FSTL3","C16orf74","UPP1","TNNT1","ANXA5","SDC1","PRDX5","MYL12A","GJB6","SLC7A8","RAB32","FXYD5","FSCN1"]},
    standard_scale="var",
    color_map="Reds",
    groupby=["cancer_status_corrected","sample_ID_corrected"], figsize=(20, 3) ,show=False
)

In [None]:
# mod scores from edgeR list

In [None]:
## Module scores for the edgeR 'DE2' gene set (this cell also defines the split_umap helper)
def split_umap(adata, split_by, ncol=2, nrow=None, **kwargs):
    """Plot one UMAP panel per category of ``adata.obs[split_by]``.

    Parameters
    ----------
    adata
        AnnData object; a per-category subset of it is passed to ``sc.pl.umap``.
    split_by
        Name of the ``adata.obs`` column whose categories define the panels.
        A non-categorical column is converted to categorical for the split
        only; ``adata.obs`` is not modified by this function.
    ncol
        Number of panel columns in the grid.
    nrow
        Number of panel rows. Defaults to the smallest number of rows that
        fits every category given ``ncol``.
    **kwargs
        Forwarded unchanged to ``sc.pl.umap`` (e.g. ``color``, ``size``,
        ``legend_loc``).

    Raises
    ------
    ValueError
        If there is no category to plot, or if ``nrow * ncol`` is smaller
        than the number of categories.

    Notes
    -----
    Nothing is returned; the figure is left as the current matplotlib figure.
    """
    groups = adata.obs[split_by]
    # `.cat` only exists on categorical columns; accept plain string/object columns too.
    if not isinstance(groups.dtype, pd.CategoricalDtype):
        groups = groups.astype("category")
    categories = groups.cat.categories

    if len(categories) == 0:
        raise ValueError(f"adata.obs[{split_by!r}] has no categories to plot")
    if nrow is None:
        nrow = int(np.ceil(len(categories) / ncol))
    if nrow * ncol < len(categories):
        raise ValueError(
            f"a {nrow}x{ncol} grid cannot hold {len(categories)} categories of {split_by!r}"
        )

    # squeeze=False always yields a 2-D array of axes, so flatten() also works
    # for a 1x1 grid (where plt.subplots would otherwise return a bare Axes).
    fig, axs = plt.subplots(nrow, ncol, figsize=(5*ncol, 4*nrow), squeeze=False)
    axs = axs.flatten()
    for ax, cat in zip(axs, categories):
        sc.pl.umap(adata[adata.obs[split_by] == cat], ax=ax, show=False, title=cat, **kwargs)
    # Hide grid cells that received no category instead of leaving empty frames.
    for ax in axs[len(categories):]:
        ax.set_visible(False)
    plt.tight_layout()


# Gene sets to score; each key becomes the name of the obs column holding the score.
gene_sets = {
    'DE2': ['RAB5IF','IFI44L','BST2','PITX1','SAMD9','IRF7','TMEM132A','HENMT1','CCL21','USP18','CTPS1','EEF1AKMT4','TIMM13','CKMT1B','COX7B','TPI1','RASIP1','MPZL1','CAPNS1','FCF1','CD3D','LYZ','EIF4EBP1','AKR1B10','CDKN2A','CSRP2','SRM','TMSB10','CCL5','NETO2','XAF1','EPSTI1','SNX10','OASL','CLEC7A','IFI27','APOL3','IFI6','ISG15','MX1','HERC6','OAS1','MYLK','S100A9','KRT6C','CTSB','CXCR4','U62317.1','LINC01006','SLC43A2','TNFRSF21','IFITM3','APOL1','TYMP','IFIT3','IFI44','IFIT1','OAS2','CBX3','PSME2','ATP5MF','PSMA7','ENO1','AGTRAP','FABP6','KRT13','STAT1','GBP1','PARP14','SAMD9L','GTF3C6','ISG20','DDX60','DTX3L','APOL6','PARP9','OAS3','GNG11','SNHG25','MLKL','EIF2S2','OAF','MINOS1','SLIRP','NQO1','PYCR1','POLR3G','GCLM','HLA-DPA1','CTSC','G6PC3','STEAP3','TRIM22','CD52','LTB','ACP2','RTL6','SLC7A8','LGALS3BP','FBLIM1','HLA-DMA'],
}

# Score every gene set on idata; results are written to idata.obs[<set name>].
# NOTE(review): use_raw is not passed, so whether .raw or .X is scored presumably
# depends on idata.raw being set at this point — confirm against the scanpy docs.
for set_name, set_genes in gene_sets.items():
    sc.tl.score_genes(idata, gene_list=set_genes, score_name=set_name)

# DE2 score on the UMAP, one panel per cancer_status_corrected category.
split_umap(idata, split_by='cancer_status_corrected', color=['DE2'], size=5)


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Score column being inspected. The plot labels and printed messages are built
# from this name; they previously said 'all_Cancer' although the data plotted
# is the 'DE2' score.
SCORE_COLUMN = 'DE2'
all_cancer_values = idata.obs[SCORE_COLUMN]

# Distribution of the score across all cells in idata.
plt.figure(figsize=(8, 5))
sns.histplot(all_cancer_values, kde=False, bins=30, color='blue', edgecolor='black')
plt.title(f'Histogram of {SCORE_COLUMN}', fontsize=14)
plt.xlabel(SCORE_COLUMN, fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.tight_layout()
plt.show()

# Upper-tail percentiles of the score (names kept for any later cell that reads them).
percentile_90 = np.percentile(all_cancer_values, 90)
percentile_95 = np.percentile(all_cancer_values, 95)
percentile_99 = np.percentile(all_cancer_values, 99)
percentile_995 = np.percentile(all_cancer_values, 99.5)

print(f"90th Percentile of '{SCORE_COLUMN}': {percentile_90}")
print(f"95th Percentile of '{SCORE_COLUMN}': {percentile_95}")
print(f"99th Percentile of '{SCORE_COLUMN}': {percentile_99}")
print(f"99.5th Percentile of '{SCORE_COLUMN}': {percentile_995}")

In [None]:
# A cell is called 'Cancer' when it lies in an 'overlap' aneuploid cluster AND its
# DE2 score exceeds the cut-off for the given level.
# NOTE(review): the four cut-offs are hard-coded; presumably they are rounded
# DE2 percentiles — confirm they still match the current scores.
in_overlap_cluster = idata.obs['Aneuploid_combined_clusters'] == 'overlap'
de2_score = idata.obs['DE2']

condition90p = in_overlap_cluster & (de2_score > 0.24)
condition95p = in_overlap_cluster & (de2_score > 0.31)
condition99p = in_overlap_cluster & (de2_score > 0.42)
condition99_5p = in_overlap_cluster & (de2_score > 0.47)

# Store the calls as 'Cancer' / 'Normal' labels, one obs column per cut-off.
idata.obs['eKC_cancer_all_DE_90p'] = np.where(condition90p, 'Cancer', 'Normal')
idata.obs['eKC_cancer_all_DE_95p'] = np.where(condition95p, 'Cancer', 'Normal')
idata.obs['eKC_cancer_all_DE_99p'] = np.where(condition99p, 'Cancer', 'Normal')
idata.obs['eKC_cancer_all_DE_99_5p'] = np.where(condition99_5p, 'Cancer', 'Normal')

# Label -> colour used on the UMAPs.
color_map = {'Cancer': 'red', 'Normal': 'lightgrey'}

ekc_label_columns = [
    'eKC_cancer_all_DE_90p',
    'eKC_cancer_all_DE_95p',
    'eKC_cancer_all_DE_99p',
    'eKC_cancer_all_DE_99_5p',
]
sc.pl.umap(idata, color=ekc_label_columns, palette=color_map, size=5)

# Number of cells labelled 'Cancer' at each cut-off.
cancer_count90p = int((idata.obs['eKC_cancer_all_DE_90p'] == 'Cancer').sum())
cancer_count95p = int((idata.obs['eKC_cancer_all_DE_95p'] == 'Cancer').sum())
cancer_count99p = int((idata.obs['eKC_cancer_all_DE_99p'] == 'Cancer').sum())
cancer_count99_5p = int((idata.obs['eKC_cancer_all_DE_99_5p'] == 'Cancer').sum())

print(f"Number of cells marked as 'Cancer' 90th percentile: {cancer_count90p}")
print(f"Number of cells marked as 'Cancer' 95th percentile: {cancer_count95p}")
print(f"Number of cells marked as 'Cancer' 99th percentile: {cancer_count99p}")
print(f"Number of cells marked as 'Cancer' 99.5th percentile: {cancer_count99_5p}")



In [None]:
# obs rows labelled 'Cancer' in KC_cancer_all_DE_99p, tabulated by their Level3_final value.
KC_cancer_all_DE_99p=idata.obs[idata.obs['KC_cancer_all_DE_99p'] == 'Cancer']
KC_cancer_all_DE_99p['Level3_final'].value_counts()

In [None]:
# Level3_Cancer counts within the KC_cancer_all_DE_99p 'Cancer' rows (printed),
# followed by the same counts over all of idata.obs (cell output) for comparison.
print(KC_cancer_all_DE_99p['Level3_Cancer'].value_counts())
idata.obs['Level3_Cancer'].value_counts()

In [None]:
# Rows labelled 'Cancer' in TSKKC_cancer_all_DE_99p, and how the eKC_* (99p / 95p)
# labels are distributed among them.
TSK_df=idata.obs[idata.obs['TSKKC_cancer_all_DE_99p'] == 'Cancer']
print(TSK_df['eKC_cancer_all_DE_99p'].value_counts())
print(TSK_df['eKC_cancer_all_DE_95p'].value_counts())


In [None]:
import matplotlib.pyplot as plt
import scanpy as sc

# Barcodes of the cells to highlight as 'TSK'.
TSK = ['CACAGTAGTCAGCTAT-3','CGTGTAACAAGACACG-3','GCTCTGTCATTGAGCT-3','GTCGTAAAGATAGTCA-3','TACTTACGTGATGCCC-3','TAAGCGTTCGCTTGTC-5','ACGGGTCTCGGAATCT-10','AGCGGTCCAAACGTGG-10','AGCTCCTAGGGCACTA-10','AGTAGTCCAAGCTGGA-10','ATCACGAGTCGCTTCT-10','CACACAATCCTAAGTG-10','CACCACTTCCACGTGG-10','CACCTTGAGTGTTTGC-10','CATCAGAGTCGAATCT-10','CCAGCGAAGTCGTACT-10','CCATTCGTCAATCACG-10','CCTAAAGAGAGCTGGT-10','CGCGGTAGTGCGGTAA-10','CGCTTCAGTAAATGAC-10','CGGAGTCTCAACCAAC-10','CGTCACTGTGCCTGGT-10','CGTGTCTGTGAGGCTA-10','CTAGCCTCACCGATAT-10','CTAGTGAGTAGGCTGA-10','CTCGTCACAGGTCTCG-10','CTTAGGATCCACGCAG-10','CTTCTCTGTCGACTGC-10','GCGACCAGTTTGACTG-10','GGCTGGTAGGATGTAT-10','GGGCACTCAATTCCTT-10','GGGTCTGTCCTCATTA-10','GTACTTTGTCTCCACT-10','TAAACCGAGTAGTGCG-10','TAAGAGACATGCAACT-10','TACAGTGCATTTCAGG-10','TGCGCAGCACCCTATC-10','TGGTTCCGTACAAGTA-10','TGTTCCGTCACCTCGT-10','TTGCGTCAGATACACA-10','TTTATGCCACTCGACG-10']

# Label each cell 'TSK' if its obs index is one of the barcodes above, 'none' otherwise.
TSK_set = set(TSK)
idata.obs['TSK'] = np.where(idata.obs.index.isin(TSK_set), 'TSK', 'none')

# Both layers are drawn on the same axes.
fig, ax = plt.subplots(figsize=(5, 5))

colors = {'TSK': 'red', 'none': 'lightgrey'}

# Background layer: unlabelled cells, small and light grey.
background_cells = idata[idata.obs['TSK'] == 'none']
sc.pl.umap(background_cells, color='TSK', size=1, palette={'none': 'lightgrey'}, ax=ax, show=False)

# Foreground layer: labelled cells, drawn second so they sit on top.
highlighted_cells = idata[idata.obs['TSK'] != 'none']
sc.pl.umap(highlighted_cells, color='TSK', size=4, palette=colors, ax=ax, show=False)


In [None]:
# Rows flagged 'TSK' in the TSK obs column, and how the eKC_* (99p / 95p) labels
# are distributed among them.
TSK_df_lt=idata.obs[idata.obs['TSK'] == 'TSK']
print(TSK_df_lt['eKC_cancer_all_DE_99p'].value_counts())
print(TSK_df_lt['eKC_cancer_all_DE_95p'].value_counts())


In [None]:
# python DE and edgeR consistency
pyDE=idata.obs[idata.obs['KC_cancer_all_DE_95p'] == 'Cancer']
print(pyDE['eKC_cancer_all_DE_95p'].value_counts())


# python DE and edgeR consistency
pyDE99=idata.obs[idata.obs['KC_cancer_all_DE_99p'] == 'Cancer']
print(pyDE99['eKC_cancer_all_DE_99p'].value_counts())
print(pyDE99['TSKKC_cancer_all_DE_99p'].value_counts())


In [None]:
# Among rows labelled 'Cancer' in eKC_cancer_all_DE_99p, count the TSKKC_* and KC_* 99p labels.
EDGER=idata.obs[idata.obs['eKC_cancer_all_DE_99p'] == 'Cancer']
print(EDGER['TSKKC_cancer_all_DE_99p'].value_counts())
print(EDGER['KC_cancer_all_DE_99p'].value_counts())



In [None]:
## CANCER IN EDGER , PYDE AND TSK

# Count cells that are 'Cancer' in all three 99p columns (eKC_*, TSKKC_*, KC_*)
cancer_cells_count = (
    (idata.obs['eKC_cancer_all_DE_99p'] == 'Cancer') &
    (idata.obs['TSKKC_cancer_all_DE_99p'] == 'Cancer') &
    (idata.obs['KC_cancer_all_DE_99p'] == 'Cancer')
).sum()



# Same count for the three 95p columns
cancer_cells_count95 = (
    (idata.obs['eKC_cancer_all_DE_95p'] == 'Cancer') &
    (idata.obs['TSKKC_cancer_all_DE_95p'] == 'Cancer') &
    (idata.obs['KC_cancer_all_DE_95p'] == 'Cancer')
).sum()


# Report each count together with its percentage of a fixed denominator.
# NOTE(review): 1306 and 291 are hard-coded; this cell does not show where they
# come from — presumably reference cell counts for the 95p / 99p sets. Confirm,
# and derive them from idata.obs so the percentages track the data.
print(f"Number of cells labeled as 'Cancer' in all three categories 95p: {cancer_cells_count95} ; {cancer_cells_count95/1306*100} %")
print(f"Number of cells labeled as 'Cancer' in all three categories 99p: {cancer_cells_count}; {cancer_cells_count/291*100} %")


In [None]:
# Create a new column based on the conditions
idata.obs['Cancer_across3modscores_99p'] = (
    (idata.obs['eKC_cancer_all_DE_99p'] == 'Cancer') &
    (idata.obs['TSKKC_cancer_all_DE_99p'] == 'Cancer') &
    (idata.obs['KC_cancer_all_DE_99p'] == 'Cancer')
).replace({True: 'Cancer', False: 'Normal'})

idata.obs['Cancer_across3modscores_95p'] = (
    (idata.obs['eKC_cancer_all_DE_95p'] == 'Cancer') &
    (idata.obs['TSKKC_cancer_all_DE_95p'] == 'Cancer') &
    (idata.obs['KC_cancer_all_DE_95p'] == 'Cancer')
).replace({True: 'Cancer', False: 'Normal'})


# Define a color map
color_map = {'Cancer': 'red', 'Normal': 'lightgrey'}
# Plot UMAP
sc.pl.umap(idata, color=['Cancer_across3modscores_95p','Cancer_across3modscores_99p'], palette=color_map, size=5)



In [None]:
# One UMAP panel per 'diagnosis_corrected' category, coloured by the 95p consensus label.
split_umap(idata, color = ['Cancer_across3modscores_95p'], split_by='diagnosis_corrected',legend_loc = "on data")

In [None]:
# Rows with consensus label 'Cancer' at 95p, tabulated by Level3_final.
Cancer_across3modscores_95p=idata.obs[idata.obs['Cancer_across3modscores_95p'] == 'Cancer']
Cancer_across3modscores_95p['Level3_final'].value_counts()

In [None]:
# Rows labelled 'Cancer' in KC_cancer_all_DE_99p, tabulated by Level3_final.
KC_cancer_all_DE_99p=idata.obs[idata.obs['KC_cancer_all_DE_99p'] == 'Cancer']
KC_cancer_all_DE_99p['Level3_final'].value_counts()

In [None]:
# Rows labelled 'Cancer' in eKC_cancer_all_DE_99p, tabulated by Level3_final.
eKC_cancer_all_DE_99p=idata.obs[idata.obs['eKC_cancer_all_DE_99p'] == 'Cancer']
eKC_cancer_all_DE_99p['Level3_final'].value_counts()

In [None]:
# Dot plot of the 'TSK' gene panel, grouped by the KC_cancer_all_DE_95p label,
# with per-gene (standard_scale="var") scaling.
sc.pl.dotplot(
    idata,
    { 'TSK':['PTHLH','FEZ1','INHBA','MMP1','SERPINE2','IGFBP6','TNC','ITGA5','SPINK6','TMSB10','KRT18','FST','ODC1','ITGA6','SORL1','EMP3','EREG','TIMP3','LGALS1','TNFRSF12A','ATP1B1','MBOAT2','F3','ANXA1','KRT17','PHLDA1','LTBP1','CAPN2','IGFBP2','COL17A1','AREG','ARPC1B','GLIPR1','S100A6','CAV1','ADIRF','RBP1','UPP1','SDC1','GJB6','SLC7A8','RAB32','FXYD5','FSCN1']},
        # Longer TSK gene list, kept commented out (not plotted):
        #'TSK':["MMP10","PTHLH","FEZ1","IL24","KCNMA1","INHBA","MAGEA4","NT5E","LAMC2","SLITRK6","MMP1","SERPINE1","SERPINE2","IGFBP6","TNC","ITGA5","ECM1","SPINK6","TMSB10","KRT18","FST","ODC1","LAMB3","ITGA6","ACTB","ITGB1","SORL1","PFN1","EMP3","S100A10","EREG","LAMA3","OCIAD2","CTSV","ANXA3","S100A2","MET","CD99","TMSB4X","TIMP3","TPM4","NEFM","SCG5","SH3BGRL3","PLAU","PKM","CD63","LGALS1","BMP1","TNFRSF12A","ATP1B1","CFL1","MBOAT2","F3","TNFRSF21","CLIC1","CAP1","PDPN","SERINC2","ANXA1","BSG","DSG2","RHOC","KRT17","PDLIM7","PHLDA1","GLO1","LTBP1","TAGLN2","CD151","CAPN2","COL5A2","IGFBP2","P4HA2","COL17A1","PLEK2","AREG","ARPC1B","GLIPR1","S100A6","PLOD3","YWHAZ","CAV1","ADIRF","TGFBI","RBP1","FSTL3","C16orf74","UPP1","TNNT1","ANXA5","SDC1","PRDX5","MYL12A","GJB6","SLC7A8","RAB32","FXYD5","FSCN1"]},
    standard_scale="var",
    color_map="Reds",
    groupby="KC_cancer_all_DE_95p", figsize=(10, 1) ,show=False
)

In [None]:
# 'TSK' gene panel, grouped by the KC_cancer_all_DE_99p label, with per-gene
# (standard_scale="var") scaling.
tsk_panel = {'TSK': ['PTHLH','FEZ1','INHBA','MMP1','SERPINE2','IGFBP6','TNC','ITGA5','SPINK6','TMSB10','KRT18','FST','ODC1','ITGA6','SORL1','EMP3','EREG','TIMP3','LGALS1','TNFRSF12A','ATP1B1','MBOAT2','F3','ANXA1','KRT17','PHLDA1','LTBP1','CAPN2','IGFBP2','COL17A1','AREG','ARPC1B','GLIPR1','S100A6','CAV1','ADIRF','RBP1','UPP1','SDC1','GJB6','SLC7A8','RAB32','FXYD5','FSCN1']}

sc.pl.dotplot(
    idata,
    tsk_panel,
    groupby="KC_cancer_all_DE_99p",
    standard_scale="var",
    color_map="Reds",
    figsize=(10, 1),
    show=False,
)

In [None]:
# One UMAP panel per 'diagnosis_corrected' category, coloured by the KC_cancer_all_DE_99p label.
split_umap(idata, color = ['KC_cancer_all_DE_99p'], split_by='diagnosis_corrected',legend_loc = "on data")


In [None]:
import pickle

# Serialise the whole idata object (including the obs columns added so far) with pickle.
# NOTE(review): a pickle is presumably only reloadable with compatible
# anndata/pandas versions — consider idata.write_h5ad for long-term storage; confirm.
with open('/QRISdata/Q4386/skin_atlas/SCC_BCC/KConly_with_mod_score_nov22_edgeR.pkl', 'wb') as f:
    pickle.dump(idata, f)


In [None]:
# Load an earlier KC object; the path is relative, so it resolves against the
# current working directory.
idata_old=anndata.read_h5ad("KC_reanalysis_15Oct.h5ad")
idata_old

In [None]:
# Inspect the raw matrix of idata_new.
# NOTE(review): idata_new is not assigned in the nearby cells (the preceding cell
# loads idata_old) — confirm where it is defined so this runs on a fresh kernel.
idata_new.raw.X

In [None]:
# Inspect the raw matrix currently attached to idata (fails if idata.raw is None).
idata.raw.X

In [None]:
# Rebuild idata.raw from idata_new.raw, restricted to the genes present in idata.
# NOTE(review): idata_new is not assigned in the nearby cells — confirm where it
# is defined so this runs on a fresh kernel.

# Genes shared by idata and idata_new.raw.
common_genes = idata.var.index.intersection(idata_new.raw.var.index)

# One boolean mask, in idata_new.raw column order, selects BOTH the matrix
# columns and the var rows. Selecting var with `.loc[common_genes]` instead
# would order the rows like `common_genes`, which need not match the column
# order of the masked matrix and would attach gene names to the wrong columns.
raw_gene_mask = idata_new.raw.var.index.isin(common_genes)
idata_new_raw_subset = idata_new.raw.X[:, raw_gene_mask]
idata_new_raw_var_subset = idata_new.raw.var.loc[raw_gene_mask]

# The matrix rows come from idata_new but are paired with idata.obs by position,
# so both objects must list the same cells in the same order.
if not idata.obs_names.equals(idata_new.obs_names):
    raise ValueError(
        "idata and idata_new do not contain the same cells in the same order; "
        "align them before copying raw counts across"
    )

# Now assign the subsetted data to idata.raw
idata.raw = sc.AnnData(X=idata_new_raw_subset,
                       var=idata_new_raw_var_subset,
                       obs=idata.obs)

# Check the updated raw data
print(idata.raw)


In [None]:
import pickle

# Serialise the whole idata object with pickle (second snapshot, different file name).
# NOTE(review): a pickle is presumably only reloadable with compatible
# anndata/pandas versions — consider idata.write_h5ad for long-term storage; confirm.
with open('/QRISdata/Q4386/skin_atlas/SCC_BCC/KConly_with_mod_score_nov21.pkl', 'wb') as f:
    pickle.dump(idata, f)


In [None]:
# Read the pickled object back into idata_loaded and display its summary.
# `pickle` must already be imported by an earlier cell.
# Unpickling executes code embedded in the file, so only load files you wrote yourself.
with open('/QRISdata/Q4386/skin_atlas/SCC_BCC/KConly_with_mod_score_nov21.pkl', 'rb') as f:
    idata_loaded = pickle.load(f)
idata_loaded

# <a id='section4'></a>  Whole object Paper figures

In [None]:
# Load the frozen whole-dataset object used for the paper figures.
# Uses read_h5ad, as the other loads in this notebook do; `anndata.read` is a
# deprecated alias for it.
scc = anndata.read_h5ad('/QRISdata/Q2051/SCC_Paper/resources/data/frozen_objects_Nov2024_PP/SCC_final_object_Nov10.h5ad')
scc

In [None]:
# UMAP coloured by Level2, written to PDF while the temporary rc settings are active.
level2_umap_rc = {"figure.figsize": (8, 6), "figure.dpi": (300)}
with plt.rc_context(rc=level2_umap_rc):
    sc.pl.umap(scc, color="Level2", legend_fontoutline=2, show=False)
    plt.savefig(
        "/scratch/project/stseq/Prakrithi/skin_atlas/reanalysis_figs/scc/Level2.pdf",
        bbox_inches="tight",
    )

In [None]:
# UMAP coloured by Level1_Final, written to PDF while the temporary rc settings are active.
level1_umap_rc = {"figure.figsize": (8, 6), "figure.dpi": (300)}
with plt.rc_context(rc=level1_umap_rc):
    sc.pl.umap(scc, color="Level1_Final", legend_fontoutline=2, show=False)
    plt.savefig(
        "/scratch/project/stseq/Prakrithi/skin_atlas/reanalysis_figs/scc/Level1.pdf",
        bbox_inches="tight",
    )

In [None]:
# UMAP coloured by Level3, written to PDF while the temporary rc settings are active.
level3_umap_rc = {"figure.figsize": (8, 6), "figure.dpi": (300)}
with plt.rc_context(rc=level3_umap_rc):
    sc.pl.umap(scc, color="Level3", legend_fontoutline=2, show=False)
    plt.savefig(
        "/scratch/project/stseq/Prakrithi/skin_atlas/reanalysis_figs/scc/Level3.pdf",
        bbox_inches="tight",
    )

In [None]:
# Distinct Level2 labels present in scc.obs.
scc.obs['Level2'].unique().tolist()

In [None]:
import pandas as pd

# Row order of the Level2 groups on the dot plot.
level2_order = [
    'NK', 'T Cell', 'B Cell', 'Plasma', 'LC', 'DC', 'Macrophage', 'Monocytes',
    'Melanocytes', 'Endothelial Cell', 'Fibroblast',
    'KC Cornified', 'KC Differentiating', 'KC Basal', 'KC Dysplastic', 'KC Hair', 'KC IFN',
]

# Recode Level2 as an ordered categorical following level2_order.
# Any label not listed in level2_order becomes NaN.
level2_dtype = pd.CategoricalDtype(categories=level2_order, ordered=True)
scc.obs['Level2'] = scc.obs['Level2'].astype(level2_dtype)

# Marker panel; some genes appear more than once in the list.
level2_markers = {
    "Category1": ['CTSW','KLRB1','NKG7','GNLY','CD52','CD3E','IL32','CD79A','MS4A1','PLAC8','JCHAIN','IRF7','CD207','CST3','LYZ','BASP1','CD83','CD74','TYROBP',
                  'CD68','AIF1','FCER1G',"CD14", "S100A8", "S100A9", "FCN1", "CX3CR1",'DCT','MLANA','TYRP1','PMEL','GNG11','IGFBP7','RAMP2','PECAM1','EGFL7','COL1A1','COL1A2','COL6A2',
                  'DCN','PLAC9','SBSN','KRT2','DSC1','KRT15','KRT10','KRTDAP','PKP1','KRT14','KRT5','KRT15','IFI27','S100A8','KRT6A'],
}

# Draw the dot plot in the custom group order and save it as PDF.
with plt.rc_context({"figure.figsize": (8, 8), "figure.dpi": (300)}):
    sc.pl.dotplot(
        scc,
        level2_markers,
        groupby="Level2",
        standard_scale="var",
        color_map="Reds",
        figsize=(18, 6),
        show=False,
    )
    plt.savefig("/scratch/project/stseq/Prakrithi/skin_atlas/reanalysis_figs/scc/Level2_dotplot.pdf", bbox_inches="tight")
#

In [None]:
# Experimental cell: attempt to draw a Level2 dot plot in which two gene groups
# get different colours (green vs blue).
import scanpy as sc
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

# Define the genes for green and blue
green_genes = ['CTSW', 'KLRB1', 'CD79A', 'IRF7', 'CD14', 'FCER1G']
blue_genes = ['CD3E', 'GNLY', 'MS4A1', 'TYROBP', 'S100A8', 'PECAM1']

# Combine all genes into one list for plotting
all_genes = green_genes + blue_genes

# One colour name per gene, in all_genes order, wrapped in a ListedColormap.
# NOTE(review): color_map presumably maps the plotted expression value to a
# colour, so this colormap would be applied per value rather than per gene
# column — confirm this gives the intended result.
colors = ['green' if gene in green_genes else 'blue' for gene in all_genes]
cmap = ListedColormap(colors)

# Create the dot plot using the custom color map
sc.pl.dotplot(
    scc,
    var_names=all_genes,
    groupby="Level2",
    standard_scale="var",
    color_map=cmap,  # Apply custom color map here
    figsize=(18, 6),
    show=False  # Delay showing for further customization
)

# Grab matplotlib's current figure and axes after the dotplot call.
# NOTE(review): plt.gca() returns whichever axes is current, which may not be
# the main dot-plot panel — confirm.
fig = plt.gcf()
ax = plt.gca()

# Recolour each patch on that axes with the colour at the same position in `colors`.
# NOTE(review): `colors` has one entry per gene (12); colors[i] raises IndexError
# if ax.patches holds more items than that — confirm what ax.patches contains here.
for i, patch in enumerate(ax.patches):
    patch.set_facecolor(colors[i])  # Set the color for each gene based on the colors list

# Show the plot
plt.show()


In [None]:
# Experimental cell: Level2 dot plot whose dots are recoloured with a green or
# blue gradient depending on the gene group.
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

# Define the genes for green and blue
green_genes = ['CTSW', 'KLRB1', 'CD79A', 'IRF7', 'CD14', 'FCER1G']
blue_genes = ['CD3E', 'GNLY', 'MS4A1', 'TYROBP', 'S100A8', 'PECAM1']

# Combine all genes into one list for plotting
all_genes = green_genes + blue_genes

# Generate the dotplot using Scanpy (this will not yet apply the custom gradients)
sc.pl.dotplot(
    scc,
    var_names=all_genes,
    groupby="Level2",
    standard_scale="var",
    figsize=(18, 6),
    show=False  # Delay showing for further customization
)

# Grab matplotlib's current figure and axes after the dotplot call.
# NOTE(review): plt.gca() returns whichever axes is current, which may not be
# the main dot-plot panel — confirm.
fig = plt.gcf()
ax = plt.gca()

# Expression matrix and the gene index THAT BELONGS TO IT. scc.raw can hold a
# different gene set/order than scc, so column positions must be looked up in
# the var_names of the same object the matrix comes from; looking them up in
# scc.var_names while reading scc.raw.X would pick the wrong column.
expression_source = scc.raw if scc.raw is not None else scc
expression_data = expression_source.X
expression_var_names = expression_source.var_names

# Normalize the expression data to scale between 0 and 1 for color mapping
norm = Normalize(vmin=expression_data.min(), vmax=expression_data.max())

# Recolour each patch using the mean expression of the gene at the same position.
# NOTE(review): this assumes patch i corresponds to all_genes[i]; all_genes[i]
# raises IndexError if ax.patches holds more than len(all_genes) items — confirm
# what ax.patches contains for a dot plot.
for i, patch in enumerate(ax.patches):
    gene = all_genes[i]  # Get the gene corresponding to the current patch
    gene_column = expression_var_names.get_loc(gene)
    expression_value = expression_data[:, gene_column].mean()  # Mean expression over all cells

    # Apply the gradient color for green genes (using Greens colormap)
    if gene in green_genes:
        color = plt.cm.Greens(norm(expression_value))  # Green gradient for green genes
    # Apply the gradient color for blue genes (using Blues colormap)
    elif gene in blue_genes:
        color = plt.cm.Blues(norm(expression_value))  # Blue gradient for blue genes

    # Set the face color for the dot based on the gene's expression value and assigned gradient
    patch.set_facecolor(color)

# Show the plot with updated gradients
plt.show()


# <a id='section5'></a>  Splitting SCC/BCC

In [None]:
# Distinct sample_ID values present in scc.obs.
scc.obs['sample_ID'].unique().tolist()

In [None]:
# Split scc by sample: B18_adata gets the 'P5_SCC_BCC' sample only,
# SCC_only_adata gets every other sample. Both are views of scc.
is_p5_scc_bcc = scc.obs['sample_ID'] == 'P5_SCC_BCC'

B18_adata = scc[is_p5_scc_bcc]
SCC_only_adata = scc[~is_p5_scc_bcc]


In [None]:
# Split scc by sample: B18_adata gets 'P5_SCC_BCC' together with 'P5_N',
# SCC_only_adata gets every other sample. Both are views of scc.
# Running this cell overwrites any earlier B18_adata / SCC_only_adata.
b18_samples = ['P5_SCC_BCC', 'P5_N']
in_b18_samples = scc.obs['sample_ID'].isin(b18_samples)

B18_adata = scc[in_b18_samples]
SCC_only_adata = scc[~in_b18_samples]


In [None]:

# UMAP of the SCC_only_adata subset, coloured by sample.
sc.pl.umap(
    SCC_only_adata,  # subset of scc without the B18 sample(s)
    color='sample_ID',  # colour each cell by its sample_ID
    show=True,  # render the plot immediately
)

In [None]:
# Display scc.X as a dense array.
# NOTE(review): this converts the entire matrix to dense just for display —
# presumably large; consider slicing before calling toarray().
scc.X.toarray()  #

In [None]:
# Slice first (10 rows x 15 columns) so only the small corner is densified
subset_sparse = scc.X[:10, :15]

# Then, convert to a dense array
subset_dense = subset_sparse.toarray()

# View the subset
print(subset_dense)


In [None]:
# Display the per-gene annotation table of scc.
scc.var

In [None]:
# Write the two split objects to .h5ad files.
# NOTE(review): SCC_only_andNormaladata is not assigned in the visible cells before
# this one (the splitting cells create SCC_only_adata; this name is only assigned
# by the read_h5ad cell that follows), so on a fresh top-to-bottom run the second
# line would raise NameError — confirm which object was meant to be written.
# NOTE(review): B18_adata is assigned by two different splitting cells (with and
# without 'P5_N'); confirm which definition this file is expected to contain.
B18_adata.write('/QRISdata/Q2051/SCC_Paper/resources/data/frozen_objects_Nov2024_PP/B18_only_SCC_BCC_reference/B18_cancer_only.h5ad')
SCC_only_andNormaladata.write('/QRISdata/Q2051/SCC_Paper/resources/data/frozen_objects_Nov2024_PP/SCC_only_reference/SCConly_Normal.h5ad')

In [None]:
# Reload the two split objects from their .h5ad files (replaces the in-memory B18_adata).
B18_adata=anndata.read_h5ad('/QRISdata/Q2051/SCC_Paper/resources/data/frozen_objects_Nov2024_PP/B18_only_SCC_BCC_reference/B18_cancer_only.h5ad')
SCC_only_andNormaladata=anndata.read_h5ad('/QRISdata/Q2051/SCC_Paper/resources/data/frozen_objects_Nov2024_PP/SCC_only_reference/SCConly_Normal.h5ad')

In [None]:
# Samples contained in the reloaded B18 object.
B18_adata.obs['sample_ID'].unique()

In [None]:
# Samples contained in the reloaded SCC + normal object.
SCC_only_andNormaladata.obs['sample_ID'].unique()

In [None]:
# Keep every cell whose sample is not one of the listed '_N' samples, then show
# which samples remain. SCC_cancer is a view of SCC_only_andNormaladata.
normal_sample_ids = ['P1_N','P2_N','P3_N','P4_N', 'P5_N']
from_normal_sample = SCC_only_andNormaladata.obs['sample_ID'].isin(normal_sample_ids)

SCC_cancer = SCC_only_andNormaladata[~from_normal_sample]
SCC_cancer.obs['sample_ID'].unique()

In [None]:
# Write the cancer-only subset; the path is relative, so the file lands in the
# current working directory (the trailing comment records another directory).
SCC_cancer.write('SCC_cancer_only.h5ad') #/QRISdata/Q2051/SCC_Paper/resources/data/frozen_objects_Nov2024_PP/SCC_only_reference/


In [None]:
# Keep only the cells from the listed '_N' samples and write them to disk.
Normal = SCC_only_andNormaladata[SCC_only_andNormaladata.obs['sample_ID'].isin(['P1_N','P2_N','P3_N','P4_N', 'P5_N'])]
# This expression is not the last line of the cell, so its value is not displayed.
Normal.obs['sample_ID'].unique()
# Relative path: the file lands in the current working directory (the trailing
# comment records another directory).
Normal.write('Normal_only.h5ad') #/QRISdata/Q2051/SCC_Paper/resources/data/frozen_objects_Nov2024_PP/SCC_only_reference/


In [None]:
# Samples contained in the Normal subset.
Normal.obs['sample_ID'].unique()
