Connected to downstream (Python 3.11.11)

In [None]:
import os
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from matplotlib import colors
from matplotlib import rcParams
import seaborn as sns
from gprofiler import GProfiler
import sys

In [None]:
import configparser

# Read the shared project configuration (path is relative to this notebook).
CONFIG_FILE = "../../utils/config.ini"
config = configparser.ConfigParser()
# ConfigParser.read() skips unreadable files and returns the list of files it
# parsed, so fail here with a clear message instead of a NoOptionError below.
if not config.read(CONFIG_FILE):
    raise FileNotFoundError(f"Could not read configuration file: {CONFIG_FILE}")

# Each option is read exactly once.
rawPath = config.get("DEFAULT", "rawPath")
scriptsPath = config.get("DEFAULT", "scriptsPath")
figPath = config.get("DEFAULT", "figPath")
utilsPath = config.get("DEFAULT", "utilsPath")

# os.path.join gives the same result whether or not the configured base path
# ends with a separator.
initDir = os.path.join(rawPath, 'metacells_step0/fibroblasts/')
outDir = os.path.join(rawPath, 'integration/metacells/fibroblasts_testing/')
ooseDir = os.path.join(rawPath, 'out_of_sample_extension/fibroblasts_testing/')
genes = os.path.join(scriptsPath, '4_hdg/Tables/atlas_hdg_dispersion_patients_fibroblasts.csv')

In [None]:
# Integrated object that is split into reference and query subsets further down.
# The path gets its own name so `adata` only ever refers to the AnnData.
adata_path = ooseDir + 'integrated_query_seacells_scarches_tissuetreat_predicted_cellstates.h5ad'
adata = sc.read_h5ad(adata_path)

# Objects whose `.raw` is attached to the reference / query subsets below.
# NOTE(review): absolute cluster paths; consider deriving them from config.ini.
ad_raw_ref = sc.read_h5ad("/group/testa/Project/OvarianAtlas/atlas_project/raw_data/integration_backup/integration/metacells/fibroblasts/seacells_hdg_patients_batch_corr_scgen_celltypes_HDG.h5ad")
ad_raw_query = sc.read_h5ad("/group/testa/Project/OvarianAtlasTestStep0/raw_data/integration/metacells/fibroblasts/seacells_hdg_patients_batch_corr_scgen_celltypes_HDG.h5ad")

In [None]:
# Query observations are the ones whose names start with "new"; everything
# else is reference. `.copy()` turns the views into real objects so that
# assigning `.raw` does not trigger an implicit view-to-copy conversion.
is_query = adata.obs_names.str.startswith("new")
adata_ref = adata[~is_query].copy()
adata_query = adata[is_query].copy()

# Add the "new_" prefix only where it is missing, so re-running this cell does
# not stack prefixes ("new_new_...").
ad_raw_query.obs_names = [
    name if name.startswith("new_") else "new_" + name
    for name in ad_raw_query.obs_names
]

# The `.raw` setter only accepts an AnnData ("Can only init raw attribute with
# an AnnData object"), so the Raw container is converted with `.to_adata()`.
# NOTE(review): assumes the observations of ad_raw_ref / ad_raw_query are in the
# same order as adata_ref / adata_query — confirm.
adata_ref.raw = ad_raw_ref.raw.to_adata()
adata_query.raw = ad_raw_query.raw.to_adata()

ValueError: Can only init raw attribute with an AnnData object.

In [None]:
def adata_by_tissue(adata, min_cells=10):
    """Split an AnnData-like object into per-tissue subsets.

    Parameters
    ----------
    adata
        Object exposing an ``obs`` table with a ``"tissue"`` column and
        supporting boolean-mask indexing (``adata[mask]``).
    min_cells
        A tissue is kept only if it has strictly more than this many
        observations (default 10, the previously hard-coded threshold).

    Returns
    -------
    dict
        Maps each retained tissue label to ``adata[mask]``.
    """
    subsets = {}  # distinct name: the original local shadowed the function itself
    tissues = adata.obs["tissue"]
    for tissue in tissues.unique():
        mask = tissues == tissue
        # Vectorised count instead of a Python-level sum over the Series.
        if mask.sum() > min_cells:
            subsets[tissue] = adata[mask]
    return subsets

# Per-tissue subsets of the reference and of the query.
adata_ref_by_tissue, adata_query_by_tissue = (
    adata_by_tissue(adata_ref),
    adata_by_tissue(adata_query),
)

: 

In [None]:
# Tissues that were retained for both the reference and the query.
both = adata_ref_by_tissue.keys() & adata_query_by_tissue.keys()


def _keep_frequent_states(adata_tissue, key, min_cells=10):
    """Return a copy of `adata_tissue` limited to `obs[key]` labels with more than `min_cells` observations."""
    counts = adata_tissue.obs[key].value_counts()
    frequent = counts[counts > min_cells].index
    # Copy (not a view): rank_genes_groups stores its results in `.uns`, and
    # writing into a view forces an implicit copy with a warning.
    return adata_tissue[adata_tissue.obs[key].isin(frequent)].copy()


for tissue in both:
    adata_ref_by_tissue[tissue] = _keep_frequent_states(adata_ref_by_tissue[tissue], "cell_states")
    adata_query_by_tissue[tissue] = _keep_frequent_states(adata_query_by_tissue[tissue], "predicted_cell_states")

# Differential expression per tissue, computed on `.raw` (use_raw=True).
for tissue in both:
    sc.tl.rank_genes_groups(adata_ref_by_tissue[tissue], groupby="cell_states", method="wilcoxon", use_raw=True)
    sc.tl.rank_genes_groups(adata_query_by_tissue[tissue], groupby="predicted_cell_states", method="wilcoxon", use_raw=True)
## 2nd strategy

: 

In [None]:
def extract_degs(adata, min_abs_lfc=1, max_abs_lfc=100, max_pval_adj=0.05, key="rank_genes_groups"):
    """Build one filtered differential-expression table per group.

    Parameters
    ----------
    adata
        Object whose ``uns[key]`` holds the ``names``, ``pvals_adj``,
        ``scores`` and ``logfoldchanges`` entries, one column per group.
    min_abs_lfc, max_abs_lfc
        Keep rows with ``min_abs_lfc < |logfoldchange| < max_abs_lfc``
        (defaults 1 and 100, the previously hard-coded bounds).
    max_pval_adj
        Keep rows with adjusted p-value strictly below this value
        (default 0.05).
    key
        Entry of ``adata.uns`` to read (default ``"rank_genes_groups"``).

    Returns
    -------
    dict
        Group name -> DataFrame with columns ``names``, ``pvals_adj``,
        ``scores``, ``logfoldchanges``. Rows keep their original positional
        index from the ranking.
    """
    ranks = adata.uns[key]
    names = pd.DataFrame(ranks["names"])
    pvals = pd.DataFrame(ranks["pvals_adj"])
    scores = pd.DataFrame(ranks["scores"])
    change = pd.DataFrame(ranks["logfoldchanges"])

    dfs = {}
    for group in change.columns:
        table = pd.concat([names[group], pvals[group], scores[group], change[group]], axis=1)
        table.columns = ["names", "pvals_adj", "scores", "logfoldchanges"]
        # |lfc| in the open interval (min, max); NaN and +/-inf fail the test.
        # NOTE(review): the upper bound presumably drops numerically unstable
        # fold changes — confirm the intended cut-off.
        abs_lfc = table["logfoldchanges"].abs()
        keep = (
            (abs_lfc > min_abs_lfc)
            & (abs_lfc < max_abs_lfc)
            & (table["pvals_adj"] < max_pval_adj)
        )
        dfs[group] = table[keep]
    return dfs

: 

In [None]:
# One dict of filtered DEG tables per tissue, for the query and for the reference.
ranks_query, ranks_ref = {}, {}
for tissue in both:
    ranks_query[tissue] = extract_degs(adata_query_by_tissue[tissue])
for tissue in both:
    ranks_ref[tissue] = extract_degs(adata_ref_by_tissue[tissue])
from collections.abc import MutableMapping

def flatten(dictionary, parent_key='', separator='_'):
    """Collapse a nested mapping into a single-level dict.

    Keys of nested mappings are joined to their parent key with `separator`;
    non-mapping values are stored unchanged under the joined key.
    """
    flat = {}
    for key, value in dictionary.items():
        if parent_key:
            new_key = parent_key + separator + key
        else:
            new_key = key
        if not isinstance(value, MutableMapping):
            flat[new_key] = value
            continue
        # Nested mapping: recurse with the joined key as the new prefix.
        flat.update(flatten(value, new_key, separator=separator))
    return flat
# Collapse {tissue: {group: table}} into {"<tissue>_<group>": table}.
ranks_query, ranks_ref = flatten(ranks_query), flatten(ranks_ref)

: 

In [None]:
# Flattened keys that have a DEG table in both the query and the reference.
# Sorted so that iteration order (and therefore the row order of anything built
# by looping over `both`) does not depend on per-session string hashing.
both = sorted(ranks_query.keys() & ranks_ref.keys())
#both = [b for b in both if not b.endswith("_up") and not b.endswith("_down")]

: 

In [None]:
def run_gprof(query, background):
    """Run a g:Profiler enrichment for `query` against a custom `background`.

    Uses organism 'hsapiens' and the GO:CC, GO:BP, GO:MF, REAC and KEGG
    sources; evidence columns are requested (no_evidences=False). The result
    is whatever `GProfiler(return_dataframe=True).profile` returns.
    """
    profiler = GProfiler(return_dataframe=True)
    settings = dict(
        organism='hsapiens',
        query=query,
        no_evidences=False,
        background=background,
        sources=['GO:CC', 'GO:BP', 'GO:MF', 'REAC', 'KEGG'],
    )
    return profiler.profile(**settings)

def wrap_gprof(ranks, cluster, background):
    """Enrich the genes of `ranks[cluster]`, skipping the query when the table is empty.

    For an empty table an empty DataFrame with the expected result columns is
    returned; otherwise the "names" column is sent to `run_gprof`.
    """
    degs = ranks[cluster]
    if not degs.empty:
        return run_gprof(degs["names"].to_list(), background)
    # Placeholder with the same columns as a real result, so callers can treat
    # both cases uniformly.
    empty_columns = [
        "source", "native", "name", "p_value", "significant", "description",
        "term_size", "query_size", "intersection_size", "effective_domain_size",
        "precision", "recall", "query", "parents", "intersections", "evidences",
    ]
    return pd.DataFrame(columns=empty_columns)

# The DEG tables come from rank_genes_groups(..., use_raw=True), so the genes
# that were actually tested are those of `.raw`; use them as the enrichment
# background rather than `.var_names` of the integrated matrix.
# Built once here instead of on every loop iteration.
background_query = adata_query.raw.var_names.to_list()
background_ref = adata_ref.raw.var_names.to_list()

# cluster key -> (query enrichment table, reference enrichment table)
enrichment = {}
for cluster in both:
    enrichment[cluster] = (
        wrap_gprof(ranks_query, cluster, background_query),
        wrap_gprof(ranks_ref, cluster, background_ref),
    )

: 

In [None]:
import pandas as pd

# One summary record per cluster present in `enrichment`.
results = []

for cluster, (query_df, ref_df) in enrichment.items():
    # Unique enriched term names on each side.
    query_names = set(query_df['name'])
    ref_names = set(ref_df['name'])
    total_query = len(query_names)
    total_ref = len(ref_names)

    # Shared terms, as a count and as a percentage of the smaller term set
    # (0 when either side has no terms).
    overlap = len(query_names.intersection(ref_names))
    smaller = min(total_query, total_ref)
    perc = overlap / smaller * 100 if smaller else 0

    # Keys are "<tissue>_<cell state>": split at the first underscore only,
    # since cell-state labels contain underscores themselves.
    cluster_tissue, cluster_cell_state = cluster.partition("_")[::2]

    # Number of observations carrying this state in each per-tissue object.
    query_states = adata_query_by_tissue[cluster_tissue].obs.predicted_cell_states
    ref_states = adata_ref_by_tissue[cluster_tissue].obs.cell_states
    no_cells_query = (query_states == cluster_cell_state).sum()
    no_cells_ref = (ref_states == cluster_cell_state).sum()

    results.append(dict(
        cluster=cluster,
        overlap=overlap,
        total_query=total_query,
        total_ref=total_ref,
        no_cells_query=no_cells_query,
        no_cells_ref=no_cells_ref,
        percentage=perc,
    ))

# Assemble the records into a table.
overlap_df = pd.DataFrame(results)

# Write the table to a CSV file in the current working directory.
overlap_df.to_csv("fibroblasts_overlap_filter.csv")

: 

In [None]:
# Query observations are the ones whose names start with "new"; everything
# else is reference. `.copy()` turns the views into real objects so that
# assigning `.raw` does not trigger an implicit view-to-copy conversion.
is_query = adata.obs_names.str.startswith("new")
adata_ref = adata[~is_query].copy()
adata_query = adata[is_query].copy()

# Add the "new_" prefix only where it is missing, so re-running this cell does
# not stack prefixes ("new_new_...").
ad_raw_query.obs_names = [
    name if name.startswith("new_") else "new_" + name
    for name in ad_raw_query.obs_names
]

# The `.raw` setter only accepts an AnnData, so the Raw container is converted
# with `.to_adata()` first.
# NOTE(review): assumes the observations of ad_raw_ref / ad_raw_query are in the
# same order as adata_ref / adata_query — confirm.
adata_ref.raw = ad_raw_ref.raw.to_adata()
adata_query.raw = ad_raw_query.raw.to_adata()

In [None]:
# Check that a Raw container is now attached to the reference subset.
adata_ref.raw

<anndata._core.raw.Raw at 0x1554f9a8f550>

In [None]:
# Materialise `.raw` as an AnnData to inspect its dimensions and annotations.
adata_ref.raw.to_adata()

AnnData object with n_obs × n_vars = 6639 × 17063
    obs: 'ID', 'sample_name', 'tissue', 'developmental_stage', 'treatment', 'recurrence', 'tumor_stage', 'paper_ID', 'anatomical_location', 'patient_id', 'dataset', 'cell_type', 'cell_subtype', 'sample_ID', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'CancerMSK', 'EndothelialMSK', 'FibroblastsMSK', 'HematopoieticMSK', 'cell_labels_ratio', 'max', 'assignment', 'leiden-1.8', 'SEACell', '# Single Cells', 'tissue-treatment', 'cell_types', 'reference_map', 'original_batch', 'primary_state', 'ascites_state', 'metastasis_state', 'cell_states', 'ref', 'predicted_cell_states'
    var: 'highly_variable'
    uns: 'cell_states_colors', 'cell_type_colors', 'dataset_colors', 'neighbors', 'paper_ID_colors', 'pca', 'tissue-treatment_colors', 'tissue_colors', 'treatment_colors', 'umap'
    obsm: 'X_pca', 'X_umap', 'latent_corrected'
    obsp: 'connectivities', 'distances'

In [None]:
def adata_by_tissue(adata, min_cells=10):
    """Split an AnnData-like object into per-tissue subsets.

    Parameters
    ----------
    adata
        Object exposing an ``obs`` table with a ``"tissue"`` column and
        supporting boolean-mask indexing (``adata[mask]``).
    min_cells
        A tissue is kept only if it has strictly more than this many
        observations (default 10, the previously hard-coded threshold).

    Returns
    -------
    dict
        Maps each retained tissue label to ``adata[mask]``.
    """
    subsets = {}  # distinct name: the original local shadowed the function itself
    tissues = adata.obs["tissue"]
    for tissue in tissues.unique():
        mask = tissues == tissue
        # Vectorised count instead of a Python-level sum over the Series.
        if mask.sum() > min_cells:
            subsets[tissue] = adata[mask]
    return subsets

# Per-tissue subsets of the reference and of the query.
adata_ref_by_tissue, adata_query_by_tissue = (
    adata_by_tissue(adata_ref),
    adata_by_tissue(adata_query),
)

In [None]:
# Tissues that were retained for both the reference and the query.
both = adata_ref_by_tissue.keys() & adata_query_by_tissue.keys()


def _keep_frequent_states(adata_tissue, key, min_cells=10):
    """Return a copy of `adata_tissue` limited to `obs[key]` labels with more than `min_cells` observations."""
    counts = adata_tissue.obs[key].value_counts()
    frequent = counts[counts > min_cells].index
    # Copy (not a view): rank_genes_groups stores its results in `.uns`, and
    # writing into a view forces an implicit copy with a warning.
    return adata_tissue[adata_tissue.obs[key].isin(frequent)].copy()


for tissue in both:
    adata_ref_by_tissue[tissue] = _keep_frequent_states(adata_ref_by_tissue[tissue], "cell_states")
    adata_query_by_tissue[tissue] = _keep_frequent_states(adata_query_by_tissue[tissue], "predicted_cell_states")

# Differential expression per tissue, computed on `.raw` (use_raw=True).
for tissue in both:
    sc.tl.rank_genes_groups(adata_ref_by_tissue[tissue], groupby="cell_states", method="wilcoxon", use_raw=True)
    sc.tl.rank_genes_groups(adata_query_by_tissue[tissue], groupby="predicted_cell_states", method="wilcoxon", use_raw=True)
## 2nd strategy

  adata.uns[key_added] = {}
  adata.uns[key_added] = {}
  adata.uns[key_added] = {}
  adata.uns[key_added] = {}
  adata.uns[key_added] = {}
  adata.uns[key_added] = {}


In [None]:
def extract_degs(adata, min_abs_lfc=1, max_abs_lfc=100, max_pval_adj=0.05, key="rank_genes_groups"):
    """Build one filtered differential-expression table per group.

    Parameters
    ----------
    adata
        Object whose ``uns[key]`` holds the ``names``, ``pvals_adj``,
        ``scores`` and ``logfoldchanges`` entries, one column per group.
    min_abs_lfc, max_abs_lfc
        Keep rows with ``min_abs_lfc < |logfoldchange| < max_abs_lfc``
        (defaults 1 and 100, the previously hard-coded bounds).
    max_pval_adj
        Keep rows with adjusted p-value strictly below this value
        (default 0.05).
    key
        Entry of ``adata.uns`` to read (default ``"rank_genes_groups"``).

    Returns
    -------
    dict
        Group name -> DataFrame with columns ``names``, ``pvals_adj``,
        ``scores``, ``logfoldchanges``. Rows keep their original positional
        index from the ranking.
    """
    ranks = adata.uns[key]
    names = pd.DataFrame(ranks["names"])
    pvals = pd.DataFrame(ranks["pvals_adj"])
    scores = pd.DataFrame(ranks["scores"])
    change = pd.DataFrame(ranks["logfoldchanges"])

    dfs = {}
    for group in change.columns:
        table = pd.concat([names[group], pvals[group], scores[group], change[group]], axis=1)
        table.columns = ["names", "pvals_adj", "scores", "logfoldchanges"]
        # |lfc| in the open interval (min, max); NaN and +/-inf fail the test.
        # NOTE(review): the upper bound presumably drops numerically unstable
        # fold changes — confirm the intended cut-off.
        abs_lfc = table["logfoldchanges"].abs()
        keep = (
            (abs_lfc > min_abs_lfc)
            & (abs_lfc < max_abs_lfc)
            & (table["pvals_adj"] < max_pval_adj)
        )
        dfs[group] = table[keep]
    return dfs

In [None]:
# One dict of filtered DEG tables per tissue, for the query and for the reference.
ranks_query, ranks_ref = {}, {}
for tissue in both:
    ranks_query[tissue] = extract_degs(adata_query_by_tissue[tissue])
for tissue in both:
    ranks_ref[tissue] = extract_degs(adata_ref_by_tissue[tissue])
from collections.abc import MutableMapping

def flatten(dictionary, parent_key='', separator='_'):
    """Collapse a nested mapping into a single-level dict.

    Keys of nested mappings are joined to their parent key with `separator`;
    non-mapping values are stored unchanged under the joined key.
    """
    flat = {}
    for key, value in dictionary.items():
        if parent_key:
            new_key = parent_key + separator + key
        else:
            new_key = key
        if not isinstance(value, MutableMapping):
            flat[new_key] = value
            continue
        # Nested mapping: recurse with the joined key as the new prefix.
        flat.update(flatten(value, new_key, separator=separator))
    return flat
# Collapse {tissue: {group: table}} into {"<tissue>_<group>": table}.
ranks_query, ranks_ref = flatten(ranks_query), flatten(ranks_ref)

In [None]:
# Flattened keys that have a DEG table in both the query and the reference.
# Sorted so that iteration order (and therefore the row order of anything built
# by looping over `both`) does not depend on per-session string hashing.
both = sorted(ranks_query.keys() & ranks_ref.keys())
#both = [b for b in both if not b.endswith("_up") and not b.endswith("_down")]

In [None]:
def run_gprof(query, background):
    """Run a g:Profiler enrichment for `query` against a custom `background`.

    Uses organism 'hsapiens' and the GO:CC, GO:BP, GO:MF, REAC and KEGG
    sources; evidence columns are requested (no_evidences=False). The result
    is whatever `GProfiler(return_dataframe=True).profile` returns.
    """
    profiler = GProfiler(return_dataframe=True)
    settings = dict(
        organism='hsapiens',
        query=query,
        no_evidences=False,
        background=background,
        sources=['GO:CC', 'GO:BP', 'GO:MF', 'REAC', 'KEGG'],
    )
    return profiler.profile(**settings)

def wrap_gprof(ranks, cluster, background):
    """Enrich the genes of `ranks[cluster]`, skipping the query when the table is empty.

    For an empty table an empty DataFrame with the expected result columns is
    returned; otherwise the "names" column is sent to `run_gprof`.
    """
    degs = ranks[cluster]
    if not degs.empty:
        return run_gprof(degs["names"].to_list(), background)
    # Placeholder with the same columns as a real result, so callers can treat
    # both cases uniformly.
    empty_columns = [
        "source", "native", "name", "p_value", "significant", "description",
        "term_size", "query_size", "intersection_size", "effective_domain_size",
        "precision", "recall", "query", "parents", "intersections", "evidences",
    ]
    return pd.DataFrame(columns=empty_columns)

# The DEG tables come from rank_genes_groups(..., use_raw=True), so the genes
# that were actually tested are those of `.raw`; use them as the enrichment
# background rather than `.var_names` of the integrated matrix.
# Built once here instead of on every loop iteration.
background_query = adata_query.raw.var_names.to_list()
background_ref = adata_ref.raw.var_names.to_list()

# cluster key -> (query enrichment table, reference enrichment table)
enrichment = {}
for cluster in both:
    enrichment[cluster] = (
        wrap_gprof(ranks_query, cluster, background_query),
        wrap_gprof(ranks_ref, cluster, background_ref),
    )

In [None]:
import pandas as pd

# One summary record per cluster present in `enrichment`.
results = []

for cluster, (query_df, ref_df) in enrichment.items():
    # Unique enriched term names on each side.
    query_names = set(query_df['name'])
    ref_names = set(ref_df['name'])
    total_query = len(query_names)
    total_ref = len(ref_names)

    # Shared terms, as a count and as a percentage of the smaller term set
    # (0 when either side has no terms).
    overlap = len(query_names.intersection(ref_names))
    smaller = min(total_query, total_ref)
    perc = overlap / smaller * 100 if smaller else 0

    # Keys are "<tissue>_<cell state>": split at the first underscore only,
    # since cell-state labels contain underscores themselves.
    cluster_tissue, cluster_cell_state = cluster.partition("_")[::2]

    # Number of observations carrying this state in each per-tissue object.
    query_states = adata_query_by_tissue[cluster_tissue].obs.predicted_cell_states
    ref_states = adata_ref_by_tissue[cluster_tissue].obs.cell_states
    no_cells_query = (query_states == cluster_cell_state).sum()
    no_cells_ref = (ref_states == cluster_cell_state).sum()

    results.append(dict(
        cluster=cluster,
        overlap=overlap,
        total_query=total_query,
        total_ref=total_ref,
        no_cells_query=no_cells_query,
        no_cells_ref=no_cells_ref,
        percentage=perc,
    ))

# Assemble the records into a table.
overlap_df = pd.DataFrame(results)

# Write the table to a CSV file in the current working directory.
overlap_df.to_csv("fibroblasts_overlap_filter.csv")

In [None]:
# Show the per-cluster overlap summary.
overlap_df

Unnamed: 0,cluster,overlap,total_query,total_ref,no_cells_query,no_cells_ref,percentage
0,Ascites_ECM_shaping_cells,0,0,151,25,29,0.0
1,Metastasis_Cycling_cells,4,4,199,80,79,100.0
2,Primary_Cellular_metabolism,0,39,53,612,584,0.0
3,Metastasis_Cellular_metabolism,0,0,63,894,772,0.0
4,Metastasis_Epithelium_development-cell_division,0,2,27,96,116,0.0
5,Ascites_Cellular_metabolism,50,94,59,117,76,84.745763
6,Ascites_Cellular_metabolism-ECM,51,67,70,38,49,76.119403
7,Primary_ECM_shaping_cells,0,0,81,400,474,0.0
8,Metastasis_Vascular_processes_regulation,0,0,64,237,307,0.0
9,Primary_Smooth_muscle_cells_development,0,0,63,258,267,0.0


In [None]:
# Show the keys shared by ranks_query and ranks_ref.
both

{'Ascites_Angiogenesis',
 'Ascites_Cellular_metabolism',
 'Ascites_Cellular_metabolism-ECM',
 'Ascites_Cycling_cells',
 'Ascites_ECM_shaping_cells',
 'Ascites_Extracellular_tissue_development',
 'Ascites_RNA_metabolism',
 'Ascites_Unknown_ascites',
 'Metastasis_Angiogenesis',
 'Metastasis_Cellular_metabolism',
 'Metastasis_Collagen_degradation',
 'Metastasis_Cycling_cells',
 'Metastasis_ECM_shaping_cells',
 'Metastasis_Epithelium_development-cell_division',
 'Metastasis_Immunoreactive_cells-T_cells',
 'Metastasis_Protein_catabolism',
 'Metastasis_Smooth_muscle_cells_development',
 'Metastasis_Unknown_metastasis',
 'Metastasis_Vascular_processes_regulation',
 'Primary_Angiogenesis',
 'Primary_Cellular_metabolism',
 'Primary_Cycling_cells',
 'Primary_ECM_shaping_cells',
 'Primary_Epithelium_development',
 'Primary_Immunoreactive_cells',
 'Primary_Protein_metabolism-cell_death',
 'Primary_Response_to_stress-ROS',
 'Primary_Smooth_muscle_cells_development',
 'Primary_Unknown_primary'}

In [None]:
# List the flattened keys of the query DEG tables.
ranks_query.keys()

dict_keys(['Ascites_Angiogenesis', 'Ascites_Cellular_metabolism', 'Ascites_Cellular_metabolism-ECM', 'Ascites_Cycling_cells', 'Ascites_ECM_shaping_cells', 'Ascites_Extracellular_tissue_development', 'Ascites_Protein_catabolism', 'Ascites_RNA_metabolism', 'Ascites_Smooth_muscle_cells_development', 'Ascites_Unknown_ascites', 'Ascites_Unknown_primary', 'Metastasis_Angiogenesis', 'Metastasis_Cellular_metabolism', 'Metastasis_Collagen_degradation', 'Metastasis_Cycling_cells', 'Metastasis_ECM_shaping_cells', 'Metastasis_Epithelium_development', 'Metastasis_Epithelium_development-cell_division', 'Metastasis_Immunoreactive_cells', 'Metastasis_Immunoreactive_cells-T_cells', 'Metastasis_Protein_catabolism', 'Metastasis_Protein_metabolism-cell_death', 'Metastasis_Smooth_muscle_cells_development', 'Metastasis_Unknown_ascites', 'Metastasis_Unknown_metastasis', 'Metastasis_Unknown_primary', 'Metastasis_Vascular_processes_regulation', 'Primary_Angiogenesis', 'Primary_Cellular_metabolism', 'Primary_Co

In [None]:
# List the flattened keys of the reference DEG tables.
ranks_ref.keys()

dict_keys(['Ascites_Cellular_metabolism', 'Ascites_ECM_shaping_cells', 'Ascites_Cycling_cells', 'Ascites_Angiogenesis', 'Ascites_Extracellular_tissue_development', 'Ascites_Unknown_ascites', 'Ascites_Cellular_metabolism-ECM', 'Ascites_RNA_metabolism', 'Metastasis_Cellular_metabolism', 'Metastasis_ECM_shaping_cells', 'Metastasis_Smooth_muscle_cells_development', 'Metastasis_Cycling_cells', 'Metastasis_Angiogenesis', 'Metastasis_Collagen_degradation', 'Metastasis_Protein_catabolism', 'Metastasis_Unknown_metastasis', 'Metastasis_Vascular_processes_regulation', 'Metastasis_Immunoreactive_cells-T_cells', 'Metastasis_Epithelium_development-cell_division', 'Primary_Cellular_metabolism', 'Primary_ECM_shaping_cells', 'Primary_Unknown_primary', 'Primary_Smooth_muscle_cells_development', 'Primary_Immunoreactive_cells', 'Primary_Protein_metabolism-cell_death', 'Primary_Epithelium_development', 'Primary_Cycling_cells', 'Primary_Angiogenesis', 'Primary_Response_to_stress-ROS'])

In [None]:
# Show the shared keys again, for comparison with the two key listings above.
both

{'Ascites_Angiogenesis',
 'Ascites_Cellular_metabolism',
 'Ascites_Cellular_metabolism-ECM',
 'Ascites_Cycling_cells',
 'Ascites_ECM_shaping_cells',
 'Ascites_Extracellular_tissue_development',
 'Ascites_RNA_metabolism',
 'Ascites_Unknown_ascites',
 'Metastasis_Angiogenesis',
 'Metastasis_Cellular_metabolism',
 'Metastasis_Collagen_degradation',
 'Metastasis_Cycling_cells',
 'Metastasis_ECM_shaping_cells',
 'Metastasis_Epithelium_development-cell_division',
 'Metastasis_Immunoreactive_cells-T_cells',
 'Metastasis_Protein_catabolism',
 'Metastasis_Smooth_muscle_cells_development',
 'Metastasis_Unknown_metastasis',
 'Metastasis_Vascular_processes_regulation',
 'Primary_Angiogenesis',
 'Primary_Cellular_metabolism',
 'Primary_Cycling_cells',
 'Primary_ECM_shaping_cells',
 'Primary_Epithelium_development',
 'Primary_Immunoreactive_cells',
 'Primary_Protein_metabolism-cell_death',
 'Primary_Response_to_stress-ROS',
 'Primary_Smooth_muscle_cells_development',
 'Primary_Unknown_primary'}

In [None]:
# Dump every query DEG table.
# NOTE(review): very large output; inspecting a single key is usually enough.
ranks_query

{'Ascites_Angiogenesis':           names  pvals_adj    scores  logfoldchanges
 0       ADAMTS1   0.000084  5.735214        2.633999
 1            F3   0.000084  5.734114        2.752410
 2        FILIP1   0.000195  5.515221        2.117325
 3      ARHGAP26   0.000195  5.400825        1.949493
 4         FXYD1   0.000195  5.325477        2.802981
 ...         ...        ...       ...             ...
 6566      TEX14   0.049334  2.345674        1.086946
 6578     PDLIM3   0.049747  2.341824        1.045981
 6582      CD79B   0.049856  2.340724        1.361407
 17061   SLC34A2   0.027700 -2.631665      -28.409525
 17062    STXBP2   0.017481 -2.855507       -1.697774
 
 [4967 rows x 4 columns],
 'Ascites_Cellular_metabolism':          names     pvals_adj    scores  logfoldchanges
 0         EHD4  8.894864e-11  7.660808        1.765478
 1      BHLHE40  8.894864e-11  7.652471        1.766478
 2         USP4  8.894864e-11  7.625077        1.479364
 3         CERK  8.894864e-11  7.601851      

In [None]:
# Query DEG table for one cluster key.
ranks_query["Metastasis_Collagen_degradation"]

Unnamed: 0,names,pvals_adj,scores,logfoldchanges
4,C1QA,0.0002,5.415452,1.149794
10,CD69,0.0002,5.279945,1.195483
12,C5orf46,0.000236,5.219256,1.118903
13,SPP1,0.000278,5.175065,1.154198
16,TYROBP,0.000544,4.999718,1.074417
24,CD52,0.002222,4.630545,1.298924
28,GZMA,0.002584,4.566311,1.167824
29,CD2,0.002584,4.56091,1.225482
32,RGS1,0.003681,4.471525,1.175726
34,CCL4,0.004043,4.434202,1.203421


In [None]:
# Reference DEG table for the same cluster key, for side-by-side comparison.
ranks_ref["Metastasis_Collagen_degradation"]

Unnamed: 0,names,pvals_adj,scores,logfoldchanges
0,COL11A1,1.241825e-35,13.003778,1.643303
1,MMP11,2.954120e-31,12.114691,1.447345
2,COL10A1,2.431170e-29,11.713821,1.182566
3,INHBA,7.152709e-27,11.160969,1.265542
4,NTM,2.521344e-25,10.803183,1.058459
...,...,...,...,...
17058,ADAMTS1,3.864728e-41,-14.022684,-1.650585
17059,CDC42EP4,1.738494e-41,-14.095017,-1.782606
17060,RERG,2.141902e-44,-14.579988,-2.050480
17061,SLCO3A1,2.139498e-44,-14.607718,-1.283337


In [None]:
# Dump every query DEG table.
# NOTE(review): very large output; inspecting a single key is usually enough.
ranks_query

{'Ascites_Angiogenesis':           names  pvals_adj    scores  logfoldchanges
 0       ADAMTS1   0.000084  5.735214        2.633999
 1            F3   0.000084  5.734114        2.752410
 2        FILIP1   0.000195  5.515221        2.117325
 3      ARHGAP26   0.000195  5.400825        1.949493
 4         FXYD1   0.000195  5.325477        2.802981
 ...         ...        ...       ...             ...
 6566      TEX14   0.049334  2.345674        1.086946
 6578     PDLIM3   0.049747  2.341824        1.045981
 6582      CD79B   0.049856  2.340724        1.361407
 17061   SLC34A2   0.027700 -2.631665      -28.409525
 17062    STXBP2   0.017481 -2.855507       -1.697774
 
 [4967 rows x 4 columns],
 'Ascites_Cellular_metabolism':          names     pvals_adj    scores  logfoldchanges
 0         EHD4  8.894864e-11  7.660808        1.765478
 1      BHLHE40  8.894864e-11  7.652471        1.766478
 2         USP4  8.894864e-11  7.625077        1.479364
 3         CERK  8.894864e-11  7.601851      

In [None]:
# Dump every reference DEG table.
# NOTE(review): very large output; inspecting a single key is usually enough.
ranks_ref

{'Ascites_Cellular_metabolism':          names     pvals_adj     scores  logfoldchanges
 0         RPN2  5.387669e-40  13.862329        3.758276
 1         HM13  5.387669e-40  13.861666        3.835921
 2         PDPN  5.387669e-40  13.857024        4.307160
 3        DDOST  5.387669e-40  13.845088        3.897570
 4        PDIA6  5.387669e-40  13.834479        3.795961
 ...        ...           ...        ...             ...
 13554  C6orf58  4.972111e-02   2.058860        2.683695
 13555    CXCR6  4.991768e-02   2.057202        1.885497
 17060    CCL21  4.330871e-02  -2.116548      -27.883936
 17061    CCL19  2.617183e-02  -2.315472       -7.100766
 17062      PLN  1.988917e-02  -2.418912      -28.252712
 
 [13368 rows x 4 columns],
 'Ascites_ECM_shaping_cells':            names     pvals_adj    scores  logfoldchanges
 0        THEMIS2  1.419244e-09  7.465194        3.862937
 1          SNTB1  1.721114e-09  7.347641        4.321960
 2          CMTM7  6.511272e-09  7.052724        2.77

In [None]:
# Preview a previously exported ranking table.
# NOTE(review): absolute cluster path; consider deriving it from config.ini.
pd.read_csv("/group/testa/Project/OvarianAtlas/atlas_project/raw_data/downstream_backup/downstream/clustering/fibroblasts/metastasis/leiden-0.41/rank_gene_groups_df_0.csv")

Unnamed: 0.1,Unnamed: 0,names,scores,logfoldchanges,pvals,pvals_adj
0,0,RUNX2,26.541820,2.986330,3.191964e-155,5.446448e-151
1,1,CREB3L1,26.434372,3.025981,5.519194e-154,4.708700e-150
2,2,GJB2,26.253714,3.252881,6.482652e-152,3.687116e-148
3,3,ZNF469,26.215220,2.919343,1.782287e-151,7.602789e-148
4,4,TMEM158,26.120594,3.072555,2.127757e-150,7.261184e-147
...,...,...,...,...,...,...
17058,17058,TNXB,-6.011717,-1.342610,1.835685e-09,2.806909e-09
17059,17059,IL6R,-6.048578,-1.071474,1.461298e-09,2.236045e-09
17060,17060,PLA2G2A,-6.461748,-1.772861,1.035006e-10,1.602860e-10
17061,17061,INMT,-6.968938,-1.610685,3.193415e-12,5.007281e-12


In [None]:
# Load the exported ranking table into `r0` for the threshold check below.
# NOTE(review): absolute cluster path; consider deriving it from config.ini.
r0 = pd.read_csv("/group/testa/Project/OvarianAtlas/atlas_project/raw_data/downstream_backup/downstream/clustering/fibroblasts/metastasis/leiden-0.41/rank_gene_groups_df_0.csv")

In [None]:
# Boolean mask over the rows of `r0`: 1 < |logfoldchange| < 100 and adjusted
# p-value < 0.05. The enclosing parentheses let the expression span several
# lines; without them the first line ends in a dangling `&` (SyntaxError).
r0_filtered = (
    ((r0['logfoldchanges'] > 1) | (r0['logfoldchanges'] < -1))
    & (r0['logfoldchanges'] < 100)
    & (r0['logfoldchanges'] > -100)
    & (r0['pvals_adj'] < 0.05)
)


SyntaxError: invalid syntax (<ipython-input-27-882dc2388778>, line 1)

In [None]:
# Boolean mask over the rows of `r0`: fold change beyond +/-1 but within
# +/-100, and adjusted p-value below 0.05.
lfc = r0['logfoldchanges']
beyond_one = (lfc > 1) | (lfc < -1)
within_hundred = (lfc < 100) & (lfc > -100)
r0_filtered = beyond_one & within_hundred & (r0['pvals_adj'] < 0.05)


In [None]:
# Show the boolean mask (True = row passes the thresholds).
r0_filtered

0        True
1        True
2        True
3        True
4        True
         ... 
17058    True
17059    True
17060    True
17061    True
17062    True
Length: 17063, dtype: bool

In [None]:
# Number of rows of `r0` that pass the thresholds.
r0_filtered.sum()

9998

In [None]:
# Show the per-cluster overlap summary again.
overlap_df

Unnamed: 0,cluster,overlap,total_query,total_ref,no_cells_query,no_cells_ref,percentage
0,Ascites_ECM_shaping_cells,0,0,151,25,29,0.0
1,Metastasis_Cycling_cells,4,4,199,80,79,100.0
2,Primary_Cellular_metabolism,0,39,53,612,584,0.0
3,Metastasis_Cellular_metabolism,0,0,63,894,772,0.0
4,Metastasis_Epithelium_development-cell_division,0,2,27,96,116,0.0
5,Ascites_Cellular_metabolism,50,94,59,117,76,84.745763
6,Ascites_Cellular_metabolism-ECM,51,67,70,38,49,76.119403
7,Primary_ECM_shaping_cells,0,0,81,400,474,0.0
8,Metastasis_Vascular_processes_regulation,0,0,64,237,307,0.0
9,Primary_Smooth_muscle_cells_development,0,0,63,258,267,0.0


In [None]:
# Reference DEG table for one cluster key.
ranks_ref["Metastasis_Collagen_degradation"]

Unnamed: 0,names,pvals_adj,scores,logfoldchanges
0,COL11A1,1.241825e-35,13.003778,1.643303
1,MMP11,2.954120e-31,12.114691,1.447345
2,COL10A1,2.431170e-29,11.713821,1.182566
3,INHBA,7.152709e-27,11.160969,1.265542
4,NTM,2.521344e-25,10.803183,1.058459
...,...,...,...,...
17058,ADAMTS1,3.864728e-41,-14.022684,-1.650585
17059,CDC42EP4,1.738494e-41,-14.095017,-1.782606
17060,RERG,2.141902e-44,-14.579988,-2.050480
17061,SLCO3A1,2.139498e-44,-14.607718,-1.283337


In [None]:
# Show the boolean mask (True = row passes the thresholds).
r0_filtered

0        True
1        True
2        True
3        True
4        True
         ... 
17058    True
17059    True
17060    True
17061    True
17062    True
Length: 17063, dtype: bool

In [None]:
# Number of rows of `r0` that pass the thresholds.
r0_filtered.sum()

9998

In [None]:
#%%

import os
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from matplotlib import colors
from matplotlib import rcParams
import seaborn as sns
from gprofiler import GProfiler
import sys

In [None]:
#%%

import os
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from matplotlib import colors
from matplotlib import rcParams
import seaborn as sns
from gprofiler import GProfiler
import sys

In [None]:
#%%

import os
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from matplotlib import colors
from matplotlib import rcParams
import seaborn as sns
from gprofiler import GProfiler
import sys