# Script for getting gene markers for different cell types

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scanpy as sc
import numpy as np
import scanpy.external as sce

In [2]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80, facecolor='white', frameon=True, figsize=(5, 5))

-----
anndata     0.9.1
scanpy      1.9.3
-----
PIL                 9.5.0
asciitree           NA
asttokens           NA
backcall            0.2.0
cloudpickle         2.2.1
colorama            0.4.6
comm                0.1.3
cycler              0.10.0
cython_runtime      NA
dask                2023.7.0
dateutil            2.8.2
debugpy             1.6.7
decorator           5.1.1
entrypoints         0.4
executing           1.2.0
fasteners           0.18
h5py                3.9.0
igraph              0.10.5
ipykernel           6.23.3
jedi                0.18.2
jinja2              3.1.2
joblib              1.3.0
kiwisolver          1.4.4
leidenalg           0.10.0
llvmlite            0.40.1
markupsafe          2.1.3
matplotlib          3.7.1
mpl_toolkits        NA
msgpack             1.0.5
natsort             8.4.0
numba               0.57.1
numcodecs           0.11.0
numpy               1.23.4
packaging           23.1
pandas              2.0.3
parso               0.8.3
patsy               

In [5]:
# Fetching the list of genes and cell types from Gene Atlas
tmp = pd.read_csv('resources/tissue_category_rna_Any_Tissue.tsv', sep = '\t')

In [7]:
tmp.head(4)

Unnamed: 0,Gene,Gene synonym,Ensembl,Gene description,Uniprot,Chromosome,Position,Protein class,Biological process,Molecular function,...,Pathology prognostics - Lung cancer,Pathology prognostics - Melanoma,Pathology prognostics - Ovarian cancer,Pathology prognostics - Pancreatic cancer,Pathology prognostics - Prostate cancer,Pathology prognostics - Renal cancer,Pathology prognostics - Stomach cancer,Pathology prognostics - Testis cancer,Pathology prognostics - Thyroid cancer,Pathology prognostics - Urothelial cancer
0,A1BG,,ENSG00000121410,Alpha-1-B glycoprotein,P04217,19,58345178-58353492,"Plasma proteins, Predicted intracellular prote...",,,...,unprognostic (1.09e-1),unprognostic (2.59e-1),unprognostic (2.10e-1),unprognostic (1.47e-2),unprognostic (1.37e-2),unprognostic (4.19e-5),unprognostic (2.37e-2),unprognostic (1.94e-1),unprognostic (1.72e-1),unprognostic (6.72e-2)
1,A1CF,"ACF, ACF64, ACF65, APOBEC1CF, ASP",ENSG00000148584,APOBEC1 complementation factor,Q9NQ94,10,50799409-50885675,Predicted intracellular proteins,mRNA processing,RNA-binding,...,unprognostic (7.38e-3),,unprognostic (1.30e-2),unprognostic (2.46e-2),unprognostic (1.20e-1),unprognostic (1.90e-3),unprognostic (1.97e-2),unprognostic (2.77e-1),unprognostic (2.19e-2),unprognostic (8.50e-4)
2,A2M,"CPAMD5, FWP007, S863-7",ENSG00000175899,Alpha-2-macroglobulin,P01023,12,9067664-9116229,"Cancer-related genes, Candidate cardiovascular...",,"Protease inhibitor, Serine protease inhibitor",...,unprognostic (3.65e-2),unprognostic (2.38e-1),unprognostic (7.19e-2),unprognostic (4.71e-2),unprognostic (2.06e-2),unprognostic (1.28e-2),unprognostic (8.04e-3),unprognostic (2.32e-2),unprognostic (8.58e-2),unprognostic (9.03e-3)
3,A2ML1,"CPAMD9, FLJ25179, p170",ENSG00000166535,Alpha-2-macroglobulin like 1,A8K2U0,12,8822621-8887001,"Disease related genes, Predicted intracellular...",,"Protease inhibitor, Serine protease inhibitor",...,unprognostic (7.58e-3),unprognostic (2.63e-1),unprognostic (1.57e-1),unprognostic (1.15e-3),unprognostic (2.03e-1),unprognostic (1.06e-9),unprognostic (2.28e-1),unprognostic (3.07e-1),unprognostic (5.88e-2),unprognostic (2.42e-2)


In [10]:
# Interesting columns
tmp[['RNA tissue specificity','RNA tissue distribution', 'RNA tissue specificity score',
       'RNA tissue specific nTPM', 'RNA single cell type specificity',
       'RNA single cell type distribution',
       'RNA single cell type specificity score','RNA single cell type specific nTPM']].head(4)

Unnamed: 0,RNA tissue specificity,RNA tissue distribution,RNA tissue specificity score,RNA tissue specific nTPM,RNA single cell type specificity,RNA single cell type distribution,RNA single cell type specificity score,RNA single cell type specific nTPM
0,Tissue enriched,Detected in single,1513.0,liver: 1194.2,Cell type enhanced,Detected in many,,Hepatocytes: 18.2;Oligodendrocytes: 8.1;Plasma...
1,Tissue enriched,Detected in some,6.0,liver: 148.5,Cell type enhanced,Detected in some,,Cholangiocytes: 152.6;Distal enterocytes: 91.2...
2,Tissue enhanced,Detected in all,,liver: 1212.9;lung: 1626.5,Cell type enhanced,Detected in many,,Adipocytes: 1247.2;Cardiomyocytes: 393.1;Endot...
3,Tissue enhanced,Detected in some,,esophagus: 540.7;vagina: 198.6,Group enriched,Detected in some,5.0,Squamous epithelial cells: 157.1;Suprabasal ke...


In [14]:
# Select genes + Distribution + celltype&Score (need to be separted)
tmp2 = tmp[["Gene","RNA single cell type distribution","RNA single cell type specific nTPM"]]
tmp2.head(4)

Unnamed: 0,Gene,RNA single cell type distribution,RNA single cell type specific nTPM
0,A1BG,Detected in many,Hepatocytes: 18.2;Oligodendrocytes: 8.1;Plasma...
1,A1CF,Detected in some,Cholangiocytes: 152.6;Distal enterocytes: 91.2...
2,A2M,Detected in many,Adipocytes: 1247.2;Cardiomyocytes: 393.1;Endot...
3,A2ML1,Detected in some,Squamous epithelial cells: 157.1;Suprabasal ke...


In [15]:
# Split the celltype and score into different columns and make new df
tmp2 = pd.concat([tmp2.Gene,tmp2["RNA single cell type distribution"],tmp2['RNA single cell type specific nTPM'].str.split(';', expand = True)],axis = 1)
tmp2.head(4)

Unnamed: 0,Gene,RNA single cell type distribution,0,1,2,3,4,5,6,7,8,9,10
0,A1BG,Detected in many,Hepatocytes: 18.2,Oligodendrocytes: 8.1,Plasma cells: 8.2,,,,,,,,
1,A1CF,Detected in some,Cholangiocytes: 152.6,Distal enterocytes: 91.2,Enteroendocrine cells: 92.5,Hepatocytes: 340.5,Intestinal goblet cells: 59.4,Paneth cells: 90.4,Proximal enterocytes: 148.7,Proximal tubular cells: 91.8,,,
2,A2M,Detected in many,Adipocytes: 1247.2,Cardiomyocytes: 393.1,Endothelial cells: 861.7,Hepatocytes: 654.0,Microglial cells: 339.1,Smooth muscle cells: 471.0,,,,,
3,A2ML1,Detected in some,Squamous epithelial cells: 157.1,Suprabasal keratinocytes: 477.0,,,,,,,,,


In [37]:
#Select cell types of interest, create new df with genes, scores, distribution for specific cell types
for j in ['B-cells','Cardiomyocytes','Endothelial cells','Macrophages','T-cells','Fibroblasts']: #[, 'Monocytes',  'Myeloid cells', 'Lymphoid cells',  ]:

    temp1 = pd.DataFrame()
    for i in range(tmp2.columns[-1]):
        tmp2[i] = tmp2[i].fillna("")
        
        temp = tmp2[tmp2[i].str.contains(f"{j}")]
        
        temp.rename(columns={i:"Cell Type"}, inplace = True)
        #print(temp.iloc[:,[0,1,i+2]])
        temp1 = pd.concat([temp1,temp.iloc[:,[0,1,i+2]]], axis = 0, ignore_index = True)
        #print(temp1.head(4))
    temp = pd.concat([temp1.Gene,temp1["RNA single cell type distribution"],temp1['Cell Type'].str.split(':', expand = True)], axis = 1)
    temp.rename(columns={0:"Cell Type", 1:"Score"}, inplace = True)
    temp.to_csv(f'{j}_human_atlas.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp.rename(columns={i:"Cell Type"}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp.rename(columns={i:"Cell Type"}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp.rename(columns={i:"Cell Type"}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp.rena

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp.rename(columns={i:"Cell Type"}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp.rename(columns={i:"Cell Type"}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp.rename(columns={i:"Cell Type"}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp.rena

In [38]:
test = pd.read_csv('B-cells_human_atlas.csv')
test

Unnamed: 0,Gene,RNA single cell type distribution,Cell Type,Score
0,ADAM28,Detected in many,B-cells,223.9
1,ADGRG5,Detected in some,B-cells,6.2
2,ADORA2A,Detected in some,B-cells,6.1
3,AICDA,Detected in some,B-cells,1.3
4,AIM2,Detected in some,B-cells,70.0
...,...,...,...,...
300,MFNG,Detected in many,B-cells,33.0
301,SLC2A3,Detected in many,B-cells,326.6
302,SNX22,Detected in many,B-cells,38.8
303,TNFSF9,Detected in many,B-cells,51.8
