# a python script for creating the counts and cell annotation files for the SyS dataset that inferCNV will use (the GTF file was xreated in R)

In [1]:
import pandas as pd 
import numpy as np
import anndata as ad
import scanpy as sc

In [2]:
sys_counts = pd.read_csv("gs://rebecca-summer23/sys_jerby_data/GSM3770931_SyS.tumors_counts.csv")


In [3]:
sys_counts

Unnamed: 0.1,Unnamed: 0,SS7CD3posP1_G09,SS7CD3posP1_C03,SS7CD45posP1_G07,SS7CD3posP1_A01,SS7CD3posP1_E02,SS5CD45posP1_D02,SS7CD45posP1_G04,SS7CD3posP1_A10,SS5CD45posP1_B04,...,SS1posP2_C02,SS5posP2_B02,SS1posP2_B01,SS5P10_E01,SS5posP2_F05,SS1posP2_C05,SS5posP2_H09,SS5posP2_H12,SS5posP2_D12,SS5posP2_D02
0,C9orf152,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,RPS11,141,358,141,316,0,737,247,236,531,...,0,130,121,163,83,309,1069,348,379,468
2,ELMO2,476,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CREB3L1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,PNMA1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,263,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23681,PIK3IP1,280,0,0,0,0,0,0,0,698,...,7,447,191,0,479,260,0,269,0,0
23682,SNRPD2,0,0,0,64,0,2,50,0,0,...,0,0,36,0,0,19,147,109,2,0
23683,SLC39A6,0,0,0,844,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23684,CTSC,0,0,2,367,430,170,0,0,2,...,1,1,35,1,0,293,240,1,0,38


In [4]:
#set index correctly
sys_counts = sys_counts.set_index("Unnamed: 0").rename_axis(None, axis=0)

In [5]:
#number of genes that are not zero everywhere
np.sum(sys_counts.sum(axis=1)!=0)

22618

In [47]:
pd.DataFrame(sys_counts,
             index = sys_counts.index, columns = sys_counts.columns).to_csv(
    "data/infercnv_input_files/sys/counts.txt", sep="\t")




In [48]:
cell_annots = pd.read_csv("gs://rebecca-summer23/sys_jerby_data/GSM3770931_SyS.tumors_cell.annotations.csv")

In [49]:
cell_annots

Unnamed: 0,Sample name,title,source name,organism,characteristics: sample,characteristics: cell.type,characteristics: tag,molecule,description,processed data file,raw file
0,Cell_1,SS7CD3posP1_G09,Synovial sarcoma tumor,Homo sapiens,SyS7,NK,CD45+,RNA,Full-length RNAseq (SMART-Seq2) reads,,
1,Cell_2,SS7CD3posP1_C03,Synovial sarcoma tumor,Homo sapiens,SyS7,B.cell,CD45+,RNA,Full-length RNAseq (SMART-Seq2) reads,,
2,Cell_3,SS7CD45posP1_G07,Synovial sarcoma tumor,Homo sapiens,SyS7,B.cell,CD45+,RNA,Full-length RNAseq (SMART-Seq2) reads,,
3,Cell_4,SS7CD3posP1_A01,Synovial sarcoma tumor,Homo sapiens,SyS7,T.CD8,CD45+,RNA,Full-length RNAseq (SMART-Seq2) reads,,
4,Cell_5,SS7CD3posP1_E02,Synovial sarcoma tumor,Homo sapiens,SyS7,NK,CD45+,RNA,Full-length RNAseq (SMART-Seq2) reads,,
...,...,...,...,...,...,...,...,...,...,...,...
6946,Cell_6947,SS1posP2_C05,Synovial sarcoma tumor,Homo sapiens,SyS1,Macrophage,CD45+,RNA,Full-length RNAseq (SMART-Seq2) reads,,
6947,Cell_6948,SS5posP2_H09,Synovial sarcoma tumor,Homo sapiens,SyS5,B.cell,CD45+,RNA,Full-length RNAseq (SMART-Seq2) reads,,
6948,Cell_6949,SS5posP2_H12,Synovial sarcoma tumor,Homo sapiens,SyS5,T.cell,CD45+,RNA,Full-length RNAseq (SMART-Seq2) reads,,
6949,Cell_6950,SS5posP2_D12,Synovial sarcoma tumor,Homo sapiens,SyS5,T.CD8,CD45+,RNA,Full-length RNAseq (SMART-Seq2) reads,,


In [50]:
cell_annots = cell_annots[["title", "characteristics: cell.type", "characteristics: sample"]]

In [51]:
cell_annots.columns=["cell", "type", "sample"]

In [52]:
cell_annots.type = cell_annots.type.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cell_annots.type = cell_annots.type.str.lower()


In [53]:
cell_annots['label'] = cell_annots['type'] + "_" + cell_annots['sample']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cell_annots['label'] = cell_annots['type'] + "_" + cell_annots['sample']


In [54]:
cell_annots['final_label'] = np.where(cell_annots['type']=="malignant", cell_annots['label'], cell_annots['type'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cell_annots['final_label'] = np.where(cell_annots['type']=="malignant", cell_annots['label'], cell_annots['type'])


In [55]:
cell_annots['final_label'].value_counts()

final_label
macrophage             943
t.cd8                  659
malignant_SyS12        550
malignant_SyS11.met    473
malignant_SyS11        458
malignant_SyS5         433
malignant_SyS7         402
malignant_SyS14        373
malignant_SyS1         349
malignant_SyS13        349
malignant_SyS12pt      334
malignant_SyS2         321
malignant_SyS16        296
t.cd4                  235
t.cell                 206
mastocyte              185
nk                     102
b.cell                  90
fibroblast              81
endothelial             79
malignant_SyS10         33
Name: count, dtype: int64

In [57]:
cell_annots[['cell','final_label']].to_csv("data/infercnv_input_files/sys/cell_annots.txt", sep="\t", header=False, index=False)