! description


Given a CSV (or similar) table where rows are samples and columns are genes,
create an h5ad file where the count values are the .X matrix, the genes are in .var as "gene_hgnc", and the samples are in .obs as "sample_id".

In [1]:
# import
import anndata
import pandas as pd
import os
import short_utils
from pathlib import Path

In [2]:
# globals
# project root as reported by the local short_utils helper; the data paths further down are built from it
base_dir = short_utils.get_base_dir()
base_dir

PosixPath('/home/shair/Desktop/STAMP_2023/jesse/trans_stamp_curr')

In [3]:
# show the directory this notebook is running from
print(Path.cwd())

/home/shair/Desktop/STAMP_2023/jesse/trans_stamp_curr/scgpt/j_scgpt_utils


In [4]:
def preview_file(bulk_genexp, num_rows = 3):
    """Print a quick look at a table: its top-left corner, then the start of its index and columns.

    :param bulk_genexp: the DataFrame to preview
    :param num_rows: side length of the top-left corner that is printed; the
        index and column previews show ``num_rows + 1`` labels
    :return: None, everything goes to stdout
    """
    label_count = num_rows + 1

    # top-left num_rows x num_rows corner of the table
    corner = bulk_genexp.iloc[:num_rows, :num_rows]
    print(f"first {num_rows}x{num_rows}: \n", corner)

    # leading row labels
    index_start = bulk_genexp.index[:label_count]
    print("start of idx: \n", index_start)

    # leading column labels
    column_start = bulk_genexp.columns[:label_count]
    print("start of colnames: \n", column_start)

In [5]:
def get_obs(bulk_genexp,meta_cols):
    '''
    create the obs df containing the meta data about the samples
    :param bulk_genexp: the genexp data as a df
    :param meta_cols: a list of colnames with sample metadata; a single
        colname given as a string is treated as a one-element list
    :return: a new df of the meta data, independent of bulk_genexp
    :raises KeyError: if a requested column is not in bulk_genexp
    '''
    # a bare string would select a Series; wrap it so a df is always returned
    if isinstance(meta_cols, str):
        meta_cols = [meta_cols]
    # copy so that adding columns to the result later (e.g. a cell_type column)
    # neither raises SettingWithCopyWarning nor depends on the source frame
    obs_df = bulk_genexp[meta_cols].copy()
    return obs_df

In [6]:
def add_cell_type(metadata_df,tcga_cancer):
    '''
    Tag every sample with the same cancer type.

    The column is written onto metadata_df itself, and that same object is returned.

    :param metadata_df: a df of sample meta data
    :param tcga_cancer: the cancer type of the bulk genexp data
    :return: metadata_df, now carrying a "cell_type" col filled with tcga_cancer
    '''
    cell_type_col = "cell_type"
    metadata_df[cell_type_col] = tcga_cancer
    return metadata_df

In [7]:
def add_metadata(file_path, metadata_df):
    '''
    Placeholder for merging extra sample metadata from a file into metadata_df.

    Not implemented yet: the body is only `pass`, so a call currently returns
    None and leaves metadata_df untouched.

    :param file_path: path to table with additional metadata
    :param metadata_df: the metadata df
    :return: intended to be metadata_df with additional metadata (TODO: implement)
    '''
    pass

In [8]:
def get_var(genexp,gene_cols,hgnc_col):
    '''
    Build the var df, i.e. the table describing the variables (usually genes).

    :param genexp: a df of genexp data
    :param gene_cols: a list of colnames holding gene identifiers
    :param hgnc_col: the colname (one of gene_cols) holding the hgnc gene names
    :return: a df with 'gene_hgnc' as first col, followed by the remaining gene_cols
    '''
    # keep only the gene columns and give the hgnc column its standard name
    renamed = genexp[gene_cols].rename(columns={hgnc_col: "gene_hgnc"})

    # every other gene column follows 'gene_hgnc', in its original order
    trailing_cols = [name for name in gene_cols if name != hgnc_col]
    return renamed[["gene_hgnc", *trailing_cols]]

# NOTE(review): counts, genes and obs_df are not assigned in any earlier visible cell, so this
# statement presumably raises NameError on a fresh kernel; adata is built again further down
# through create_anndata — confirm whether this line is still needed
# create an anndata object: .X is the counts, .var "gene_hgnc" is the genes, .obs "sample_id" is the sample_id and .obs "oncosig_label_ERBB2" is the oncosig_labels
adata = anndata.AnnData(X=counts, var=pd.DataFrame(index=genes, data=genes, columns=["gene_hgnc"]), obs=obs_df)


In [9]:
def create_anndata(counts_df, var_df,obs_df):
    '''
    Assemble an AnnData object from a count matrix and its two annotation tables.

    :param counts_df: the counts, passed to AnnData as X
    :param var_df: a df describing the genes (becomes .var)
    :param obs_df: a df of sample meta data (becomes .obs)
    :return: an anndata object
    '''
    def _string_indexed(annotation):
        # AnnData expects string labels on .obs/.var and warns when handed an
        # integer index; convert up front, on a copy so the caller's frame is
        # left as it was. Anything that is not an integer-indexed DataFrame
        # (None, a mapping, already-labelled frames) is passed through untouched.
        if isinstance(annotation, pd.DataFrame) and pd.api.types.is_integer_dtype(annotation.index):
            annotation = annotation.copy()
            annotation.index = annotation.index.astype(str)
        return annotation

    return anndata.AnnData(
        X=counts_df,
        var=_string_indexed(var_df),
        obs=_string_indexed(obs_df),
    )


In [10]:
# def bulk_2_h5ad(bulk_genexp,tcga_cancer ='brca',metadata_cols = [],samples_metadata_path = '',\
#         genes_meta_path = '', save_name = ''):
#     #create the obs df: sample_id and oncosig_label_ERBB2 where the smaple id's are the idx
#     obs_df = get_sample_meta(bulk_genexp,metadata_cols)
#     #add a "cell_type" column to the obs_df with the value "brca"
#     obs_df = add_cell_type(obs_df,"brca")
#     #create the var and X df: gene_hgnc and the counts
#     #take subset, excluding the meta_data_cols
#     #drop meta_data_cols
#     bulk_genexp.drop(metadata_cols, axis=1, inplace=True)
# 
#     #save colnames to genes list
#     genes_var = get_genes(bulk_genexp)
#     #SAVE genexp counts to a counts df
#     counts = bulk_genexp.values
#     adata = create_anndata(counts, genes_var, obs_df)
# 
#     #save the anndata object
#     # raise err if no save name provided
#     if not save_name:
#         raise ValueError("no save name provided")
#     adata.write(base_dir + '/data/' +  save_name)



! run the script:
read the data, identify the metadata cols, then create the anndata object

In [11]:
# expression table to convert, located relative to the project root
data_path = base_dir / 'training_data/cell_lines/broad_CCLE/genexp/exp_protein_genes_tpml_logp1.csv'
data_path

PosixPath('/home/shair/Desktop/STAMP_2023/jesse/trans_stamp_curr/training_data/cell_lines/broad_CCLE/genexp/exp_protein_genes_tpml_logp1.csv')

In [12]:
#read file: comma separated
bulk_genexp = pd.read_csv(data_path, sep=',')

In [14]:
# print the top-left 3x3 corner plus the start of the index and of the column names
preview_file(bulk_genexp, 3)
#print colnames
print(bulk_genexp.columns)

first 3x3: 
   cell_line_id  TSPAN6 (7105)  TNMD (64102)
0   ACH-001113       4.331992      0.000000
1   ACH-001289       4.567424      0.584963
2   ACH-001339       3.150560      0.000000
start of idx: 
 RangeIndex(start=0, stop=4, step=1)
start of colnames: 
 Index(['cell_line_id', 'TSPAN6 (7105)', 'TNMD (64102)', 'DPM1 (8813)'], dtype='object')
Index(['cell_line_id', 'TSPAN6 (7105)', 'TNMD (64102)', 'DPM1 (8813)',
       'SCYL3 (57147)', 'C1orf112 (55732)', 'FGR (2268)', 'CFH (3075)',
       'FUCA2 (2519)', 'GCLC (2729)',
       ...
       'H3C2 (8358)', 'H3C3 (8352)', 'AC098582.1 (8916)',
       'DUS4L-BCAP29 (115253422)', 'C8orf44-SGK3 (100533105)',
       'ELOA3B (728929)', 'NPBWR1 (2831)', 'ELOA3D (100506888)',
       'ELOA3 (162699)', 'CDR1 (1038)'],
      dtype='object', length=19194)


In [39]:
#drop any chosen cols that are present; errors='ignore' keeps this cell re-runnable
#and avoids a KeyError for tables that do not carry these columns
bulk_genexp = bulk_genexp.drop(columns=['DesignElementAccession'], errors='ignore')

create the obs df - containing sample names and any other metadata

In [16]:
# make sure the sample-id column is called 'cell_line_id'; the rename is a no-op when there is no 'Unnamed: 0' column
bulk_genexp = bulk_genexp.rename(columns={'Unnamed: 0': "cell_line_id"})

# the obs table starts out as a one-column df of cell line ids
obs_df = bulk_genexp['cell_line_id'].to_frame()

create the var df - containing gene names and aliases

In [19]:
# every column after the first one is a gene; collect those column labels as a plain list
gene_names = bulk_genexp.columns[1:].tolist()

In [23]:
#turn each "SYMBOL (id)" column label into a (hgnc symbol, ncbi id) tuple

split_gene_names = []
for gene in gene_names:
    # pieces[0] is the symbol, pieces[1] the id still carrying its closing bracket
    pieces = gene.split(' (')
    split_gene_names.append((pieces[0], pieces[1].replace(')', '')))

# one row per gene, in the same order as the expression columns
var_df = pd.DataFrame(split_gene_names, columns=['hgnc_gene', 'ncbi_id'])


create the counts df - containing the counts. ensure samples are rows and cols are features (genes)

In [22]:
# counts as a bare array: everything except the sample-id column, without row or column labels
counts_df = bulk_genexp.drop(columns=['cell_line_id']).to_numpy()

In [24]:
#create anndata object: counts_df becomes .X, var_df becomes .var and obs_df becomes .obs
adata = create_anndata(counts_df, var_df, obs_df)



In [126]:
# where the h5ad file will be written (nothing is saved in this cell)
save_name = 'exp_protein_genes_tpml_logp1_erbb2_drug_rnai_crisp.h5ad'
save_folder = base_dir / 'training_data/cell_lines/broad_CCLE/genexp/'
save_path = save_folder / save_name
save_path

PosixPath('/home/shair/Desktop/STAMP_2023/jesse/trans_stamp_curr/training_data/cell_lines/broad_CCLE/genexp/exp_protein_genes_tpml_logp1_erbb2_drug_rnai_crisp.h5ad')

#incorporate metadata from other files

# add clinical data to adata

In [119]:
# load the CRISPR score table; its first column becomes the row index
clin_path = base_dir / 'training_data/cell_lines/broad_CCLE/pertubation/CRISPR + RNA/CRISPR_(DepMap_Public_23Q4+Score,_Chronos).csv'

clin_df = pd.read_csv(clin_path, sep=',', index_col=0)

In [120]:
# table dimensions, then its first five rows
print(clin_df.shape)
clin_df.head(5)

(1100, 18424)


Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
ACH-000001,-0.122637,0.025881,0.034217,-0.128082,-0.031285,0.338046,-0.006439,-0.093642,0.189186,0.042783,...,-0.131727,-0.039829,0.179405,0.283552,0.204513,-0.289724,-0.062972,0.07418,0.111244,-0.467908
ACH-000004,0.019756,-0.08364,-0.060118,-0.027417,-0.036116,-0.001056,0.312876,-0.086897,0.204434,0.188715,...,-0.170329,-0.454263,0.194583,0.098989,0.126948,0.032983,-0.410392,0.113156,0.234388,-0.088306
ACH-000005,-0.107208,-0.023211,0.200204,0.116039,-0.172227,-0.071294,0.20327,-0.127806,-0.090981,0.073349,...,-0.301695,-0.454969,-0.061959,-0.036427,0.016602,-0.201273,-0.178877,-0.055349,-0.002161,-0.186842
ACH-000007,-0.031027,-0.13785,0.067704,0.107988,0.007992,0.124945,0.049548,-0.220824,-0.165669,0.230047,...,-0.30339,-0.507272,-0.0254,0.236659,0.07201,-0.100344,-0.46216,-0.001555,-0.325964,-0.48666
ACH-000009,0.008888,-0.146566,0.084471,0.089419,0.065109,0.027841,0.087943,-0.161369,0.041121,0.136621,...,-0.255466,-0.288739,-0.037132,0.261444,-0.062391,-0.112703,-0.598698,0.095877,-0.026742,-0.320759


examine a data frame

In [106]:
#summarise clin_df: info() prints the index range, column count, dtypes and memory use;
#describe() gives per-column count/mean/std/min/quartiles/max (count exposes missing values)
clin_df.info()
clin_df.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 710 entries, ACH-001270 to ACH-002331
Columns: 16810 entries, A1BG to SCO2
dtypes: float64(16810)
memory usage: 91.1+ MB


Unnamed: 0,A1BG,NAT2,ADA,CDH2,AKT3,MED6,NR2E3,NAALAD2,DUXB,PDZK1P1,...,RCE1,HNRNPDL,DMTF1,PPP4R1,CDH1,SLC12A6,KCNE2,DGCR2,CASP8AP2,SCO2
count,545.0,545.0,545.0,708.0,708.0,708.0,708.0,545.0,500.0,545.0,...,708.0,706.0,708.0,708.0,669.0,545.0,708.0,545.0,708.0,545.0
mean,-0.037563,-0.064273,0.004964,0.01557,0.03753,-0.330078,-0.122684,0.097244,0.151758,0.034675,...,-0.079499,-0.096593,-0.017149,-0.009336,-0.051,0.102559,-0.01933,0.050729,-0.54223,-0.002759
std,0.184556,0.217855,0.192259,0.163059,0.157416,0.237158,0.201381,0.128367,0.124254,0.138083,...,0.145781,0.179288,0.182053,0.142044,0.154082,0.116829,0.151618,0.181749,0.381612,0.19294
min,-0.69131,-1.268613,-0.79478,-0.830639,-0.791714,-1.051937,-1.166444,-0.380075,-0.234508,-0.431979,...,-0.606379,-1.037848,-0.593413,-0.699956,-0.718029,-0.228656,-0.744133,-0.684951,-2.873168,-0.634959
25%,-0.142142,-0.162662,-0.128375,-0.078536,-0.039442,-0.46961,-0.238956,0.025468,0.07682,-0.043908,...,-0.170429,-0.191932,-0.129356,-0.090074,-0.141393,0.035554,-0.104217,-0.057165,-0.754426,-0.120638
50%,-0.042158,-0.056757,0.021688,0.017762,0.050559,-0.325631,-0.125637,0.096659,0.155945,0.03197,...,-0.077126,-0.088621,-0.018945,-0.007832,-0.048601,0.099928,-0.012896,0.050646,-0.454698,0.001383
75%,0.069985,0.049407,0.128444,0.1126,0.124822,-0.175591,-0.006659,0.173484,0.231217,0.111672,...,0.010693,0.009735,0.096201,0.079343,0.036093,0.169073,0.068114,0.161217,-0.264533,0.122669
max,0.907063,0.744008,0.611368,0.705452,0.517325,0.410725,0.61633,0.497967,0.693237,0.519837,...,0.534168,0.723699,0.609256,0.479402,0.661053,0.487694,0.599642,0.680716,0.304642,0.724764


possibly relevant drugs with their gdsc id:
1032	Afatinib
1032	Afatinib
119	Lapatinib
1377	Afatinib
1416	AZD8931
1549	Sapitinib
1558	Lapatinib
255	CP724714
273	CUDC-101


drug_ids = [1032, 119, 1377, 1416, 1549, 1558, 255, 273]
#keep only the gdsc rows that belong to these drugs
gdsc_cell_drug = gdsc_cell_drug.loc[gdsc_cell_drug['DRUG_ID'].isin(drug_ids)]
gdsc_selcted = gdsc_cell_drug[['COSMIC_ID','DRUG_ID','DRUG_NAME','ARXSPAN_ID','IC50_PUBLISHED','Z_SCORE_PUBLISHED','AUC_PUBLISHED']]



#cell lines that appear both in obs_df and in the selected gdsc rows
cell_lines_both = list(set(obs_df.cell_line_id) & set(gdsc_selcted['ARXSPAN_ID']))
len(cell_lines_both)

In [121]:
#keep only the ERBB2 column of clin_df, as a one-column df
clin_df_filtered = clin_df.loc[:, 'ERBB2'].to_frame()
print(clin_df_filtered.shape)
clin_df_filtered.info()

(1100, 1)
<class 'pandas.core.frame.DataFrame'>
Index: 1100 entries, ACH-000001 to ACH-002926
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ERBB2   1100 non-null   float64
dtypes: float64(1)
memory usage: 17.2+ KB


In [None]:
#for each unique drug in selected, add to the obs a col with the ic50_published using the drug as colname


In [122]:
#label the score column by its assay: ERBB2 -> ERBB2_CRISPR
clin_df_filtered = clin_df_filtered.rename(columns={'ERBB2': "ERBB2_CRISPR"})
#name the index so it can be matched against the cell_line_id column of obs_df
clin_df_filtered.index.name = 'cell_line_id'
# left merge on cell_line_id: every row of obs_df is kept
merged_df = obs_df.merge(clin_df_filtered, on='cell_line_id', how='left')
merged_df.head()

Unnamed: 0,cell_line_id,AFATINIB_ic50,AZD8931_ic50,CP-724714_ic50,CUDC-101_ic50,LAPATINIB_ic50,AFATINIB_Z,AFATINIB_AUC,ERBB2_RNAi,ERBB2_CRISPR
0,ACH-001113,,,,,,,,-0.323286,
1,ACH-001289,,,,,,,,,-0.021037
2,ACH-001339,,,,,,,,,
3,ACH-001538,,,,,,,,,-0.264159
4,ACH-000242,8.304031,35.649353,40.658585,0.225703,46.706017,0.889121,0.985634,,


In [123]:
#if no processing required, join
#columns left over from an earlier run of this cell are dropped first, so re-executing
#it replaces them instead of failing on overlapping column names
obs_df = (
    obs_df
    .drop(columns=clin_df_filtered.columns, errors='ignore')
    .join(clin_df_filtered, on='cell_line_id', how='left')
)
obs_df.head()

Unnamed: 0,cell_line_id,AFATINIB_ic50,AZD8931_ic50,CP-724714_ic50,CUDC-101_ic50,LAPATINIB_ic50,AFATINIB_Z,AFATINIB_AUC,ERBB2_RNAi,ERBB2_CRISPR
0,ACH-001113,,,,,,,,-0.323286,
1,ACH-001289,,,,,,,,,-0.021037
2,ACH-001339,,,,,,,,,
3,ACH-001538,,,,,,,,,-0.264159
4,ACH-000242,8.304031,35.649353,40.658585,0.225703,46.706017,0.889121,0.985634,,


In [118]:

#rename col ERBB2 as ERBB2_RNAi
# NOTE(review): rename leaves the frame unchanged when there is no 'ERBB2' column, so this only
# has an effect if it runs right after an 'ERBB2' column was joined onto obs_df — confirm the intended cell order
obs_df = obs_df.rename(columns={'ERBB2': "ERBB2_RNAi"})
obs_df.head()

Unnamed: 0,cell_line_id,AFATINIB_ic50,AZD8931_ic50,CP-724714_ic50,CUDC-101_ic50,LAPATINIB_ic50,AFATINIB_Z,AFATINIB_AUC,ERBB2_RNAi
0,ACH-001113,,,,,,,,-0.323286
1,ACH-001289,,,,,,,,
2,ACH-001339,,,,,,,,
3,ACH-001538,,,,,,,,
4,ACH-000242,8.304031,35.649353,40.658585,0.225703,46.706017,0.889121,0.985634,


In [82]:

#remove duplicates of cell_line_id and drug name, by keeping the first appearance
# NOTE(review): the merged_df built in the visible cells comes from obs_df and the ERBB2 scores and
# has no 'DRUG_NAME' column; this cell presumably belongs to an earlier drug-response merge — confirm
merged_df = merged_df.drop_duplicates(subset=['cell_line_id', 'DRUG_NAME'], keep='first')


KeyError: "['AFATINIB'] not found in axis"

In [101]:

# Reshape to one row per cell line and one column per drug, filled with AUC_PUBLISHED
pivot_df = merged_df.pivot(index='cell_line_id', columns='DRUG_NAME', values='AUC_PUBLISHED')
# keep just the AFATINIB column, as a one-column df
pivot_df = pivot_df.loc[:, 'AFATINIB'].to_frame()
pivot_df.head()

Unnamed: 0_level_0,AFATINIB
cell_line_id,Unnamed: 1_level_1
ACH-000001,0.964208
ACH-000002,0.963948
ACH-000004,0.965114
ACH-000006,0.972685
ACH-000007,0.918839


In [102]:
#rename column 'AFATINIB' to 'AFATINIB_AUC' and name the index so it can be joined on cell_line_id
pivot_df = pivot_df.rename(columns={'AFATINIB': "AFATINIB_AUC"})
pivot_df.index.name = 'cell_line_id'

In [98]:
#rename cols 1:end in obs: add '_ic50' to the end
# NOTE(review): not idempotent — every run appends another '_ic50' to all columns after the first,
# including any non-ic50 columns already present in obs_df; run it once, at the intended point only
# Select columns from index 1 to the end
columns_to_rename = obs_df.columns[1:]

# Rename these columns by appending '_ic50'
new_column_names = [col + '_ic50' for col in columns_to_rename]

# Update the DataFrame with new column names (first column keeps its name)
obs_df.columns = [obs_df.columns[0]] + new_column_names

In [103]:
# left join: attach the pivot_df columns to obs_df by matching obs_df's cell_line_id column against pivot_df's index
temp_obs = obs_df.join(pivot_df, on='cell_line_id', how='left')
# replace obs_df with an independent copy of the joined table
obs_df = temp_obs.copy()

In [124]:
#assign obs_df to adata
# NOTE(review): obs_df is built from a freshly read table and appears to still carry integer row labels;
# a warning shown further down for a similar .obs assignment says AnnData expects string labels — confirm
adata.obs = obs_df

In [125]:
save_path

PosixPath('/home/shair/Desktop/STAMP_2023/jesse/trans_stamp_curr/training_data/cell_lines/broad_CCLE/genexp/exp_protein_genes_tpml_logp1.h5ad')

In [None]:
#save the anndata object


old session

In [47]:
# column names as a Series
# NOTE(review): cols is not used again in the visible notebook
cols = pd.Series(clin_df.columns)
# positions of the clinical columns to keep
keep_cols = [0,1,2,3,5,6,19,21,28,29,30,33,34,36,38,40,41,46,49,50,51,56]
#keep selected cols
clin_df = clin_df.iloc[:,keep_cols]

In [48]:
# index the clinical table by its 'Sample ID' column
clin_df = clin_df.set_index(['Sample ID'])
# swap '-' for '.' in the ids; presumably to match the id format used in my_adata.obs['Sample_ID'] — confirm
clin_df.index = clin_df.index.str.replace('-','.')


In [49]:
#keep only samples in my_adata.obs['Sample_ID']; rows come back in that order
# NOTE(review): my_adata is not created in any visible cell
clin_df = clin_df.loc[my_adata.obs['Sample_ID'],:]

In [55]:
#add clinical data to adata by joining on sample id (left merge: every row of my_adata.obs is kept)
# 'Sample ID' was made the index of clin_df above, so right_on refers to that index level here
new_obs = pd.merge(my_adata.obs.copy(), clin_df, left_on='Sample_ID', right_on='Sample ID', how='left')

In [56]:
#print the number of missing values per column, one "column count" line each
for col, n_missing in new_obs.isna().sum().items():
    print(col, n_missing)

Sample_ID 0
oncosig_label_ERBB2 0
cell_type 0
Patient ID 0
Diagnosis Age 0
Neoplasm Disease Stage American Joint Committee on Cancer Code 5
Aneuploidy Score 24
Buffa Hypoxia Score 0
Fraction Genome Altered 2
Neoplasm Histologic Grade 949
MSI MANTIS Score 34
MSIsensor Score 0
Mutation Count 0
Overall Survival (Months) 0
Overall Survival Status 0
American Joint Committee on Cancer Metastasis Stage Code 0
American Joint Committee on Cancer Tumor Stage Code 0
Progress Free Survival (Months) 1
Progression Free Status 0
Ragnum Hypoxia Score 0
Sex 0
Somatic Status 0
Subtype 0
TMB (nonsynonymous) 0


In [58]:

# drop the 'Neoplasm Histologic Grade' column when it is present; otherwise keep the columns as they are
new_obs = new_obs.drop(columns=['Neoplasm Histologic Grade'], errors='ignore')

In [59]:
#add the new obs to cell em
# new_obs still has integer row labels here, which is what the AnnData warning printed below this cell reports
# NOTE(review): cell_embbed is not created in any visible cell
cell_embbed.obs = new_obs

AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


In [67]:
# write cell_embbed, including the obs assigned above, to an h5ad file under base_dir
cell_embbed.write_h5ad(base_dir / 'scgpt/data/bulk_brca_erbb2/tcga_brca_erbb2_scgpt_emb_oncosig_sub_genes_clin.h5ad')

In [65]:
#add new obs to my adata with sample id as the index
# set_index returns a new frame, so my_adata receives an obs table that is already labelled
# by Sample_ID (no integer-index warning) and new_obs itself is not modified in place —
# new_obs was also assigned to cell_embbed.obs above and should not change underneath it
my_adata.obs = new_obs.set_index('Sample_ID')
#full_adata = sc.read_h5ad(base_dir / 'scgpt/data/bulk_brca_erbb2/tcga_brca_erbb2_oncosig_sub_genes.h5ad')
# save my_adata as 'tcga_brca_erbb2_oncosig_sub_genes_clin.h5ad'


In [127]:
#write adata to save_path
adata.write(save_path)

! create meta data df

In [None]:
# NOTE(review): bulk_2_h5ad only exists as commented-out code earlier in this notebook and
# meta_data_cols is not assigned in any visible cell, so this cell presumably raises
# NameError on a fresh kernel — confirm before relying on it
# the string below just records the intended signature of bulk_2_h5ad
'''
def bulk_2_h5ad(bulk_genexp,tcga_cancer ='brca',metadata_cols = [],samples_metadata_path = '',\
        genes_meta_path = '', save_name = ''):
'''
# keyword arguments for the call, keyed by parameter name
args = { 'bulk_genexp' : bulk_genexp,
        'tcga_cancer' : 'brca',
        'metadata_cols' : meta_data_cols,
        'samples_metadata_path' : '',
        'genes_meta_path' : '',
        'save_name' : save_name

}
# unpack the dict as keyword arguments
bulk_2_h5ad(**args)

! create var and X df