! description


Given a CSV or similar table, where rows are samples and columns are genes,
create an h5ad file where the count values are the .X matrix, the genes are in .var as "gene_hgnc" and the samples are in .obs as "sample_id".

In [None]:
# import
import anndata
import pandas as pd
import os
import short_utils
from pathlib import Path

In [None]:
# globals
# project root as reported by the short_utils helper; the cells below build their data paths from it
base_dir = short_utils.get_base_dir()
# display the value to confirm the location
base_dir

In [None]:
#print the current working dir of this process
print(os.getcwd())

In [None]:
def preview_file(bulk_genexp, num_rows = 3):
    '''
    print a quick preview of a table: its top-left corner, the start of its
    index and the start of its colnames
    :param bulk_genexp: the table to preview, as a df
    :param num_rows: side length of the top-left corner to print; the index
        and colnames previews show one element more than this
    :return: None, the preview is only printed
    '''
    corner = bulk_genexp.iloc[:num_rows, :num_rows]
    n_labels = num_rows + 1

    # top-left num_rows x num_rows corner of the table
    print(f"first {num_rows}x{num_rows}: \n", corner)
    # first num_rows + 1 row labels
    print("start of idx: \n", bulk_genexp.index[:n_labels])
    # first num_rows + 1 column labels
    print("start of colnames: \n", bulk_genexp.columns[:n_labels])

In [None]:
def get_obs(bulk_genexp,meta_cols):
    '''
    create the obs df containing the meta data about the samples
    :param bulk_genexp: the genexp data as a df
    :param meta_cols: a list of colnames with sample metadata
    :return: a new df holding only the meta data cols. it is an independent
        copy, so adding cols to it afterwards (e.g. with add_cell_type) does
        not touch bulk_genexp and does not trigger pandas' SettingWithCopyWarning
    '''
    # explicit copy: a plain column selection is flagged by pandas as a slice
    # of bulk_genexp, which makes later column assignments on it warn
    return bulk_genexp[meta_cols].copy()

In [None]:
def add_cell_type(metadata_df,tcga_cancer):
    '''
    tag every sample with its cancer type
    :param metadata_df: a df of sample meta data; it is modified in place
    :param tcga_cancer: the cancer type of the bulk genexp data, written to every row
    :return: the same metadata_df object, now with a 'cell_type' col holding the cancer type
    '''
    column_name = "cell_type"
    # a scalar assignment fills the whole col (and overwrites an existing one)
    metadata_df[column_name] = tcga_cancer
    return metadata_df

In [None]:
def add_metadata(file_path, metadata_df):
    '''
    placeholder for merging additional sample metadata into the metadata df.
    not implemented yet: nothing is read or merged
    :param file_path: path to table with additional metadata (currently unused)
    :param metadata_df: the metadata df (currently unused)
    :return: None for now; intended to return metadata_df with additional metadata
    '''
    return None

In [None]:
def get_var(genexp,gene_cols,hgnc_col):
    '''
    build the var df, i.e. the table describing the variables (usually genes)
    :param genexp: a df of genexp data
    :param gene_cols: a list of colnames with gene names
    :param hgnc_col: the colname with the hgnc gene names, must be one of gene_cols
    :return: a new df with the gene cols, where hgnc_col is renamed to
        'gene_hgnc' and placed first; the other cols keep their order from gene_cols
    '''
    # subset to the gene cols and give the hgnc col its standard name
    renamed = genexp[gene_cols].rename(columns={hgnc_col: "gene_hgnc"})
    # every gene col except the hgnc one, in the order they were given
    remaining = [name for name in gene_cols if name != hgnc_col]
    return renamed[["gene_hgnc", *remaining]]

# create an anndata object: .X is the counts, .var "gene_hgnc" is the genes, .obs "sample_id" is the sample_id and .obs "oncosig_label_ERBB2" is the oncosig_labels
# NOTE(review): counts and genes are not assigned in any cell visible in this notebook, and obs_df is only
# created further down, so this line presumably raises a NameError when the cells run in order — confirm whether it is a leftover
adata = anndata.AnnData(X=counts, var=pd.DataFrame(index=genes, data=genes, columns=["gene_hgnc"]), obs=obs_df)


In [None]:
def create_anndata(counts_df, var_df,obs_df):
    '''
    bundle the counts, the gene annotations and the sample meta data into one anndata object
    :param counts_df: the counts, passed on as X
    :param var_df: a df describing the genes, passed on as var
    :param obs_df: a df of sample meta data, passed on as obs
    :return: the anndata object built from the three inputs
    '''
    adata_obj = anndata.AnnData(obs=obs_df, var=var_df, X=counts_df)
    return adata_obj


In [None]:
# def bulk_2_h5ad(bulk_genexp,tcga_cancer ='brca',metadata_cols = [],samples_metadata_path = '',\
#         genes_meta_path = '', save_name = ''):
#     #create the obs df: sample_id and oncosig_label_ERBB2 where the sample ids are the idx
#     obs_df = get_sample_meta(bulk_genexp,metadata_cols)
#     #add a "cell_type" column to the obs_df with the value "brca"
#     obs_df = add_cell_type(obs_df,"brca")
#     #create the var and X df: gene_hgnc and the counts
#     #take subset, excluding the meta_data_cols
#     #drop meta_data_cols
#     bulk_genexp.drop(metadata_cols, axis=1, inplace=True)
# 
#     #save colnames to genes list
#     genes_var = get_genes(bulk_genexp)
#     #SAVE genexp counts to a counts df
#     counts = bulk_genexp.values
#     adata = create_anndata(counts, genes_var, obs_df)
# 
#     #save the anndata object
#     # raise err if no save name provided
#     if not save_name:
#         raise ValueError("no save name provided")
#     adata.write(base_dir + '/data/' +  save_name)



! run the script:
read the data, identify metadata cols, then create the anndata object

In [None]:
#set path to file
# location of the gene expression table, relative to base_dir
data_path = Path(base_dir / 'training_data/tcga/genexp_data/Xena_pan_can_hi_seq')
# display the resulting path
data_path

In [None]:
#read file: tab separated
bulk_genexp = pd.read_csv(data_path, sep='\t')

In [None]:
# preview the top-left 3x3 corner plus the start of the index and colnames
preview_file(bulk_genexp, 3)
#print colnames
print(bulk_genexp.columns)

In [None]:
#drop any chosen cols
# here only 'DesignElementAccession' is removed; no later visible cell reads it
bulk_genexp = bulk_genexp.drop(['DesignElementAccession'], axis=1)

create the obs df - containing sample names and any other metadata

In [None]:
#rename first column as 'cell_line_id'
#bulk_genexp = bulk_genexp.rename(columns={'Unnamed: 0': "cell_line_id"})

#extract the col names starting from the second col
# (the first col is skipped; its values are read as the gene names in a later cell)
samples = list(bulk_genexp.columns[1:])
#create the obs df with a single column 'sample_id' holding the sample names
obs_df = pd.DataFrame(samples, columns=['sample_id'])

create the var df - containing gene names and aliases

In [None]:
#alternative layout, where the gene names are the colnames starting from the second col:
#gene_names = list(bulk_genexp.columns[1:])
#in this table the gene names are the values of the first col, one gene per row
gene_names = list(bulk_genexp.iloc[:,0])

In [None]:
#process the genes into tuples of hgnc and ncbi_id

# Using list comprehension to process each element
#split_gene_names = [(gene.split(' (')[0], gene.split(' (')[1].replace(')', '')) for gene in gene_names]

# Create the var df with one row per gene. The col is named 'gene_hgnc'
# (not 'hgnc_gene') to match the name used by the notebook description and by get_var
var_df = pd.DataFrame(gene_names, columns=['gene_hgnc'])


create the counts df - containing the counts. ensure samples are rows and cols are features (genes)

In [None]:
#create counts df by removing the var cols and no colnames
# .values leaves a bare array without labels; .T transposes it so the remaining cols become the rows
# NOTE(review): 'Sample' is presumably the first col, the one read as gene_names — verify against the file header
counts_df = bulk_genexp.drop(['Sample'], axis=1).values.T

In [None]:
#create anndata object from the counts (X), the genes (var) and the samples (obs)
adata = create_anndata(counts_df, var_df, obs_df)

In [83]:
#build the path the anndata object will be saved to (the write itself happens in a later cell)
# output folder and file name are kept separate and then combined into the full target path
save_folder = Path(base_dir / 'training_data/tcga/genexp_data')
save_name = 'xena_pan_can_genexp_clin.h5ad'
save_path = Path(save_folder / save_name)
# display the resulting path
save_path

PosixPath('/home/shair/Desktop/STAMP_2023/jesse/trans_stamp_curr/training_data/tcga/genexp_data/xena_pan_can_genexp_clin.h5ad')

# incorporate metadata from other files

# add clinical data to adata

In [77]:
#load clinical data
clin_path = Path(base_dir / 'training_data/tcga/clinical/Subtype_Immune_Model_Based.txt')

# tab separated; the first col of the file becomes the index
clin_df = pd.read_csv(clin_path, sep='\t', index_col=0)

In [78]:
# number of rows and cols, then the first 5 rows
print(clin_df.shape)
clin_df.iloc[0:5,:]

(9126, 1)


Unnamed: 0_level_0,Subtype_Immune_Model_Based
sample,Unnamed: 1_level_1
TCGA-A5-A0GI-01,Wound Healing (Immune C1)
TCGA-S9-A7J2-01,Immunologically Quiet (Immune C5)
TCGA-EK-A2RE-01,IFN-gamma Dominant (Immune C2)
TCGA-D5-5538-01,IFN-gamma Dominant (Immune C2)
TCGA-F4-6854-01,Wound Healing (Immune C1)


examine a data frame

In [79]:
clin_df.index.name

'sample'

In [80]:
#rename the index (the sample ids) to 'sample_id', the name of the obs col it is later joined on
clin_df.index.rename('sample_id', inplace=True)

In [None]:
#get the dtype and non-null count (info) and summary statistics (describe) for each col in clin_df
clin_df.info()
clin_df.describe()

tcga clinical data

In [69]:
adata.obs.head()

Unnamed: 0,sample_id,tumor_type,Subtype_mRNA,Subtype_DNAmeth,Subtype_protein,Subtype_miRNA,Subtype_CNA,Subtype_Integrative,Subtype_other,Subtype_Selected
0,TCGA-S9-A7J2-01,LGG,LGr1,LGm2,,,,,Codel,GBM_LGG.Codel
1,TCGA-G3-A3CH-11,,,,,,,,,
2,TCGA-EK-A2RE-01,CESC,,,,,,,,
3,TCGA-44-6778-01,LUAD,,,,,,,,
4,TCGA-VM-A8C8-01,LGG,LGr3,LGm2,,,,,G-CIMP-high,GBM_LGG.G-CIMP-high


In [68]:
#clin_df = clin_df.iloc[:,:2]
# keep only the four clinical cols of interest, selected by name
clin_df = clin_df.loc[:,['cancer type abbreviation','OS','OS.time','gender']]
clin_df.head()

Unnamed: 0_level_0,cancer type abbreviation,OS,OS.time,gender
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-OR-A5J1-01,ACC,1.0,1355.0,MALE
TCGA-OR-A5J2-01,ACC,1.0,1677.0,FEMALE
TCGA-OR-A5J3-01,ACC,0.0,2091.0,FEMALE
TCGA-OR-A5J4-01,ACC,1.0,423.0,FEMALE
TCGA-OR-A5J5-01,ACC,1.0,365.0,MALE


In [54]:
#replace '.' with '-' in index col
# regex=False treats '.' as a literal dot on every pandas version, instead of relying on
# the version-dependent default of the regex argument (which also emits a warning on older versions)
clin_df.index = clin_df.index.str.replace('.','-', regex=False)

  clin_df.index = clin_df.index.str.replace('.','-')


In [81]:
#join on to adata.obs: the 'sample_id' col of obs is matched against the index of clin_df; the left join keeps every obs row
adata.obs = adata.obs.join(clin_df, on='sample_id', how='left')

In [82]:
adata.obs.head()

Unnamed: 0,sample_id,tumor_type,Subtype_mRNA,Subtype_DNAmeth,Subtype_protein,Subtype_miRNA,Subtype_CNA,Subtype_Integrative,Subtype_other,Subtype_Selected,OS,OS.time,gender,Subtype_Immune_Model_Based
0,TCGA-S9-A7J2-01,LGG,LGr1,LGm2,,,,,Codel,GBM_LGG.Codel,0.0,62.0,MALE,Immunologically Quiet (Immune C5)
1,TCGA-G3-A3CH-11,LIHC,,,,,,,,,0.0,780.0,MALE,
2,TCGA-EK-A2RE-01,CESC,,,,,,,,,0.0,57.0,FEMALE,IFN-gamma Dominant (Immune C2)
3,TCGA-44-6778-01,LUAD,,,,,,,,,0.0,1864.0,MALE,
4,TCGA-VM-A8C8-01,LGG,LGr3,LGm2,,,,,G-CIMP-high,GBM_LGG.G-CIMP-high,0.0,1397.0,FEMALE,Immunologically Quiet (Immune C5)


In [57]:
# drop the 'CCND1' col from obs; errors='ignore' makes this a no-op when the col is
# absent, so the cell can be re-run without raising a KeyError
adata.obs = adata.obs.drop(['CCND1'], axis=1, errors='ignore')

In [None]:
# list the clin_df colnames as a Series so each name is shown next to its position
cols = pd.Series(clin_df.columns)
cols

In [28]:

# positions (not names) of the clin_df cols to keep
keep_cols = [0,1,2,5,6,8,19,22,30,33,34,45,46,49,50,51,56,60]
#keep selected cols
clin_df = clin_df.iloc[:,keep_cols]

In [None]:
# use the 'Sample ID' col as the index
clin_df = clin_df.set_index(['Sample ID'])
# swap '-' for '.' in the sample ids, presumably to match the id format of my_adata.obs['Sample_ID'] — verify
clin_df.index = clin_df.index.str.replace('-','.')


In [None]:
#keep only samples in my_adata.obs['Sample_ID']
# NOTE(review): my_adata is not created in any visible cell; also, label-based .loc raises a
# KeyError if a requested id is missing from the clin_df index — confirm all ids are present
clin_df = clin_df.loc[my_adata.obs['Sample_ID'],:]

In [None]:
#add clinical data to adata by joining on sample id
# a copy of obs is merged, so my_adata itself is not modified here; the left merge keeps every obs row
new_obs = pd.merge(my_adata.obs.copy(), clin_df, left_on='Sample_ID', right_on='Sample ID', how='left')

In [None]:
#for each col, count na:
# print each col name followed by its number of missing values
for col in new_obs.columns:
    print(col, new_obs[col].isna().sum())

In [None]:

# drop the 'Neoplasm Histologic Grade' col if it is present; errors='ignore' replaces the
# explicit membership check and leaves the data unchanged when the col is absent
new_obs = new_obs.drop(['Neoplasm Histologic Grade'], axis=1, errors='ignore')

In [None]:
#add the new obs to cell em
# NOTE(review): cell_embbed is not created in any visible cell — confirm it is loaded before this runs
cell_embbed.obs = new_obs

In [None]:
# write cell_embbed to an h5ad file under base_dir
cell_embbed.write_h5ad(base_dir / 'scgpt/data/bulk_brca_erbb2/tcga_brca_erbb2_scgpt_emb_oncosig_sub_genes_clin.h5ad')

In [None]:
#add new obs to my adata with the sample id as the index
# the index is set on a new df before the assignment instead of in place on my_adata.obs afterwards:
# the change then goes through the .obs setter, and new_obs (also assigned to cell_embbed.obs in an
# earlier cell) is presumably no longer modified as a side effect — verify against the anndata docs
my_adata.obs = new_obs.set_index('Sample_ID')
#full_adata = sc.read_h5ad(base_dir / 'scgpt/data/bulk_brca_erbb2/tcga_brca_erbb2_oncosig_sub_genes.h5ad')
# save my_adata as 'tcga_brca_erbb2_oncosig_sub_genes_clin.h5ad'


In [84]:
#write adata to the save_path built in an earlier cell
adata.write(save_path)

tcga data

cell line data

possibly relevant drugs with their gdsc id:
1032	Afatinib
1032	Afatinib
119	Lapatinib
1377	Afatinib
1416	AZD8931
1549	Sapitinib
1558	Lapatinib
255	CP724714
273	CUDC-101


# gdsc ids of the drugs listed above, without the duplicated entry
drug_ids = [1032, 119, 1377, 1416, 1549, 1558, 255, 273]
#filter gdsc to only these drugs
# NOTE(review): gdsc_cell_drug is not loaded in any visible cell — confirm it exists before this runs
gdsc_cell_drug = gdsc_cell_drug[gdsc_cell_drug['DRUG_ID'].isin(drug_ids)]
# keep only the identifier cols and the published IC50, z-score and AUC cols
gdsc_selcted = gdsc_cell_drug.loc[:,['COSMIC_ID','DRUG_ID','DRUG_NAME','ARXSPAN_ID','IC50_PUBLISHED','Z_SCORE_PUBLISHED','AUC_PUBLISHED']]



#cell lines present both in obs_df (cell_line_id) and in gdsc selected (ARXSPAN_ID)
cell_lines_both = list(set(obs_df.cell_line_id) & set(gdsc_selcted['ARXSPAN_ID']))
#how many cell lines the two tables share
len(cell_lines_both)

In [None]:
#filter clin df
# keep only the 'ERBB2' col, wrapped back into a df (selecting one col label yields a Series)
clin_df_filtered = pd.DataFrame(clin_df.loc[:,'ERBB2'])
# check the shape, dtype and non-null count of the result
print(clin_df_filtered.shape)
clin_df_filtered.info()

In [None]:
#for each unique drug in selected, add to the obs a col with the ic50_pulished using the drug as colname


In [None]:
#set the index column to have the right name
clin_df_filtered.index.name = 'cell_line_id'
#rename col ERBB2 as ERBB2_CRISPR
clin_df_filtered = clin_df_filtered.rename(columns={'ERBB2': "ERBB2_CRISPR"})
# First, merge the DataFrames on 'cell_line_id': presumably a col of obs_df, and the index name of
# clin_df_filtered set above; the left merge keeps every obs_df row
merged_df = pd.merge(obs_df, clin_df_filtered, on='cell_line_id', how='left')
# preview the merged result
merged_df.head()

In [None]:
#if no processing required, join
# the 'cell_line_id' col of obs_df is matched against the index of clin_df_filtered
obs_df=obs_df.join(clin_df_filtered, on='cell_line_id', how='left')
obs_df.head()

In [None]:

#rename col ERBB2 as ERBB2_RNAi
# NOTE(review): an earlier cell renames the same col to ERBB2_CRISPR; presumably only one of the
# two renames is meant to run for a given input table — confirm
obs_df = obs_df.rename(columns={'ERBB2': "ERBB2_RNAi"})
obs_df.head()

In [None]:

#remove duplicates of cell_line_id and drug name, by keeping the first appearance
merged_df = merged_df.drop_duplicates(subset=['cell_line_id', 'DRUG_NAME'], keep='first')


In [None]:

# Pivot the merged DataFrame: one row per cell_line_id, one col per DRUG_NAME, AUC_PUBLISHED as the values
pivot_df = merged_df.pivot(index='cell_line_id', columns='DRUG_NAME', values='AUC_PUBLISHED')
# keep only the 'AFATINIB' col, wrapped back into a df
pivot_df=pd.DataFrame(pivot_df.loc[:,'AFATINIB'])
pivot_df.head()

In [None]:
#rename column 'AFATINIB' to 'AFATINIB_AUC'
pivot_df = pivot_df.rename(columns={'AFATINIB': "AFATINIB_AUC"})
# name the index 'cell_line_id', the key used when joining onto obs_df
pivot_df.index.name = 'cell_line_id'

In [None]:
#rename cols 1:end in obs: add '_ic50' to the end
# Assuming obs_df is your DataFrame
# Select columns from index 1 to the end (the first col keeps its name)
columns_to_rename = obs_df.columns[1:]

# Rename these columns by appending '_ic50'
new_column_names = [col + '_ic50' for col in columns_to_rename]

# Update the DataFrame with new column names: the unchanged first name followed by the suffixed ones
obs_df.columns = [obs_df.columns[0]] + new_column_names

In [None]:
# left-join the pivot_df cols onto obs_df: the 'cell_line_id' col of obs_df is matched against the pivot_df index
temp_obs = obs_df.join(pivot_df, on='cell_line_id', how='left')
# rebind obs_df to an independent copy of the joined result
obs_df = temp_obs.copy()

In [None]:
#assign obs_df to adata as its .obs table
adata.obs = obs_df

In [None]:
save_path

In [None]:
#save the anndata object


! create meta data df

In [None]:
# NOTE(review): bulk_2_h5ad only exists as commented-out code earlier in this notebook and
# meta_data_cols is not assigned in any visible cell, so this cell presumably fails with a
# NameError as written — confirm before running
# the string below is not executed; it only quotes the intended signature for reference
'''
def bulk_2_h5ad(bulk_genexp,tcga_cancer ='brca',metadata_cols = [],samples_metadata_path = '',\
        genes_meta_path = '', save_name = ''):
'''
# keyword arguments for the call, one entry per parameter of the signature quoted above
args = { 'bulk_genexp' : bulk_genexp,
        'tcga_cancer' : 'brca',
        'metadata_cols' : meta_data_cols,
        'samples_metadata_path' : '',
        'genes_meta_path' : '',
        'save_name' : save_name

}
bulk_2_h5ad(**args)

! create var and X df