In [1]:
import h5py
import numpy as np
import pandas as pd
from pathlib import Path
from hdf5_ids import obtain_sample_ids

In [48]:
# The following code is to create hdf5 files to store RNA-seq data (or open previously created file). 
f = h5py.File('data.hdf5', 'a')
# f = h5py.File("data.hdf5", 'r+')

In [49]:
# Create groups (dictionaries) called data and meta (or open them). 
data = f.create_group("data") 
meta = f.create_group("meta")
# data = f['data']
# meta = f['meta']

In [50]:
df = pd.read_csv("data/larger_files/Glioblastoma/data.csv")
df_symbols = df[['symbol']]

In [2]:
# Create list of the diseases with >150 cases 
pathlist = Path("data/larger_files").glob('*')
list_cancers = []
for path in pathlist:
    cancer = str(path).split("/")[2]
    if cancer != ".DS_Store":
        list_cancers.append(cancer)

In [52]:
# count the number of cases for each cancer by counting number of columns minus the symbol and type of gene
total_cases = 0
for cancer in list_cancers:
    df = pd.read_csv(f'data/larger_files/{cancer}/data.csv')
    total_cases += len(df.columns[1:-1])
print("There are {} total cases".format(total_cases))

There are 7641 total cases


In [53]:
# create an array in which each element is a vector of a single case's RNA-seq counts
case_vectors = np.empty((total_cases, len(df_symbols)))
cancer_order = []
case_count = 0
for cancer in list_cancers:
    cancer_order.append(cancer)
    df = pd.read_csv(f'data/larger_files/{cancer}/data.csv')
    df_cols = df.columns[1:-1]
    np_cols = list(range(case_count, case_count+len(df_cols)))
    case_vectors[np_cols,:] = df[df_cols].T
    case_count += len(df_cols)
print("There are {} cases, {} genes and {} cancers".format(np.shape(case_vectors)[1], np.shape(case_vectors)[0], len(cancer_order)))

There are 60483 cases, 7641 genes and 14 cancers


In [54]:
# create data entry (dataset) that is the vector of RNA-seq vectors
data_expression = data.create_dataset("expression", data=case_vectors)

In [62]:
# meta entry that is the vector of gene symbols 
list_symbols = df_symbols.values.astype("S").tolist()
meta_genes = meta.create_dataset("genes", data=list_symbols)

In [4]:
# obtain ensembl ids by first going through files of random cancer (with few cases)
# create dataset that is ensembl ids in order 
sample_id_list, ensembl_id_list = obtain_sample_ids('Nodular melanoma', ensembl = True)
meta_ensembl_ids = meta.create_dataset("ensembl_id", data=ensembl_id_list)

  ensembl_id_list = obtain_ensembl_ids(cancer_type, case_uuid_list, file_uuid_list)


In [6]:
# obtain sample ids by going through the list of cancers
# create dataset that is sample ids in order 
sample_ids = []
for cancer_type in list_cancers:
    sample_id_list = obtain_sample_ids(cancer_type)
    sample_ids += sample_id_list
sample_ids = (pd.DataFrame(sample_ids)).values.astype("S").tolist()
meta_sample_ids = meta.create_dataset("sample_id", data=sample_ids)