In [1]:
import h5py
import pandas as pd
import numpy as np

In [2]:
path = "../data/yao/expression_matrix.hdf5"
f = h5py.File(path, "r")
f

<HDF5 file "expression_matrix.hdf5" (mode r)>

In [3]:
for key in f['data'].keys():
    print(f['data'][key])

<HDF5 dataset "counts": shape (31053, 1169320), type "<i4">
<HDF5 dataset "gene": shape (31053,), type "|S30">
<HDF5 dataset "samples": shape (1169320,), type "|S36">
<HDF5 dataset "shape": shape (2,), type "<i4">


In [4]:
f['data']['counts'][[0,1],0]

array([8, 0], dtype=int32)

In [31]:
df = pd.read_csv("../data/yao/metadata.csv")
df.shape

(1169213, 57)

In [36]:
count_samples = np.array(f['data']['samples'], dtype=str)
#df.set_index("sample_name", inplace=True)
obs_samples = np.array(df.index)
shared_samples = set(obs_samples).intersection(count_samples)
#shared_samples = list(shared_samples)
print(len(shared_samples))

1169213


Subset shared samples (most) and GABAergic cells

In [50]:
# 15m
count_ix = np.where([sample in shared_samples and df.loc[sample]['class_label'] == "GABAergic" for sample in count_samples])
obs_ix = np.where([sample in shared_samples and df.loc[sample]['class_label'] == "GABAergic"  for sample in obs_samples])[0] #all of them

Next, need same ordering. No, not yet. First get count df, then can do indexing

In [54]:
count_ix = count_ix[0]

In [55]:
len(count_ix), len(obs_ix)

(177614, 177614)

In [60]:
np.array(f['data']['samples'])[count_ix]

array([b'AAACCTGCACGTCTCT-L8TX_180221_01_F09',
       b'AAACGGGAGACTTTCG-L8TX_180221_01_F09',
       b'AAAGATGCATGCCTAA-L8TX_180221_01_F09', ...,
       b'TTTGTCAGTAAATGAC-L8TX_200611_02_A05',
       b'TTTGTCAGTCGCGGTT-L8TX_200611_02_A05',
       b'TTTGTCATCAGGCGAA-L8TX_200611_02_A05'], dtype='|S36')

In [63]:
samples = np.array(f['data']['samples'], dtype = str)[count_ix]
samples

array(['AAACCTGCACGTCTCT-L8TX_180221_01_F09',
       'AAACGGGAGACTTTCG-L8TX_180221_01_F09',
       'AAAGATGCATGCCTAA-L8TX_180221_01_F09', ...,
       'TTTGTCAGTAAATGAC-L8TX_200611_02_A05',
       'TTTGTCAGTCGCGGTT-L8TX_200611_02_A05',
       'TTTGTCATCAGGCGAA-L8TX_200611_02_A05'], dtype='<U36')

In [67]:
genes = np.array(f['data']['gene'],dtype=str)
genes

array(['Xkr4', 'Gm1992', 'Gm37381', ..., 'CAAA01118383.1', 'Vmn2r122',
       'CAAA01147332.1'], dtype='<U30')

In [68]:
counts = f['data']['counts'][:, count_ix[:10]]
counts

array([[30, 11,  3, ..., 14,  8,  5],
       [ 0,  0,  0, ...,  0,  0,  1],
       [ 0,  0,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], dtype=int32)

In [73]:
df.loc[samples].shape

(177614, 56)

In [None]:
samples =  f['data']['samples'][count_ix]
counts = f['data']['counts'][:, count_ix] # numpy _
counts = counts.T # genes x samples -> samples x genes
pd.DataFrame(counts, columns =  f['data']['gene'], index = samples)

In [48]:
gaba_ix = np.where([df.loc[sample]['class_label'] == "GABAergic" for sample in obs_samples])[0] #all of them
gaba_ix

array([ 39,  44,  46,  57,  66,  68,  69,  77,  79,  81,  83, 100, 116,
       128, 144, 149, 174, 188, 194, 199, 206, 227, 232, 237, 243, 260,
       264, 273, 274, 283, 285, 288, 291, 295, 297, 301, 309, 310, 317,
       321, 323, 326, 328, 330, 331, 332, 337, 344, 349, 350, 351, 353,
       363, 366, 381, 388, 389, 391, 394, 395, 406, 413, 429, 438, 442,
       452, 467, 468, 484, 485, 486, 487, 496, 497, 499, 504, 517, 518,
       530, 533, 535, 537, 549, 552, 561, 569, 587, 590, 594, 604, 613,
       616, 620, 628, 646, 661, 673, 688, 693, 700, 713, 720, 722, 725,
       742, 766, 770, 787, 791, 794, 796, 798, 799, 806, 827, 842, 849,
       852, 854, 858, 866, 868, 898, 905, 908, 912, 914, 917, 928, 931,
       937, 939, 940, 941, 943, 945, 953, 955, 958, 970, 976, 995])

In [12]:
counts = f['data']['counts'][:, count_ix]

(1169213,)

In [16]:
gaba_ix = df['class_label'] == "GABAergic"
sum(gaba_ix), np.mean(gaba_ix)

(177614, 0.15190901914364618)

In [41]:
count_samples = np.array(f['data']['samples'], dtype=str)
obs_samples = np.array(df['sample_name'])
shared_samples = set(obs_samples).intersection(count_samples)



In [45]:
obs_samples[0], count_samples[0]

('ACGCAGCAGACCGGAT-L8TX_180221_01_C11', 'AAACCTGAGAAACGCC-L8TX_180221_01_F09')

In [46]:
shared_samples = set(obs_samples).intersection(count_samples)
len(shared_samples)

1169213

In [None]:
[]