# Multimodal Single-Cell Integration: Creating a Sparse Matrix Dataset

In [1]:
!conda install pytables -y

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [2]:
!dir /s "Dataset"

 Volume in drive F is New Volume
 Volume Serial Number is 92F6-A0F6

 Directory of F:\Minor Project\Dataset

19-12-2022  03:01    <DIR>          .
19-12-2022  03:01    <DIR>          ..
07-09-2022  19:37     2,418,406,934 evaluation_ids.csv
07-09-2022  19:40         9,770,334 metadata.csv
07-09-2022  19:40           234,920 metadata_cite_day_2_donor_27678.csv
07-09-2022  19:40       843,563,244 sample_submission.csv
19-12-2022  18:16            10,048 test_cite_inputs.h5
07-09-2022  19:45       307,964,530 test_cite_inputs_day_2_donor_27678.h5
07-09-2022  19:46     6,473,530,657 test_multi_inputs.h5
07-09-2022  20:03     2,498,128,492 train_cite_inputs.h5
07-09-2022  20:10        38,539,123 train_cite_targets.h5
07-09-2022  20:10    11,334,840,656 train_multi_inputs.h5
07-09-2022  20:38     3,215,261,538 train_multi_targets.h5
              11 File(s) 27,140,250,476 bytes

     Total Files Listed:
              11 File(s) 27,140,250,476 bytes
               2 Dir(s)  115,220,365,312 by

In [3]:
import pandas as pd
import numpy as np
import scipy.sparse



In [4]:
import numpy as np
import h5py

arr = np.random.randn(1000)
 
# creating a file
with h5py.File('Dataset/test_cite_inputs.h5', 'w') as f:
    dset = f.create_dataset("default", data = arr)
    
print(dset)

<Closed HDF5 dataset>


# Conversion Functions

In [5]:
def convert_to_parquet(filename, out_filename):
    df = pd.read_csv(filename)
    df.to_parquet(out_filename + ".parquet")

In [6]:
import pandas as pd
from scipy import sparse
import numpy as np
import pyarrow
def convert_h5_to_sparse_csr(filename, out_filename, chunksize=2500):
    start = 0
    total_rows = 0

    sparse_chunks_data_list = []
    chunks_index_list = []
    columns_name = None
    while True:
        df_chunk = pd.read_hdf(filename, start=start, stop=start+chunksize)
        if len(df_chunk) == 0:
            break
        chunk_data_as_sparse = scipy.sparse.csr_matrix(df_chunk.to_numpy())
        sparse_chunks_data_list.append(chunk_data_as_sparse)
        chunks_index_list.append(df_chunk.index.to_numpy())

        if columns_name is None:
            columns_name = df_chunk.columns.to_numpy()
        else:
            assert np.all(columns_name == df_chunk.columns.to_numpy())

        total_rows += len(df_chunk)
        print(total_rows)
        if len(df_chunk) < chunksize: 
            del df_chunk
            break
        del df_chunk
        start += chunksize
        
    all_data_sparse = scipy.sparse.vstack(sparse_chunks_data_list)
    del sparse_chunks_data_list
    
    all_indices = np.hstack(chunks_index_list)
    
    scipy.sparse.save_npz(out_filename+"_values.sparse", all_data_sparse)
    np.savez(out_filename+"_idxcol.npz", index=all_indices, columns =columns_name)    

In [8]:
import os

directory = 'Dataset'

for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    if os.path.isfile(f):
        split_tup = os.path.splitext(f)
        if(split_tup[1]=='.h5'):
            split_tup[0].replace('Dataset/', ' ')
            print(split_tup[0])
            convert_h5_to_sparse_csr(f,f"New/{split_tup[0]}")
            
        if(split_tup[1]=='.csv'):
            convert_to_parquet(f, f"New/{split_tup[0]}")

def run_sparse():
    print("train_multi_targets.h5")
    convert_h5_to_sparse_csr("Dataset/train_multi_targets.h5", "New/Dataset/train_multi_targets")
    print("train_multi_inputs.h5")
    convert_h5_to_sparse_csr("Dataset/train_multi_inputs.h5", "New/Dataset/train_multi_inputs")
    print("train_cite_targets.h5")
    convert_h5_to_sparse_csr("Dataset/train_cite_targets.h5", "New/Dataset/train_cite_targets")
    print("train_cite_inputs.h5")
    convert_h5_to_sparse_csr("Dataset/train_cite_inputs.h5", "New/Dataset/train_cite_inputs")
    print("test_multi_inputs.h5")
    convert_h5_to_sparse_csr("Dataset/test_multi_inputs.h5", "New/Dataset/test_multi_inputs")

train_multi_targets.h5
2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000
42500
45000
47500
50000
52500
55000
57500
60000
62500
65000
67500
70000
72500
75000
77500
80000
82500
85000
87500
90000
92500
95000
97500
100000
102500
105000
105942
train_multi_inputs.h5
2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000
42500
45000
47500
50000
52500
55000
57500
60000
62500
65000
67500
70000
72500
75000
77500
80000
82500
85000
87500
90000
92500
95000
97500
100000
102500
105000
105942
train_cite_targets.h5
2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000
42500
45000
47500
50000
52500
55000
57500
60000
62500
65000
67500
70000
70988
train_cite_inputs.h5
2500
5000
7500
10000
12500
15000
17500
20000
22500
25000
27500
30000
32500
35000
37500
40000
42500
45000
47500
50000
52500
55000
57500
60000
62500
65000
67500
70000
70988
test_multi_inputs.h5
2500
5000
7500
10000
12500
1500

FileNotFoundError: File Datatset/test_cite_inputs.h5 does not exist

In [11]:
# print("test_cite_inputs.h5")
# convert_h5_to_sparse_csr("Dataset/test_cite_inputs.h5", "New/Dataset/test_cite_inputs")

In [29]:
conda install -c conda-forge pyarrow

Retrieving notices: ...working... done
Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: E:\Anaconda

  added / updated specs:
    - pyarrow


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    arrow-cpp-8.0.0            |   py39hbd6f097_0         6.2 MB
    aws-c-common-0.4.57        |       ha925a31_1         147 KB
    aws-c-event-stream-0.1.6   |       hd77b12b_5          26 KB
    aws-checksums-0.1.9        |       ha925a31_0          50 KB
    aws-sdk-cpp-1.8.185        |       hd77b12b_0         2.5 MB
    blosc-1.21.0               |       h19a0ad4_1         145 KB
    boost-cpp-1.78.0           |       h5b4e17d_0        17.0 MB  conda-forge
    c-ares-1.18.1              |       h8ffe710_0         114 KB  conda-forge
    ca-certificates-2022.12.7  |       h5b45459_0         14














utf8proc-2.6.1       | 312 KB    | ####6      |  46% [A[A[A[A[A[A[A[A[A[A[A[A[A[A








boost-cpp-1.78.0     | 17.0 MB   | ######7    |  68% [A[A[A[A[A[A[A[A







re2-2022.04.01       | 472 KB    | #######4   |  75% [A[A[A[A[A[A[A










imagecodecs-2021.8.2 | 6.0 MB    | #######1   |  71% [A[A[A[A[A[A[A[A[A[A














utf8proc-2.6.1       | 312 KB    | ########## | 100% [A[A[A[A[A[A[A[A[A[A[A[A[A[A













conda-22.11.1        | 908 KB    | ##1        |  21% [A[A[A[A[A[A[A[A[A[A[A[A[A















aws-sdk-cpp-1.8.185  | 2.5 MB    |            |   1% [A[A[A[A[A[A[A[A[A[A[A[A[A[A[A










imagecodecs-2021.8.2 | 6.0 MB    | ########   |  80% [A[A[A[A[A[A[A[A[A[A








boost-cpp-1.78.0     | 17.0 MB   | #######1   |  72% [A[A[A[A[A[A[A[A







re2-2022.04.01       | 472 KB    | #########8 |  98% [A[A[A[A[A[A[A















aws-sdk-cpp-1.8.185  |

In [9]:
!dir /s "New/Dataset"

 Volume in drive F is New Volume
 Volume Serial Number is 92F6-A0F6

 Directory of F:\Minor Project\New\Dataset

19-12-2022  18:14    <DIR>          .
19-12-2022  18:14    <DIR>          ..
19-12-2022  18:14         3,910,641 metadata.parquet
19-12-2022  18:05                 0 train_multi_targets_values.sparse.npz
               2 File(s)      3,910,641 bytes

     Total Files Listed:
               2 File(s)      3,910,641 bytes
               2 Dir(s)  115,190,116,352 bytes free


In [8]:
convert_to_parquet("Dataset/metadata.csv", "New/Dataset/metadata")
convert_to_parquet("Dataset/evaluation_ids.csv", "New/Dataset/evaluation")
convert_to_parquet("Dataset/sample_submission.csv", "New/Dataset/sample_submission")

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.