In [1]:
import numpy as np
from scipy import sparse
from scanpy import read_h5ad

In [7]:
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
acinar_sce = read_h5ad(
    "/home/julian/Uni/MasterThesis/data/acinar_sce.h5ad"
)
# Number of distinct donor ages minus one; passed as `n_thresholds` in the cells below.
k = len(acinar_sce.obs["donor_age"].unique()) - 1

## Old Implementation

In [15]:
def restructure_X_to_bin(X_orig, n_thresholds):
    """Expand a dense matrix into the stacked, threshold-tagged layout.

    Loop-based reference implementation, kept as the baseline for the
    benchmark and validation cells.

    The result holds ``n_thresholds`` vertically stacked copies of
    ``X_orig``. Copy ``j`` is extended on the right by ``n_thresholds``
    indicator columns that are 0 everywhere except column ``j``, which is 1.

    Parameters
    ----------
    X_orig : numpy.ndarray
        Dense 2-D array with ``n`` rows and ``p`` columns.
    n_thresholds : int
        Number of stacked copies and of appended indicator columns.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n * n_thresholds, p + n_thresholds)``. The
        indicator columns are created as integers, so the final dtype
        follows NumPy's promotion rules for ``X_orig`` and int.
    """
    # Data part: the same rows repeated once per threshold.
    stacked_data = np.concatenate([X_orig.copy()] * n_thresholds)

    n_rows = X_orig.shape[0]
    indicator_blocks = None
    for threshold_idx in range(n_thresholds):
        # Integer block tagging every row of this copy with its threshold.
        block = np.zeros((n_rows, n_thresholds), dtype=int)
        block[:, threshold_idx] = 1
        if indicator_blocks is None:
            indicator_blocks = block
        else:
            indicator_blocks = np.concatenate([indicator_blocks, block])

    return np.concatenate([stacked_data, indicator_blocks], axis=1)

In [48]:
# Single-run timing of the loop-based baseline; `.shape` keeps the displayed result small.
%time restructure_X_to_bin(acinar_sce.X, k).shape

CPU times: user 116 ms, sys: 129 ms, total: 246 ms
Wall time: 244 ms


(2877, 23375)

## New Implementation

In [34]:
def restructure_X_to_bin_new(X_orig, n_thresholds):
    """Vectorised construction of the stacked, threshold-tagged matrix.

    Produces ``n_thresholds`` vertically stacked copies of ``X_orig`` and
    appends ``n_thresholds`` indicator columns: rows belonging to copy ``j``
    carry a 1 in indicator column ``j`` and 0 elsewhere.

    Parameters
    ----------
    X_orig : numpy.ndarray or scipy.sparse matrix
        2-D input with ``n`` rows and ``p`` columns. Sparse input of any
        format is accepted; it is converted to CSR before row selection,
        because formats such as COO or DIA may not support indexing.
    n_thresholds : int
        Number of stacked copies and of appended indicator columns.

    Returns
    -------
    numpy.ndarray or scipy.sparse matrix
        Shape ``(n * n_thresholds, p + n_thresholds)``. Dense input yields a
        dense array; sparse input yields a sparse matrix in the default
        output format of ``scipy.sparse.hstack``.
    """
    n = X_orig.shape[0]
    stacked_rows = np.arange(n * n_thresholds)
    source_row = stacked_rows % n    # row of X_orig feeding each output row
    copy_idx = stacked_rows // n     # copy (threshold) each output row belongs to
    thresholds = np.identity(n_thresholds)

    if sparse.issparse(X_orig):
        # Row indexing needs an indexable format; tocsr() is a no-op for CSR input.
        X_csr = X_orig.tocsr()
        thresholds = sparse.csr_matrix(thresholds)
        return sparse.hstack((X_csr[source_row], thresholds[copy_idx]))

    return np.hstack((X_orig[source_row], thresholds[copy_idx]))

In [11]:
# Single-run timing of the vectorised implementation on the same dense input.
%time restructure_X_to_bin_new(acinar_sce.X, k).shape

CPU times: user 111 ms, sys: 183 ms, total: 294 ms
Wall time: 294 ms


(2877, 23375)

## Timeit

In [20]:
import timeit, functools

In [21]:
# Total seconds for 100 calls of the loop-based baseline on the dense matrix.
t_old = timeit.Timer(
    functools.partial(restructure_X_to_bin, acinar_sce.X, k)
)
t_old.timeit(number=100)

19.872561967000365

In [35]:
# Total seconds for 100 calls of the vectorised implementation on the dense matrix.
t_new = timeit.Timer(
    functools.partial(restructure_X_to_bin_new, acinar_sce.X, k)
)
t_new.timeit(number=100)

16.619350277993362

In [36]:
# Same benchmark for the vectorised implementation, this time on a CSR copy of the matrix.
X_sparse = sparse.csr_matrix(acinar_sce.X)
t_sparse = timeit.Timer(
    functools.partial(restructure_X_to_bin_new, X_sparse, k)
)
t_sparse.timeit(number=100)

8.406985062989406

## Validation

In [37]:
# Dense input: the vectorised implementation must match the baseline element for element.
(
    restructure_X_to_bin(acinar_sce.X, k)
    == restructure_X_to_bin_new(acinar_sce.X, k)
).all()

True

In [47]:
# Sparse (CSR) input, densified for the comparison, must also match the baseline.
(
    restructure_X_to_bin(acinar_sce.X, k)
    == restructure_X_to_bin_new(X_sparse, k).todense()
).all()

True