# Playing around with gene set scoring

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import  scsingscore.scsingscore  as si
from scipy import sparse

In [2]:
# Dimensions of the synthetic matrix built in the following cells.
n_genes, n_cells = 100, 1000

In [9]:
# Synthetic sparse matrix (n_cells x n_genes) with ~10% stored entries.
# A fixed seed keeps the matrix, and everything derived from it, identical
# across "Restart Kernel -> Run All".
X = sparse.random(n_cells, n_genes, density=0.1, dtype=np.int32, random_state=0)

In [10]:
# Synthetic labels: one 'g<i>' name per gene and one 'cell<i>' name per cell.
varnames = list(f'g{i}' for i in range(n_genes))
obsnames = list(f'cell{i}' for i in range(n_cells))

In [11]:
# Wrap the synthetic matrix in an AnnData object whose obs/var frames carry
# only the cell and gene labels, then run PCA and build the neighbor graph.
adata = sc.AnnData(
    X,
    obs=pd.DataFrame(index=obsnames),
    var=pd.DataFrame(index=varnames),
)
sc.pp.pca(adata)
sc.pp.neighbors(adata)

In [15]:
# NOTE(review): this rebinds `adata`, so the synthetic AnnData built in the
# previous cell is discarded and all later cells operate on this file instead.
# The absolute path is specific to one machine and must be adjusted to re-run.
adata = sc.read_h5ad('/home/michi/IS-05_export.h5ad')

In [54]:
# Gene names passed as the gene set to the scoring calls in this notebook.
gene_set = ['MT-CO1', 'MT-ND4']

In [29]:
# End-to-end score for a single cell (second argument 0 is presumably the
# cell index - confirm against scsingscore). The cells that follow step
# through what looks like the same pipeline by hand.
si.sc_score_one(
    adata,
    0,
    noise_trials=10,
    num_neighbors=10,
    samp_neighbors=5,
    gene_set=gene_set,
    compute_neighbors=False,
)

0.47904984423676006

In [34]:
# Same values as used in the sc_score_one call, named so the individual
# steps below can share them.
celli, num_neighbors, samp_neighbors = 0, 10, 5

In [36]:
from scsingscore.scsingscore import _refactor1

In [39]:
# Call the private helper `_refactor1` directly for cell `celli`.
# The result is a sparse matrix; the dense view in the next cell shows 5 rows,
# which matches samp_neighbors - presumably one row per sampled neighbor
# (TODO confirm against the scsingscore source).
gene_counts = _refactor1(adata, celli, num_neighbors, samp_neighbors)

In [45]:
# Dense view of the sparse counts. `.toarray()` is the documented conversion
# and works for both scipy sparse matrices and sparse arrays, whereas the
# `.A` shorthand only exists on the matrix classes.
gene_counts.toarray()

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [41]:
def to_dense_transpose_list(gene_counts):
    """
    Sum every gene over all cells and locate the genes with a positive total.

    gene_counts: cell x gene sparse matrix

    returns: (gene_mat, gdx)
        gene_mat: per-gene sums with one row per gene (for a scipy sparse
                  matrix input this is an (n_genes, 1) numpy matrix)
        gdx:      list of the row positions in gene_mat whose sum is > 0
    """
    # Sum on the sparse matrix first, so only the small per-gene result is
    # dense instead of the whole cell x gene matrix.
    gene_mat = gene_counts.sum(axis=0).T

    # `np` is the alias bound by the notebook's import cell; the bare name
    # `numpy` is never imported there and raised NameError on a fresh kernel.
    positive = np.flatnonzero(np.asarray(gene_mat).ravel() > 0)
    return gene_mat, positive.tolist()


In [46]:
    # Per-gene sums over the rows of gene_counts, plus the positions of the
    # genes whose sum is greater than zero.
    gene_mat, gdx = to_dense_transpose_list(gene_counts)


In [47]:
    # One row per gene, labelled with the gene names from adata.var, then
    # restricted to the genes listed in gdx. Uses the `pd` alias from the
    # import cell; the bare name `pandas` is not bound by any visible import.
    df = pd.DataFrame(gene_mat, index=adata.var.index)
    df = df.iloc[gdx, :]
    df.columns = ['gene_counts']

In [48]:
# Inspect the single-column 'gene_counts' table (one row per retained gene).
df

Unnamed: 0,gene_counts
AAK1,1.0
AASDHPPT,2.0
ABCA7,1.0
ABCB1,1.0
ABCC1,1.0
...,...
ZNF708,1.0
ZNF761,1.0
ZNF770,1.0
ZNHIT3,1.0


In [51]:
# Build the noisy copies of the gene counts; the result shown below has one
# column per trial (0..noise_trials-1).
# NOTE(review): `add_noise` is not imported in any visible cell - presumably it
# comes from scsingscore.scsingscore; import it explicitly so a fresh kernel
# can run this. The meaning of 0.01 / 0.99 is not visible here (presumably the
# bounds of the added noise) - confirm against the helper.
noise_trials = 10
df_noise = add_noise(df, noise_trials, 0.01, 0.99) ## slow part .. fixed

In [52]:
# Inspect the noisy counts: rows are genes, columns are the noise trials.
df_noise

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
AAK1,1.278173,1.610346,1.342867,1.971786,1.079267,1.060364,1.662122,1.778045,1.386309,1.427358
AASDHPPT,2.641344,2.533873,2.971781,2.839987,2.855214,2.978598,2.884642,2.522951,2.110913,2.094457
ABCA7,1.068667,1.363238,1.416418,1.438647,1.179994,1.670359,1.456383,1.475286,1.098192,1.077189
ABCB1,1.304807,1.960396,1.691135,1.466404,1.565718,1.771457,1.816967,1.075508,1.810190,1.545215
ABCC1,1.844696,1.495774,1.819656,1.844031,1.854859,1.752314,1.633316,1.552612,1.309472,1.477699
...,...,...,...,...,...,...,...,...,...,...
ZNF708,1.338104,1.798728,1.698730,1.540789,1.259118,1.650405,1.386305,1.873403,1.288001,1.368655
ZNF761,1.697869,1.482916,1.600712,1.530682,1.372345,1.231559,1.283503,1.159303,1.080118,1.171343
ZNF770,1.714776,1.739455,1.320338,1.622169,1.193825,1.836380,1.307995,1.336498,1.890217,1.172072
ZNHIT3,1.756336,1.241553,1.198204,1.196001,1.759084,1.264214,1.932001,1.040678,1.596260,1.546397


In [56]:
# Score the gene set on every noise-trial column of df_noise.
# NOTE(review): `score` is not imported in any visible cell - presumably it
# comes from scsingscore; confirm and import it explicitly.
s = score(up_gene=gene_set, sample=df_noise, norm_method='standard', full_data=False)  # 'standard' works better here than 'theoretical'

In [57]:
# Inspect the scores: one 'total_score' row per noise trial.
s

Unnamed: 0,total_score
0,0.464631
1,0.466865
2,0.468354
3,0.468354
4,0.469471
5,0.469099
6,0.466865
7,0.465376
8,0.469844
9,0.465376


In [59]:
# Re-display df_noise; the values shown are identical to the earlier display.
df_noise

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
AAK1,1.278173,1.610346,1.342867,1.971786,1.079267,1.060364,1.662122,1.778045,1.386309,1.427358
AASDHPPT,2.641344,2.533873,2.971781,2.839987,2.855214,2.978598,2.884642,2.522951,2.110913,2.094457
ABCA7,1.068667,1.363238,1.416418,1.438647,1.179994,1.670359,1.456383,1.475286,1.098192,1.077189
ABCB1,1.304807,1.960396,1.691135,1.466404,1.565718,1.771457,1.816967,1.075508,1.810190,1.545215
ABCC1,1.844696,1.495774,1.819656,1.844031,1.854859,1.752314,1.633316,1.552612,1.309472,1.477699
...,...,...,...,...,...,...,...,...,...,...
ZNF708,1.338104,1.798728,1.698730,1.540789,1.259118,1.650405,1.386305,1.873403,1.288001,1.368655
ZNF761,1.697869,1.482916,1.600712,1.530682,1.372345,1.231559,1.283503,1.159303,1.080118,1.171343
ZNF770,1.714776,1.739455,1.320338,1.622169,1.193825,1.836380,1.307995,1.336498,1.890217,1.172072
ZNHIT3,1.756336,1.241553,1.198204,1.196001,1.759084,1.264214,1.932001,1.040678,1.596260,1.546397


In [60]:
# Re-display the scores; the values shown are identical to the earlier display.
s

Unnamed: 0,total_score
0,0.464631
1,0.466865
2,0.468354
3,0.468354
4,0.469471
5,0.469099
6,0.466865
7,0.465376
8,0.469844
9,0.465376


In [151]:
    # NOTE(review): `nn_smoothing` is not imported in any visible cell -
    # presumably from scsingscore; confirm and import it explicitly.
    # This rebinds `gene_mat` (previously the per-gene sums) to a sparse
    # cells x genes matrix, and passes a literal samp_neighbors=10 rather than
    # the `samp_neighbors` variable (5) set earlier.
    gene_mat = nn_smoothing(adata.X, adata, 'connectivity', samp_neighbors=10)

In [153]:
# Density of gene_mat: stored entries divided by the total number of entries.
n_rows, n_cols = gene_mat.shape
gene_mat.nnz / (n_rows * n_cols)

0.08961648757276029

In [156]:
# Transposed view (genes x cells); only the repr is shown, nothing is densified.
gene_mat.T

<33538x23788 sparse matrix of type '<class 'numpy.float64'>'
	with 71496208 stored elements in Compressed Sparse Column format>

In [157]:
# Number of genes; matches the row count of gene_mat.T shown above.
adata.var.index.shape

(33538,)

In [158]:
# Number of cells; matches the column count of gene_mat.T shown above.
adata.obs.index.shape

(23788,)

In [166]:
# Boolean mask of the rows of df whose values sum to exactly zero.
# NOTE(review): this cell (In [166]) was executed after the cell that builds
# the dense genes x cells `df` (In [164], further down); run top to bottom it
# would see the earlier single-column `df` instead. It also rebinds `gdx`,
# which previously held a list of integer positions.
gdx = df.sum(axis=1) == 0

In [None]:
# Rows of df selected by the boolean mask gdx (all columns kept).
a = df.loc[gdx]

In [None]:
# Inspect the selected rows (this cell has no recorded execution or output).
a

In [164]:
    # Dense genes x cells table of gene_mat, labelled with gene and cell names.
    # `.toarray()` is the documented conversion and works for both scipy sparse
    # matrices and sparse arrays, whereas `.A` only exists on the matrix classes.
    # NOTE: this materialises the full matrix in memory - 33538 x 23788 float64
    # values per the shapes displayed earlier, i.e. roughly 6.4 GB.
    df = pd.DataFrame(gene_mat.T.toarray(), index=adata.var.index, columns=adata.obs.index)

In [165]:
# Inspect the dense table: rows are genes, columns are cell barcodes.
df

index,AAACCTGAGACTAGAT-1-0,AAACCTGAGCCACGCT-1-0,AAACCTGAGCCAGTAG-1-0,AAACCTGAGCTCAACT-1-0,AAACCTGAGGTCGGAT-1-0,AAACCTGAGTGGGCTA-1-0,AAACCTGAGTGTGGCA-1-0,AAACCTGCAATGCCAT-1-0,AAACCTGCAATGGAGC-1-0,AAACCTGCACAACGTT-1-0,...,TTTGTCAGTCCTGCTT-1-1,TTTGTCAGTCGGCACT-1-1,TTTGTCAGTCTAACGT-1-1,TTTGTCAGTTACCAGT-1-1,TTTGTCATCAACCATG-1-1,TTTGTCATCCGTAGGC-1-1,TTTGTCATCGAATGGG-1-1,TTTGTCATCGGCGCTA-1-1,TTTGTCATCTCAAACG-1-1,TTTGTCATCTGTACGA-1-1
A1BG,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.02439,0.000000,0.000000,...,0.000000,0.224524,0.050000,0.000000,0.000000,0.000000,0.066667,0.000000,0.000000,0.000000
A1BG-AS1,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
A1CF,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
A2M,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.050000,0.000000,0.000000
A2M-AS1,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.066667,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ZYG11B,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ZYX,0.066667,0.0,0.0,0.000000,0.081081,0.0,0.066667,0.04939,0.414379,0.000000,...,0.207816,0.765224,0.117552,0.150583,0.117022,0.058824,0.000000,0.140591,0.022222,0.103704
ZZEF1,0.000000,0.0,0.0,0.026316,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,...,0.000000,0.000000,0.062500,0.000000,0.000000,0.000000,0.000000,0.050000,0.029412,0.000000
