In [1]:
# Gene names are read from a text file; its first line is dropped (`[2:end]`).
const genes = readlines("/home/kwat/github/kraft/notebook/genes.txt")[2:end];

# Every gene receives the same score, 1 / (number of genes).
const scores = fill(inv(length(genes)), length(genes));

# `read_gmt` is defined outside this part of the notebook; judging by how the
# result is used below it returns a dictionary keyed by gene-set name and
# holding that set's genes -- TODO confirm.
const gene_set_dict = read_gmt("/home/kwat/garden/data/gene_set/msigdb_v6.2/h.all.v6.2.symbols.gmt")

# Take the gene set whose key sorts first and keep its genes in sorted order.
const gene_set_genes = sort(gene_set_dict[sort!(collect(keys(gene_set_dict)))[1]]);

In [2]:
using BenchmarkTools

In [3]:
"""
    make_hits(elements, elements_to_find)

Return a vector with one entry per element of `elements`: `1` where that
element also occurs in `elements_to_find`, `0` otherwise.
"""
function make_hits(
    elements::Array{String, 1},
    elements_to_find::Array{String, 1},
)

    # Hash the wanted elements once so each membership test is a single lookup.
    wanted = Set(elements_to_find)

    hits = Array{Int64, 1}(
        undef,
        length(elements),
    )

    # `hits` was allocated with the length of `elements`, so every index
    # produced by `eachindex(elements)` is valid for both arrays.
    @inbounds for i in eachindex(elements)

        hits[i] = elements[i] in wanted ? 1 : 0

    end

    return hits

end

make_hits (generic function with 1 method)

In [4]:
"""
    sum_hit_scores(scores, hits)

Return the sum of `abs(scores[i])` over every index `i` where `hits[i] == 1`.
Entries of `hits` with any other value are ignored.

# Throws
- `DimensionMismatch` if `scores` and `hits` differ in length.
"""
function sum_hit_scores(
    scores::Array{Float64, 1},
    hits::Array{Int64, 1},
)

    # The loop below runs without bounds checks, so the two arrays must be
    # verified to have the same length before it starts.
    if length(scores) != length(hits)

        throw(DimensionMismatch(
            "scores has length $(length(scores)) but hits has length $(length(hits))",
        ))

    end

    sum_ = 0.0

    @inbounds @fastmath @simd for i in eachindex(scores)

        if hits[i] == 1

            sum_ += abs(scores[i])

        end

    end

    return sum_

end

sum_hit_scores (generic function with 1 method)

In [5]:
"""
    sum_hits(hits)

Return the total of all entries in `hits` (zero for an empty vector).
"""
function sum_hits(hits::Array{Int64, 1})

    return sum(hits)

end

sum_hits (generic function with 1 method)

In [6]:
"""
    pick_float_by_abs(float_1, float_2)

Return whichever argument has the larger absolute value. On a tie `float_1`
is returned.

The comparison uses ordinary IEEE semantics, so it is false whenever either
argument is `NaN`; in that case `float_1` is returned as well.
"""
function pick_float_by_abs(
    float_1::Float64,
    float_2::Float64,
)

    # Strict `<` keeps `float_1` on ties and on NaN comparisons.
    return abs(float_1) < abs(float_2) ? float_2 : float_1

end

pick_float_by_abs (generic function with 1 method)

In [7]:
# Map each gene name to its 1-based position in `genes`.
const gene_rank = Dict(zip(genes, eachindex(genes)))

# One entry per gene: 1 if the gene belongs to `gene_set_genes`, otherwise 0.
const hits = make_hits(genes, gene_set_genes)

57954-element Array{Int64,1}:
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 ⋮
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0
 0

In [10]:
"""
    compute_gene_set_enrichment(genes, scores, gene_set_genes, hits)

Walk the gene list from first to last while keeping a running sum. At an index
where `hits[i] == 1` the sum grows by `abs(scores[i])` divided by the total
absolute score of all hits; at every other index it changes by
`-1 / (length(genes) - sum_hits(hits))`.

# Arguments
- `genes`: only its length is used.
- `scores`: one score per gene.
- `gene_set_genes`: not read by this function; kept so the signature is unchanged.
- `hits`: one flag per gene; `1` marks a hit.

# Returns
A tuple `(cumulative_sum, auc, min_, max_)`:
- `cumulative_sum`: the running sum after each gene.
- `auc`: the total of all entries of `cumulative_sum`.
- `min_`, `max_`: the smallest and largest running-sum values seen, each
  starting from `0.0`.

# Throws
- `DimensionMismatch` if `scores` or `hits` differs in length from `genes`.

If the hits carry a total absolute score of zero, the hit step divides by zero
and the returned values are not meaningful.
"""
function compute_gene_set_enrichment(
    genes::Array{String, 1},
    scores::Array{Float64, 1},
    gene_set_genes::Array{String, 1},
    hits::Array{Int64, 1},
)

    n = length(genes)

    # The main loop skips bounds checks, so all per-gene arrays must be
    # confirmed to share the same length up front.
    if length(scores) != n || length(hits) != n

        throw(DimensionMismatch(
            "genes, scores and hits must have equal lengths, got $n, $(length(scores)) and $(length(hits))",
        ))

    end

    cumulative_sum = Array{Float64, 1}(
        undef,
        n,
    )

    # Normaliser for the steps taken at hit positions.
    hit_scores_sum = sum_hit_scores(
        scores,
        hits,
    )

    # Step taken at every non-hit position. When `hits` holds only 0s and 1s
    # the denominator is the number of non-hits.
    d_down = -1 / (n - sum_hits(hits))

    value = 0.0

    auc = 0.0

    min_ = 0.0

    max_ = 0.0

    # No `@simd` here: every iteration needs the `value` left behind by the
    # previous one and stores it, so the iterations cannot be reordered.
    @inbounds @fastmath for i in 1:n

        if hits[i] == 1

            value += abs(scores[i]) / hit_scores_sum

        else

            value += d_down

        end

        cumulative_sum[i] = value

        auc += value

        # `min_ <= 0 <= max_` always holds, so a value below `min_` can never
        # also exceed `max_`; the `elseif` therefore loses nothing.
        if value < min_

            min_ = value

        elseif max_ < value

            max_ = value

        end

    end

    return cumulative_sum, auc, min_, max_

end


# Interpolate the global inputs with `$` so the benchmark receives them as
# already-evaluated values rather than looking the globals up while timing.
@benchmark compute_gene_set_enrichment(
    $genes,
    $scores,
    $gene_set_genes,
    $hits,
)

BenchmarkTools.Trial: 
  memory estimate:  452.94 KiB
  allocs estimate:  3
  --------------
  minimum time:     99.528 μs (0.00% GC)
  median time:      103.191 μs (0.00% GC)
  mean time:        111.654 μs (6.27% GC)
  maximum time:     30.679 ms (99.55% GC)
  --------------
  samples:          10000
  evals/sample:     1

In [11]:
using PyCall

In [12]:
# Load the Python `pandas` module via `pyimport`; used below to build a Series.
pd = pyimport("pandas")

# Load the Python `kraft` package, whose `run_single_sample_gsea` is
# benchmarked further down.
kraft = pyimport("kraft")

PyObject <module 'kraft' from '/home/kwat/github/kraft/kraft/__init__.py'>

In [13]:
# Wrap the scores in a pandas Series indexed by gene name.
gene_score = pd.Series(scores; index = genes)

PyObject MIR628        0.000017
RNU6-871P     0.000017
MIR626        0.000017
AC012314.7    0.000017
GHRLOS        0.000017
                ...   
KP420441.5    0.000017
KP420440.2    0.000017
KP420440.6    0.000017
KP420440.5    0.000017
KP420446.1    0.000017
Length: 57954, dtype: float64

In [15]:
# Print the value returned by the Python implementation once, for reference.
println(kraft.run_single_sample_gsea(
    gene_score,
    gene_set_genes,
    hit=hits,
    plot=false,
))

# Time the same call. The inputs are interpolated with `$` so that access to
# globals (`gene_score` is a non-const global) is not part of the measurement.
@benchmark kraft.run_single_sample_gsea(
    $gene_score,
    $gene_set_genes,
    hit=$hits,
    plot=false,
)

-0.11891713503802033


BenchmarkTools.Trial: 
  memory estimate:  5.94 KiB
  allocs estimate:  258
  --------------
  minimum time:     519.096 μs (0.00% GC)
  median time:      539.163 μs (0.00% GC)
  mean time:        577.080 μs (1.74% GC)
  maximum time:     90.850 ms (64.30% GC)
  --------------
  samples:          8622
  evals/sample:     1