In [None]:
using BenchmarkTools
using CSV
using DataFrames
using Statistics

include("../src/Kraft.jl")

In [None]:
# Load the CCLE expression GCT file; "Gene" is handed to `Kraft.read_gct` as its second
# argument (presumably the name given to the row-identifier column — confirm in Kraft).
gene_x_sample = Kraft.read_gct("/Users/kwatme/Downloads/CCLE_expression_full.reformatted.gct", "Gene")

In [None]:
# PyCall provides `pyimport`, used here to bind Python modules to Julia names.
using PyCall

# pandas: used further down to build the `pd.Series` target and the `pd.DataFrame` data.
pd = pyimport("pandas")
# Python `kraft` package: its `make_match_panel` is called in the final cell.
kraft = pyimport("kraft")

In [None]:
# Re-store every column except the first as a Vector{Float64}.
for sample in Iterators.drop(names(gene_x_sample), 1)
    gene_x_sample[!, sample] = convert(Vector{Float64}, gene_x_sample[!, sample])
end

gene_x_sample

In [None]:
# Log2-transform every column except the first, in place.
#
# `gene_x_sample[!, 2:end]` is relied on to hand back the parent's own column vectors
# (no copy), so the broadcasted assignment below writes through to `gene_x_sample`.
#
# `log2` is used instead of the generic `log(2, x)`: Julia's documentation recommends the
# dedicated function for base 2 because it is typically faster and more accurate.
# Zeros still map to -Inf and negative values still raise a DomainError.
for column in eachcol(gene_x_sample[!, 2:end])
    column .= log2.(column)
end

gene_x_sample

In [None]:
# Swap every -Inf for NaN in all columns except the first; each column is replaced by
# the new vector that `replace` returns.
for sample in Iterators.drop(names(gene_x_sample), 1)
    gene_x_sample[!, sample] = replace(gene_x_sample[!, sample], -Inf => NaN)
end

gene_x_sample

In [None]:
"""
    normalize_vector__0_(vector::AbstractVector{<:Real})

Return a `Float64` copy of `vector` whose non-`NaN` entries are standardized: their mean
is subtracted and the result is divided by their sample standard deviation (`std`).

`NaN` entries are left as `NaN` and are excluded from the mean and standard deviation.
The input is never modified.

When there are fewer than two non-`NaN` values, or all of them are equal, the standard
deviation is `NaN` or `0`, so every entry of the result is `NaN`.

# Examples
```julia
normalize_vector__0_([-2, 0, 2])            # [-1.0, 0.0, 1.0]
normalize_vector__0_([NaN, -2, 0, 2, NaN])  # [NaN, -1.0, 0.0, 1.0, NaN]
```
"""
function normalize_vector__0_(vector::AbstractVector{<:Real})
    # Broadcasting `convert` always allocates, so the caller's data is left untouched.
    normalized = convert.(Float64, vector)

    is_not_nan = .!isnan.(normalized)

    observed = normalized[is_not_nan]

    # Fully dotted so the centering and scaling fuse into a single pass.
    normalized[is_not_nan] .= (observed .- mean(observed)) ./ std(observed)

    return normalized
end

# Print the normalizer's output for a handful of small inputs, including all-NaN,
# constant and single-element vectors.
foreach((
    [-2, 0, 2],
    [NaN, -2, 0, 2, NaN],
    [NaN, NaN, NaN],
    [-2, 2],
    [0, 0],
    [0],
)) do example
    println(normalize_vector__0_(example))
end

In [None]:
# Run `normalize_vector__0_` over every column except the first and store the result back.
for sample in Iterators.drop(names(gene_x_sample), 1)
    gene_x_sample[!, sample] = normalize_vector__0_(gene_x_sample[!, sample])
end

gene_x_sample

In [None]:
# Substrings used below to pick gene sets by name (matched with `occursin`).
gene_set_keywords = String[
    "VANTVEER_BREAST_CANCER_ESR1", "DOANE_BREAST_CANCER_ESR1", "YANG_BREAST_CANCER_ESR1",
    "CHARAFE_BREAST_CANCER_LUMINAL_VS_BASAL", "HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION",
]

In [None]:
# Read both GMT collections in one call.
gene_set_genes = Kraft.read_gmt(["c2.all.v6.2.symbols.gmt", "h.all.v6.2.symbols.gmt"])

# Keep only the gene sets whose name contains at least one of `gene_set_keywords`.
selected_gene_set_genes = Dict{String, Vector{String}}(
    gene_set => genes
    for (gene_set, genes) in gene_set_genes
    if any(keyword -> occursin(keyword, gene_set), gene_set_keywords)
)

selected_gene_set_genes

In [None]:
# Statistic name passed as the third argument to `Kraft.compute_set_enrichment` below.
# NOTE(review): "ks" presumably selects a Kolmogorov-Smirnov-style statistic — confirm in Kraft.
statistic = "ks"

In [None]:
# Compute set enrichment for the selected gene sets, then feed the result straight into
# `Kraft.combine_gene_sets_dn_up`.
gene_set_x_sample =
    Kraft.compute_set_enrichment(gene_x_sample, selected_gene_set_genes, statistic) |>
    Kraft.combine_gene_sets_dn_up

In [None]:
# Load the RPPA GCT file; "Protein" is the second argument to `Kraft.read_gct` and is
# also the column looked up below.
protein_x_sample = Kraft.read_gct("/Users/kwatme/Downloads/CCLE_RPPA_20180123.gct", "Protein")

# Rows whose "Protein" entry is exactly "ER-alpha".
target_row_indices = findall(protein_x_sample[!, :Protein] .== "ER-alpha")

# Exactly one matching row is required.
length(target_row_indices) == 1 || error("there should be only 1 target.")

# Every column name after the first becomes the index of the target series.
target_names = names(protein_x_sample)[2:end]

# Values of the single matching row across columns 2 through the last.
target_values = [
    protein_x_sample[first(target_row_indices), column_index]
    for column_index in 2:size(protein_x_sample, 2)
]

target = pd.Series(target_values, index = target_names, name = "ER Protein")

In [None]:
# Build a pandas DataFrame: one entry per column after the first, indexed by the values
# of the first column.
data = pd.DataFrame(
    Dict(
        sample => gene_set_x_sample[!, sample]
        for sample in Iterators.drop(names(gene_set_x_sample), 1)
    ),
    index = gene_set_x_sample[!, 1],
)

In [None]:
# Hand the pandas target series and data frame to the Python `kraft` package.
kraft.make_match_panel(target, data)