In [None]:
using Revise

using Kate

### Test Environment

In [None]:
# Call Kate's `test` function before running the pipeline. Per the section heading this
# checks the environment; what exactly is checked is defined inside Kate (confirm there).
Kate.test()

### Set Parameters

In [None]:
# Experiment name; used below as the sub-folder of both "../input/" and "../output/"
ex = "human_cdna_valeria"

# Fragment length (not needed if paired end)
fr = 51

# Standard deviation passed along with `fr` (not needed if paired end)
# NOTE(review): the original comment said "read length" while `fr` is described as a
# fragment length; confirm which one Kate.count_transcript expects.
sd = 0.05

# Reference transcriptome (gzipped FASTA) handed to Kate.count_transcript
paf = "../input/human_reference_transcriptome/Homo_sapiens.GRCh38.cdna.all.fa.gz"

# Mouse alternative for the reference transcriptome:
# paf = "../input/mouse_reference_transcriptome/Mus_musculus.GRCm38.cdna.all.fa.gz"

# Transcript-to-gene table; read later as a tab-delimited file with the columns
# "Transcript stable ID version" and "Gene name"
pat = "../input/human_reference_transcriptome/enst_gene_name.tsv"

# Passed to Kate.check and Kate.count_transcript; presumably the number of parallel
# jobs/threads (confirm in Kate)
n_jo = 10;

In [None]:
# Input root; the experiment's read files are looked up in `pae` beneath it
pai = "../input/"

# Input folder of the experiment selected by `ex`
pae = joinpath(pai, ex)

# Output folder of the experiment selected by `ex`
pao = joinpath("../output/", ex);

### Check reads

In [None]:
# Collect the read files of this experiment from `pae`. The result is iterated
# file-by-file below and each entry is matched against read-1 file-name markers, so it
# is presumably a collection of FASTQ paths (confirm in Kate.find).
re_ = Kate.find(pae)

In [None]:
# Run Kate's read check on every file in `re_`, passing "<pao>/check_raw" as the
# second argument and `n_jo` as the third.
Kate.check(
    re_,
    joinpath(pao, "check_raw"),
    n_jo,
)

### Count transcript

In [None]:
# Read-1 file-name markers. The mate file is located by swapping "1" for "2" inside
# the matched marker (e.g. "R1" -> "R2", "_1.fq" -> "_2.fq").
na_ = ["R1", "read1", "_1.fq"]

# Last marker that matched; kept as a global because the original cell defined it.
naf = ""

# The spelling of this directory name is kept as-is: later cells read from it.
pap = joinpath(pao, "psuedoalign/")

# Quantify every read-1 file once. The whole step is skipped when the output
# directory already exists, so delete it to force a re-run.
if !isdir(pap)

    for fq1 in re_

        # Match on the file name only, so a marker that happens to appear in a parent
        # directory can corrupt neither the mate path nor the sample name.
        fi = basename(fq1)

        # Take the first matching marker only, so a file whose name contains several
        # markers is not quantified more than once.
        ix = findfirst(na -> occursin(na, fi), na_)

        # Not a read-1 file (for example the mate itself): nothing to do.
        if isnothing(ix)

            continue

        end

        naf = na_[ix]

        nar = replace(naf, "1" => "2")

        fq2 = joinpath(dirname(fq1), replace(fi, naf => nar))

        # No mate on disk: pass `nothing`, which Kate.count_transcript presumably
        # treats as single-end input using `fr` and `sd` (confirm in Kate).
        if !isfile(fq2)

            fq2 = nothing

        end

        # Sample name = file name up to the marker, without a trailing extension.
        sa = first(splitext(first(split(fi, naf))))

        println("Working on sample: $sa\n")

        pas = joinpath(pap, sa)

        Kate.count_transcript(paf, pas, n_jo, fq1, fq2, fr, sd)

    end

else

    println("Skipping psuedoalignment because directory already exists: $pap")

end

### TODO: Check mapping rate and abundance output

### Make transcript by sample

In [None]:
# Make sure DataFrames is available in the active environment before the next cell
# loads it. This contacts the package manager on every run of the cell.
using Pkg;

Pkg.add("DataFrames")

In [None]:
using CSV

using DataFrames

# Build the transcript-by-sample TPM table: one row per transcript (`target_id`),
# one column per sample directory found under "<output>/<ex>/psuedoalign/".
tp_tr_sa = DataFrame()

for di in readdir(joinpath("../output/", ex, "psuedoalign/"), join = true)

    # Skip macOS ".DS_Store" entries that sit next to the sample directories.
    if occursin("DS_Store", di)

        continue

    end

    # The sample name is the directory name.
    sa = last(splitdir(di))

    pa = joinpath(di, "abundance.tsv")

    tpm = DataFrame(CSV.File(pa, delim = "\t"))[:, [:target_id, :tpm]]

    rename!(tpm, :tpm => sa)

    if isempty(tp_tr_sa)

        tp_tr_sa = tpm

    else

        # Columns are appended by position, so the transcript ids must line up
        # row-for-row with the table built so far.
        if !isequal(tpm.target_id, tp_tr_sa.target_id)

            error("target_id rows of $pa do not match the previous samples")

        end

        insertcols!(tp_tr_sa, sa => tpm[!, sa])

    end

end

tp_tr_sa = rename!(tp_tr_sa, :target_id => :id)

println(first(tp_tr_sa))

# The file is named .tsv, so write it tab-delimited (CSV.write defaults to commas).
CSV.write(joinpath("../output/", ex, "transcript_x_sample.tsv"), tp_tr_sa; delim = '\t')

### Make gene by sample

In [None]:
using Statistics

# Transcript-to-gene lookup table, read as tab-delimited and renamed to :id / :gene.
tr_ge = DataFrame(CSV.File(pat, delim = "\t"))

tr_ge = rename!(tr_ge, Dict("Transcript stable ID version" => :id, "Gene name" => :gene))


# Map transcript to gene name

tp_trge_sa = sort!(innerjoin(tp_tr_sa, tr_ge, on = :id), :gene)

# `select!` mutates its argument, so `tp_ge__sa` and `tp_trge_sa` are the same table,
# now without the :id column.
tp_ge__sa = select!(tp_trge_sa, Not(:id))


# Sum the tpm of all transcripts of each gene, for every sample

gr = groupby(tp_ge__sa, :gene)

# Every remaining column except :gene is treated as a sample column.
sa_ = [n for n in names(tp_trge_sa) if n != "gene"]

# One `combine` over all sample columns replaces a per-sample combine followed by
# repeated joins on :gene (joins reject `missing` keys by default). The default
# output naming is kept, so each column is called "<sample>_sum".
tp_ge_sa = combine(gr, sa_ .=> sum)

# `first(df, 5)` also works when there are fewer than five genes.
println(first(tp_ge_sa, 5))

# Save gene by sample

# The file is named .tsv, so write it tab-delimited (CSV.write defaults to commas).
CSV.write(joinpath("../output/", ex, "gene_x_sample.tsv"), tp_ge_sa; delim = '\t')