In [None]:
# Load the autoreload extension and enable mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import os
from pprint import pprint
import sys

import numpy as np
import pandas as pd
import matplotlib as mpl

sys.path.insert(0, os.path.expanduser('~/Jumis/github_kwatme'))
from sequencing_process.sequencing_process.process_fastq_gz import count_transcripts_using_kallisto_quant
from plot.plot.plot_distribution import plot_distribution

In [None]:
# Job count forwarded as `n_job=` to count_transcripts_using_kallisto_quant
# in the quantification loop below; tune to the machine running the notebook.
n_job = 16

In [None]:
# Build one (mate 1, mate 2) FASTQ path pair per sample ID.
paired_fastq_gz_file_paths = []

for sample_id in ('C_1', 'C_2', 'C_3', 'VS_1', 'VS_2', 'VS_3'):

    paired_fastq_gz_file_paths.append((
        '../data/{}_1.fq.gz'.format(sample_id),
        '../data/{}_2.fq.gz'.format(sample_id),
    ))

pprint(paired_fastq_gz_file_paths)

# Fail early, naming the offending path, if any input file is missing.
for fastq_gz_file_path_pair in paired_fastq_gz_file_paths:

    for fastq_gz_file_path in fastq_gz_file_path_pair:

        assert os.path.isfile(fastq_gz_file_path), fastq_gz_file_path

In [None]:
# cDNA FASTA passed to count_transcripts_using_kallisto_quant in the
# quantification loop below; stop here (showing the path) if it is missing.
cdna_fasta_gz_file_path = '../../../data/grch/Homo_sapiens.GRCh38.cdna.all.fa.gz'
assert os.path.isfile(cdna_fasta_gz_file_path), cdna_fasta_gz_file_path

In [None]:
# Tab-separated annotation table, indexed by its second column
# (presumably the transcript ID, since the resulting mapping is later applied
# to the kallisto abundance index -- confirm against the file).
enst = pd.read_csv('../../../data/enst.tsv', sep='\t', index_col=1)

pprint(enst.iloc[:5])

# Mapping: index value -> 'Gene name'.
gene_name_column = enst['Gene name']
enst_gene_name = gene_name_column.to_dict()

len(enst_gene_name)

In [None]:
# Quantify every sample with kallisto, then collect one TPM column per sample
# into a single transcript-by-sample table.
tpms = []

for paired_fastq_gz_file_path_0, paired_fastq_gz_file_path_1 in paired_fastq_gz_file_paths:

    # Sample ID = common prefix of the two mate file names with the trailing
    # '_' / '.' separators stripped (e.g. 'C_1_1.fq.gz' & 'C_1_2.fq.gz' -> 'C_1').
    sample_id = os.path.commonprefix((
        paired_fastq_gz_file_path_0.split('/')[-1],
        paired_fastq_gz_file_path_1.split('/')[-1], )).strip('_.')
    print(sample_id)

    try:
        count_transcripts_using_kallisto_quant(
            (
                paired_fastq_gz_file_path_0,
                paired_fastq_gz_file_path_1, ),
            cdna_fasta_gz_file_path,
            '../output/kallisto/{}'.format(sample_id),
            n_job=n_job)

    except FileExistsError:
        # Deliberate best-effort cache: presumably raised when the output for
        # this sample already exists, in which case the existing
        # abundance.tsv is reused below instead of re-running kallisto.
        pass

    tpm = pd.read_table(
        '../output/kallisto/{}/abundance.tsv'.format(sample_id),
        index_col=0)['tpm']
    tpm.name = '{}_tpm'.format(sample_id)

    tpms.append(tpm)

    print(tpm.describe())

    # Smallest strictly positive TPM, used as the floor for zeros before the
    # log. Computed directly rather than as "second-smallest distinct value",
    # which is only the non-zero minimum when a zero happens to be present.
    min_nonzero_tpm = tpm[0 < tpm].min()

    plot_distribution(
        np.log(np.where(0 < tpm, tpm, min_nonzero_tpm)),
        decorate_ax_kwargs={
            'title':
            'Distribution of log(TPM with 0 Replaced with Non-0 Min)',
        })

    plot_distribution(
        np.log(tpm[tpm != 0]),
        decorate_ax_kwargs={
            'title': 'Distribution of log(TPM with 0 Removed)',
        })

    # NOTE: `mpl.pyplot` resolves only because matplotlib.pyplot has already
    # been imported elsewhere in the session (e.g. by `%matplotlib inline`).
    mpl.pyplot.show()

# Columns are '<sample_id>_tpm'; rows are the abundance.tsv index values.
enst_x_sample = pd.concat(tpms, axis=1)

enst_x_sample

In [None]:
# Relabel each row with the gene name mapped from its current index value;
# index values missing from the mapping become null labels.
gene_names = enst_x_sample.index.map(enst_gene_name.get)

gene_x_sample = enst_x_sample.copy()
gene_x_sample.index = gene_names
print(gene_x_sample.shape)

# Keep only the rows that received a gene name.
has_gene_name = gene_x_sample.index.notnull()
gene_x_sample = gene_x_sample.loc[has_gene_name]
print(gene_x_sample.shape)

# Number of distinct gene names that remain.
unique_gene_names = gene_x_sample.index.unique()
print(unique_gene_names.size)

gene_x_sample

In [None]:
# Collapse rows sharing a gene name to one row per gene, keeping the
# per-sample maximum TPM across that gene's rows.
max_tpm__gene_x_sample = gene_x_sample.groupby(level=0).max()
print(max_tpm__gene_x_sample.shape)

max_tpm__gene_x_sample = max_tpm__gene_x_sample.sort_index()

# Clear both axis names so the written TSV carries no axis labels.
# (The original assigned to `columns.nAme`, which only created a stray
# attribute and left `columns.name` untouched.)
max_tpm__gene_x_sample.index.name = None
max_tpm__gene_x_sample.columns.name = None

max_tpm__gene_x_sample.to_csv('../output/max_tpm__gene_x_sample.tsv', sep='\t')

plot_distribution(
    max_tpm__gene_x_sample.values.flatten(),
    decorate_ax_kwargs={
        'title': 'Maximum TPM',
    })

max_tpm__gene_x_sample

In [None]:
# Log-transform the gene-by-sample maxima. Zeros are first replaced with the
# smallest strictly positive value in the table so that log() stays finite.
# The minimum is taken over positive entries explicitly; the previous
# "second-smallest distinct value" only equals it when a zero is present.
max_tpm_values = max_tpm__gene_x_sample.values
min_nonzero_max_tpm = max_tpm_values[0 < max_tpm_values].min()

max_tpm_log__gene_x_sample = np.log(
    max_tpm__gene_x_sample.replace(0, min_nonzero_max_tpm))

max_tpm_log__gene_x_sample.to_csv(
    '../output/max_tpm_log__gene_x_sample.tsv', sep='\t')

plot_distribution(
    max_tpm_log__gene_x_sample.values.flatten(),
    decorate_ax_kwargs={
        'title': 'log(Maximum TPM with 0 Replaced with Non-0 Min)'
    })

max_tpm_log__gene_x_sample