# GSE112622 transcription TPM vs H3K36me3

See `GSE112622Experiment` in `epigenome` project.
This experiment provides information per gene.
For each gene:
* _PEAKS - number of peaks intersecting with gene body for TOOL
* _COVERAGE - information about percentage of gene body length covered by peaks for TOOL
* _RPM / _RPKM - normalized reads coverage in gene body
* _TPM / _FPKM - expression estimates (transcripts per million / fragments per kilobase per million mapped reads) provided by RSEM

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from IPython.display import display
import pandas as pd

import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
import scipy as sp
import os

In [None]:
FOLDER = '/mnt/stripe/bio/experiments/gse112622'

# Load the per-gene table; everything after a '#' on a line is ignored.
df = pd.read_csv(os.path.join(FOLDER, 'data.tsv'), sep='\t', comment='#')
# Keep only protein coding genes. The explicit copy detaches the result from
# the full table, so the column assignments and in-place sort done on `df`
# afterwards work on an independent frame (no SettingWithCopyWarning).
df = df.loc[df['coding'] == 1].copy()
print(f'Number of protein coding genes {len(df)}')

In [None]:
# "chr:start-end" location string and gene body length for every gene
df['loc'] = df['chr'] + ':' + df['start'].astype(str) + '-' + df['end'].astype(str)
df['len'] = df['end'].sub(df['start'])
# Longest genes first
df = df.sort_values(by=['len'], ascending=False)
print('Longest genes')
display(df[['gene_symbol', 'loc', 'len']].head(200).reset_index(drop=True))
print('Gene lengths')
print(df['len'].describe())

In [None]:
# Ignore extremely long genes: skip the first 200 rows of df
# (df was sorted by length, longest first, in the previous cell)
df = df.iloc[200:]

# Peaks statistics

In [None]:
import glob
MACS_PEAKS = glob.glob('/mnt/stripe/shpynov/2020_GSE112622_k36me3_vs_rnaseq/chipseq/macs2/*.broadPeak')
SICER_PEAKS = glob.glob('/mnt/stripe/shpynov/2020_GSE112622_k36me3_vs_rnaseq/chipseq/sicer/*.scoreisland')
SPAN_PEAKS = glob.glob('/mnt/stripe/shpynov/2020_GSE112622_k36me3_vs_rnaseq/chipseq/span_tuned/*.peak')

# Collect one DataFrame of peaks per file, then concatenate them into `t`
ts = []
for peaksfile in MACS_PEAKS + SICER_PEAKS + SPAN_PEAKS:
    name = os.path.basename(peaksfile)
    # The peak caller is inferred from the file name
    if 'broadPeak' in name:
        caller = 'MACS2'
    elif 'scoreisland' in name:
        caller = 'SICER'
    else:
        caller = 'SPAN'
    # Only the first three columns (chr, start, end) are used. Copy the slice
    # so that the columns added below are set on an independent frame.
    t = pd.read_csv(peaksfile, sep='\t', header=None).iloc[:, :3].copy()
    t.columns = ['chr', 'start', 'end']
    t['len'] = t['end'] - t['start']
    # RPM values come from the companion '<peaksfile>.rpm' file; presumably it
    # holds one value per peak in the same row order - TODO confirm
    rpms = pd.read_csv(f'{peaksfile}.rpm', header=None)
    t['rpm'] = rpms.loc[:, 0]
    t['rpkm'] = t['rpm'] / (t['len'] / 1000)
    t['name'] = name
    t['caller'] = caller
    t.sort_values(by=['len'], ascending=False, inplace=True)
    t['loc'] = t['chr'] + ':' + t['start'].astype(str) + '-' + t['end'].astype(str)
    print(peaksfile)
    print(t['rpkm'].describe())
    print(f'Longest peaks {name}')
    display(t[['loc', 'len']].reset_index(drop=True).iloc[:5,:])
    ts.append(t)
t = pd.concat(ts)

plt.figure(figsize=(10, 3))
# plt.xlim(0, 500)
plt.ylim(0, 0.18)
# unique() keeps the file order, so curve colours are stable between runs
# (iterating over a set of strings has no guaranteed order)
for name in t['name'].unique():
    tc = t.loc[t['name'] == name].copy()
    # For visualization only: cap RPKM at 100. Assign the clipped column back
    # explicitly; an in-place clip on `tc['rpkm']` is not guaranteed to
    # modify `tc` itself.
    tc['rpkm'] = tc['rpkm'].clip(upper=100)
    sns.kdeplot(tc['rpkm'], shade=True, label=name)
plt.suptitle('RPKMs per file')
plt.show()

In [None]:
# Top 10 SICER peaks by RPKM
display(t.loc[t['caller'] == 'SICER'].sort_values(by=['rpkm'], ascending=False)[['loc', 'rpkm']].head(10))

plt.figure(figsize=(10, 3))
plt.xlim(0, 100)
# plt.ylim(0, 0.08)
# unique() yields callers in order of first appearance, so the colour assigned
# to each caller is stable between runs (set iteration order is not)
for caller in t['caller'].unique():
    tc = t.loc[t['caller'] == caller]
    sns.kdeplot(tc['rpkm'], shade=True, label=caller)
plt.suptitle('RPKMs')
plt.show()

In [None]:
# Ignore mitochondrial chromosome. Copy the filtered frame: new columns are
# assigned to `df` afterwards, which would otherwise be done on a slice and
# raise SettingWithCopyWarning.
df = df.loc[df['chr'] != 'chrM'].copy()

In [None]:
# Lengths distribution: SICER / SPAN peak lengths compared with gene lengths
plt.figure(figsize=(10, 4))
for caller in ['SICER', 'SPAN']:
    tc = t.loc[t['caller'] == caller]
    print(caller)
    print(tc['len'].describe())
    sns.kdeplot(tc['len'], shade=True, label=caller)

sns.kdeplot(df['len'], shade=True, label='Gene length')
# Restrict the x axis to lengths up to 100000
plt.xlim(0, 100000)
# Figure title spelling fixed ("Lenghts" -> "Lengths")
plt.suptitle('Lengths')
plt.show()

# H3K36me3 correspondence between conditions

In [None]:
# scipy.stats is a submodule; `import scipy as sp` alone does not guarantee
# that `sp.stats` is loaded, so import it explicitly here.
from scipy import stats

print('Compute correlation between intersection of peak and gene between 2 replicates')
TOOLS = ["MACS2", "SICER", "SPAN"]
for tool in TOOLS:
    for sample in [0, 1]:
        # A gene counts as intersected when its peaks count is positive
        df[f'k36me3_{tool}_intersect_{sample}'] = df[f'k36me3_{tool}_PEAKS_{sample}'] > 0
    # Pearson correlation of the two boolean intersection flags
    print(tool + '\t' + str(stats.pearsonr(
        df[f'k36me3_{tool}_intersect_0'],
        df[f'k36me3_{tool}_intersect_1'])[0]))

In [None]:
from scipy import stats
def r2(x, y):
    """Return the squared Pearson correlation coefficient of x and y."""
    pearson_r, _ = stats.pearsonr(x, y)
    return pearson_r ** 2

print('Processing RPKM')
with PdfPages(os.path.join(FOLDER, 'gse112622_rpkm_all.pdf')) as pdf:
    # jointplot is figure-level: it creates its own figure, which becomes the
    # current one and is what pdf.savefig() stores. A preceding plt.figure()
    # would only leave an empty extra figure behind, so none is created.
    sns.jointplot(df['k36me3_RPKM_0'],
                  df['k36me3_RPKM_1'], kind="reg", stat_func=r2,
                  joint_kws={'scatter_kws': dict(alpha=0.1, s=0.5)})
    plt.suptitle('Per gene H3K36me3 RPKMs')
    pdf.savefig()

print('Processing RPKM without outliers')
# Keep genes whose RPKM is below 300 in both samples
rpkm_filter = (df['k36me3_RPKM_0'] < 300) & (df['k36me3_RPKM_1'] < 300)
# Show the genes that are about to be dropped
display(df.loc[~rpkm_filter])

print('Filter dataframe by RPKM')
df = df.loc[rpkm_filter]

with PdfPages(os.path.join(FOLDER, 'gse112622_rpkm.pdf')) as pdf:
    sns.jointplot(df['k36me3_RPKM_0'],
                  df['k36me3_RPKM_1'], kind="reg", stat_func=r2,
                  joint_kws={'scatter_kws': dict(alpha=0.2, s=0.5)})
    plt.suptitle('Per gene H3K36me3 RPKMs limited')
    pdf.savefig()

# H3K36me3 peaks and coverage correspondence

In [None]:
TOOLS = ["MACS2", "SICER", "SPAN"]
for tool in TOOLS:
    print('Processing', tool)
    # jointplot creates its own figure (saved by pdf.savefig() as the current
    # figure), so no plt.figure() call is made: it would only produce an
    # empty extra figure.
    with PdfPages(os.path.join(FOLDER, f'gse112622_{tool}_coverage.pdf')) as pdf:
        sns.jointplot(df[f'k36me3_{tool}_COVERAGE_0'],
                      df[f'k36me3_{tool}_COVERAGE_1'], kind="reg", stat_func=r2,
                      joint_kws={'scatter_kws': dict(alpha=0.2, s=0.5)})
        plt.suptitle('Per gene coverage by peaks')
        pdf.savefig()
        print(f'Done gse112622_{tool}_coverage.pdf')
    with PdfPages(os.path.join(FOLDER, f'gse112622_{tool}_peaks.pdf')) as pdf:
        sns.jointplot(df[f'k36me3_{tool}_PEAKS_0'],
                      df[f'k36me3_{tool}_PEAKS_1'], kind="reg", stat_func=r2,
                      joint_kws={'scatter_kws': dict(alpha=0.2, s=0.5)})
        plt.suptitle('Per gene number of intersecting by peaks')
        pdf.savefig()
        print(f'Done, gse112622_{tool}_peaks.pdf')

# RNA-Seq correspondence between conditions and replicates

In [None]:
# # Quantile normalization
# def quantile(df):
#     dfq = df.copy()
#     #compute rank
#     dic = {}
#     for col in dfq:
#         dic.update({col : sorted(dfq[col])})
#     sorted_df = pd.DataFrame(dic)
#     rank = sorted_df.mean(axis = 1).tolist()
#     #sort
#     for col in dfq:
#         t = np.searchsorted(np.sort(dfq[col]), dfq[col])
#         dfq[col] = [rank[i] for i in t]
#     return dfq

# Explicit copy: columns are added to dfq below, and assigning into a plain
# column selection of df would raise SettingWithCopyWarning.
dfq = df[['gene_id', 'FPKM_0_0', 'FPKM_0_1', 'FPKM_1_0', 'FPKM_1_1']].copy()

# Log-transform TPM: log1p is the natural logarithm of (1 + x), not log2.
# FPKM columns are kept untransformed.
for c in ['TPM_0_0', 'TPM_0_1', 'TPM_1_0', 'TPM_1_1']:
    dfq[c] = np.log1p(df[c])

# Quantile normalize
# dfq = quantile(dfq)

# pairplot is figure-level and creates its own figure, which pdf.savefig()
# stores as the current figure; no plt.figure() call is needed beforehand.
with PdfPages(os.path.join(FOLDER, 'gse112622_tpm.pdf')) as pdf:
    sns.pairplot(dfq[['TPM_0_0', 'TPM_0_1', 'TPM_1_0', 'TPM_1_1']])
    plt.tight_layout()
    plt.suptitle('TPM correspondence')
    pdf.savefig()

with PdfPages(os.path.join(FOLDER, 'gse112622_fpkm.pdf')) as pdf:
    sns.pairplot(dfq[['FPKM_0_0', 'FPKM_0_1', 'FPKM_1_0', 'FPKM_1_1']])
    plt.tight_layout()
    plt.suptitle('FPKM correspondence')
    pdf.savefig()

# Average expression / K36me3 values for condition

In [None]:
# Average the two columns of each condition, then average both conditions
for measure in ('TPM', 'FPKM'):
    for condition in (0, 1):
        dfq[f'{measure}_{condition}'] = (dfq[f'{measure}_{condition}_0'] + dfq[f'{measure}_{condition}_1']) / 2
    dfq[measure] = (dfq[f'{measure}_0'] + dfq[f'{measure}_1']) / 2

# Genes with the highest and the lowest average TPM
print('Top expressed genes')
print('\t'.join(dfq.sort_values(by=['TPM'], ascending=False).head(20)['gene_id']))

print('Top not expressed genes')
print('\t'.join(dfq.sort_values(by=['TPM']).head(20)['gene_id']))

# Full per-gene table with the dfq columns written into it
dff = df.copy()
dff[dfq.columns] = dfq

In [None]:
# Colours used for expressed / not expressed genes in the plots
ORANGE = (243/256, 135/256, 47/256)
BLUE = (35/256, 110/256, 150/256)

plt.figure(figsize=(10, 6))
# NOTE(review): dff['TPM'] is averaged from the log1p-transformed TPM columns
# built in dfq, so the "> 1" threshold applies to the log scale - confirm
# that this is the intended cut-off.
expressed = dff.loc[dff['TPM'] > 1, 'TPM']
# Fixed: the original passed the undefined name `expessed` (NameError)
plt.hist(expressed, bins=120, density=True, color=ORANGE, edgecolor=ORANGE, linewidth=2,
         label=f'Expressed TPM>1 ({len(expressed)})')

nexpressed = dff.loc[dff['TPM'] <= 1, 'TPM']
plt.hist(nexpressed, bins=20, density=True, color=BLUE, edgecolor=BLUE, linewidth=2,
         label=f'Not expressed TPM<=1 ({len(nexpressed)})')
plt.gca().set(title='Frequency histogram of TPM', ylabel='Frequency')
plt.legend()

plt.show()

In [None]:
print('Compute average coverage/peaks')
# Average of the two samples. These column names contain no placeholder, so
# the former `.format(tool)` calls were no-ops that needlessly required
# `tool` to be defined before the loop below.
dff['k36me3_RPM'] = (dff['k36me3_RPM_0'] + dff['k36me3_RPM_1']) / 2
dff['k36me3_RPKM'] = (dff['k36me3_RPKM_0'] + dff['k36me3_RPKM_1']) / 2

# Per-tool averages of gene coverage and number of intersecting peaks
for tool in TOOLS:
    for metric in ('COVERAGE', 'PEAKS'):
        dff[f'k36me3_{tool}_{metric}'] = \
            (dff[f'k36me3_{tool}_{metric}_0'] + dff[f'k36me3_{tool}_{metric}_1']) / 2

In [None]:
TOOLS = ["MACS2", "SICER", "SPAN"]
for tool in TOOLS:
    print('Processing', tool)
    with PdfPages(os.path.join(FOLDER, f'gse112622_{tool}_coverage_average.pdf')) as pdf:
        # jointplot creates its own figure (the one pdf.savefig() stores), so
        # no plt.figure() call is made: it would only add an empty figure.
        g = sns.jointplot(dff['k36me3_RPKM'],
                          dff[f'k36me3_{tool}_COVERAGE'], kind="reg", stat_func=r2,
                          joint_kws={'scatter_kws': dict(alpha=0.2, s=0.5)})
        # Axis limits are set through the marginal axes of the joint grid
        g.ax_marg_x.set_xlim(0, 250)
        g.ax_marg_y.set_ylim(0, 1)
        plt.suptitle('Per gene coverage by peaks')
        pdf.savefig()

# RNA-Seq TPM vs k36me3 RPKM

In [None]:
from scipy import stats
def r2(x, y):
    """Square of the Pearson correlation between x and y."""
    correlation = stats.pearsonr(x, y)[0]
    return correlation ** 2

# print('TPM vs H3K36me3 RPKM')
# for s in [0, 1]:
#     plt.figure(figsize=(5, 5))
#     sns.jointplot(dff['TPM_{}'.format(s)], dff['k36me3_RPKM_{}'.format(s)], kind="reg", stat_func=r2,
#                   joint_kws = {'scatter_kws':dict(alpha=0.1, s=0.5)})
#     plt.suptitle('TPM vs K36me3 RPKM {}'.format(s))
#     plt.show()


# One colour per gene: ORANGE when the TPM value is above 1, BLUE otherwise
colors = [ORANGE if tpm > 1 else BLUE for tpm in dff['TPM']]
# jointplot creates its own figure, so the former plt.figure() call (which
# only left an empty extra figure) is dropped.
g = sns.jointplot(dff['TPM'], dff['k36me3_RPKM'], kind="reg", stat_func=r2,
                  joint_kws={'scatter_kws': dict(alpha=0.2, s=0.5, color=colors)})
g.ax_marg_x.set_xlim(0, 10)
g.ax_marg_y.set_ylim(0, 300)
plt.suptitle('TPM vs K36me3 RPKM')
plt.show()

# Intersection ChIP-Seq vs Expressed RNA-Seq

In [None]:
print('tool\treplicate\t(TPM>1)&intsct\t(TPM>1)&Nintsct\t(TPM<=1)&intsct\t(TPM<=1)&Nintsct')
# Expression flags depend only on the sample, so compute them once up front
for s in [0, 1]:
    dff[f'expressed_{s}'] = dff[f'TPM_{s}'] > 1

# The loop variable is `tool` rather than `t`, so the peaks DataFrame `t`
# built in the "Peaks statistics" section is not overwritten by a string.
for tool in TOOLS:
    for s in [0, 1]:
        is_expressed = dff[f'expressed_{s}']
        has_peak = dff[f'k36me3_{tool}_intersect_{s}']

        # 2x2 contingency counts: expressed / not expressed vs
        # intersected / not intersected by peaks
        expr_intersect_filter = is_expressed & has_peak
        expr_intersect = expr_intersect_filter.sum()
#         print('\t'.join(dff.loc[expr_intersect_filter][:10]['gene_id']))

        expr_nintersect_filter = is_expressed & ~has_peak
        expr_nintersect = expr_nintersect_filter.sum()
#         print('\t'.join(dff.loc[expr_nintersect_filter][:10]['gene_id']))

        nexpr_intersect_filter = ~is_expressed & has_peak
        nexpr_intersect = nexpr_intersect_filter.sum()
#         print('\t'.join(dff.loc[nexpr_intersect_filter][:10]['gene_id']))

        nexpr_nintersect_filter = ~is_expressed & ~has_peak
        nexpr_nintersect = nexpr_nintersect_filter.sum()
#         print('\t'.join(dff.loc[nexpr_nintersect_filter][:10]['gene_id']))
        print(f'{tool}\t{s}\t{expr_intersect}\t{expr_nintersect}\t{nexpr_intersect}\t{nexpr_nintersect}')

In [None]:
print("Transcription vs TOOLS coverage/peaks")
# One colour per gene: ORANGE when the TPM value is above 1, BLUE otherwise
colors = [ORANGE if tpm > 1 else BLUE for tpm in dff['TPM']]
for tool in TOOLS:
    # jointplot creates its own figure (the one pdf.savefig() stores), so no
    # plt.figure() call is made: it would only add an empty extra figure.
    with PdfPages(os.path.join(FOLDER, f'gse112622_transcription_vs_{tool}_coverage.pdf')) as pdf:
        g = sns.jointplot(dff['TPM'], dff[f'k36me3_{tool}_COVERAGE'], kind="reg", stat_func=r2,
                          joint_kws={'scatter_kws': dict(alpha=0.2, s=0.5, color=colors)})
        g.ax_marg_x.set_xlim(0, 10)
        g.ax_marg_y.set_ylim(0, 1)
        plt.suptitle(f'TPM vs K36me3 gene fraction covered by {tool}')
        plt.tight_layout()
        pdf.savefig()

    with PdfPages(os.path.join(FOLDER, f'gse112622_transcription_vs_{tool}_peaks.pdf')) as pdf:
        g = sns.jointplot(dff['TPM'], dff[f'k36me3_{tool}_PEAKS'], kind="reg", stat_func=r2,
                          joint_kws={'scatter_kws': dict(alpha=0.2, s=0.5, color=colors)})
        plt.tight_layout()
        plt.suptitle(f'TPM vs K36me3 # of peaks {tool} intersecting')
        pdf.savefig()

# RNA-Seq expressed / not expressed vs ChIP-Seq Coverage

In [None]:
def mod(x):
    """Map a column name to a display label.

    The markers are checked in order and the label of the first marker that
    occurs in ``x`` is returned; "NA" is returned when none occurs.
    """
    markers = (
        ("expr", "Expression"),
        ("RPKM", "RPKM"),
        ("MACS2", "MACS2"),
        ("SICER", "SICER"),
        ("SPAN", "SPAN"),
    )
    return next((label for marker, label in markers if marker in x), "NA")

# print('Processing per sample')    
# for sample in [0, 1]:
#     # Split expressed and not expressed genes
#     dff['expressed_{}'.format(sample)] = dff['TPM_{}'.format(sample)] > 1
#     with PdfPages(os.path.join(FOLDER, 'gse112622_transcription_vs_coverage_{}.pdf'.format(sample))) as pdf:
#         plt.figure(figsize=(10, 5))
#         dfm = pd.melt(dff, id_vars=['expressed_{}'.format(sample)], 
#                 value_vars=['k36me3_MACS2_COVERAGE_{}'.format(sample), 
#                             'k36me3_SICER_COVERAGE_{}'.format(sample),
#                             'k36me3_SPAN_COVERAGE_{}'.format(sample)])
#         dfm['type'] = [mod(x) for x in dfm['variable']]
#         sns.violinplot(x='type', y='value', hue='expressed_{}'.format(sample), data=dfm)
#         plt.suptitle('RNA-Seq expressed (TPM>1) / non-expressed vs COVERAGE {}'.format(sample))
#         pdf.savefig()


print('Processing average values')
with PdfPages(os.path.join(FOLDER, 'gse112622_summary_transcription_vs_coverage.pdf')) as pdf:
    plt.figure(figsize=(10, 5))
    # Flag genes whose TPM value is above 1
    dff['expressed'] = dff['TPM'] > 1
    # Long format: one row per (gene, tool coverage column)
    dfm = dff.melt(id_vars=['expressed'],
                   value_vars=['k36me3_MACS2_COVERAGE',
                               'k36me3_SICER_COVERAGE',
                               'k36me3_SPAN_COVERAGE'])
    # Label each row by the tool named in its coverage column
    dfm['type'] = dfm["variable"].map(mod)
    sns.violinplot(x='type', y='value', hue='expressed', data=dfm)
    plt.suptitle('RNA-Seq expressed (TPM>1) / non-expressed vs COVERAGE')
    pdf.savefig()

# Difference in k36me3 intersect => RNA-Seq?

In [None]:
##########################################################
# Work only with genes, expressed in at least one sample #
##########################################################
dft = dff.loc[(dff['TPM_0'] > 1) | (dff['TPM_1'] > 1)]

for tool in TOOLS:
    print('Processing', tool)
    # Two cases per tool: genes intersected by peaks in sample `present` but
    # not in sample `absent`; the output file is numbered after `absent`.
    for present, absent, nothing_msg in ((0, 1, 'NOTHING 0 < 1'), (1, 0, 'NOTHING 0 > 1')):
        with PdfPages(os.path.join(FOLDER, f'gse112622_{tool}_t_{absent}.pdf')) as pdf:
            plt.figure(figsize=(2, 5))
            diff = dft.loc[dft[f'k36me3_{tool}_intersect_{present}']
                           & ~dft[f'k36me3_{tool}_intersect_{absent}']]
            if len(diff) > 0:
                # Compare TPM values of the two samples for the selected genes
                g = sns.boxplot(x="variable", y="value", data=pd.melt(diff, value_vars=['TPM_0', 'TPM_1']))
                g.set_ylim([0, 8])
                plt.suptitle(f'Transcription for genes with k36me3_{tool}_intersect {present} and not {absent}')
                pdf.savefig()
            else:
                print(nothing_msg)

# Hierarchical clustering

In [None]:
# Average of the two samples for RPKM and RPM
for measure in ('RPKM', 'RPM'):
    dff[f'k36me3_{measure}'] = (dff[f'k36me3_{measure}_0'] + dff[f'k36me3_{measure}_1']) / 2

# dfc = dff[['k36me3_RPKM_0', 'k36me3_RPKM_1', 'k36me3_RPKM',
#            'k36me3_RPM_0', 'k36me3_RPM_1', 'k36me3_RPM',
#            'TPM_0', 'TPM_1', 'TPM', 
#            'FPKM_0', 'FPKM_1', 'FPKM',
#            'k36me3_MACS2_COVERAGE_0','k36me3_MACS2_COVERAGE_1', 'k36me3_MACS2_COVERAGE', 
#            'k36me3_SICER_COVERAGE_0', 'k36me3_SICER_COVERAGE_1', 'k36me3_SICER_COVERAGE',
#            'k36me3_SPAN_COVERAGE_0', 'k36me3_SPAN_COVERAGE_1', 'k36me3_SPAN_COVERAGE']]

# Summary columns only, with genes ordered by increasing average RPKM
dfc = dff[['TPM','k36me3_RPKM', 'k36me3_MACS2_COVERAGE', 'k36me3_SICER_COVERAGE', 'k36me3_SPAN_COVERAGE']]\
    .sort_values(by=['k36me3_RPKM'])
with PdfPages(os.path.join(FOLDER, 'gse112622_clustering.pdf')) as pdf:
    # Clustering is switched off for rows and columns, so the row order set
    # above is kept; each column is rescaled (standard_scale=1)
    sns.clustermap(dfc, figsize=(4, 7), standard_scale=1, col_cluster=False, row_cluster=False)
    pdf.savefig()

In [None]:
# Renumber rows 0..n-1 in their current order and show the table
dfc = dfc.reset_index(drop=True)
dfc

In [None]:
plt.figure(figsize=(10, 5))
# Gene rank is the row position in dfc (index values after reset_index)
dfc['gene_rank'] = list(dfc.index)
# One lowess-smoothed curve per peak caller on shared axes
for column, color, label in [('k36me3_MACS2_COVERAGE', 'black', 'MACS2'),
                             ('k36me3_SICER_COVERAGE', 'blue', 'SICER'),
                             ('k36me3_SPAN_COVERAGE', 'green', 'SPAN')]:
    sns.regplot(x='gene_rank', y=column, color=color, label=label, data=dfc, lowess=True,
                scatter_kws=dict(alpha=0.2, s=0.5))
# The labels passed above are only shown once a legend is drawn; enlarge the
# legend markers because the scatter points themselves are tiny (s=0.5)
plt.legend(markerscale=10)
plt.suptitle('Fraction of gene body covered by peaks')
plt.show()

# RSEM DE vs intersection / coverage


```
library(DESeq2)

# Directory with the RSEM per-gene results; also receives counts.tsv / diff.tsv
rsemDir <- "/mnt/stripe/shpynov/2020_GSE112622_k36me3_vs_rnaseq/rnaseq/fastq_bams_rsem"

# list.files() takes a regular expression, not a glob: match files that end
# with ".genes.results"
tableFiles <- list.files(rsemDir, pattern = "\\.genes\\.results$", recursive = FALSE)
geneTables <- lapply(tableFiles, function(x) read.table(paste(rsemDir, x, sep = "/"), sep="\t", header=TRUE, row.names=1))
# One column of expected counts per sample
countsTable <- do.call(cbind, lapply(geneTables, function(x) x[, "expected_count", drop=FALSE]))
colnames(countsTable) <- tableFiles
countsPure <- as.matrix(countsTable)
# RSEM expected counts are fractional; DESeq2 needs integers
countsPure <- floor(countsPure)
# Drop genes with zero counts in all samples
expressedGenes <- rowSums(countsPure) > 0
countsPure <- countsPure[expressedGenes, ]

write.table(file = paste(rsemDir, 'counts.tsv', sep = "/"), x = countsPure, sep='\t')

# NOTE(review): assumes list.files() returns exactly four files ordered as
# two C57BL_6J samples followed by two CAST_EiJ samples - confirm
colData <- data.frame(
    cellType = c("C57BL_6J", "C57BL_6J", "CAST_EiJ", "CAST_EiJ"),
    row.names = colnames(countsPure)
)

dds <- DESeqDataSetFromMatrix(countData = countsPure,
                              colData = colData,
                              design = ~cellType)

dds <- DESeq(dds)
res <- results(dds, name="cellType_CAST_EiJ_vs_C57BL_6J")
# The former `res <- res[order(res$stat), ]` had no effect: `res` was
# overwritten by lfcShrink() right after. The unordered results table is now
# passed to lfcShrink() explicitly; diff.tsv keeps the same row order.
res <- lfcShrink(dds, coef="cellType_CAST_EiJ_vs_C57BL_6J", res=res, type="apeglm")
write.table(file = paste(rsemDir, 'diff.tsv', sep = "/"), x = res, sep='\t')
```

In [None]:
# Differential expression table; rows with missing values are dropped and the
# index is named gene_id
de = pd.read_csv(
    '/mnt/stripe/shpynov/2020_GSE112622_k36me3_vs_rnaseq/rnaseq/fastq_bams_rsem/diff.tsv',
    sep='\t',
    comment='#',
)
de = de.dropna().rename_axis('gene_id')
display(de.head())

# Top DE genes

In [None]:
# Up to 500 genes with the smallest adjusted p-value for each sign of the
# log2 fold change
top_genes_0 = de.loc[de['log2FoldChange'] > 0].sort_values(by=['padj']).head(500)
display(top_genes_0.head(1))
top_genes_1 = de.loc[de['log2FoldChange'] < 0].sort_values(by=['padj']).head(500)
display(top_genes_1.head(1))

# Select the matching rows of dff with a vectorised membership test. The
# explicit copy makes the 'de' assignment operate on an independent frame
# instead of a slice of dff (no SettingWithCopyWarning).
de0 = set(top_genes_0.index)
dfde0 = dff.loc[dff['gene_id'].isin(de0)].copy()
dfde0['de'] = 0

de1 = set(top_genes_1.index)
dfde1 = dff.loc[dff['gene_id'].isin(de1)].copy()
dfde1['de'] = 1

dfde = pd.concat([dfde0, dfde1])
display(dfde.head(3))

# Top DE genes vs k36me3 peaks intersection

In [None]:
# The loop variable is `tool` rather than `t`, so the peaks DataFrame `t`
# built in the "Peaks statistics" section is not overwritten by a string.
for tool in TOOLS:
    print('Processing {}'.format(tool))
    for detype in [0, 1]:
        print('DE {}'.format(detype))
        dfdet = dfde.loc[dfde['de'] == detype]
        # Peak intersection flags of the gene in sample 0 and sample 1
        in_0 = dfdet[f'k36me3_{tool}_intersect_0']
        in_1 = dfdet[f'k36me3_{tool}_intersect_1']

        itt_filter = in_0 & in_1
        print('Intersection TRUE TRUE {}'.format(itt_filter.sum()))
#         print('\t'.join(dfdet.loc[itt_filter][:20]['gene_id']))

        itf_filter = in_0 & ~in_1
        print('Intersection TRUE FALSE {}'.format(itf_filter.sum()))
#         print('\t'.join(dfdet.loc[itf_filter][:20]['gene_id']))

        ift_filter = ~in_0 & in_1
        print('Intersection FALSE TRUE {}'.format(ift_filter.sum()))
#         print('\t'.join(dfdet.loc[ift_filter][:20]['gene_id']))

        iff_filter = ~in_0 & ~in_1
        print('Intersection FALSE FALSE {}'.format(iff_filter.sum()))
#         print('\t'.join(dfdet.loc[iff_filter][:20]['gene_id']))

# Top DE genes vs k36me3 peaks coverage

In [None]:
# The loop variable is `tool` rather than `t`, so the peaks DataFrame `t`
# built in the "Peaks statistics" section is not overwritten by a string.
for tool in TOOLS:
    with PdfPages(os.path.join(FOLDER, 'gse112622_de_vs_coverage_{}.pdf'.format(tool))) as pdf:
        plt.figure(figsize=(4, 5))
        # Long format: one row per (gene, per-sample coverage column)
        dfdem = pd.melt(dfde, id_vars=['de'],
                        value_vars=['k36me3_{}_COVERAGE_0'.format(tool), 'k36me3_{}_COVERAGE_1'.format(tool)])
        sns.violinplot(x='de', y='value', hue='variable', data=dfdem)
#         display(dfdem.head(20))
        plt.suptitle('DE RNA-Seq vs COVERAGE {}'.format(tool))
        pdf.savefig()