In [1]:
### header ###
__author__ = "Hunter Bennett"
__license__ = "BSD"
__email__ = "hunter.r.bennett@gmail.com"
%load_ext autoreload
%autoreload 2
%matplotlib inline
### imports ###
import sys
import os
import re
import glob
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns

# Notebook-wide runtime and plotting configuration.
# Figures written with savefig are rendered at 200 dpi.
matplotlib.rcParams['savefig.dpi'] = 200
# Raise the interpreter recursion limit to 3000.
# NOTE(review): the reason is not visible in this notebook; presumably a
# downstream helper recurses deeply -- confirm before removing.
sys.setrecursionlimit(3000)
# seaborn appearance: font scale 1, then the 'talk' context and 'white' style.
sns.set(font_scale=1)
sns.set_context('talk')
sns.set_style('white')

# import custom functions
sys.path.insert(0, '/home/h1bennet/code/')
from hbUtils import ngs_qc, quantile_normalize_df
from homer_preprocessing import read_homer_gene_exp, import_homer_diffgene, pull_comparisons_get_diff
from plotting_scripts import label_point, pca_rpkm_mat, get_diff_volcano, plot_exp_rpkm, gene_list_bar_plot

In [6]:
# Results for this analysis are written under workingDirectory; the input
# sample directories are read from dataDirectory.
workingDirectory = '/home/h1bennet/strains_rna/results/WL00_Strains_Control_WholeLiver_RNA/'
dataDirectory = '/home/h1bennet/strains/data/RNA/control_wholeliver/'
# makedirs also creates missing parent directories and, with exist_ok=True,
# is a no-op when the directory already exists, so the cell can be re-run.
os.makedirs(workingDirectory, exist_ok=True)
# All relative paths used below ('./sampleDef.txt', './differential/', ...)
# are resolved against workingDirectory.
os.chdir(workingDirectory)


# Check qc stats

In [13]:
# Gather QC statistics for the samples under dataDirectory using the project
# helper ngs_qc (imported from hbUtils); the result is kept in `qc`.
qc = ngs_qc(dataDirectory)

/home/h1bennet/strains/data/RNA/control_wholeliver//
./control_wholeliver_qc/


<Figure size 432x288 with 0 Axes>

In [14]:
# Display the QC table (one row per sample).
qc

Unnamed: 0,uniquePositions,fragmentLengthEstimate,tagsPerBP,clonality,GC_Content,totalReads,uniquelyMappedReads,multiMappedReads,frac_unmappedReads_mismatch,frac_unmappedReads_short,frac_unmappedReads_other,uniquelyMappedFraction,mappedFraction
NCoRWT_WholeLiver_RNA_ControlDiet_21week_LN169A_161011,5418430.0,75.0,0.009055,4.547,0.518,28446962.0,24637537.0,3244770.0,0.0,0.0028,0.004,0.866087,0.9932
NCoRWT_WholeLiver_RNA_ControlDiet_21week_LN169B_161019,6143422.0,75.0,0.008829,3.911,0.508,28320636.0,24027018.0,3615957.0,0.0,0.0031,0.0043,0.848393,0.9926
aj_WholeLiver_RNA_ControlDiet_21week_AJ7A_160829.aj.star_shifted_from_AJ.sam,6454076.0,75.0,0.009564,4.032,-1.0,30328593.0,26023507.0,3285931.0,0.0,0.0036,0.0037,0.858052,0.9927
aj_WholeLiver_RNA_ControlDiet_21week_AJ7C_160915.aj.star_shifted_from_AJ.sam,6436851.0,75.0,0.009766,4.129,-1.0,31302418.0,26576241.0,3469062.0,0.0,0.0034,0.0032,0.849016,0.9934
aj_WholeLiver_RNA_ControlDiet_21week_AJ7D_160915.aj.star_shifted_from_AJ.sam,6792930.0,75.0,0.009777,3.917,-1.0,30959468.0,26606947.0,3215638.0,0.0,0.0037,0.0034,0.859412,0.9929
balbc_WholeLiver_RNA_ControlDiet_21week_BALB7A_160829.balbcj.star_shifted_from_BALBCJ.sam,3247208.0,75.0,0.006483,5.432,-1.0,21059751.0,17639457.0,2204592.0,0.0,0.0417,0.0064,0.837591,0.9519
balbc_WholeLiver_RNA_ControlDiet_21week_BALB7B_160915.balbcj.star_shifted_from_BALBCJ.sam,4533870.0,75.0,0.009134,5.482,-1.0,28591122.0,24853107.0,3295667.0,0.0,0.0044,0.0043,0.86926,0.9913
balbc_WholeLiver_RNA_ControlDiet_21week_BALB7D_160915.balbcj.star_shifted_from_BALBCJ.sam,6070319.0,75.0,0.008992,4.031,-1.0,28022806.0,24471101.0,3164767.0,0.0,0.0049,0.0035,0.873257,0.9916


# Run Ouyang's pipeline to process RNA-seq data consistently.

In [20]:
# Collect the sample paths for each strain group. glob returns matches in
# arbitrary order, so sort them to keep replicate order reproducible.
# The built-in sorted() yields plain Python strings (np.sort returned numpy
# string scalars, and a float array when nothing matched).
c57_paths = sorted(glob.glob(dataDirectory+'/NCoRWT*'))
balbc_paths = sorted(glob.glob(dataDirectory+'/balbc*'))
aj_paths = sorted(glob.glob(dataDirectory+'/aj*'))

In [21]:
# Extract the animal ID from each sample name. Names are underscore-delimited,
# e.g. NCoRWT_WholeLiver_RNA_ControlDiet_21week_LN169A_161011, so field 5 holds
# the animal ID (LN169A, AJ7A, BALB7A, ...). Field 4 is the age token
# ('21week'), which is identical for every sample and therefore produced
# duplicate sample labels within each group.
# Assumes all sample directories follow the naming shown in the QC table.
c57_ids = [i.split('/')[-1].split('_')[5] for i in c57_paths]
balbc_ids = [i.split('/')[-1].split('_')[5] for i in balbc_paths]
aj_ids = [i.split('/')[-1].split('_')[5] for i in aj_paths]

In [22]:
# Columns of the sample definition table, one entry per strain group
# (order: c57, balbc, aj).
ids = [
    'c57_wholeliver_control_21weeks',
    'balbc_wholeliver_control_21weeks',
    'aj_wholeliver_control_21weeks',
]
# Plot colour assigned to each group.
colors = ['#31a354', '#3182bd', '#de2d26']
# Replicate paths of a group are packed into one ';'-separated string.
paths = [';'.join(group_paths)
         for group_paths in (c57_paths, balbc_paths, aj_paths)]
# Per-replicate labels: '<group id>_<sample id>', again ';'-separated.
short_ids = [';'.join([group_id+'_'+sample_id for sample_id in sample_ids])
             for group_id, sample_ids in zip(ids, (c57_ids, balbc_ids, aj_ids))]

In [23]:
# Write the sample definition file consumed by the R pipeline: one
# tab-separated row per group (id, colour, paths, sample labels), with no
# header and no index column.
sample_def = pd.DataFrame(list(zip(ids, colors, paths, short_ids)))
sample_def.to_csv('./sampleDef.txt', sep='\t', header=False, index=False)

# Run Ouyang's RNA pipeline

Run the code below in BASH

    source activate r-ouyangPipe
    
    mkdir ./expression/
    
    mkdir ./differential/
    
    rnaQuan.R ./sampleDef.txt -o ./expression/

    rnaDiff.R ./sampleDef.txt -c ./expression/rawC.txt \
    -t ./expression/rawT.txt -o ./differential/

    # annotate scatter files with tag counts
    for scatter in ./differential/*scatter.txt;
    do diff2Homer.R -d $scatter -q ./expression/HOMER.rawTPM.txt;
    done

### Make sure to check the PCA pdf file for outliers

# Import differential gene files for examination and plotting

In [24]:
# Output directory for the gene list text files. exist_ok=True makes the call
# a no-op when the directory already exists, so the cell is safe to re-run.
os.makedirs('./gene_lists', exist_ok=True)

In [25]:
# File-name prefixes used to find each strain's differential-expression
# files ('./differential/<prefix>*homer.txt').
strains = ['aj', 'balb', 'c57']
# A gene is called significant when log2FoldChange >= log2fc ...
log2fc = 1
# ... and its adjusted p-value (padj column) is below this cutoff.
padj = 0.05

In [26]:
# Maps '<strain>_union' / '<strain>_intersection' to a list of gene names.
gene_list_dict = dict()

In [27]:
# For every strain, gather the significantly up-regulated genes from each of
# its pairwise comparison files and record
#   <strain>_union        - genes significant in at least one comparison
#   <strain>_intersection - genes significant in every comparison
for strain in strains:
    # None marks "no comparison processed yet". Testing for an empty union
    # instead would re-seed the intersection whenever the first comparison
    # happened to yield no significant genes.
    g_intersect = None
    g_union = set()
    # glob returns files in arbitrary order; sort for reproducible output.
    for i in sorted(glob.glob('./differential/'+strain+'*homer.txt')):
        print(i)
        # read in the differential results joined with gene expression data
        df, tpm, tpm_mat, tpm_mat_quant = import_homer_diffgene(
            depath=i,
            gepath='./expression/HOMER.rawTPM.txt',
            gene_name_index=True)

        # Drop genes on sex chromosomes and unplaced contigs. The match is
        # case-sensitive, so the contig prefix must be spelled 'chrUn';
        # na=False keeps the mask boolean if a chromosome value is missing.
        df = df.loc[~(df.chr.str.contains('chrX|chrY|chrUn', na=False)), :]
        # select differential genes
        sig_genes = df.index[(df['padj'] < padj) & (df['log2FoldChange'] >= log2fc)].to_list()

        # accumulate across comparisons
        g_union = g_union.union(sig_genes)
        if g_intersect is None:
            g_intersect = set(sig_genes)
        else:
            g_intersect = g_intersect.intersection(sig_genes)

    # no comparison files found for this strain -> empty intersection
    if g_intersect is None:
        g_intersect = set()

    # Store sorted lists: set iteration order varies between runs, which
    # would otherwise make the written gene lists differ run to run.
    gene_list_dict[strain+'_union'] = sorted(g_union)
    gene_list_dict[strain+'_intersection'] = sorted(g_intersect)
        

./differential/aj_wholeliver_control_21weeks.vs.c57_wholeliver_control_21weeks.scatter.homer.txt
ge all transcripts (24940, 15)
Diff Gene selected transcripts (5769, 14)
ge selected transcripts (5769, 15)
./differential/aj_wholeliver_control_21weeks.vs.balbc_wholeliver_control_21weeks.scatter.homer.txt
ge all transcripts (24940, 15)
Diff Gene selected transcripts (6085, 14)
ge selected transcripts (6085, 15)
./differential/balbc_wholeliver_control_21weeks.vs.aj_wholeliver_control_21weeks.scatter.homer.txt
ge all transcripts (24940, 15)
Diff Gene selected transcripts (6085, 14)
ge selected transcripts (6085, 15)
./differential/balbc_wholeliver_control_21weeks.vs.c57_wholeliver_control_21weeks.scatter.homer.txt
ge all transcripts (24940, 15)
Diff Gene selected transcripts (6114, 14)
ge selected transcripts (6114, 15)
./differential/c57_wholeliver_control_21weeks.vs.balbc_wholeliver_control_21weeks.scatter.homer.txt
ge all transcripts (24940, 15)
Diff Gene selected transcripts (6114, 14)


In [30]:
# Report the size of every gene list and write it to ./gene_lists/ as a
# one-column text file with a 'Gene' header line.
for list_name, genes in gene_list_dict.items():
    print(list_name)
    print(len(genes), 'genes')
    out_path = './gene_lists/'+list_name+'_wholeliver_control_genes_fc_'+str(log2fc)+'_pval_'+str(padj)+'.txt'
    # the with-block closes the file on exit
    with open(out_path, 'w') as handle:
        handle.write('Gene\n')
        handle.writelines(gene+'\n' for gene in genes)

aj_union
191 genes
aj_intersection
28 genes
balb_union
191 genes
balb_intersection
33 genes
c57_union
172 genes
c57_intersection
26 genes
