In [3]:
### header ###
__author__ = "Hunter Bennett"
__license__ = "BSD"
__email__ = "hunter.r.bennett@gmail.com"
%load_ext autoreload
%autoreload 2
%matplotlib inline
### imports ###
import sys
import os
import re
import glob
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns

# Global plotting / interpreter configuration for the whole notebook.
# Figures written with savefig use 200 dpi.
matplotlib.rcParams['savefig.dpi'] = 200
# Set the interpreter recursion limit to 3000.
# NOTE(review): the reason is not visible here — presumably needed by
# downstream clustering/plotting code; confirm before removing.
sys.setrecursionlimit(3000)
# Seaborn theme: the three calls below are applied in order, each mutating
# global plotting state; later calls take effect on top of earlier ones.
sns.set(font_scale=1)
sns.set_context('talk')
sns.set_style('white')

# import custom functions
sys.path.insert(0, '/home/h1bennet/code/')
from hbUtils import ngs_qc, quantile_normalize_df
from homer_preprocessing import read_homer_gene_exp, import_homer_diffgene, pull_comparisons_get_diff
from plotting_scripts import label_point, pca_rpkm_mat, get_diff_volcano, plot_exp_rpkm, gene_list_bar_plot

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Output directory for this analysis. It is created if missing and then made
# the current working directory, so relative paths used later in the notebook
# (e.g. './sampleDef.txt') resolve inside it.
workingDirectory = '/home/h1bennet/strains_rna/results/A01_Strains_AMLN_AllSamples_RNA/'

# Input data directories, one per cell type / tissue.
kcDirectory = '/home/h1bennet/strains/data/RNA/AMLN_30week_kupffer/'
hepDirectory = '/home/h1bennet/strains/data/RNA/AMLN_21week_wholeliver/'
# NOTE: unlike the other directories this one has no trailing slash.
lsecDirectory = '/home/h1bennet/strains/data/RNA/AMLN_30week_lsec'
stelDirectory = '/home/h1bennet/strains/data/RNA/AMLN_30week_stellate/'

# makedirs(exist_ok=True) replaces the isdir()/mkdir() pair: it also creates
# missing parent directories and has no check-then-create race.
os.makedirs(workingDirectory, exist_ok=True)
os.chdir(workingDirectory)


# Check qc stats

# Run Ouyang's pipeline to process RNA-seq data consistently.

In [46]:
# Kupffer-cell RNA-seq sample directories, listed by hand per strain.
c57_kc_paths = ['/data/mm10/Kupffer/RNA/NCoRWT_KupfferTim4Pos_RNA_polyA_AMLNDiet_30week_LN144B_JSS_TDT_16_10_20',
                '/data/mm10/Kupffer/RNA/NCoRWT_KupfferTim4Pos_RNA_polyA_AMLNDiet_30week_LN146C_JSS_TDT_16_10_20',
                '/data/mm10/Kupffer/RNA/NCoRWT_KupfferTim4Pos_RNA_polyA_AMLNDiet_30week_LN146D_JSS_TDT_16_10_20',
                '/data/mm10/Kupffer/RNA/NCoRWT_KupfferTim4Pos_RNA_polyA_AMLNDiet_30week_LN152B_JSS_TDT_16_10_20']
balbc_kc_paths = ['/data/mm10/Kupffer/RNA/balbc_KupfferTotal_RNA_polyA_AMLNDiet_30week_Balb3B_JSSTDT_16_09_26',
                  '/data/mm10/Kupffer/RNA/balbc_KupfferTotal_RNA_polyA_AMLNDiet_30week_Balb3C_JSSTDT_16_09_26',
                  '/data/mm10/Kupffer/RNA/balbc_KupfferTotal_RNA_polyA_AMLNDiet_30week_Balb3d_JSS_TDT_16_09_28']
# NOTE: this group mixes one 'KupfferTim4Pos' sample with three 'KupfferTotal' samples.
aj_kc_paths = ['/data/mm10/Kupffer/RNA/aj_KupfferTim4Pos_RNA_polyA_AMLNDiet_30week_AJ6A_JSS_TDT_16_10_24',
               '/data/mm10/Kupffer/RNA/aj_KupfferTotal_RNA_polyA_AMLNDiet_30week_AJ3A_JSSTDT_16_09_26',
               '/data/mm10/Kupffer/RNA/aj_KupfferTotal_RNA_polyA_AMLNDiet_30week_AJ3B_JSSTDT_16_09_26',
               '/data/mm10/Kupffer/RNA/aj_KupfferTotal_RNA_polyA_AMLNDiet_30week_AJ3c_JSS_TDT_16_09_28']

# Short sample IDs, hand-typed in the SAME ORDER as the path lists above and
# upper-cased relative to the directory names (e.g. 'Balb3d' -> 'BALB3D').
c57_kc_ids = ['LN144B', 'LN146C', 'LN146D', 'LN152B']
balbc_kc_ids = ['BALB3B', 'BALB3C', 'BALB3D']
aj_kc_ids = ['AJ6A', 'AJ3A', 'AJ3B', 'AJ3C']

# The two sets of lists are maintained by hand, so verify they line up: a
# missing or reordered entry would otherwise silently mislabel samples.
_kc_groups = [(c57_kc_paths, c57_kc_ids),
              (balbc_kc_paths, balbc_kc_ids),
              (aj_kc_paths, aj_kc_ids)]
assert all(len(group_paths) == len(group_ids)
           for group_paths, group_ids in _kc_groups), \
    'Kupffer path and id lists differ in length'
assert all(sample_id.upper() in os.path.basename(sample_path).upper()
           for group_paths, group_ids in _kc_groups
           for sample_path, sample_id in zip(group_paths, group_ids)), \
    'a Kupffer sample id does not appear in its directory name'

In [47]:
# Discover the per-strain sample directories under hepDirectory.
# os.path.join avoids the doubled '/' that string concatenation produced
# (hepDirectory already ends in '/'), and sorted() gives a deterministic order
# while returning plain str items rather than numpy string scalars.
c57_hep_paths = sorted(glob.glob(os.path.join(hepDirectory, 'NCoRWT*')))
balbc_hep_paths = sorted(glob.glob(os.path.join(hepDirectory, 'balbc*')))
aj_hep_paths = sorted(glob.glob(os.path.join(hepDirectory, 'aj*')))

# Fail fast instead of silently carrying an empty sample group forward.
assert all([c57_hep_paths, balbc_hep_paths, aj_hep_paths]), \
    'at least one strain pattern matched nothing in ' + hepDirectory

# Sample ID = underscore-delimited field at index 5 of the directory name.
# NOTE(review): the index depends on the naming scheme of these directories —
# confirm against a directory listing if the scheme changes.
c57_hep_ids = [os.path.basename(p).split('_')[5] for p in c57_hep_paths]
balbc_hep_ids = [os.path.basename(p).split('_')[5] for p in balbc_hep_paths]
aj_hep_ids = [os.path.basename(p).split('_')[5] for p in aj_hep_paths]

In [48]:
# Discover the per-strain sample directories under lsecDirectory.
# os.path.join builds the pattern correctly whether or not lsecDirectory has a
# trailing slash, and sorted() gives a deterministic order while returning
# plain str items rather than numpy string scalars.
c57_lsec_paths = sorted(glob.glob(os.path.join(lsecDirectory, 'NCoRWT*')))
balbc_lsec_paths = sorted(glob.glob(os.path.join(lsecDirectory, 'balbc*')))
aj_lsec_paths = sorted(glob.glob(os.path.join(lsecDirectory, 'aj*')))

# Fail fast instead of silently carrying an empty sample group forward.
assert all([c57_lsec_paths, balbc_lsec_paths, aj_lsec_paths]), \
    'at least one strain pattern matched nothing in ' + lsecDirectory

# Sample ID = underscore-delimited field at index 7 of the directory name
# (note: a different index from the hep/stellate cells, which use 5).
# NOTE(review): the index depends on the naming scheme of these directories —
# confirm against a directory listing if the scheme changes.
c57_lsec_ids = [os.path.basename(p).split('_')[7] for p in c57_lsec_paths]
balbc_lsec_ids = [os.path.basename(p).split('_')[7] for p in balbc_lsec_paths]
aj_lsec_ids = [os.path.basename(p).split('_')[7] for p in aj_lsec_paths]

In [49]:
# Discover the per-strain sample directories under stelDirectory.
# os.path.join avoids the doubled '/' that string concatenation produced
# (stelDirectory already ends in '/'), and sorted() gives a deterministic order
# while returning plain str items rather than numpy string scalars.
c57_stel_paths = sorted(glob.glob(os.path.join(stelDirectory, 'NCoRWT*')))
balbc_stel_paths = sorted(glob.glob(os.path.join(stelDirectory, 'balbc*')))
aj_stel_paths = sorted(glob.glob(os.path.join(stelDirectory, 'aj*')))

# Fail fast instead of silently carrying an empty sample group forward.
assert all([c57_stel_paths, balbc_stel_paths, aj_stel_paths]), \
    'at least one strain pattern matched nothing in ' + stelDirectory

# Sample ID = underscore-delimited field at index 5 of the directory name.
# NOTE(review): the index depends on the naming scheme of these directories —
# confirm against a directory listing if the scheme changes.
c57_stel_ids = [os.path.basename(p).split('_')[5] for p in c57_stel_paths]
balbc_stel_ids = [os.path.basename(p).split('_')[5] for p in balbc_stel_paths]
aj_stel_ids = [os.path.basename(p).split('_')[5] for p in aj_stel_paths]

In [50]:
# Build the four parallel, 12-element columns of the sample definition table.
# Row order is cell type (kupffer, hepatocyte, lsec, stellate) and, within
# each cell type, strain (c57, balbc, aj).

# (sample directories, sample ids) for every group, in row order.
sample_groups = [(c57_kc_paths, c57_kc_ids),
                 (balbc_kc_paths, balbc_kc_ids),
                 (aj_kc_paths, aj_kc_ids),
                 (c57_hep_paths, c57_hep_ids),
                 (balbc_hep_paths, balbc_hep_ids),
                 (aj_hep_paths, aj_hep_ids),
                 (c57_lsec_paths, c57_lsec_ids),
                 (balbc_lsec_paths, balbc_lsec_ids),
                 (aj_lsec_paths, aj_lsec_ids),
                 (c57_stel_paths, c57_stel_ids),
                 (balbc_stel_paths, balbc_stel_ids),
                 (aj_stel_paths, aj_stel_ids)]

# Group names: <strain>_<cell type>_amln.
ids = ['_'.join([strain, cell_type, 'amln'])
       for cell_type in ('kupffer', 'hepatocyte', 'lsec', 'stellate')
       for strain in ('c57', 'balbc', 'aj')]

# One colour per strain (c57, balbc, aj), repeated for each of the 4 cell types.
colors = ['#31a354', '#3182bd', '#de2d26'] * 4

# Each group's sample directories as a single ';'-separated string.
paths = [';'.join(group_paths) for group_paths, _ in sample_groups]

# Each group's sample labels, '<group name>_<sample id>', ';'-separated.
short_ids = [';'.join(group_name + '_' + sample_id for sample_id in group_ids)
             for group_name, (_, group_ids) in zip(ids, sample_groups)]

In [51]:
# Sanity check: the number of ';' separators per group (samples - 1) must be
# the same for the path column and the short-id column. One list per line.
print([joined_paths.count(';') for joined_paths in paths],
      [joined_ids.count(';') for joined_ids in short_ids],
      sep='\n')

[3, 2, 3, 2, 2, 2, 1, 3, 1, 1, 1, 1]
[3, 2, 3, 2, 2, 2, 1, 3, 1, 1, 1, 1]


In [52]:
# Assemble the sample definition table (one row per group; columns are
# group id, colour, sample paths, sample short ids) and write it as a
# tab-separated file with no header row and no index column.
sample_def = pd.DataFrame([ids, colors, paths, short_ids]).transpose()
sample_def.to_csv('./sampleDef.txt', sep='\t', header=False, index=False)

# Run Ouyang's RNA pipeline

Run the code below in BASH

    source activate r-ouyangPipe
    
    mkdir ./expression/
    
    mkdir ./differential/
    
    rnaQuan.R ./sampleDef.txt -o ./expression/

### Make sure to check the PCA pdf file for outliers