In [2]:
### header ###
__author__ = "Hunter Bennett"
__license__ = "BSD"
__email__ = "hunter.r.bennett@gmail.com"
%load_ext autoreload
%autoreload 2
%matplotlib inline
### imports ###
import sys
import os
import re
import glob
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns

# Global plotting / interpreter configuration for this notebook.
# Figures written with savefig use 200 dpi.
matplotlib.rcParams['savefig.dpi'] = 200
# Raise the interpreter recursion limit to 3000.
# NOTE(review): nothing visible in this section needs deep recursion -- confirm
# which downstream call requires it.
sys.setrecursionlimit(3000)
# seaborn setup: the order of these three calls is kept as-is, since the later
# calls are applied on top of whatever sns.set() configured.
sns.set(font_scale=1)
sns.set_context('talk')
sns.set_style('white')

# import custom functions
sys.path.insert(0, '/home/h1bennet/code/')
from hbUtils import ngs_qc, quantile_normalize_df
from homer_preprocessing import read_homer_gene_exp, import_homer_diffgene, pull_comparisons_get_diff
from plotting_scripts import label_point, pca_rpkm_mat, get_diff_volcano, plot_exp_rpkm, gene_list_bar_plot

In [3]:
# Output directory for this analysis. It is created below if it does not exist
# and then becomes the current working directory, so relative paths used later
# in the notebook (e.g. './sampleDef.txt') resolve inside it.
workingDirectory = '/home/h1bennet/strains_rna/results/A00_Strains_Control_WholeLiver_RNA/'

# Input locations, one per liver cell type. The cells below glob these for
# entries whose names start with the strain prefixes C57 / BALB / AJ.
kcDirectory = '/home/h1bennet/strains/data/RNA/control_kupffer/'
hepDirectory = '/home/h1bennet/strains/data/RNA/control_hepatocyte/'
lsecDirectory = '/home/h1bennet/strains/data/RNA/control_lsec'
stelDirectory = '/home/h1bennet/strains/data/RNA/control_stellate/'

# os.makedirs (rather than os.mkdir) also creates any missing parent
# directories, so this works on a fresh checkout where 'results/' is absent.
if not os.path.isdir(workingDirectory):
    os.makedirs(workingDirectory)
os.chdir(workingDirectory)


# Check qc stats

# Run Oyoung's pipeline to process RNA-seq data consistently.

In [32]:
# Kupffer cell samples: sorted paths matching each strain prefix.
c57_kc_paths, balbc_kc_paths, aj_kc_paths = (
    list(np.sort(glob.glob(kcDirectory + '/' + strain_prefix + '*')))
    for strain_prefix in ('C57', 'BALB', 'AJ')
)

# Per-sample short ID: underscore-separated field at index 3 of the last
# path component.
c57_kc_ids, balbc_kc_ids, aj_kc_ids = (
    [sample_path.rsplit('/', 1)[-1].split('_')[3] for sample_path in strain_paths]
    for strain_paths in (c57_kc_paths, balbc_kc_paths, aj_kc_paths)
)

In [35]:
# Hepatocyte samples: sorted paths matching each strain prefix.
c57_hep_paths, balbc_hep_paths, aj_hep_paths = (
    list(np.sort(glob.glob(hepDirectory + '/' + strain_prefix + '*')))
    for strain_prefix in ('C57', 'BALB', 'AJ')
)

# Per-sample short ID: underscore-separated field at index 7 of the last
# path component (hepatocyte names use a different field than the other
# cell types in this notebook).
c57_hep_ids, balbc_hep_ids, aj_hep_ids = (
    [sample_path.rsplit('/', 1)[-1].split('_')[7] for sample_path in strain_paths]
    for strain_paths in (c57_hep_paths, balbc_hep_paths, aj_hep_paths)
)

In [50]:
# LSEC samples: sorted paths matching each strain prefix.
c57_lsec_paths, balbc_lsec_paths, aj_lsec_paths = (
    list(np.sort(glob.glob(lsecDirectory + '/' + strain_prefix + '*')))
    for strain_prefix in ('C57', 'BALB', 'AJ')
)

# Per-sample short ID: underscore-separated field at index 4 of the last
# path component.
c57_lsec_ids, balbc_lsec_ids, aj_lsec_ids = (
    [sample_path.rsplit('/', 1)[-1].split('_')[4] for sample_path in strain_paths]
    for strain_paths in (c57_lsec_paths, balbc_lsec_paths, aj_lsec_paths)
)

In [52]:
# Stellate cell samples: sorted paths matching each strain prefix.
c57_stel_paths, balbc_stel_paths, aj_stel_paths = (
    list(np.sort(glob.glob(stelDirectory + '/' + strain_prefix + '*')))
    for strain_prefix in ('C57', 'BALB', 'AJ')
)

# Per-sample short ID: underscore-separated field at index 4 of the last
# path component.
c57_stel_ids, balbc_stel_ids, aj_stel_ids = (
    [sample_path.rsplit('/', 1)[-1].split('_')[4] for sample_path in strain_paths]
    for strain_paths in (c57_stel_paths, balbc_stel_paths, aj_stel_paths)
)

In [54]:
# Build the four parallel, 12-entry lists that make up the sample definition.
# Every list uses the same ordering: cell type (kupffer, hepatocyte, lsec,
# stellate) as the outer grouping and strain (c57, balbc, aj) within it, so
# entries line up by position.

# Group names: '<strain>_<celltype>_control_young'.
ids = [strain + '_' + cell_type + '_control_young'
       for cell_type in ('kupffer', 'hepatocyte', 'lsec', 'stellate')
       for strain in ('c57', 'balbc', 'aj')]

# One colour per strain (c57, balbc, aj), repeated for each of the 4 cell types.
colors = ['#31a354', '#3182bd', '#de2d26'] * 4

# Each group's sample paths, joined into a single ';'-separated string.
paths = [';'.join(group_paths) for group_paths in (
    c57_kc_paths, balbc_kc_paths, aj_kc_paths,
    c57_hep_paths, balbc_hep_paths, aj_hep_paths,
    c57_lsec_paths, balbc_lsec_paths, aj_lsec_paths,
    c57_stel_paths, balbc_stel_paths, aj_stel_paths)]

# Each group's per-sample labels ('<group name>_<sample id>'), joined with ';'
# in the same order as the paths above.
short_ids = [';'.join([group_name + '_' + sample_id for sample_id in group_sample_ids])
             for group_name, group_sample_ids in zip(ids, (
                 c57_kc_ids, balbc_kc_ids, aj_kc_ids,
                 c57_hep_ids, balbc_hep_ids, aj_hep_ids,
                 c57_lsec_ids, balbc_lsec_ids, aj_lsec_ids,
                 c57_stel_ids, balbc_stel_ids, aj_stel_ids))]

In [55]:
# Sanity check: count the ';' separators per group (i.e. number of entries
# minus one) in paths and then in short_ids; the two printed lists should match.
for joined_entries in (paths, short_ids):
    print([entry.count(';') for entry in joined_entries])

[1, 1, 1, 2, 1, 1, 3, 3, 3, 2, 3, 3]
[1, 1, 1, 2, 1, 1, 3, 3, 3, 2, 3, 3]


In [56]:
# Write the sample definition as a headerless, index-free, tab-separated file
# with one row per group and columns: id, color, paths, short_ids.
sample_def_rows = list(zip(ids, colors, paths, short_ids))
pd.DataFrame(sample_def_rows).to_csv(
    './sampleDef.txt', sep='\t', header=False, index=False)

# Run O'young RNA pipeline

Run the commands below in Bash:

    source activate r-ouyangPipe
    
    mkdir ./expression/
    
    mkdir ./differential/
    
    rnaQuan.R ./sampleDef.txt -o ./expression/

### Make sure to check the PCA PDF file for outliers