In [2]:
### header ###
__author__ = "Hunter Bennett"
__license__ = "BSD"
__email__ = "hunter.r.bennett@gmail.com"
%load_ext autoreload
%autoreload 2
%matplotlib inline
### imports ###
import sys
import os
import re
import glob
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns

# Figures written with savefig are saved at 200 dpi.
matplotlib.rcParams['savefig.dpi'] = 200
# Raise the interpreter recursion limit to 3000.
# NOTE(review): the reason is not visible in this notebook section --
# presumably needed by a recursive clustering/plotting helper; confirm.
sys.setrecursionlimit(3000)
# Global seaborn styling. These three calls each modify global plot settings,
# so keep them in this order: base theme, then context, then style.
sns.set(font_scale=1)
sns.set_context('talk')
sns.set_style('white')

# import custom functions
sys.path.insert(0, '/home/h1bennet/code/')
from hbUtils import ngs_qc, quantile_normalize_df
from homer_preprocessing import read_homer_gene_exp, import_homer_diffgene, pull_comparisons_get_diff
from plotting_scripts import label_point, pca_rpkm_mat, get_diff_volcano, plot_exp_rpkm, gene_list_bar_plot

In [3]:
# Output directory for this analysis. It becomes the working directory so
# that relative paths used later (e.g. './sampleDef.txt') resolve inside it.
workingDirectory = '/home/h1bennet/strains_rna/results/A06_Strains_210602_NextSeq_Troubleshooting/'

# Input directories that are globbed below for per-strain sample entries.
controlDirectory = '/home/h1bennet/strains/data/RNA/control_kupffer/'
leptinDirectory = '/home/h1bennet/strains/data/RNA/leptin_6h_strains_kupffer/'
leptinLSECDirectory = '/home/h1bennet/strains/data/RNA/leptin_6h_strains_lsec/'
lpsDirectory = '/home/h1bennet/strains/data/RNA/LPS_2h_kupffer_cohort2/'

# makedirs also creates any missing parent directories, and exist_ok=True
# makes the cell safe to re-run (no separate isdir check, no race between
# the check and the creation).
os.makedirs(workingDirectory, exist_ok=True)
os.chdir(workingDirectory)


# Check qc stats

# Run Ouyang's pipeline to process RNA-seq data consistently.

In [6]:
# Control Kupffer samples: for each strain, the sorted list of entries in
# controlDirectory whose name starts with the strain prefix.
c57_control_paths, balbc_control_paths, aj_control_paths = (
    list(np.sort(glob.glob(controlDirectory + pattern)))
    for pattern in ('/C57*', '/BALB*', '/AJ*')
)

# Short sample id = 4th '_'-delimited field of each entry's final path component.
c57_control_ids, balbc_control_ids, aj_control_ids = (
    [entry.split('/')[-1].split('_')[3] for entry in strain_paths]
    for strain_paths in (c57_control_paths, balbc_control_paths, aj_control_paths)
)

In [7]:
# Leptin (6 h) Kupffer samples: for each strain, the sorted list of entries
# in leptinDirectory whose name starts with the (lower-case) strain prefix.
c57_leptin_paths, balbc_leptin_paths, aj_leptin_paths = (
    list(np.sort(glob.glob(leptinDirectory + pattern)))
    for pattern in ('/c57*', '/balb*', '/aj*')
)

# Short sample id = 8th '_'-delimited field of each entry's final path component.
c57_leptin_ids, balbc_leptin_ids, aj_leptin_ids = (
    [entry.split('/')[-1].split('_')[7] for entry in strain_paths]
    for strain_paths in (c57_leptin_paths, balbc_leptin_paths, aj_leptin_paths)
)

In [28]:
# Leptin (6 h) LSEC samples: for each strain, the sorted list of entries in
# leptinLSECDirectory whose name starts with the (lower-case) strain prefix.
c57_leptinLSEC_paths, balbc_leptinLSEC_paths, aj_leptinLSEC_paths = (
    list(np.sort(glob.glob(leptinLSECDirectory + pattern)))
    for pattern in ('/c57*', '/balb*', '/aj*')
)

# Short sample id = 8th '_'-delimited field of each entry's final path component.
c57_leptinLSEC_ids, balbc_leptinLSEC_ids, aj_leptinLSEC_ids = (
    [entry.split('/')[-1].split('_')[7] for entry in strain_paths]
    for strain_paths in (c57_leptinLSEC_paths, balbc_leptinLSEC_paths, aj_leptinLSEC_paths)
)

In [29]:
# LPS (2 h) Kupffer samples: sorted entries in lpsDirectory per strain.
# Only C57 and BALB/c are loaded; the AJ lines are kept commented out below.
c57_lps_paths, balbc_lps_paths = (
    list(np.sort(glob.glob(lpsDirectory + pattern)))
    for pattern in ('/c57*', '/balb*')
)
# aj_lps_paths = list(np.sort(glob.glob(lpsDirectory+'/AJ*')))

# Short sample id = 8th '_'-delimited field of each entry's final path component.
c57_lps_ids, balbc_lps_ids = (
    [entry.split('/')[-1].split('_')[7] for entry in strain_paths]
    for strain_paths in (c57_lps_paths, balbc_lps_paths)
)
# aj_lps_ids = [i.split('/')[-1].split('_')[4] for i in aj_lps_paths]

In [30]:
# Sample-definition columns. The four lists are index-aligned: entry k of
# each list describes the same sample group.

# Group names, ordered c57 / balbc / aj within each condition
# (the LPS condition has no aj group).
ids = [
    'c57_kupffer_control_young', 'balbc_kupffer_control_young', 'aj_kupffer_control_young',
    'c57_kupffer_leptin_6h', 'balbc_kupffer_leptin_6h', 'aj_kupffer_leptin_6h',
    'c57_lsec_leptin_6h', 'balbc_lsec_leptin_6h', 'aj_lsec_leptin_6h',
    'c57_kupffer_lps_2h', 'balbc_kupffer_lps_2h',
]

# One color per strain, looked up from the strain prefix of each group name.
_strain_palette = {'c57': '#31a354', 'balbc': '#3182bd', 'aj': '#de2d26'}
colors = [_strain_palette[group_name.split('_')[0]] for group_name in ids]

# ';'-joined replicate paths for each group.
paths = [
    ';'.join(replicate_paths)
    for replicate_paths in (
        c57_control_paths, balbc_control_paths, aj_control_paths,
        c57_leptin_paths, balbc_leptin_paths, aj_leptin_paths,
        c57_leptinLSEC_paths, balbc_leptinLSEC_paths, aj_leptinLSEC_paths,
        c57_lps_paths, balbc_lps_paths,
    )
]

# ';'-joined short replicate ids for each group, in the same order as `paths`.
short_ids = [
    ';'.join(replicate_ids)
    for replicate_ids in (
        c57_control_ids, balbc_control_ids, aj_control_ids,
        c57_leptin_ids, balbc_leptin_ids, aj_leptin_ids,
        c57_leptinLSEC_ids, balbc_leptinLSEC_ids, aj_leptinLSEC_ids,
        c57_lps_ids, balbc_lps_ids,
    )
]

In [31]:
# Sanity check: each entry is a ';'-joined string, so count(';') equals
# (number of replicates - 1) for a non-empty group. The two printed lists
# should match element for element.
print([i.count(';') for i in paths])
print([i.count(';') for i in short_ids])

# count(';') is 0 both for a single replicate and for a group whose glob
# matched nothing, so the printout alone cannot reveal an empty group.
# Fail loudly instead of writing a blank path column to the sample file.
empty_groups = [group_name for group_name, joined in zip(ids, paths) if not joined]
assert not empty_groups, 'no samples matched for: %s' % ', '.join(empty_groups)

[1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1]
[1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1]


In [32]:
# Preview the sample-definition table: one row per group, with columns
# (group name, color, ';'-joined paths, ';'-joined short ids).
pd.DataFrame([ids, colors, paths, short_ids]).transpose()

Unnamed: 0,0,1,2,3
0,c57_kupffer_control_young,#31a354,/home/h1bennet/strains/data/RNA/control_kupffe...,C5701C;C5701D
1,balbc_kupffer_control_young,#3182bd,/home/h1bennet/strains/data/RNA/control_kupffe...,BALB01C;BALB01D
2,aj_kupffer_control_young,#de2d26,/home/h1bennet/strains/data/RNA/control_kupffe...,AJ01C;AJ01D
3,c57_kupffer_leptin_6h,#31a354,/home/h1bennet/strains/data/RNA/leptin_6h_stra...,rep1;rep2
4,balbc_kupffer_leptin_6h,#3182bd,/home/h1bennet/strains/data/RNA/leptin_6h_stra...,rep1;rep2;rep3
5,aj_kupffer_leptin_6h,#de2d26,/home/h1bennet/strains/data/RNA/leptin_6h_stra...,rep1;rep2
6,c57_lsec_leptin_6h,#31a354,/home/h1bennet/strains/data/RNA/leptin_6h_stra...,rep1;rep2
7,balbc_lsec_leptin_6h,#3182bd,/home/h1bennet/strains/data/RNA/leptin_6h_stra...,rep1;rep2
8,aj_lsec_leptin_6h,#de2d26,/home/h1bennet/strains/data/RNA/leptin_6h_stra...,rep1;rep2
9,c57_kupffer_lps_2h,#31a354,/home/h1bennet/strains/data/RNA/LPS_2h_kupffer...,2A


In [33]:
# Write the sample-definition table to ./sampleDef.txt as tab-separated text
# with no header row and no index column (one line per sample group).
sample_def = pd.DataFrame([ids, colors, paths, short_ids]).transpose()
sample_def.to_csv('./sampleDef.txt', sep='\t', header=False, index=False)

# Run Ouyang's RNA pipeline

Run the code below in BASH

    source activate r-ouyangPipe
    
    # -p: do not fail if the directory already exists, so the recipe can be re-run
    mkdir -p ./expression/ ./differential/
    
    alignStats.R ./sampleDef.txt > qcStats.txt
    
    rnaQuan.R ./sampleDef.txt -o ./expression/
    
    rnaDiff.R ./sampleDef.txt -c ./expression/rawC.txt \
    -t ./expression/rawT.txt -o ./differential/
    
    # remove files comparing across cell types: every ordered pair of two
    # distinct cell types. -f keeps rm quiet when a pattern matches no file.
    for first in kupffer lsec stellate hepatocyte; do
        for second in kupffer lsec stellate hepatocyte; do
            if [ "$first" != "$second" ]; then
                rm -f ./differential/*"$first"*"$second"*
            fi
        done
    done
    
    # annotate scatter files with tag counts
    # ("$scatter" is quoted so paths containing spaces are passed intact)
    for scatter in ./differential/*scatter.txt; do
        diff2Homer.R -d "$scatter" -q ./expression/HOMER.rawTPM.txt
    done

### Make sure to check the PCA pdf file for outliers