In [11]:
### header ###
__author__ = "Hunter Bennett"
__license__ = "BSD"
__email__ = "hunter.r.bennett@gmail.com"
%load_ext autoreload
%autoreload 2
%matplotlib inline
### imports ###
import sys
import os
import re
import glob
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns

# Raise the interpreter recursion limit (independent of the plotting setup below).
sys.setrecursionlimit(3000)

# Resolution used when figures are saved to disk.
matplotlib.rcParams.update({'savefig.dpi': 200})

# Seaborn configuration; the three calls are kept in their original order.
sns.set(font_scale=1)
sns.set_context('talk')
sns.set_style('white')

# import custom functions
sys.path.insert(0, '/home/h1bennet/code/')
from hbUtils import ngs_qc, quantile_normalize_df
from homer_preprocessing import read_homer_gene_exp, import_homer_diffgene, pull_comparisons_get_diff
from plotting_scripts import label_point, pca_rpkm_mat, get_diff_volcano, plot_exp_rpkm, gene_list_bar_plot

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Directory where this notebook writes its results, and directory holding the input data.
workingDirectory = '/home/h1bennet/strains_rna/results/00_Strains_Control_LSEC_RNA/'
dataDirectory = '/home/h1bennet/strains/data/RNA/control_lsec'

# makedirs(exist_ok=True) creates any missing parent directories and does
# nothing when the directory already exists, so the cell is safe to re-run.
os.makedirs(workingDirectory, exist_ok=True)

# Relative paths used by later cells (e.g. './sampleDef.txt') resolve against this directory.
os.chdir(workingDirectory)


# Check qc stats

In [13]:
# Run the hbUtils QC helper on the input data directory. dataDirectory holds
# this exact path, so the input location is defined in a single place.
qc = ngs_qc(dataDirectory)

/home/h1bennet/strains/data/RNA/control_lsec/
./control_lsec_qc/


<Figure size 432x288 with 0 Axes>

In [14]:
# Display the QC result as this cell's output.
qc

Unnamed: 0,uniquePositions,fragmentLengthEstimate,tagsPerBP,clonality,GC_Content,totalReads,uniquelyMappedReads,multiMappedReads,frac_unmappedReads_mismatch,frac_unmappedReads_short,frac_unmappedReads_other,uniquelyMappedFraction,mappedFraction
AJ_M_LSEC_RNA_AJ01A_HBENN_l20201124_TACCGAGG_CCTGAACT.aj.star_shifted_from_AJ.sam,4706841.0,75.0,0.003157,1.825,2721584000.0,9726144.0,8591204.0,522160.0,0.0,0.0535,0.0037,0.88331,0.9428
AJ_M_LSEC_RNA_AJ01B_HBENN_l20201124_CGTTAGAA_TTCAGGTC.aj.star_shifted_from_AJ.sam,5013272.0,75.0,0.003789,2.057,2721897000.0,11589227.0,10314088.0,637268.0,0.0,0.0449,0.0042,0.889972,0.9509
AJ_M_LSEC_RNA_AJ01C_HBENN_l20201124_AGCCTCAT_AGTAGAGA.aj.star_shifted_from_AJ.sam,7277467.0,75.0,0.006947,2.597,2720757000.0,20329897.0,18900120.0,1093970.0,0.0,0.0064,0.0045,0.929671,0.9891
AJ_M_LSEC_RNA_AJ01D_HBENN_l20201124_GATTCTGC_GACGAGAG.aj.star_shifted_from_AJ.sam,6625975.0,75.0,0.005157,2.118,2721152000.0,15121849.0,14032414.0,760215.0,0.0,0.0117,0.0044,0.927956,0.9839
BALBC_M_LSEC_RNA_BALB01A_HBENN_l20201124_TCGTAGTG_AGACTTGG.balbcj.star_shifted_from_BALBCJ.sam,7037785.0,75.0,0.005719,2.211,2721487000.0,17000928.0,15563949.0,1041083.0,0.0,0.0114,0.0041,0.915476,0.9845
BALBC_M_LSEC_RNA_BALB01B_HBENN_l20201124_CTACGACA_GAGTCCAA.balbcj.star_shifted_from_BALBCJ.sam,5920730.0,75.0,0.004955,2.277,2720454000.0,14974401.0,13479888.0,853054.0,0.0,0.033,0.004,0.900195,0.963
BALBC_M_LSEC_RNA_BALB01C_HBENN_l20201124_TAAGTGGT_CTTAAGCC.balbcj.star_shifted_from_BALBCJ.sam,7421259.0,75.0,0.00648,2.376,2721375000.0,18990506.0,17634997.0,986437.0,0.0,0.0077,0.0048,0.928622,0.9875
BALBC_M_LSEC_RNA_BALB01D_HBENN_l20201124_CGGACAAC_TCCGGATT.balbcj.star_shifted_from_BALBCJ.sam,7327959.0,75.0,0.005508,2.046,2721579000.0,16477818.0,14991248.0,1006112.0,0.0,0.0184,0.0042,0.909784,0.9774
C57B6J_M_LSEC_RNA_C5701A_HBENN_l20201124_ATATGGAT_CTGTATTA,7237137.0,75.0,0.005849,2.2,0.481,18211635.0,15918830.0,1090304.0,0.0,0.054,0.0048,0.874102,0.9412
C57B6J_M_LSEC_RNA_C5701B_HBENN_l20201124_GCGCAAGC_TCACGCCG,6852539.0,75.0,0.005025,1.996,0.485,15206153.0,13676416.0,931650.0,0.0,0.0262,0.0048,0.8994,0.969


# Run Ouyang's pipeline to process RNA-seq data consistently.

In [15]:
# Sorted lists of the data-directory entries whose names start with each
# strain prefix (C57, BALBC, AJ).
c57_paths, balbc_paths, aj_paths = (
    list(np.sort(glob.glob(dataDirectory + '/' + prefix + '*')))
    for prefix in ('C57', 'BALBC', 'AJ')
)

In [16]:
def _sample_id(path):
    """Return the fifth '_'-separated field of the last '/'-separated component of `path`."""
    return path.rsplit('/', 1)[-1].split('_')[4]


# One short sample identifier per path, in the same order as the path lists.
c57_ids = list(map(_sample_id, c57_paths))
balbc_ids = list(map(_sample_id, balbc_paths))
aj_ids = list(map(_sample_id, aj_paths))

In [17]:
# Group names, one per strain (order: C57, BALB/c, AJ).
ids = [
    'c57_lsec_control_young',
    'balbc_lsec_control_young',
    'aj_lsec_control_young',
]
# One hex colour per group, in the same order as `ids`.
colors = ['#31a354', '#3182bd', '#de2d26']

# ';'-joined paths for each group.
paths = [';'.join(group_paths)
         for group_paths in (c57_paths, balbc_paths, aj_paths)]

# ';'-joined per-sample labels of the form '<group name>_<sample id>'.
short_ids = [';'.join(group + '_' + sample for sample in samples)
             for group, samples in zip(ids, (c57_ids, balbc_ids, aj_ids))]

In [18]:
# One row per group with columns: id, colour, ';'-joined paths, ';'-joined labels.
sample_def = pd.DataFrame([ids, colors, paths, short_ids]).transpose()

# Tab-separated, with neither a header row nor an index column.
sample_def.to_csv('./sampleDef.txt', sep='\t', header=False, index=False)

# Run Ouyang's RNA pipeline

Run the code below in BASH

    source activate r-ouyangPipe
    
    # -p: do not fail if the directory already exists, so the steps can be re-run
    mkdir -p ./expression/
    
    mkdir -p ./differential/
    
    rnaQuan.R ./sampleDef.txt -o ./expression/

    rnaDiff.R ./sampleDef.txt -c ./expression/rawC.txt \
    -t ./expression/rawT.txt -o ./differential/

    # annotate scatter files with tag counts
    # ("$scatter" is quoted so file names containing spaces are passed intact)
    for scatter in ./differential/*scatter.txt;
    do diff2Homer.R -d "$scatter" -q ./expression/HOMER.rawTPM.txt;
    done

### Make sure to check the PCA pdf file for outliers

# Import differential gene files for examination and plotting

In [19]:
# Create the output folder for the gene lists; exist_ok=True makes this a
# no-op when the folder is already there, so the cell is safe to re-run.
os.makedirs('./gene_lists', exist_ok=True)

In [24]:
# File-name prefixes used to locate each strain's differential-expression files.
strains = 'aj balb c57'.split()

# Cutoffs referenced when selecting genes and when naming the output files.
log2fc, pval = 1, 0.05

In [25]:
# Maps '<strain>_union' / '<strain>_intersection' to a list of gene names;
# filled in by the loop over strains.
gene_list_dict = dict()

In [26]:
# For every strain, collect the genes passing the significance and fold-change
# cutoffs in each '<strain>*homer.txt' comparison file, then store
#   '<strain>_union'        - genes selected in at least one comparison
#   '<strain>_intersection' - genes selected in every comparison
for strain in strains:
    # initialize gene lists
    g_intersect = set()
    g_union = set()
    # sorted() makes the processing (and printing) order deterministic
    for file_idx, i in enumerate(sorted(glob.glob('./differential/'+strain+'*homer.txt'))):
        print(i)
        # read in gene expression data
        df, tpm, tpm_mat, tpm_mat_quant = import_homer_diffgene(
            depath=i,
            gepath='./expression/HOMER.rawTPM.txt',
            gene_name_index=True)

        # drop rows whose chromosome name matches the exclusion pattern
        # NOTE(review): 'ChrUn' is capitalised differently from 'chrX'/'chrY' and
        # str.contains is case-sensitive -- confirm it matches the intended contigs.
        df = df.loc[~(df.chr.str.contains('chrX|chrY|ChrUn')), :]

        # select differential genes; the adjusted p-value column is compared
        # against the `pval` cutoff (no variable named `padj` is defined in this notebook)
        sig_genes = df.index[(df['padj'] < pval) & (df['log2FoldChange'] >= log2fc)].to_list()

        # add to sets
        g_union.update(sig_genes)
        if file_idx == 0:
            # seed the intersection from the first comparison only; testing for an
            # empty union instead would restart the intersection whenever earlier
            # comparisons yielded no genes
            g_intersect = set(sig_genes)
        else:
            g_intersect = g_intersect.intersection(sig_genes)

    # after getting union and intersection sets add to dict
    gene_list_dict[strain+'_union'] = list(g_union)
    gene_list_dict[strain+'_intersection'] = list(g_intersect)
        

./differential/aj_lsec_control_young.vs.balbc_lsec_control_young.scatter.homer.txt
ge all transcripts (24940, 19)
Diff Gene selected transcripts (8925, 14)
ge selected transcripts (8925, 19)
./differential/aj_lsec_control_young.vs.c57_lsec_control_young.scatter.homer.txt
ge all transcripts (24940, 19)
Diff Gene selected transcripts (8953, 14)
ge selected transcripts (8953, 19)
./differential/balbc_lsec_control_young.vs.aj_lsec_control_young.scatter.homer.txt
ge all transcripts (24940, 19)
Diff Gene selected transcripts (8925, 14)
ge selected transcripts (8925, 19)
./differential/balbc_lsec_control_young.vs.c57_lsec_control_young.scatter.homer.txt
ge all transcripts (24940, 19)
Diff Gene selected transcripts (8903, 14)
ge selected transcripts (8903, 19)
./differential/c57_lsec_control_young.vs.aj_lsec_control_young.scatter.homer.txt
ge all transcripts (24940, 19)
Diff Gene selected transcripts (8953, 14)
ge selected transcripts (8953, 19)
./differential/c57_lsec_control_young.vs.balbc_l

In [27]:
# Write each gene list to its own text file under ./gene_lists/: a 'Gene'
# header line followed by one gene name per line. The file name records the
# fold-change and p-value cutoffs.
for key, genes in gene_list_dict.items():
    print(key)
    print(len(genes), 'genes')
    out_path = './gene_lists/'+key+'_lsec_genes_fc_'+str(log2fc)+'_pval_'+str(pval)+'.txt'
    # the with-block closes the file on exit, so no explicit close() is needed
    with open(out_path, 'w') as f:
        f.write('Gene\n')
        f.writelines(gene+'\n' for gene in genes)

aj_union
233 genes
aj_intersection
30 genes
balb_union
207 genes
balb_intersection
32 genes
c57_union
211 genes
c57_intersection
59 genes
