In [1]:
### header ###
__author__ = "Hunter Bennett"
__license__ = "BSD"
__email__ = "hunter.r.bennett@gmail.com"
%load_ext autoreload
%autoreload 2
%matplotlib inline
### imports ###
import sys
import os
import re
import glob
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
from matplotlib.patches import Patch
import seaborn as sns

# set plotting parameters: resolution used when figures are saved to disk,
# then the seaborn theme (font scale, 'talk' context, 'white' style).
# NOTE: these calls are order-dependent; later calls override earlier settings.
matplotlib.rcParams['savefig.dpi'] = 500
sns.set(font_scale=1)
sns.set_context('talk')
sns.set_style('white')
# get matplotlib to save readable fonts: font type 42 (TrueType) for PDF and
# PostScript output, instead of matplotlib's default Type 3
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

# import custom functions
sys.path.insert(0, '/home/h1bennet/code/')
from hbUtils import ngs_qc, quantile_normalize_df
from homer_preprocessing import read_homer_gene_exp, import_homer_diffgene, pull_comparisons_get_diff
from plotting_scripts import label_point, pca_rpkm_mat, get_diff_volcano, plot_exp_rpkm, gene_list_bar_plot

In [2]:
# Directory where this notebook writes its results; it becomes the current
# working directory so the relative paths used below resolve inside it.
workingDirectory = '/home/h1bennet/strains_rna/results/K00_Strains_Control_Kupffer_RNA/'
dataDirectory = '/home/h1bennet/strains/data/RNA/control_kupffer/'
# makedirs (unlike mkdir) also creates missing parent directories, and
# exist_ok=True makes the cell safe to re-run without a separate isdir check.
os.makedirs(workingDirectory, exist_ok=True)
os.chdir(workingDirectory)


To do this analysis we will pull from the differential analysis performed by Ty at the following location:  

     /home/ttroutman/strainsKupffer/rnaKupfferNASH/
 

# Import differential gene files for examination and plotting

## Write differential genes to list for metascape

In [3]:
# Create the output folder for gene lists (relative to the current working
# directory); exist_ok=True makes this a no-op when it already exists.
os.makedirs('./gene_lists', exist_ok=True)

In [4]:
# Cutoffs applied when selecting differential genes:
# minimum absolute log2 fold change and maximum adjusted p-value.
log2fc, padj = 1, 0.05

# Strain prefixes used to match the differential comparison file names.
strains = [
    'aj',
    'balb',
    'c57',
]

# Directories holding the pairwise differential expression tables.
diffpath = '/home/ttroutman/strainsKupffer/rnaKupfferNASH/rnaDiff/'
diffpath2 = '/home/ttroutman/strainsKupffer/rnaKupfferHealthyYoung_20210111/rnaDiff/'

In [5]:
# Maps '<strain>_union' / '<strain>_intersection' to a list of gene IDs.
gene_list_dict = dict()

In [6]:
# For every strain, read each pairwise control-vs-control differential file,
# pick the significant genes, and store the union and the intersection of
# those gene sets across the strain's comparisons in gene_list_dict.
for strain in strains:
    # one set of significant genes per comparison file for this strain
    sig_gene_sets = []
    # sorted() gives a reproducible processing/print order (glob order is arbitrary)
    for diff_file in sorted(glob.glob(diffpath2+strain+'*control.vs.*control*.txt')):
        print(diff_file)
        df = pd.read_csv(diff_file,
                         sep='\t',
                         index_col=0)

        # select differential genes: adjusted p-value below `padj` and a
        # log2 fold change at or below -`log2fc` (one direction only)
        # df = df.loc[~(df.chr.str.contains('chrX|chrY|ChrUn')), :]
        sig_genes = df.index[(df['padj'] < padj) & (df['log2FoldChange'] <= -log2fc)].to_list()
        sig_gene_sets.append(set(sig_genes))

    if not sig_gene_sets:
        print('WARNING: no differential files matched for strain', strain)

    # Combine only after all files are read. Using "union is still empty" as
    # the first-file marker would re-seed the intersection whenever an earlier
    # comparison had no significant genes, instead of leaving it empty.
    g_union = set().union(*sig_gene_sets)
    g_intersect = set.intersection(*sig_gene_sets) if sig_gene_sets else set()

    # after getting union and intersection sets add to dict
    gene_list_dict[strain+'_union'] = list(g_union)
    gene_list_dict[strain+'_intersection'] = list(g_intersect)
        

/home/ttroutman/strainsKupffer/rnaKupfferHealthyYoung_20210111/rnaDiff/aj_control.vs.c57bl6j_control.scatter.txt
/home/ttroutman/strainsKupffer/rnaKupfferHealthyYoung_20210111/rnaDiff/aj_control.vs.balbcj_control.scatter.txt
/home/ttroutman/strainsKupffer/rnaKupfferHealthyYoung_20210111/rnaDiff/balbcj_control.vs.aj_control.scatter.txt
/home/ttroutman/strainsKupffer/rnaKupfferHealthyYoung_20210111/rnaDiff/balbcj_control.vs.c57bl6j_control.scatter.txt
/home/ttroutman/strainsKupffer/rnaKupfferHealthyYoung_20210111/rnaDiff/c57bl6j_control.vs.aj_control.scatter.txt
/home/ttroutman/strainsKupffer/rnaKupfferHealthyYoung_20210111/rnaDiff/c57bl6j_control.vs.balbcj_control.scatter.txt


In [7]:
# Report how many genes ended up in each union / intersection list.
for list_name, genes in gene_list_dict.items():
    print(list_name, ':', len(genes))

aj_union : 226
aj_intersection : 33
balb_union : 230
balb_intersection : 45
c57_union : 237
c57_intersection : 69


Read in Ty's gene lists and check that they match the union lists computed above.

In [8]:
# Gene IDs (first tab-separated column) from Ty's A/J union table.
ty_aj_union = []
with open('/home/ttroutman/strainsKupffer/rnaKupfferHealthyYoung_20210111/selfAnalysis/ajUnion.tsv', 'r') as f:
    next(f, None)  # discard the header row
    ty_aj_union.extend(row.split('\t')[0] for row in f)

In [9]:
# Sizes of Ty's list, our list, and their overlap; all three match when the
# two lists contain the same genes.
print(len(set(ty_aj_union)),
      len(set(gene_list_dict['aj_union'])),
      len(set(ty_aj_union) & set(gene_list_dict['aj_union'])),
      sep='\n')

226
226
226


In [10]:
# Gene IDs (first tab-separated column) from Ty's BALB/cJ union table.
ty_balb_union = []
with open('/home/ttroutman/strainsKupffer/rnaKupfferHealthyYoung_20210111/selfAnalysis/balbUnion.tsv', 'r') as f:
    next(f, None)  # discard the header row
    ty_balb_union.extend(row.split('\t')[0] for row in f)

In [11]:
# Sizes of Ty's list, our list, and their overlap; all three match when the
# two lists contain the same genes.
print(len(set(ty_balb_union)),
      len(set(gene_list_dict['balb_union'])),
      len(set(ty_balb_union) & set(gene_list_dict['balb_union'])),
      sep='\n')

230
230
230


In [12]:
# Gene IDs (first tab-separated column) from Ty's C57BL/6J union table.
ty_c57_union = []
with open('/home/ttroutman/strainsKupffer/rnaKupfferHealthyYoung_20210111/selfAnalysis/c57Union.tsv', 'r') as f:
    next(f, None)  # discard the header row
    ty_c57_union.extend(row.split('\t')[0] for row in f)

In [13]:
# Sizes of Ty's list, our list, and their overlap; all three match when the
# two lists contain the same genes.
print(len(set(ty_c57_union)),
      len(set(gene_list_dict['c57_union'])),
      len(set(ty_c57_union) & set(gene_list_dict['c57_union'])),
      sep='\n')

237
237
237


# Need to extract all 6 categories of gene regulation

In [14]:
# Show which union / intersection lists are available in gene_list_dict.
gene_list_dict.keys()

dict_keys(['aj_union', 'aj_intersection', 'balb_union', 'balb_intersection', 'c57_union', 'c57_intersection'])

In [15]:
# Genes present in both the A/J and BALB/cJ union lists
# (the following cell recomputes this together with the other two pairs).
aj_balb = set(gene_list_dict['aj_union']) & set(gene_list_dict['balb_union'])

In [16]:
# Genes shared between the union lists of each pair of strains.
aj_balb, aj_c57, balb_c57 = (
    set(gene_list_dict[first + '_union']) & set(gene_list_dict[second + '_union'])
    for first, second in (('aj', 'balb'), ('aj', 'c57'), ('balb', 'c57'))
)

In [17]:
# Build a gene -> group table with upper-cased gene names and write it out.
# Groups are applied in order, so a gene found in a pairwise-shared set ends
# up with the shared label rather than a single-strain union label.
list_names = ['aj_union',
              'balb_union',
              'c57_union',
              'aj_balb_shared',
              'aj_c57_shared',
              'balb_c57_shared']

gene_lists = [gene_list_dict['aj_union'],
              gene_list_dict['balb_union'],
              gene_list_dict['c57_union'],
              aj_balb,
              aj_c57,
              balb_c57]

gene_member_dict = {
    gene.upper(): group
    for genes, group in zip(gene_lists, list_names)
    for gene in genes
}

gene_member_series = pd.Series(gene_member_dict, name='target_group').rename_axis('target')

gene_member_series.to_csv('./diff_gene_membership_uppercase.txt',
                          sep='\t',
                          header=True)

In [18]:
# Build a gene -> group table with gene names in their original case and write
# it out. Groups are applied in order, so a gene found in a pairwise-shared
# set ends up with the shared label rather than a single-strain union label.
list_names = ['aj_union',
              'balb_union',
              'c57_union',
              'aj_balb_shared',
              'aj_c57_shared',
              'balb_c57_shared']

gene_lists = [gene_list_dict['aj_union'],
              gene_list_dict['balb_union'],
              gene_list_dict['c57_union'],
              aj_balb,
              aj_c57,
              balb_c57]

gene_member_dict = {
    gene: group
    for genes, group in zip(gene_lists, list_names)
    for gene in genes
}

gene_member_series = pd.Series(gene_member_dict, name='target_group').rename_axis('target')

gene_member_series.to_csv('./diff_gene_membership.txt',
                          sep='\t',
                          header=True)