Hunter Bennett | Glass Lab | Kupffer Strains Project | April 27 2021  

This series of notebooks is for manuscript preparation of "final" figures for import into Adobe Illustrator.

This notebook in particular prepares a panel showing that leptin treatment induces BALBcJ specific gene expression.

In [56]:
# Print the kernel's current working directory via a shell escape.
# NOTE(review): this cell carries execution count 56 while the cells below it
# carry lower counts, i.e. it was run out of order; the output only reflects
# the directory after the os.chdir() call further down has been executed.
!pwd

/gpfs/data01/glasslab/home/h1bennet/strains_rna/results/10_Strains_Manuscript_Prep


In [36]:
### header ###
__author__ = "Hunter Bennett"
__license__ = "BSD"
__email__ = "hunter.r.bennett@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
import sys
%matplotlib inline
import os
import re
import glob
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns

# import custom functions
import sys
sys.path.insert(0, '/home/h1bennet/code/')
from hbUtils import ngs_qc, quantile_normalize_df
from plotting_scripts import label_point, pca_rpkm_mat, get_diff_volcano, gene_list_bar_plot
from homer_preprocessing import read_annotated_peaks, import_homer_diffgene, pull_comparisons_get_diff, read_homer_gene_exp

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [50]:
# Input directory holding the differential-expression and expression tables
# read by the comparison loop, and the directory all outputs are written to.
dataDirectory = '/home/h1bennet/strains_rna/results/A00_Strains_Control_WholeLiver_RNA/'
workingDirectory = '/home/h1bennet/strains_rna/results/10_Strains_Manuscript_Prep/'

# makedirs(exist_ok=True) replaces the check-then-mkdir pattern: it is a no-op
# when the directory already exists and also creates missing parent folders.
os.makedirs(workingDirectory, exist_ok=True)
# Every relative path used later in the notebook is resolved against this directory.
os.chdir(workingDirectory)

# Output folder for the per-gene bar plot PDFs.
os.makedirs('./kc_identity_gene_barplots', exist_ok=True)

In [38]:
#### PLOTTING PARAMETERS FOR MANUSCRIPT ####
# Single colour used for all text, tick and axis elements (adjusts the default
# colour for plots to black).
COLOR = 'black'

matplotlib.rcParams.update({
    # get matplotlib to save readable fonts in PDF / PostScript output
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    # base font size and resolution used when saving figures
    'font.size': 6,
    'savefig.dpi': 500,
    # line widths of the axes frame and the major ticks
    'axes.linewidth': 1,
    'xtick.major.width': 1,
    'ytick.major.width': 1,
    # colours of text, labels, ticks and the axes frame
    'text.color': COLOR,
    'axes.labelcolor': COLOR,
    'xtick.color': COLOR,
    'ytick.color': COLOR,
    'axes.edgecolor': COLOR,
})

#### PLOT PARAMETERS FOR THIS PLOT ####

# Define plotting function

In [39]:
def gene_list_bar_plot_illus(genes, data, groups,
                             group_labels, colors, width=0.5,
                             spacing=0.0, title='', ylabel='', xlabel='',
                             xticklabels=None,
                             ax=None):
    '''Draw qPCR-style grouped bar plots (group mean with an upper error bar of
    one standard deviation) for one or more genes of an expression matrix.

        Accepts:
            genes (list of str): gene IDs to plot; looked up in data.index with
                reindex, so IDs missing from data yield NaN bars
            data (pandas DataFrame): gene x sample matrix with gene IDs as index
            groups (list of list of int): positional column indices of the
                samples belonging to each group
            group_labels (list of str): legend label of each group in groups
            colors (list of str): bar color of each group in groups
            width (float): width of each bar, default 0.5
            spacing (float): extra spacing between neighbouring bars, default 0.0
            title (str): axis title
            ylabel (str): y axis label
            xlabel (str): x axis label
            xticklabels (sequence of str or None): tick labels; label k is placed
                at the x offset of the k-th group's bar. None or an empty
                sequence leaves the default ticks untouched.
            ax (matplotlib.Axes object): axis to draw on, default current axis

        Returns:
            ax (matplotlib.Axes object): axis holding the gene list bar plot'''

    import numpy as np

    if xticklabels is None:
        xticklabels = []

    # set axis; pyplot is only needed when no axis was handed in
    if ax is None:
        import matplotlib.pyplot as plt
        ax = plt.gca()

    ind = np.arange(len(genes))    # the x location of each gene's first bar
    step = width + spacing         # x offset between consecutive groups

    # plot one set of bars per group
    for k, (group, label, color) in enumerate(zip(groups, group_labels, colors)):
        # slice the group's samples once and reuse it for mean and std
        expression = data.iloc[:, group].reindex(genes)
        mean = expression.mean(axis=1)
        std = expression.std(axis=1)
        ax.bar(ind + step*k,
               mean,
               width,
               color=color,
               # asymmetric error bar: nothing below the bar top, one std above
               yerr=[np.zeros(len(std)), std],
               edgecolor='k',
               linewidth=1,
               # cap size is set here only: values in error_kw take precedence
               # over a separate capsize= keyword, so passing both is redundant
               error_kw={'elinewidth':1,
                         'capthick':1,
                         'capsize':4},
               label=label)

    if len(xticklabels) > 0:
        # set_xticks needs a flat 1-D sequence of positions; ind is an array, so
        # the per-label offsets are flattened before being handed over
        tick_positions = np.ravel([ind + step*k
                                   for k in range(len(xticklabels))])
        ax.set_xticks(tick_positions)
        ax.set_xticklabels(xticklabels)
    ax.legend(fontsize=10)
    ax.set_ylabel(ylabel, fontsize=8)
    ax.set_xlabel(xlabel, fontsize=8)
    ax.set_title(title, fontsize=8)

    return ax

# Make folder for figures

In [40]:
# NOTE(review): the bar-plot loop at the end of this notebook saves its PDFs to
# './kc_identity_gene_barplots' (created in the directory-setup cell), not to
# the folder made here -- confirm whether this directory is still needed.
# makedirs(exist_ok=True) is a no-op when the folder already exists.
os.makedirs('./kc_identity_rna_bar_plots/', exist_ok=True)

# Import RNA-seq data

In [41]:
# Cut-off applied to the adjusted p-values (de.padj) in the comparison loop.
pthresh = 0.05
# Same value under a second name; not referenced by the other cells shown here.
pval = pthresh

# log2 fold-change cut-offs for 2-fold and 4-fold changes; the comparison loop
# uses the first entry of lfcthresh.
log2fc = np.log2(2)
lfcthresh = (log2fc, np.log2(4))


Read in data from experiment

In [42]:
# Load the tab-separated expression table (first column becomes the index) and
# keep only the columns whose name contains 'control'.
# NOTE(review): the comparison loop further down unpacks a return value of
# import_homer_diffgene into a variable that is also called `tpm_mat`, so after
# that loop has run this frame is no longer what `tpm_mat` refers to.
tpm_mat = (
    pd.read_csv('/home/ttroutman/strainsKupffer/rnaKupfferNASH/rnaQuan/rawT.txt',
                sep='\t', index_col=0)
    .loc[:, lambda frame: frame.columns.str.contains('control')]
)

Read in the Ido Amit KC gene list

In [43]:
# One gene per line; surrounding whitespace and any double quotes are removed.
# Iterating the file handle directly yields the same lines as readlines().
with open('/gpfs/data01/glasslab/home/h1bennet/data/Amit_KC_genelist.txt', 'r') as f:
    kc_list = [entry.strip().replace('\"', '') for entry in f]

Iterate through comparisons

In [44]:
# File names of the pairwise differential-expression tables; the comparison
# loop appends each one to dataDirectory + './differential/'.
# All entries are bare file names: a leading '/' on some of them only worked
# because the paths are built by plain string concatenation.
comps = ['aj_kupffer_control_young.vs.balbc_kupffer_control_young.scatter.homer.txt',
         'aj_kupffer_control_young.vs.c57_kupffer_control_young.scatter.homer.txt',
         'balbc_kupffer_control_young.vs.c57_kupffer_control_young.scatter.homer.txt']

# For each comparison, the two regular expressions used to pick the sample
# columns of the two groups being compared (same order as comps).
ids = [['AJ.*Kup', 'BALB.*Kup'],
       ['AJ.*Kup', 'C57.*Kup'],
       ['BALB.*Kup', 'C57.*Kup']]

# Filled by the comparison loop with genes from kc_list that pass the
# significance / fold-change cut-offs in at least one comparison.
strain_varying_kc_spec = []

In [45]:
# For every pairwise comparison: load its differential-expression table with
# import_homer_diffgene (helper imported from outside this notebook), select
# genes passing the padj / log2 fold-change cut-offs, and record those that are
# also in kc_list.
# NOTE(review): the tuple unpacking below rebinds the notebook-level `tpm_mat`
# that an earlier cell loaded from rawT.txt. After this loop, `tpm_mat` is the
# matrix returned for the LAST comparison, and that is what the bar-plot cell
# at the end of the notebook reads -- confirm this is the intended matrix.
# NOTE(review): strain_varying_kc_spec is created in a different cell, so
# re-running only this cell appends the same genes again; the later cells
# de-duplicate with set().
for comp, grep, in zip(comps, ids):
    print(comp)
    de, tpm, tpm_mat, tpm_mat_quant = import_homer_diffgene(
        dataDirectory+'./differential/'+comp,
        dataDirectory+'./expression/HOMER.rawTPM.txt',
        gene_name_index=True)

    # boolean masks over the columns of tpm_mat for the two compared groups
    g0 = tpm_mat.columns.str.contains(grep[0])
    g1 = tpm_mat.columns.str.contains(grep[1])
    col = g0 | g1

    # per-gene summary columns: log2(mean TPM + 1) over both groups and per
    # group, and -log10(padj) with a 1e-50 offset so padj == 0 stays finite.
    # These columns are not read again in the cells shown in this notebook.
    de['logtpm'] = np.log2(tpm_mat.loc[:, col].mean(1)+1)
    de['log10p'] = -np.log10(de.padj + 10**(-50))
    de['g0_mean'] = np.log2(tpm_mat.loc[:, g0].mean(1)+1)
    de['g1_mean'] = np.log2(tpm_mat.loc[:, g1].mean(1)+1)

    # subsets of the differential table; de_all and de_kc_spec are not used by
    # the rest of this cell. de_sig_fc1 keeps genes with padj < pthresh and
    # |log2FoldChange| >= lfcthresh[0].
    de_all = de
    de_kc_spec = de.reindex(kc_list).dropna()
    de_sig_fc1 = de.reindex(de.index[(de.padj < pthresh) & (np.abs(de.log2FoldChange) >= lfcthresh[0])])

    # split significant genes by the sign of log2FoldChange
    # (which group a positive value favours depends on the table's convention
    # -- TODO confirm)
    de_sig_fc1_up = de_sig_fc1.index[de_sig_fc1.log2FoldChange >= lfcthresh[0]].to_list()
    de_sig_fc1_dn = de_sig_fc1.index[de_sig_fc1.log2FoldChange <= -lfcthresh[0]].to_list()

    # find overlap with the gene list read into kc_list
    kc_up = set(de_sig_fc1_up).intersection(kc_list)
    kc_dn = set(de_sig_fc1_dn).intersection(kc_list)

    # store overlap (both directions) across all comparisons
    strain_varying_kc_spec.extend(set(kc_up))
    strain_varying_kc_spec.extend(set(kc_dn))

aj_kupffer_control_young.vs.balbc_kupffer_control_young.scatter.homer.txt
ge all transcripts (24940, 43)
Diff Gene selected transcripts (7612, 14)
ge selected transcripts (7612, 43)
/aj_kupffer_control_young.vs.c57_kupffer_control_young.scatter.homer.txt
ge all transcripts (24940, 43)
Diff Gene selected transcripts (7652, 14)
ge selected transcripts (7652, 43)
/balbc_kupffer_control_young.vs.c57_kupffer_control_young.scatter.homer.txt
ge all transcripts (24940, 43)
Diff Gene selected transcripts (7659, 14)
ge selected transcripts (7659, 43)


In [47]:
# First line: number of entries in the gene list; second line: number of
# distinct genes collected by the comparison loop.
print(len(kc_list), len(set(strain_varying_kc_spec)), sep='\n')

303
33


# Barplots of strain specific genes

In [58]:
# Positional column indices (used with DataFrame.iloc by the bar-plot function)
# of the samples in each group.
# assumes the plotted matrix has two replicate columns per strain in this
# order -- TODO confirm against the columns of the matrix actually plotted
groups = [[0, 1], [2, 3], [4, 5]]

# Display name and bar colour of each group, in the same order as groups.
group_labels = ['AJ', 'Balb/cJ', 'C57BL/6J']
colors = ['#de2d26', '#6baed6', '#74c476']

# One legend patch per strain, coloured like its bars.
legend_elements = [Patch(label=strain, color=shade)
                   for strain, shade in zip(group_labels, colors)]

In [60]:
# One small bar plot (TPM per strain) is written as a PDF for every gene
# collected by the comparison loop.
# NOTE(review): `tpm_mat` is whatever that name is bound to when this cell
# runs. The comparison loop rebinds it on every iteration, so after the loop it
# is the matrix returned for the last comparison rather than the 'control'
# matrix loaded from rawT.txt -- confirm that columns 0-5 of the matrix in
# scope really are the AJ / Balb/cJ / C57BL/6J replicates.
# sorted() makes the processing order deterministic; iterating a bare set is not.
for gene in sorted(set(strain_varying_kc_spec)):
    fig, ax = plt.subplots(figsize=(1,1))
    gene_list_bar_plot_illus([gene],
                       tpm_mat,
                       groups=groups,
                       group_labels = group_labels,
                       colors = colors,
                       xticklabels=group_labels,
                       title=gene,
                       xlabel='',
                       ylabel='TPM',
                       spacing=0.025,
                       width = 0.10,
                       ax=ax)

    # Hide the right and top spines
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    # the strain names are already shown as x tick labels
    ax.get_legend().remove()

    # save and close this specific figure rather than relying on pyplot's
    # implicit "current figure"
    fig.savefig('./kc_identity_gene_barplots/%s_strain_kc_tpm_barplot.pdf' % gene,
                bbox_inches='tight')
    plt.close(fig)