Short utility notebook for looking into mutations at selected loci between the strains using MMARGE (`MMARGE.pl annotate_mutations`).

In [1]:
### header ###
__author__ = "Hunter Bennett"
__license__ = "BSD"
__email__ = "hunter.r.bennett@gmail.com"
%load_ext autoreload
%autoreload 2
### imports ###
import sys
%matplotlib inline
import os
import re
import glob
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns

# import custom functions
sys.path.insert(0, '/home/h1bennet/code/')
from hbUtils import ngs_qc, quantile_normalize_df
from homer_preprocessing import read_homer_gene_exp, read_annotated_peaks

In [2]:
# Data directory placeholder; it is empty and not referenced by the cells shown here.
dataDirectory = ''
# Every relative output path below (e.g. './210403_genes.txt') resolves against
# this directory because the process changes into it.
workingDirectory = '/home/h1bennet/strains/results/99b_Survey_Strain_mutations/'
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail when the directory already exists, so no separate isdir() check is needed.
os.makedirs(workingDirectory, exist_ok=True)
os.chdir(workingDirectory)

In [15]:
# Load the HOMER raw TPM table, asking the helper to index rows by gene name.
tpm = read_homer_gene_exp(
    '/home/ttroutman/strainsKupffer/rnaKupfferNASH/rnaQuan/HOMER.rawTPM.txt',
    gene_name_index=True,
)
# Disabled filter kept for reference; note that `tpm_mat` is not defined
# anywhere in the cells shown in this notebook.
# tpm_mat = tpm_mat.loc[:, tpm_mat.columns.str.contains('control')]

Select genes of interest

In [19]:
# Gene symbols whose loci are exported for mutation annotation.
genes_of_interest = [
    'Lepr',
    'Lep',
    'Mitf',
    'Tfe3',
    'Tfeb',
    'Tfec',
]

In [21]:
# Write the first four columns (chr, start, end, strand in the table preview)
# for the selected genes as a tab-separated file, which is the input passed to
# `MMARGE.pl annotate_mutations -file`.
(
    tpm
    .loc[genes_of_interest, :]
    .iloc[:, :4]
    .to_csv('./210403_genes.txt', sep='\t')
)

In [24]:
# Preview the first rows of the TPM table to check its column layout.
tpm.head()

Unnamed: 0_level_0,chr,start,end,strand,Length,Copies,Annotation/Divergence,AJ_Kupffer_RNA_AJ01C_HBENN_l20201208_TACCGAGG_CCTGAACT.aj.star_shifted_from_AJ.sa,AJ_Kupffer_RNA_AJ01D_HBENN_l20201208_CGTTAGAA_TTCAGGTC.aj.star_shifted_from_AJ.sa,BALBC_Kupffer_RNA_BALB01C_HBENN_l20201208_AGCCTCAT_AGTAGAGA.balbcj.star_shifted_from_BALBCJ.sa,...,aj_KupfferTotal_RNA_polyA_AMLNDiet_30week_AJ3A_JSSTDT_16_09_2,aj_KupfferTotal_RNA_polyA_AMLNDiet_30week_AJ3B_JSSTDT_16_09_2,aj_KupfferTotal_RNA_polyA_AMLNDiet_30week_AJ3c_JSS_TDT_16_09_2,balbc_KupfferTotal_RNA_polyA_AMLNDiet_30week_Balb3B_JSSTDT_16_09_2,balbc_KupfferTotal_RNA_polyA_AMLNDiet_30week_Balb3C_JSSTDT_16_09_2,balbc_KupfferTotal_RNA_polyA_AMLNDiet_30week_Balb3d_JSS_TDT_16_09_2,NCoRWT_KupfferTim4Pos_RNA_polyA_AMLNDiet_30week_LN144B_JSS_TDT_16_10_2,NCoRWT_KupfferTim4Pos_RNA_polyA_AMLNDiet_30week_LN146C_JSS_TDT_16_10_2,NCoRWT_KupfferTim4Pos_RNA_polyA_AMLNDiet_30week_LN146D_JSS_TDT_16_10_2,NCoRWT_KupfferTim4Pos_RNA_polyA_AMLNDiet_30week_LN152B_JSS_TDT_16_10_2
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mia2,chr12,59095800,59190175,+,4538.0,1,Mia2|Ctage5|D12Bwg0579e|Mea6|Mgea|Mgea6|-|12 2...,60.108,67.625,57.602,...,60.632,53.128,26.63,64.113,45.735,43.279,39.245,33.394,39.861,35.133
Abraxas1,chr5,100804802,100820935,-,2488.0,1,Abraxas1|3830405G04Rik|5630400M01Rik|AI506069|...,2.581,4.101,2.813,...,2.803,3.057,1.566,2.611,3.043,2.577,2.701,2.288,1.751,2.201
March1,chr8,66386301,66471637,+,3936.0,1,March1|2900024D24Rik|BB085186|-|8|8 B3.1|prote...,57.909,79.025,55.064,...,112.092,98.156,51.064,103.723,73.842,55.775,58.059,64.909,58.102,53.251
Mob3b,chr4,34949074,35157484,-,6024.0,1,Mob3b|8430436F23Rik|A430018A01Rik|Mobkl2b|-|4 ...,1.531,2.496,3.912,...,2.454,2.114,1.031,3.396,3.078,2.522,1.165,0.533,0.925,0.987
1810034E14Rik,chr13,64248700,64268145,+,1799.0,1,1810034E14Rik|-|-|13|13 B3|ncRNA,0.974,1.119,0.458,...,1.562,0.917,1.259,0.808,1.133,1.74,1.818,1.549,2.315,1.745


    MMARGE.pl annotate_mutations -file ./210403_genes.txt \
    -ind balbcj, aj
    
    MMARGE.pl annotate_mutations -genome mm10 -exons -file ./210403_genes.txt \
    -ind balbcj, aj

# Check exons

In [26]:
# Reload the same HOMER raw TPM table, this time without the gene-name index
# (the output below shows rows keyed by refseqID instead).
tpm_refseq = read_homer_gene_exp(
    '/home/ttroutman/strainsKupffer/rnaKupfferNASH/rnaQuan/HOMER.rawTPM.txt',
    gene_name_index=False,
)

In [32]:
# Show the row(s) whose annotation string contains the literal text "Lepr|".
# The pattern is a raw string because '\|' is not a valid Python string escape
# (it triggers an invalid-escape warning in a normal string literal); the regex
# itself is unchanged and still matches a literal pipe after "Lepr".
# na=False treats missing annotations as non-matches so the mask stays boolean.
tpm_refseq.loc[
    tpm_refseq['Annotation/Divergence'].str.contains(r'Lepr\|', na=False),
    :
]

Unnamed: 0_level_0,chr,start,end,strand,Length,Copies,Annotation/Divergence,AJ_Kupffer_RNA_AJ01C_HBENN_l20201208_TACCGAGG_CCTGAACT.aj.star_shifted_from_AJ.sa,AJ_Kupffer_RNA_AJ01D_HBENN_l20201208_CGTTAGAA_TTCAGGTC.aj.star_shifted_from_AJ.sa,BALBC_Kupffer_RNA_BALB01C_HBENN_l20201208_AGCCTCAT_AGTAGAGA.balbcj.star_shifted_from_BALBCJ.sa,...,aj_KupfferTotal_RNA_polyA_AMLNDiet_30week_AJ3A_JSSTDT_16_09_2,aj_KupfferTotal_RNA_polyA_AMLNDiet_30week_AJ3B_JSSTDT_16_09_2,aj_KupfferTotal_RNA_polyA_AMLNDiet_30week_AJ3c_JSS_TDT_16_09_2,balbc_KupfferTotal_RNA_polyA_AMLNDiet_30week_Balb3B_JSSTDT_16_09_2,balbc_KupfferTotal_RNA_polyA_AMLNDiet_30week_Balb3C_JSSTDT_16_09_2,balbc_KupfferTotal_RNA_polyA_AMLNDiet_30week_Balb3d_JSS_TDT_16_09_2,NCoRWT_KupfferTim4Pos_RNA_polyA_AMLNDiet_30week_LN144B_JSS_TDT_16_10_2,NCoRWT_KupfferTim4Pos_RNA_polyA_AMLNDiet_30week_LN146C_JSS_TDT_16_10_2,NCoRWT_KupfferTim4Pos_RNA_polyA_AMLNDiet_30week_LN146D_JSS_TDT_16_10_2,NCoRWT_KupfferTim4Pos_RNA_polyA_AMLNDiet_30week_LN152B_JSS_TDT_16_10_2
refseqID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NM_001122899,chr4,101717407,101813667,+,5518.0,1,Lepr|LEPROT|Leprb|Modb1|OB-RGRP|Obr|db|diabete...,14.771,20.947,144.931,...,10.169,8.287,11.375,39.444,19.404,65.375,1.67,4.27,1.672,2.179


In [39]:
# Read the tab-separated mm10 exon table; the first column becomes the index.
exons = pd.read_csv(
    '/home/ttroutman/strainsKupffer/annotateMutations/mm10_exons',
    index_col=0,
    sep='\t',
)

In [40]:
# Display the exon rows whose index label contains the Lepr RefSeq ID
# (labels look like 'NM_001122899--Part1' in the output below).
exons.loc[
    exons.index.str.contains('NM_001122899'),
    :
]

Unnamed: 0_level_0,chr,start,end,strand,Code,Divergence,FullStart,FullEnd
RepeatID (cmd=analyzeRepeats.pl rna mm10 -count exons),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
NM_001122899--Part1,chr4,101717407,101717938,+,E:101717407,0,101717407,101813667
NM_001122899--Part2,chr4,101728010,101728029,+,E:101728010,0,101717407,101813667
NM_001122899--Part3,chr4,101728030,101728069,+,E:101728030,0,101717407,101813667
NM_001122899--Part4,chr4,101733267,101733596,+,E:101733267,0,101717407,101813667
NM_001122899--Part5,chr4,101735560,101735683,+,E:101735560,0,101717407,101813667
NM_001122899--Part6,chr4,101745511,101745716,+,E:101745511,0,101717407,101813667
NM_001122899--Part7,chr4,101750287,101750429,+,E:101750287,0,101717407,101813667
NM_001122899--Part8,chr4,101752061,101752205,+,E:101752061,0,101717407,101813667
NM_001122899--Part9,chr4,101764861,101765151,+,E:101764861,0,101717407,101813667
NM_001122899--Part10,chr4,101765304,101765421,+,E:101765304,0,101717407,101813667


In [41]:
# Save the exon rows for NM_001122899 as a tab-separated file, which is the
# input passed to `MMARGE.pl annotate_mutations -file`.
(
    exons
    .loc[exons.index.str.contains('NM_001122899'), :]
    .to_csv('./lepr_exons.txt', sep='\t')
)

    MMARGE.pl annotate_mutations -file ./lepr_exons.txt \
    -ind balbcj, aj

# Look in annotated leptin receptor (Lepr) ATAC peaks

In [5]:
# Load the annotated ATAC peak file. The helper returns two objects; only `df`
# is used in the cells shown here.
# NOTE(review): `mat` is presumably the numeric signal matrix -- confirm against
# read_annotated_peaks in homer_preprocessing.
df, mat = read_annotated_peaks(
    '/gpfs/data01/glasslab/home/h1bennet/strains/results/00_Strains_Control_H3K27Ac/merged_peaks/ann_norm_kc_control_atac_peaks_all.txt'
)

In [8]:
# Keep the peaks whose nearest promoter is the Lepr transcript NM_001122899 and
# write their first four columns as a tab-separated file, which is the input
# passed to `MMARGE.pl annotate_mutations -file`.
(
    df
    .loc[df['Nearest PromoterID'].eq('NM_001122899'), :]
    .iloc[:, :4]
    .to_csv('./lepr_atac_peaks.txt', sep='\t')
)

    MMARGE.pl annotate_mutations -file ./lepr_atac_peaks.txt \
    -ind balbcj, aj

# Look in ATAC peaks from browser

In [19]:
# Coordinates of three hand-entered chr4 regions. The four lists line up
# element-wise, and the next cell labels the regions intron_1, intron_2 and tss
# (in that order).
# NOTE(review): the second region has start 101741039 > end 101722450, i.e. the
# interval is inverted. One of the two coordinates is probably a typo -- confirm
# against the browser before relying on the annotation for that region.
chrs = ['chr4'] * 3
start = [
    101740726,
    101741039,
    101714403,
]
end = [
    101743066,
    101722450,
    101717403,
]
strand = ['+'] * 3

In [20]:
# Assemble the hand-entered regions into a peak table (one row per region,
# columns chr/start/end/strand) and save it as a tab-separated file, which is
# the input passed to `MMARGE.pl annotate_mutations -file`.
# The to_csv() result is deliberately not assigned: with a path argument it
# returns None, and binding that to `df` threw away the annotated peak table
# loaded earlier in the notebook.
custom_peaks = pd.DataFrame(
    {'chr': chrs, 'start': start, 'end': end, 'strand': strand},
    index=['intron_1', 'intron_2', 'tss'],
)
custom_peaks.to_csv('./lepr_custom_peaks.txt', sep='\t')

    MMARGE.pl annotate_mutations -file ./lepr_custom_peaks.txt \
    -ind balbcj, aj