In [1]:
### Author: Leonie Küchenhoff
### Date: October 2022
### Purpose of script: Write one list per tissue of tissue specific variants

In [2]:
import numpy as np
import pandas as pd
import os
from config import basedir, tissue_specdir, annotation_dir

In [3]:
outdir = tissue_specdir
annodir = annotation_dir


print('This base directory will be used:\n', basedir) 
os.chdir(basedir)

# define filtering method
filter = '1ed'
names = ['279', '282', '450']
paths = [f'merged/txt_files/ad_hc_mu/HLT{i}.specific.txt' for i in names]
col1 = 'ad_h'
col2 = 'ad_l'
col3 = 'ad_t'
paths_anno = [f'{annodir}/HLT{i}.merged.mm10_multianno.txt' for i in names]

This base directory will be used:
 /g/steinmetz/project/leonie_crispr/03_data/01_heartproject/snakemake_vcf/


In [4]:
# read in tissue specific variants
file_dict = {}
for i in zip(paths, names):
    df = pd.read_csv(i[0], delimiter = '\t')
    file_dict[i[1]] = df

In [5]:
def get_sets(df):
    setlist = np.arange(0, len(df))
    heart = setlist[df['heart']]
    liver = setlist[df['liver']]
    tail = setlist[df['tail']]
    return heart, liver, tail

In [6]:
venn_dict = {}
for i in names:
    venn_dict[i] = get_sets(file_dict[i])

In [7]:
def get_af(df, col1 = 'AD|h_mu', col2 = 'AD|l_mu', col3 = 'AD|t_mu'):
    # determine allele ferquency and total number of reads per variant and sample
    mouse_ad = df[[col1, col2, col3]]
    allel1 = mouse_ad.applymap(lambda x: int(x.split(',')[0])).to_numpy()
    allel2 = mouse_ad.applymap(lambda x: int(x.split(',')[1])).to_numpy()
    af = allel2 / (allel1 + allel2)
    af = np.nan_to_num(af, 0)
    tot_reads = allel1 + allel2
    return af, tot_reads

In [13]:
heart_spec_279 = file_dict['279'][file_dict['279']['heart'] == True]
heart_spec_282 = file_dict['282'][file_dict['282']['heart'] == True]
heart_spec_450 = file_dict['450'][file_dict['450']['heart'] == True]

In [14]:
# heart specific variants with additional filter step:
# only allow variants that do not have any alt. allesles measured with mutect in other tissues

af, tot_reads = get_af(heart_spec_279, col1, col2, col3) 
heart_spec_279['AF_h'], heart_spec_279['AF_l'], heart_spec_279['AF_t'] = af[:, 0], af[:, 1], af[:, 2]
heart_spec_279['reads_h'], heart_spec_279['reads_l'], heart_spec_279['reads_t'] = tot_reads[:, 0], tot_reads[:, 1], tot_reads[:, 2]
heart_spec_279_3 = heart_spec_279.iloc[np.where((af[:,1]==0) &(af[:,2]==0))]

af, tot_reads  = get_af(heart_spec_282, col1, col2, col3) 
heart_spec_282['AF_h'], heart_spec_282['AF_l'], heart_spec_282['AF_t'] = af[:, 0], af[:, 1], af[:, 2]
heart_spec_282['reads_h'], heart_spec_282['reads_l'], heart_spec_282['reads_t'] = tot_reads[:, 0], tot_reads[:, 1], tot_reads[:, 2]
heart_spec_282_3 = heart_spec_282.iloc[np.where((af[:,1]==0) &(af[:,2]==0))]

af, tot_reads  = get_af(heart_spec_450, col1, col2, col3) 
heart_spec_450['AF_h'], heart_spec_450['AF_l'], heart_spec_450['AF_t'] = af[:, 0], af[:, 1], af[:, 2]
heart_spec_450['reads_h'], heart_spec_450['reads_l'], heart_spec_450['reads_t'] = tot_reads[:, 0], tot_reads[:, 1], tot_reads[:, 2]
heart_spec_450_3 = heart_spec_450.iloc[np.where((af[:,1]==0) &(af[:,2]==0))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heart_spec_279['AF_h'], heart_spec_279['AF_l'], heart_spec_279['AF_t'] = af[:, 0], af[:, 1], af[:, 2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heart_spec_279['AF_h'], heart_spec_279['AF_l'], heart_spec_279['AF_t'] = af[:, 0], af[:, 1], af[:, 2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  h

In [17]:
use_cols = ['Func.refGene', 'Gene.refGene',
       'GeneDetail.refGene', 'ExonicFunc.refGene', 'AAChange.refGene',
       'Otherinfo4', 'Otherinfo5', 'Otherinfo7', 'Otherinfo8']
annotated_heart_dict = {}
samples = [heart_spec_279_3,heart_spec_282_3, heart_spec_450_3]
for count, i in enumerate(paths_anno):
       #add annotation to files
       heart_file = samples[count][['chr', 'pos', 'ref', 'alt', col1, col2, col3, 'AF_h', 'AF_l', 'AF_t', 'reads_h', 'reads_l', 'reads_t']]
       anno = pd.read_csv(i, delimiter = '\t', usecols = use_cols).rename(columns={'Otherinfo4':'chr', 'Otherinfo5':'pos', 'Otherinfo7':'ref', 'Otherinfo8':'alt'})
       merged = pd.merge(heart_file, anno, how = 'left',on = ['chr', 'pos', 'ref', 'alt'])
       merged.loc[:,'total_reads'] = merged.loc[:,'reads_h'] +  merged.loc[:,'reads_l'] +  merged.loc[:,'reads_t']
       merged.loc[:,'lt_reads'] =  merged.loc[:,'reads_l'] +  merged.loc[:,'reads_t']
       merged.loc[:,'present_in_sample'] =  names[count]
       annotated_heart_dict[names[count]] = merged
       


In [36]:
def annotate_duplicated(df_main, df_compare1, df_compare2):
    '''
    function to indicate wether variant occurs only in on sample or also other sample
    '''
    df_main_anno = pd.merge(df_main,df_compare1[['chr', 'pos', 'ref','alt', 'present_in_sample']], on =['chr', 'pos', 'ref','alt'], how = 'left')
    df_main_anno['present_in_sample_y'] = df_main_anno['present_in_sample_y'].fillna('')
    df_main_anno['present_in_sample_y'] = df_main_anno['present_in_sample_x'] +', ' + df_main_anno['present_in_sample_y']

    df_main_anno = pd.merge(df_main_anno,df_compare2[['chr', 'pos', 'ref','alt', 'present_in_sample']], on =['chr', 'pos', 'ref','alt'], how = 'left')
    df_main_anno['present_in_sample'] = df_main_anno['present_in_sample'].fillna('')
    df_main_anno['present_in_sample'] = df_main_anno['present_in_sample_y'] +', ' + df_main_anno['present_in_sample']

    df_main_anno = df_main_anno.drop(['present_in_sample_x', 'present_in_sample_y'], axis = 1)
    return(df_main_anno)

In [41]:
# use annotate_duplicated function to indicate wether variant occurs only in on sample or also other sample
# save file
heart279 = annotate_duplicated(annotated_heart_dict['279'], annotated_heart_dict['282'], annotated_heart_dict['450'])
heart279.to_csv(f'{outdir}/sample_279_{filter}.txt', sep = '\t', index = False)
heart282 = annotate_duplicated(annotated_heart_dict['282'], annotated_heart_dict['279'], annotated_heart_dict['450'])
heart282.to_csv(f'{outdir}/sample_282_{filter}.txt', sep = '\t', index = False)
heart450 = annotate_duplicated(annotated_heart_dict['450'], annotated_heart_dict['279'], annotated_heart_dict['282'])
heart450.to_csv(f'{outdir}/sample_450_{filter}.txt', sep = '\t', index = False)

## Liver

In [43]:
liver_spec_279 = venn_dict['279'][3][(venn_dict['279'][3]['heart'] == False) & (venn_dict['279'][3]['tail'] == False) & (venn_dict['279'][3]['liver'] == True)] 
liver_spec_282 = venn_dict['282'][3][(venn_dict['282'][3]['heart'] == False) & (venn_dict['282'][3]['tail'] == False) & (venn_dict['282'][3]['liver'] == True)] 
liver_spec_450 = venn_dict['450'][3][(venn_dict['450'][3]['heart'] == False) & (venn_dict['450'][3]['tail'] == False) & (venn_dict['450'][3]['liver'] == True)] 

In [47]:
# heart specific variants with additional filter step:
# only allow variants that do not have any alt. allesles measured with mutect in other tissues

af, tot_reads = get_af(liver_spec_279, col1, col2, col3) 
liver_spec_279['AF_h'], liver_spec_279['AF_l'], liver_spec_279['AF_t'] = af[:, 0], af[:, 1], af[:, 2]
liver_spec_279['reads_h'], liver_spec_279['reads_l'], liver_spec_279['reads_t'] = tot_reads[:, 0], tot_reads[:, 1], tot_reads[:, 2]
liver_spec_279_3 = liver_spec_279.iloc[np.where((af[:,0]==0) &(af[:,2]==0))]

af, tot_reads  = get_af(liver_spec_282, col1, col2, col3) 
liver_spec_282['AF_h'], liver_spec_282['AF_l'], liver_spec_282['AF_t'] = af[:, 0], af[:, 1], af[:, 2]
liver_spec_282['reads_h'], liver_spec_282['reads_l'], liver_spec_282['reads_t'] = tot_reads[:, 0], tot_reads[:, 1], tot_reads[:, 2]
liver_spec_282_3 = liver_spec_282.iloc[np.where((af[:,0]==0) &(af[:,2]==0))]

af, tot_reads  = get_af(liver_spec_450, col1, col2, col3) 
liver_spec_450['AF_h'], liver_spec_450['AF_l'], liver_spec_450['AF_t'] = af[:, 0], af[:, 1], af[:, 2]
liver_spec_450['reads_h'], liver_spec_450['reads_l'], liver_spec_450['reads_t'] = tot_reads[:, 0], tot_reads[:, 1], tot_reads[:, 2]
liver_spec_450_3 = liver_spec_450.iloc[np.where((af[:,0]==0) &(af[:,2]==0))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  liver_spec_279['AF_h'], liver_spec_279['AF_l'], liver_spec_279['AF_t'] = af[:, 0], af[:, 1], af[:, 2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  liver_spec_279['AF_h'], liver_spec_279['AF_l'], liver_spec_279['AF_t'] = af[:, 0], af[:, 1], af[:, 2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  l

In [48]:

paths_anno = [f'{annodir}/HLT{i}.merged.mm10_multianno.txt' for i in names]

use_cols = ['Func.refGene', 'Gene.refGene',
       'GeneDetail.refGene', 'ExonicFunc.refGene', 'AAChange.refGene',
       'Otherinfo4', 'Otherinfo5', 'Otherinfo7', 'Otherinfo8']
annotated_liver_dict = {}
samples = [liver_spec_279_3,liver_spec_282_3, liver_spec_450_3]
for count, i in enumerate(paths_anno):
       #add annotation to files
       liver_file = samples[count][['chr', 'pos', 'ref', 'alt', 'AD|h_mu', 'AD|l_mu', 'AD|t_mu', 'AF_h', 'AF_l', 'AF_t', 'reads_h', 'reads_l', 'reads_t']]
       anno = pd.read_csv(i, delimiter = '\t', usecols = use_cols).rename(columns={'Otherinfo4':'chr', 'Otherinfo5':'pos', 'Otherinfo7':'ref', 'Otherinfo8':'alt'})
       merged = pd.merge(liver_file, anno, how = 'left',on = ['chr', 'pos', 'ref', 'alt'])
       merged.loc[:,'total_reads'] = merged.loc[:,'reads_h'] +  merged.loc[:,'reads_l'] +  merged.loc[:,'reads_t']
       merged.loc[:,'lt_reads'] =  merged.loc[:,'reads_l'] +  merged.loc[:,'reads_t']
       merged.loc[:,'present_in_sample'] =  names[count]
       annotated_liver_dict[names[count]] = merged
       


In [49]:
# use annotate_duplicated function to indicate wether variant occurs only in on sample or also other sample
# save file
liver279 = annotate_duplicated(annotated_liver_dict['279'], annotated_liver_dict['282'], annotated_liver_dict['450'])
liver279.to_csv(f'{outdir}/liver_sample_279_{filter}.txt', sep = '\t', index = False)
liver282 = annotate_duplicated(annotated_liver_dict['282'], annotated_liver_dict['279'], annotated_liver_dict['450'])
liver282.to_csv(f'{outdir}/liver_sample_282_{filter}.txt', sep = '\t', index = False)
liver450 = annotate_duplicated(annotated_liver_dict['450'], annotated_liver_dict['279'], annotated_liver_dict['282'])
liver450.to_csv(f'{outdir}/liver_sample_450_{filter}.txt', sep = '\t', index = False)

## Tail

In [50]:
tail_spec_279 = venn_dict['279'][3][(venn_dict['279'][3]['heart'] == False) & (venn_dict['279'][3]['tail'] == True) & (venn_dict['279'][3]['liver'] == False)] 
tail_spec_282 = venn_dict['282'][3][(venn_dict['282'][3]['heart'] == False) & (venn_dict['282'][3]['tail'] == True) & (venn_dict['282'][3]['liver'] == False)] 
tail_spec_450 = venn_dict['450'][3][(venn_dict['450'][3]['heart'] == False) & (venn_dict['450'][3]['tail'] == True) & (venn_dict['450'][3]['liver'] == False)] 


In [51]:

# tail specific variants with additional filter step:
# only allow variants that do not have any alt. allesles measured with mutect in other tissues

af, tot_reads = get_af(tail_spec_279, col1, col2, col3) 
tail_spec_279['AF_h'], tail_spec_279['AF_l'], tail_spec_279['AF_t'] = af[:, 0], af[:, 1], af[:, 2]
tail_spec_279['reads_h'], tail_spec_279['reads_l'], tail_spec_279['reads_t'] = tot_reads[:, 0], tot_reads[:, 1], tot_reads[:, 2]
tail_spec_279_3 = tail_spec_279.iloc[np.where((af[:,0]==0) &(af[:,1]==0))]

af, tot_reads  = get_af(tail_spec_282, col1, col2, col3) 
tail_spec_282['AF_h'], tail_spec_282['AF_l'], tail_spec_282['AF_t'] = af[:, 0], af[:, 1], af[:, 2]
tail_spec_282['reads_h'], tail_spec_282['reads_l'], tail_spec_282['reads_t'] = tot_reads[:, 0], tot_reads[:, 1], tot_reads[:, 2]
tail_spec_282_3 = tail_spec_282.iloc[np.where((af[:,0]==0) &(af[:,1]==0))]

af, tot_reads  = get_af(tail_spec_450, col1, col2, col3) 
tail_spec_450['AF_h'], tail_spec_450['AF_l'], tail_spec_450['AF_t'] = af[:, 0], af[:, 1], af[:, 2]
tail_spec_450['reads_h'], tail_spec_450['reads_l'], tail_spec_450['reads_t'] = tot_reads[:, 0], tot_reads[:, 1], tot_reads[:, 2]
tail_spec_450_3 = tail_spec_450.iloc[np.where((af[:,0]==0) &(af[:,1]==0))]

paths_anno = [f'{annodir}/HLT{i}.merged.mm10_multianno.txt' for i in names]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tail_spec_279['AF_h'], tail_spec_279['AF_l'], tail_spec_279['AF_t'] = af[:, 0], af[:, 1], af[:, 2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tail_spec_279['AF_h'], tail_spec_279['AF_l'], tail_spec_279['AF_t'] = af[:, 0], af[:, 1], af[:, 2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tail_sp

In [52]:

use_cols = ['Func.refGene', 'Gene.refGene',
       'GeneDetail.refGene', 'ExonicFunc.refGene', 'AAChange.refGene',
       'Otherinfo4', 'Otherinfo5', 'Otherinfo7', 'Otherinfo8']
annotated_tail_dict = {}
samples = [tail_spec_279_3,tail_spec_282_3, tail_spec_450_3]
for count, i in enumerate(paths_anno):
       #add annotation to files
       tail_file = samples[count][['chr', 'pos', 'ref', 'alt', 'AD|h_mu', 'AD|l_mu', 'AD|t_mu', 'AF_h', 'AF_l', 'AF_t', 'reads_h', 'reads_l', 'reads_t']]
       anno = pd.read_csv(i, delimiter = '\t', usecols = use_cols).rename(columns={'Otherinfo4':'chr', 'Otherinfo5':'pos', 'Otherinfo7':'ref', 'Otherinfo8':'alt'})
       merged = pd.merge(tail_file, anno, how = 'left',on = ['chr', 'pos', 'ref', 'alt'])
       merged.loc[:,'total_reads'] = merged.loc[:,'reads_h'] +  merged.loc[:,'reads_l'] +  merged.loc[:,'reads_t']
       merged.loc[:,'lt_reads'] =  merged.loc[:,'reads_l'] +  merged.loc[:,'reads_t']
       merged.loc[:,'present_in_sample'] =  names[count]
       annotated_tail_dict[names[count]] = merged
       


In [53]:
# use annotate_duplicated function to indicate wether variant occurs only in on sample or also other sample
# save file
tail279 = annotate_duplicated(annotated_tail_dict['279'], annotated_tail_dict['282'], annotated_tail_dict['450'])
tail279.to_csv(f'{outdir}/tail_sample_279_{filter}.txt', sep = '\t', index = False)
tail282 = annotate_duplicated(annotated_tail_dict['282'], annotated_tail_dict['279'], annotated_tail_dict['450'])
tail282.to_csv(f'{outdir}/tail_sample_282_{filter}.txt', sep = '\t', index = False)
tail450 = annotate_duplicated(annotated_tail_dict['450'], annotated_tail_dict['279'], annotated_tail_dict['282'])
tail450.to_csv(f'{outdir}/tail_sample_450_{filter}.txt', sep = '\t', index = False)