In [1]:
### Author: Leonie Küchenhoff
### Date: November 2022
### Purpose of script: Write one list per tissue of tissue specific variants

In [2]:
# pacage import
import numpy as np
import pandas as pd
import os
from config import basedir, tissue_specdir, annotation_dir
from itertools import product

In [3]:
#directory settings
outdir = tissue_specdir
annodir = annotation_dir
print('This base directory will be used:\n', basedir) 
os.chdir(basedir)
# sample names
names = ['028_pbs_R', '029_pbs_R', '030_nrch_R', '032_pbs_R', '033_nrch_R', '036_nrch_R',
       '011_pbs', '012_nrch', '013_nrch', '014_nrch', '279_spry', '321_pbs', '333_pbs', '450_spry', '283_spry']
varcallers = ['hc', 'pl', 'st']
combinations = list(product(names, varcallers))

paths = [f'filtered_tables/HL{i}.specific.annofilter.txt' for i in names]
col1 = 'ad_h'
col2 = 'ad_l' 

use_cols = ['Func.refGene', 'Gene.refGene',
       'GeneDetail.refGene', 'ExonicFunc.refGene', 'AAChange.refGene',
       'Otherinfo4', 'Otherinfo5', 'Otherinfo7', 'Otherinfo8']


This base directory will be used:
 /g/steinmetz/project/leonie_crispr/03_data/02_rnaseq/snakemake/


In [4]:
paths_anno = [f'{annodir}/HL_{i[1]}_{i[0]}.mm10_multianno.txt' for i in combinations]

In [5]:
li = []
for path in paths_anno:
    anno = pd.read_csv(path, delimiter = '\t', usecols = use_cols).rename(columns={'Otherinfo4':'chr', 'Otherinfo5':'pos', 'Otherinfo7':'ref', 'Otherinfo8':'alt'})
    li.append(anno)

anno = pd.concat(li, axis=0, ignore_index=True)
anno = anno.drop_duplicates()

In [6]:
def get_sets(df):
    setlist = np.arange(0, len(df))
    heart = setlist[df['heart']]
    liver = setlist[df['liver']]
    return heart, liver

In [7]:
def get_af(df, col1, col2):
    # determine allele ferquency and total number of reads per variant and sample
    mouse_ad = df[[col1, col2]]
    allel1 = mouse_ad.applymap(lambda x: int(x.split(',')[0])).to_numpy()
    allel2 = mouse_ad.applymap(lambda x: int(x.split(',')[1])).to_numpy()
    af = allel2 / (allel1 + allel2)
    af = np.nan_to_num(af, 0)
    tot_reads = allel1 + allel2
    return af, tot_reads

In [8]:
def annotate_duplicated(df_main, df_compare1, df_compare2):
    '''
    function to indicate wether variant occurs only in on sample or also other sample
    '''
    df_main_anno = pd.merge(df_main,df_compare1[['chr', 'pos', 'ref','alt', 'present_in_sample']], on =['chr', 'pos', 'ref','alt'], how = 'left')
    df_main_anno['present_in_sample_y'] = df_main_anno['present_in_sample_y'].fillna('')
    df_main_anno['present_in_sample_y'] = df_main_anno['present_in_sample_x'] +', ' + df_main_anno['present_in_sample_y']

    df_main_anno = pd.merge(df_main_anno,df_compare2[['chr', 'pos', 'ref','alt', 'present_in_sample']], on =['chr', 'pos', 'ref','alt'], how = 'left')
    df_main_anno['present_in_sample'] = df_main_anno['present_in_sample'].fillna('')
    df_main_anno['present_in_sample'] = df_main_anno['present_in_sample_y'] +', ' + df_main_anno['present_in_sample']

    df_main_anno = df_main_anno.drop(['present_in_sample_x', 'present_in_sample_y'], axis = 1)
    return(df_main_anno)

In [9]:

use_cols = ['Func.refGene', 'Gene.refGene',
       'GeneDetail.refGene', 'ExonicFunc.refGene', 'AAChange.refGene',
       'Otherinfo4', 'Otherinfo5', 'Otherinfo7', 'Otherinfo8']

sets = [['028_pbs_R', '029_pbs_R', '032_pbs_R'], ['033_nrch_R', '030_nrch_R', '036_nrch_R'],['014_nrch', '012_nrch', '013_nrch'], ['279_spry','450_spry', '283_spry'], ['011_pbs','333_pbs','321_pbs']]
for set in sets:
    '''
    For each sample, read tissue specific files, annotate in which other samples these variants occur,
    annotate and save as txt file
    '''
    names = set
    print(names)
    # read in tissue specific variants
    file_dict = {}
    for i in names:
        path = f'filtered_tables/HL{i}.specific.annofilter.txt'
        print(i)
        print(path)
        df = pd.read_csv(path, delimiter = '\t')
        file_dict[i] = df
    venn_dict = {}
    for i in names:
        venn_dict[i] = get_sets(file_dict[i])   

    annotated_heart_dict = {}
    annotated_liver_dict = {}

    for count, i in enumerate(names):
        #heart
        heart_spec = file_dict[i][file_dict[i]['heart'] == True]
        af, tot_reads = get_af(heart_spec, col1, col2) 
        heart_spec['AF_h'], heart_spec['AF_l'] = af[:, 0], af[:, 1]
        heart_spec['reads_h'], heart_spec['reads_l'] = tot_reads[:, 0], tot_reads[:, 1]
        heart_spec_3 = heart_spec.iloc[np.where(af[:,1]==0)]
        heart_file = heart_spec_3[['chr', 'pos', 'ref', 'alt', col1, col2, 'AF_h', 'AF_l', 'reads_h', 'reads_l', 'normed_h', 'normed_l']]

        #liver
        liver_spec = file_dict[i][file_dict[i]['liver'] == True]
        af, tot_reads = get_af(liver_spec, col1, col2) 
        liver_spec['AF_h'], liver_spec['AF_l'] = af[:, 0], af[:, 1]
        liver_spec['reads_h'], liver_spec['reads_l'] = tot_reads[:, 0], tot_reads[:, 1]
        liver_spec_3 = liver_spec.iloc[np.where(af[:,0]==0)]
        liver_file = liver_spec_3[['chr', 'pos', 'ref', 'alt', col1, col2, 'AF_h', 'AF_l', 'reads_h', 'reads_l', 'normed_h', 'normed_l']]
        
        # merge annotation with heart files
        merged_h = pd.merge(heart_file, anno, how = 'left',on = ['chr', 'pos', 'ref', 'alt'])
        merged_h.loc[:,'total_reads'] = merged_h.loc[:,'reads_h'] +  merged_h.loc[:,'reads_l']
        merged_h.loc[:,'present_in_sample'] =  names[count]
        annotated_heart_dict[names[count]] = merged_h

        # merge annotation with liver files
        merged = pd.merge(liver_file, anno, how = 'left',on = ['chr', 'pos', 'ref', 'alt'])
        merged.loc[:,'total_reads'] = merged.loc[:,'reads_h'] +  merged.loc[:,'reads_l']
        merged.loc[:,'present_in_sample'] =  names[count]
        annotated_liver_dict[names[count]] = merged

    # save heart files
    heart1 = annotate_duplicated(annotated_heart_dict[names[0]], annotated_heart_dict[names[1]], annotated_heart_dict[names[2]])
    heart1.to_csv(f'{outdir}/HC_PL_ST_heart_{names[0]}.txt', sep = '\t', index = False)
    heart2 = annotate_duplicated(annotated_heart_dict[names[1]], annotated_heart_dict[names[0]], annotated_heart_dict[names[2]])
    heart2.to_csv(f'{outdir}/HC_PL_ST_heart_{names[1]}.txt', sep = '\t', index = False)
    heart3 = annotate_duplicated(annotated_heart_dict[names[2]], annotated_heart_dict[names[0]], annotated_heart_dict[names[1]])
    heart3.to_csv(f'{outdir}/HC_PL_ST_heart_{names[2]}.txt', sep = '\t', index = False)
    
    # save liver files
    liver1 = annotate_duplicated(annotated_liver_dict[names[0]], annotated_liver_dict[names[1]], annotated_liver_dict[names[2]])
    liver1.to_csv(f'{outdir}/HC_PL_ST_liver_{names[0]}.txt', sep = '\t', index = False)
    liver2 = annotate_duplicated(annotated_liver_dict[names[1]], annotated_liver_dict[names[0]], annotated_liver_dict[names[2]])
    liver2.to_csv(f'{outdir}/HC_PL_ST_liver_{names[1]}.txt', sep = '\t', index = False)
    liver3 = annotate_duplicated(annotated_liver_dict[names[2]], annotated_liver_dict[names[0]], annotated_liver_dict[names[1]])
    liver3.to_csv(f'{outdir}/HC_PL_ST_liver_{names[2]}.txt', sep = '\t', index = False)


['028_pbs_R', '029_pbs_R', '032_pbs_R']
filtered_tables/HL028_pbs_R.specific.annofilter.txt
filtered_tables/HL029_pbs_R.specific.annofilter.txt
filtered_tables/HL030_nrch_R.specific.annofilter.txt


  af = allel2 / (allel1 + allel2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heart_spec['AF_h'], heart_spec['AF_l'] = af[:, 0], af[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heart_spec['AF_h'], heart_spec['AF_l'] = af[:, 0], af[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heart_spec['reads_h'], heart_spec['reads_l'] = tot_

['033_nrch_R', '030_nrch_R', '036_nrch_R']
filtered_tables/HL028_pbs_R.specific.annofilter.txt
filtered_tables/HL029_pbs_R.specific.annofilter.txt
filtered_tables/HL030_nrch_R.specific.annofilter.txt


  af = allel2 / (allel1 + allel2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heart_spec['AF_h'], heart_spec['AF_l'] = af[:, 0], af[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heart_spec['AF_h'], heart_spec['AF_l'] = af[:, 0], af[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heart_spec['reads_h'], heart_spec['reads_l'] = tot_

['014_nrch', '012_nrch', '013_nrch']
filtered_tables/HL028_pbs_R.specific.annofilter.txt
filtered_tables/HL029_pbs_R.specific.annofilter.txt
filtered_tables/HL030_nrch_R.specific.annofilter.txt


  af = allel2 / (allel1 + allel2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heart_spec['AF_h'], heart_spec['AF_l'] = af[:, 0], af[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heart_spec['AF_h'], heart_spec['AF_l'] = af[:, 0], af[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  heart_spec['reads_h'], heart_spec['reads_l'] = tot_