In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import io
import os

In [None]:
#partly taken from https://gist.github.com/dceoy/99d976a2c01e7f0ba1c813778f9db744
def read_vcf(path):
    '''
    Load a VCF file into a pandas dataframe.

    input: path to vcf file
    returns: pandas df from vcf; the '##' meta-information lines are skipped and
             the column '#CHROM' is renamed to 'CHROM'
    '''
    # Keep only the column header line ('#CHROM ...') and the records.
    with open(path, 'r') as handle:
        table_text = ''.join(line for line in handle if not line.startswith('##'))

    # Fixed dtypes for the eight mandatory VCF columns; any further columns
    # are left to pandas' type inference.
    column_types = {
        '#CHROM': str,
        'POS': int,
        'ID': str,
        'REF': str,
        'ALT': str,
        'QUAL': float,
        'FILTER': str,
        'INFO': str,
    }

    vcf_table = pd.read_csv(io.StringIO(table_text), sep='\t', dtype=column_types)
    return vcf_table.rename(columns={'#CHROM': 'CHROM'})

In [None]:
def read_INFO(df):
    '''
    input: pandas vcf dataframe (must contain the column INFO)
    returns: dataframe containing recoded column INFO from the vcf dataframe:
             one column per INFO key, values kept as strings, original index kept.
             Only records that carry every INFO key are returned; an empty
             dataframe is returned when there is no such record.

    Every record is parsed by key, so values end up in the right column even
    when records list their INFO fields in a different order.
    Flag entries without "=" (e.g. "DB") get the key itself as their value.
    '''
    def _parse_info(info):
        # Turn one INFO string into an ordered {key: value} dict.
        if not isinstance(info, str):
            # missing INFO (NaN/None) carries no fields
            return {}
        fields = {}
        for entry in info.split(";"):
            if not entry:
                continue
            # partition keeps values that themselves contain "=" intact
            key, sep, value = entry.partition("=")
            fields[key] = value if sep else key
        return fields

    records = [_parse_info(info) for info in df["INFO"]]
    parsed = pd.DataFrame(records, index=df.index)
    if parsed.shape[1] == 0:
        # no INFO key found at all
        return parsed.iloc[0:0]

    # rows that have a value for every key seen in the file
    complete = parsed.notna().all(axis=1).to_numpy()
    if not complete.any():
        return parsed.iloc[0:0]

    # column order follows the first complete record
    column_order = list(records[int(complete.argmax())])
    return parsed.loc[complete, column_order]

In [None]:
def number_of_heterozygots(df, column):
    '''
    Count how often each genotype call occurs for one sample.

    input: pandas vcf dataframe, name of the sample column
    returns: value counts of the first ":"-separated field of the sample
             column (0/0, 0/1, ...); missing entries are not counted
    '''
    # The text before the first ":" of a sample entry is what gets counted.
    sample_fields = df[column].str.split(":", expand=True)
    genotype_calls = sample_fields[0].dropna()
    return genotype_calls.value_counts()

In [None]:
def numeric_INFO(INFO):
    '''
    Convert the numeric annotation columns of an INFO dataframe to numbers.

    input: dataframe with the INFO columns listed below (modified in place)
    returns: None
    '''
    numeric_fields = (
        'MQ',
        'AN',
        'BaseQRankSum',
        'ClippingRankSum',
        'ExcessHet',
        'FS',
        'MQRankSum',
        'QD',
        'ReadPosRankSum',
        'SOR',
    )
    # Columns are converted one after another in this order; a missing
    # column raises KeyError at that point.
    for field in numeric_fields:
        INFO[field] = pd.to_numeric(INFO[field])
    

In [None]:
def plot_dist(df, column, threshold):
    '''
    Plot the distribution (KDE) of the metric with the threshold marked as a
    red vertical line

    input: pandas dataframe, name of the metric column, threshold (x position of the line)
    returns: None

    Default thresholds GATK hard filtering (GATK 4.0 VariantFiltration):
        For indels:
            QD < 2.0
            QUAL < 30.0
            FS > 200.0
            ReadPosRankSum < -20.0
            
        For SNPs:
            QD < 2.0
            QUAL < 30.0
            SOR > 3.0
            FS > 60.0
            MQ < 40.0
            MQRankSum < -12.5
            ReadPosRankSum < -8.0

    '''

    # Pass the whole dataframe as data so that x=column is looked up as a column name.
    sns.displot(data=df, x=column, kind="kde")
    # Vertical line spanning the full height of the axes at the threshold.
    plt.axvline(threshold, 0, 1, color = 'red')