# Processing of bcftools vcf ('samtools')

In [1]:
import pandas as pd
import numpy as np
import io
import os
import gzip
import sys

### Plan:
1. Variant calling
2. Filtering
3. Annotation

# 1. Variant calling

```
REF=/path/to/ref/S288C_reference_sequence_R64-3-1_20210421_chr.fasta
BAMS=bams_list_LL13.txt
# Pile up every BAM listed in ${BAMS} and call variants with a haploid model.
# --min-MQ / --min-BQ are long options and need a double dash; with a single
# dash they are not parsed as the mapping-/base-quality thresholds.
bcftools mpileup -C50 -f ${REF} --min-MQ 4 --min-BQ 13 --skip-any-set 1796 -a FORMAT/AD,FORMAT/ADR,FORMAT/ADF,FORMAT/DP -Ou -b ${BAMS} | \
bcftools call --ploidy 1 -mv -f gq -Oz -o snp_bcftools_LL13.vcf.gz -
tabix -p vcf snp_bcftools_LL13.vcf.gz
```

# 2. Filtering

```
TMP=../DATA_temp
# NOTE(review): SITES is defined but not used by any command shown in this block.
SITES=../DATA/S288C_reference_sequence_R64-3-1_mod_noRepeats.bed

for bg in LL13 NC02
  do
  echo $bg

  # Sample filters
  # 1. Masking genotypes with DP < 4 (mask) and GT 2 or 3; 
  # 2. Allelic depth (AD) of the second most common variant within genotype is less than 4
  # 3. Allelic depth of the second most frequent variant to most frequent variant is less than 0.2

  python filterAD2.py ../DATA/snp_bcftools_${bg}.vcf.gz
  mv ../DATA/snp_bcftools_${bg}_filterAD.vcf $TMP/snp_bcftools_${bg}_filterAD.vcf
  # Compress in place (produces <file>.vcf.gz); no extra operand is needed
  bgzip -f $TMP/snp_bcftools_${bg}_filterAD.vcf
  tabix -f -p vcf $TMP/snp_bcftools_${bg}_filterAD.vcf.gz

  # SNP filters
  # 1. Mean per sample coverage is higher than 10
  # 2. Max total DP is less than 20000
  # 3. SNP QUAL is higher than 20
  # 4. Mapping quality MQ is higher than 40
  # 5. Fixed alleles removed (AF>0.99 or AC==0)

  # -e EXCLUDES sites matching the expression, i.e. the complements of the criteria listed above
  bcftools filter -e "AVG(FORMAT/DP)<10 || INFO/DP>20000 || QUAL<20 || MQ<40 || (AF > 0.99) || (AC == 0)" -Ou $TMP/snp_bcftools_${bg}_filterAD.vcf.gz | bcftools view -m2 -Oz -o $TMP/snp_bcftools_${bg}_FLT4.vcf.gz -
  bcftools stats $TMP/snp_bcftools_${bg}_FLT4.vcf.gz > snp_bcftools_${bg}_FLT4.stats
  tabix -f -p vcf $TMP/snp_bcftools_${bg}_FLT4.vcf.gz
  
  done
```
The Python filtering script (invoked above as `filterAD2.py`) is available here: [filterAD2_bcftools.py](filterAD2_bcftools.py)

# 3. Annotation

#### Annotating with snpEff and *S. cerevisiae* genome vR64-3-1 and splitting into SNPs and INDELs
```
conda activate snpeff-5.0
# SnpEff version SnpEff 5.0e (build 2021-03-09 06:01), by Pablo Cingolani
# NOTE(review): snpEff_DIR is defined but not used by the commands below.
snpEff_DIR=/home/anna/soft/snpEff
TMP=../DATA_temp
CONFIG=/home/software/snpEff/snpEff.config

for bg in LL13 NC02
  do

  # SnpEff
  snpEff -c $CONFIG R64.3.1 $TMP/snp_bcftools_${bg}_FLT4.vcf.gz > $TMP/snp_bcftools_${bg}_snpEff.vcf
  bgzip -f $TMP/snp_bcftools_${bg}_snpEff.vcf
  tabix -f -p vcf $TMP/snp_bcftools_${bg}_snpEff.vcf.gz

  # Saving table
  # NOTE(review): the doubled \t emits an empty column between AC and ANN - confirm this is intended.
  bcftools query -i '%TYPE="snp"' -f '%CHROM\t%POS\t%REF\t%ALT\t%AC\t\t%ANN\n' -o snp_bcftools_${bg}_SNP_snpEff.tab $TMP/snp_bcftools_${bg}_snpEff.vcf.gz
  bcftools query -i '%TYPE="indel"' -f '%CHROM\t%POS\t%REF\t%ALT\t%AC\t\t%ANN\n' -o snp_bcftools_${bg}_INDEL_snpEff.tab $TMP/snp_bcftools_${bg}_snpEff.vcf.gz
  # NOTE(review): the summary-table cells read ../DATA/snp_bcftools_${bg}_snpEff_SNP.vcf.gz and
  # ..._snpEff_INDEL.vcf.gz, which none of the commands shown here create - confirm how they were produced.
  cp $TMP/snp_bcftools_${bg}_snpEff.vcf.gz* ../DATA/

  # Close the per-background loop (the terminator was missing)
  done
```

# 4. Getting a summary table

In [2]:
# snpEff effect types that qualify a secondary annotation for reporting:
# stackMutations / stackMutationsWithAnn always keep the first ANN entry of a
# variant and add later entries only when their effect is one of these.
important_categories = ['synonymous_variant','stop_gained','missense_variant']

#### Function for reading vcf

In [3]:
def readVcfGzip(filepath):
    """Read a gzip-compressed VCF file into a pandas DataFrame.

    Lines starting with '##' (VCF meta-information) are skipped. The first
    remaining line (the '#CHROM ...' header) supplies the column names, and
    the rest is parsed as tab-separated text.

    Parameters
    ----------
    filepath : str or path-like
        Path to the gzip-compressed VCF.

    Returns
    -------
    pandas.DataFrame
        One row per VCF record, one column per VCF header field.
    """
    # The context manager closes the handle even if reading fails
    # (the handle was previously left open).
    with gzip.open(filepath, 'rt') as fh:
        lines = [line for line in fh if not line.startswith('##')]
    return pd.read_csv(io.StringIO(''.join(lines)), sep='\t')

#### Function for formatting vcf into stacked table

In [4]:
def stackMutations(vcf_file):
    """Turn a snpEff-annotated VCF into a long table of alternate-allele calls.

    Each record's INFO/ANN value is split into its comma-separated annotations,
    whose '|'-separated fields are read as: [0] allele, [1] effect, [3] gene
    (a trailing "_CDS" is stripped). The first annotation is always kept; later
    ones are kept only if their effect is listed in the module-level
    `important_categories` and the (allele:effect, gene) pair is new.

    Parameters
    ----------
    vcf_file : str
        Path to a gzip-compressed VCF, read with `readVcfGzip`. Every record
        must carry an ANN entry in INFO (otherwise IndexError is raised).

    Returns
    -------
    pandas.DataFrame
        Columns #CHROM, POS, GENE, mutID, sample, snp; one row per
        (variant, sample) whose genotype (the text before the first ':' of the
        sample field) is exactly '1'. `sample` is the sample column header with
        any directory prefix and everything after the first '.' removed.
    """
    df_vcf = readVcfGzip(vcf_file)

    # Formatting snpEff annotations & mutation
    mut_labels, gene_labels = [], []
    for info in df_vcf['INFO'].values.tolist():
        # Split on the first '=' only, so an '=' inside the annotation text
        # cannot truncate the ANN value.
        ann_value = [field.split('=', 1)[1] for field in info.split(';') if field.startswith('ANN=')][0]
        annotations = ann_value.split(',')

        first = annotations[0].split('|')
        muts = [(":".join([first[0], first[1]]), first[3].replace("_CDS", ""))]
        for annotation in annotations[1:]:
            fields = annotation.split('|')
            mut = ":".join([fields[0], fields[1]]), fields[3].replace("_CDS", "")
            if fields[1] in important_categories and mut not in muts:
                muts.append(mut)
        mut_labels.append('/'.join(m[0] for m in muts))
        gene_labels.append('/'.join(m[1] for m in muts))

    df_vcf['MUT'] = mut_labels
    df_vcf['GENE'] = gene_labels
    # Column-wise concatenation: also works for an empty VCF and for
    # chromosome names that pandas parsed as numbers.
    df_vcf['mutID'] = (
        df_vcf["#CHROM"].astype(str) + ":" + df_vcf["POS"].astype(str) + ":"
        + df_vcf["REF"].astype(str) + ":" + df_vcf["MUT"]
    )

    # Formatting samples: every column that is not an index key is a sample
    index_cols = ["#CHROM", "POS", "REF", "ALT", "QUAL", "GENE", "mutID"]
    df_stack = (
        df_vcf.drop(columns=["ID", "FILTER", "INFO", "FORMAT", "MUT"])
        .set_index(index_cols)
        .stack()
        .reset_index()
        # reset_index names the unnamed stacked level "level_<number of index keys>"
        .rename(columns={"level_%d" % len(index_cols): "sampleID", 0: "GT"})
    )
    df_stack['sample'] = df_stack['sampleID'].apply(lambda x: x.split('/')[-1].split('.')[0])
    # str() guards against sample fields that pandas parsed as plain numbers
    df_stack['snp'] = df_stack['GT'].apply(lambda x: 1 if str(x).split(':')[0] == '1' else 0)
    df_stack = df_stack.drop(columns=["REF", "ALT", "QUAL", "sampleID", "GT"])
    df_stack = df_stack[df_stack['snp'] == 1].reset_index(drop=True)
    return df_stack

In [51]:
def stackMutationsWithAnn(vcf_file):
    """Like `stackMutations`, but keeps REF/ALT/GT and the full snpEff annotation.

    Each record's INFO/ANN value is split into its comma-separated annotations,
    whose '|'-separated fields are read as: [0] allele, [1] effect, [3] gene
    (a trailing "_CDS" is stripped). Two summaries are built per record:

    * GENE / mutID: the first annotation is always kept; later ones are kept
      only if their effect is listed in the module-level `important_categories`
      and the (allele:effect, gene) pair is new.
    * MUT_ALL: "allele:effect:gene" for every annotation, de-duplicated in
      order of appearance and joined with '/'.

    Parameters
    ----------
    vcf_file : str
        Path to a gzip-compressed VCF, read with `readVcfGzip`. Every record
        must carry an ANN entry in INFO (otherwise IndexError is raised).

    Returns
    -------
    pandas.DataFrame
        Columns #CHROM, POS, REF, ALT, GENE, mutID, MUT_ALL, ANN, GT, sample,
        snp; one row per (variant, sample) whose genotype (the text before the
        first ':' of the sample field) is exactly '1'. `sample` is the sample
        column header with any directory prefix and everything after the first
        '.' removed; ANN is the raw ANN value.
    """
    df_vcf = readVcfGzip(vcf_file)

    # Formatting snpEff annotations & mutation (single pass over INFO)
    mut_labels, gene_labels, all_mut_labels, ann_values = [], [], [], []
    for info in df_vcf['INFO'].values.tolist():
        # Split on the first '=' only, so an '=' inside the annotation text
        # cannot truncate the ANN value that is stored in the ANN column.
        ann_value = [field.split('=', 1)[1] for field in info.split(';') if field.startswith('ANN=')][0]
        ann_values.append(ann_value)

        muts = []      # (allele:effect, gene) pairs -> MUT / GENE
        all_muts = []  # allele:effect:gene for every annotation -> MUT_ALL
        for position, annotation in enumerate(ann_value.split(',')):
            fields = annotation.split('|')
            gene = fields[3].replace("_CDS", "")

            mut = ":".join([fields[0], fields[1]]), gene
            # The first annotation is reported unconditionally
            if position == 0 or (fields[1] in important_categories and mut not in muts):
                muts.append(mut)

            full_mut = ":".join([fields[0], fields[1], gene])
            if full_mut not in all_muts:
                all_muts.append(full_mut)

        mut_labels.append('/'.join(m[0] for m in muts))
        gene_labels.append('/'.join(m[1] for m in muts))
        all_mut_labels.append('/'.join(all_muts))

    df_vcf['MUT'] = mut_labels
    df_vcf['GENE'] = gene_labels
    df_vcf['MUT_ALL'] = all_mut_labels
    df_vcf['ANN'] = ann_values
    # Column-wise concatenation: also works for an empty VCF and for
    # chromosome names that pandas parsed as numbers.
    df_vcf['mutID'] = (
        df_vcf["#CHROM"].astype(str) + ":" + df_vcf["POS"].astype(str) + ":"
        + df_vcf["REF"].astype(str) + ":" + df_vcf["MUT"]
    )

    # Formatting samples: every column that is not an index key is a sample
    index_cols = ["#CHROM", "POS", "REF", "ALT", "QUAL", "GENE", "mutID", "MUT_ALL", "ANN"]
    df_stack = (
        df_vcf.drop(columns=["ID", "FILTER", "INFO", "FORMAT", "MUT"])
        .set_index(index_cols)
        .stack()
        .reset_index()
        # reset_index names the unnamed stacked level "level_<number of index keys>"
        .rename(columns={"level_%d" % len(index_cols): "sampleID", 0: "GT"})
    )
    df_stack['sample'] = df_stack['sampleID'].apply(lambda x: x.split('/')[-1].split('.')[0])
    # str() guards against sample fields that pandas parsed as plain numbers
    df_stack['snp'] = df_stack['GT'].apply(lambda x: 1 if str(x).split(':')[0] == '1' else 0)
    df_stack = df_stack.drop(columns=["QUAL", "sampleID"])
    df_stack = df_stack[df_stack['snp'] == 1].reset_index(drop=True)
    return df_stack

# SNPs

#### Combining two backgrounds together

In [52]:
# Annotated SNP calls, one VCF per background
vcf1 = "../DATA/snp_bcftools_LL13_snpEff_SNP.vcf.gz"
vcf2 = "../DATA/snp_bcftools_NC02_snpEff_SNP.vcf.gz"

# LL13: drop every site that is also called in sample H13
# (presumably the parental / wild-type strain, going by the variable names)
df_vcf1 = stackMutationsWithAnn(vcf1)
df_vcf1['background'] = "LL13-040"
parent_variants_LL13 = (
    df_vcf1[df_vcf1['sample'].isin(['H13'])][['#CHROM','POS']]
    .reset_index(drop=True)
)
# Anti-join: rows flagged 'left_only' have no matching site in the parent
df_vcf1_merged = df_vcf1.merge(
    parent_variants_LL13, how='outer', on=['#CHROM','POS'], indicator=True
)
df_vcf1_noWT = df_vcf1_merged.loc[df_vcf1_merged['_merge'] == 'left_only'].reset_index(drop=True)

# NC02: same procedure, with E23 as the reference sample
df_vcf2 = stackMutationsWithAnn(vcf2)
df_vcf2['background'] = "NC-02"
parent_variants_NC02 = (
    df_vcf2[df_vcf2['sample'].isin(['E23'])][['#CHROM','POS']]
    .reset_index(drop=True)
)
df_vcf2_merged = df_vcf2.merge(
    parent_variants_NC02, how='outer', on=['#CHROM','POS'], indicator=True
)
df_vcf2_noWT = df_vcf2_merged.loc[df_vcf2_merged['_merge'] == 'left_only'].reset_index(drop=True)

# Merged
df = pd.concat([df_vcf1_noWT, df_vcf2_noWT], ignore_index=True)
print(df.shape)
df.head()

(4348, 13)


Unnamed: 0,#CHROM,POS,REF,ALT,GENE,mutID,MUT_ALL,ANN,GT,sample,snp,background,_merge
0,chrI,77527,A,G,YAL035W,chrI:77527:A:G:synonymous_variant,G:synonymous_variant:YAL035W/G:upstream_gene_v...,G|synonymous_variant|LOW|YAL035W_CDS|GENE_YAL0...,"1:42,0:4:0,1:0,3:0,4:41",F4,1,LL13-040,left_only
1,chrI,77527,A,G,YAL035W,chrI:77527:A:G:synonymous_variant,G:synonymous_variant:YAL035W/G:upstream_gene_v...,G|synonymous_variant|LOW|YAL035W_CDS|GENE_YAL0...,"1:86,0:8:0,3:0,5:0,8:85",F5,1,LL13-040,left_only
2,chrI,77527,A,G,YAL035W,chrI:77527:A:G:synonymous_variant,G:synonymous_variant:YAL035W/G:upstream_gene_v...,G|synonymous_variant|LOW|YAL035W_CDS|GENE_YAL0...,"1:55,0:8:1,6:0,1:1,7:54",J2,1,LL13-040,left_only
3,chrI,77527,A,G,YAL035W,chrI:77527:A:G:synonymous_variant,G:synonymous_variant:YAL035W/G:upstream_gene_v...,G|synonymous_variant|LOW|YAL035W_CDS|GENE_YAL0...,"1:45,1:7:0,5:1,1:1,6:43",L2,1,LL13-040,left_only
4,chrI,77527,A,G,YAL035W,chrI:77527:A:G:synonymous_variant,G:synonymous_variant:YAL035W/G:upstream_gene_v...,G|synonymous_variant|LOW|YAL035W_CDS|GENE_YAL0...,"1:63,0:7:0,2:1,3:1,5:62",M4,1,LL13-040,left_only


In [53]:
# Collapse GENE labels that repeat one gene (e.g. "YHR128W/YHR128W" -> "YHR128W");
# labels naming more than one distinct gene are kept unchanged.
df['NGENE'] = df['GENE'].map(
    lambda genes: genes.split('/')[0] if len(set(genes.split('/'))) == 1 else genes
)
#df[df['GENE'] == "YHR128W/YHR128W"].head()

#### Getting gene descriptions from SGD

#### Reading gene names extracted from SGD for corresponding gene IDs 

In [54]:
# SGD gene list: tab-separated; the column names are supplied here through `names`
dg = pd.read_csv('../../DATA/S288C_reference_genome_R64-3-1_20210421/gene_list_SGD.tsv',sep="\t",names=['ID','Gene','Species','Symbol','Name'])
dg.head()

Unnamed: 0,ID,Gene,Species,Symbol,Name
0,S000000001,YAL001C,S. cerevisiae,TFC3,Transcription Factor class C
1,S000000002,YAL002W,S. cerevisiae,VPS8,Vacuolar Protein Sorting
2,S000000003,YAL003W,S. cerevisiae,EFB1,Elongation Factor Beta
3,S000000004,YAL005C,S. cerevisiae,SSA1,Stress-Seventy subfamily A
4,S000000005,YAL007C,S. cerevisiae,ERP2,Emp24p/Erv25p Related Protein


#### Adding missing gene IDs

In [55]:
# Split every distinct NGENE label into the gene IDs it contains ("A/B" -> ["A", "B"])
dictG = {ele: np.array(ele.split('/')) for ele in df['NGENE'].drop_duplicates().values}
# np.concatenate raises on an empty sequence, so guard the no-variants case
all_gene_ids = np.concatenate(list(dictG.values())) if dictG else np.array([], dtype=str)

# Gene IDs absent from the SGD table get a placeholder row, so the lookups below
# cannot raise KeyError. Membership is checked against a set built once instead
# of re-scanning dg['Gene'] for every ID.
known_gene_ids = set(dg['Gene'].values)
missing_IDs = [ele for ele in all_gene_ids if ele not in known_gene_ids]
missing_dg = pd.DataFrame({"ID":"SXX","Gene":missing_IDs,"Species":"S. cerevisiae","Symbol":"UNK","Name":"UNK"})
dg_combined = pd.concat([dg, missing_dg], ignore_index = True)

# Gene ID -> symbol / name lookups; missing values become 'UNK'
gene_symbol = {a:b for a,b in dg_combined.get(['Gene','Symbol']).fillna('UNK').values.tolist()}
gene_name = {a:b for a,b in dg_combined.get(['Gene','Name']).fillna('UNK').values.tolist()}

# Translate each '/'-separated gene ID of NGENE, keeping the '/' layout
df['GENE_SYMBOL'] = df['NGENE'].apply(lambda x: '/'.join([gene_symbol[i] for i in x.split('/')]))
df['GENE_NAME'] = df['NGENE'].apply(lambda x: '/'.join([gene_name[i] for i in x.split('/')]))
print(df.shape)
df.to_csv('samtools_variants_SNP.tab',sep='\t',header=True,index=True)
df.head()

(4348, 16)


Unnamed: 0,#CHROM,POS,REF,ALT,GENE,mutID,MUT_ALL,ANN,GT,sample,snp,background,_merge,NGENE,GENE_SYMBOL,GENE_NAME
0,chrI,77527,A,G,YAL035W,chrI:77527:A:G:synonymous_variant,G:synonymous_variant:YAL035W/G:upstream_gene_v...,G|synonymous_variant|LOW|YAL035W_CDS|GENE_YAL0...,"1:42,0:4:0,1:0,3:0,4:41",F4,1,LL13-040,left_only,YAL035W,FUN12,Function Unknown Now
1,chrI,77527,A,G,YAL035W,chrI:77527:A:G:synonymous_variant,G:synonymous_variant:YAL035W/G:upstream_gene_v...,G|synonymous_variant|LOW|YAL035W_CDS|GENE_YAL0...,"1:86,0:8:0,3:0,5:0,8:85",F5,1,LL13-040,left_only,YAL035W,FUN12,Function Unknown Now
2,chrI,77527,A,G,YAL035W,chrI:77527:A:G:synonymous_variant,G:synonymous_variant:YAL035W/G:upstream_gene_v...,G|synonymous_variant|LOW|YAL035W_CDS|GENE_YAL0...,"1:55,0:8:1,6:0,1:1,7:54",J2,1,LL13-040,left_only,YAL035W,FUN12,Function Unknown Now
3,chrI,77527,A,G,YAL035W,chrI:77527:A:G:synonymous_variant,G:synonymous_variant:YAL035W/G:upstream_gene_v...,G|synonymous_variant|LOW|YAL035W_CDS|GENE_YAL0...,"1:45,1:7:0,5:1,1:1,6:43",L2,1,LL13-040,left_only,YAL035W,FUN12,Function Unknown Now
4,chrI,77527,A,G,YAL035W,chrI:77527:A:G:synonymous_variant,G:synonymous_variant:YAL035W/G:upstream_gene_v...,G|synonymous_variant|LOW|YAL035W_CDS|GENE_YAL0...,"1:63,0:7:0,2:1,3:1,5:62",M4,1,LL13-040,left_only,YAL035W,FUN12,Function Unknown Now


# INDELs

#### Combining two backgrounds

In [60]:
# Annotated INDEL calls, one VCF per background
vcf1 = "../DATA/snp_bcftools_LL13_snpEff_INDEL.vcf.gz"
vcf2 = "../DATA/snp_bcftools_NC02_snpEff_INDEL.vcf.gz"

# LL13: drop every site that is also called in sample H13
# (presumably the parental / wild-type strain, going by the variable names)
df_vcf1 = stackMutationsWithAnn(vcf1)
df_vcf1['background'] = "LL13-040"
parent_variants_LL13 = (
    df_vcf1[df_vcf1['sample'].isin(['H13'])][['#CHROM','POS']]
    .reset_index(drop=True)
)
# Anti-join: rows flagged 'left_only' have no matching site in the parent
df_vcf1_merged = df_vcf1.merge(
    parent_variants_LL13, how='outer', on=['#CHROM','POS'], indicator=True
)
df_vcf1_noWT = df_vcf1_merged.loc[df_vcf1_merged['_merge'] == 'left_only'].reset_index(drop=True)

# NC02: same procedure, with E23 as the reference sample
df_vcf2 = stackMutationsWithAnn(vcf2)
df_vcf2['background'] = "NC-02"
parent_variants_NC02 = (
    df_vcf2[df_vcf2['sample'].isin(['E23'])][['#CHROM','POS']]
    .reset_index(drop=True)
)
df_vcf2_merged = df_vcf2.merge(
    parent_variants_NC02, how='outer', on=['#CHROM','POS'], indicator=True
)
df_vcf2_noWT = df_vcf2_merged.loc[df_vcf2_merged['_merge'] == 'left_only'].reset_index(drop=True)

# Merged
df = pd.concat([df_vcf1_noWT, df_vcf2_noWT], ignore_index=True)
print(df.shape)

# Gene name: collapse labels that repeat one gene ("A/A" -> "A"); keep the rest unchanged
df['NGENE'] = df['GENE'].map(
    lambda genes: genes.split('/')[0] if len(set(genes.split('/'))) == 1 else genes
)
df.head()

(8461, 13)


Unnamed: 0,#CHROM,POS,REF,ALT,GENE,mutID,MUT_ALL,ANN,GT,sample,snp,background,_merge,NGENE
0,chrI,6736,CAAAAAAAAAAAAAAAAAAA,"CAAAAAAAAAAAAAAAAAAAA,CAAAAAAAAAAAAAAAAAAAAA",YAL068C,chrI:6736:CAAAAAAAAAAAAAAAAAAA:CAAAAAAAAAAAAAA...,CAAAAAAAAAAAAAAAAAAAA:upstream_gene_variant:YA...,CAAAAAAAAAAAAAAAAAAAA|upstream_gene_variant|MO...,"1:106,0,101:13:0,8,1:1,3,0:1,11,1:127",B13,1,LL13-040,left_only,YAL068C
1,chrI,6736,CAAAAAAAAAAAAAAAAAAA,"CAAAAAAAAAAAAAAAAAAAA,CAAAAAAAAAAAAAAAAAAAAA",YAL068C,chrI:6736:CAAAAAAAAAAAAAAAAAAA:CAAAAAAAAAAAAAA...,CAAAAAAAAAAAAAAAAAAAA:upstream_gene_variant:YA...,CAAAAAAAAAAAAAAAAAAAA|upstream_gene_variant|MO...,"1:87,0,101:8:1,7,0:0,0,0:1,7,0:127",F12,1,LL13-040,left_only,YAL068C
2,chrI,13051,GTGACTAGAGAAGAAGTGACTGAGGAAGAAATGACTAGAGAAGAAG...,GTGACTAGAGAAGAAGTGTCTGAGGAAGAAAT,YAL067C,chrI:13051:GTGACTAGAGAAGAAGTGACTGAGGAAGAAATGAC...,GTGACTAGAGAAGAAGTGTCTGAGGAAGAAAT:upstream_gene...,GTGACTAGAGAAGAAGTGTCTGAGGAAGAAAT|upstream_gene...,"1:119,11:7:1,0:0,6:1,6:109",B11,1,LL13-040,left_only,YAL067C
3,chrI,31500,CACACATATATA,CATATA,YAL063C,chrI:31500:CACACATATATA:CATATA:upstream_gene_v...,CATATA:upstream_gene_variant:YAL063C/CATATA:up...,CATATA|upstream_gene_variant|MODIFIER|YAL063C_...,"1:150,71:16:2,7:0,4:2,11:76",B6,1,LL13-040,left_only,YAL063C
4,chrI,31500,CACACATATATA,CATATA,YAL063C,chrI:31500:CACACATATATA:CATATA:upstream_gene_v...,CATATA:upstream_gene_variant:YAL063C/CATATA:up...,CATATA|upstream_gene_variant|MODIFIER|YAL063C_...,"1:104,0:13:1,9:1,1:2,10:101",C12,1,LL13-040,left_only,YAL063C


In [61]:
# Distinct variant sites; the result is neither assigned nor (not being the
# cell's last expression) displayed.
df[["#CHROM","POS"]].drop_duplicates()
# Indel calls assigned to gene YHR128W (the recorded output has no rows)
df[df['NGENE'] == "YHR128W"]

Unnamed: 0,#CHROM,POS,REF,ALT,GENE,mutID,MUT_ALL,ANN,GT,sample,snp,background,_merge,NGENE


#### SGD gene names

In [62]:
# SGD gene list: tab-separated; the column names are supplied here through `names`
dg = pd.read_csv('../../DATA/S288C_reference_genome_R64-3-1_20210421/gene_list_SGD.tsv',sep="\t",names=['ID','Gene','Species','Symbol','Name'])

# Split every distinct NGENE label into the gene IDs it contains ("A/B" -> ["A", "B"])
dictG = {ele: np.array(ele.split('/')) for ele in df['NGENE'].drop_duplicates().values}
# np.concatenate raises on an empty sequence, so guard the no-variants case
all_gene_ids = np.concatenate(list(dictG.values())) if dictG else np.array([], dtype=str)

# Gene IDs absent from the SGD table get a placeholder row, so the lookups below
# cannot raise KeyError. Membership is checked against a set built once instead
# of re-scanning dg['Gene'] for every ID.
known_gene_ids = set(dg['Gene'].values)
missing_IDs = [ele for ele in all_gene_ids if ele not in known_gene_ids]
missing_dg = pd.DataFrame({"ID":"SXX","Gene":missing_IDs,"Species":"S. cerevisiae","Symbol":"UNK","Name":"UNK"})
dg_combined = pd.concat([dg, missing_dg], ignore_index = True)

# Gene ID -> symbol / name lookups; missing values become 'UNK'
gene_symbol = {a:b for a,b in dg_combined.get(['Gene','Symbol']).fillna('UNK').values.tolist()}
gene_name = {a:b for a,b in dg_combined.get(['Gene','Name']).fillna('UNK').values.tolist()}

# Translate each '/'-separated gene ID of NGENE, keeping the '/' layout
df['GENE_SYMBOL'] = df['NGENE'].apply(lambda x: '/'.join([gene_symbol[i] for i in x.split('/')]))
df['GENE_NAME'] = df['NGENE'].apply(lambda x: '/'.join([gene_name[i] for i in x.split('/')]))
print(df.shape)
df.to_csv('samtools_variants_INDEL.tab',sep='\t',header=True,index=True)
df.head()

(8461, 16)


Unnamed: 0,#CHROM,POS,REF,ALT,GENE,mutID,MUT_ALL,ANN,GT,sample,snp,background,_merge,NGENE,GENE_SYMBOL,GENE_NAME
0,chrI,6736,CAAAAAAAAAAAAAAAAAAA,"CAAAAAAAAAAAAAAAAAAAA,CAAAAAAAAAAAAAAAAAAAAA",YAL068C,chrI:6736:CAAAAAAAAAAAAAAAAAAA:CAAAAAAAAAAAAAA...,CAAAAAAAAAAAAAAAAAAAA:upstream_gene_variant:YA...,CAAAAAAAAAAAAAAAAAAAA|upstream_gene_variant|MO...,"1:106,0,101:13:0,8,1:1,3,0:1,11,1:127",B13,1,LL13-040,left_only,YAL068C,PAU8,seriPAUperin
1,chrI,6736,CAAAAAAAAAAAAAAAAAAA,"CAAAAAAAAAAAAAAAAAAAA,CAAAAAAAAAAAAAAAAAAAAA",YAL068C,chrI:6736:CAAAAAAAAAAAAAAAAAAA:CAAAAAAAAAAAAAA...,CAAAAAAAAAAAAAAAAAAAA:upstream_gene_variant:YA...,CAAAAAAAAAAAAAAAAAAAA|upstream_gene_variant|MO...,"1:87,0,101:8:1,7,0:0,0,0:1,7,0:127",F12,1,LL13-040,left_only,YAL068C,PAU8,seriPAUperin
2,chrI,13051,GTGACTAGAGAAGAAGTGACTGAGGAAGAAATGACTAGAGAAGAAG...,GTGACTAGAGAAGAAGTGTCTGAGGAAGAAAT,YAL067C,chrI:13051:GTGACTAGAGAAGAAGTGACTGAGGAAGAAATGAC...,GTGACTAGAGAAGAAGTGTCTGAGGAAGAAAT:upstream_gene...,GTGACTAGAGAAGAAGTGTCTGAGGAAGAAAT|upstream_gene...,"1:119,11:7:1,0:0,6:1,6:109",B11,1,LL13-040,left_only,YAL067C,SEO1,Suppressor of sulfoxyde EthiOnine resistance
3,chrI,31500,CACACATATATA,CATATA,YAL063C,chrI:31500:CACACATATATA:CATATA:upstream_gene_v...,CATATA:upstream_gene_variant:YAL063C/CATATA:up...,CATATA|upstream_gene_variant|MODIFIER|YAL063C_...,"1:150,71:16:2,7:0,4:2,11:76",B6,1,LL13-040,left_only,YAL063C,FLO9,FLOcculation
4,chrI,31500,CACACATATATA,CATATA,YAL063C,chrI:31500:CACACATATATA:CATATA:upstream_gene_v...,CATATA:upstream_gene_variant:YAL063C/CATATA:up...,CATATA|upstream_gene_variant|MODIFIER|YAL063C_...,"1:104,0:13:1,9:1,1:2,10:101",C12,1,LL13-040,left_only,YAL063C,FLO9,FLOcculation
