# SNP Annotation

The following steps have to be repeated for every chromosome and every plasmid!

What we need for these steps:
- Data:
    - the variant call files (`.vcf`) from the different callers
    - the `.fasta` files for the reference genome
    - the genome annotation files for the reference genome (`.gff` or `.gff3`)
- Tools:
    - Python version 3.10.10
    - Python scripts available from the directory \PythonScripts

In [1]:
import sys
sys.path.append("PythonScripts")

In [2]:
import ReadFiles as Rf
import vcf
import fasta
import gff
import re
import translation_table

In [3]:
# Load the raw inputs for chromosome 1: the reference sequence, its genome
# annotation, and the filtered variant calls of the three callers
# (BCFtools, Freebayes, LoFreq).
fasta_file = Rf.ReadFile("../Data/RefGenome/Chr1.fasta")
gff_file = Rf.ReadFile("../Data/Gff3/Chr1.gff3")
vcf_bcf = Rf.ReadFile("../Data/SNPCalls/BCFTools/Filtered/Chr1.vcf")
vcf_fb = Rf.ReadFile("../Data/SNPCalls/Freebayes/Filtered/Chr1.vcf")
vcf_lf = Rf.ReadFile("../Data/SNPCalls/Lofreq/Filtered/Chr1.vcf")

First we need to reformat the `.fasta` file with the function `fasta.FastaCleanup(fasta_file)`

In [4]:
# Reformat the raw FASTA content.  `fasta_new` is the object that
# fasta.ExtractRegion() reads sequences from further down.
fasta_new = fasta.FastaCleanup(fasta_file)

From the genome annotation file we need to extract all loci for coding sequences (`CDS`). If we change the search word we can also extract other loci (e.g. `gene`).

In [None]:
# Select the annotation rows for the search word "CDS".  Each row is indexed
# later as a list of GFF columns: [3] lower limit, [4] upper limit,
# [6] direction (strand), [8] info (attributes).
gff_Loci = gff.GFFExtractLoci(gff_file, "CDS")

Now we can extract a list of variants for every caller.

In [None]:
# Run the same variant extraction on the calls of each caller, in the order
# BCFtools, Freebayes, LoFreq.
vcf_new_bcf, vcf_new_fb, vcf_new_lf = [
    vcf.VCFExtractSNP(caller_calls)
    for caller_calls in (vcf_bcf, vcf_fb, vcf_lf)
]

The three lists are now merged to keep only the consensus SNPs. With the parameter `snp` we ignore all indels.

In [None]:
# Merge the three per-caller variant lists; the mode argument "snp" is the
# one used throughout this notebook.
# NOTE(review): the next processing step compares fields [2], [4] and [6] of
# every merged row -- presumably the alternate allele reported by each of the
# three callers; confirm against vcf.MergeVCF.
vcf_snp = vcf.MergeVCF(vcf_new_bcf, vcf_new_fb, vcf_new_lf, "snp")

Sequences for the coding regions were added to the annotated loci using `ExtractRegion()`.

In [8]:
# Build one row per annotated locus: the GFF lower limit, upper limit,
# direction and info columns, plus the nucleotide sequence of that region.
# The first row is a header describing the columns.
#
# No trailing `del` of the loop variable is needed: the generator below keeps
# its variable local, so this cell also works when `gff_Loci` is empty
# (a `del i` after a loop that never ran raises NameError).
subregion_list = [["Lower Limit", "Upper Limit", "Direction", "Info", "Region"]]
subregion_list.extend(
    [
        locus[3],
        locus[4],
        locus[6],
        locus[8],
        # NOTE(review): the upper limit is passed as end + 1 -- presumably
        # ExtractRegion treats its second argument as exclusive; confirm.
        fasta.ExtractRegion(int(locus[3]), int(locus[4]) + 1, locus[6], fasta_new),
    ]
    for locus in gff_Loci
)

Now we can extract all regions with a SNP in it:

In [9]:
# Keep only the rows in which fields [2], [4] and [6] agree, and reduce each
# kept row to its first three fields (used below as Pos, Ref, Alt).
#
# Written as a single comprehension so that no temporary alias or loop
# variable has to be deleted afterwards; the former `del i` raised NameError
# whenever the merged list was empty.
#
# NOTE: this cell rebinds `vcf_snp` to 3-field rows, so it cannot be run a
# second time without re-running the merge first.
vcf_snp = [
    [call[0], call[1], call[2]]
    for call in vcf_snp
    if call[2] == call[4] and call[2] == call[6]
]

In [10]:
# Pair every SNP with every annotated region that contains it (limits are
# inclusive).  A SNP that lies in several regions yields one row per region.
# The first row is a header describing the columns.
subregion_with_snp = [["Pos", "Ref", "Alt", "LowerLimit", "UpperLimit", "Direction",
                       "Info", "Sequence"]]
for region in subregion_list:
    # The header row of subregion_list has a non-numeric first field.
    if not region[0].isdigit():
        continue
    # Parse the region limits once instead of once per SNP.
    lower, upper = int(region[0]), int(region[1])
    for snp in vcf_snp:
        # Rows without a numeric position are skipped.
        if snp[0].isdigit() and lower <= int(snp[0]) <= upper:
            subregion_with_snp.append(
                [snp[0], snp[1], snp[2],
                 region[0], region[1], region[2], region[3], region[4]]
            )
# The loop variables are deliberately not deleted: `del` on a loop variable
# raises NameError when the corresponding loop never ran (e.g. no SNPs).

In the next step we create two fasta files: one without the observed mutations (simply the sequence from the reference genome) and one with the observed SNPs.

In [11]:
# Reference FASTA records for every region that carries a SNP.
# Header layout: >_<pos>_<ref>_<alt>_<lower>_<upper>_<direction>
multifasta_orig = []
for row in subregion_with_snp:
    if not row[0].isdigit():
        # Skip the column-header row.
        continue
    multifasta_orig.append("_".join([">", row[0], row[1], row[2], row[3], row[4], row[5]]))
    multifasta_orig.append(row[7])

# Same records, but sequences on the '-' strand are reverse-complemented so
# that every sequence reads in coding direction.
complement_table = str.maketrans("ATCG", "tacg")
multifasta_orig_rev = []
for record in multifasta_orig:
    if record.startswith(">"):
        # Remember the header fields; they describe the sequence line that follows.
        header_fields = record.split("_")
        multifasta_orig_rev.append(record)
        continue
    if header_fields[6] == "-":
        reversed_seq = fasta.ReverseString(record)
        record = reversed_seq.translate(complement_table).upper()
    multifasta_orig_rev.append(record)

In [12]:
# Mutated FASTA records: for every record of `multifasta_orig` the SNP named
# in the header is applied to the forward-strand sequence, then sequences on
# the '-' strand are reverse-complemented.
#
# Every header is always followed by exactly one sequence line, so this list
# stays index-aligned with `multifasta_orig_rev`.  If the reference base in
# the sequence does not match the header, an error is printed and the
# unmodified sequence is kept (previously the sequence line was dropped,
# leaving a dangling header that shifted all later records).
complement_table = str.maketrans("ATCG", "tacg")
multifasta_mut = []
for record in multifasta_orig:
    if record.startswith(">"):
        # Header fields: ['>', pos, ref, alt, lower, upper, direction]
        header_fields = record.split("_")
        multifasta_mut.append(record)
        continue

    snp_pos, ref_base, alt_base = header_fields[1], header_fields[2], header_fields[3]
    # Offset of the SNP inside the region: position minus lower limit.
    offset = int(snp_pos) - int(header_fields[4])
    bases = list(record)
    if bases[offset] == ref_base:
        bases[offset] = alt_base
        print("Mutate " + ref_base + " to " + alt_base + " at Position " + snp_pos)
    else:
        print("Error at SNP Position: " + snp_pos + ': ' + ref_base + ' expected')

    sequence = "".join(bases)
    if header_fields[6] == "-":
        sequence = fasta.ReverseString(sequence).translate(complement_table).upper()
    multifasta_mut.append(sequence)

Mutate A to G at Position 95915
Mutate T to G at Position 226764
Mutate C to G at Position 227076
Mutate C to G at Position 227091
...


In the last step of the annotation pipeline we do some in silico translation to get `.fasta` files of the protein sequences with and without the mutations.

In [13]:
# Split every sequence line into 3-mers (codons); header lines (starting
# with '>') are passed through unchanged.
kmer_list_orig = [
    record if record.startswith(">") else fasta.KmerSplit(record, 3)
    for record in multifasta_orig_rev
]

# Same treatment for the mutated sequences.
kmer_list_mut = [
    record if record.startswith(">") else fasta.KmerSplit(record, 3)
    for record in multifasta_mut
]

In [14]:
# Translate every codon list into a protein sequence; header lines (whose
# joined text starts with '>') are passed through unchanged.
protein_seq_orig = [
    entry if "".join(entry).startswith(">") else translation_table.translateDNA(entry)
    for entry in kmer_list_orig
]

# Same treatment for the mutated codon lists.
protein_seq_mut = [
    entry if "".join(entry).startswith(">") else translation_table.translateDNA(entry)
    for entry in kmer_list_mut
]

In [15]:
# Show both protein FASTA listings, reference first, each under its title.
for title, proteins in (('Orig:\n', protein_seq_orig), ('\nMut: \n', protein_seq_mut)):
    print(title)
    print('\n'.join(proteins))

Orig:

>_95915_A_G_95773_96516_-
MVPQSRPLVPAVLLTRPEAQGARFAAALAEALGPVRLVTSPLMVPAFLVPAIPLRPDALIFTSETGVEGYRRLAAPELSDVRRAWCVGNRTARAAEAAGLAAHSAEGDAERLIAAILAADEPGPLLHLRGAESRGEVAPRLAAAGLTAAEAVVYDQRPQPLSSEARALLTDGAPVIAPLFSPRTARLLAQELARIGGTGPLWVAAMSPAVAEAAAALPVARLSVAARPDAEALLQAVKALLDAEADA
>_226764_T_G_226593_226931_-
MARKRAAAMTSATIDYSRLVKARDIRAQAEARARGPAEISVLQAMIVVGEEKWGQAMAIAEDAAYPWAMRAALRGATVLVRDSETTDTLAFLLGLSPEETDRLFIEAAEVRL
>_227076_C_G_226935_227252_-
MMRIRMVPLRRMYELTVFRVQGDTLTCNDMVYDFSGVEEGDVLPWDAMDNTWVTSNVTRVNGVLEFEVVFPHGYYGDLPLPTPGILEVEDQDIPIPPYLPPFAEG
>_227091_C_G_226935_227252_-
MMRIRMVPLRRMYELTVFRVQGDTLTCNDMVYDFSGVEEGDVLPWDAMDNTWVTSNVTRVNGVLEFEVVFPHGYYGDLPLPTPGILEVEDQDIPIPPYLPPFAEG
...
Mut: 

>_95915_A_G_95773_96516_-
MVPQSRPLVPAVLLTRPEAQGARFAAALAEALGPVRLVTSPLMVPAFLVPAIPLRPDALIFTSETGVEGYRRLAAPELSDVRRAWCVGNRTARAAEAAGLAAHSAEGDAERLIAAILAADEPGPLLHLRGAESRGEVAPRLAAAGLTAAEAVVYDQRPQPLSSEARALLTDGAPVIAPLFSPRTARLLAQELARIGGTGPPWVAAMSPAVAEAAAALPVARLSVAARPDAEALLQAVKALLDAEADA
>_226764_T_G_226593_22

In [16]:
# Collect the reference protein of every SNP that changes the protein.
# The header is extended by '_<index>_<reference residue>_<mutant residue>',
# where <index> is the 0-based index into the protein string of the last
# residue that differs.
#
# `protein_seq_orig` and `protein_seq_mut` are compared by index, so both
# lists must hold the same header/sequence layout.
protein_seq_orig_1 = []
for idx, header in enumerate(protein_seq_orig[:-1]):
    if not header.startswith(">"):
        continue
    ref_protein = protein_seq_orig[idx + 1]
    mut_protein = protein_seq_mut[idx + 1]
    if ref_protein == mut_protein:
        # No change on protein level: nothing to report.
        continue

    # Compare only the part both proteins share, so a mutant protein that is
    # shorter than the reference cannot cause an IndexError.
    shared_len = min(len(ref_protein), len(mut_protein))
    differing = [k for k in range(shared_len) if ref_protein[k] != mut_protein[k]]
    if differing:
        change_pos = differing[-1]
    else:
        # The proteins differ only in length: the change is where the
        # shorter one ends.
        change_pos = shared_len
    # '*' stands in for "no residue here" on the side where the protein has
    # already ended.
    ref_residue = ref_protein[change_pos] if change_pos < len(ref_protein) else "*"
    mut_residue = mut_protein[change_pos] if change_pos < len(mut_protein) else "*"

    protein_seq_orig_1.append(header.replace("\n", "") + '_' + str(change_pos) +
                              '_' + ref_residue + '_' + mut_residue)
    protein_seq_orig_1.append(ref_protein)

We can now export all the data processed above:

In [17]:
# Write one tab-separated annotation line per reported protein change:
# SNP position, two fields of the region's info column, protein index,
# reference residue, mutant residue.
annotation = []
for record in protein_seq_orig_1:
    if not record.startswith(">"):
        continue
    # Header fields: ['>', pos, ref, alt, lower, upper, direction,
    #                 protein index, reference residue, mutant residue]
    fields = record.split('_')
    for row in subregion_with_snp:
        # Match on position AND region limits.  Matching on the position
        # alone would pair a SNP lying in two overlapping regions with the
        # info column of the wrong region.
        if (row[0], row[3], row[4]) != (fields[1], fields[4], fields[5]):
            continue
        info = row[6].replace("\n", "").split(";")
        # NOTE(review): the two exported info fields are taken by position
        # (third- and second-to-last entry) -- presumably product= and
        # protein_id=, whose prefixes are stripped below; confirm that the
        # attribute order is stable in the annotation file.
        annotation.append("\t".join([row[0], info[len(info) - 3], info[len(info) - 2],
                                     fields[7], fields[8], fields[9]]))

# The context manager closes the file even if writing fails.
with open("../Data/ProteinSequences/Chr1_Annotation.tab", "w") as f:
    f.write('\n'.join(annotation).replace("product=", "").replace("protein_id=", ""))

In [18]:
# Export the reference proteins of all reported changes as a FASTA file.
# Mode "w" replaces any existing file; the context managers close the file
# handles even if writing or reading fails.
prot_fasta_path = "../Data/ProteinSequences/Chr1_Prot.fasta"
with open(prot_fasta_path, "w") as f:
    f.write('\n'.join(protein_seq_orig_1))

# Read the file back and print it to check what was written.
with open(prot_fasta_path, "r") as f:
    print(f.read())

>_95915_A_G_95773_96516_-_200_L_P
MVPQSRPLVPAVLLTRPEAQGARFAAALAEALGPVRLVTSPLMVPAFLVPAIPLRPDALIFTSETGVEGYRRLAAPELSDVRRAWCVGNRTARAAEAAGLAAHSAEGDAERLIAAILAADEPGPLLHLRGAESRGEVAPRLAAAGLTAAEAVVYDQRPQPLSSEARALLTDGAPVIAPLFSPRTARLLAQELARIGGTGPLWVAAMSPAVAEAAAALPVARLSVAARPDAEALLQAVKALLDAEADA
>_227275_G_A_227259_228743_-_489_T_I
MDFFSPPPTPPNSGNPGTFNDDADAFLGWFPAFVAELNALLPYLTGAGFGDGTAAAPGLVWKGDPDTGLFRPGSNAVGVTAGGVLRLTVSALALTSTVPLRAPLGTAAAPGISFEADPNTGIRSDGADVLHFVTGGVTRGFFSTTHFQSTLPAALPGGAAGAPGLTFAGDLDTGIFRAAADLLGIAAGGEERFRVGSGRAAALVPFSVPDGTQTFPGLTFNGEVGSNTGFFLAAENEIGVTCQGTERARFTPSGMQLQGLLSGTAVTQSDLDTTPGRLLKVGDYGLGGTARPIPGNDADQIGTTGFYQVTGATLNRPAGMSVGTLQHIQHGAARAVQIAYPQTASDTGRWCRHKDTSWGDWFLTYDQRNIVGAVSWASGFPRGGIIEKGETAGAEYVRFADGTQLCRLVQTGVPGPTTPQGSLYRTEWQTVTLPVEFVSGALNGHCVTGGCRGGSVISLLGRPGASNVAAYMLLAPTSYGATQTVDLLVTGRWR
>_227431_A_G_227259_228743_-_437_V_A
MDFFSPPPTPPNSGNPGTFNDDADAFLGWFPAFVAELNALLPYLTGAGFGDGTAAAPGLVWKGDPDTGLFRPGSNAVGVTAGGVLRLTVSALALTSTVPLRAPLGTAAAPGISFEADPNTGIRSDGADVLHFVTGGVTRGFFSTTHFQS