In [34]:
# !pip install biopython

In [35]:
from Bio.Seq import Seq
from Bio import SeqIO
from Bio import pairwise2
import os

In [36]:
# data path
influenzaPath = "./Bank/Influenza"
ebolaPath = "./Bank/Zaire Ebola Virus"
testPath = "./Bank/Test"


def _listDataFiles(dirPath):
    """Return the names of the regular, non-hidden files in ``dirPath``, sorted.

    ``os.listdir`` returns entries in arbitrary order and also returns
    sub-directories and hidden entries (for example ``.ipynb_checkpoints`` or
    ``.DS_Store``). Sorting keeps the run order reproducible; filtering keeps
    entries that are not data files away from the FASTA parser.
    """
    return sorted(
        name
        for name in os.listdir(dirPath)
        if not name.startswith(".") and os.path.isfile(os.path.join(dirPath, name))
    )


influenzaFileList = _listDataFiles(influenzaPath)
ebolaFileList = _listDataFiles(ebolaPath)
testFileList = _listDataFiles(testPath)

# NOTE(review): these three lists are not used by any cell visible in this
# notebook section; kept in case later cells rely on them.
influenzaSeq = []
ebolaSeq = []
testSeq =[]

In [37]:
def fetchData(filePath, fileList):
    """Parse every listed file as FASTA and collect all records.

    Args:
        filePath: directory that contains the files.
        fileList: names of the files inside ``filePath`` to parse.

    Returns:
        One flat list holding the records of all files, in the order of
        ``fileList`` and, within a file, in the order the parser yields them.
    """
    records = []
    for fileName in fileList:
        # SeqIO.parse yields the records of one file; append all of them.
        records.extend(SeqIO.parse(filePath + "/" + fileName, "fasta"))
    return records

In [38]:
# fetch data
influenzaSeqList = fetchData(influenzaPath, influenzaFileList)
ebolaSeqList = fetchData(ebolaPath, ebolaFileList)
testSeqList = fetchData(testPath, testFileList)

In [39]:
def localAlignment(testSeq, dataSeq, matchScore=1, mismatchScore=-1, gapOpen=-0.5, gapExtend=-0.1):
    """Find the reference record that best matches ``testSeq`` by local alignment.

    Every record in ``dataSeq`` is aligned against ``testSeq`` with
    ``pairwise2.align.localms`` in score-only mode. The score is divided by
    the length of the reference record and multiplied by 100, and the record
    with the highest such percentage wins.

    Args:
        testSeq: query record; must expose ``.seq``.
        dataSeq: iterable of reference records exposing ``.seq`` and ``.name``.
        matchScore: score for a matching position (default 1).
        mismatchScore: score for a mismatching position (default -1).
        gapOpen: gap-open penalty (default -0.5).
        gapExtend: gap-extension penalty (default -0.1).

    Returns:
        Tuple ``(bestPercentage, bestSeqName)``. ``(0, "")`` is returned when
        no reference scores above 0, when ``dataSeq`` is empty, or when the
        query sequence is empty.
    """
    bestPercentage = 0
    bestSeqName = ""

    # An empty query cannot be aligned against anything.
    if len(testSeq.seq) == 0:
        return bestPercentage, bestSeqName

    for seq_data in dataSeq:
        refLength = len(seq_data.seq)
        if refLength == 0:
            # Skip empty references: the percentage below divides by this length.
            continue

        local_score = pairwise2.align.localms(
            testSeq.seq, seq_data.seq,
            matchScore, mismatchScore, gapOpen, gapExtend,
            score_only=True,
        )
        local_percentage = (local_score / refLength) * 100

        if local_percentage > bestPercentage:
            bestPercentage = local_percentage
            bestSeqName = seq_data.name
            # A full-score hit cannot be beaten, so stop searching early.
            if local_percentage == 100:
                return bestPercentage, bestSeqName

    return bestPercentage, bestSeqName

In [40]:
def determineVerdict(virus1, percentage1, seqName1, virus2, percentage2, seqName2):
    """Print the best hit for two viruses followed by a one-line verdict.

    For each virus the name, best percentage and best sequence name are
    printed. The verdict is then chosen in this order: both percentages below
    50, both percentages equal, virus 1 at exactly 100, virus 2 at exactly
    100, otherwise whichever percentage is larger. A separator is printed
    last. Nothing is returned.
    """
    summaries = (
        (virus1, percentage1, seqName1),
        (virus2, percentage2, seqName2),
    )
    for virusName, bestPercentage, bestName in summaries:
        print(f"Virus name: {virusName}")
        print(f"Best percentage: {bestPercentage}")
        print(f"Best sequence name: {bestName}")
        print("")

    # Build the verdict first and print it once; branch order matters.
    if percentage1 < 50 and percentage2 < 50:
        verdict = "Both sequence are not quite convergent! It is either not match to both virus or please provide a more complete genome!"
    elif percentage1 == percentage2:
        verdict = f"The sequence alignment result for virus {virus1} and {virus2} is exactly the same with percentage of {percentage1}"
    elif percentage1 == 100:
        verdict = f"Perfect match for {virus1} is found! it is {seqName1} with percentage of {percentage1}!"
    elif percentage2 == 100:
        verdict = f"Perfect match for {virus2} is found! it is {seqName2} with percentage of {percentage2}!"
    elif percentage1 > percentage2:
        verdict = f"Sequence alignment result of {virus1} is better than {virus2} with percentage of {percentage1}"
    else:
        verdict = f"Sequence alignment result of {virus2} is better than {virus1} with percentage of {percentage2}"
    print(verdict)

    print("\n\n\n-----------------------------------------\n")


In [41]:
# Walk the test bank file by file so every record is reported under the file
# it was read from. Indexing testFileList by a record's position in the
# flattened testSeqList mislabels records, or raises IndexError, as soon as
# one test file contains more than one FASTA record.
for testFileName in testFileList:
    for seq in fetchData(testPath, [testFileName]):
        print(f"analyzing file: {testFileName}")

        # Best hit of this record against each reference bank.
        bestInfluenzaPercentage, bestInfluenzaSeqName = localAlignment(seq, influenzaSeqList)
        bestEbolaPercentage, bestEbolaSeqName = localAlignment(seq, ebolaSeqList)

        print("Verdict:")
        determineVerdict("influenza A Virus", bestInfluenzaPercentage, bestInfluenzaSeqName, "Zaire Ebola Virus", bestEbolaPercentage, bestEbolaSeqName)

analyzing file: ebolatest.fna
Verdict:
Virus name: influenza A Virus
Best percentage: 60.570749108203536
Best sequence name: NC_026428.1

Virus name: Zaire Ebola Virus
Best percentage: 95.37475605253437
Best sequence name: NC_002549.1

Sequence alignment result of Zaire Ebola Virus is better than influenza A Virus with percentage of 95.37475605253437



-----------------------------------------

analyzing file: Influenza.fna
Verdict:
Virus name: influenza A Virus
Best percentage: 100.0
Best sequence name: NC_026438.1

Virus name: Zaire Ebola Virus
Best percentage: 7.103222743815839
Best sequence name: NC_002549.1

Perfect match for influenza A Virus is found! it is NC_026438.1 with percentage of 100.0!



-----------------------------------------

analyzing file: Zaire ebolavirus.fna
Verdict:
Virus name: influenza A Virus
Best percentage: 60.67052023121241
Best sequence name: NC_007364.1

Virus name: Zaire Ebola Virus
Best percentage: 100.0
Best sequence name: NC_002549.1

Perfect matc