In [3]:
import pandas as pd
import os
import numpy as np
from Bio import SeqIO
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [42]:
# Both data files sit in the same directory, reached by walking four levels up
# from the notebook's working directory.
_base_dir = os.path.join(os.getcwd(), "..", "..", "..", "..")
input_file_path = os.path.join(_base_dir, "input/data/coronaviridae/BtHKU5-CoV-2-spike.fasta")
output_file_path = os.path.join(_base_dir, "input/data/coronaviridae/BtHKU5-CoV-2-spike_sequences_genbase_20250303.csv")

# Accession id -> isolate label. The labels are kept as strings so that leading
# zeros (e.g. "023") are preserved.
accession_id_isolate_map = dict([
    ("C_AAI84049.1", "023"),
    ("C_AAI84054.1", "028"),
    ("C_AAI84059.1", "153"),
    ("C_AAI84064.1", "155"),
    ("C_AAI84069.1", "381"),
    ("C_AAI84074.1", "441"),
])

# Constant metadata attached to every parsed record.
virus_name = "BtHKU5-CoV-2"
sequencing_date = "09/11/2024"

In [43]:
def parse_fasta_file(input_file_path, output_file_path):
    """Parse a FASTA file into a DataFrame and write that DataFrame to a CSV file.

    Each record description is expected to look like
    ``C_AAI84074.1 spike protein [Pipistrellus]``, i.e. accession id, protein
    name, and host in square brackets.

    Besides its arguments, the function reads three notebook-level names:
    ``virus_name`` (stored in the ``pango_lineage`` column),
    ``accession_id_isolate_map`` (looked up by accession id for the
    ``who_variant`` column) and ``sequencing_date`` (stored in the
    ``first_designation_date`` column).

    A record whose description does not match the expected layout, or whose
    accession id is missing from ``accession_id_isolate_map``, is reported,
    counted as a parsing error and left out of the result; the remaining
    records are still processed.

    Args:
        input_file_path: Path of the FASTA file to read.
        output_file_path: Path of the CSV file to write (without the index).

    Returns:
        pandas.DataFrame: One row per successfully parsed record, all columns
        created with ``dtype=str``.
    """
    columns = [
        "accession_id",
        "protein",
        "host",
        "pango_lineage",
        "who_variant",
        "first_designation_date",
        "seq",
    ]
    sequences = []
    i = 0  # stays 0 when the file contains no records
    parse_error_count = 0
    print("START: Parsing fasta file")
    # parse fasta file to extract accession id, protein name, host, and protein sequence
    with open(input_file_path) as f:
        for i, record in enumerate(SeqIO.parse(f, "fasta"), start=1):
            # metadata: accession_id  protein name [host]
            # C_AAI84074.1 spike protein [Pipistrellus]
            # The lazy `.+? ` skips the accession id up to the first space,
            # group 1 is the protein name ('spike protein' stays together),
            # the single `.` consumes the character right before '[',
            # and group 2 is the text inside the square brackets.
            metadata = re.search(r".+? (.+).\[(.+)\]", record.description)
            if metadata is None:
                parse_error_count += 1
                print(f"Skipping record {record.id}: unexpected description '{record.description}'")
                continue

            isolate = accession_id_isolate_map.get(record.id)
            if isolate is None:
                parse_error_count += 1
                print(f"Skipping record {record.id}: accession id not found in accession_id_isolate_map")
                continue

            sequences.append({
                "accession_id": record.id,
                "protein": metadata.group(1),
                "host": metadata.group(2),
                "pango_lineage": virus_name,
                "who_variant": isolate,
                "first_designation_date": sequencing_date,
                "seq": str(record.seq)
            })
    print("END: Parsing fasta file")
    print(len(sequences))
    print(f"Number of records parsed = {i}")
    print(f"Number of records with parsing error = {parse_error_count}")
    # Explicit columns keep the CSV header stable even when nothing was parsed.
    df = pd.DataFrame(sequences, columns=columns, dtype=str)
    # write the parsed dataframe to a csv file
    print(f"Writing to file {output_file_path}")
    df.to_csv(output_file_path, index=False)
    return df

In [44]:
# Parse the FASTA file, write the CSV to output_file_path, and keep the resulting DataFrame.
df = parse_fasta_file(input_file_path, output_file_path)

START: Parsing fasta file
END: Parsing fasta file
6
Number of records parsed = 6
Number of records with parsing error = 0
Writing to file /home/blessyantony/dev/git/zoonosis/src/jupyter_notebooks/datasets/generation/../../../../input/data/coronaviridae/BtHKU5-CoV-2-spike_sequences_genbase_20250303.csv


In [45]:
# Read the CSV back in. The converter keeps `who_variant` as text, so isolate
# labels such as "023" are not turned into numbers on load.
df = pd.read_csv(output_file_path, converters={"who_variant": str})

In [46]:
# Display the table that was read back from the CSV as a quick visual check.
df

Unnamed: 0,accession_id,protein,host,pango_lineage,who_variant,first_designation_date,seq
0,C_AAI84074.1,spike protein,Pipistrellus,BtHKU5-CoV-2,441,09/11/2024,MMYLAFPLMFLLTLGSADVNLGPDGTGNCPVTDVQPDFFTHWNWPE...
1,C_AAI84049.1,spike protein,Pipistrellus,BtHKU5-CoV-2,23,09/11/2024,MMHLAFPLMFLLTLGSADVNLGPDGTGNCPVTDVQPDFFTHWNWPE...
2,C_AAI84054.1,spike protein,Pipistrellus,BtHKU5-CoV-2,28,09/11/2024,MMHLAFPLMFLLTLGSADVNLGPDGTGNCPVTDVQPDFFTHWNWPE...
3,C_AAI84069.1,spike protein,Pipistrellus,BtHKU5-CoV-2,381,09/11/2024,MMYLAFPLMFLLTLGSADVNLGPDGTGNCPVTDVQPDFFTHWNWPE...
4,C_AAI84059.1,spike protein,Pipistrellus,BtHKU5-CoV-2,153,09/11/2024,MMCLAFPLMFLLTLGSADVNLGPDAISNCPVTDVRPEFFDHFNWPA...
5,C_AAI84064.1,spike protein,Pipistrellus,BtHKU5-CoV-2,155,09/11/2024,MMCLAFPLMFLLTLGSADVNLGPDAISNCPVTDVRPEFFDHFNWPA...
