<a href="https://colab.research.google.com/github/JK-Oblivion/Dissertation_NIV/blob/main/MetadataCollection_JK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from Bio import Entrez, SeqIO

# Set your email (required for NCBI Entrez API access)
Entrez.email = "your_email@example.com"  # Replace with your actual email

# List of 44 Echo 30 isolate accession numbers
accessions = [
    "AF311938.1", "AF162711.1", "DQ246620.1", "DQ534205.1", "EF066391.1",
    "EF066392.1", "JN704615.1", "JX854435.1", "JX976773.1", "KC897073.1",
    "KP266571.1", "KT353720.1", "KY645964.1", "KY888272.1", "KY888273.1",
    "KY888274.1", "KY888275.1", "MF678335.1", "MF678337.1", "MK238483.1",
    "MK800120.1", "MN153799.1", "MW080372.1", "MW080377.1", "MZ229659.1",
    "MZ229660.1", "MZ389230.1", "MZ389231.1", "MZ389232.1", "MZ436966.1",
    "OM677620.1", "ON129560.1", "OP207961.1", "OP207962.1", "OQ210941.1",
    "OQ210942.1", "OQ210943.1", "OQ210944.1", "OQ791513.1", "OQ791516.1",
    "OQ791571.1", "OQ842412.1", "OQ842429.1", "PQ472410.1"
]

# Store extracted metadata: one row per successfully fetched accession, laid
# out as [accession, isolate, strain, geo_loc_name, country, collection_date].
metadata = []

for acc in accessions:
    try:
        # Fetch the GenBank record and always release the connection, even
        # when parsing the response fails.
        handle = Entrez.efetch(db="nucleotide", id=acc, rettype="gb", retmode="text")
        try:
            record = SeqIO.read(handle, "genbank")
        finally:
            handle.close()

        # Sample-level metadata is stored as qualifiers on the "source"
        # feature; only the first such feature is consulted. A record without
        # one yields "Unknown" in every field.
        qualifiers = next(
            (feature.qualifiers for feature in record.features if feature.type == "source"),
            {},
        )

        # Each qualifier value is a list; take its first entry. The isolate
        # and collection date are read from their qualifiers rather than from
        # record.annotations, whose "source" entry is the organism description
        # and whose "date" entry is the date of the database record, not the
        # date the sample was collected.
        isolate_name, strain, geo_loc_name, country, collection_date = (
            qualifiers.get(key, ["Unknown"])[0]
            for key in ("isolate", "strain", "geo_loc_name", "country", "collection_date")
        )

        # Store extracted metadata
        metadata.append([acc, isolate_name, strain, geo_loc_name, country, collection_date])

    except Exception as e:
        # Deliberately best-effort: report the failing accession and carry on
        # with the rest; failed accessions get no row in `metadata`.
        print(f"Error fetching data for {acc}: {e}")

# Print the results in a formatted table
print("Accession\tIsolate Name\tStrain\tGeo_Loc_Name\tCountry\tCollection Date")
for row in metadata:
    print("\t".join(row))





Accession	Isolate Name	Strain	Geo_Loc_Name	Country	Collection Date
AF311938.1	Echovirus E30	Bastianni	Unknown	Unknown	02-DEC-2001
AF162711.1	Echovirus E30	Bastianni	Unknown	Unknown	20-JUL-1999
DQ246620.1	Echovirus E30	Echo30/Zhejiang/17/03/CSF	China: Zhejiang	Unknown	06-NOV-2005
DQ534205.1	Echovirus E30	Unknown	Netherlands	Unknown	24-MAY-2007
EF066391.1	Echovirus E30	TW/2513/01	Taiwan	Unknown	01-OCT-2007
EF066392.1	Echovirus E30	TW/3182/01	Taiwan	Unknown	01-OCT-2007
JN704615.1	Echovirus E30	Kor08-ECV30	South Korea	Unknown	13-FEB-2012
JX854435.1	Echovirus E30	Unknown	China: Guangxi	Unknown	08-DEC-2012
JX976773.1	Echovirus E30	Unknown	China: Shandong	Unknown	26-JUL-2013
KC897073.1	Echovirus E30	Unknown	China: Guandong	Unknown	26-FEB-2014
KP266571.1	Echovirus E30	2002-59	China	Unknown	28-APR-2015
KT353720.1	Echovirus E30	1-B4-TW	Taiwan	Unknown	13-FEB-2016
KY645964.1	Echovirus E30	16-I10	USA	Unknown	29-MAR-2017
KY888272.1	Echovirus E30	13-311	Germany	Unknown	03-MAR-2018
KY888273.1	Echoviru

# Export Metadata to Excel

In [None]:
!pip install pandas
import pandas as pd # import the pandas library and alias it as 'pd'

# Name of the workbook to write; the confirmation message below reuses it.
excel_filename = "Echo30_Metadata.xlsx"

# Tabulate the collected rows; the column labels follow the field order of
# each row in `metadata`.
df = pd.DataFrame(
    data=metadata,
    columns=[
        "Accession",
        "Isolate Name",
        "Strain",
        "Geo_Loc_Name",
        "Country",
        "Collection Date",
    ],
)

# Write the table to the workbook, leaving out the DataFrame's row index.
df.to_excel(excel_filename, index=False)

print(f"Metadata successfully saved to {excel_filename}")

Metadata successfully saved to Echo30_Metadata.xlsx
