# Monkeypox ncbi data processing

In [5]:
import os
import csv
import pandas as pd
import numpy as np
from Bio import SeqIO, Entrez
from datetime import datetime
import calendar
from dateutil.parser import parse
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows
from urllib.error import HTTPError
import chardet

## GB File Downloading

To download GenBank files, you need a sequences.csv file containing the NCBI MPOX list of accessions. Please make surethe sequences.csv file is saved in the same folder as this notebook.

The output file (sequence.gb) will be saved in the sequence_files folder.

In [6]:
def download_sequence_files(accessions, output_directory):
    Entrez.email = 'juanfinello@gmail.com'  # Enter your email address

    genbank_records = []  # List to store GenBank records

    for accession in accessions:
        try:
            # Fetch the GenBank (full) record
            handle = Entrez.efetch(db='nucleotide', id=accession, rettype='gb', retmode='text')
            genbank_data = handle.read()
            handle.close()
            genbank_records.append(genbank_data)

            print(f'Downloaded: {accession}')

        except HTTPError as e:
            print(f'Error downloading {accession}: {e}')

    # Save the GenBank records to a single file
    genbank_filename = os.path.join(output_directory, 'sequences.gb')
    with open(genbank_filename, 'w') as f:
        f.write('\n'.join(genbank_records))

# Read accessions from the sequences.csv file
accessions = []
with open('sequences.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Skip the header row if present
    for row in reader:
        accession = row[0]  # Assuming the accessions are in the first column
        accessions.append(accession)

# Define the output directory to save the downloaded files
output_directory = 'sequence_files'

# Create the output directory if it doesn't exist
os.makedirs(output_directory, exist_ok=True)

# Download the sequence files for the accessions
download_sequence_files(accessions, output_directory)

Downloaded: OR126131.1
Downloaded: OR126132.1
Downloaded: OR209307.1
Downloaded: OR209308.1
Downloaded: OR209309.1
Downloaded: OR209310.1
Downloaded: OR209311.1
Downloaded: OR209312.1
Downloaded: OR209313.1
Downloaded: OR209314.1
Downloaded: OR209315.1
Downloaded: OR209316.1
Downloaded: OR209317.1
Downloaded: OR209318.1
Downloaded: OR209319.1
Downloaded: OR209320.1
Downloaded: OR209321.1
Downloaded: OR209322.1
Downloaded: OR209323.1
Downloaded: OR209324.1
Downloaded: OR209325.1
Downloaded: OR209326.1
Downloaded: OR209327.1
Downloaded: OR209328.1
Downloaded: OR178739.1
Downloaded: OR178740.1
Downloaded: OR178741.1
Downloaded: OR178742.1


### Import .gb files

In [8]:
# 4 - Open .gb file and create 
gb_file = "sequences.gb"
genbank_df = pd.DataFrame()

for gb_record in SeqIO.parse(open(gb_file, "r"), "genbank"):
    # Attributes are: id, seq, name, description, dbxrefs, features, annotations, letter_annotations
    try:
        gb_lab = gb_record.annotations["references"][1].journal
    except:
        gb_lab = gb_record.annotations["references"][0].journal
        
    entries_dic = {"gb_id": gb_record.name,
                   "gb_description": gb_record.description,
                   "gb_authors": gb_record.annotations["references"][0].authors,
                   "gb_lab": gb_lab,
                   "gb_sequence": str(gb_record.seq)}
    gb_features = dict(gb_record.features[0].qualifiers)
    
    try:
        gb_assembly = dict(dict(gb_record.annotations["structured_comment"])["Assembly-Data"])
        gb_annotations = gb_record.annotations.pop("structured_comment", "references")
        gb_annotations.pop("Assembly-Data")
        all_ncbi_info = all_ncbi_info = dict(**entries_dic, **gb_features, **gb_annotations, **gb_assembly)
        single_record_df = pd.DataFrame(all_ncbi_info)
        genbank_df = genbank_df.append(single_record_df)
    except:
        gb_assembly = {"Assembly Method": "", "Sequencing Technology": ""}
        gb_annotations = {"Empty": "empty"}
        all_ncbi_info = all_ncbi_info = dict(**entries_dic, **gb_features, **gb_annotations, **gb_assembly)
        single_record_df = pd.DataFrame(all_ncbi_info)
        genbank_df = genbank_df.append(single_record_df)

# Code I may use
#    print(gb_record.annotations["references"][1].journal)
#    print(gb_record.annotations["references"][0])
#    print(dict(gb_record.features[0].qualifiers))
#    single_record_df = pd.DataFrame(dict(gb_record.features[0].qualifiers))
#    genbank_df = genbank_df.append(single_record_df)
#    print(gb_record.features[0])
#    print(dict(gb_record.features[0].qualifiers))
#        gb_annotations = gb_record.annotations.pop("structured_comment", "references")
#        gb_annotations.pop("Assembly-Data")
#        all_ncbi_info = dict(entries_dic | gb_features | gb_annotations | gb_assembly)
#        single_record_df = pd.DataFrame(all_ncbi_info)
#        genbank_df = genbank_df.append(single_record_df)


  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.append(single_record_df)
  genbank_df = genbank_df.appen

In [4]:
#Step 5 - Error Checking and Metadata Verification
#After processing the data, we need to check for errors. An empty column in the genbank_df DataFrame indicates whether good metadata (NaN) or bad metadata (empty) was found.

genbank_df

Unnamed: 0,gb_id,gb_description,gb_authors,gb_lab,gb_sequence,organism,mol_type,isolate,host,db_xref,country,collection_date,Assembly Method,Coverage,Sequencing Technology
0,OR095042,Monkeypox virus isolate MPXV/UZ_REGA_224/Belgi...,"Vanmechelen,B., Wawina-Bokalanga,T., Logis,A.-...","Submitted (05-JUN-2023) Microbiology, Immunolo...",TAGATTGGTATAAGGATGTTGATAAGCTCTACGAGAATATATTGTT...,Monkeypox virus,genomic DNA,MPXV/UZ_REGA_224/Belgium/2022,Homo sapiens,taxon:10244,Belgium,12-Aug-2022,MiniMap2 v. 2.24; Medaka v. 1.6.0; Samtools v....,654x(average),Oxford Nanopore Technologies GridION
0,OR095056,Monkeypox virus isolate MPXV/UZ_REGA_238/Belgi...,"Vanmechelen,B., Wawina-Bokalanga,T., Logist,A....","Submitted (05-JUN-2023) Microbiology, Immunolo...",TGATGCAATTGTCTGACAACCTAGATTGGTATAAGGATGTTGATAA...,Monkeypox virus,genomic DNA,MPXV/UZ_REGA_238/Belgium/2022,Homo sapiens,taxon:10244,Belgium,12-Aug-2022,MiniMap2 v. 2.24; Medaka v. 1.6.0; Samtools v....,1021x(average),Oxford Nanopore Technologies GridION
0,OR095064,Monkeypox virus isolate MPXV/UZ_REGA_246/Belgi...,"Wawina-Bokalanga,T., Vanmechelen,B., Logist,A....","Submitted (05-JUN-2023) Microbiology, Immunolo...",TGATGCAATTGTCTGACAACCTAGATTGGTATAAGGATGTTGATAA...,Monkeypox virus,genomic DNA,MPXV/UZ_REGA_246/Belgium/2022,Homo sapiens,taxon:10244,Belgium,16-Aug-2022,MiniMap2 v. 2.24; Medaka v. 1.6.0; Samtools v....,1074x(average),Oxford Nanopore Technologies GridION
0,OR095041,Monkeypox virus isolate MPXV/UZ_REGA_223/Belgi...,"Vanmechelen,B., Wawina-Bokalanga,T., Logist,A....","Submitted (05-JUN-2023) Microbiology, Immunolo...",TCCATTGGATGGTGCATGTGGTGCTATATCTCTTCCGTTTATTATT...,Monkeypox virus,genomic DNA,MPXV/UZ_REGA_223/Belgium/2022,Homo sapiens,taxon:10244,Belgium,08-Aug-2022,MiniMap2 v. 2.24; Medaka v. 1.6.0; Samtools v....,518x(average),Oxford Nanopore Technologies GridION
0,OR095036,Monkeypox virus isolate MPXV/UZ_REGA_218/Belgi...,"Wawina-Bokalanga,T., Vanmechelen,B., Logist,A....","Submitted (05-JUN-2023) Microbiology, Immunolo...",TAGGAAACTCTAGAGGGTAAGAAAAATCAATCGTTTATAGAGACCA...,Monkeypox virus,genomic DNA,MPXV/UZ_REGA_218/Belgium/2022,Homo sapiens,taxon:10244,Belgium,08-Aug-2022,MiniMap2 v. 2.24; Medaka v. 1.6.0; Samtools v....,857x(average),Oxford Nanopore Technologies GridION
0,OR095047,Monkeypox virus isolate MPXV/UZ_REGA_229/Belgi...,"Vanmechelen,B., Wawina-Bokalanga,T., Logist,A....","Submitted (05-JUN-2023) Microbiology, Immunolo...",TAGGAAACTCTAGAGGGTAAGAAAAATCAATCGTTTATAGAGACCA...,Monkeypox virus,genomic DNA,MPXV/UZ_REGA_229/Belgium/2022,Homo sapiens,taxon:10244,Belgium,12-Aug-2022,MiniMap2 v. 2.24; Medaka v. 1.6.0; Samtools v....,1535x(average),Oxford Nanopore Technologies GridION
0,OR095025,Monkeypox virus isolate MPXV/UZ_REGA_207/Belgi...,"Wawina-Bokalanga,T., Vanmechelen,B., Logist,A....","Submitted (05-JUN-2023) Microbiology, Immunolo...",TAGGAAACTCTAGAGGGTAAGAAAAATCAATCGTTTATAGAGACCA...,Monkeypox virus,genomic DNA,MPXV/UZ_REGA_207/Belgium/2022,Homo sapiens,taxon:10244,Belgium,08-Aug-2022,MiniMap2 v. 2.24; Medaka v. 1.6.0; Samtools v....,1611x(average),Oxford Nanopore Technologies GridION
0,OR095055,Monkeypox virus isolate MPXV/UZ_REGA_237/Belgi...,"Vanmechelen,B., Wawina-Bokalanga,T., Logist,A....","Submitted (05-JUN-2023) Microbiology, Immunolo...",TAGGAAACTCTAGAGGGTAAGAAAAATCAATCGTTTATAGAGACCA...,Monkeypox virus,genomic DNA,MPXV/UZ_REGA_237/Belgium/2022,Homo sapiens,taxon:10244,Belgium,12-Aug-2022,MiniMap2 v. 2.24; Medaka v. 1.6.0; Salmtools v...,719x(average),Oxford Nanopore Technologies GridION
0,OR095028,Monkeypox virus isolate MPXV/UZ_REGA_210/Belgi...,"Wawina-Bokalanga,T., Vanmechelen,B., Logist,A....","Submitted (05-JUN-2023) Microbiology, Immunolo...",TAGGAAACTCTAGAGGGTAAGAAAAATCAATCGTTTATAGAGACCA...,Monkeypox virus,genomic DNA,MPXV/UZ_REGA_210/Belgium/2022,Homo sapiens,taxon:10244,Belgium,08-Aug-2022,MiniMap2 v. 2.24; Medaka v. 1.6.0; Samtools v....,1600x(average),Oxford Nanopore Technologies GridION
0,OR095024,Monkeypox virus isolate MPXV/UZ_REGA_206/Belgi...,"Wawina-Bokalanga,T., Vanmechelen,B., Logist,A....","Submitted (05-JUN-2023) Microbiology, Immunolo...",TAGGAAACTCTAGAGGGTAAGAAAAATCAATCGTTTATAGAGACCA...,Monkeypox virus,genomic DNA,MPXV/UZ_REGA_206/Belgium/2022,Homo sapiens,taxon:10244,Belgium,08-Aug-2022,MiniMap2 v. 2.24; Medaka v. 1.6.0; Samtools v....,1767x(average),Oxford Nanopore Technologies GridION


In [10]:
# 6 - Write tsv file with all info (except sequence)
genbank_df.drop(columns = ["gb_sequence"]).to_csv("original.tsv", index = False, encoding = "utf8", sep = "\t")

In [11]:
# Create FASTA file

fasta_file = open("original.fasta", "w")
for key, value in pd.Series(genbank_df.gb_sequence.values, index = genbank_df.isolate).to_dict().items():
    fasta_file.write(">" + key + "\n" + value + "\n")
fasta_file.close()

### Metadata Import and Editing

In [12]:
#Import tsv, output from the script number one

tsv= pd.read_csv('original.tsv', sep='\t', header=0)

In [14]:
#Get columns with data from tsv

gb_authors =  list(tsv["gb_authors"])
gb_lab =  list(tsv["gb_lab"])
isolate =  list(tsv["isolate"])
isolate_org =  list(tsv["isolate"])
#isolation_source =  list(tsv["isolation_source"])
country =  list(tsv["country"])
collection_date =  list(tsv["collection_date"])
assembly_method =  list(tsv["Assembly Method"])
sequencing_technology =  list(tsv["Sequencing Technology"])
host = list(tsv["host"])

In [16]:
#Handling Empty Lines in Columns
#If you encounter empty lines in the columns and the previous code gives you an error, you can use the following code snippet:

#collection_date = []
#for date in tsv["collection_date"]:
#    if isinstance(date, str) and date.strip():
#        collection_date.append(date.strip())
#    else:
#        collection_date.append("unknown")

In [19]:
# Creating Lists to Complete Columns
# edit your submitter name and fasta file name

unknown = list(["unknown"]* len(gb_authors))
submitter = list(["juanfinello"]* len(gb_authors))
FASTAfile = list(["sequence_curated.fasta"]* len(gb_authors))
isolation_source =  list(["unknown"]* len(gb_authors))
empty_col = []

In [20]:
#transform the collection date format

def convert_date(date):
    try:
        # Parse the date string using dateutil.parser
        parsed_date = parse(date)

        # Determine the format of the input date
        num_elements = len(date.split('-'))
        if '/' in date:
            num_elements = len(date.split('/'))

        # Format the datetime object according to the number of elements
        if num_elements == 3:
            formatted_date = parsed_date.strftime('%Y-%m-%d')
        elif num_elements == 2:
            formatted_date = parsed_date.strftime('%Y-%m')
        elif num_elements == 1:
            formatted_date = parsed_date.strftime('%Y')
        else:
            raise ValueError('Invalid date format')
        
        return formatted_date
    
    except ValueError:
        print(f"Invalid date format: {date}")
        return None

In [21]:
salida = []

for date in collection_date:
  salida.append(convert_date(date))

collection_date = salida
print(collection_date)

['2022-09-21', '2023-03-23', '2022-07', '2022-07', '2022-12', '2022-12', '2023-01', '2022-08', '2022-07', '2022-11', '2022-08', '2022-07', '2022-08', '2022-08', '2022-07', '2022-07', '2022-07', '2022-07', '2022-12', '2022-07', '2022-08', '2022-08', '2022-08', '2022-07', '2023-01-12', '2023-01-12', '2023-02-14', '2023-02-14']


In [22]:
#append year to virus names 

# Iterate over each element in the list
for i in range(len(collection_date)):
    # Check if the element is a date in the Y-M-D format
    try:
        datetime.strptime(collection_date[i], "%Y-%m-%d")
        # Extract the year from the date string
        year = str(datetime.strptime(collection_date[i], "%Y-%m-%d").year)
        # Check if the element already ends with "/Y" and a year
        if isolate[i].endswith("/" + year):
            continue  # if the element ends with the year skip it
        # Append the year to the end of the element
        isolate[i] += "/" + year
    except ValueError:
        # If the element is not a date in the Y-M-D format, skip it
        pass

#print(isolate)



In [23]:
#replace MPXV for hMpxV

for i in range(len(isolate)):
    if isolate[i].startswith("MPXV/") or isolate[i].startswith("MPXV22/") or isolate[i].startswith("MPXV23/") or isolate[i].startswith("HMPXV/"):
        isolate[i] = "hMpxV/" + isolate[i][5:]

print(isolate)

['HRYC_MPXV001/2022', 'HRYC_MPXV002/2023', 'MPXV_USA_2022_OR0009', 'MPXV_USA_2022_RI0006', 'MPXV_USA_2022_TN0019', 'MPXV_USA_2022_TX0052', 'MPXV_USA_2022_TX0053', 'MPXV_USA_2022_WV0001', 'MPXV_USA_2022_CT0005', 'MPXV_USA_2022_IN0004', 'MPXV_USA_2022_LA0012', 'MPXV_USA_2022_LA0013', 'MPXV_USA_2022_LA0014', 'MPXV_USA_2022_LA0015', 'MPXV_USA_2022_MI0011', 'MPXV_USA_2022_MI0012', 'MPXV_USA_2022_MN0014', 'MPXV_USA_2022_MN0016', 'MPXV_USA_2022_NC0003', 'MPXV_USA_2022_NE0004', 'MPXV_USA_2022_NE0005', 'MPXV_USA_2022_NE0006', 'MPXV_USA_2022_OK0006', 'MPXV_USA_2022_OR0002', 'hMpxV/USA/CACDPH1MPX1000172/2023', 'hMpxV/USA/CACDPH1MPX1000191/2023', 'hMpxV/USA/CACDPH1MPX1000193/2023', 'hMpxV/USA/CACDPH1MPX1000194/2023']


In [25]:
# Change host if "Homo sapiens"
hsapiens = 'homo sapiens'
new_string = 'Human'

for i in range(len(host)):
    if host[i].lower() == hsapiens.lower():
        host[i] = new_string

#print(host)

In [26]:
#remove "Submitter" and the date that appears in parentheses

# Define the substring and characters to delete
substring_to_delete = 'Submitted'

# Iterate over each string in the list
for i in range(len(gb_lab)):
    # Find the index of the substring to delete
    index_to_delete = gb_lab[i].find(substring_to_delete)
    
    # Delete the substring and parentheses if they exist
    if index_to_delete != -1:
        start_index = gb_lab[i].find('(')
        end_index = gb_lab[i].find(')', start_index) + 1
        gb_lab[i] = gb_lab[i][:start_index] + gb_lab[i][end_index:]
        gb_lab[i] = gb_lab[i][:index_to_delete] + gb_lab[i][index_to_delete+len(substring_to_delete):]

# Print the updated list
#print(gb_lab)

In [27]:
# Change USA state abbreviation for full word

# Define a dictionary mapping state abbreviations to full state names
state_dict = {'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland', 'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi', 'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York', 'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming'}

# Iterate over each string in the list
for i in range(len(country)):
    # Check if the string contains the word 'USA'
    if 'USA' in country[i]:
        # Replace each state abbreviation in the string with its full name
        for abbr, full_name in state_dict.items():
            country[i] = country[i].replace(abbr, full_name)
            
# Print the updated list
#print(country)


In [28]:
#Create final df with template structure

df = pd.DataFrame({
    'Submitter' : [submitter],
    'FASTA filename' : [FASTAfile],
    'Virus name' : [isolate],
    'Passage details/history' : [empty_col],
    'Collection date' : [collection_date],
    'Location': [country],
    'Additional location information': [empty_col],
    'Host': [host],
    'Additional Host information': [empty_col],
    'Sampling Strategy': [empty_col],
    'Gender': [unknown],
    'Patient age': [unknown],
    'Patient status': [unknown],
    'Specimen source': [isolation_source],
    'Outbreak': [empty_col],
    'Last vaccinated': [empty_col],
    'Treatment': [empty_col],
    'Sequencing technology' : [sequencing_technology],
    'Assembly method': [assembly_method],
    'Depth of coverage': [empty_col],
    'Originating lab': [gb_lab],
    'Address': [empty_col],
    'Sample ID given by the sample provider': [empty_col],
    'Submitting lab': [gb_lab],
    'Address 2': [empty_col],
    'Sample ID given by the submitting laboratory' : [isolate_org],
    'Authors' : [gb_authors],
    'Comment': [empty_col],
    'Comment Icon': [empty_col],
    
})

In [29]:

df2 = df.apply(pd.Series.explode)

In [30]:
df2 = df2.replace(np.nan, '', regex=True)

### Export data frame to Mpox template 

In [29]:
# Load the Excel file into a workbook object
book = openpyxl.load_workbook('template.xlsx')

In [30]:
# Select the sheet you want to copy the DataFrame to
sheet = book['Submissions']

In [31]:
# Specify the row to start copying the DataFrame to
start_row = 2

In [32]:
# Write the DataFrame to the sheet starting from the specified row
for r in dataframe_to_rows(df2, index=False, header=True):
    sheet.insert_rows(start_row)
    for c, val in enumerate(r, 1):
        sheet.cell(row=start_row, column=c, value=val)
    start_row += 1

In [33]:
# Save the updated Excel file
book.save('/metadata.xlsx')

### Change Virus Names

At this point, you need to go to the template1.xlsx file saved in the sequences_file folder and curate the data.

Edit the virus names in the pox_virus_name column and save the previous virus names in the pox_subm_sample_id column.

The following code will replace the old virus names in the FASTA file with the newly edited ones. Remember to save the changes in the metadata.xlsx file before continuing.


In [34]:
# Read the Excel file
excel_file = pd.ExcelFile('metadata.xlsx')

# Extract the desired columns from the second sheet
sheet_name = excel_file.sheet_names[1]  # Assuming the second sheet
df = excel_file.parse(sheet_name)
desired_columns = ['pox_subm_sample_id' , 'pox_virus_name']
extracted_data = df[desired_columns]

# Remove the second row (header) from each column
extracted_data = extracted_data.iloc[1:]

# Save the values in a text file
output_file = 'vnames_to_change.txt'
extracted_data.to_csv(output_file, sep='\t', header=False, index=False)

In [35]:
fasta= open('original.fasta', 'r') #encoding = 'utf-16')
newnames= open('vnames_to_change.txt', 'r')
newfasta= open('sequence_curated.fasta', 'w')

In [36]:
dic={}
for row in newnames:
    n_old = row.split('\t')[0].strip()
    n_new = row.split('\t')[1].strip()
    dic[n_old]=n_new
print (dic)


{'MPXV/UZ_REGA_224/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_224/2022', 'MPXV/UZ_REGA_238/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_238/2022', 'MPXV/UZ_REGA_246/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_246/2022', 'MPXV/UZ_REGA_223/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_223/2022', 'MPXV/UZ_REGA_218/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_218/2022', 'MPXV/UZ_REGA_229/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_229/2022', 'MPXV/UZ_REGA_207/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_207/2022', 'MPXV/UZ_REGA_237/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_237/2022', 'MPXV/UZ_REGA_210/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_210/2022', 'MPXV/UZ_REGA_206/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_206/2022', 'MPXV/UZ_REGA_226/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_226/2022', 'MPXV/UZ_REGA_221/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_221/2022', 'MPXV/UZ_REGA_217/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_217/2022', 'MPXV/UZ_REGA_216/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_216/2022', 'MPXV/UZ_REGA_248/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_248/2

In [37]:
dic



{'MPXV/UZ_REGA_224/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_224/2022',
 'MPXV/UZ_REGA_238/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_238/2022',
 'MPXV/UZ_REGA_246/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_246/2022',
 'MPXV/UZ_REGA_223/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_223/2022',
 'MPXV/UZ_REGA_218/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_218/2022',
 'MPXV/UZ_REGA_229/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_229/2022',
 'MPXV/UZ_REGA_207/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_207/2022',
 'MPXV/UZ_REGA_237/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_237/2022',
 'MPXV/UZ_REGA_210/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_210/2022',
 'MPXV/UZ_REGA_206/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_206/2022',
 'MPXV/UZ_REGA_226/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_226/2022',
 'MPXV/UZ_REGA_221/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_221/2022',
 'MPXV/UZ_REGA_217/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_217/2022',
 'MPXV/UZ_REGA_216/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_216/2022',
 'MPXV/UZ_REGA_248/Belgium/2022': 'hMpxV/Belgium

In [38]:
for line in fasta:
    if line.startswith('>'):
        line_name=line.split('>')[1]
        line_name=line_name.strip()  #去除特殊符号，例如空格，\t, \n等
        if line_name in dic:
            newname= dic[line_name]
            newname='>'+newname+'\n'
            newfasta.write(newname)
        else:
            print('Warning!!!   '+line_name+ '  not exist')
    else:
        newfasta.write(line)

fasta.close()
newnames.close()
newfasta.close()

##if not in order:
##use perl
#  type cd C:\Strawberry\perl\bin in command prompt
#then type perl Format_Fasta.pl input.fasta > output.fasta
## linearised the sequence
##  while read line ; do grep -A1 "^$line" iran_curated.fasta >> output.fasta ; done < modifying.txt

In [39]:
dic

{'MPXV/UZ_REGA_224/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_224/2022',
 'MPXV/UZ_REGA_238/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_238/2022',
 'MPXV/UZ_REGA_246/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_246/2022',
 'MPXV/UZ_REGA_223/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_223/2022',
 'MPXV/UZ_REGA_218/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_218/2022',
 'MPXV/UZ_REGA_229/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_229/2022',
 'MPXV/UZ_REGA_207/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_207/2022',
 'MPXV/UZ_REGA_237/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_237/2022',
 'MPXV/UZ_REGA_210/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_210/2022',
 'MPXV/UZ_REGA_206/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_206/2022',
 'MPXV/UZ_REGA_226/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_226/2022',
 'MPXV/UZ_REGA_221/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_221/2022',
 'MPXV/UZ_REGA_217/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_217/2022',
 'MPXV/UZ_REGA_216/Belgium/2022': 'hMpxV/Belgium/UZ_REGA_216/2022',
 'MPXV/UZ_REGA_248/Belgium/2022': 'hMpxV/Belgium