## Imports

In [2]:
import pandas as pd

## Metadata

In [4]:
import pandas as pd
import glob

# Collect the per-sample metadata (accession ID, lineage, collection date)
# from every TSV file in the raw-data directory into one DataFrame, `all_data`.

# Directory containing the TSV files
directory = '../data/raw/'

# Columns kept from each metadata file, in the order they appear in `all_data`
meta_columns = ['Accession ID', 'Lineage', 'Collection date']

# glob returns paths in arbitrary, filesystem-dependent order; sort them so the
# row order of the combined frame is identical on every run and machine.
tsv_paths = sorted(glob.glob(directory + '*.tsv'))
if not tsv_paths:
    # Without this guard pd.concat fails with "No objects to concatenate",
    # which says nothing about the actual cause.
    raise FileNotFoundError(f"No .tsv files found in {directory!r}")

# Parse only the columns that are needed instead of the whole table.
# usecols keeps the file's own column order, so re-select with meta_columns
# to enforce a consistent order across files.
data_frames = [
    pd.read_csv(filepath, sep='\t', usecols=meta_columns)[meta_columns]
    for filepath in tsv_paths
]

# Concatenate all DataFrames in the list, renumbering rows from 0
all_data = pd.concat(data_frames, ignore_index=True)

# Display the combined DataFrame
print(all_data)

           Accession ID  Lineage Collection date
0        EPI_ISL_467472      B.1      2020-04-01
1       EPI_ISL_1706558  B.1.351      2020-12-28
2       EPI_ISL_1706559  B.1.351      2020-12-31
3       EPI_ISL_1706552  B.1.1.7      2021-02-12
4       EPI_ISL_1706554  B.1.351      2020-12-23
...                 ...      ...             ...
43463  EPI_ISL_15976839     CQ.2      2022-11-07
43464  EPI_ISL_15976841     BE.7      2022-11-10
43465  EPI_ISL_16052099     BE.7      2022-11-17
43466  EPI_ISL_16052102     BE.7      2022-11-21
43467  EPI_ISL_16078592    AY.45      2021-08-19

[43468 rows x 3 columns]


## Genome Sequences

In [22]:
from Bio import SeqIO
import pandas as pd
import re
import os

# Build a DataFrame `df` with one row per FASTA record found in the raw-data
# directory: the accession ID (of the form EPI_ISL_<digits>) parsed from the
# record description, and the sequence as a plain string.

# Specify the directory containing the FASTA files
directory_path = '../data/raw/'

# Compiled once and reused for every record
accession_pattern = re.compile(r"EPI_ISL_\d+")

# os.listdir returns names in arbitrary order; sort them so the row order of
# the resulting frame is reproducible across runs and machines.
fasta_filenames = sorted(
    filename
    for filename in os.listdir(directory_path)
    if filename.endswith((".fasta", ".fa"))
)

# Initialize a list to store the data
sequences_data = []

for filename in fasta_filenames:
    # Construct the full file path
    file_path = os.path.join(directory_path, filename)

    # Read the FASTA file and extract relevant information
    for seq_record in SeqIO.parse(file_path, "fasta"):
        accession_id_match = accession_pattern.search(seq_record.description)
        sequences_data.append({
            # Records whose description carries no accession ID are kept,
            # with None as their ID.
            "Accession ID": accession_id_match.group(0) if accession_id_match else None,
            "Sequence": str(seq_record.seq),  # Convert Seq object to string
        })

# Creating a DataFrame from the list. The columns are given explicitly so the
# frame still has both columns when no FASTA records were found.
df = pd.DataFrame(sequences_data, columns=["Accession ID", "Sequence"])

# Display the DataFrame
display(df)

Unnamed: 0,Accession ID,Sequence
0,EPI_ISL_467472,CCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAA...
1,EPI_ISL_515573,ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGA...
2,EPI_ISL_515873,CAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAA...
3,EPI_ISL_490274,CAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGA...
4,EPI_ISL_490277,CAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAA...
...,...,...
43463,EPI_ISL_18713739,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...
43464,EPI_ISL_18713740,TCNCNCNGTNNANNNNNTAANTANTNNNTGTNNNNNNNNNNNNNNN...
43465,EPI_ISL_18713741,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...
43466,EPI_ISL_18713742,TTTAAAATCTGTGTNGCTGTCACTCGGCTGCATGCTTAGNGCACTC...


In [23]:
# Attach each sequence in `df` to its metadata row in `all_data`, matching on
# the accession ID. The inner join keeps only IDs present in both frames.
data = all_data.merge(df, how='inner', on='Accession ID')  # Change 'how' as needed
data

Unnamed: 0,Accession ID,Lineage,Collection date,Sequence
0,EPI_ISL_467472,B.1,2020-04-01,CCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAA...
1,EPI_ISL_1706558,B.1.351,2020-12-28,CTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCT...
2,EPI_ISL_1706559,B.1.351,2020-12-31,ACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATC...
3,EPI_ISL_1706552,B.1.1.7,2021-02-12,ACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATC...
4,EPI_ISL_1706554,B.1.351,2020-12-23,ACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATC...
...,...,...,...,...
43463,EPI_ISL_15976839,CQ.2,2022-11-07,GTGGCTGTCACTCGGCTGCATGCTNAGTGCACTCACGCAGTATAAT...
43464,EPI_ISL_15976841,BE.7,2022-11-10,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...
43465,EPI_ISL_16052099,BE.7,2022-11-17,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...
43466,EPI_ISL_16052102,BE.7,2022-11-21,TTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTC...


In [24]:
# Write the merged table to disk. The row index is written as well (the
# default), so the file starts with an unnamed index column.
data.to_csv('../data/genomes.csv', index=True)