In [1]:
import csv
from itertools import islice
from typing import Iterator

from tqdm import tqdm

from data_processor.raw_extractor import extract_data
from schemas.tables_data import DNASequence

In [None]:
# Input handed to extract_data(): the annotations file to parse
# (a GenBank flat file, judging by the path -- confirm).
annotations_file_path = "./storage/datasets/genbank/gb_curated.gb"
# Output CSVs produced by the export cells further down, one file per record kind.
dna_file = "./storage/data/base/dna_sequences.csv"
exin_file = "./storage/data/base/exin_sequences.csv"
cds_file = "./storage/data/base/cds_sequences.csv"

In [3]:
# Build the record stream from the annotations file.
# NOTE(review): the export loop iterates this object once; if it is a one-shot
# iterator (as the name suggests -- confirm), re-run this cell before
# re-running the export.
generator = extract_data(annotations_file_path=annotations_file_path)

In [4]:
def chunk_generator(
	generator: Iterator[DNASequence],
	chunk_size: int = 2000
) -> Iterator[list[DNASequence]]:
	"""Yield the items of ``generator`` in consecutive lists of ``chunk_size``.

	Items are pulled lazily, one batch at a time, so the source is never
	materialised in full. Every batch has exactly ``chunk_size`` items except
	possibly the last, which holds whatever remains. An empty source yields
	no batches at all.

	Args:
		generator: Source of items; any iterable is accepted because it is
			passed through ``iter()``.
		chunk_size: Maximum number of items per batch; must be at least 1.

	Yields:
		Non-empty lists of items, in source order.

	Raises:
		ValueError: If ``chunk_size`` is smaller than 1. Because this is a
			generator function, the error surfaces when iteration starts,
			not when the function is called.
	"""
	# A size of 0 would make islice() return an empty batch immediately, which
	# ends the loop below and silently drops every item; reject it explicitly.
	if chunk_size < 1:
		raise ValueError(f"chunk_size must be >= 1, got {chunk_size!r}")

	iterator = iter(generator)
	# islice() advances the shared iterator, so each pass picks up where the
	# previous batch stopped; an empty list means the source is exhausted.
	while batch := list(islice(iterator, chunk_size)):
		yield batch

In [5]:
# Create (or truncate) the three output CSVs and write only their header rows.
# newline="" leaves line-ending handling to the csv module.
with open(dna_file, "w", newline="") as f_dna, \
	open(exin_file, "w", newline="") as f_exin, \
	open(cds_file, "w", newline="") as f_cds:

	dna_writer = csv.DictWriter(
		f_dna,
		fieldnames=["sequence", "accession", "organism"],
	)
	exin_writer = csv.DictWriter(
		f_exin,
		fieldnames=[
			"dna_accession", "sequence", "type", "start", "end",
			"gene", "strand", "before", "after",
		],
	)
	cds_writer = csv.DictWriter(
		f_cds,
		fieldnames=["dna_accession", "sequence", "type", "start", "end", "gene"],
	)

	# Headers go out in the same order the files were opened: dna, exin, cds.
	for header_writer in (dna_writer, exin_writer, cds_writer):
		header_writer.writeheader()

In [6]:
# Stream every record into the three CSVs. Files are opened in append mode, so
# this cell adds rows after whatever is already in them and writes no header
# itself.
#
# (sequence, organism) pairs whose DNA row has already been written.
# NOTE(review): the keys hold the sequence objects themselves, so every unique
# sequence stays referenced for the whole run -- watch memory on large inputs.
seen_dna = set()

# The files and writers do not change between chunks, so they are created once
# here rather than once per chunk.
with \
	open(dna_file, "a", newline="") as f_dna, \
	open(exin_file, "a", newline="") as f_exin, \
	open(cds_file, "a", newline="") as f_cds:

	dna_writer = csv.DictWriter(f_dna, fieldnames=["sequence", "accession", "organism"])
	exin_writer = csv.DictWriter(f_exin, fieldnames=["dna_accession", "sequence", "type", "start", "end", "gene", "strand", "before", "after"])
	cds_writer = csv.DictWriter(f_cds, fieldnames=["dna_accession", "sequence", "type", "start", "end", "gene"])

	for chunk in tqdm(chunk_generator(
		generator=generator, chunk_size=2000
	)):
		for dna in chunk:
			# Write the DNA row only the first time this (sequence, organism)
			# pair shows up.
			key = (dna["sequence"], dna["organism"])
			if key not in seen_dna:
				dna_writer.writerow({
					"sequence": dna["sequence"],
					"accession": dna["accession"],
					"organism": dna["organism"]
				})
				seen_dna.add(key)

			# NOTE(review): exin and cds rows are written for every record,
			# including those whose DNA row was skipped as a duplicate, so
			# their dna_accession may not appear in the DNA CSV -- confirm
			# this is intended.
			# Each exin/cds entry is assumed to be a mapping whose keys are
			# among the writer's fieldnames -- TODO confirm against the schema.
			for exin in dna["exin"]:
				exin_writer.writerow({
					"dna_accession": dna["accession"],
					**exin
				})

			for cds in dna["cds"]:
				cds_writer.writerow({
					"dna_accession": dna["accession"],
					**cds
				})

		# Flush after each chunk so finished chunks are handed to the OS even
		# if a later chunk fails or the kernel is interrupted.
		for handle in (f_dna, f_exin, f_cds):
			handle.flush()

1227it [1:24:39,  4.14s/it]
