In [None]:
import json
from itertools import islice
from typing import Iterator

from data_processor.raw_extractor import extract_data
from datasets import Dataset
from schemas.tables_data import DNASequence
from tqdm import tqdm

In [None]:
# Input: annotations file handed to the extractor.
annotations_file_path = "./storage/datasets/genbank/gb_curated.gb"

# Outputs: local JSONL dump and the Hub repository the dataset is pushed to.
OUTPUT_JSONL = "./storage/data/base/dna_coding_regions.jsonl"
HF_DATASET_NAME = "gu-dudi/DNA_coding_regions"

# Number of extracted records grouped into one processing batch.
CHUNK_SIZE = 2_000

In [None]:
# Build the record stream from the annotations file.
generator = extract_data(annotations_file_path=annotations_file_path)

In [None]:
def chunk_generator(
	generator: Iterator[DNASequence],
	chunk_size: int = 2000
) -> Iterator[list[DNASequence]]:
	"""Split a stream of records into consecutive lists of bounded size.

	Args:
		generator: Source of records. It is consumed lazily, one chunk at a time.
		chunk_size: Maximum number of records per chunk. Must be at least 1.

	Returns:
		An iterator of non-empty lists. Every list holds ``chunk_size`` records
		except possibly the last one, which holds the remainder.

	Raises:
		ValueError: If ``chunk_size`` is smaller than 1. Without this check a
			size of 0 would silently yield nothing and drop the whole input.
	"""
	if chunk_size < 1:
		raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

	def _batches():
		# A single shared iterator guarantees each record lands in exactly one chunk.
		iterator = iter(generator)
		# An empty batch means the source is exhausted.
		while batch := list(islice(iterator, chunk_size)):
			yield batch

	return _batches()

In [None]:
# Deduplication state: an accession, or an identical (sequence, organism) pair,
# is written to the output file at most once.
seen_accession = set()
seen_seq_org = set()

# One counter per reason a record can be dropped.
skipped_accession = 0
skipped_seq_org = 0
skipped_invalid = 0

# Build a fresh extraction stream for this run: iterators are single-use, so a
# stream drained by a previous execution of this cell would silently truncate
# the output file to nothing.
# NOTE(review): assumes extract_data is cheap to call again — confirm.
dna_records = extract_data(annotations_file_path=annotations_file_path)

with open(OUTPUT_JSONL, "w", encoding="utf-8") as f_out:
	for chunk in tqdm(chunk_generator(dna_records, chunk_size=CHUNK_SIZE), desc="Processing DNA"):
		for dna in chunk:
			acc = dna["accession"].strip()
			org = dna["organism"].strip()
			seq = dna["sequence"].strip()
			key_seq_org = (seq, org)

			if acc in seen_accession:
				skipped_accession += 1
				continue

			if key_seq_org in seen_seq_org:
				skipped_seq_org += 1
				continue

			# `or []` also covers a key that is present but holds None.
			exin_list = dna.get("exin") or []
			protein_list = dna.get("cds") or []

			# Validate BEFORE registering the dedup keys, so a record without any
			# annotation cannot shadow a later, valid record that shares its
			# accession or its sequence+organism pair.
			if not exin_list and not protein_list:
				skipped_invalid += 1
				continue

			seen_accession.add(acc)
			seen_seq_org.add(key_seq_org)

			record = {
				"accession": acc,
				"organism": org,
				"sequence": seq,
				"introns": [],
				"exons": [],
				"proteins": [
					{
						"sequence": protein["sequence"].strip(),
						"start": protein["start"],
						"end": protein["end"],
						"gene": protein.get("gene", "")
					}
					for protein in protein_list
				]
			}

			for exin in exin_list:
				item = {
					"sequence": exin["sequence"].strip(),
					"start": exin["start"],
					"end": exin["end"],
					"gene": exin.get("gene", ""),
					"before": exin.get("before"),
					"after": exin.get("after")
				}
				# Entries whose type is neither EXON nor INTRON are ignored.
				if exin["type"] == "EXON":
					record["exons"].append(item)
				elif exin["type"] == "INTRON":
					record["introns"].append(item)

			# One JSON object per line (JSONL); keep non-ASCII characters readable.
			f_out.write(json.dumps(record, ensure_ascii=False) + "\n")

In [None]:
# Report every reason a record was dropped, including the annotation-less
# ones that the processing loop counts in `skipped_invalid`.
print(f"Skipped {skipped_accession} duplicate accessions.")
print(f"Skipped {skipped_seq_org} duplicate sequence+organism entries.")
print(f"Skipped {skipped_invalid} entries without exon/intron or CDS annotations.")

In [None]:
def jsonl_to_dataset(
	path,
	sample_limit = None
) -> Dataset:
	"""Load a JSON Lines file into an in-memory ``Dataset``.

	Args:
		path: Location of a UTF-8 encoded file holding one JSON object per line.
		sample_limit: Maximum number of records to load. ``None`` (the default)
			loads every record; ``0`` yields an empty dataset.

	Returns:
		A ``Dataset`` built from the parsed records, in file order.

	Raises:
		json.JSONDecodeError: If a non-blank line is not valid JSON.
	"""
	records = []
	with open(path, "r", encoding="utf-8") as f:
		for line in f:
			# Compare against None explicitly so that a limit of 0 is honoured
			# instead of being mistaken for "no limit".
			if sample_limit is not None and len(records) >= sample_limit:
				break
			# Tolerate blank lines (e.g. a trailing newline at end of file).
			if not line.strip():
				continue
			records.append(json.loads(line))

	return Dataset.from_list(records)

In [None]:
# Load the complete JSONL dump (no sample limit) into a Dataset.
dataset = jsonl_to_dataset(path=OUTPUT_JSONL)

In [None]:
# Sanity check: number of records loaded from the JSONL file.
print(f"Dataset Len: {len(dataset)}")

In [None]:
# Upload the dataset to the Hub repository named in the configuration cell.
# NOTE(review): presumably requires prior Hub authentication — confirm before running.
dataset.push_to_hub(repo_id=HF_DATASET_NAME)