In [None]:
import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset
from pandas import DataFrame

# Load Dataset From HuggingFace Hub

In [None]:
# Fetch the "train" split of the DNA coding-regions dataset from the Hub.
dataset = load_dataset(
	path="GustavoHCruz/DNA_Coding_Regions",
	split="train",
)

In [None]:
# Fail fast unless the loaded object is a single `Dataset`, which is what
# the to_pandas() call below is made on.
assert isinstance(dataset, Dataset)

# Convert to a pandas DataFrame; the processing cells below read from `df`.
df = dataset.to_pandas()

## Exons and Introns Binary Classification

In [None]:
assert isinstance(df, DataFrame)

# Regions whose sequence has this many characters or more are dropped.
max_len = 256

# (source column, label) pairs. Introns come first so that, for every input
# row, intron records are still emitted before exon records.
region_kinds = (("introns", "INTRON"), ("exons", "EXON"))

# Collect plain dict records and build the DataFrame once at the end.
records = []
for _, row in df.iterrows():
	org = row["organism"]

	for column, label in region_kinds:
		for region in row[column]:
			seq = region.get("sequence", "")

			if len(seq) >= max_len:
				continue

			records.append({
				"sequence": seq,
				"target": label,
				"organism": org,
				"gene": region.get("gene", ""),
				"before": region.get("before", ""),
				"after": region.get("after", "")
			})

final_df = pd.DataFrame(records)

final_df.to_csv("./storage/data/processed/exin-256.csv", index=False)

In [None]:
assert isinstance(df, DataFrame)

# Regions whose sequence has this many characters or more are dropped.
max_len = 512

# (source column, label) pairs. Introns come first so that, for every input
# row, intron records are still emitted before exon records.
region_kinds = (("introns", "INTRON"), ("exons", "EXON"))

# Collect plain dict records and build the DataFrame once at the end.
records = []
for _, row in df.iterrows():
	org = row["organism"]

	for column, label in region_kinds:
		for region in row[column]:
			seq = region.get("sequence", "")

			if len(seq) >= max_len:
				continue

			records.append({
				"sequence": seq,
				"target": label,
				"organism": org,
				"gene": region.get("gene", ""),
				"before": region.get("before", ""),
				"after": region.get("after", "")
			})

final_df = pd.DataFrame(records)

final_df.to_csv("./storage/data/processed/exin-512.csv", index=False)

## Nucleotide Classification

In [None]:
def build_target(
	seq: str,
	introns: list,
	exons: list
) -> str:
	"""Build a per-character label string for ``seq``.

	Every position starts as ``"U"``. Positions covered by an exon become
	``"E"`` and positions covered by an intron become ``"I"``; introns are
	applied after exons, so an intron wins wherever the two overlap.

	Args:
		seq: The sequence to label; only its length is used.
		introns: Mappings with ``"start"`` and ``"end"`` keys, used directly
			as the bounds of a slice ``[start:end]``.
		exons: Mappings with the same ``"start"`` / ``"end"`` keys.

	Returns:
		A string of ``len(seq)`` characters drawn from ``"U"``, ``"E"``, ``"I"``.

	Regions missing ``"start"`` or ``"end"`` are skipped.
	"""
	# Explicit one-character string dtype: building the array from an empty
	# list would default to float64 when ``seq`` is empty.
	target = np.full(len(seq), "U", dtype="<U1")

	# Exons are painted first so that introns overwrite them on overlap.
	for regions, label in ((exons, "E"), (introns, "I")):
		for region in regions:
			start = region.get("start")
			end = region.get("end")

			# A None bound would make the slice open-ended and relabel far
			# more than the region itself, so skip incomplete annotations.
			if start is None or end is None:
				continue

			target[start:end] = label

	return "".join(target)

In [None]:
assert isinstance(df, DataFrame)

# Only sequences strictly shorter than this many characters are kept.
max_seq_len = 500

# Filter into a new frame. Rebinding `df` here would silently restrict every
# later cell that reads `df` to these short sequences.
short_df = df[df["sequence"].str.len() < max_seq_len]

results = []
for _, row in short_df.iterrows():
	seq = row["sequence"]
	org = row["organism"]
	introns = row["introns"]
	exons = row["exons"]

	# Nothing to label when the record has no annotated regions at all.
	if len(introns) == 0 and len(exons) == 0:
		continue

	results.append({
		"sequence": seq,
		"target": build_target(seq, introns, exons),
		"organism": org
	})

result_df = pd.DataFrame(results)
result_df.to_csv("./storage/data/processed/nucl-500.csv", index=False)

## DNA Translator

In [None]:
def join_proteins(protein_list) -> str:
	"""Concatenate the ``"sequence"`` strings of a collection of protein records.

	Args:
		protein_list: An iterable of mappings, each optionally holding a
			``"sequence"`` entry, or ``None``.

	Returns:
		All string-valued ``"sequence"`` entries joined in order with no
		separator. Entries whose value is not a ``str`` are skipped, and a
		``None`` or empty input yields ``""``.
	"""
	# Compare against None explicitly instead of relying on truthiness, which
	# is ambiguous for some containers (e.g. NumPy arrays).
	if protein_list is None:
		return ""

	return "".join(
		seq
		for seq in (p.get("sequence", "") for p in protein_list)
		if isinstance(seq, str)
	)

In [None]:
assert isinstance(df, DataFrame)

# Upper bound (exclusive) on DNA length plus joined-protein length.
max_total_len = 500

# Add the target column on a copy so the shared `df` is not mutated; writing
# into `df` directly also warns when `df` is itself a filtered slice.
translated_df = df.assign(
	target=df["proteins"].apply(join_proteins).astype(str)
)

mask = (
	translated_df["sequence"].str.len() + translated_df["target"].str.len()
) < max_total_len
filtered_df = translated_df[mask]

final_df = filtered_df[["sequence", "target", "organism"]]

final_df.to_csv("./storage/data/processed/tran-500.csv", index=False)

In [None]:
assert isinstance(df, DataFrame)

# Upper bound (exclusive) on DNA length plus joined-protein length.
max_total_len = 1000

# Add the target column on a copy so the shared `df` is not mutated; writing
# into `df` directly also warns when `df` is itself a filtered slice.
translated_df = df.assign(
	target=df["proteins"].apply(join_proteins).astype(str)
)

mask = (
	translated_df["sequence"].str.len() + translated_df["target"].str.len()
) < max_total_len
filtered_df = translated_df[mask]

final_df = filtered_df[["sequence", "target", "organism"]]

final_df.to_csv("./storage/data/processed/tran-1000.csv", index=False)

In [None]:
assert isinstance(df, DataFrame)

# Upper bound (exclusive) on DNA length plus joined-protein length.
max_total_len = 8000

# Add the target column on a copy so the shared `df` is not mutated; writing
# into `df` directly also warns when `df` is itself a filtered slice.
translated_df = df.assign(
	target=df["proteins"].apply(join_proteins).astype(str)
)

mask = (
	translated_df["sequence"].str.len() + translated_df["target"].str.len()
) < max_total_len
filtered_df = translated_df[mask]

final_df = filtered_df[["sequence", "target", "organism"]]

final_df.to_csv("./storage/data/processed/tran-8000.csv", index=False)

In [None]:
assert isinstance(df, DataFrame)

# Upper bound (exclusive) on DNA length plus joined-protein length.
max_total_len = 16000

# Add the target column on a copy so the shared `df` is not mutated; writing
# into `df` directly also warns when `df` is itself a filtered slice.
translated_df = df.assign(
	target=df["proteins"].apply(join_proteins).astype(str)
)

mask = (
	translated_df["sequence"].str.len() + translated_df["target"].str.len()
) < max_total_len
filtered_df = translated_df[mask]

final_df = filtered_df[["sequence", "target", "organism"]]

final_df.to_csv("./storage/data/processed/tran-16000.csv", index=False)