In [7]:
import csv
import numpy as np
import pandas as pd
from tqdm import tqdm

## Exons and Introns Binary Classification

In [None]:
# Load the two base tables: the DNA sequence table and the exin
# (exon/intron) sequence table.
dna_df, exin_df = map(pd.read_csv, (
	"./storage/data/base/dna_sequences.csv",
	"./storage/data/base/exin_sequences.csv",
))

In [None]:
# Replace every missing value in both tables with an empty string.
dna_df, exin_df = (frame.fillna("") for frame in (dna_df, exin_df))

In [None]:
# Right join: keep every exin row and attach the DNA record whose "accession"
# equals that row's "dna_accession". Columns present in both tables receive
# the pandas default suffixes (_x for dna_df, _y for exin_df).
merged_df = dna_df.merge(
	exin_df,
	how="right",
	left_on="accession",
	right_on="dna_accession",
)

In [None]:
# Keep the rows whose exin sequence ("sequence_y", the exin_df side of the
# merge) is shorter than 950 characters, rename the model columns, and keep
# only the columns that are written out.
filtered_df = (
	merged_df.loc[merged_df["sequence_y"].str.len() < 950]
	.rename(columns={"sequence_y": "sequence", "type": "target"})
	.loc[:, ["sequence", "target", "organism", "gene", "before", "after"]]
)

filtered_df.to_csv("./storage/data/processed/exin-950.csv", index=False)

In [None]:
# Same export as the 950 variant, with the exin sequence ("sequence_y")
# limited to fewer than 512 characters.
filtered_df = (
	merged_df.loc[merged_df["sequence_y"].str.len() < 512]
	.rename(columns={"sequence_y": "sequence", "type": "target"})
	.loc[:, ["sequence", "target", "organism", "gene", "before", "after"]]
)

filtered_df.to_csv("./storage/data/processed/exin-512.csv", index=False)

In [None]:
# Same export as the 950 variant, with the exin sequence ("sequence_y")
# limited to fewer than 256 characters.
filtered_df = (
	merged_df.loc[merged_df["sequence_y"].str.len() < 256]
	.rename(columns={"sequence_y": "sequence", "type": "target"})
	.loc[:, ["sequence", "target", "organism", "gene", "before", "after"]]
)

filtered_df.to_csv("./storage/data/processed/exin-256.csv", index=False)

## Triplet Classification

In [2]:
# Reload the DNA sequence table and the exin (exon/intron) sequence table
# for the triplet task.
# NOTE(review): the saved output under this cell looks like the tail of a
# pandas warning raised by the exin read — confirm, and consider passing an
# explicit dtype= if column types matter downstream.
dna_df, exin_df = map(pd.read_csv, (
	"./storage/data/base/dna_sequences.csv",
	"./storage/data/base/exin_sequences.csv",
))

  exin_df = pd.read_csv("./storage/data/base/exin_sequences.csv")


In [3]:
# Replace every missing value in both tables with an empty string.
dna_df, exin_df = (frame.fillna("") for frame in (dna_df, exin_df))

In [6]:
# Right join: one output row per exin row, carrying the DNA record whose
# "accession" equals the row's "dna_accession". Shared column names get the
# pandas default suffixes (_x for dna_df, _y for exin_df).
merged_df = dna_df.merge(
	exin_df,
	how="right",
	left_on="accession",
	right_on="dna_accession",
)

In [8]:
def build_target(seq, exins):
	"""Build a per-position label string for ``seq`` from its exin annotations.

	Every position starts as "N". For each row of ``exins``, in row order, the
	slice ``[start:end]`` is overwritten with "E" when the row's ``type``
	starts with "exon" (case-insensitive) or with "I" when it starts with
	"intron". Rows of any other type leave the labels untouched, and later
	rows overwrite earlier ones where their ranges overlap.

	Args:
		seq: The sequence to label; only its length is used.
		exins: DataFrame with ``start``, ``end`` and ``type`` columns, one row
			per annotated region. ``start`` and ``end`` must be accepted by
			``int()``; ``type`` must be a string.

	Returns:
		A string made of "N", "E" and "I" with the same length as ``seq``.

	NOTE(review): ``start``/``end`` are applied as a Python slice (0-based,
	end-exclusive) — confirm this matches the coordinate convention of the
	annotation data.
	"""
	# Explicit one-character string dtype: np.array([]) would default to
	# float64 for an empty sequence and reject the "E"/"I" assignments below.
	target = np.full(len(seq), "N", dtype="<U1")

	# Iterating the three columns directly avoids building a Series per row,
	# which is what iterrows() does.
	for start, end, etype in zip(exins["start"], exins["end"], exins["type"]):
		start, end = int(start), int(end)
		kind = etype.lower()
		if kind.startswith("exon"):
			target[start:end] = "E"
		elif kind.startswith("intron"):
			target[start:end] = "I"

	return "".join(target)

In [12]:
# Keep rows whose DNA sequence ("sequence_x", the dna_df side of the merge)
# is shorter than 2000 characters.
filtered_df = merged_df.loc[merged_df["sequence_x"].str.len().lt(2000)]

In [13]:
# Build one output row per accession: the DNA sequence, its per-position
# N/E/I label string from build_target, and the organism.
results = []
for accession, group in filtered_df.groupby("accession"):
	# All rows of a group share the accession; the sequence ("sequence_x",
	# the dna_df side of the merge) and organism are read from the first row.
	seq = group.iloc[0]["sequence_x"]
	organism = group.iloc[0]["organism"]
	# The whole group is passed so every annotated region contributes.
	target = build_target(seq, group)
	results.append({
		"sequence": seq,
		"target": target,
		"organism": organism
	})

result_df = pd.DataFrame(results)

# index=False matches the other exports in this notebook and keeps the
# meaningless RangeIndex out of the file as an unnamed first column.
result_df.to_csv("./storage/data/processed/triplet-2000.csv", index=False)

## DNA Translator

In [None]:
# Load the DNA sequence table and the CDS sequence table.
dna_df, cds_df = map(pd.read_csv, (
	"./storage/data/base/dna_sequences.csv",
	"./storage/data/base/cds_sequences.csv",
))

In [None]:
# Replace every missing value in both tables with an empty string.
dna_df, cds_df = (frame.fillna("") for frame in (dna_df, cds_df))

In [None]:
# Right join: keep every CDS row and attach the DNA record whose "accession"
# equals the row's "dna_accession". Shared column names get the pandas
# default suffixes (_x for dna_df, _y for cds_df).
merged_df = dna_df.merge(
	cds_df,
	how="right",
	left_on="accession",
	right_on="dna_accession",
)

In [None]:
# Keep pairs whose DNA sequence ("sequence_x") and CDS sequence
# ("sequence_y") together are shorter than 1000 characters, then expose them
# as the model's "sequence" and "target" columns alongside the organism.
filtered_df = (
	merged_df.loc[
		(merged_df["sequence_x"].str.len() + merged_df["sequence_y"].str.len()) < 1000
	]
	.rename(columns={"sequence_x": "sequence", "sequence_y": "target"})
	.loc[:, ["sequence", "target", "organism"]]
)

filtered_df.to_csv("./storage/data/processed/dna-1000.csv", index=False)