In [1]:
import csv
import numpy as np
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm

## Exons and Introns Binary Classification

In [None]:
# Read the two base tables used by this section.
# keep_default_na=False makes pandas keep empty / "NA"-like fields as literal
# strings instead of converting them to NaN.
dna_df = pd.read_csv(
	"./storage/data/base/dna_sequences.csv",
	keep_default_na=False,
)
exin_df = pd.read_csv(
	"./storage/data/base/exin_sequences.csv",
	keep_default_na=False,
)

In [None]:
# Replace any missing values that remain in either table with empty strings.
dna_df, exin_df = dna_df.fillna(""), exin_df.fillna("")

In [None]:
# Right join: every exin_df row is kept and paired with the dna_df row whose
# "accession" equals its "dna_accession". Column names present in both tables
# get the pandas default suffixes: _x (dna_df side) and _y (exin_df side).
merged_df = dna_df.merge(
	exin_df,
	how="right",
	left_on="accession",
	right_on="dna_accession",
)

In [None]:
# Keep rows whose exin-side sequence is shorter than 950 characters, rename
# the sequence/type columns to the sequence/target names used in the export,
# and keep only the exported columns.
filtered_df = (
	merged_df[merged_df["sequence_y"].str.len() < 950]
	.rename(columns={"sequence_y": "sequence", "type": "target"})
	.loc[:, ["sequence", "target", "organism", "gene", "before", "after"]]
)

filtered_df.to_csv("./storage/data/processed/exin-950.csv", index=False)

In [None]:
# Same export as a single pipeline, with a 512-character cutoff on the
# exin-side sequence: filter, rename to sequence/target, select columns.
filtered_df = (
	merged_df[merged_df["sequence_y"].str.len() < 512]
	.rename(columns={"sequence_y": "sequence", "type": "target"})
	.loc[:, ["sequence", "target", "organism", "gene", "before", "after"]]
)

filtered_df.to_csv("./storage/data/processed/exin-512.csv", index=False)

In [None]:
# Exin-side sequences shorter than 256 characters: filter, rename to
# sequence/target, select the exported columns, then write without the index.
filtered_df = (
	merged_df[merged_df["sequence_y"].str.len() < 256]
	.rename(columns={"sequence_y": "sequence", "type": "target"})
	.loc[:, ["sequence", "target", "organism", "gene", "before", "after"]]
)

filtered_df.to_csv("./storage/data/processed/exin-256.csv", index=False)

## Nucleotide Classification

In [2]:
# Reload both base tables for this section. keep_default_na=False keeps
# empty fields as "" rather than NaN.
dna_df = pd.read_csv(
	"./storage/data/base/dna_sequences.csv",
	keep_default_na=False,
)
exin_df = pd.read_csv(
	"./storage/data/base/exin_sequences.csv",
	keep_default_na=False,
)

In [3]:
# Attach the matching dna_df row (accession == dna_accession) to every
# exin_df row. After the join, "sequence_x" is the dna_df sequence and
# "sequence_y" the exin_df one (default pandas suffixes).
merged_df = dna_df.merge(
	exin_df,
	how="right",
	left_on="accession",
	right_on="dna_accession",
)

In [None]:
def build_target(
	seq: str,
	exins: DataFrame
) -> str:
	"""Build a per-position label string for ``seq``.

	Every position starts as "U". For each row of ``exins``, the positions
	``start:end`` (Python slice semantics, so ``end`` itself is excluded) are
	set to "E" when ``type`` starts with "exon" and to "I" when it starts with
	"intron" (case-insensitive). Rows of any other type are ignored. Rows are
	applied in order, so where ranges overlap the later row wins.

	Args:
		seq: The sequence to label; only its length is used.
		exins: Table with "start", "end" and "type" columns, one row per
			annotated region.

	Returns:
		A string of the same length as ``seq`` made of "U", "E" and "I".
	"""
	# NOTE(review): start/end are used directly as 0-based, end-exclusive slice
	# bounds -- confirm this matches the coordinate convention of the data.

	# np.full with an explicit dtype keeps this a 1-character string array even
	# when seq is empty; np.array([]) would default to float64 and then reject
	# the "E"/"I" assignments below with a ValueError.
	target = np.full(len(seq), "U", dtype="<U1")

	# Iterating the three columns directly avoids building a Series per row.
	for start, end, etype in zip(exins["start"], exins["end"], exins["type"]):
		start, end = int(start), int(end)
		kind = etype.lower()
		if kind.startswith("exon"):
			target[start:end] = "E"
		elif kind.startswith("intron"):
			target[start:end] = "I"

	return "".join(target)

In [None]:
# Keep only annotation rows whose dna_df-side sequence is shorter than 2000 characters.
filtered_df = merged_df[merged_df["sequence_x"].str.len() < 2000]

# One output record per accession: the sequence, its per-position label
# string from build_target, and the organism.
results = []
for _, group in filtered_df.groupby("accession"):
	# sequence_x and organism are read from the group's first row.
	first = group.iloc[0]
	results.append({
		"sequence": first["sequence_x"],
		"target": build_target(first["sequence_x"], group),
		"organism": first["organism"]
	})

result_df = pd.DataFrame(results)

# index=False keeps the auto-generated row index out of the file, in line
# with the exin-*, prot-* and dna-* exports in this notebook.
result_df.to_csv("./storage/data/processed/nucl-2000.csv", index=False)

In [5]:
# Keep only annotation rows whose dna_df-side sequence is shorter than 1000 characters.
filtered_df = merged_df[merged_df["sequence_x"].str.len() < 1000]

# One output record per accession: the sequence, its per-position label
# string from build_target, and the organism.
results = []
for _, group in filtered_df.groupby("accession"):
	# sequence_x and organism are read from the group's first row.
	first = group.iloc[0]
	results.append({
		"sequence": first["sequence_x"],
		"target": build_target(first["sequence_x"], group),
		"organism": first["organism"]
	})

result_df = pd.DataFrame(results)

# index=False keeps the auto-generated row index out of the file, in line
# with the exin-*, prot-* and dna-* exports in this notebook.
result_df.to_csv("./storage/data/processed/nucl-1000.csv", index=False)

In [6]:
# Keep only annotation rows whose dna_df-side sequence is shorter than 500 characters.
filtered_df = merged_df[merged_df["sequence_x"].str.len() < 500]

# One output record per accession: the sequence, its per-position label
# string from build_target, and the organism.
results = []
for _, group in filtered_df.groupby("accession"):
	# sequence_x and organism are read from the group's first row.
	first = group.iloc[0]
	results.append({
		"sequence": first["sequence_x"],
		"target": build_target(first["sequence_x"], group),
		"organism": first["organism"]
	})

result_df = pd.DataFrame(results)

# index=False keeps the auto-generated row index out of the file, in line
# with the exin-*, prot-* and dna-* exports in this notebook.
result_df.to_csv("./storage/data/processed/nucl-500.csv", index=False)

## DNA Translator

In [2]:
# Read the DNA table and the CDS table. keep_default_na=False keeps empty /
# "NA"-like fields as literal strings instead of NaN.
dna_df = pd.read_csv(
	"./storage/data/base/dna_sequences.csv",
	keep_default_na=False,
)
cds_df = pd.read_csv(
	"./storage/data/base/cds_sequences.csv",
	keep_default_na=False,
)

### Only Proteins for PreTraining

In [3]:
# Keep cds_df rows whose "sequence" is shorter than 1000 characters.
filtered_df = cds_df.loc[cds_df["sequence"].str.len() < 1000]

In [8]:
# Reduce the frame to its single "sequence" column (still a DataFrame, not a
# Series) and write it out without the row index.
filtered_df = filtered_df.loc[:, ["sequence"]]

filtered_df.to_csv("./storage/data/processed/prot-1000.csv", index=False)

### DNA Sequences With Introns and Exons + Proteins for FineTuning

In [None]:
# Right join: every cds_df row is kept and paired with the dna_df row whose
# "accession" equals its "dna_accession". Shared column names receive the
# default _x (dna_df) / _y (cds_df) suffixes.
merged_df = dna_df.merge(
	cds_df,
	how="right",
	left_on="accession",
	right_on="dna_accession",
)

In [None]:
# Keep pairs whose combined length (dna_df sequence + cds_df sequence) is
# below 1000 characters, expose them as sequence/target, and write the
# three exported columns without the row index.
filtered_df = (
	merged_df[
		merged_df["sequence_x"].str.len()
		.add(merged_df["sequence_y"].str.len())
		.lt(1000)
	]
	.rename(columns={"sequence_x": "sequence", "sequence_y": "target"})
	.loc[:, ["sequence", "target", "organism"]]
)

filtered_df.to_csv("./storage/data/processed/dna-1000.csv", index=False)