In [1]:
import pickle

In [2]:
file = open("./database/col_ac.mod1", "rb")
data = pickle.load(file)

database = data["train"] + data["test"]

In [3]:
introns_data = []
exons_data = []

for sequence in database:
	introns = sequence["introns"]
	exons = sequence["exons"]

	for intron in introns:
		introns_data.append(intron["data"])

	for exon in exons:
		exons_data.append(exon["data"])

introns_data = list(set(introns_data))
exons_data = list(set(exons_data))

In [4]:
import pandas as pd
from datetime import datetime
import random as rd

data = []

for item in introns_data:
  data.append({"sequence": item, "label": "intron"})
  
for item in exons_data:
  data.append({"sequence": item, "label": "exon"})
  
rd.shuffle(data)
df = pd.DataFrame(data)

df.to_csv("splicing_dataset_small.csv", index=False)

In [None]:
from Bio import SeqIO

# Função para extrair introns, exons e a espécie de um arquivo GenBank
def extract_dna_features(genbank_file):
    dataset = []
    total_records = sum(1 for _ in SeqIO.parse(genbank_file, "genbank"))  # Total de registros no arquivo
    current_record = 0  # Contador para os registros processados
    
    # Iterar sobre cada sequência no arquivo GenBank
    for record in SeqIO.parse(genbank_file, "genbank"):
        current_record += 1
        species = "Unknown"
        
        # Tenta recuperar a informação da espécie no campo "source"
        for feature in record.features:
            if feature.type == "source":
                species = feature.qualifiers.get("organism", ["Unknown"])[0]
        
        # Itera sobre os features de íntrons e exons
        for feature in record.features:
            if feature.type in ["intron", "exon"]:
                # Verifica se a feature faz referência a outra sequência
                if "location_operator" in feature.qualifiers:
                    print(f"Ignorando feature com referência externa: {feature}")
                    continue
                
                # Extrai a sequência e adiciona ao dataset
                sequence = str(feature.extract(record.seq))
                feature_type = feature.type
                dataset.append([sequence, feature_type, species])
        
        # Feedback de progresso
        print(f"Processado {current_record}/{total_records} registros.")
    
    print(f"Total de registros processados: {current_record}/{total_records}")
    return dataset

# Arquivo GenBank a ser processado
genbank_file = "./database/all.gb"

# Extrair as informações e montar o dataset
dataset = extract_dna_features(genbank_file)

# Salvar o dataset em um arquivo CSV
import csv

with open("dna_features_dataset.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Sequence", "Feature Type", "Species"])
    writer.writerows(dataset)

print("Dataset criado com sucesso!")


In [1]:
import pandas as pd

df = pd.read_csv("dna_features_dataset.csv")

In [2]:
chunksize = 1000000
data = pd.read_csv("all_dna.csv", chunksize=chunksize)
sequences, labels = [], []
for chunk in data:
	sequences.extend(chunk.iloc[:, 0].tolist())
	labels.extend(chunk.iloc[:, 1].tolist())

In [2]:
df.to_csv("all_dna.csv")

NameError: name 'df' is not defined

In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torch.cuda.amp import GradScaler, autocast

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
df = pd.read_csv("all_dna.csv")
sequences, labels = [], []

In [18]:
len(df)

9825450

In [19]:
df['Sequence'] = df['Sequence'].astype(str)

In [20]:
len(df)

9825450

In [24]:
df_filtrado = df.dropna(subset=[df.columns[1]])

In [28]:
len(df_filtrado)

9825450

In [29]:
df_sem_duplicatas = df_filtrado.drop_duplicates(subset=[df.columns[1]])

In [30]:
len(df_sem_duplicatas)

6347457

In [15]:
sequences = df_sem_duplicatas.iloc[:, 0].tolist()
labels = df_sem_duplicatas.iloc[:, 1].tolist()

In [16]:
len(sequences)

9825450

In [9]:
len(sequences)

9825450

In [6]:
class SplicingSitesDataset(Dataset):
	def __init__(self, sequences, labels, tokenizer, max_length):
		self.sequences = sequences
		self.labels = labels
		self.tokenizer = tokenizer
		self.max_length = max_length
	
	def __len__(self):
		return len(self.sequences)
	
	def __getitem__(self, idx):
		prompt = self.sequences[idx]
		label = self.labels[idx]

		input_text = f"sequence: {prompt} awnser: "
		output_text = f"{label}"
		input_ids = self.tokenizer.encode(input_text, truncation=True, max_length=self.max_length, add_special_tokens=True, padding=True)
		label_ids = self.tokenizer.encode(output_text, truncation=True, max_length=self.max_length, add_special_tokens=False)

		input_ids += label_ids
		labels = [-100] * len(input_ids[:-len(label_ids)]) + label_ids

		return torch.tensor(input_ids), torch.tensor(labels)

In [7]:
def collate_fn(batch):
	input_ids, labels = zip(*batch)
	input_ids_padded = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
	labels_padded = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)
	return input_ids_padded, labels_padded

In [8]:
def train(model, dataloader, optimizer, scaler, gradient_accumulation_steps, epochs=3, device="cuda"):
	model.train()
	for epoch in range(epochs):
		total_loss = 0
		for step, batch in enumerate(dataloader):
			input_ids, labels = [b.to(device) for b in batch]
			with autocast():  # Mixed precision
				outputs = model(input_ids=input_ids, labels=labels)
				loss = outputs.loss
				loss = loss / gradient_accumulation_steps  # Normalize loss

			scaler.scale(loss).backward()  # Backpropagation com AMP

			# Acumular gradientes antes de realizar o passo do otimizador
			if (step + 1) % gradient_accumulation_steps == 0 or step == len(dataloader) - 1:
				scaler.step(optimizer)
				scaler.update()
				optimizer.zero_grad()

			total_loss += loss.item() * gradient_accumulation_steps  # Ajuste a perda acumulada

		print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(dataloader)}")

In [9]:
def predict(model, tokenizer, sequence, device="cuda"):
	model.eval()
	input_text = f"sequence: {sequence} awnser: "
	input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

	with torch.no_grad():
		outputs = model.generate(
			input_ids,
			max_new_tokens=10,
			repetition_penalty=2.0,
			top_k=50,
			top_p=0.9,
			pad_token_id=tokenizer.eos_token_id,
		)
		
		completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
		return completion.replace(input_text, "").strip()

In [10]:
import numpy as np

max_length = int(np.percentile([len(seq) for seq in sequences], 95))
print(max_length)

1180


In [11]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

tokenizer.pad_token = tokenizer.eos_token

In [12]:
dataset = SplicingSitesDataset(sequences, labels, tokenizer, max_length=max_length)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=0.0005)
scaler = GradScaler()

gradient_accumulation_steps = 4



In [16]:
assert all(0 <= token < tokenizer.vocab_size or token == -100 for label in labels for token in label), "Labels contain invalid tokens!"

TypeError: '<=' not supported between instances of 'int' and 'str'

In [14]:
train(model, dataloader, optimizer, scaler, gradient_accumulation_steps, epochs=3)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
