In [5]:
from google.colab import drive
# Mount Google Drive at /content/drive so the notebook can access files stored there.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import os
import pandas as pd
from datasets import Dataset
from transformers import BartTokenizer

# Load the tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

# Load the CSV splits (train, validation, test) from the dataset folder
data_path = "/content/drive/MyDrive/LLMPractica/mydatasets"
train_df, val_df, test_df = [
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ("train.csv", "val.csv", "test.csv")
]

# Convert each split to a Dataset, keeping only the "text" and "summary" columns
train_dataset, val_dataset, test_dataset = [
    Dataset.from_pandas(frame[["text", "summary"]])
    for frame in (train_df, val_df, test_df)
]

# Preprocess
def preprocess(example):
    """Tokenize one example into fixed-length model inputs and labels.

    Args:
        example: Mapping with a "text" entry (source document) and a
            "summary" entry (target summary), both strings.

    Returns:
        The tokenizer output for the prefixed source text (truncated/padded
        to 1024 tokens) with an added "labels" entry holding the summary
        token ids truncated/padded to 128 positions. Padding positions in
        "labels" hold -100 instead of the pad token id.
    """
    input_text = "Summary: " + example["text"]
    model_input = tokenizer(input_text, max_length=1024, truncation=True, padding="max_length")
    labels = tokenizer(text_target=example["summary"], max_length=128, truncation=True, padding="max_length")
    # The labels are padded to a fixed length. Leaving the pad token id in
    # them would make the padded positions count towards the training loss;
    # -100 is the label value ignored by the cross-entropy loss, so padding
    # is masked out here.
    pad_id = tokenizer.pad_token_id
    model_input["labels"] = [
        token_id if token_id != pad_id else -100 for token_id in labels["input_ids"]
    ]
    return model_input

# Tokenize every split with the preprocessing function
train_dataset, val_dataset, test_dataset = [
    split.map(preprocess) for split in (train_dataset, val_dataset, test_dataset)
]

# Save the preprocessed datasets, one sub-folder per split
output_path = "/content/drive/MyDrive/LLMPractica/preprocessedatasets"
os.makedirs(output_path, exist_ok=True)
for folder, split in (
    ("train_dataset", train_dataset),
    ("val_dataset", val_dataset),
    ("test_dataset", test_dataset),
):
    split.save_to_disk(os.path.join(output_path, folder))

print("✅ Datos preprocesados y guardados en:", output_path)

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/70 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/20 [00:00<?, ? examples/s]

✅ Datos preprocesados y guardados en: /content/drive/MyDrive/LLMPractica/preprocessedatasets
