In [None]:
!pip install transformers datasets torch

In [None]:
import zipfile
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from tqdm.notebook import tqdm
from joblib import dump, load
import os

In [None]:
# Debugging aid: ask CUDA to run kernel launches synchronously so that device
# errors are reported at the call that caused them. This slows training down,
# so consider removing it once the notebook runs cleanly.
os.environ.update({"CUDA_LAUNCH_BLOCKING": "1"})

In [None]:
!pip install kaggle
!kaggle datasets download -d kritanjalijain/amazon-reviews -p /content

In [None]:
# Unpack the downloaded archive so the CSV files can be read from disk.
zip_file_path = '/content/amazon-reviews.zip'
extract_dir = '/content/amazon-reviews/'
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extract_dir)

In [None]:
# Read the training split (parsed without a header row, so column names are
# supplied here) and keep a reproducible 10% random sample of it.
df = (
    pd.read_csv(
        '/content/amazon-reviews/train.csv',
        header=None,
        names=['polarity', 'title', 'text'],
    )
    .sample(frac=0.1, random_state=42)
)

In [None]:
# Convert the numeric class labels to 'negative' / 'positive'.
# In the Amazon Reviews Polarity dataset class 1 is the NEGATIVE class and
# class 2 is the POSITIVE class, so 1 must not be mapped to 'positive'.
# Values that are not numeric class ids (e.g. labels already converted by a
# previous run of this cell) are passed through unchanged, which keeps the
# cell safe to re-run.
POLARITY_LABELS = {1: 'negative', 2: 'positive'}
df['polarity'] = df['polarity'].apply(lambda x: POLARITY_LABELS.get(x, x))

In [None]:
class AmazonReviewsDataset(Dataset):
    """Dataset that turns review rows into fixed-length tokenized examples.

    Each row is rendered as
    ``"transform to <opposite polarity>: <title> <text>"`` and tokenized to
    exactly ``max_len`` tokens (padded or truncated).

    Args:
        df: DataFrame with ``polarity`` ('positive' / 'negative'), ``title``
            and ``text`` columns.
        tokenizer: Tokenizer exposing ``encode_plus`` and returning a mapping
            with ``input_ids`` and ``attention_mask``.
        max_len: Sequence length every example is padded / truncated to.

    NOTE(review): the labels are a copy of the prompt itself, so the training
    objective is to reproduce the input rather than a polarity-flipped review.
    A real style-transfer objective needs separate target text — confirm the
    intended training signal.
    """

    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of review rows."""
        return len(self.df)

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a string, or '' when it is missing (None / NaN)."""
        # NaN is the only float that is not equal to itself.
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    def __getitem__(self, idx):
        """Build the tokenized example for row ``idx``.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``labels`` as
            ``torch.long`` tensors; ``labels`` holds -100 at padded positions.
        """
        row = self.df.iloc[idx]
        target_polarity = 'negative' if row['polarity'] == 'positive' else 'positive'

        # Missing titles/bodies come back from pandas as NaN floats; joining
        # only the parts that are present avoids a str + float TypeError.
        parts = (self._as_text(row['title']), self._as_text(row['text']))
        text = f"transform to {target_polarity}: " + " ".join(p for p in parts if p)

        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False
        )
        input_ids = torch.tensor(inputs['input_ids'], dtype=torch.long)
        attention_mask = torch.tensor(inputs['attention_mask'], dtype=torch.long)

        # Mask padded positions with -100 so the loss ignores them instead of
        # training the model to emit padding tokens.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Build the tokenizer and the shuffled training loader.
# Every example is padded / truncated to `max_len` tokens.
max_len = 128
tokenizer = T5Tokenizer.from_pretrained('t5-small')
dataset = AmazonReviewsDataset(df=df, tokenizer=tokenizer, max_len=max_len)
data_loader = DataLoader(dataset, shuffle=True, batch_size=16)


In [None]:
# Load the pretrained T5-small checkpoint and move it to the GPU when one is
# available, otherwise stay on the CPU.
model = T5ForConditionalGeneration.from_pretrained('t5-small')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Use PyTorch's AdamW instead of the deprecated `transformers.AdamW`.
# `eps` and `weight_decay` are set explicitly to the values the transformers
# implementation used by default, so the optimisation settings are unchanged.
# (The `AdamW` name imported from `transformers` at the top of the notebook is
# no longer needed by this cell.)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

def train_epoch(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Callable model accepting ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments and returning an object with ``.loss``.
        data_loader: Iterable of batches; each batch is a mapping holding the
            tensors ``input_ids``, ``attention_mask`` and ``labels``.
        optimizer: Optimizer providing ``zero_grad()`` and ``step()``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        The sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    batch_losses = []

    for batch in tqdm(data_loader):
        # Move only the tensors the model consumes onto the target device.
        model_inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        optimizer.zero_grad()
        loss = model(**model_inputs).loss
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(data_loader)


In [None]:
# Fine-tune for a fixed number of passes over the data, reporting the mean
# training loss after each pass.
epochs = 3
for epoch in range(epochs):
    epoch_header = f'Epoch {epoch + 1}/{epochs}'
    print(epoch_header)
    train_loss = train_epoch(
        model=model,
        data_loader=data_loader,
        optimizer=optimizer,
        device=device,
    )
    print(f'Train loss: {train_loss:.4f}')


In [None]:
def generate_review(model, tokenizer, text, device, target_polarity):
    """Generate a rewritten version of ``text`` for ``target_polarity``.

    Args:
        model: Model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer exposing ``encode()`` and ``decode()``.
        text: Review text to transform.
        device: Device the encoded prompt is moved to.
        target_polarity: Polarity word inserted into the prompt prefix.

    Returns:
        The first generated sequence decoded with special tokens skipped.
    """
    model.eval()
    prompt = f"transform to {target_polarity}: {text}"
    encoded_prompt = tokenizer.encode(prompt, return_tensors='pt')
    encoded_prompt = encoded_prompt.to(device)
    generated = model.generate(encoded_prompt, max_length=128)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Test example: ask the fine-tuned model to rewrite a positive review as a
# negative one and show both versions.
positive_review = "The product works perfectly and the service was excellent."
negative_review = generate_review(
    model=model,
    tokenizer=tokenizer,
    text=positive_review,
    device=device,
    target_polarity="negative",
)
print("Original:", positive_review)
print("Transformed:", negative_review)
