# Initial Setup

In [None]:
!pip install transformers datasets torch



In [None]:
import zipfile
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from tqdm.notebook import tqdm
import os

In [None]:
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 requests synchronous CUDA kernel launches,
# so a CUDA error surfaces at the call that triggered it (at the cost of GPU speed).
os.environ.update({"CUDA_LAUNCH_BLOCKING": "1"})

# Download and Data Extraction

In [None]:
# Download the dataset from Kaggle
!pip install kaggle
!kaggle datasets download -d kritanjalijain/amazon-reviews -p /content

Dataset URL: https://www.kaggle.com/datasets/kritanjalijain/amazon-reviews
License(s): CC0-1.0
amazon-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
# Unpack the downloaded archive into its own directory.
zip_file_path = '/content/amazon-reviews.zip'
extract_dir = '/content/amazon-reviews/'
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=extract_dir)

In [None]:
# Read the header-less training CSV and keep a reproducible 10% sample for training.
df = (
    pd.read_csv(
        '/content/amazon-reviews/train.csv',
        header=None,
        names=['polarity', 'title', 'text'],
    )
    .sample(frac=0.1, random_state=42)
)

# Verification and Conversion of Labels

In [None]:
# Missing titles and review bodies become empty strings so they can be concatenated later.
df[['title', 'text']] = df[['title', 'text']].fillna('')

In [None]:
# Force both free-text columns to Python strings (guards against numeric-looking entries).
df[['title', 'text']] = df[['title', 'text']].astype(str)

In [None]:
# Show which raw label values occur in the 'polarity' column before converting them.
print(pd.unique(df['polarity']))

[1 2]


In [None]:
# Convert labels from numerical (1 = negative, 2 = positive) to strings.
# Values that are not 1 or 2 are left untouched, so re-running this cell on
# already-converted labels is a no-op instead of turning everything into 'negative'.
polarity_names = {1: 'negative', 2: 'positive'}
df['polarity'] = df['polarity'].map(lambda value: polarity_names.get(value, value))

# Data Preprocessing and Dataset Definition

In [None]:
class AmazonReviewsDataset(Dataset):
    """Map-style dataset that turns review rows into T5 text-to-text examples.

    The source sequence is ``"transform to <polarity>: <title> <text>"`` where
    ``<polarity>`` is the opposite of the row's own label; the target sequence
    is the row's unmodified review text.

    NOTE(review): because the target is the original review rather than an
    opposite-sentiment rewrite, this objective rewards reproducing the input
    text — confirm that this is the intended training signal.

    Args:
        df: DataFrame with string columns ``polarity`` ('positive' /
            'negative'), ``title`` and ``text``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors; must expose ``pad_token_id``.
        max_len: Length every source and target sequence is padded or
            truncated to.
    """

    # Label value that the cross-entropy loss ignores; used to blank out padding.
    IGNORE_INDEX = -100

    def __init__(self, df, tokenizer, max_len):
        self.df = df.reset_index(drop=True)  # Positional access must not depend on the sampled index
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of review rows."""
        return len(self.df)

    def _encode(self, text):
        """Tokenize ``text`` into fixed-length tensors of shape (1, max_len)."""
        return self.tokenizer(
            text=text.strip(),
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        """Build the example at position ``idx``.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask`` and
            ``labels``; padding positions in ``labels`` are set to -100 so
            they do not contribute to the loss.
        """
        row = self.df.iloc[idx]
        # The prompt always asks for the opposite of the review's own polarity.
        polarity = 'negative' if row['polarity'] == 'positive' else 'positive'
        text = f"transform to {polarity}: {row['title']} {row['text']}"

        inputs = self._encode(text)
        targets = self._encode(row['text'])

        # Copy before masking so the tokenizer's output tensor is not modified.
        labels = targets['input_ids'].flatten().clone()
        labels[labels == self.tokenizer.pad_token_id] = self.IGNORE_INDEX

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': labels
        }


# Model Definition and Training Parameters

In [None]:
# Tokenization / batching hyperparameters.
# NOTE(review): neither value was defined anywhere in the notebook. 128 mirrors the
# max_length used by generate_review; 16 would match the 22500 steps per epoch shown
# in the recorded training output if the 10% sample holds 360,000 rows — TODO confirm.
MAX_LEN = 128
BATCH_SIZE = 16

# `tokenizer` and `data_loader` are required by the training, perplexity and
# generation cells; define them here so a fresh-kernel run does not fail.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
train_dataset = AmazonReviewsDataset(df, tokenizer, max_len=MAX_LEN)
data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

model = T5ForConditionalGeneration.from_pretrained('t5-small')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and weight_decay
# are set explicitly to the values the transformers implementation used by default.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

def train_epoch(model, data_loader, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Model whose forward call accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with a
            ``loss`` attribute.
        data_loader: Sized iterable of batches (dicts holding those three tensors).
        optimizer: Optimizer stepping the model's parameters.
        device: Device the batch tensors are moved to.

    Returns:
        The mean of the per-batch losses (sum of losses / number of batches).
    """
    model.train()
    batch_losses = []

    for batch in tqdm(data_loader):
        # Move exactly the tensors the model consumes onto the target device.
        model_inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        optimizer.zero_grad()
        loss = model(**model_inputs).loss
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(data_loader)




# Function to Calculate Perplexity

In [None]:
def calculate_perplexity(model, data_loader, device):
    """Compute perplexity as ``exp`` of the mean per-batch loss over ``data_loader``.

    Args:
        model: Model whose forward call accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with a
            ``loss`` attribute.
        data_loader: Sized iterable of batches (dicts holding those three tensors).
        device: Device the batch tensors are moved to.

    Returns:
        The perplexity as a Python float.
    """
    model.eval()
    batch_losses = []

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in tqdm(data_loader):
            model_inputs = {
                key: batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            }
            batch_losses.append(model(**model_inputs).loss.item())

    mean_loss = sum(batch_losses) / len(data_loader)
    return torch.tensor(mean_loss).exp().item()


# Model Training

In [None]:
# Fine-tune for a fixed number of passes over the training loader,
# reporting the mean batch loss after each pass.
epochs = 3
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    train_loss = train_epoch(
        model=model,
        data_loader=data_loader,
        optimizer=optimizer,
        device=device,
    )
    print(f'Train loss: {train_loss:.4f}')

Epoch 1/3


  0%|          | 0/22500 [00:00<?, ?it/s]

# Perplexity Report

In [None]:
# NOTE(review): this evaluates on the same loader that was used for training,
# so the figure is a training-set perplexity, not a held-out estimate.
perplexity = calculate_perplexity(model=model, data_loader=data_loader, device=device)
print(f'Perplexity: {perplexity}')

# Text Generation and Test

In [None]:
def generate_review(model, tokenizer, text, device, target_polarity, max_length=128):
    """Generate a rewrite of ``text`` conditioned on ``target_polarity``.

    The prompt uses the same ``"transform to <polarity>: <text>"`` format the
    training dataset builds.

    Args:
        model: Model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer exposing ``encode()`` and ``decode()``.
        text: Review text to transform.
        device: Device the encoded prompt is moved to.
        target_polarity: Polarity word inserted into the prompt
            (e.g. ``"negative"``).
        max_length: Maximum length passed to ``model.generate``; defaults to
            128, the value that was previously hard-coded.

    Returns:
        The decoded first generated sequence with special tokens removed.
    """
    model.eval()
    input_text = f"transform to {target_polarity}: {text}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    outputs = model.generate(input_ids, max_length=max_length)
    # Only the first (and, for a single prompt, only) sequence is returned.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Smoke test: ask the fine-tuned model to rewrite a positive review as a negative one.
positive_review = "The product works perfectly and the service was excellent."
negative_review = generate_review(
    model=model,
    tokenizer=tokenizer,
    text=positive_review,
    device=device,
    target_polarity="negative",
)
print("Original:", positive_review)
print("Transformed:", negative_review)