# Initial Setup

In [61]:
import zipfile
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from tqdm.notebook import tqdm
import os

# Download and Data Extraction

In [62]:
# Load the dataset into a pandas dataframe and smaple 10% of the data for training
df = pd.read_csv('train.csv', header=None, names=['polarity', 'title', 'text'])
df = df.sample(frac=0.08, random_state=42)

# Verification and Conversion of Labels

In [63]:
# Replace NaN values with empty strings in 'title' and 'text' columns
df['title'] = df['title'].fillna('')
df['text'] = df['text'].fillna('')

In [64]:
# Convert columns to strings
df['title'] = df['title'].astype(str)
df['text'] = df['text'].astype(str)

In [65]:
# Check unique values in the 'polarity' column
print(df['polarity'].unique())

[1 2]


In [66]:
# Convert labels from numerical (1, 0) to string ('positive', 'negative')
df['polarity'] = df['polarity'].apply(lambda x: 'positive' if x == 2 else 'negative')

# Data Preprocessing and Dataset Definition

In [67]:
class AmazonReviewsDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df.reset_index(drop=True)  # Reset index to avoid any potential indexing issues
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # Build the input text
        polarity = 'negative' if row['polarity'] == 'positive' else 'positive'
        text = f"transform to {polarity}: {row['title']} {row['text']}"
        # Tokenize the input text
        inputs = self.tokenizer.encode_plus(
            text=text.strip(),
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # Prepare the target labels
        targets = self.tokenizer.encode_plus(
            text=row['text'].strip(),
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': targets['input_ids'].flatten()
        }


In [68]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Split the dataframe into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Tokenizer and dataset
tokenizer = T5Tokenizer.from_pretrained('google/t5-efficient-tiny')
train_dataset = AmazonReviewsDataset(train_df, tokenizer, max_len=128)
val_dataset = AmazonReviewsDataset(val_df, tokenizer, max_len=128)

# DataLoaders
train_data_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


# Model Definition and Training Parameters

In [69]:
model = T5ForConditionalGeneration.from_pretrained('google/t5-efficient-tiny')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

def train_epoch(model, data_loader, optimizer, device):
    model.train()
    total_loss = 0

    for batch in tqdm(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(data_loader)


# Function to Calculate Perplexity

In [70]:
def calculate_perplexity(model, data_loader, device):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            total_loss += loss.item()

    avg_loss = total_loss / len(data_loader)
    perplexity = torch.exp(torch.tensor(avg_loss))
    return perplexity.item()


# Model Training

In [71]:
epochs = 3
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    train_loss = train_epoch(model, train_data_loader, optimizer, device)
    print(f'Train loss: {train_loss:.4f}')

# Save the trained model after all epochs
torch.save(model.state_dict(), "t5_model.pt")
print("Model saved successfully.")


Epoch 1/3


  0%|          | 0/16200 [00:00<?, ?it/s]

Train loss: 0.9392
Epoch 2/3


  0%|          | 0/16200 [00:00<?, ?it/s]

Train loss: 0.3608
Epoch 3/3


  0%|          | 0/16200 [00:00<?, ?it/s]

Train loss: 0.2846
Model saved successfully.


# Perplexity Report

In [74]:
perplexity = calculate_perplexity(model, train_data_loader, device)
print(f'Perplexity: {perplexity}')

  0%|          | 0/16200 [00:00<?, ?it/s]

Perplexity: 1.191820502281189


# Text Generation and Test

In [85]:
def generate_review(model, tokenizer, text, device, target_polarity):
    model.eval()
    input_text = f"transform to {target_polarity}: {text}"
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)
    outputs = model.generate(input_ids, max_length=128)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

positive_review = "Very good movie."
negative_review = generate_review(model, tokenizer, positive_review, device, "negative")
print("Original:", positive_review)
print("Transformed:", negative_review)

Original: Very good movie.
Transformed: movie.
