In [11]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import numpy as np

In [46]:
class IMDBDataset(Dataset):
    def __init__(self, reviews, sentiments, tokenizer, max_length=256):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        
        for review, sentiment in zip(reviews, sentiments):
            # Prepare sentiment control token
            sentiment_token = "<|pos|>" if sentiment == "positive" else "<|neg|>"
            # Combine sentiment token and review
            text = f"{sentiment_token} {review} <|endoftext|>"
            
            # Encode the text
            encodings = tokenizer(text, 
                                truncation=True,
                                max_length=max_length,
                                padding='max_length',
                                return_tensors='pt')
            
            self.input_ids.append(encodings['input_ids'].squeeze())
            self.attn_masks.append(encodings['attention_mask'].squeeze())
            
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attn_masks[idx],
            'labels': self.input_ids[idx]  # For language modeling
        }

In [48]:
def prepare_tokenizer_and_model():
    # Load tokenizer and add special tokens
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    
    # Add padding token
    tokenizer.pad_token = tokenizer.eos_token
    
    # Add sentiment tokens
    special_tokens = {'additional_special_tokens': ['<|pos|>', '<|neg|>']}
    tokenizer.add_special_tokens(special_tokens)
    
    # Load model and resize token embeddings
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.resize_token_embeddings(len(tokenizer))
    
    # Set pad token id for the model
    model.config.pad_token_id = model.config.eos_token_id
    
    return tokenizer, model

In [60]:
def train(model, tokenizer, train_dataloader, val_dataloader, epochs=3, device='cuda'):
    model = model.to(device)
    
    # Setup optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=5e-5)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    
    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        model.train()
        total_train_loss = 0
        
        # Training loop
        for batch in tqdm(train_dataloader, desc="Training"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            model.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            total_train_loss += loss.item()
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            
        avg_train_loss = total_train_loss / len(train_dataloader)
        print(f"Average training loss: {avg_train_loss}")
        
        # Validation
        model.eval()
        total_val_loss = 0
        
        for batch in tqdm(val_dataloader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            with torch.no_grad():
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                
            loss = outputs.loss
            total_val_loss += loss.item()
            
        avg_val_loss = total_val_loss / len(val_dataloader)
        print(f"Average validation loss: {avg_val_loss}")
        
        # Save model and tokenizer after each epoch
        model_save_path = f'sentiment_gpt2_epoch_{epoch+1}'
        model.save_pretrained(model_save_path)
        tokenizer.save_pretrained(model_save_path)
        print(f"Model and tokenizer saved to {model_save_path}")

In [66]:
def main():
    print("Loading IMDB dataset...")
    # Load IMDB dataset
    data = pd.read_csv("input/imdb_dataset.csv")
    train_data = data.iloc[:40000]
    val_data = data.iloc[40000:45000]
    
    print("Preparing tokenizer and model...")
    # Prepare tokenizer and model
    tokenizer, model = prepare_tokenizer_and_model()
    
    print("Creating datasets...")
    # Create datasets
    train_dataset = IMDBDataset(
        train_data['review'].tolist(),
        train_data['sentiment'].tolist(),
        tokenizer
    )
    val_dataset = IMDBDataset(
        val_data['review'].tolist(),
        val_data['sentiment'].tolist(),
        tokenizer
    )
    
    print("Creating dataloaders...")
    # Create dataloaders
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=4,  # Reduced batch size to be safe
        shuffle=True
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=4,  # Reduced batch size to be safe
        shuffle=False
    )
    
    print("Starting training...")
    # Train the model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    
    # Print GPU memory info if available
    if torch.cuda.is_available():
        print(f"GPU Memory available: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    
    train(model, tokenizer, train_dataloader, val_dataloader, epochs=3, device=device)  # Added tokenizer here
    
    print("Training completed!")

In [68]:
main()

Loading IMDB dataset...
Preparing tokenizer and model...
Creating datasets...




Creating dataloaders...
Starting training...
Using device: cuda
GPU Memory available: 17.17 GB

Epoch 1/3


Training: 100%|██████████| 10000/10000 [12:52<00:00, 12.94it/s]


Average training loss: 2.8572372406721116


Validation: 100%|██████████| 1250/1250 [00:23<00:00, 53.20it/s]


Average validation loss: 2.739152767944336
Model and tokenizer saved to sentiment_gpt2_epoch_1

Epoch 2/3


Training: 100%|██████████| 10000/10000 [12:36<00:00, 13.22it/s]


Average training loss: 2.7160199007749557


Validation: 100%|██████████| 1250/1250 [00:18<00:00, 66.81it/s]


Average validation loss: 2.7153115202903746
Model and tokenizer saved to sentiment_gpt2_epoch_2

Epoch 3/3


Training: 100%|██████████| 10000/10000 [25:39<00:00,  6.49it/s]


Average training loss: 2.6458864232122896


Validation: 100%|██████████| 1250/1250 [00:52<00:00, 23.66it/s]


Average validation loss: 2.707614884853363
Model and tokenizer saved to sentiment_gpt2_epoch_3
Training completed!


In [70]:
def generate_text(model, tokenizer, sentiment, prompt=None, max_length=100, 
                 temperature=0.7, device='cuda'):
    """
    Generate text with more controlled parameters
    """
    model.eval()
    
    # Prepare sentiment control token
    sentiment_token = "<|pos|>" if sentiment == "positive" else "<|neg|>"
    
    # Combine sentiment token with prompt if provided
    if prompt:
        text = f"{sentiment_token} {prompt}"
    else:
        text = sentiment_token
        
    input_ids = tokenizer.encode(text, return_tensors='pt').to(device)
    
    # Generate text with more controlled parameters
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=temperature,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3,
        early_stopping=True
    )
    
    generated_text = tokenizer.decode(output[0], skip_special_tokens=False)
    return generated_text

In [84]:
def main():
    # Load the fine-tuned model and tokenizer
    model_path = 'sentiment_gpt2_epoch_3'
    tokenizer, model = load_model_and_tokenizer(model_path)    
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    # Generate a large dataset
    generated_df = generate_dataset(
        model,
        tokenizer,
        num_samples=10000,  # Will generate 5000 positive and 5000 negative reviews
        device=device,
        output_file="generated_reviews.csv"
    )
    
    # Print some example generations
    print("\nExample generations:")
    for sentiment in ['positive', 'negative']:
        examples = generated_df[generated_df['sentiment'] == sentiment].sample(3)
        print(f"\n{sentiment.upper()} examples:")
        for text in examples['review']:
            print(f"\n{text}\n{'='*80}")

In [86]:
main()


Generating 5000 positive reviews...


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
100%|██████████| 5000/5000 [57:38<00:00,  1.45it/s]  



Generating 5000 negative reviews...


100%|██████████| 5000/5000 [1:07:59<00:00,  1.23it/s]


Generated dataset saved to generated_reviews.csv

Dataset Statistics:
Total samples: 10000

Sentiment distribution:
sentiment
positive    5000
negative    5000
Name: count, dtype: int64

Average review length: 116.1279

Example generations:

POSITIVE examples:

This film was just absolutely fantastic. I have been a fan of the TV series since I was little, and when I heard that the BBC were giving this film a second run it absolutely blew my mind! I found myself laughing so hard I would have to say something. The writing is wonderful, the cinematography fantastic in its own right but with an extra twist which makes the whole thing worth watching. A must see for anyone who likes science fiction or horror. I can't stress enough how much I enjoyed every minute (except for a few). The production values are also superb and the acting is top notch. I'm sure it will be on my DVD set soon though... <|endoftext|>

I saw this as an 8th Season on DVD with a 3 of the movie that is in my "favorites




In [76]:
def load_model_and_tokenizer(model_path):
    """Load the saved model and tokenizer"""
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    model = GPT2LMHeadModel.from_pretrained(model_path)
    
    # Set pad token
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id
    
    return tokenizer, model