In [23]:
!pip install transformers torch pandas tqdm sentencepiece sacremoses

Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp312-cp312-win_amd64.whl.metadata (8.3 kB)
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp312-cp312-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/992.0 kB ? eta -:--:--
   --------------------- ------------------ 524.3/992.0 kB 8.2 MB/s eta 0:00:01
   ---------------------------------------- 992.0/992.0 kB 9.4 MB/s eta 0:00:00
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
   ---------------------------------------- 0.0/897.5 kB ? eta -:--:--
   --------------------------------------- 897.5/897.5 kB 20.5 MB/s eta 0:00:00
Installing collected packages: sentencepiece, sacremoses
Successfully installed sacremoses-0.1.1 sentencepiece-0.2.0


In [1]:
import pandas as pd
import numpy as np
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from tqdm import tqdm
import time

In [3]:
def create_translation_pipeline(model_name, device=0):
    """Build a ``transformers`` pipeline for the "translation" task.

    Args:
        model_name: Model identifier forwarded to ``pipeline`` as ``model``.
        device: Device argument forwarded to ``pipeline`` unchanged
            (defaults to 0).

    Returns:
        Whatever ``pipeline("translation", ...)`` returns for these settings.
    """
    pipeline_options = {"model": model_name, "device": device}
    return pipeline("translation", **pipeline_options)

def chunk_text(text, max_length=512):
    """Split text into whitespace-delimited chunks of bounded estimated length.

    Length is measured in characters, counting every word as
    ``len(word) + 1`` (the extra one stands for the joining space). This is
    only a rough stand-in for the model's token count; no tokenizer is used.

    Args:
        text: String to split. It is broken up with ``str.split()``, so runs
            of whitespace collapse and line breaks are not preserved.
        max_length: Upper bound on the estimated length of one chunk.

    Returns:
        List of non-empty chunk strings in their original order. Empty or
        whitespace-only input yields an empty list. A single word whose
        estimated length exceeds ``max_length`` is kept whole as its own
        chunk rather than being split.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        # Rough estimate of the word's contribution: its characters plus one space.
        word_length = len(word) + 1

        # Flush only when the buffer holds something. Without this guard an
        # over-long word arriving on an empty buffer emitted an empty-string
        # chunk, which was then handed to the translation models.
        if current_chunk and current_length + word_length > max_length:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = word_length
        else:
            current_chunk.append(word)
            current_length += word_length

    # Emit whatever is left in the buffer after the last word.
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

def back_translate(text, en_to_mid_pipeline, mid_to_en_pipeline, intermediate_lang="fr", max_length=512):
    """Round-trip one text through an intermediate language and back.

    The text is split with ``chunk_text(text, max_length)``; each chunk is
    passed to ``en_to_mid_pipeline``, the resulting ``'translation_text'`` is
    passed to ``mid_to_en_pipeline``, and the final translations are joined
    with single spaces.

    Args:
        text: Text to back-translate.
        en_to_mid_pipeline: Callable returning a sequence whose first item
            has a ``'translation_text'`` key (source -> intermediate).
        mid_to_en_pipeline: Same contract, intermediate -> source.
        intermediate_lang: Accepted for interface compatibility; this
            function does not read it.
        max_length: Forwarded to ``chunk_text``.

    Returns:
        The back-translated text. If anything raises along the way, the error
        is printed and the original ``text`` is returned unchanged.
    """
    def _round_trip(piece):
        # Forward pass into the intermediate language, then back again.
        pivot = en_to_mid_pipeline(piece)[0]['translation_text']
        return mid_to_en_pipeline(pivot)[0]['translation_text']

    try:
        pieces = chunk_text(text, max_length)
        return ' '.join(_round_trip(piece) for piece in pieces)
    except Exception as e:
        # Best effort: report the problem and hand back the untouched input.
        print(f"Error in back translation: {str(e)}")
        return text


In [5]:
def _report_and_resolve_device(device):
    """Print CUDA diagnostics and return ``(device_to_use, cuda_available)``."""
    cuda_ok = torch.cuda.is_available()
    print(f"CUDA available: {cuda_ok}")

    if cuda_ok:
        print(f"Current device: {torch.cuda.current_device()}")
        # Report the GPU that was actually requested when it is a plain index;
        # otherwise let torch describe the current device.
        gpu_index = device if isinstance(device, int) and device >= 0 else None
        print(f"Device name: {torch.cuda.get_device_name(gpu_index)}")
        return device, True

    # No CUDA: querying the current device or its name would raise here, so
    # skip those diagnostics and redirect GPU requests to the CPU.
    wants_gpu = (isinstance(device, int) and device >= 0) or (
        isinstance(device, str) and device.startswith("cuda")
    )
    if wants_gpu:
        print(f"CUDA is not available; falling back to CPU instead of device {device!r}")
        return -1, False  # -1 is the pipeline convention for "run on CPU"
    return device, False


def augment_dataset_with_backtranslation(df, batch_size=64, intermediate_langs=["fr", "de", "es"],
                                         device=0, max_samples=None):
    """
    Augment a dataset using back-translation through multiple intermediate languages.

    Each review is translated with ``Helsinki-NLP/opus-mt-en-{lang}`` and then
    back with ``Helsinki-NLP/opus-mt-{lang}-en`` via ``back_translate``.

    Args:
        df: DataFrame containing 'review' and 'sentiment' columns.
        batch_size: Number of rows per progress step. Reviews are still
            translated one at a time; this only sets how rows are grouped for
            the progress bar and for periodic GPU cache clearing.
        intermediate_langs: Intermediate language codes to pivot through.
        device: Device passed to the translation pipelines (0 for first GPU).
            If a GPU is requested but CUDA is unavailable, the CPU is used and
            a message is printed.
        max_samples: Maximum number of leading rows to process (None for all).

    Returns:
        DataFrame with columns 'review' (back-translated text) and
        'sentiment' (the label of the source row). It holds
        ``len(df) * len(intermediate_langs)`` rows, grouped by language in the
        order given and, within a language, in the original row order. The
        original reviews are not included. Rows for which ``back_translate``
        failed carry the untranslated source text.

    Raises:
        KeyError: If ``df`` lacks the 'review' or 'sentiment' column.
    """
    # Fail fast: the labels are only needed at the very end, so a missing
    # column would otherwise surface after all the translation work is done.
    missing = [column for column in ('review', 'sentiment') if column not in df.columns]
    if missing:
        raise KeyError(f"DataFrame is missing required column(s): {missing}")

    device, cuda_ok = _report_and_resolve_device(device)

    # `is not None` so that max_samples=0 means "no rows", not "all rows".
    if max_samples is not None:
        df = df.head(max_samples)

    # Materialise once so any iterable is accepted and the default list is
    # never shared with (or mutated by) the caller.
    intermediate_langs = list(intermediate_langs)
    labels = df['sentiment'].tolist()
    augmented_reviews = []

    for lang in intermediate_langs:
        print(f"\nPerforming back-translation through {lang}")

        # Initialize translation pipelines
        en_to_mid_pipeline = create_translation_pipeline(f"Helsinki-NLP/opus-mt-en-{lang}", device)
        mid_to_en_pipeline = create_translation_pipeline(f"Helsinki-NLP/opus-mt-{lang}-en", device)

        # Process in batches
        for i in tqdm(range(0, len(df), batch_size)):
            batch = df.iloc[i:i + batch_size]

            # Inference is local, so no artificial delay is needed between
            # reviews (there is no remote rate limit to respect).
            augmented_reviews.extend(
                back_translate(text, en_to_mid_pipeline, mid_to_en_pipeline,
                               intermediate_lang=lang)
                for text in batch['review']
            )

            # Clear GPU cache periodically (every tenth batch, including the first)
            if cuda_ok and i % (batch_size * 10) == 0:
                torch.cuda.empty_cache()

        print(f"Completed back-translation through {lang}")

        # Drop the references first: while these names still point at the
        # pipelines, emptying the cache cannot release their memory and the
        # next language's models would be loaded alongside the old ones.
        del en_to_mid_pipeline, mid_to_en_pipeline
        if cuda_ok:
            torch.cuda.empty_cache()

    # Labels repeat once per language, matching the language-major order above.
    augmented_df = pd.DataFrame({
        'review': augmented_reviews,
        'sentiment': labels * len(intermediate_langs)
    })

    return augmented_df

In [7]:
def save_augmented_data(original_df, augmented_df, output_file):
    """Write the augmented DataFrame to a CSV file and print a confirmation.

    Args:
        original_df: Accepted for interface compatibility; this function does
            not read it.
        augmented_df: DataFrame to write, without its index.
        output_file: Destination passed to ``DataFrame.to_csv``.
    """
    # Build the confirmation up front; it is only printed once the write succeeds.
    confirmation = f"Saved augmented dataset with {len(augmented_df)} samples to {output_file}"
    augmented_df.to_csv(output_file, index=False)
    print(confirmation)

In [None]:
# Load the full review table; the augmentation step reads its 'review' and
# 'sentiment' columns.
imdb_data = pd.read_csv("../input/imdb_dataset.csv")

# Positional split: the first 40,000 rows are for training, the remainder for testing.
train_data, test_data = imdb_data.iloc[:40000], imdb_data.iloc[40000:]

# Back-translate the training rows through French, German and Spanish.
augmented_train = augment_dataset_with_backtranslation(
    train_data,
    max_samples=100,  # trial-run cap; drop this argument to process every row
    device=0,  # first GPU
    intermediate_langs=["fr", "de", "es"],
    batch_size=256,  # rows per progress step
)

# Persist the augmented rows next to the notebook.
save_augmented_data(train_data, augmented_train, "backtranslated_train_reviews.csv")