In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
import re
import random
from torch.nn.utils.rnn import pad_sequence

# Reproducibility: seed every RNG the notebook relies on from one constant so
# the value only has to be changed in a single place.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Prefer the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Matplotlib 3.6 renamed the bundled seaborn style sheets from 'seaborn' to
# 'seaborn-v0_8'; older releases only know the old name and raise OSError when
# asked for the new one. Use whichever name this installation provides, and
# keep Matplotlib's default style if neither is registered (purely cosmetic).
PLOT_STYLE = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    None,
)
if PLOT_STYLE is not None:
    plt.style.use(PLOT_STYLE)
sns.set_palette("husl")

print("Ready to prepare translation datasets!")


In [None]:
# Create sample English-French translation pairs
def create_translation_data():
    """Build the small English-French parallel corpus used for the demo.

    Returns:
        list: ``(english, french)`` string tuples in a fixed order, grouped
        by theme. A new list is built on every call.
    """
    # Pairs are grouped by theme; dict insertion order fixes the final order.
    themed_pairs = {
        "basic greetings and common phrases": [
            ("Hello", "Bonjour"),
            ("Good morning", "Bonjour"),
            ("Good evening", "Bonsoir"),
            ("How are you", "Comment allez-vous"),
            ("I am fine", "Je vais bien"),
            ("Thank you", "Merci"),
            ("You are welcome", "De rien"),
            ("Goodbye", "Au revoir"),
        ],
        "simple sentences": [
            ("I love you", "Je t'aime"),
            ("What is your name", "Comment vous appelez-vous"),
            ("My name is John", "Je m'appelle John"),
            ("Where are you from", "D'où venez-vous"),
            ("I am from France", "Je viens de France"),
            ("How old are you", "Quel âge avez-vous"),
            ("I am twenty years old", "J'ai vingt ans"),
        ],
        "more complex sentences": [
            ("The weather is nice today", "Il fait beau aujourd'hui"),
            ("I would like to eat something", "Je voudrais manger quelque chose"),
            ("Can you help me please", "Pouvez-vous m'aider s'il vous plaît"),
            ("Where is the train station", "Où est la gare"),
            ("I don't understand French", "Je ne comprends pas le français"),
            ("Do you speak English", "Parlez-vous anglais"),
        ],
        "longer sentences for testing attention": [
            ("The quick brown fox jumps over the lazy dog", "Le renard brun rapide saute par-dessus le chien paresseux"),
            ("I am learning machine learning and artificial intelligence", "J'apprends l'apprentissage automatique et l'intelligence artificielle"),
            ("The attention mechanism revolutionized natural language processing", "Le mécanisme d'attention a révolutionné le traitement du langage naturel"),
            ("Neural networks can learn complex patterns in data", "Les réseaux de neurones peuvent apprendre des motifs complexes dans les données"),
        ],
        "questions and responses": [
            ("What time is it", "Quelle heure est-il"),
            ("It is three o'clock", "Il est trois heures"),
            ("What day is today", "Quel jour sommes-nous"),
            ("Today is Monday", "Aujourd'hui c'est lundi"),
            ("What is the weather like", "Quel temps fait-il"),
            ("It is raining", "Il pleut"),
        ],
        "travel and directions": [
            ("I need a taxi", "J'ai besoin d'un taxi"),
            ("How much does it cost", "Combien ça coûte"),
            ("Turn left at the corner", "Tournez à gauche au coin"),
            ("Go straight ahead", "Allez tout droit"),
            ("The hotel is near the park", "L'hôtel est près du parc"),
        ],
        "food and dining": [
            ("I am hungry", "J'ai faim"),
            ("I am thirsty", "J'ai soif"),
            ("The food is delicious", "La nourriture est délicieuse"),
            ("Can I have the menu please", "Puis-je avoir le menu s'il vous plaît"),
            ("I would like a coffee", "Je voudrais un café"),
        ],
        "numbers and time": [
            ("One two three four five", "Un deux trois quatre cinq"),
            ("Six seven eight nine ten", "Six sept huit neuf dix"),
            ("What is your phone number", "Quel est votre numéro de téléphone"),
            ("My phone number is five five five", "Mon numéro de téléphone est cinq cinq cinq"),
        ],
    }

    # Flatten the themed groups into a single list of pairs.
    return [pair for group in themed_pairs.values() for pair in group]

# Build the corpus and report summary statistics about it
translation_data = create_translation_data()

print("TRANSLATION DATASET OVERVIEW")
print("=" * 40)
print(f"Total translation pairs: {len(translation_data)}")
print()

# Whitespace-delimited word counts for each side of every pair
en_lengths = [len(english.split()) for english, _ in translation_data]
fr_lengths = [len(french.split()) for _, french in translation_data]


def _print_length_summary(heading, lengths):
    """Print mean / min / max word counts under ``heading``, then a blank line."""
    print(heading)
    print(f"  Average length: {np.mean(lengths):.1f} words")
    print(f"  Min length: {min(lengths)} words")
    print(f"  Max length: {max(lengths)} words")
    print()


_print_length_summary("English sentence statistics:", en_lengths)
_print_length_summary("French sentence statistics:", fr_lengths)

# Preview the first ten pairs, numbered from 1; the empty trailing field
# produces the blank separator line after each pair.
print("Sample translation pairs:")
for i, (en, fr) in enumerate(translation_data[:10]):
    print(f"  {i+1:2d}. EN: {en}", f"      FR: {fr}", "", sep="\n")

# Side-by-side histograms of sentence length (in words) for each language
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))


def _plot_length_histogram(ax, lengths, color, title):
    """Draw a 10-bin histogram of ``lengths`` on ``ax`` with the shared labels and grid."""
    ax.hist(lengths, bins=10, alpha=0.7, color=color, edgecolor='black')
    ax.set_xlabel('Sentence Length (words)')
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)


_plot_length_histogram(ax1, en_lengths, 'skyblue', 'English Sentence Length Distribution')
_plot_length_histogram(ax2, fr_lengths, 'lightcoral', 'French Sentence Length Distribution')

plt.tight_layout()
plt.show()
