# Problem 1
1) Train and validate a transformer model for learning the above sequence. Use sequence lengths of 10, 20, and 30 for your training. Feel free to adjust other network parameters. Report and compare training loss, validation accuracy, execution time for training, and computational and model size complexities against the RNN-based approaches in Homework 3.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import time

# Define the text sequence
# Read the corpus through a context manager so the file handle is closed as
# soon as its contents are in memory.
with open("text.txt", "r") as f:
    text = f.read()
# Unique characters in the text
chars = sorted(set(text))
vocab_size = len(chars)

# Create a mapping from characters to integers and vice versa
char_to_int = {c: i for i, c in enumerate(chars)}
int_to_char = {i: c for i, c in enumerate(chars)}

# Length of sequences for training
sequence_lengths = [10, 20, 30]

# Function to generate input-output pairs
def generate_sequences(text, sequence_length):
    """Slide a fixed-size window over ``text`` and pair it with what follows.

    Returns a tuple ``(sequences, next_chars)`` of two equally long lists:
    ``sequences[k]`` is ``text[k:k + sequence_length]`` and ``next_chars[k]``
    is the element immediately after that window. Both lists are empty when
    ``text`` is not longer than ``sequence_length``.
    """
    # Every start offset that still leaves one element after the window.
    starts = range(len(text) - sequence_length)
    windows = [text[start:start + sequence_length] for start in starts]
    followers = [text[start + sequence_length] for start in starts]
    return windows, followers

# For every sequence length, generate the windows and their one-hot encoding
sequences_data = {}
for length in sequence_lengths:
    sequences, next_chars = generate_sequences(text, length)
    sample_count = len(sequences)

    # Inputs are one-hot per time step: (sample_count, length, vocab_size).
    # Targets are one-hot per sample:   (sample_count, vocab_size).
    X = np.zeros((sample_count, length, vocab_size), dtype=np.float32)
    y = np.zeros((sample_count, vocab_size), dtype=np.float32)
    for row, (window, following) in enumerate(zip(sequences, next_chars)):
        for step, symbol in enumerate(window):
            X[row, step, char_to_int[symbol]] = 1
        y[row, char_to_int[following]] = 1

    sequences_data[length] = {
        'sequences': sequences,
        'next_chars': next_chars,
        'X': torch.from_numpy(X),
        'y': torch.from_numpy(y),
    }

# Define RNN, LSTM, and GRU models
class CharRNN(nn.Module):
    """Single-layer vanilla RNN with a linear layer applied to the last step.

    Args:
        input_size: Size of each input feature vector.
        hidden_size: Number of hidden units in the recurrent layer.
        output_size: Number of output logits. When omitted, the module-level
            ``vocab_size`` is used, which is what this class always did
            before the parameter existed.
    """

    def __init__(self, input_size, hidden_size, output_size=None):
        super(CharRNN, self).__init__()
        if output_size is None:
            # Backward-compatible fallback to the notebook-level vocabulary.
            output_size = vocab_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return logits computed from the recurrent output at the final step.

        The recurrent layer is built with ``batch_first=True``; only
        ``out[:, -1, :]`` is passed on to the linear layer.
        """
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])
        return out

class CharLSTM(nn.Module):
    """Single-layer LSTM with a linear layer applied to the last step.

    Args:
        input_size: Size of each input feature vector.
        hidden_size: Number of hidden units in the recurrent layer.
        output_size: Number of output logits. When omitted, the module-level
            ``vocab_size`` is used, which is what this class always did
            before the parameter existed.
    """

    def __init__(self, input_size, hidden_size, output_size=None):
        super(CharLSTM, self).__init__()
        if output_size is None:
            # Backward-compatible fallback to the notebook-level vocabulary.
            output_size = vocab_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return logits computed from the recurrent output at the final step.

        The recurrent layer is built with ``batch_first=True``; only
        ``out[:, -1, :]`` is passed on to the linear layer.
        """
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

class CharGRU(nn.Module):
    """Single-layer GRU with a linear layer applied to the last step.

    Args:
        input_size: Size of each input feature vector.
        hidden_size: Number of hidden units in the recurrent layer.
        output_size: Number of output logits. When omitted, the module-level
            ``vocab_size`` is used, which is what this class always did
            before the parameter existed.
    """

    def __init__(self, input_size, hidden_size, output_size=None):
        super(CharGRU, self).__init__()
        if output_size is None:
            # Backward-compatible fallback to the notebook-level vocabulary.
            output_size = vocab_size
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return logits computed from the recurrent output at the final step.

        The recurrent layer is built with ``batch_first=True``; only
        ``out[:, -1, :]`` is passed on to the linear layer.
        """
        out, _ = self.gru(x)
        out = self.fc(out[:, -1, :])
        return out

# Define training parameters
epochs = 50
batch_size = 128
hidden_size = 128

# Train and evaluate models for each sequence length
results = {}
for length, data in sequences_data.items():
    X = data['X']
    y = data['y']
    input_size = X.shape[-1]
    # The class indices never change, so decode the one-hot targets once
    # instead of once per batch per epoch.
    target_indices = torch.argmax(y, dim=1)

    for model_type, Model in [('RNN', CharRNN), ('LSTM', CharLSTM), ('GRU', CharGRU)]:
        model = Model(input_size, hidden_size)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters())

        print(f"Training {model_type} model with sequence length {length}...")
        start_time = time.time()
        for epoch in range(epochs):
            running_loss = 0.0
            for i in range(0, len(X), batch_size):
                inputs = X[i:i+batch_size]
                labels = target_indices[i:i+batch_size]

                optimizer.zero_grad()

                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                # Weight by batch size so the short final batch is not over-counted.
                running_loss += loss.item() * inputs.size(0)

            epoch_loss = running_loss / len(X)
        end_time = time.time()
        execution_time = end_time - start_time

        # Evaluate model (on the same data it was trained on)
        with torch.no_grad():
            outputs = model(X)
            _, predicted = torch.max(outputs, 1)
            total = y.size(0)
            correct = (predicted == target_indices).sum().item()
            accuracy = correct / total
        # Model complexity
        model_size = sum(p.numel() for p in model.parameters())

        results.setdefault(model_type, {}).setdefault(length, {})
        results[model_type][length]['loss'] = epoch_loss
        results[model_type][length]['accuracy'] = accuracy
        # No trailing comma here: it would store a 1-tuple instead of a float.
        results[model_type][length]['execution_time'] = execution_time
        results[model_type][length]['model_size'] = model_size

        print(f"Model {model_type} with sequence length {length} - Training Loss: {epoch_loss:.4f}, Training Accuracy: {accuracy:.4f}, Execution Time: {execution_time:.2f} seconds, Model Size: {model_size} parameters\n")

        # Predict next character for a sequence
        test_sequence = sequences_data[length]['sequences'][0]
        X_test = torch.unsqueeze(sequences_data[length]['X'][0], 0)
        with torch.no_grad():
            outputs = model(X_test)
            _, predicted_index = torch.max(outputs, 1)
            predicted_char = int_to_char[int(predicted_index)]
            print(f"Next predicted character for sequence '{test_sequence}': {predicted_char}\n")

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results)
results_df.index.name = 'Sequence Length'

# Display the results
print(results_df)


Training RNN model with sequence length 10...
Model RNN with sequence length 10 - Training Loss: 0.9861, Training Accuracy: 0.7156, Execution Time: 27.37 seconds, Model Size: 28205 parameters

Next predicted character for sequence 'Next chara': c

Training LSTM model with sequence length 10...
Model LSTM with sequence length 10 - Training Loss: 1.3095, Training Accuracy: 0.6109, Execution Time: 36.08 seconds, Model Size: 95405 parameters

Next predicted character for sequence 'Next chara': c

Training GRU model with sequence length 10...
Model GRU with sequence length 10 - Training Loss: 1.0053, Training Accuracy: 0.7308, Execution Time: 57.57 seconds, Model Size: 73005 parameters

Next predicted character for sequence 'Next chara': c

Training RNN model with sequence length 20...
Model RNN with sequence length 20 - Training Loss: 0.9853, Training Accuracy: 0.7305, Execution Time: 51.87 seconds, Model Size: 28205 parameters

Next predicted character for sequence 'Next character predi':

# Problem 2
1) Train the models for sequence lengths of 20 and 30; report and compare training loss, validation accuracy, execution time for training, and computational and model size complexities, and compare them against RNN-based models.
2) Adjust the hyperparameters (number of layers, hidden size, and the number of heads) and compare your results (training and validation loss, computation complexity, model size, training and inference time, and the output sequence). Analyze their influence on accuracy, running time, and computational perplexity.
3) What if we increase the sequence length to 50. Perform the training and report the accuracy and model complexity results.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import requests
import time

# Step 1: Download the dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being used as the corpus.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text

# Step 2: Prepare the dataset
sequence_lengths = [20, 30]

# Create a character mapping to integers
chars = sorted(set(text))
char_to_int = {ch: i for i, ch in enumerate(chars)}

# Encode the text into integers
encoded_text = [char_to_int[ch] for ch in text]

# Define dataset class
class CharDataset(Dataset):
    """Pairs each stored input sequence with the target at the same index."""

    def __init__(self, sequence, target):
        # Only references are kept; the containers are not copied.
        self.sequence, self.target = sequence, target

    def __len__(self):
        """Number of samples, taken from the sequence container."""
        sample_count = len(self.sequence)
        return sample_count

    def __getitem__(self, idx):
        """Return the ``(sequence, target)`` pair stored at ``idx``."""
        features = self.sequence[idx]
        label = self.target[idx]
        return features, label

# Define LSTM and GRU models
class CharRNN(nn.Module):
    """Character model: embedding -> stacked LSTM or GRU -> linear layer.

    Args:
        input_size: Number of rows in the embedding table.
        hidden_size: Embedding width, also used as the recurrent hidden size.
        output_size: Number of output logits.
        num_layers: Number of stacked recurrent layers.
        rnn_type: ``'lstm'`` or ``'gru'``.

    Raises:
        ValueError: If ``rnn_type`` is neither ``'lstm'`` nor ``'gru'``.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers, rnn_type='lstm'):
        super(CharRNN, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        if rnn_type == 'lstm':
            self.rnn = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        elif rnn_type == 'gru':
            self.rnn = nn.GRU(hidden_size, hidden_size, num_layers, batch_first=True)
        else:
            # Fail at construction time; otherwise self.rnn would be missing
            # and the error would only surface on the first forward pass.
            raise ValueError(
                f"Unsupported rnn_type {rnn_type!r}; expected 'lstm' or 'gru'"
            )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return logits computed from the recurrent output at the final step."""
        embed = self.embedding(x)
        out, _ = self.rnn(embed)
        # Only the last time step (batch-first layout) feeds the classifier.
        out = self.fc(out[:, -1, :])
        return out

# Define training function
def train_model(model, train_loader, test_loader, criterion, optimizer, epochs):
    """Train ``model`` for ``epochs`` epochs, evaluating after every epoch.

    Each epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free pass over ``test_loader``, then prints a one-line summary.

    Returns:
        ``(train_losses, test_losses, accuracies, execution_time)``. The three
        lists hold one entry per epoch: the mean of the per-batch training
        losses, the mean of the per-batch test losses, and the fraction of
        test samples predicted correctly. ``execution_time`` is the wall-clock
        duration of the whole call in seconds, evaluation included.
    """
    train_losses, test_losses, accuracies = [], [], []
    started_at = time.time()

    for epoch_idx in range(epochs):
        # ---- optimisation pass ----
        model.train()
        batch_loss_sum = 0.0
        for batch_inputs, batch_targets in train_loader:
            optimizer.zero_grad()
            batch_loss = criterion(model(batch_inputs), batch_targets)
            batch_loss.backward()
            optimizer.step()
            batch_loss_sum += batch_loss.item()
        train_loss = batch_loss_sum / len(train_loader)
        train_losses.append(train_loss)

        # ---- evaluation pass ----
        model.eval()
        eval_loss_sum, hits, seen = 0.0, 0, 0
        with torch.no_grad():
            for batch_inputs, batch_targets in test_loader:
                logits = model(batch_inputs)
                eval_loss_sum += criterion(logits, batch_targets).item()
                predicted = logits.max(dim=1).indices
                seen += batch_targets.size(0)
                hits += (predicted == batch_targets).sum().item()
        test_loss = eval_loss_sum / len(test_loader)
        test_losses.append(test_loss)

        accuracy = hits / seen
        accuracies.append(accuracy)

        print(f"Epoch {epoch_idx+1}/{epochs}, Train Loss: {train_loss:.6f}, Test Loss: {test_loss:.6f}, Accuracy: {accuracy:.4f}")

    execution_time = time.time() - started_at
    return train_losses, test_losses, accuracies, execution_time

# Step 3: Create data loaders
batch_size = 128
train_loaders = []
test_loaders = []
# Encode the corpus as a tensor once; the per-length windows below are views
# into it rather than one Python list per character position.
encoded_tensor = torch.tensor(encoded_text, dtype=torch.long)
for seq_length in sequence_lengths:
    num_samples = len(encoded_text) - seq_length
    # unfold() yields every stride-1 window of `seq_length` tokens. The final
    # window is dropped because no token follows it to serve as a target.
    sequences = encoded_tensor.unfold(0, seq_length, 1)[:num_samples]
    targets = encoded_tensor[seq_length:]

    dataset = CharDataset(sequences, targets)
    train_size = int(len(dataset) * 0.8)
    test_size = len(dataset) - train_size
    # NOTE: the split is random and unseeded, so it differs between runs.
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

    train_loaders.append(train_loader)
    test_loaders.append(test_loader)

# Step 4: Define hyperparameters
input_size = len(chars)
hidden_size = 256
output_size = len(chars)
num_layers = 2
epochs = 1
learning_rate = 0.001

# Step 5: Initialize and train LSTM models
# One fresh model, loss and optimizer per sequence length; each history list
# receives one entry per sequence length, in `sequence_lengths` order.
lstm_train_losses, lstm_test_losses = [], []
lstm_accuracies, lstm_execution_times = [], []
for seq_length, lstm_train_loader, lstm_test_loader in zip(sequence_lengths, train_loaders, test_loaders):
    print(f"\nTraining LSTM for sequence length {seq_length}")
    lstm_model = CharRNN(input_size, hidden_size, output_size, num_layers, rnn_type='lstm')
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate)
    lstm_train_loss, lstm_test_loss, lstm_accuracy, lstm_execution_time = train_model(
        lstm_model, lstm_train_loader, lstm_test_loader, criterion, optimizer, epochs
    )
    lstm_train_losses += [lstm_train_loss]
    lstm_test_losses += [lstm_test_loss]
    lstm_accuracies += [lstm_accuracy]
    lstm_execution_times += [lstm_execution_time]

# Step 6: Initialize and train GRU models
# Same procedure as above with the GRU variant of the network.
gru_train_losses, gru_test_losses = [], []
gru_accuracies, gru_execution_times = [], []
for seq_length, gru_train_loader, gru_test_loader in zip(sequence_lengths, train_loaders, test_loaders):
    print(f"\nTraining GRU for sequence length {seq_length}")
    gru_model = CharRNN(input_size, hidden_size, output_size, num_layers, rnn_type='gru')
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(gru_model.parameters(), lr=learning_rate)
    gru_train_loss, gru_test_loss, gru_accuracy, gru_execution_time = train_model(
        gru_model, gru_train_loader, gru_test_loader, criterion, optimizer, epochs
    )
    gru_train_losses += [gru_train_loss]
    gru_test_losses += [gru_test_loss]
    gru_accuracies += [gru_accuracy]
    gru_execution_times += [gru_execution_time]

# Print results
# Each section pairs its heading with the four per-length history lists.
report_sections = (
    ("\nLSTM Model Results:",
     lstm_train_losses, lstm_test_losses, lstm_accuracies, lstm_execution_times),
    ("\n\nGRU Model Results:",
     gru_train_losses, gru_test_losses, gru_accuracies, gru_execution_times),
)
for heading, train_history, test_history, accuracy_history, time_history in report_sections:
    print(heading)
    per_length = zip(sequence_lengths, train_history, test_history, accuracy_history, time_history)
    for seq_length, train_entry, test_entry, accuracy_entry, time_entry in per_length:
        print(f"\nResults for sequence length {seq_length}:")
        print("Train Loss:", train_entry)
        print("Test Loss:", test_entry)
        print("Accuracy:", accuracy_entry)
        print("Execution Time:", time_entry)



Training LSTM for sequence length 20
Epoch 1/1, Train Loss: 1.676694, Test Loss: 1.490935, Accuracy: 0.5449

Training LSTM for sequence length 30
Epoch 1/1, Train Loss: 1.663291, Test Loss: 1.469452, Accuracy: 0.5517

Training GRU for sequence length 20
Epoch 1/1, Train Loss: 1.640184, Test Loss: 1.496257, Accuracy: 0.5427

Training GRU for sequence length 30
Epoch 1/1, Train Loss: 1.632545, Test Loss: 1.484360, Accuracy: 0.5477

LSTM Model Results:

Results for sequence length 20:
Train Loss: [1.676693968265411]
Test Loss: [1.490935357356988]
Accuracy: [0.5449198699988793]
Execution Time: 1706.71955037117

Results for sequence length 30:
Train Loss: [1.6632906275785866]
Test Loss: [1.4694518446580336]
Accuracy: [0.5517476341825321]
Execution Time: 4389.573019742966


GRU Model Results:

Results for sequence length 20:
Train Loss: [1.6401844349012795]
Test Loss: [1.4962571951690515]
Accuracy: [0.54270088535246]
Execution Time: 17468.664876699448

Results for sequence length 30:
Train 

# Problem 3
Develop a Transformer-based encoder-decoder architecture for English-to-French translation. Train the model on the entire dataset and evaluate it on the entire dataset. Report training loss, validation loss, and validation accuracy. Also, try some qualitative validation as well, asking the network to generate French translations for some English sentences. Compare your results against an RNN-based network with attention and without attention.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Parallel corpus of (English, French) sentence pairs. Sentences are split on
# whitespace downstream, so every word must be separated by a space.
english_to_french = [
 ("I am cold", "J'ai froid"),
 ("You are tired", "Tu es fatigué"),
 ("He is hungry", "Il a faim"),
 ("She is happy", "Elle est heureuse"),
 ("We are friends", "Nous sommes amis"),
 ("They are students", "Ils sont étudiants"),
 ("The cat is sleeping", "Le chat dort"),
 ("The sun is shining", "Le soleil brille"),
 ("We love music", "Nous aimons la musique"),
 ("She speaks French fluently", "Elle parle français couramment"),
 ("He enjoys reading books", "Il aime lire des livres"),
 ("They play soccer every weekend", "Ils jouent au football chaque week-end"),
 ("The movie starts at 7 PM", "Le film commence à 19 heures"),
 ("She wears a red dress", "Elle porte une robe rouge"),
 ("We cook dinner together", "Nous cuisinons le dîner ensemble"),
 ("He drives a blue car", "Il conduit une voiture bleue"),
 ("They visit museums often", "Ils visitent souvent des musées"),
 ("The restaurant serves delicious food", "Le restaurant sert une délicieuse cuisine"),
 ("She studies mathematics at university", "Elle étudie les mathématiques à l'université"),
 ("We watch movies on Fridays", "Nous regardons des films le vendredi"),
 ("He listens to music while jogging", "Il écoute de la musique en faisant du jogging"),
 ("They travel around the world", "Ils voyagent autour du monde"),
 ("The book is on the table", "Le livre est sur la table"),
 ("She dances gracefully", "Elle danse avec grâce"),
 ("We celebrate birthdays with cake", "Nous célébrons les anniversaires avec un gâteau"),
 ("He works hard every day", "Il travaille dur tous les jours"),
 ("They speak different languages", "Ils parlent différentes langues"),
 ("The flowers bloom in spring", "Les fleurs fleurissent au printemps"),
 ("She writes poetry in her free time", "Elle écrit de la poésie pendant son temps libre"),
 ("We learn something new every day", "Nous apprenons quelque chose de nouveau chaque jour"),
 ("The dog barks loudly", "Le chien aboie bruyamment"),
 ("He sings beautifully", "Il chante magnifiquement"),
 ("They swim in the pool", "Ils nagent dans la piscine"),
 ("The birds chirp in the morning", "Les oiseaux gazouillent le matin"),
 ("She teaches English at school", "Elle enseigne l'anglais à l'école"),
 ("We eat breakfast together", "Nous prenons le petit déjeuner ensemble"),
 ("He paints landscapes", "Il peint des paysages"),
 ("They laugh at the joke", "Ils rient de la blague"),
 ("The clock ticks loudly", "L'horloge tic-tac bruyamment"),
 ("She runs in the park", "Elle court dans le parc"),
 ("We travel by train", "Nous voyageons en train"),
 ("He writes a letter", "Il écrit une lettre"),
 ("They read books at the library", "Ils lisent des livres à la bibliothèque"),
 ("The baby cries", "Le bébé pleure"),
 ("She studies hard for exams", "Elle étudie dur pour les examens"),
 ("We plant flowers in the garden", "Nous plantons des fleurs dans le jardin"),
 ("He fixes the car", "Il répare la voiture"),
 ("They drink coffee in the morning", "Ils boivent du café le matin"),
 ("The sun sets in the evening", "Le soleil se couche le soir"),
 ("She dances at the party", "Elle danse à la fête"),
 ("We play music at the concert", "Nous jouons de la musique au concert"),
 ("He cooks dinner for his family", "Il cuisine le dîner pour sa famille"),
 ("They study French grammar", "Ils étudient la grammaire française"),
 ("The rain falls gently", "La pluie tombe doucement"),
 ("She sings a song", "Elle chante une chanson"),
 ("We watch a movie together", "Nous regardons un film ensemble"),
 ("He sleeps deeply", "Il dort profondément"),
 ("They travel to Paris", "Ils voyagent à Paris"),
 ("The children play in the park", "Les enfants jouent dans le parc"),
 ("She walks along the beach", "Elle se promène le long de la plage"),
 ("We talk on the phone", "Nous parlons au téléphone"),
 ("He waits for the bus", "Il attend le bus"),
 ("They visit the Eiffel Tower", "Ils visitent la tour Eiffel"),
 ("The stars twinkle at night", "Les étoiles scintillent la nuit"),
 ("She dreams of flying", "Elle rêve de voler"),
 ("We work in the office", "Nous travaillons au bureau"),
 ("He studies history", "Il étudie l'histoire"),
 ("They listen to the radio", "Ils écoutent la radio"),
 ("The wind blows gently", "Le vent souffle doucement"),
 ("She swims in the ocean", "Elle nage dans l'océan"),
 ("We dance at the wedding", "Nous dansons au mariage"),
 ("He climbs the mountain", "Il gravit la montagne"),
 ("They hike in the forest", "Ils font de la randonnée dans la forêt"),
 ("The cat meows loudly", "Le chat miaule bruyamment"),
 ("She paints a picture", "Elle peint un tableau"),
 ("We build a sandcastle", "Nous construisons un château de sable"),
 ("He sings in the choir", "Il chante dans le chœur")
]


# Reserved vocabulary indices: SOS is the first input fed to the decoder and
# EOS is appended to every sentence and stops decoding when it is predicted.
SOS_token = 0  
EOS_token = 1 
# Maximum sentence length, in tokens, assumed by the RNN code in this cell.
max_length = 12

def build_vocab(sentences):
    """Collect every whitespace-separated word from both sides of each pair.

    ``sentences`` is an iterable of two-item ``(english, french)`` pairs. The
    result is one set that mixes the words of both languages.
    """
    return {
        word
        for english_sentence, french_sentence in sentences
        for side in (english_sentence, french_sentence)
        for word in side.split()
    }

english_vocab = build_vocab(english_to_french)
french_vocab = english_vocab  # Lets say French and English have the same vocabulary

# The special tokens come first; the sorted words follow from index 2 on.
char_to_index_english = {"SOS": SOS_token, "EOS": EOS_token}
for position, word in enumerate(sorted(english_vocab), start=2):
    char_to_index_english[word] = position
index_to_char_english = dict(
    (position, word) for word, position in char_to_index_english.items()
)

# Same construction for the French side.
char_to_index_french = {"SOS": SOS_token, "EOS": EOS_token}
for position, word in enumerate(sorted(french_vocab), start=2):
    char_to_index_french[word] = position
index_to_char_french = dict(
    (position, word) for word, position in char_to_index_french.items()
)

class CustomDataset(Dataset):
    """Serves sentence pairs as tensors of word indices, each ending in EOS."""

    def __init__(self, dataset, char_to_index_english, char_to_index_french):
        # The sentence pairs plus one word->index map per language.
        self.dataset = dataset
        self.char_to_index_english = char_to_index_english
        self.char_to_index_french = char_to_index_french

    def __len__(self):
        """Number of sentence pairs."""
        pair_count = len(self.dataset)
        return pair_count

    def __getitem__(self, idx):
        """Return ``(english_tensor, french_tensor)`` for the pair at ``idx``.

        Each sentence is split on whitespace, every word is looked up in the
        matching index map, and ``EOS_token`` is appended at the end.
        """
        english_sentence, french_sentence = self.dataset[idx]

        english_ids = [self.char_to_index_english[word] for word in english_sentence.split()]
        english_ids.append(EOS_token)
        french_ids = [self.char_to_index_french[word] for word in french_sentence.split()]
        french_ids.append(EOS_token)

        english_tensor = torch.tensor(english_ids, dtype=torch.long)
        french_tensor = torch.tensor(french_ids, dtype=torch.long)
        return english_tensor, french_tensor

# Wrap the sentence pairs in a dataset and serve them one pair per batch, in
# shuffled order (batch_size=1, shuffle=True).
custom_dataset = CustomDataset(english_to_french, char_to_index_english, char_to_index_french)
dataloader = DataLoader(custom_dataset, batch_size=1, shuffle=True)

class EncoderRNN(nn.Module):
    """Embeds one token at a time and feeds it through a single-layer LSTM.

    Args:
        input_size: Number of rows in the embedding table.
        hidden_size: Embedding width, also used as the LSTM hidden size.
    """

    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Encode one token and return ``(output, hidden)`` from the LSTM."""
        # view(1, 1, -1) flattens the embedding into a single step for a
        # single sequence, the layout the (non batch-first) LSTM consumes.
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.lstm(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return a zeroed ``(h_0, c_0)`` pair of shape ``(1, 1, hidden_size)``.

        The state is created on the device that holds the module's parameters,
        so it stays usable after the module is moved with ``.to(device)``.
        """
        param_device = self.embedding.weight.device
        return (torch.zeros(1, 1, self.hidden_size, device=param_device),
                torch.zeros(1, 1, self.hidden_size, device=param_device))

class DecoderRNN(nn.Module):
    """Single-layer LSTM decoder that emits log-probabilities per token.

    Args:
        hidden_size: Embedding width, also used as the LSTM hidden size.
        output_size: Number of rows in the embedding table and number of
            output classes.
    """

    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden):
        """Decode one token and return ``(log_probs, hidden)``."""
        # view(1, 1, -1) flattens the embedding into a single step for a
        # single sequence, the layout the (non batch-first) LSTM consumes.
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.lstm(embedded, hidden)
        # log_softmax output is what nn.NLLLoss expects as its input.
        output = torch.log_softmax(self.out(output[0]), dim=1)
        return output, hidden

    def initHidden(self):
        """Return a zeroed ``(h_0, c_0)`` pair of shape ``(1, 1, hidden_size)``.

        The state is created on the device that holds the module's parameters,
        so it stays usable after the module is moved with ``.to(device)``.
        """
        param_device = self.embedding.weight.device
        return (torch.zeros(1, 1, self.hidden_size, device=param_device),
                torch.zeros(1, 1, self.hidden_size, device=param_device))

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

learning_rate = 0.01
# NLLLoss pairs with the log_softmax output produced by DecoderRNN.forward.
criterion = nn.NLLLoss()
# Both networks use a hidden size of 256 and are sized from the index maps.
encoder = EncoderRNN(input_size=len(char_to_index_english), hidden_size=256).to(device)
decoder = DecoderRNN(hidden_size=256, output_size=len(char_to_index_french)).to(device)
# One plain-SGD optimizer per network, both using the same learning rate.
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

# Training loop
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=12):
    """Run one optimisation step on a single sentence pair.

    The encoder consumes ``input_tensor`` one token at a time and its final
    hidden state seeds the decoder. Starting from ``SOS_token``, the decoder
    is then fed its own greedy prediction at every step (no teacher forcing)
    and stops early once it predicts ``EOS_token``.

    Args:
        input_tensor: Source token indices, iterated along dimension 0.
        target_tensor: Target token indices, iterated along dimension 0.
        encoder: Module exposing ``initHidden()`` and ``(token, hidden)`` calls.
        decoder: Module called as ``decoder(token, hidden)``.
        encoder_optimizer: Optimizer for the encoder parameters.
        decoder_optimizer: Optimizer for the decoder parameters.
        criterion: Loss applied to each decoder output and target token.
        max_length: Unused. Kept so existing callers keep working; it used to
            size a buffer of encoder outputs that nothing ever read.

    Returns:
        The summed per-step loss divided by the target length.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    loss = 0

    # Only the final hidden state is needed: the per-step encoder outputs were
    # never consumed, and storing them in a fixed-size buffer raised an
    # IndexError for inputs longer than `max_length`.
    for ei in range(input_length):
        _, encoder_hidden = encoder(input_tensor[ei].unsqueeze(0), encoder_hidden)

    decoder_input = torch.tensor([[SOS_token]], device=device)
    decoder_hidden = encoder_hidden

    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        topv, topi = decoder_output.topk(1)
        # Detach so the next step's input carries no gradient history.
        decoder_input = topi.squeeze().detach()

        loss += criterion(decoder_output, target_tensor[di].unsqueeze(0))
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

n_epochs = 100

for epoch in range(n_epochs):
    # Each batch holds exactly one sentence pair, so index 0 unwraps it before
    # the tensors are moved to the training device.
    total_loss = sum(
        train(
            source_batch[0].to(device),
            target_batch[0].to(device),
            encoder,
            decoder,
            encoder_optimizer,
            decoder_optimizer,
            criterion,
        )
        for source_batch, target_batch in dataloader
    )

    # Report the mean per-sentence loss on every tenth epoch only.
    if epoch % 10 != 0:
        continue
    print(f'Epoch {epoch}, Training Loss: {total_loss / len(dataloader)}')

def evaluate(encoder, decoder, dataloader, criterion, n_samples=4):
    """Greedy-decode every pair in ``dataloader`` and print loss and accuracy.

    Both modules are switched to eval mode and are left in that mode. A
    sentence counts as correct only when the predicted index sequence equals
    the full target sequence, EOS included. The first ``n_samples`` pairs are
    also printed as input / target / predicted strings.

    Args:
        encoder: Module exposing ``initHidden()`` and ``(token, hidden)`` calls.
        decoder: Module called as ``decoder(token, hidden)``.
        dataloader: Yields ``(input, target)`` batches; element 0 of each is used.
        criterion: Loss applied to each decoder output and target token.
        n_samples: Number of leading pairs to print.
    """
    encoder.eval()
    decoder.eval()

    total_loss = 0
    total_sentences = 0
    correct_predictions = 0

    with torch.no_grad():
        for i, (input_tensor, target_tensor) in enumerate(dataloader):
            input_tensor = input_tensor[0].to(device)
            target_tensor = target_tensor[0].to(device)

            encoder_hidden = encoder.initHidden()
            input_length = input_tensor.size(0)
            target_length = target_tensor.size(0)

            loss = 0

            # Only the final hidden state is needed: the per-step encoder
            # outputs were never consumed, and storing them in a fixed-size
            # buffer raised an IndexError for over-long inputs.
            for ei in range(input_length):
                _, encoder_hidden = encoder(input_tensor[ei].unsqueeze(0), encoder_hidden)

            decoder_input = torch.tensor([[SOS_token]], device=device)
            decoder_hidden = encoder_hidden

            predicted_indices = []

            for di in range(target_length):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                topv, topi = decoder_output.topk(1)
                predicted_indices.append(topi.item())
                decoder_input = topi.squeeze().detach()

                loss += criterion(decoder_output, target_tensor[di].unsqueeze(0))
                if decoder_input.item() == EOS_token:
                    break

            total_loss += loss.item() / target_length
            total_sentences += 1

            if predicted_indices == target_tensor.tolist():
                correct_predictions += 1

            if i < n_samples:
                # The special tokens are dropped from the printed strings.
                predicted_sentence = ' '.join([index_to_char_french[index] for index in predicted_indices if index not in (SOS_token, EOS_token)])
                target_sentence = ' '.join([index_to_char_french[index.item()] for index in target_tensor if index.item() not in (SOS_token, EOS_token)])
                input_sentence = ' '.join([index_to_char_english[index.item()] for index in input_tensor if index.item() not in (SOS_token, EOS_token)])

                print(f'Input String: {input_sentence}, Target String: {target_sentence}, Predicted String: {predicted_sentence}')

        average_loss = total_loss / len(dataloader)
        accuracy = correct_predictions / total_sentences
        print(f'Evaluation Loss: {average_loss}, Accuracy: {accuracy}')

# Perform evaluation
# Scores the trained networks on the same dataloader that was used for
# training, so the reported figures describe fit to the training pairs.
evaluate(encoder, decoder, dataloader, criterion)


Epoch 0, Training Loss: 4.106473724927099
Epoch 10, Training Loss: 2.9174201144505667
Epoch 20, Training Loss: 2.448108242104113
Epoch 30, Training Loss: 1.7080453643747713
Epoch 40, Training Loss: 1.0160949710545355
Epoch 50, Training Loss: 0.2672420496471614
Epoch 60, Training Loss: 0.11822145122368906
Epoch 70, Training Loss: 0.06757466081929951
Epoch 80, Training Loss: 0.03951154294061099
Epoch 90, Training Loss: 0.02910678185448834
Input String: The cat is sleeping, Target String: Le chat dort, Predicted String: Le chat dort
Input String: The flowers bloom in spring, Target String: Les fleurs fleurissent au printemps, Predicted String: Les fleurs fleurissent au printemps
Input String: They read books at the library, Target String: Ils lisent des livres à la bibliothèque, Predicted String: Ils lisent des livres à la bibliothèque
Input String: The movie starts at 7 PM, Target String: Le film commence à 19 heures, Predicted String: Le film commence à 19 heures
Evaluation Loss: 0.0229

# Problem 4
Like homework 4, repeat problem 3, this time translating from French to English. Train the model on the entire dataset and evaluate it on the entire dataset. Report training loss, validation loss, and validation accuracy. Also, try some qualitative validation as well, asking the network to generate English translations for some French sentences. Which one seems to be more effective, French-to-English or English-to-French? Compare your results against RNN-based models.

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


# Parallel corpus of (English, French) sentence pairs.
# Sentences are tokenised later with str.split(), so every word must be
# separated by whitespace; a missing space silently fuses two words into a
# single vocabulary token.
english_to_french = [
    ("I am cold", "J'ai froid"),
    ("You are tired", "Tu es fatigué"),
    ("He is hungry", "Il a faim"),
    ("She is happy", "Elle est heureuse"),
    ("We are friends", "Nous sommes amis"),
    ("They are students", "Ils sont étudiants"),
    ("The cat is sleeping", "Le chat dort"),
    ("The sun is shining", "Le soleil brille"),
    ("We love music", "Nous aimons la musique"),
    ("She speaks French fluently", "Elle parle français couramment"),
    ("He enjoys reading books", "Il aime lire des livres"),
    ("They play soccer every weekend", "Ils jouent au football chaque week-end"),
    ("The movie starts at 7 PM", "Le film commence à 19 heures"),
    ("She wears a red dress", "Elle porte une robe rouge"),
    ("We cook dinner together", "Nous cuisinons le dîner ensemble"),
    ("He drives a blue car", "Il conduit une voiture bleue"),
    ("They visit museums often", "Ils visitent souvent des musées"),
    ("The restaurant serves delicious food", "Le restaurant sert une délicieuse cuisine"),
    ("She studies mathematics at university", "Elle étudie les mathématiques à l'université"),
    ("We watch movies on Fridays", "Nous regardons des films le vendredi"),
    ("He listens to music while jogging", "Il écoute de la musique en faisant du jogging"),
    ("They travel around the world", "Ils voyagent autour du monde"),
    ("The book is on the table", "Le livre est sur la table"),
    ("She dances gracefully", "Elle danse avec grâce"),
    ("We celebrate birthdays with cake", "Nous célébrons les anniversaires avec un gâteau"),
    ("He works hard every day", "Il travaille dur tous les jours"),
    ("They speak different languages", "Ils parlent différentes langues"),
    ("The flowers bloom in spring", "Les fleurs fleurissent au printemps"),
    ("She writes poetry in her free time", "Elle écrit de la poésie pendant son temps libre"),
    ("We learn something new every day", "Nous apprenons quelque chose de nouveau chaque jour"),
    ("The dog barks loudly", "Le chien aboie bruyamment"),
    ("He sings beautifully", "Il chante magnifiquement"),
    ("They swim in the pool", "Ils nagent dans la piscine"),
    ("The birds chirp in the morning", "Les oiseaux gazouillent le matin"),
    ("She teaches English at school", "Elle enseigne l'anglais à l'école"),
    ("We eat breakfast together", "Nous prenons le petit déjeuner ensemble"),
    ("He paints landscapes", "Il peint des paysages"),
    ("They laugh at the joke", "Ils rient de la blague"),
    ("The clock ticks loudly", "L'horloge tic-tac bruyamment"),
    ("She runs in the park", "Elle court dans le parc"),
    ("We travel by train", "Nous voyageons en train"),
    ("He writes a letter", "Il écrit une lettre"),
    ("They read books at the library", "Ils lisent des livres à la bibliothèque"),
    ("The baby cries", "Le bébé pleure"),
    ("She studies hard for exams", "Elle étudie dur pour les examens"),
    ("We plant flowers in the garden", "Nous plantons des fleurs dans le jardin"),
    ("He fixes the car", "Il répare la voiture"),
    ("They drink coffee in the morning", "Ils boivent du café le matin"),
    ("The sun sets in the evening", "Le soleil se couche le soir"),
    ("She dances at the party", "Elle danse à la fête"),
    ("We play music at the concert", "Nous jouons de la musique au concert"),
    ("He cooks dinner for his family", "Il cuisine le dîner pour sa famille"),
    ("They study French grammar", "Ils étudient la grammaire française"),
    ("The rain falls gently", "La pluie tombe doucement"),
    ("She sings a song", "Elle chante une chanson"),
    ("We watch a movie together", "Nous regardons un film ensemble"),
    ("He sleeps deeply", "Il dort profondément"),
    ("They travel to Paris", "Ils voyagent à Paris"),
    ("The children play in the park", "Les enfants jouent dans le parc"),
    ("She walks along the beach", "Elle se promène le long de la plage"),
    ("We talk on the phone", "Nous parlons au téléphone"),
    ("He waits for the bus", "Il attend le bus"),
    ("They visit the Eiffel Tower", "Ils visitent la tour Eiffel"),
    ("The stars twinkle at night", "Les étoiles scintillent la nuit"),
    ("She dreams of flying", "Elle rêve de voler"),
    ("We work in the office", "Nous travaillons au bureau"),
    ("He studies history", "Il étudie l'histoire"),
    ("They listen to the radio", "Ils écoutent la radio"),
    ("The wind blows gently", "Le vent souffle doucement"),
    ("She swims in the ocean", "Elle nage dans l'océan"),
    ("We dance at the wedding", "Nous dansons au mariage"),
    ("He climbs the mountain", "Il gravit la montagne"),
    ("They hike in the forest", "Ils font de la randonnée dans la forêt"),
    ("The cat meows loudly", "Le chat miaule bruyamment"),
    ("She paints a picture", "Elle peint un tableau"),
    ("We build a sandcastle", "Nous construisons un château de sable"),
    ("He sings in the choir", "Il chante dans le chœur"),
]


# Same corpus with each pair flipped to (French, English) for the reverse task.
french_to_english = [(french, english) for (english, french) in english_to_french]

# Reserved token ids; real words are numbered from 2 upwards.
SOS_token = 0
EOS_token = 1
# Upper bound on tokens per sentence (words plus the trailing EOS).
max_length = 12

def build_vocab(sentences):
    """Return the set of whitespace-separated words found in ``sentences``.

    ``sentences`` is an iterable of two-string pairs; words from both
    members of every pair end up in the same set.
    """
    tokens = set()
    for first_text, second_text in sentences:
        tokens.update(first_text.split())
        tokens.update(second_text.split())
    return tokens

# build_vocab() merges the words of both languages, so the "English" and
# "French" vocabularies below are one and the same set object.
english_vocab = build_vocab(english_to_french)
french_vocab = english_vocab

# word -> id tables: ids 0 and 1 are reserved for SOS/EOS, the sorted words
# follow from 2 upwards. The reverse (id -> word) tables are derived from them.
char_to_index_eng = {"SOS": SOS_token, "EOS": EOS_token}
char_to_index_eng.update(
    (word, position) for position, word in enumerate(sorted(english_vocab), start=2)
)
index_to_char_eng = dict((position, word) for word, position in char_to_index_eng.items())

char_to_index_fr = {"SOS": SOS_token, "EOS": EOS_token}
char_to_index_fr.update(
    (word, position) for position, word in enumerate(sorted(french_vocab), start=2)
)
index_to_char_fr = dict((position, word) for word, position in char_to_index_fr.items())

class CustomDataset(Dataset):
    """Map-style dataset that turns sentence pairs into word-id tensors.

    Each item of ``dataset`` is a pair of strings. The first string is
    encoded with ``char_to_index_eng`` and the second with
    ``char_to_index_fr``; ``EOS_token`` is appended to both.
    """

    def __init__(self, dataset, char_to_index_eng, char_to_index_fr):
        self.dataset = dataset
        self.char_to_index_eng = char_to_index_eng
        self.char_to_index_fr = char_to_index_fr

    def __len__(self):
        return len(self.dataset)

    @staticmethod
    def _encode(text, table):
        # Look up each whitespace-separated word, then terminate with EOS.
        ids = [table[word] for word in text.split()]
        ids.append(EOS_token)
        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, idx):
        first_text, second_text = self.dataset[idx]
        first_ids = self._encode(first_text, self.char_to_index_eng)
        second_ids = self._encode(second_text, self.char_to_index_fr)
        return first_ids, second_ids

# Model dimensions: vocabulary sizes come from the word -> id tables, and the
# hidden width is shared by the embeddings and the LSTMs.
hidden_size = 256
input_size_eng = len(char_to_index_eng)
input_size_fr = len(char_to_index_fr)

# French -> English task: the dataset's first slot receives the French table
# and its second slot the English table, matching the (french, english) pairs.
custom_dataset = CustomDataset(
    dataset=french_to_english,
    char_to_index_eng=char_to_index_fr,
    char_to_index_fr=char_to_index_eng,
)
dataloader = DataLoader(dataset=custom_dataset, batch_size=1, shuffle=True)

class EncoderRNN(nn.Module):
    """Single-layer LSTM encoder that consumes one token id per call.

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        hidden_size: width of both the embedding and the LSTM state.
    """

    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed ``input`` and advance the LSTM by one step.

        Args:
            input: tensor holding a single token id.
            hidden: ``(h, c)`` LSTM state tuple, e.g. from ``initHidden``.

        Returns:
            ``(output, hidden)`` exactly as returned by ``nn.LSTM``.
        """
        # Reshape to (seq_len=1, batch=1, hidden_size) as the LSTM expects.
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.lstm(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero ``(h, c)`` state on the same device as the module."""
        # Take the device from the module's own weights instead of a
        # module-level global, so the state always matches wherever the
        # encoder has been moved and the class works on its own.
        own_device = self.embedding.weight.device
        return (torch.zeros(1, 1, self.hidden_size, device=own_device),
                torch.zeros(1, 1, self.hidden_size, device=own_device))

class AttnDecoderRNN(nn.Module):
    """Single-step LSTM decoder with attention over a fixed number of encoder slots.

    Args:
        hidden_size: width of the embedding, the LSTM state and each
            encoder output row.
        output_size: size of the output vocabulary.
        max_length: number of attention slots; ``encoder_outputs`` passed to
            ``forward`` must have exactly this many rows.
        dropout_p: dropout probability applied to the embedded input.
    """

    def __init__(self, hidden_size, output_size, max_length=12, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # Scores one weight per encoder slot from [embedded input ; hidden h].
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        # Projects [embedded input ; attention context] back to hidden_size.
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one token.

        Args:
            input: tensor holding a single token id (the previous output).
            hidden: ``(h, c)`` LSTM state tuple.
            encoder_outputs: ``(max_length, hidden_size)`` tensor of encoder
                outputs.

        Returns:
            ``(log_probs, hidden, attn_weights)``: log-probabilities over the
            output vocabulary, the new LSTM state, and the softmax attention
            weights over the encoder slots.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # hidden[0] is h of the (h, c) tuple; [0] drops the layer dimension.
        attn_weights = torch.softmax(
            self.attn(torch.cat((embedded[0], hidden[0][0]), 1)), dim=1)
        # Weighted sum of encoder outputs -> context vector.
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = torch.relu(output)
        output, hidden = self.lstm(output, hidden)

        output = torch.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero ``(h, c)`` state on the same device as the module."""
        # Take the device from the module's own weights instead of a
        # module-level global, so the state always matches wherever the
        # decoder has been moved and the class works on its own.
        own_device = self.embedding.weight.device
        return (torch.zeros(1, 1, self.hidden_size, device=own_device),
                torch.zeros(1, 1, self.hidden_size, device=own_device))

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# Loss, models and one plain-SGD optimizer per model.
# NLLLoss pairs with the log_softmax output of the decoder.
learning_rate = 0.01
criterion = nn.NLLLoss()
encoder = EncoderRNN(input_size_eng, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, input_size_fr).to(device)
encoder_optimizer = optim.SGD(params=encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(params=decoder.parameters(), lr=learning_rate)

# Training loop
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=12):
    """Run one optimisation step on a single sentence pair.

    The input is encoded token by token, then the decoder is unrolled for at
    most ``len(target_tensor)`` steps. The decoder is fed its own greedy
    prediction at every step (no teacher forcing) and decoding stops early
    once it predicts ``EOS_token``.

    Returns:
        The summed per-step loss divided by the target length, as a float.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    target_length = target_tensor.size(0)

    # Encoder pass: store each step's output in its own row; unused rows stay zero.
    state = encoder.initHidden()
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for position, token in enumerate(input_tensor):
        step_output, state = encoder(token.unsqueeze(0), state)
        encoder_outputs[position] = step_output[0, 0]

    # Decoder pass: starts from SOS with the encoder's final state.
    decoder_input = torch.tensor([[SOS_token]], device=device)
    loss = 0
    for expected in target_tensor:
        decoder_output, state, _ = decoder(decoder_input, state, encoder_outputs)
        loss += criterion(decoder_output, expected.unsqueeze(0))

        # Greedy choice becomes the next input; detach so no gradient flows
        # through the discrete selection.
        decoder_input = decoder_output.topk(1)[1].squeeze().detach()
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

n_epochs = 100

# One pass over the dataloader per epoch; the mean per-sentence loss is
# reported on every tenth epoch (0, 10, 20, ...).
for epoch in range(n_epochs):
    total_loss = 0
    for batch in dataloader:
        # Each batch holds a single sentence pair; drop the batch dimension.
        input_tensor_eng, target_tensor_fr = (side[0].to(device) for side in batch)

        loss = train(input_tensor_eng, target_tensor_fr, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        total_loss += loss

    if not epoch % 10:
        print(f'Epoch {epoch}, Training Loss: {total_loss / len(dataloader)}')

def evaluate(encoder, decoder, dataloader, criterion, n_samples=4):
    """Greedy-decode every sample in ``dataloader`` and print loss and accuracy.

    Only the first element of each batch is used. Decoding runs for at most
    as many steps as the target has tokens and stops early when the decoder
    predicts ``EOS_token``. A sentence counts as correct when the predicted
    id sequence equals the target id sequence exactly.

    Args:
        encoder: encoder module providing ``initHidden`` and ``hidden_size``.
        decoder: attention decoder module.
        dataloader: iterable of ``(input_batch, target_batch)`` tensor pairs.
        criterion: loss applied to each decoder step.
        n_samples: number of leading samples to print as
            input/target/prediction strings.

    Returns:
        None. Results are printed.
    """
    # Remember the current modes so evaluation does not leave dropout
    # switched off for any training that runs afterwards.
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()

    total_loss = 0
    total_sentences = 0
    correct_predictions = 0

    try:
        with torch.no_grad():
            for i, (input_tensor_eng, target_tensor_fr) in enumerate(dataloader):
                input_tensor_eng = input_tensor_eng[0].to(device)
                target_tensor_fr = target_tensor_fr[0].to(device)

                encoder_hidden = encoder.initHidden()
                target_length = target_tensor_fr.size(0)

                # Uses the module-level max_length; rows past the input stay zero.
                encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

                loss = 0

                for ei, token in enumerate(input_tensor_eng):
                    encoder_output, encoder_hidden = encoder(token.unsqueeze(0), encoder_hidden)
                    encoder_outputs[ei] = encoder_output[0, 0]

                decoder_input = torch.tensor([[SOS_token]], device=device)
                decoder_hidden = encoder_hidden

                predicted_indices = []

                for di in range(target_length):
                    decoder_output, decoder_hidden, decoder_attention = decoder(
                        decoder_input, decoder_hidden, encoder_outputs)
                    topv, topi = decoder_output.topk(1)
                    predicted_indices.append(topi.item())
                    decoder_input = topi.squeeze().detach()

                    loss += criterion(decoder_output, target_tensor_fr[di].unsqueeze(0))
                    if decoder_input.item() == EOS_token:
                        break

                total_loss += loss.item() / target_length
                total_sentences += 1

                if predicted_indices == target_tensor_fr.tolist():
                    correct_predictions += 1

                if i < n_samples:
                    predicted_sentence = ' '.join([index_to_char_fr[index] for index in predicted_indices if index not in (SOS_token, EOS_token)])
                    target_sentence = ' '.join([index_to_char_fr[index.item()] for index in target_tensor_fr if index.item() not in (SOS_token, EOS_token)])
                    input_sentence = ' '.join([index_to_char_eng[index.item()] for index in input_tensor_eng if index.item() not in (SOS_token, EOS_token)])

                    print(f'Input String: {input_sentence}, Target String: {target_sentence}, Predicted String: {predicted_sentence}')
    finally:
        # Restore whatever mode each module was in before evaluation.
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

    # An empty dataloader would otherwise cause a ZeroDivisionError below.
    if total_sentences == 0:
        print('Evaluation skipped: the dataloader produced no samples')
        return

    # One sentence is processed per batch, so this count equals len(dataloader).
    average_loss = total_loss / total_sentences
    accuracy = correct_predictions / total_sentences
    print(f'Evaluation Loss: {average_loss}, Accuracy: {accuracy}')

# Perform evaluation on the same dataloader that was used for training.
# n_samples keeps its default of 4, so the first four pairs seen are printed.
evaluate(encoder, decoder, dataloader, criterion)


Epoch 0, Training Loss: 4.06513884655841
Epoch 10, Training Loss: 2.843822609545201
Epoch 20, Training Loss: 2.1987189512010996
Epoch 30, Training Loss: 1.0845531671483648
Epoch 40, Training Loss: 0.2560738203688839
Epoch 50, Training Loss: 0.08998587696537162
Epoch 60, Training Loss: 0.05212681795791225
Epoch 70, Training Loss: 0.03579304960700893
Epoch 80, Training Loss: 0.02710383245093666
Epoch 90, Training Loss: 0.021757809355182265
Input String: Il travaille dur tous les jours, Target String: He works hard every day, Predicted String: He works hard every day
Input String: Il attend le bus, Target String: He waits for the bus, Predicted String: He waits for the bus
Input String: Ils écoutent la radio, Target String: They listen to the radio, Predicted String: They listen to the radio
Input String: Elle danse à la fête, Target String: She dances at the party, Predicted String: She dances at the party
Evaluation Loss: 0.01715786101834488, Accuracy: 1.0
