In [77]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
from nltk.tokenize import word_tokenize
from torch.optim.lr_scheduler import ReduceLROnPlateau
import pickle

In [78]:
# Determine the best device to run on
def get_device():
    if torch.backends.mps.is_available():
        return 'mps'
    elif torch.cuda.is_available():
        return 'cuda'
    else:
        return 'cpu'
device = get_device()
print("Using device:", device)

Using device: mps


In [79]:
# Define the translation dataset
class TranslationDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe
        self.src = self.data.iloc[:, 0]
        self.trg = self.data.iloc[:, 1]
        self.src_tokenizer = word_tokenize
        self.trg_tokenizer = word_tokenize
        self.word2idx = {"<pad>": 0, "<unk>": 1, "<sos>": 2, "<eos>": 3}
        self.idx2word = {}  # Initialize the idx2word dictionary
        self.build_vocab()

    def build_vocab(self):
        for index, row in self.data.iterrows():
            src_words = self.src_tokenizer(row[0].lower()) + ["<sos>", "<eos>"]
            trg_words = self.trg_tokenizer(row[1].lower()) + ["<sos>", "<eos>"]
            for word in src_words + trg_words:
                if word not in self.word2idx:
                    self.word2idx[word] = len(self.word2idx)
                    self.idx2word[self.word2idx[word]] = word  # Add to idx2word dictionary


    def tokenize(self, text):
        return [self.word2idx.get(word, self.word2idx["<unk>"]) for word in word_tokenize(text.lower()) + ["<eos>"]]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src = torch.tensor([self.word2idx["<sos>"]] + self.tokenize(self.src.iloc[idx]), dtype=torch.long)
        trg = torch.tensor([self.word2idx["<sos>"]] + self.tokenize(self.trg.iloc[idx]), dtype=torch.long)
        return src, trg

In [80]:
# Load data
data = pd.read_csv('data.csv', header=None)
data_subset = data[:10000]
dataset = TranslationDataset(data_subset)
train_size = int(0.7 * len(dataset))
valid_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - valid_size
train_dataset, valid_dataset, test_dataset = random_split(dataset, [train_size, valid_size, test_size])


In [81]:
# DataLoader setup
def collate_fn(batch):
    src_batch, trg_batch = zip(*batch)
    src_batch = nn.utils.rnn.pad_sequence(src_batch, padding_value=0, batch_first=True)
    trg_batch = nn.utils.rnn.pad_sequence(trg_batch, padding_value=0, batch_first=True)
    return src_batch, trg_batch

In [82]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=32, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)

In [83]:
# Define model architecture
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size, batch_first=True)

    def forward(self, input):
        embedded = self.embedding(input)
        output, hidden = self.rnn(embedded)
        return output, hidden

In [84]:
class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_output):
        # Add batch dimension to input if necessary (making sure it's always 3D)
        if input.dim() == 1:
            input = input.unsqueeze(0)  # Add batch dimension if it's missing
        embedded = self.embedding(input)

        # Adjust hidden state dimensions if necessary
        if hidden[0].dim() == 2:
            hidden = (hidden[0].unsqueeze(0), hidden[1].unsqueeze(0))  # Ensure hidden is 3D by adding batch dimension

        output, hidden = self.rnn(embedded, hidden)
        output = self.out(output.squeeze(0))  # Remove batch dimension for linear layer if needed
        return output, hidden

In [85]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg):
        encoder_output, encoder_hidden = self.encoder(src)
        decoder_output, decoder_hidden = self.decoder(trg, encoder_hidden, encoder_output)
        return decoder_output

In [86]:
# Initialize the model
input_size = len(dataset.word2idx)
hidden_size = 256
encoder = Encoder(input_size, hidden_size)
decoder = Decoder(input_size, hidden_size)
model = Seq2Seq(encoder, decoder)
model = model.to(device)

In [87]:
# Optimization setup
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss(ignore_index=0)

# Scheduler for learning rate adjustment
scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=5, verbose=True)




In [88]:
# Early Stopping and Model Checkpointing
class EarlyStopping:
    def __init__(self, patience=10, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = float('inf')
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [89]:
# Initialize EarlyStopping
early_stopping = EarlyStopping(patience=20, verbose=True, path='best_model_final5.pth')


In [90]:
# Define training and evaluation functions
def train(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for src, trg in loader:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg[:, :-1])  # Ignore <eos> for input sequence
        output_dim = output.shape[-1]
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:, 1:].contiguous().view(-1)  # Ignore <sos> for target sequence
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

In [91]:
def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for src, trg in loader:
            src, trg = src.to(device), trg.to(device)
            output = model(src, trg[:, :-1])  # Ignore <eos> for input sequence
            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:, 1:].contiguous().view(-1)  # Ignore <sos> for target sequence
            loss = criterion(output, trg)
            total_loss += loss.item()
    return total_loss / len(loader)

In [92]:
# Run training and evaluate model
n_epochs = 1000
best_valid_loss = float('inf')
for epoch in range(n_epochs):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    valid_loss = evaluate(model, valid_loader, criterion, device)

    # Checkpoint the model if it's the best so far
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_model_final5.pth')
        print(f'Epoch {epoch}: New optimal model saved with loss {valid_loss:.4f}')

    # Early Stopping check
    early_stopping(valid_loss, model)
    if early_stopping.early_stop:
        print("Early stopping triggered")
        break

Epoch 0: New optimal model saved with loss 6.4343
Validation loss decreased (inf --> 6.434255).  Saving model ...
Epoch 1: New optimal model saved with loss 6.3297
Validation loss decreased (6.434255 --> 6.329722).  Saving model ...
Epoch 2: New optimal model saved with loss 6.2602
Validation loss decreased (6.329722 --> 6.260222).  Saving model ...
Epoch 3: New optimal model saved with loss 6.1996
Validation loss decreased (6.260222 --> 6.199623).  Saving model ...
Epoch 4: New optimal model saved with loss 6.1452
Validation loss decreased (6.199623 --> 6.145158).  Saving model ...
Epoch 5: New optimal model saved with loss 6.0950
Validation loss decreased (6.145158 --> 6.095039).  Saving model ...
Epoch 6: New optimal model saved with loss 6.0491
Validation loss decreased (6.095039 --> 6.049051).  Saving model ...
Epoch 7: New optimal model saved with loss 6.0083
Validation loss decreased (6.049051 --> 6.008292).  Saving model ...
Epoch 8: New optimal model saved with loss 5.9750
Val

In [93]:
# Prepare a sentence for translation
def prepare_sentence(sentence, dataset, device):
    tokens = dataset.tokenize(sentence)
    numerical = torch.tensor([tokens], dtype=torch.long).to(device)
    return numerical

In [94]:
# Load the best model for testing
model.load_state_dict(torch.load('best_model_final5.pth'))
test_loss = evaluate(model, test_loader, criterion, device)
print(f'Test Loss: {test_loss:.4f}')

Test Loss: 5.6747


In [95]:
# Translate the sentence
def translate(model, src_tensor, dataset, device):
    model.eval()
    src_tensor = src_tensor.to(device)
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor)

    trg_indexes = [dataset.word2idx['<sos>']]  # Start token

    for _ in range(100):  # Maximum length of the translated sentence
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
        with torch.no_grad():
            output, hidden = model.decoder(trg_tensor, hidden, encoder_outputs)
        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)
        if pred_token == dataset.word2idx['<eos>']:  # End token
            break

    translated_sentence = ' '.join(dataset.idx2word.get(idx, '<unk>') for idx in trg_indexes[1:-1])  # Skip <sos> and exclude <eos>
    return translated_sentence

In [96]:
'''
# Run training and evaluate model
n_epochs = 100
best_valid_loss = float('inf')
for epoch in range(n_epochs):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    valid_loss = evaluate(model, valid_loader, criterion, device)
    scheduler.step(valid_loss)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_model.pth')
    if epoch % 5 == 0:
        print(f'Epoch: {epoch}, Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}')

# Test model performance
model.load_state_dict(torch.load('best_model.pth'))
test_loss = evaluate(model, test_loader, criterion, device)
print(f'Test Loss: {test_loss:.4f}')



'''

"\n# Run training and evaluate model\nn_epochs = 100\nbest_valid_loss = float('inf')\nfor epoch in range(n_epochs):\n    train_loss = train(model, train_loader, optimizer, criterion, device)\n    valid_loss = evaluate(model, valid_loader, criterion, device)\n    scheduler.step(valid_loss)\n    if valid_loss < best_valid_loss:\n        best_valid_loss = valid_loss\n        torch.save(model.state_dict(), 'best_model.pth')\n    if epoch % 5 == 0:\n        print(f'Epoch: {epoch}, Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}')\n\n# Test model performance\nmodel.load_state_dict(torch.load('best_model.pth'))\ntest_loss = evaluate(model, test_loader, criterion, device)\nprint(f'Test Loss: {test_loss:.4f}')\n\n\n\n"

In [97]:
# Example sentence
korean_sentence = "안녕하세요. 저는 학생입니다."
input_tensor = prepare_sentence(korean_sentence, dataset, device)

# Output translation
translation = translate(model, input_tensor, dataset, device)
print("Original :", korean_sentence, "Translation:", translation)

Original : 안녕하세요. 저는 학생입니다. Translation: the most of the same .
