In [170]:
import torch

def get_device():
    if torch.backends.mps.is_available():
        return 'mps'
    elif torch.cuda.is_available():
        return 'cuda'
    else:
        return 'cpu'

device = 'mps'
print("Using device:", device)

Using device: cpu


In [171]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
from nltk.tokenize import word_tokenize

# Load and preprocess data
data = pd.read_csv('data.csv', header=None)
data_subset = data.iloc[:1000]

In [172]:
class TranslationDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe
        self.src = self.data.iloc[:, 0]
        self.trg = self.data.iloc[:, 1]

        self.src_tokenizer = word_tokenize
        self.trg_tokenizer = word_tokenize
        # Ensure special tokens are in the dictionary
        self.word2idx = {"<pad>": 0, "<unk>": 1, "<sos>": 2, "<eos>": 3}
        self.build_vocab()

    def build_vocab(self):
        for index, row in self.data.iterrows():
            src_words = self.src_tokenizer(row[0].lower()) + ["<sos>", "<eos>"]
            trg_words = self.trg_tokenizer(row[1].lower()) + ["<sos>", "<eos>"]
            for word in src_words + trg_words:
                if word not in self.word2idx:
                    self.word2idx[word] = len(self.word2idx)

    def tokenize(self, text):
        return [self.word2idx.get(word, self.word2idx["<unk>"]) for word in word_tokenize(text.lower()) + ["<eos>"]]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        src = torch.tensor([self.word2idx["<sos>"]] + self.tokenize(self.src.iloc[idx]), dtype=torch.long)
        trg = torch.tensor([self.word2idx["<sos>"]] + self.tokenize(self.trg.iloc[idx]), dtype=torch.long)
        return src, trg


In [173]:
dataset = TranslationDataset(data_subset)
train_size = int(0.7 * len(dataset))
valid_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - valid_size
train_dataset, valid_dataset, test_dataset = random_split(dataset, [train_size, valid_size, test_size])


In [174]:
# Define collate function for DataLoader
def collate_fn(batch):
    src_batch, trg_batch = zip(*batch)
    src_batch = torch.nn.utils.rnn.pad_sequence(src_batch, padding_value=0, batch_first=True)
    trg_batch = torch.nn.utils.rnn.pad_sequence(trg_batch, padding_value=0, batch_first=True)
    return src_batch, trg_batch

In [175]:
# Create DataLoaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


In [176]:
# Define the models
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size, batch_first=True)

    def forward(self, input):
        embedded = self.embedding(input)
        output, hidden = self.rnn(embedded)
        return output, hidden

In [177]:
class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.rnn = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_output):
        embedded = self.embedding(input)
        # Ensure hidden states are correctly dimensioned
        # Assuming hidden is a tuple (h_n, c_n)
        if hidden[0].dim() == 3 and hidden[1].dim() == 3:
            output, hidden = self.rnn(embedded, hidden)
        else:
            raise ValueError("Hidden states should be 3-D tensors")
        output = self.out(output)
        return output, hidden


In [178]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg):
        encoder_output, encoder_hidden = self.encoder(src)
        decoder_output, decoder_hidden = self.decoder(trg, encoder_hidden, encoder_output)
        return decoder_output

In [179]:
# Initialize the model
input_size = len(dataset.word2idx)
hidden_size = 256
encoder = Encoder(input_size, hidden_size)
decoder = Decoder(input_size, hidden_size)


In [180]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Initialize the model, optimizer, and loss function as before
# Initialize the model
model = Seq2Seq(encoder, decoder)
model = model.to(device)  # Move model to the appropriate device

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=0)

In [181]:
# Define a learning rate scheduler
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True)




In [182]:
# Training and evaluation function modifications
def train(model, loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for src, trg in loader:
        src = src.to(device)  # Move data to the appropriate device
        trg = trg.to(device)  # Move data to the appropriate device
        optimizer.zero_grad()
        output = model(src, trg[:, :-1])
        output_dim = output.shape[-1]
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:, 1:].contiguous().view(-1)
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

In [183]:
def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for src, trg in loader:
            src = src.to(device)  # Move data to the appropriate device
            trg = trg.to(device)  # Move data to the appropriate device
            output = model(src, trg[:, :-1])
            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)
            trg = trg[:, 1:].contiguous().view(-1)
            loss = criterion(output, trg)
            total_loss += loss.item()
    return total_loss / len(loader)

In [184]:
# Run training and evaluation with early stopping and learning rate scheduler
n_epochs = 100
best_valid_loss = float('inf')
no_improvement_count = 0  # Counter to track epochs without improvement
import pickle

In [185]:
for epoch in range(n_epochs):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    valid_loss = evaluate(model, valid_loader, criterion, device)

    # Step the scheduler with the current validation loss
    scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        no_improvement_count = 0  # Reset counter
        # Save model
        torch.save(model.state_dict(), 'best_model.pth')
        with open('vocab.pkl', 'wb') as f:
            pickle.dump(dataset.word2idx, f)
        print(f'Epoch {epoch+1}: Validation loss improved, model saved.')
    else:
        no_improvement_count += 1
        print(f'Epoch {epoch+1}: No improvement in validation loss for {no_improvement_count} epochs.')

    print(f'Train Loss: {train_loss:.3f}, Valid Loss: {valid_loss:.3f}')

    # Check if early stopping is needed
    if no_improvement_count >= 10:
        print("No improvement in validation loss for 10 consecutive epochs, stopping training.")
        break

Epoch 1: Validation loss improved, model saved.
Train Loss: 7.848, Valid Loss: 6.588
Epoch 2: Validation loss improved, model saved.
Train Loss: 6.015, Valid Loss: 6.508
Epoch 3: Validation loss improved, model saved.
Train Loss: 5.691, Valid Loss: 6.485
Epoch 4: Validation loss improved, model saved.
Train Loss: 5.482, Valid Loss: 6.458
Epoch 5: Validation loss improved, model saved.
Train Loss: 5.297, Valid Loss: 6.426
Epoch 6: Validation loss improved, model saved.
Train Loss: 5.097, Valid Loss: 6.391
Epoch 7: No improvement in validation loss for 1 epochs.
Train Loss: 4.903, Valid Loss: 6.401
Epoch 8: Validation loss improved, model saved.
Train Loss: 4.712, Valid Loss: 6.370
Epoch 9: No improvement in validation loss for 1 epochs.
Train Loss: 4.518, Valid Loss: 6.371
Epoch 10: Validation loss improved, model saved.
Train Loss: 4.324, Valid Loss: 6.370
Epoch 11: No improvement in validation loss for 1 epochs.
Train Loss: 4.136, Valid Loss: 6.402
Epoch 12: No improvement in validati

In [186]:
# Load the best model and test
model.load_state_dict(torch.load('best_model.pth'))
test_loss = evaluate(model, test_loader, criterion, device)
print(f'Test Loss: {test_loss:.3f}')

Test Loss: 6.303


In [187]:
# Load vocabulary
import pickle
with open('vocab.pkl', 'rb') as f:
    loaded_word2idx = pickle.load(f)

# Update the dataset to use the loaded vocabulary
dataset.word2idx = loaded_word2idx
input_size = len(loaded_word2idx)

# Redefine the model with the correct input size
encoder = Encoder(input_size, hidden_size)
decoder = Decoder(input_size, hidden_size)
model = Seq2Seq(encoder, decoder)

# Load the saved model weights
model.load_state_dict(torch.load('best_model.pth'))
model.eval()


Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(8694, 256)
    (rnn): LSTM(256, 256, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(8694, 256)
    (rnn): LSTM(256, 256, batch_first=True)
    (out): Linear(in_features=256, out_features=8694, bias=True)
  )
)

In [188]:
model.load_state_dict(torch.load('best_model.pth'))
model.eval()

# Example of preparing a sentence and translating
def prepare_sentence(sentence, dataset, device):
    tokens = dataset.tokenize(sentence)
    numerical = torch.tensor([tokens], dtype=torch.long).to(device)  # Move tensor to the appropriate device
    return numerical

def translate(model, src_tensor, dataset, device='cpu'):
    src_tensor = src_tensor.to(device)  # Move tensor to the correct device
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor)

    trg_indexes = [dataset.word2idx['<sos>']]  # Start with the <sos> token

    # Initial hidden state setup, ensuring it's 3-D
    hidden = (hidden[0].unsqueeze(0), hidden[1].unsqueeze(0))

    for i in range(100):
        trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
        with torch.no_grad():
            output, hidden = model.decoder(trg_tensor, hidden, encoder_outputs)
        pred_token = output.argmax(1).item()
        trg_indexes.append(pred_token)
        if pred_token == dataset.word2idx['<eos>']:  # Stop if <eos> token is generated
            break

    trg_tokens = [dataset.idx2word.get(i, "<unk>") for i in trg_indexes[1:]]  # Convert indices to tokens
    return " ".join(trg_tokens)



# Example usage:
korean_sentence = "안녕하세요. 저는 학생입니다."
input_tensor = prepare_sentence(korean_sentence, dataset, device)
translation = translate(model, input_tensor, dataset, device)
print("Translated:", translation)



ValueError: Hidden states should be 3-D tensors