<a href="https://colab.research.google.com/github/Ha1ion/2025_NLP_HW2/blob/main/nlp_hw2_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RNN-arithmetic

## Dataset
- [Arithmetic dataset](https://drive.google.com/file/d/1cMuL3hF9jefka9RyF4gEBIGGeFGZYHE-/view?usp=sharing)

此作業有使用Gemini幫忙下註解

In [None]:
# Mount Google Drive under /content/drive; the dataset CSVs are read from there later.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install seaborn
! pip install opencc
! pip install -U scikit-learn

import numpy as np
import pandas as pd
import torch
import torch.nn
import torch.nn.utils.rnn
import torch.utils.data
import matplotlib.pyplot as plt
import seaborn as sns
import opencc
import os
from sklearn.model_selection import train_test_split

data_path = './data'

Collecting opencc
  Downloading OpenCC-1.1.9-cp312-cp312-manylinux2014_x86_64.whl.metadata (13 kB)
Downloading OpenCC-1.1.9-cp312-cp312-manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opencc
Successfully installed opencc-1.1.9
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m70.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
Successfully installed scikit-learn-1.7.2


In [None]:
# Dataset folder on the mounted Google Drive.
data_path = '/content/drive/MyDrive/2025_NLP_HW2'

# Read the training and evaluation splits from CSV.
train_csv = os.path.join(data_path, 'arithmetic_train.csv')
eval_csv = os.path.join(data_path, 'arithmetic_eval.csv')
df_train = pd.read_csv(train_csv)
df_eval = pd.read_csv(eval_csv)

# Preview the first rows of the training split.
df_train.head()

Unnamed: 0.1,Unnamed: 0,src,tgt
0,2285313,14*(43+20)=,882
1,317061,(6+1)*5=,35
2,718770,13+32+29=,74
3,170195,31*(3-11)=,-248
4,2581417,24*49+1=,1177


In [None]:
# Cast the targets to strings so they can be concatenated with the questions
# and compared with the generated text during evaluation.
df_train['tgt'] = df_train['tgt'].map(str)
df_eval['tgt'] = df_eval['tgt'].map(str)

# Build the full training sequence "<question>=<answer>".
# Everything from the first '=' onwards is dropped before the answer is
# appended, so re-running this cell rebuilds the same string instead of
# appending the answer a second time (which a plain `src + tgt` would do).
df_train['src'] = df_train['src'].str.split('=', n=1).str[0] + '=' + df_train['tgt']
df_train['len'] = df_train['src'].str.len()

# Build Dictionary
 - The model cannot perform calculations directly with plain text.
 - Convert all text (numbers/symbols) into numerical representations.
 - Special tokens
    - '&lt;pad&gt;'
        - Each sentence within a batch may have different lengths.
        - The length is padded with '&lt;pad&gt;' to match the longest sentence in the batch.
    - '&lt;eos&gt;'
        - Specifies the end of the generated sequence.
        - Without '&lt;eos&gt;', the model will not know when to stop generating.

In [None]:
# Build the vocabulary: every token seen in the training data gets an integer id.
# char_to_id maps a token to its id; id_to_char is the inverse mapping.
# Both mappings contain the special tokens <pad> and <eos>.

# Distinct characters that occur in the training sequences.
vocab = {ch for text in df_train['src'] for ch in text}

# Special tokens are listed first, so <pad> receives id 0 and <eos> id 1.
special_tokens = ['<pad>', '<eos>']
ordered_tokens = special_tokens + sorted(vocab)

char_to_id = {}
for idx, token in enumerate(ordered_tokens):
    char_to_id[token] = idx

# Invert the mapping for decoding ids back to tokens.
id_to_char = dict(zip(char_to_id.values(), char_to_id.keys()))

vocab_size = len(char_to_id)
print('Vocab size{}'.format(vocab_size))

Vocab size18


# Data Preprocessing
 - The data is processed into the format required for the model's input and output. (End with \<eos\> token)


In [None]:
def text_to_ids(text, char_map):
    """Translate each character of `text` into its id using `char_map`.

    Returns a list with one id per character, in order. A character that is
    missing from `char_map` raises KeyError.
    """
    ids = []
    for symbol in text:
        ids.append(char_map[symbol])
    return ids


# Encode every training sequence as token ids and terminate it with <eos>.
eos_id = char_to_id['<eos>']
df_train['char_id_list'] = df_train['src'].map(lambda seq: text_to_ids(seq, char_to_id) + [eos_id])

def create_shifted_label(char_ids, char_map=None):
    """Build the training labels for one encoded sequence.

    The label at position t is the token that follows position t in
    `char_ids` (the last position gets <pad>). Positions before the '='
    token are replaced with <pad>, so the loss (which ignores <pad>) is only
    computed on the answer and the closing <eos>, as described in the
    "Data Batching" section of this notebook.

    Args:
        char_ids: list of token ids for "<question>=<answer><eos>".
        char_map: token -> id mapping; defaults to the notebook-level
            `char_to_id` when omitted.

    Returns:
        A list of label ids with the same length as `char_ids`. If the
        sequence contains no '=' token, the plain shifted labels are returned
        without any masking.
    """
    if char_map is None:
        char_map = char_to_id

    pad_id = char_map['<pad>']
    shifted = char_ids[1:] + [pad_id]

    eq_id = char_map.get('=')
    if eq_id is None or eq_id not in char_ids:
        return shifted

    # The position of '=' is the first one whose next token belongs to the answer.
    eq_pos = char_ids.index(eq_id)
    return [pad_id] * eq_pos + shifted[eq_pos:]

# Derive the label sequence for every encoded training sequence.
df_train['label_id_list'] = df_train['char_id_list'].map(create_shifted_label)

# Normalise the evaluation prompts: remove any '=' and append a single one.
df_eval['src'] = df_eval['src'].str.replace('=', '').add('=')

df_train.head()

Unnamed: 0.1,Unnamed: 0,src,tgt,len,char_id_list,label_id_list
0,2285313,14*(43+20)882=882,882,14,"[8, 11, 4, 2, 11, 10, 5, 9, 7, 3, 15, 15, 9, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15,..."
1,317061,(6+1)*535=35,35,10,"[2, 13, 5, 8, 3, 4, 12, 10, 12, 17, 10, 12, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 12, 1]"
2,718770,13+32+2974=74,74,11,"[8, 10, 5, 10, 9, 5, 9, 16, 14, 11, 17, 14, 11...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 1]"
3,170195,31*(3-11)-248=-248,-248,14,"[10, 8, 4, 2, 10, 6, 8, 8, 3, 6, 9, 11, 15, 17...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, ..."
4,2581417,24*49+11177=1177,1177,12,"[9, 11, 4, 11, 16, 5, 8, 8, 8, 14, 14, 17, 8, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 14,..."


# Hyper Parameters

|Hyperparameter|Meaning|Value|
|-|-|-|
|`batch_size`|Number of data samples in a single batch|64|
|`epochs`|Total number of epochs to train|10|
|`embed_dim`|Dimension of the word embeddings|256|
|`hidden_dim`|Dimension of the hidden state in each timestep of the RNN|256|
|`lr`|Learning Rate|0.001|
|`grad_clip`|To prevent gradient explosion in RNNs, restrict the gradient range|1|

In [None]:
# Training configuration.
batch_size, epochs = 64, 2      # samples per batch / passes over the training set
embed_dim = hidden_dim = 256    # embedding width and RNN hidden-state width
lr = 1e-3                       # optimizer learning rate
grad_clip = 1                   # bound used when clipping gradient values

# Data Batching
- Use `torch.utils.data.Dataset` to create a data generation tool called `dataset`.
- Then, use `torch.utils.data.DataLoader` to randomly sample from the `dataset` and group the samples into batches.

- Example: 1+2-3=0
    - Model input: 1 + 2 - 3 = 0
    - Model output: / / / / / 0 &lt;eos&gt;  (the '/' can be replaced with &lt;pad&gt;)
    - The key point is that the model does not need to predict the next character within the question part (everything before '='). What matters is that once the model sees '=', it starts generating the answer, which is '0' here. After generating the answer, it should also generate &lt;eos&gt;.

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a frame that holds one encoded sequence per row."""

    def __init__(self, sequences):
        # `sequences` is indexed with .iloc and must expose the
        # 'char_id_list' and 'label_id_list' columns.
        self.sequences = sequences

    def __len__(self):
        """Number of samples in the dataset."""
        n_samples = len(self.sequences)
        return n_samples

    def __getitem__(self, index):
        """Return the (input ids, label ids) pair stored at position `index`."""
        record = self.sequences.iloc[index]
        return record['char_id_list'], record['label_id_list']

# collate function, used to build dataloader
def collate_fn(batch):
    batch_x = [torch.tensor(data[0]) for data in batch]
    batch_y = [torch.tensor(data[1]) for data in batch]
    batch_x_lens = torch.LongTensor([len(x) for x in batch_x])
    batch_y_lens = torch.LongTensor([len(y) for y in batch_y])

    # Pad the input sequence
    pad_batch_x = torch.nn.utils.rnn.pad_sequence(batch_x,
                                                  batch_first=True,
                                                  padding_value=char_to_id['<pad>'])

    pad_batch_y = torch.nn.utils.rnn.pad_sequence(batch_y,
                                                  batch_first=True,
                                                  padding_value=char_to_id['<pad>'])

    return pad_batch_x, pad_batch_y, batch_x_lens, batch_y_lens

In [None]:
# Wrap only the two columns that Dataset.__getitem__ reads (input ids and label ids).
ds_train = Dataset(df_train[['char_id_list', 'label_id_list']])

In [None]:
# Build dataloader of train set and eval set, collate_fn is the collate function
ds_train = Dataset(df_train) # Write your code here

dl_train = torch.utils.data.DataLoader(ds_train,
                                       batch_size=batch_size,
                                       shuffle=True,
                                       collate_fn=collate_fn)

# Model Design

## Execution Flow
1. Convert all characters in the sentence into embeddings.
2. Pass the embeddings through an RNN sequentially.
3. The output of the RNN is passed into another RNN, and additional layers can be added.
4. The output from all time steps of the final RNN is passed through a Fully Connected layer.
5. The character corresponding to the maximum value across all output dimensions is selected as the next character.

## Loss Function
Since this is a classification task, Cross Entropy is used as the loss function.

## Gradient Update
Adam algorithm is used for gradient updates.

In [None]:
class CharRNN(torch.nn.Module):
    """Two stacked RNN layers over token embeddings, with an MLP output head.

    For every time step the model emits one score per vocabulary entry.
    `generator` additionally relies on the notebook-level `char_to_id` and
    `id_to_char` mappings to encode the prompt and decode the result.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, pad_id=None):
        """Create the layers.

        Args:
            vocab_size: number of distinct tokens (embedding rows / output scores).
            embed_dim: size of each token embedding.
            hidden_dim: hidden-state size of both RNN layers.
            pad_id: id of the padding token; defaults to the notebook-level
                `char_to_id['<pad>']` when omitted.
        """
        super(CharRNN, self).__init__()

        if pad_id is None:
            pad_id = char_to_id['<pad>']

        self.embedding = torch.nn.Embedding(num_embeddings=vocab_size,
                                            embedding_dim=embed_dim,
                                            padding_idx=pad_id)

        # Plain (Elman) RNN layers are used here instead of LSTM layers.
        self.rnn_layer1 = torch.nn.RNN(input_size=embed_dim,
                                       hidden_size=hidden_dim,
                                       batch_first=True)

        self.rnn_layer2 = torch.nn.RNN(input_size=hidden_dim,
                                       hidden_size=hidden_dim,
                                       batch_first=True)

        self.linear = torch.nn.Sequential(torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=hidden_dim),
                                          torch.nn.ReLU(),
                                          torch.nn.Linear(in_features=hidden_dim,
                                                          out_features=vocab_size))

    def forward(self, batch_x, batch_x_lens):
        """Alias for `encoder`, so the module can be called directly."""
        return self.encoder(batch_x, batch_x_lens)

    # The forward pass of the model
    def encoder(self, batch_x, batch_x_lens):
        """Score the next token at every position.

        Args:
            batch_x: padded token ids, batch-first: (batch, time).
            batch_x_lens: unpadded length of every sequence in the batch.

        Returns:
            Scores of shape (batch, time, vocab_size), where `time` is the
            padded width of `batch_x`.
        """
        # Remember the padded width so the output can be restored to it even
        # when every sequence in the batch is shorter than the padding.
        padded_len = batch_x.size(1)

        # pack_padded_sequence expects the lengths on the CPU.
        if torch.is_tensor(batch_x_lens):
            batch_x_lens = batch_x_lens.cpu()

        batch_x = self.embedding(batch_x)

        # Packing lets the RNN layers skip the padded positions.
        batch_x = torch.nn.utils.rnn.pack_padded_sequence(batch_x,
                                                          batch_x_lens,
                                                          batch_first=True,
                                                          enforce_sorted=False)

        batch_x, _ = self.rnn_layer1(batch_x)
        batch_x, _ = self.rnn_layer2(batch_x)

        batch_x, _ = torch.nn.utils.rnn.pad_packed_sequence(batch_x,
                                                            batch_first=True,
                                                            total_length=padded_len)

        batch_x = self.linear(batch_x)

        return batch_x

    def generator(self, start_char, max_len=200):
        """Greedily extend `start_char` one token at a time.

        Generation stops when <eos> is predicted (it is not appended) or when
        the sequence, prompt included, reaches `max_len` tokens.

        Args:
            start_char: prompt string; every character must be in `char_to_id`.
            max_len: upper bound on the total number of tokens.

        Returns:
            The prompt followed by the generated tokens, as a list of strings.
        """
        device = next(self.parameters()).device
        eos_id = char_to_id['<eos>']

        char_list = [char_to_id[c] for c in start_char]

        # Inference only: do not record an autograd graph for each step.
        with torch.no_grad():
            while len(char_list) < max_len:
                input_seq = torch.LongTensor([char_list]).to(device)
                y = self.encoder(input_seq, torch.LongTensor([len(char_list)]))

                # Most likely token after the last position of the sequence so far.
                next_char = torch.argmax(y[0, -1, :]).item()

                if next_char == eos_id:
                    break

                char_list.append(next_char)

        return [id_to_char[ch_id] for ch_id in char_list]

In [None]:
# Fix the RNG seed before the model weights are initialised.
torch.manual_seed(2)

# Training requires a CUDA device; stop with an explicit message otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    raise RuntimeError("GPU not available. Please change the runtime type to GPU.")
print(f"Device set to: {device}")

model = CharRNN(vocab_size, embed_dim, hidden_dim)

# Move the model to the device; as the last expression this also displays its layers.
model.to(device)

Device set to: cuda


CharRNN(
  (embedding): Embedding(18, 256, padding_idx=0)
  (rnn_layer1): LSTM(256, 256, batch_first=True)
  (rnn_layer2): LSTM(256, 256, batch_first=True)
  (linear): Sequential(
    (0): Linear(in_features=256, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=18, bias=True)
  )
)

In [None]:
# Cross-entropy over the vocabulary; positions labelled <pad> do not contribute to the loss.
pad_label = char_to_id['<pad>']
criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_label)

# AdamW over all model parameters with the configured learning rate.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)

# Training
1. The outer `for` loop controls the `epoch`
    1. The inner `for` loop uses `data_loader` to retrieve batches.
        1. Pass the batch to the `model` for training.
        2. Compare the predicted results `batch_pred_y` with the true labels `batch_y` using Cross Entropy to calculate the loss `loss`
        3. Use `loss.backward` to automatically compute the gradients.
        4. Use `torch.nn.utils.clip_grad_value_` to clamp every gradient value to the range [`-grad_clip`, `grad_clip`].
        5. Use `optimizer.step()` to update the model parameters with the computed gradients.
2.  After every `1000` batches, output the current loss to monitor whether it is converging.

In [None]:
from tqdm import tqdm
from copy import deepcopy
model = model.to(device)
i = 0

# The longest reference answer bounds how many tokens are worth generating:
# a prediction with more tokens than that can never equal any target, so
# decoding stops there instead of running on to the generator's default limit
# of 200 tokens. The resulting accuracy is unchanged.
max_answer_len = int(df_eval['tgt'].map(str).map(len).max()) if len(df_eval) else 0

# --- Main loop over epochs ---
for epoch in range(1, epochs + 1):
    # --- 1. Training phase ---
    model.train()
    bar = tqdm(dl_train, desc=f"Train epoch {epoch}")
    for batch_x, batch_y, batch_x_lens, batch_y_lens in bar:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        # The lengths are passed as-is (not moved to the GPU); the model packs with them.
        batch_pred_y = model(batch_x, batch_x_lens)
        # Flatten batch and time so that every position is one classification sample.
        loss = criterion(batch_pred_y.reshape(-1, vocab_size), batch_y.reshape(-1))
        loss.backward()
        torch.nn.utils.clip_grad_value_(model.parameters(), grad_clip)
        optimizer.step()
        bar.set_postfix(loss=loss.item())

    # --- 2. Evaluation phase ---
    model.eval()
    matched = 0
    total = 0
    # `total` lets tqdm show a real progress bar for the row iterator.
    bar_eval = tqdm(df_eval.iterrows(), total=len(df_eval), desc=f"Validation epoch {epoch}")

    with torch.no_grad():  # no gradients are needed while evaluating
        for _, row in bar_eval:
            batch_x = row['src']
            batch_y = row['tgt']

            # Prompt tokens + longest possible answer + one extra token, which
            # is enough to tell an over-long prediction from a correct one.
            gen_limit = min(200, len(batch_x) + max_answer_len + 1)
            prediction_chars = model.generator(batch_x, max_len=gen_limit)
            prediction_str = "".join(prediction_chars)

            # The predicted answer is whatever follows the first '='.
            if '=' in prediction_str:
                answer_part = prediction_str.split('=', 1)[1]
                predicted_answer = answer_part.split('<eos>', 1)[0]

                predicted_answer = predicted_answer.strip()

            else:
                predicted_answer = ""

            # Exact string match against the reference answer.
            if predicted_answer == batch_y:
                matched += 1
            total += 1

    # --- 3. Report the accuracy of this epoch ---
    if total > 0:
        accuracy = matched / total
        print(f"Epoch {epoch} Validation Accuracy: {accuracy:.4f}")
    else:
        print(f"Epoch {epoch} Validation: No data evaluated.")

Train epoch 1: 100%|██████████| 37020/37020 [07:53<00:00, 78.22it/s, loss=1.94e-9]
Validation epoch 1: 1774it [19:06,  1.49it/s]