<a href="https://colab.research.google.com/github/GuanRuLai/Python-Deep-Learning/blob/main/Pytorch_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data preprocessing

## Load data

In [None]:
# Each corpus file holds one title per line; keep every line with its
# leading/trailing whitespace (including the newline) stripped.
with open("academy_titles.txt", "r", encoding="utf-8") as f:
    academy_titles = [line.strip() for line in f]

with open("job_titles.txt", "r", encoding="utf-8") as f:
    job_titles = [line.strip() for line in f]

# Peek at the first few entries of each corpus
for sample in (academy_titles[:5], job_titles[:5]):
    print(sample)

['北师教育学，你我一起努力，让胜利酣畅淋漓。', '考博英语词汇', '出售人大新闻学院2015年考研权威资料', '【脑科院 郭桃梅课题组】科研助理招聘', '管理学院的同学帮帮忙呐～']
['【字节跳动内推】校招岗位全面开放，帮查进度！', '招聘兼职/ 笔试考务 /200-300 每人', '国企出版社招聘坐班兼职生', '【在线早教】教研实习生招聘', '【兼职】心理学公众号寻兼职写手']


## Character tokenizing

In [None]:
# Collect every distinct character that occurs in either corpus.
char_set = set()

for title in academy_titles:
    char_set.update(title)

for title in job_titles:
    char_set.update(title)

print(char_set)
print(len(char_set))

# Sort before indexing: iteration order of a set of strings varies between
# interpreter runs (string hashing is randomised), so list(char_set) would
# assign different indices to the same character on every restart.
char_list = sorted(char_set)

# Index layout used by title_to_tensor / collate_fn:
#   0                     -> padding (pad_sequence fills with 0)
#   1 .. len(char_list)   -> known characters (list position + 1)
#   len(char_list) + 1    -> unknown characters (n_chars - 1)
# Hence +2: with only +1, n_chars - 1 equals the index of the last known
# character and <unk> would silently share its embedding.
n_chars = len(char_list) + 2

{'建', '青', '行', '在', '法', '资', '涛', '皮', '裁', '宾', '虚', '督', '增', '妈', '披', '趣', '原', '打', '翱', '企', '小', '熙', '细', '嘉', '旁', '龟', '已', '后', '们', 'k', '式', '食', '府', '哲', 'U', '筛', '断', '燃', '段', '祖', '律', '太', '益', '欢', '运', '专', '世', '球', '七', '滑', '哦', '条', '树', '申', '档', '嘎', '技', '剧', '沉', '叒', '鑫', '绘', '孚', '队', '得', '报', '威', '料', '将', '东', '徒', '逸', 'z', '●', '*', '什', '柳', '介', '态', '恩', '佳', '账', '馆', '花', '咕', '斩', '核', '住', 'W', '说', '忙', '圈', '农', 'm', '选', '去', '科', '是', '跨', '匠', '汇', '朋', '协', '战', 'V', '淀', '!', '政', '象', '航', '端', '篇', '朗', '右', '闻', '未', '加', '暇', '驾', '义', '央', '～', '障', '一', '页', 'B', '充', '旷', '冠', '滴', '矩', '也', '苏', '塾', '审', '季', '渠', '️', '芯', '望', '造', '沟', '承', '看', '高', '闲', '史', '顾', '或', '阅', '班', '极', '径', '启', '开', '边', '究', '箱', '变', '设', '招', '宏', '字', '立', '棋', '装', '尚', '缘', '千', '总', '帝', '署', '男', '试', '办', '浙', '改', '面', '举', '霸', 's', '尽', '宿', '托', '年', '>', '妆', '豆', '见', '大', '驶', '剑', '蒙', '艾', '！', '藤', '织', '锁', '精', '武',

## Convert title strings to a tensor of character indices

In [None]:
import torch

def title_to_tensor(title):
    """Encode a title as a 1-D ``torch.long`` tensor of character indices.

    A character found in the module-level ``char_list`` is encoded as its
    list position plus one, so the value 0 is never produced for a known
    character. A character that is not in ``char_list`` is encoded as
    ``n_chars - 1``.

    Args:
        title: the sequence of characters to encode.

    Returns:
        A tensor with one index per character of ``title``.
    """
    def encode(symbol):
        # list.index raises ValueError for a symbol outside the vocabulary
        try:
            return char_list.index(symbol) + 1
        except ValueError:
            return n_chars - 1

    codes = [encode(symbol) for symbol in title]
    return torch.tensor(codes, dtype=torch.long)

## Build dataset & Split independent variables and dependent variable

In [None]:
# Every example is a (character-index tensor, label tensor) pair.
# Label 0 marks an academy title, label 1 marks a job title.
all_data = [
    (title_to_tensor(title), torch.tensor([0], dtype=torch.long))
    for title in academy_titles
]
all_data += [
    (title_to_tensor(title), torch.tensor([1], dtype=torch.long))
    for title in job_titles
]

print(all_data[:5])

[(tensor([ 250,  294,  390,  328,  838, 1476,  963,  296,  124,  538, 1365,  825,
        1476, 1025, 1276, 1408, 1456,  519,  517,  660,  414]), tensor([0])), (tensor([ 726,  595,  739, 1355,  562,  101]), tensor([0])), (tensor([ 833, 1541, 1041,  190,  262,  115,  838,  243,  839,  869, 1071,  399,
         185,  726, 1125,  792,   67,    6,   68]), tensor([0])), (tensor([1528,  221,   97,  243, 1187,  442, 1077, 1037,  951,  697, 1028, 1401,
          97, 1125,  301,  226,  161,  727]), tensor([0])), (tensor([1174,  226,  838,  243,  455,  831,  838,  264,  264,   91,  431,  122]), tensor([0]))]


## Split training set and testing set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the examples for testing; the fixed random_state keeps
# the partition the same on every run.
train_data, test_data = train_test_split(
    all_data,
    test_size=0.2,
    random_state=42,
)
print(len(train_data), len(test_data), sep="\n")

5686
1422


## Define collate function to handle variable-length sequences

In [None]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Merge a list of (title_tensor, label_tensor) pairs into one mini-batch.

    Args:
        batch: a sequence of 2-tuples; the first item is a 1-D tensor of
            character indices, the second a label tensor.

    Returns:
        A tuple ``(titles_padded, labels)``: the titles right-padded with 0
        to the length of the longest one and stacked batch-first, and the
        label tensors concatenated along dimension 0.
    """
    sequences, targets = zip(*batch)
    return (
        pad_sequence(list(sequences), batch_first=True, padding_value=0),
        torch.cat(targets, dim=0),
    )

# Neural network processing

## Check if there is GPU to use

In [None]:
# Use CUDA when torch reports it as available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print("There are %d GPU(s) available." % torch.cuda.device_count())
else:
    print("No GPU available, using CPU instead.")

There are 1 GPU(s) available.


## Define model

In [None]:
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim

class RnnModel(nn.Module):
    """Sequence classifier: embedding -> single-layer LSTM -> linear layer.

    Index 0 in the input is treated as padding. Its embedding is fixed at
    zero and padded positions are excluded from the LSTM pass, so the
    prediction for a title does not depend on how much padding the rest of
    its mini-batch required. Padding is assumed to be trailing only
    (sequences are right-padded with 0).

    Args:
        word_count: number of rows in the embedding table (largest index + 1).
        embedding_size: dimension of each embedding vector.
        hidden_size: dimension of the LSTM hidden state.
        output_size: number of output scores (classes).
    """

    def __init__(self, word_count, embedding_size, hidden_size, output_size):
        super(RnnModel, self).__init__()

        # define neural layers
        self.hidden_size = hidden_size
        # padding_idx=0 keeps the padding vector at zero and out of training
        self.embedding = nn.Embedding(word_count, embedding_size, padding_idx=0) # embedding layer
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers=1, bidirectional=False, batch_first=True) # RNN layer
        self.fc = nn.Linear(hidden_size, output_size) # fully connected layer

        # override the default initialisation of the output layer weights
        init.xavier_normal_(self.fc.weight)

    def forward(self, input_tensor):
        """Return class scores for a batch of index sequences.

        Args:
            input_tensor: integer tensor of shape (batch, seq_len), or a
                single unbatched sequence of shape (seq_len,).

        Returns:
            Tensor of shape (batch, output_size), or (output_size,) for an
            unbatched input.
        """
        # Route a single unbatched sequence through the batched path
        if input_tensor.dim() == 1:
            return self.forward(input_tensor.unsqueeze(0)).squeeze(0)

        word_vector = self.embedding(input_tensor)

        # Real length of every sequence = number of non-padding entries.
        # clamp(min=1) because packing rejects zero-length sequences; the
        # lengths tensor must live on the CPU.
        lengths = (input_tensor != 0).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            word_vector, lengths, batch_first=True, enforce_sorted=False
        )

        # With a packed input the final hidden state is taken at each
        # sequence's last real step rather than after trailing padding.
        _, (hidden, _) = self.rnn(packed)
        output = self.fc(hidden[-1]) # hidden state of the last LSTM layer
        return output

# Hyperparameters: the embedding table gets n_chars rows; the remaining
# sizes are fixed here.
word_count = n_chars
embedding_size, hidden_size, output_size = 200, 10, 2

# Build the network and move its parameters to the selected device
model = RnnModel(
    word_count=word_count,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
model = model.to(device)

# Cross-entropy loss over the output scores
criterion = nn.CrossEntropyLoss()

# Adam optimizer over all model parameters
optimizer = optim.Adam(params=model.parameters(), lr=0.005)

## Model training & evaluation

In [None]:
from torch.utils.data import DataLoader

batch_size = 10

# Training batches are reshuffled each epoch; test batches keep dataset order.
# Both loaders use collate_fn to assemble each mini-batch.
train_loader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    dataset=test_data,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

In [None]:
epochs = 5
for epoch in range(epochs):
  model.train() # switch to training mode (affects layers such as dropout/batch norm)

  correct = 0
  total = 0
  running_loss = 0.0 # sum of per-sample losses over the epoch

  for X_batch, Y_batch in train_loader:
    X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
    # Flatten the labels to shape (batch,). reshape(-1) is used instead of
    # squeeze() because squeeze() collapses a batch of one sample into a
    # 0-d tensor, which no longer matches the (1, classes) prediction.
    targets = Y_batch.reshape(-1)

    optimizer.zero_grad() # clear gradients accumulated by the previous step

    Y_pred = model(X_batch)
    loss = criterion(Y_pred, targets)
    loss.backward() # back-propagate to compute gradients
    optimizer.step() # update weights

    predicted = Y_pred.argmax(dim=1) # index of the highest score in each row
    batch_count = targets.size(0) # number of samples in this batch
    total += batch_count
    correct += (predicted == targets).sum().item()
    # criterion returns the batch mean, so weight it by the batch size
    running_loss += loss.item() * batch_count

  # Report the mean loss over the whole epoch rather than the loss of
  # whichever mini-batch happened to come last; guard against an empty loader.
  accuracy = correct / total if total else float("nan")
  epoch_loss = running_loss / total if total else float("nan")
  print(f"Epoch: {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}, Acc: {accuracy:.4f}")

Epoch: 1/5, Loss: 0.0073, Acc: 0.9559
Epoch: 2/5, Loss: 0.0003, Acc: 0.9989
Epoch: 3/5, Loss: 0.0002, Acc: 0.9995
Epoch: 4/5, Loss: 0.0001, Acc: 0.9996
Epoch: 5/5, Loss: 0.0001, Acc: 0.9995


## Test evaluation

In [None]:
model.eval() # switch to evaluation mode (affects layers such as dropout/batch norm)

y_true = []
y_pred = []

with torch.no_grad(): # no gradient tracking is needed for evaluation
  correct = 0
  total = 0

  for X_batch, Y_batch in test_loader:
    X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
    # Flatten the labels to shape (batch,). reshape(-1) is used instead of
    # squeeze() because squeeze() turns a batch of one sample into a 0-d
    # tensor, and extend() cannot iterate over the resulting 0-d array.
    targets = Y_batch.reshape(-1)

    Y_pred = model(X_batch)
    predicted = Y_pred.argmax(dim=1) # index of the highest score in each row
    total += targets.size(0)
    correct += (predicted == targets).sum().item()
    y_true.extend(targets.cpu().numpy())
    y_pred.extend(predicted.cpu().numpy())

  # Guard against an empty loader so the cell reports nan instead of crashing
  test_accuracy = correct / total if total else float("nan")
  print(f"Test Acc: {test_accuracy:.4f}")

Test Acc: 1.0000


## Answer prediction

In [None]:
import pandas as pd

# Put the collected true labels and predictions side by side, one row per
# test example, and print the table.
results_df = pd.DataFrame.from_dict({"Y_true": y_true, "Y_pred": y_pred})
print(results_df)

      Y_true  Y_pred
0          1       1
1          1       1
2          1       1
3          1       1
4          1       1
...      ...     ...
1417       1       1
1418       0       0
1419       1       1
1420       0       0
1421       0       0

[1422 rows x 2 columns]
