In [3]:
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [4]:
# 数据集载入
TRAIN_PATH='./data/MovieReview/train.txt'
TEST_PATH='./data/MovieReview/test.txt'
VAL_PATH='./data/MovieReview/validation.txt'
def load_data(path):
    with open(path, encoding='utf-8') as f:
        data = []
        lable = []
        for line in f.readlines():
            if line == '\n':
                continue
            data.append(line[2:].split())
            lable.append(int(line[0]))
        return data, lable
train_data, train_lable = load_data(TRAIN_PATH)
test_data, test_lable = load_data(TEST_PATH)
val_data, val_lable = load_data(VAL_PATH)

In [5]:
# 数据集pading
SEQUENCE_LEN = 50 
def data_padding(sequence):
    if len(sequence) >= SEQUENCE_LEN:
        sequence = sequence[:SEQUENCE_LEN]
    else:
        pad_len = SEQUENCE_LEN - len(sequence)
        for _ in range(pad_len):
            sequence.append("<PAD>")
    assert len(sequence) == SEQUENCE_LEN
    return sequence
train_data = [data_padding(seq) for seq in train_data]
test_data = [data_padding(seq) for seq in test_data]
val_data = [data_padding(seq) for seq in val_data]

In [6]:
# 数据集映射
word2idx = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}
idx2word = {0: '<PAD>', 1: '<UNK>', 2: '<SOS>', 3: '<EOS>'}
for data in [train_data, test_data, val_data]:
    for review in data:
        for word in review:
            if word not in word2idx.keys():
                word2idx[word] = len(word2idx)
                idx2word[len(word2idx)] = word
train_data = [[word2idx[word] for word in review] for review in train_data]
test_data = [[word2idx[word] for word in review] for review in test_data]
val_data = [[word2idx[word] for word in review] for review in val_data]


In [7]:
# 数据集embedding
W2V_FILE='./data/MovieReview/wiki_word2vec_50.bin'
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format(W2V_FILE, binary=True)
idx2vec = np.array(np.random.uniform(-1., 1., [len(word2idx), model.vector_size]))
for word in word2idx.keys():
   if model.has_index_for(word):
        idx2vec[word2idx[word]] = model[word] 
idx2vec = torch.from_numpy(idx2vec).requires_grad_(True).float()

In [8]:
class MovieReviewDataset(Dataset):
    def __init__(self, data, lable):
        self.data = torch.tensor(data, dtype=torch.long)
        self.label = torch.tensor(lable, dtype=torch.long)
    def __getitem__(self, idx):
        return self.data[idx], self.label[idx]
    def __len__(self):
        return len(self.data)


In [9]:
class SelfAttention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.attention_weights = nn.Linear(hidden_dim, hidden_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        attention_scores = self.attention_weights(x)
        attention_weights = self.softmax(attention_scores)
        attended_representation = torch.bmm(attention_weights.unsqueeze(1), x)
        return attended_representation.squeeze(1)

In [10]:
class MovieReviewSentimentClassificationModel(nn.Module):

    def __init__(self, hidden_dim, pretrained_embeded_weight) -> None:
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding_layer = nn.Embedding(pretrained_embeded_weight.shape[0], pretrained_embeded_weight.shape[1])
        self.embedding_layer.weight.data.copy_(pretrained_embeded_weight)
        self.lstm_layer = nn.LSTM(pretrained_embeded_weight.shape[1], hidden_dim, num_layers=1, batch_first=True)
        # self.attention = SelfAttention(hidden_dim * 2)
        self.linear_layer = nn.Sequential(
            nn.Linear(hidden_dim, 1024),
            nn.LeakyReLU(),
            nn.Linear(1024,512),
            nn.LeakyReLU(),
            nn.Linear(512,2)
        ) 

    def forward(self, input):
        batch_size, seq_len = input.shape
        embedded = self.embedding_layer(input)
        h_0 = torch.zeros(1, batch_size, self.hidden_dim, device=input.device, dtype=torch.float, requires_grad=True)
        c_0 = torch.zeros(1, batch_size, self.hidden_dim, device=input.device, dtype=torch.float, requires_grad=True)
        output, hidden = self.lstm_layer(embedded, (h_0,c_0))
        # output = self.attention(output)
        output = self.linear_layer(output[:,-1,:])
        return output, hidden

In [11]:
BATCH_SIZE=56
HIDDEN_DIM=256
LR=1e-3
EPOCHS=10
PATH = './MovieReviewSentimentClassificationModel.pt'

In [12]:
def test(model, criterion, dataloader):
    model.eval()
    print(">>>>>> Model Test Begin......")
    correct = 0
    batch_loss = 0
    with torch.no_grad():
        for batch_data, batch_lable in dataloader:
            batch_data = batch_data.to(device)
            batch_lable = batch_lable.to(device)
            output, _ = model(batch_data)
            loss = criterion(output, batch_lable)
            batch_loss += loss
            correct += (output.argmax(1) ==
                        batch_lable).type(torch.float).sum().item()
    correct /= len(dataloader.dataset)
    print(
        f"Test Error: \n Accuracy:{(100*correct):>0.1f} % , Avg loss : {batch_loss/len(dataloader):>8f} \n")
    print(">>>>>> Model Train End.")

In [13]:
def train(model, optimizer, criterion, dataloader):
    model.train()
    print(">>>>>> Model Train Begin......")
    for epoch_idx in range(EPOCHS):
        epoch_loss = 0
        print(f"Epoch {epoch_idx}\n-------------------------------")
        for batch_idx, (batch_data, batch_lable) in enumerate(dataloader):
            batch_data = batch_data.to(device)
            batch_lable = batch_lable.to(device)
            output, _ = model(batch_data)
            loss = criterion(output, batch_lable)
            epoch_loss += loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (batch_idx+1) % 100 == 0:
                print(
                    f'[{(batch_idx+1) * len(batch_data)}/{len(dataloader.dataset)} \
                      ({100. * batch_idx / len(dataloader):.0f}%)]\t \
                      loss: {loss.item():.6f}')
        print(f'Epoch {epoch_idx}\tAVG loss= {epoch_loss/len(dataloader):.6f}\n')
        
    print(">>>>>> Model Train End.")


In [14]:
train_dataset = MovieReviewDataset(train_data, train_lable)
test_dataset = MovieReviewDataset(test_data, test_lable)
val_dataset = MovieReviewDataset(val_data, val_lable)
train_dataloader = DataLoader(train_dataset,BATCH_SIZE,True)
test_dataloader = DataLoader(test_dataset,BATCH_SIZE,True)
val_dataloader = DataLoader(val_dataset,BATCH_SIZE,True)


In [15]:
model = MovieReviewSentimentClassificationModel(HIDDEN_DIM,idx2vec).to(device=device)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = nn.CrossEntropyLoss()
train(model, optimizer, criterion, train_dataloader)
# torch.save(model.state_dict(), PATH)
model.load_state_dict(torch.load(PATH))


>>>>>> Model Train Begin......
Epoch 0
-------------------------------
Epoch 0	AVG loss= 0.548735

Epoch 1
-------------------------------
Epoch 1	AVG loss= 0.381541

Epoch 2
-------------------------------
Epoch 2	AVG loss= 0.274350

Epoch 3
-------------------------------
Epoch 3	AVG loss= 0.188365

Epoch 4
-------------------------------
Epoch 4	AVG loss= 0.122965

Epoch 5
-------------------------------
Epoch 5	AVG loss= 0.091146

Epoch 6
-------------------------------
Epoch 6	AVG loss= 0.063203

Epoch 7
-------------------------------
Epoch 7	AVG loss= 0.045649

Epoch 8
-------------------------------
Epoch 8	AVG loss= 0.031100

Epoch 9
-------------------------------
Epoch 9	AVG loss= 0.023448

>>>>>> Model Train End.


<All keys matched successfully>

In [16]:
test(model, criterion, test_dataloader)


>>>>>> Model Test Begin......
Test Error: 
 Accuracy:83.2 % , Avg loss : 0.696980 

>>>>>> Model Train End.
