In [1]:
import torch
import torch.nn as nn
import os
from pathlib import Path
from torchtext.vocab import Vocab
import pandas as pd
from tokenizers import Tokenizer
import math

In [None]:
# Load the pickled translation model and both vocabularies from disk.
# NOTE(review): torch.load unpickles arbitrary Python objects; only load
# checkpoints that come from a trusted source.
# The model is mapped onto whichever device is available so that a checkpoint
# saved on a GPU machine can still be opened on a CPU-only one, and so that it
# lives on the same device the Translation class sends its inputs to.
_loadDevice = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model: nn.Module = torch.load('Core/BERT/model/model.pt', map_location=_loadDevice)
enVocab: Vocab = torch.load('Core/BERT/model/vocab_en.pt')
zhVocab: Vocab = torch.load('Core/BERT/model/vocab_zh.pt')

In [None]:
class Translation:
    """Greedy English-to-Chinese translator built on a trained seq2seq model.

    The model is invoked as ``model(src, tgt)`` and must expose a ``predictor``
    head that turns the last decoder state into vocabulary scores.

    Attributes:
        model: The translation network, moved to ``device`` on construction.
        enVocab: Source vocabulary; called with a token list to obtain indices.
        zhVocab: Target vocabulary; ``lookup_tokens`` maps indices to tokens.
        tokenizer: ``bert-base-uncased`` tokenizer used to split source text.
        device: CUDA when available, otherwise CPU.
        maxLength: Upper bound on the number of decoding steps.
    """

    # Indices of the sentence-start / sentence-end markers ("<s>" / "</s>")
    # that are wrapped around the source and drive the decoding loop.
    BOS_INDEX = 0
    EOS_INDEX = 1

    # Third-party types are written as string annotations so that defining the
    # class does not require evaluating them.
    model: nn.Module = None
    enVocab: "Vocab" = None
    zhVocab: "Vocab" = None
    tokenizer: "Tokenizer" = None  # also serves as a per-class cache, see __init__
    device = None

    def __init__(self, model: nn.Module, enVocab: "Vocab", zhVocab: "Vocab", maxLength: int = 72):
        """Bind the model and vocabularies and prepare them for inference.

        Args:
            model: Trained translation network.
            enVocab: Source-language vocabulary.
            zhVocab: Target-language vocabulary.
            maxLength: Maximum number of tokens to generate per sentence.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Inputs are created on self.device, so the model has to live there too.
        self.model = model.to(self.device)
        # Inference only: switch off dropout and similar training-time layers.
        self.model.eval()
        self.enVocab = enVocab
        self.zhVocab = zhVocab
        # Loading the pretrained tokenizer is expensive; do it once per class
        # and share it between instances.
        cls = type(self)
        if cls.tokenizer is None:
            cls.tokenizer = Tokenizer.from_pretrained('bert-base-uncased')
        self.tokenizer = cls.tokenizer
        self.maxLength = maxLength

    def translation(self, text: str) -> str:
        """Translate ``text`` by greedy decoding.

        Args:
            text: Source sentence.

        Returns:
            The predicted target tokens joined into one string, with the
            "<s>" and "</s>" markers removed.
        """
        # Tokenize the source, map tokens to indices, and wrap them in <bos>/<eos>.
        tokenIds = [self.BOS_INDEX] + self.enVocab(self.enTokenizer(text)) + [self.EOS_INDEX]
        src = torch.tensor(tokenIds).unsqueeze(0).to(self.device)
        # The target sequence starts out as a lone <bos>.
        tgt = torch.tensor([[self.BOS_INDEX]]).to(self.device)
        # Predict one token at a time until <eos> or the length limit is reached.
        with torch.no_grad():
            for _ in range(self.maxLength):
                out = self.model(src, tgt)
                # Only the last position matters for the next-token prediction.
                predict = self.model.predictor(out[:, -1])
                # Greedy choice: index of the highest score.
                y = torch.argmax(predict, dim=1)
                # Append the prediction to what has been generated so far.
                tgt = torch.concat([tgt, y.unsqueeze(0)], dim=1)
                if y.item() == self.EOS_INDEX:
                    break
        # squeeze(0) drops only the batch axis, so the result is always a list.
        tokens = self.zhVocab.lookup_tokens(tgt.squeeze(0).tolist())
        return ''.join(tokens).replace("<s>", "").replace("</s>", "")

    def enTokenizer(self, text: str) -> list[str]:
        """Split ``text`` into tokens with the pretrained tokenizer.

        Special tokens are not added here; ``translation`` adds the
        sentence markers itself.
        """
        return self.tokenizer.encode(text, add_special_tokens=False).tokens

    def __call__(self, text: str) -> str:
        """Shorthand for :meth:`translation`."""
        return self.translation(text)

In [None]:
def sentimentClassification(text: str) -> int:
    """Map a sentiment label to its integer class.

    Surrounding whitespace (for example the newline left at the end of a
    line read from a file) and letter case are ignored.

    Args:
        text: Sentiment label.

    Returns:
        2 for "positive", 1 for "neutral", and 0 for anything else.
    """
    label = text.strip().lower()
    if label == 'positive':
        return 2
    if label == 'neutral':
        return 1
    return 0


In [None]:
def readFile(filePath: str) -> list[dict]:
    """Read a ``sentence@label`` file and translate every sentence.

    Each usable line is split at its LAST "@", so "@" characters inside the
    sentence are preserved. Blank lines and lines without a separator are
    skipped.

    Args:
        filePath: Path of a UTF-8 text file with one ``sentence@label`` per line.

    Returns:
        One dict per usable line with the translated sentence under
        "content" and the integer sentiment class under "sentiment".
    """
    # Build the translator once per file instead of once per line.
    translator = Translation(model, enVocab, zhVocab)
    result: list[dict] = []
    with open(filePath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if "@" not in line:
                continue
            content, label = line.rsplit("@", 1)
            result.append({
                "content": translator(content),
                # Strip so stray whitespace around the label cannot break the lookup.
                "sentiment": sentimentClassification(label.strip())
            })
    return result

def readFiles(filePath: str) -> pd.DataFrame:
    """Read every regular file in a directory into one DataFrame.

    Args:
        filePath: Directory whose files are passed to ``readFile``.

    Returns:
        A DataFrame with one row per record returned by ``readFile``.
        Files are processed in sorted name order; sub-directories are skipped.
    """
    result: list[dict] = []
    # Sort for a reproducible row order; os.listdir order is arbitrary.
    for name in sorted(os.listdir(filePath)):
        fullPath = os.path.join(filePath, name)
        if not os.path.isfile(fullPath):
            continue
        # extend, not append: each file contributes many rows.
        result.extend(readFile(fullPath))
    return pd.DataFrame(result)
    
def writeFiles(filePath: str, data: pd.DataFrame):
    """Write ``data`` to ``filePath`` as CSV, leaving out the index column.

    Args:
        filePath: Destination path of the CSV file.
        data: Frame to serialise.
    """
    csvOptions = {"index": False}
    data.to_csv(filePath, **csvOptions)