### 导入预训练分词器

In [1]:
from transformers import MarianTokenizer

# Load the pretrained Marian tokenizer published under the
# "Helsinki-NLP/opus-mt-de-en" checkpoint name (German -> English, judging by
# the checkpoint name and the 'de'/'en' columns used later in this notebook).
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-de-en")



### 导入数据

In [2]:
from data.data_load import train_data_raw, test_data_raw, valid_data_raw
import pandas as pd

# Wrap each raw split in its own DataFrame.
train_df = pd.DataFrame(train_data_raw)
test_df = pd.DataFrame(test_data_raw)
vad_df = pd.DataFrame(valid_data_raw)

### 提取各数据集（训练/测试/验证）中的src和trg

In [3]:
def build_src_trg(df: pd.DataFrame, max_rows: int = 100000) -> pd.DataFrame:
    """Flatten the nested ``translation`` column into ``de`` and ``en`` columns.

    Every entry of ``df['translation']`` is indexed with the keys ``'de'`` and
    ``'en'``; the two values become separate columns and the original
    ``translation`` column is dropped. The input frame is not modified.

    Args:
        df: Frame containing a ``translation`` column whose entries support
            ``entry['de']`` and ``entry['en']``.
        max_rows: Maximum number of leading rows to keep. Defaults to
            100000, the limit this function has always applied.

    Returns:
        A new frame with at most ``max_rows`` rows in which ``translation``
        has been replaced by ``de`` and ``en``. Any other columns are kept.

    Raises:
        KeyError: If ``df`` has no ``translation`` column, or an entry lacks
            the ``'de'`` / ``'en'`` key.
    """
    # Truncate first so the per-row extraction only touches rows we keep;
    # the result is the same as extracting everything and truncating after.
    kept = df.head(max_rows)
    pairs = kept['translation']
    return kept.assign(
        de=pairs.apply(lambda pair: pair['de']),
        en=pairs.apply(lambda pair: pair['en']),
    ).drop(columns=['translation'])

In [4]:
# Flatten the translation pairs of every split into 'de' / 'en' columns.
train_src_trg = build_src_trg(train_df)
test_src_trg = build_src_trg(test_df)
vad_src_trg = build_src_trg(vad_df)

### 扩充分词器词表

In [5]:
# One flat list of sentences per split: all 'de' sentences followed by all
# 'en' sentences.
train_corpus = [*train_src_trg['de'].tolist(), *train_src_trg['en'].tolist()]
test_corpus = [*test_src_trg['de'].tolist(), *test_src_trg['en'].tolist()]
valid_corpus = [*vad_src_trg['de'].tolist(), *vad_src_trg['en'].tolist()]

In [6]:
# Build the candidate vocabulary: tokenize every sentence of every split and
# collect the resulting tokens into one flat list (duplicates included, so
# they can be counted afterwards).
from collections import Counter

new_tokens = [
    piece
    for sentence in train_corpus + test_corpus + valid_corpus
    for piece in tokenizer.tokenize(sentence)
]

In [7]:
# Count how often each token occurs across all tokenized sentences.
counter = Counter(new_tokens)

In [8]:
# Keep only the tokens that occur more than 10 times.
new_vocab = [
    tok
    for tok, occurrences in counter.items()
    if occurrences > 10
]

In [9]:
# Register the frequent tokens with the tokenizer. The value displayed below
# is the call's return value (presumably the number of tokens that were
# actually new to the vocabulary; confirm against the transformers docs).
tokenizer.add_tokens(new_vocab)

2

In [10]:
# Write the tokenizer to the ./tokenizer directory; the files produced are
# listed in the cell output below.
tokenizer.save_pretrained("./tokenizer")

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.json',
 './tokenizer/source.spm',
 './tokenizer/target.spm',
 './tokenizer/added_tokens.json')

### 将语句分割成idx并进行填充和截断

In [11]:
def tokenizer_pad(row: str, max_length: int = 512):
    """Encode one sentence, padded and truncated to a fixed length.

    Args:
        row: The text to encode. ``get_data`` applies this function
            element-wise to the ``de`` and ``en`` columns, so it receives a
            single cell value at a time, not a whole ``pd.Series``.
        max_length: Length every encoding is padded or truncated to.
            Defaults to 512, the value this function has always used.

    Returns:
        The object returned by calling the module-level ``tokenizer``, with
        PyTorch tensors requested; ``get_data`` reads its ``input_ids`` and
        ``attention_mask`` entries.
    """
    return tokenizer(
        row,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

In [12]:
def get_data(df: pd.DataFrame) -> pd.DataFrame:
    """Add token-id and attention-mask columns for both languages.

    Args:
        df: Frame with a ``de`` (source) and an ``en`` (target) text column.

    Returns:
        A copy of ``df`` with four extra columns, in this order:
        ``src_input`` / ``trg_input`` hold the ``input_ids`` of the encoded
        ``de`` / ``en`` text, and ``src_mask`` / ``trg_mask`` hold the
        matching ``attention_mask``.
    """
    # Tokenization is by far the most expensive step here, so encode each
    # column exactly once and read both fields from the same encodings.
    src_encoded = df['de'].apply(tokenizer_pad)
    trg_encoded = df['en'].apply(tokenizer_pad)
    return df.assign(
        src_input=src_encoded.apply(lambda enc: enc['input_ids']),
        trg_input=trg_encoded.apply(lambda enc: enc['input_ids']),
        src_mask=src_encoded.apply(lambda enc: enc['attention_mask']),
        trg_mask=trg_encoded.apply(lambda enc: enc['attention_mask']),
    )

In [13]:
# Encode every split (token ids and attention masks for both languages).
train_data = get_data(train_src_trg)
test_data = get_data(test_src_trg)
valid_data = get_data(vad_src_trg)
# train_feature, train_mask = train_data['src_input'], train_data['src_mask']
# train_label = train_data['trg_input']

In [22]:
# Source-side token ids and attention masks of the training split.
train_src = train_data['src_input']
train_mask = train_data['src_mask']
# Source-side token ids of the test split, shown as the cell output.
test_src = test_data['src_input']
test_src

0       [[tensor(15418), tensor(886), tensor(492), ten...
1       [[tensor(3207), tensor(6437), tensor(920), ten...
2       [[tensor(605), tensor(7554), tensor(1678), ten...
3       [[tensor(8961), tensor(39369), tensor(11201), ...
4       [[tensor(71), tensor(692), tensor(14070), tens...
                              ...                        
2998    [[tensor(198), tensor(2413), tensor(12070), te...
2999    [[tensor(188), tensor(982), tensor(5045), tens...
3000    [[tensor(609), tensor(1522), tensor(11007), te...
3001    [[tensor(364), tensor(3361), tensor(2683), ten...
3002    [[tensor(6160), tensor(45), tensor(3490), tens...
Name: src_input, Length: 3003, dtype: object