### 导入预训练分词器

In [1]:
from transformers import MarianTokenizer

# Name of the pretrained checkpoint the tokenizer is loaded from.
TOKENIZER_CHECKPOINT = "Helsinki-NLP/opus-mt-de-en"

tokenizer = MarianTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)



### 导入数据

In [2]:
from data.data_load import train_data_raw, test_data_raw, valid_data_raw
import pandas as pd

# Wrap each raw split in its own DataFrame.
train_df = pd.DataFrame(train_data_raw)
test_df = pd.DataFrame(test_data_raw)
vad_df = pd.DataFrame(valid_data_raw)

### 提取训练数据中的src和trg

In [3]:
def build_src_trg(df: pd.DataFrame, limit: int = 100000) -> pd.DataFrame:
    """Split the nested ``translation`` column into flat ``de`` / ``en`` columns.

    Args:
        df: Frame with a ``translation`` column whose cells are mappings
            holding ``'de'`` and ``'en'`` entries.
        limit: Number of leading rows to keep (forwarded to
            ``DataFrame.head``). Pass ``None`` to keep every row.

    Returns:
        A new frame without ``translation`` and with ``de`` and ``en``
        appended; the input frame is left unmodified.
    """
    # Truncate first so the per-row extraction only touches rows that are kept.
    subset = df if limit is None else df.head(limit)
    translations = subset['translation']
    return subset.assign(
        de=translations.apply(lambda pair: pair['de']),
        en=translations.apply(lambda pair: pair['en']),
    ).drop(columns=['translation'])

In [4]:
# Flatten every split into plain de/en columns.
train_src_trg = build_src_trg(train_df)
test_src_trg = build_src_trg(test_df)
vad_src_trg = build_src_trg(vad_df)

### 微调分词器

In [99]:
# One flat sentence list per split: all German sentences followed by all English ones.
train_corpus = [*train_src_trg['de'].tolist(), *train_src_trg['en'].tolist()]
test_corpus = [*test_src_trg['de'].tolist(), *test_src_trg['en'].tolist()]
valid_corpus = [*vad_src_trg['de'].tolist(), *vad_src_trg['en'].tolist()]

In [111]:
# Build the candidate vocabulary: every token the tokenizer emits over all three corpora.
from collections import Counter

# NOTE(review): these tokens come out of the existing tokenizer, so presumably
# most are already in its vocabulary (the later add_tokens cell shows an output of 1) — confirm
# this is the intended way to extend the vocabulary.
new_tokens = [
    token
    for text in train_corpus + test_corpus + valid_corpus
    for token in tokenizer.tokenize(text)
]

In [112]:
# Count how often each token occurs across the collected corpora.
counter = Counter(new_tokens)

In [114]:
# Keep only the tokens that occur more than 10 times.
new_vocab = [token for token, freq in counter.items() if freq > 10]

In [115]:
# Register the frequent tokens with the tokenizer; the cell output is add_tokens' return value.
tokenizer.add_tokens(new_vocab)

1

In [116]:
# Persist the tokenizer under ./tokenizer; the cell output lists the paths save_pretrained returned.
tokenizer.save_pretrained("./tokenizer")

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.json',
 './tokenizer/source.spm',
 './tokenizer/target.spm',
 './tokenizer/added_tokens.json')

### 将语句分割成idx并进行填充和截断

In [148]:
def tokenizer_pad(row: str, max_length: int = 512):
    """Encode one sentence, padded and truncated to ``max_length`` positions.

    Despite the parameter name, this notebook calls the function through
    ``Series.apply``, i.e. once per cell value (a single sentence), not with
    a whole Series.

    Args:
        row: The text to encode.
        max_length: Target length used for both padding and truncation.

    Returns:
        The tokenizer's encoding for ``row``; callers in this notebook read
        its ``'input_ids'`` and ``'attention_mask'`` entries. Tensors are
        requested in PyTorch format (``return_tensors='pt'``).
    """
    return tokenizer(row, padding="max_length", truncation=True, return_tensors='pt', max_length=max_length)

In [None]:
# Tokenize each language column once and reuse the encodings for both the
# ids and the masks (the previous version tokenized every column twice).
src_encoded = train_src_trg['de'].apply(tokenizer_pad)
trg_encoded = train_src_trg['en'].apply(tokenizer_pad)

# The mask columns have to be created by ``assign``: ``DataFrame.drop`` takes
# no ``src_mask`` / ``trg_mask`` keywords, so passing them there raised a TypeError.
train_data = train_src_trg.assign(
    src_input=src_encoded.apply(lambda enc: enc['input_ids']),
    trg_input=trg_encoded.apply(lambda enc: enc['input_ids']),
    src_mask=src_encoded.apply(lambda enc: enc['attention_mask']),
    trg_mask=trg_encoded.apply(lambda enc: enc['attention_mask']),
).drop(columns=["de", "en"])

# Source ids + mask are the model inputs; target ids are the labels.
train_feature, train_mask = train_data['src_input'], train_data['src_mask']
train_label = train_data['trg_input']

Unnamed: 0,src,trg
0,"[[tensor(398), tensor(695), tensor(537), tenso...","[[tensor(1196), tensor(8403), tensor(12353), t..."
1,"[[tensor(2449), tensor(45), tensor(10780), ten...","[[tensor(444), tensor(178), tensor(6359), tens..."
2,"[[tensor(398), tensor(695), tensor(142), tenso...","[[tensor(211), tensor(4544), tensor(17859), te..."
3,"[[tensor(444), tensor(90), tensor(5067), tenso...","[[tensor(3553), tensor(1803), tensor(358), ten..."
4,"[[tensor(3927), tensor(5521), tensor(90), tens...","[[tensor(609), tensor(500), tensor(1032), tens..."
...,...,...
99995,"[[tensor(2684), tensor(6123), tensor(348), ten...","[[tensor(444), tensor(3399), tensor(3452), ten..."
99996,"[[tensor(2449), tensor(11849), tensor(9959), t...","[[tensor(321), tensor(1694), tensor(2391), ten..."
99997,"[[tensor(2449), tensor(354), tensor(4209), ten...","[[tensor(444), tensor(982), tensor(118), tenso..."
99998,"[[tensor(2107), tensor(14551), tensor(166), te...","[[tensor(383), tensor(2605), tensor(6), tensor..."


In [146]:
# Exploratory: list the keys of each encoded ``src`` entry. The inner lambda
# previously had no body, which is a SyntaxError; it now returns the key list
# shown in the recorded output.
# NOTE(review): relies on a ``src`` column holding tokenizer encodings — confirm
# which version of train_data this cell is meant to run against.
train_data.assign(key=lambda frame: frame['src'].apply(lambda enc: list(enc.keys())))

0        [input_ids, attention_mask]
1        [input_ids, attention_mask]
2        [input_ids, attention_mask]
3        [input_ids, attention_mask]
4        [input_ids, attention_mask]
                    ...             
99995    [input_ids, attention_mask]
99996    [input_ids, attention_mask]
99997    [input_ids, attention_mask]
99998    [input_ids, attention_mask]
99999    [input_ids, attention_mask]
Name: src, Length: 100000, dtype: object

In [127]:
# Pull the ``input_ids`` entry out of every ``src`` cell.
train_data['src'].apply(lambda x: x['input_ids'])

0             [398, 695, 537, 12394, 537, 30618, 25478, 0]
1        [2449, 45, 10780, 18, 1633, 982, 20107, 2706, ...
2        [398, 695, 142, 695, 4299, 3212, 2533, 3830, 1...
3        [444, 90, 5067, 104, 5898, 1149, 21348, 18, 81...
4        [3927, 5521, 90, 2209, 8319, 1860, 142, 695, 3...
                               ...                        
99995    [2684, 6123, 348, 235, 23226, 1860, 1633, 2100...
99996    [2449, 11849, 9959, 216, 9327, 3546, 537, 2, 4...
99997    [2449, 354, 4209, 7219, 18, 6283, 2011, 603, 1...
99998    [2107, 14551, 166, 12139, 7292, 25283, 45, 247...
99999                 [2449, 4765, 4209, 6283, 5400, 3, 0]
Name: src, Length: 100000, dtype: object

In [77]:
def process_df(df: pd.DataFrame):
    """Flatten the ``translation`` column and tokenize both languages.

    The first 100000 rows are kept after ``de`` / ``en`` have been extracted
    from ``translation``. Every German sentence is encoded first, then every
    English one, each with ``padding=True`` and ``truncation=True``.

    Returns:
        A dict with ``"src"`` (encodings of ``de``) and ``"trg"`` (encodings
        of ``en``), each a list with one tokenizer result per row.
    """
    translations = df['translation']
    pairs = (
        df.assign(
            de=translations.apply(lambda pair: pair['de']),
            en=translations.apply(lambda pair: pair['en']),
        )
        .drop(columns=['translation'])
        .head(100000)
    )

    def encode_all(sentences):
        # One tokenizer call per sentence, in row order.
        return [tokenizer(sentence, padding=True, truncation=True) for sentence in sentences]

    return {"src": encode_all(pairs['de']), "trg": encode_all(pairs['en'])}

In [78]:
# Tokenize every split. Note that this binds ``train_data`` to the dict
# returned by process_df.
train_data = process_df(train_df)
test_data = process_df(test_df)
valid_data = process_df(vad_df)

In [71]:
# Take a small subset: the first 100,000 rows.
# NOTE(review): this indexes ``train_data['de']`` / ``['en']`` and calls ``.head``, so it
# presumably expects train_data to be a DataFrame with de/en columns here — confirm.
train_src_raw, train_trg_raw = train_data['de'].head(100000), train_data['en'].head(100000)

# Empty src/trg containers.
train_data_dict = {"src": [], "trg": []}


In [70]:
# Display the ``trg`` entry of train_data_dict.
train_data_dict['trg']

{'input_ids': [38, 121, 39, 3059, 1961, 8211, 18, 7, 4, 368, 3, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [62]:
# Encode ``src`` with truncation to at most 512 tokens, requesting PyTorch tensors.
# NOTE(review): ``src`` is not defined in any cell visible in this section — confirm where it comes from.
input_demo = tokenizer(src, padding=True, truncation=True, return_tensors='pt', max_length=512)

In [67]:
# Show the sub-word pieces the tokenizer produces for ``src``.
tokenizer.tokenize(src)

['▁Ich', '▁bin', '▁für', '▁das', '▁System', '.']

In [66]:
# Capture the tokenizer's attribute names (via dir) for inspection.
tokenizer_dir = dir(tokenizer)

In [129]:
# Display the encoding stored in input_demo.
input_demo

{'input_ids': [[105, 495, 28, 44, 744, 3, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}

In [64]:
# Pad the demo encoding to 512 positions and return PyTorch tensors.
tokenizer.pad(input_demo, return_tensors='pt', padding='max_length', max_length=512)

{'input_ids': tensor([[  105,   495,    28,    44,   744,     3,     0, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 5

In [60]:
# Encode ``src`` directly and display the resulting ids and attention mask.
tokenizer(src, padding=True, truncation=True, return_tensors='pt', max_length=512)

{'input_ids': tensor([[105, 495,  28,  44, 744,   3,   0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [59]:
# Encode ``src`` and pad the result to 512 positions in a single expression.
tokenizer.pad(tokenizer(src, padding=True, truncation=True, return_tensors='pt', max_length=512), return_tensors="pt", padding='max_length', max_length=512)

{'input_ids': tensor([[  105,   495,    28,    44,   744,     3,     0, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 5

In [29]:
# Display the ``trg`` entries collected in train_data_dict.
train_data_dict['trg']

[{'input_ids': [6053, 477, 12353, 7, 4, 2952, 6, 2314, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [38, 230, 6359, 4006, 371, 8403, 108, 4, 2952, 6, 2314, 7, 4, 151, 3336, 1043, 7096, 3059, 3075, 705, 34, 108, 32, 7135, 3136, 507, 9851, 418, 935, 1958, 2, 8, 38, 563, 10382, 6843, 920, 32, 1214, 17, 11514, 118, 12, 3066, 1870, 41, 14, 3860, 20133, 168, 17, 3722, 5, 4, 6516, 1543, 35, 41, 17, 15, 3075, 132, 108, 14, 17, 5178, 817, 1085, 1772, 1013, 343, 1544, 82, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [6069, 216, 12015, 2, 52, 41, 73, 72, 2952, 15, 2, 4, 711, 8104, 108, 255, 90, 26050, 34, 1540, 700, 2597, 22, 6620, 3548, 82, 12, 17, 3771, 2029, 2, 530, 4, 238, 5, 14, 17, 34, 477, 935, 7, 1253, 5235, 3417, 5813, 15904, 108, 14, 17, 13420, 6

In [30]:
from transformers import AutoModel

# Load the pretrained "Everlyn/transformer_base" checkpoint through AutoModel.
model = AutoModel.from_pretrained("Everlyn/transformer_base")

In [13]:
# Display the model's module structure.
model

MBartModel(
  (shared): Embedding(50265, 512, padding_idx=1)
  (encoder): MBartEncoder(
    (embed_tokens): MBartScaledWordEmbedding(50265, 512, padding_idx=1)
    (embed_positions): MBartLearnedPositionalEmbedding(514, 512)
    (layers): ModuleList(
      (0-5): 6 x MBartEncoderLayer(
        (self_attn): MBartAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation_fn): ReLU()
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
    (layernorm_embed

In [25]:
# Shape of the last element of the model's output for ``inputs``.
# NOTE(review): ``inputs`` is not defined in any cell visible in this section — confirm where it comes from.
(model(**inputs))[-1].shape

torch.Size([2, 15, 512])

In [28]:
# Look up the tokenizer's separator-token id.
tokenizer.sep_token_id

102