In [1]:
import sys

sys.path.append("..")

import logging
import os
from pathlib import Path

from transformers import AutoConfig, AutoTokenizer
from transformers import (
    HfArgumentParser,
    set_seed,
)

from src.arguments import (
    ModelArguments,
    DataArguments,
    RetrieverTrainingArguments as TrainingArguments,
)
from src.data import TrainDatasetForEmbedding, EmbedCollator
from src.modeling import BiEncoderModel
# from trainer import BiTrainer
from transformers import Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from torch import nn

In [3]:
# Model download: fetch the bert-base-uncased snapshot from ModelScope.
# The value returned by `snapshot_download` is kept in `model_dir` and is later
# supplied as `model_name_or_path` in the argument dict.
from modelscope import snapshot_download

MODEL_ID = 'AI-ModelScope/bert-base-uncased'
model_dir = snapshot_download(MODEL_ID)

Downloading Model from https://www.modelscope.cn to directory: /home/jie/.cache/modelscope/hub/models/AI-ModelScope/bert-base-uncased




In [4]:
# Raw hyper-parameters for this run. The dict is turned into the
# (ModelArguments, DataArguments, TrainingArguments) objects by
# `HfArgumentParser.parse_dict` in the next cell.
args_d = {
    "output_dir": "output",
    # Alternative checkpoint, currently disabled:
    # "model_name_or_path": "BAAI/bge-large-zh-v1.5",
    "model_name_or_path": model_dir,  # path returned by the ModelScope download above
    "train_data": "./toy_finetune_data.jsonl",
    "learning_rate": 1e-5,
    "fp16": True,
    "num_train_epochs": 5,
    "per_device_train_batch_size": 2,
    "dataloader_drop_last": True,
    # Spelling is intentional: the value is read back as `training_args.normlized`.
    "normlized": True,
    # NOTE(review): the `mean_model` constructed further down is not given
    # `temperature`, so this value may not reach the loss — confirm.
    "temperature": 0.02,
    "query_max_len": 64,
    "passage_max_len": 256,
    # The sample printed below pairs one query with a list of 4 passages.
    "train_group_size": 4,
    "negatives_cross_device": False,
    "logging_steps": 10,
    # Prefix that appears at the start of every query (see the `train_dataset[0]` output).
    # NOTE(review): with bert-base-uncased the collated query shows many tokens with
    # id 100 (presumably [UNK]) for this Chinese prefix — confirm it suits the model.
    "query_instruction_for_retrieval": "为这个句子生成表示以用于检索相关文章：",
    "save_safetensors": False,
}

In [6]:
# Split the flat `args_d` dict into the three argument objects used in the
# rest of the notebook (model, data and training settings).
argument_classes = (ModelArguments, DataArguments, TrainingArguments)
parser = HfArgumentParser(argument_classes)
parsed_arguments = parser.parse_dict(args_d)
model_args, data_args, training_args = parsed_arguments

In [6]:
# Display the parsed training arguments for inspection.
training_args



In [7]:
# Load the tokenizer and the model config for the selected checkpoint.
# An explicit tokenizer/config name on `model_args` takes precedence; when it is
# empty the model path itself is used as the source.
num_labels = 1

tokenizer_source = model_args.tokenizer_name or model_args.model_name_or_path
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_source,
    cache_dir=model_args.cache_dir,
    use_fast=False,
)

config_source = model_args.config_name or model_args.model_name_or_path
config = AutoConfig.from_pretrained(
    config_source,
    num_labels=num_labels,
    cache_dir=model_args.cache_dir,
)

# Disabled reference call, kept for comparison with the `mean_model` cell
# further down (it forwards every model-related training argument):
# model = BiEncoderModel(
#     model_name=model_args.model_name_or_path,
#     normlized=training_args.normlized,
#     sentence_pooling_method=training_args.sentence_pooling_method,
#     negatives_cross_device=training_args.negatives_cross_device,
#     temperature=training_args.temperature,
# )

In [8]:
# Build the training dataset from the parsed data arguments and the tokenizer loaded above.
train_dataset = TrainDatasetForEmbedding(args=data_args, tokenizer=tokenizer)

In [9]:
# Peek at the first example: the output is a (query string, list of passage strings) pair.
train_dataset[0]

('为这个句子生成表示以用于检索相关文章：Five women walk along a beach wearing flip-flops.',
 ['Some women with flip-flops on, are walking along the beach',
  'The man is talking about hawaii.',
  "She's not going to court to clear her record.",
  'There was a reform in 1996.'])

In [9]:
# Collator that turns raw (query, passages) examples into tokenized batches,
# using the query/passage length limits from the parsed data arguments.
collator_length_limits = {
    "query_max_len": data_args.query_max_len,
    "passage_max_len": data_args.passage_max_len,
}
data_collator = EmbedCollator(tokenizer, **collator_length_limits)

In [11]:
# Collate one example to inspect the batch layout: the output has a 'query' and a
# 'passage' entry, each holding input_ids / token_type_ids / attention_mask tensors.
data_collator([train_dataset[0]])

{'query': {'input_ids': tensor([[  101,   100,   100,   100,   100,  1816,  1910,  1854,   100,  1923,
            100,   100,   100,   100,   100,  1919,   100,  1861,  1932,  1993,
           2274,  2308,  3328,  2247,  1037,  3509,  4147, 11238,  1011, 28583,
           2015,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1]])},
 'passage': {'input_ids': tensor([[  101,  2070,  2308,  2007, 11238,  1011, 28583,  2015,  2006,  1010,
           2024,  3788,  2247,  1996,  3509,   102],
         [  101,  1037,  2177,  1997,  2111,  3248,  7454,  1012,   102,     0,
              0,     0,     0,     0,     0,     0],
         [  101,  2016,  1005,  1055,  2025,  2183,  2000,  2457,  2000,  3154,
           2014,  2501,  1012,   102,     0,     0],


In [12]:
# Show which checkpoint the parsed model arguments point at.
# NOTE(review): the recorded output is 'BAAI/bge-large-zh-v1.5', whereas `args_d`
# sets this field to `model_dir` — the output looks stale; re-run to confirm.
model_args.model_name_or_path

'BAAI/bge-large-zh-v1.5'

In [12]:
# Rebind `tokenizer`, this time without the `cache_dir` / `use_fast=False` options
# used in the earlier load.
# NOTE(review): `data_collator` was constructed with the previous tokenizer object,
# so rebinding the name here does not change it; only later uses of `tokenizer`
# (e.g. the Trainer cell) pick up this instance — confirm that is intended.
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)

In [41]:
# Collate a single training example and pull the two tokenized inputs out by key.
# Looking the entries up by name (instead of unpacking `.values()`) does not rely
# on the order or the number of entries the collator returns.
single_example_batch = data_collator([train_dataset[0]])
query, passage = single_example_batch["query"], single_example_batch["passage"]

In [42]:
# Inspect the tokenized query (input_ids, token_type_ids, attention_mask).
query

{'input_ids': tensor([[  101,   711,  6821,   702,  1368,  2094,  4495,  2768,  6134,  4850,
           809,  4500,   754,  3466,  5164,  4685,  1068,  3152,  4995,  8038,
         12706, 12912,   165, 11346,  9266,  9142,   143,  9983, 12679,  8221,
           148,  8636,  8187,   118,   148,  8897,  8525,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [12]:
# Bi-encoder that pools token embeddings with the "mean" strategy.
# NOTE(review): unlike the disabled reference call earlier in the notebook, this
# constructor is not given `temperature` or `negatives_cross_device`; the forward
# scores printed below equal the raw similarities, so the configured temperature
# appears not to be applied — confirm this is intended.
POOLING_METHOD = "mean"
mean_model = BiEncoderModel(
    sentence_pooling_method=POOLING_METHOD,
    normlized=training_args.normlized,
    model_name=model_args.model_name_or_path,
)

In [49]:
# Encode the query and its passages separately, then score every passage
# against the query (the output is a 1 x 4 matrix for this single example).
query_embedding = mean_model.encode(query)
passage_embeddings = mean_model.encode(passage)
score = mean_model.compute_similarity(query_embedding, passage_embeddings)
score

tensor([[0.7693, 0.2740, 0.5726, 0.2958]], grad_fn=<MmBackward0>)

In [45]:
# Embedding produced for the single query (one row in the output).
mean_model.encode(query)

tensor([[ 0.0343,  0.0061, -0.0474,  ..., -0.0099, -0.0407,  0.0079]],
       grad_fn=<DivBackward0>)

In [46]:
# Embeddings produced for the passages of that query (four rows in the output).
mean_model.encode(passage)

tensor([[ 2.5201e-02,  1.6923e-02, -2.8226e-02,  ..., -1.4305e-02,
          2.4664e-02,  8.6464e-06],
        [ 2.5590e-02,  5.7111e-02, -4.5425e-02,  ...,  1.3755e-02,
          1.1986e-02, -3.4164e-02],
        [-1.4720e-02,  2.2096e-02, -2.6135e-02,  ..., -2.0131e-02,
         -9.4332e-03, -1.5504e-02],
        [-1.5406e-02,  1.2138e-02,  2.2544e-02,  ..., -5.2548e-02,
         -2.6954e-02,  2.0775e-04]], grad_fn=<DivBackward0>)

In [44]:
# Full forward pass: the returned EncoderOutput carries q_reps, p_reps, loss and scores.
# Its `scores` match the values obtained from `compute_similarity` above.
mean_model(query=query, passage=passage)

EncoderOutput(q_reps=tensor([[ 0.0343,  0.0061, -0.0474,  ..., -0.0099, -0.0407,  0.0079]],
       grad_fn=<DivBackward0>), p_reps=tensor([[ 2.5201e-02,  1.6923e-02, -2.8226e-02,  ..., -1.4305e-02,
          2.4664e-02,  8.6464e-06],
        [ 2.5590e-02,  5.7111e-02, -4.5425e-02,  ...,  1.3755e-02,
          1.1986e-02, -3.4164e-02],
        [-1.4720e-02,  2.2096e-02, -2.6135e-02,  ..., -2.0131e-02,
         -9.4332e-03, -1.5504e-02],
        [-1.5406e-02,  1.2138e-02,  2.2544e-02,  ..., -5.2548e-02,
         -2.6954e-02,  2.0775e-04]], grad_fn=<DivBackward0>), loss=tensor(1.1163, grad_fn=<NllLossBackward0>), scores=tensor([[0.7693, 0.2740, 0.5726, 0.2958]], grad_fn=<ViewBackward0>))

In [52]:
import torch

In [54]:
# Same forward pass as above, but with autograd disabled: the tensors in the
# result carry no grad_fn (compare the two outputs).
forward_inputs = {"query": query, "passage": passage}
with torch.no_grad():
    res = mean_model(**forward_inputs)

In [55]:
# Inspect the no-grad result; values are the same as in the grad-enabled forward pass.
res

EncoderOutput(q_reps=tensor([[ 0.0343,  0.0061, -0.0474,  ..., -0.0099, -0.0407,  0.0079]]), p_reps=tensor([[ 2.5201e-02,  1.6923e-02, -2.8226e-02,  ..., -1.4305e-02,
          2.4664e-02,  8.6464e-06],
        [ 2.5590e-02,  5.7111e-02, -4.5425e-02,  ...,  1.3755e-02,
          1.1986e-02, -3.4164e-02],
        [-1.4720e-02,  2.2096e-02, -2.6135e-02,  ..., -2.0131e-02,
         -9.4332e-03, -1.5504e-02],
        [-1.5406e-02,  1.2138e-02,  2.2544e-02,  ..., -5.2548e-02,
         -2.6954e-02,  2.0775e-04]]), loss=tensor(1.1163), scores=tensor([[0.7693, 0.2740, 0.5726, 0.2958]]))

In [21]:
# Stand-alone cross-entropy criterion, used further down to recompute the
# model's loss by hand from the raw similarity scores.
cross_entropy = nn.CrossEntropyLoss(reduction="mean")

## Batch data

In [13]:
# Collate the first three training examples into one batch and pull the two
# tokenized inputs out by key. Looking the entries up by name (instead of
# unpacking `.values()`) does not rely on the order or the number of entries
# the collator returns.
three_example_batch = data_collator([train_dataset[i] for i in range(3)])
batch_query, batch_passage = three_example_batch["query"], three_example_batch["passage"]

In [14]:
# Inspect the tokenized query batch (three rows, padded to a common length).
batch_query

{'input_ids': tensor([[  101,   100,   100,   100,   100,  1816,  1910,  1854,   100,  1923,
           100,   100,   100,   100,   100,  1919,   100,  1861,  1932,  1993,
          2274,  2308,  3328,  2247,  1037,  3509,  4147, 11238,  1011, 28583,
          2015,  1012,   102,     0,     0,     0],
        [  101,   100,   100,   100,   100,  1816,  1910,  1854,   100,  1923,
           100,   100,   100,   100,   100,  1919,   100,  1861,  1932,  1993,
          1037,  2450,  3061,  2006,  1037,  2152,  7656,  2006,  2028,  4190,
          2559,  2058,  1037,  2314,  1012,   102],
        [  101,   100,   100,   100,   100,  1816,  1910,  1854,   100,  1923,
           100,   100,   100,   100,   100,  1919,   100,  1861,  1932,  1993,
          2048,  2450,  2024,  2652,  5693,  1025,  2028,  1037, 12089,  1010,
          1996,  2060,  1037,  6710,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 

In [18]:
# Forward pass on the three-example batch; the EncoderOutput below includes the batch loss.
mean_model(query=batch_query, passage=batch_passage)

EncoderOutput(q_reps=tensor([[ 0.0264, -0.0110, -0.0012,  ..., -0.0346,  0.0140, -0.0548],
        [ 0.0115, -0.0143, -0.0160,  ..., -0.0351,  0.0100, -0.0383],
        [ 0.0068,  0.0037, -0.0002,  ..., -0.0284,  0.0262, -0.0567]],
       grad_fn=<DivBackward0>), p_reps=tensor([[ 0.0601, -0.0266, -0.0091,  ..., -0.0334, -0.0120, -0.0295],
        [ 0.0505,  0.0308, -0.0269,  ..., -0.0146,  0.0410, -0.0189],
        [ 0.0470, -0.0370,  0.0343,  ..., -0.0092,  0.0043, -0.0207],
        ...,
        [ 0.0046, -0.0206, -0.0217,  ..., -0.0370,  0.0110, -0.0384],
        [ 0.0554, -0.0042, -0.0180,  ..., -0.0143, -0.0018, -0.0145],
        [-0.0126, -0.0385,  0.0013,  ..., -0.0295, -0.0036,  0.0067]],
       grad_fn=<DivBackward0>), loss=tensor(2.4021, grad_fn=<NllLossBackward0>), scores=tensor([[0.5581, 0.4210, 0.3527, 0.4637, 0.5158, 0.3126, 0.4513, 0.4229, 0.4140,
         0.2105, 0.4903, 0.3946],
        [0.5212, 0.4166, 0.3669, 0.4586, 0.5332, 0.3186, 0.4538, 0.4296, 0.4035,
         0.

In [20]:
# Score every query in the batch against every passage in the batch
# (3 queries x 12 passages in the output below).
batch_query_embeddings = mean_model.encode(batch_query)
batch_passage_embeddings = mean_model.encode(batch_passage)
batch_scores = mean_model.compute_similarity(
    batch_query_embeddings, batch_passage_embeddings
)
batch_scores

tensor([[0.5581, 0.4210, 0.3527, 0.4637, 0.5158, 0.3126, 0.4513, 0.4229, 0.4140,
         0.2105, 0.4903, 0.3946],
        [0.5212, 0.4166, 0.3669, 0.4586, 0.5332, 0.3186, 0.4538, 0.4296, 0.4035,
         0.2310, 0.4357, 0.4211],
        [0.4923, 0.4345, 0.3714, 0.4525, 0.4658, 0.3053, 0.4426, 0.4184, 0.4044,
         0.2148, 0.4264, 0.4281]], grad_fn=<MmBackward0>)

In [28]:
# Recompute the batch loss by hand and compare it with the `loss` reported by
# the model's forward pass above (both print 2.4021).
# Row i is scored against all passages in the batch; its target column is
# i * train_group_size (assumes each query's passages are laid out contiguously
# with the positive first — TODO confirm against the dataset/collator).
# The targets are derived from `batch_scores` itself, so the check works for any
# batch size and on whichever device the scores live.
positive_columns = (
    torch.arange(batch_scores.size(0), device=batch_scores.device)
    * data_args.train_group_size
)
cross_entropy(batch_scores, positive_columns)

tensor(2.4021, grad_fn=<NllLossBackward0>)

In [None]:
# score

验证 HF 在计算 encode 时使用的是 mean 还是 cls 池化

In [13]:
# Build a fresh collator bound to the current `tokenizer`, then hand the model,
# the training arguments, the dataset and the collator to the Hugging Face Trainer.
# NOTE(review): the recorded stderr below points at this `Trainer(` call —
# presumably a deprecation warning for one of these keyword arguments; confirm
# against the installed transformers version.
train_collator = EmbedCollator(
    tokenizer,
    query_max_len=data_args.query_max_len,
    passage_max_len=data_args.passage_max_len,
)

trainer = Trainer(
    model=mean_model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=train_collator,
    tokenizer=tokenizer,
)

  trainer = Trainer(
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [14]:
# Run training; the returned TrainOutput (global step, mean training loss, metrics) is shown below.
trainer.train()

Step,Training Loss
10,1.9721
20,1.8612


TrainOutput(global_step=25, training_loss=1.89400390625, metrics={'train_runtime': 4.2306, 'train_samples_per_second': 11.819, 'train_steps_per_second': 5.909, 'total_flos': 0.0, 'train_loss': 1.89400390625, 'epoch': 5.0})

In [10]:
# Persist the trained model. No path is passed here — presumably it is written
# to the configured `output_dir`; confirm.
trainer.save_model()

In [11]:
# Check whether this process is the main (rank-zero) one; the recorded result is True.
trainer.is_world_process_zero()

True