In [1]:
import torch
import numpy as np
import os
import json
from EduNLP.ModelZoo.rnn import ElmoLM
from EduNLP.Pretrain import train_elmo, ElmoTokenizer
from EduNLP.Vector import ElmoModel, T2V
from EduNLP.I2V import Elmo, get_pretrained_i2v

# Opt out of the wandb logging integration for the training run below.
# NOTE(review): the recorded training log reports this variable as deprecated
# (removal announced for v5) and recommends the `report_to` option instead.
os.environ["WANDB_DISABLED"] = "true"

# 训练自己的 Elmo 模型
## 1. 数据

In [2]:
# Point these at your own data location and output location.
BASE_DIR = "../.."

# Directory that contains the raw training data file read by stem_data().
data_dir = BASE_DIR + "/static/test_data"
# Directory handed to train_elmo and later reloaded from; trailing slash kept.
output_dir = BASE_DIR + "/data/pretrain_test_models/elmo/"

In [3]:
def stem_data(data_path=None):
    """Load the training items from a JSON Lines file.

    Args:
        data_path: Optional path of the file to read. When omitted, the file
            ``standard_luna_data.json`` inside the notebook-level ``data_dir``
            is used, which is the original behaviour.

    Returns:
        list: One parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    if data_path is None:
        data_path = os.path.join(data_dir, "standard_luna_data.json")
    with open(data_path, encoding="utf-8") as f:
        # Iterate the file lazily instead of materialising it with readlines().
        # Blank lines (e.g. a stray empty line at the end of the file) would make
        # json.loads raise, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]

# Training corpus: the parsed JSON records returned by stem_data().
train_items = stem_data()

## 2. 训练和评估

In [4]:
# Custom training arguments forwarded to train_elmo.
train_params = {
  "num_train_epochs": 1,
  "per_device_train_batch_size": 8,
  "save_steps": 50,
  "save_total_limit": 2,
  "logging_steps": 5,
  "gradient_accumulation_steps": 1,
  "learning_rate": 5e-4,
  # Disable external logging integrations explicitly: the training log reports
  # the WANDB_DISABLED environment variable as deprecated in favour of `report_to`.
  # NOTE(review): assumes train_params is passed through to the Trainer's
  # TrainingArguments, as the other key names suggest — confirm.
  "report_to": "none",
}

# Train on the loaded items; the returned output directory is the cell's displayed value.
train_elmo(train_items, output_dir, train_params=train_params)

  0%|          | 0/1 [00:00<?, ?ba/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running training *****
  Num examples = 25
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4


  0%|          | 0/4 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ../../data/pretrain_test_models/elmo/
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


{'train_runtime': 6.0807, 'train_samples_per_second': 4.111, 'train_steps_per_second': 0.658, 'train_loss': 11.407991409301758, 'epoch': 1.0}


'../../data/pretrain_test_models/elmo/'


## 3. 使用模型

In [5]:
# Sample items used to exercise the trained model; each item is a dict whose
# 'ques_content' value holds the question text.
# NOTE: the backslash line continuations sit inside the string literals, so the
# leading spaces of every continued line are part of the resulting text.
test_items = [
    {'ques_content': '有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$，\
            如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,\
            若$x,y$满足约束条件$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$'},
    {'ques_content': '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$, \
            若$x,y$满足约束条件$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$'},
    {'ques_content': "已知集合$A=\\left\\{x \\mid x^2-3 x-4<0\\right\\}, \
            \\quad B=\\{-4,1,3,5\\}, \\quad$ 则$A \\cap B$的值为"}
]

### 3.1 直接加载令牌容器和模型

In [6]:
# Reuse the training output directory: both the model and the tokenizer are
# loaded from it below.
pretrained_model_dir = output_dir

model = ElmoLM.from_pretrained(pretrained_model_dir)
tokenizer = ElmoTokenizer.from_pretrained(pretrained_model_dir)

# Pass the text accessor as `key=`, matching every other tokenizer / i2v call
# in this notebook instead of relying on positional order.
encodes = tokenizer(test_items, key=lambda x: x['ques_content'])
# Forward pass; the resulting ElmoLMOutput is displayed as the cell output.
model(**encodes)

[EduNLP, INFO] All the weights of ElmoLM were initialized from the model checkpoint at ../../data/pretrain_test_models/elmo/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use ElmoLM for predictions without further training.


ElmoLMOutput([('pred_forward',
               tensor([[[ 0.0711,  0.0329, -0.0712,  ..., -0.0264, -0.1005, -0.0407],
                        [ 0.0061, -0.0306, -0.0051,  ..., -0.0114, -0.0032, -0.0349],
                        [ 0.1096,  0.0290, -0.0693,  ..., -0.0570, -0.0247,  0.0044],
                        ...,
                        [ 0.1481,  0.0452, -0.1055,  ..., -0.0555, -0.0767,  0.0661],
                        [ 0.0716,  0.0912, -0.1071,  ...,  0.0242, -0.0758, -0.0357],
                        [ 0.0827,  0.0552, -0.0762,  ..., -0.0408, -0.0744, -0.0262]],
               
                       [[ 0.1062,  0.0579, -0.0643,  ..., -0.0324, -0.0232, -0.0349],
                        [ 0.0868,  0.0312, -0.0822,  ..., -0.0020, -0.0091, -0.0700],
                        [ 0.1125,  0.0201, -0.0719,  ...,  0.0029,  0.0186, -0.0363],
                        ...,
                        [ 0.1114,  0.0602, -0.1583,  ..., -0.0303, -0.0600,  0.0948],
                        [ 0.0419, 

### 3.2 使用 I2V 向量化

In [7]:
# Load the model and the tokenizer configuration from the same directory.
tokenizer_kwargs = {"tokenizer_config_dir": pretrained_model_dir}
i2v = Elmo('elmo', 'elmo', pretrained_model_dir, tokenizer_kwargs=tokenizer_kwargs)

# A single item can be vectorized; the result keeps a leading axis of size 1.
i_vec, t_vec = i2v(test_items[0], key=lambda x: x["ques_content"])
print(i_vec.shape) # == torch.Size([1, dim])            (recorded run: [1, 600])
print(t_vec.shape) # == torch.Size([1, n_tokens, dim])  (recorded run: [1, 17, 600])

# A list of items can be vectorized as well.
i_vec, t_vec = i2v(test_items, key=lambda x: x["ques_content"])
print(i_vec.shape) # == torch.Size([len(test_items), dim])
print(t_vec.shape) # == torch.Size([len(test_items), n_tokens, dim])


[EduNLP, INFO] All the weights of ElmoLM were initialized from the model checkpoint at ../../data/pretrain_test_models/elmo/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use ElmoLM for predictions without further training.
  (outputs.forward_output[torch.arange(len(items["seq_len"])), torch.tensor(items["seq_len"]) - 1],


torch.Size([1, 600])
torch.Size([1, 17, 600])
torch.Size([3, 600])
torch.Size([3, 40, 600])


### 3.3 使用 Tokenizer 和 T2V 向量化

In [8]:
# Load the tokenizer that was saved alongside the previously trained model
tokenizer = ElmoTokenizer.from_pretrained(pretrained_model_dir)
encodes = tokenizer(test_items, key=lambda x: x['ques_content'])

t2v = ElmoModel(pretrained_model_dir)

# Calling t2v directly; the recorded output has the same shape as infer_vector below.
i_vec = t2v(encodes)
print(i_vec.shape) # == torch.Size([len(test_items), dim])
print()

# Item-level and token-level vectors requested separately.
i_vec = t2v.infer_vector(encodes)
t_vec = t2v.infer_tokens(encodes)
print(i_vec.shape) # == torch.Size([len(test_items), dim])
print(t_vec.shape) # == torch.Size([len(test_items), n_tokens, dim])
print()

[EduNLP, INFO] All the weights of ElmoLM were initialized from the model checkpoint at ../../data/pretrain_test_models/elmo/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use ElmoLM for predictions without further training.


torch.Size([3, 600])

torch.Size([3, 600])
torch.Size([3, 40, 600])



### 3.4 使用 EduNLP 中公开的预训练模型

In [12]:
# Fetch a publicly released pretrained model.
# NOTE(review): the recorded run failed with an SSLError while requesting the
# pretrained-model list over HTTPS, so this call needs network access to that
# service. `model_dir` is presumably where the model is stored locally — confirm.
pretrained_dir = f"{BASE_DIR}/examples/test_model/elmo"
i2v = get_pretrained_i2v("elmo_pp_test", model_dir=pretrained_dir)

SSLError: HTTPSConnectionPool(host='modelhub-backend-269-production.env.bdaa.pro', port=443): Max retries exceeded with url: /v1/api/getPretrainedModelList (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:997)')))

In [None]:
# Vectorize the whole list. The items are dicts, so `key` must extract the
# question text, exactly as in every other i2v call of this notebook.
i_vec, t_vec = i2v(test_items, key=lambda x: x['ques_content'])
print(i_vec.shape)
print(t_vec.shape)
print()

# Item-level and token-level representations can also be requested separately.
i_vec = i2v.infer_item_vector(test_items, key=lambda x: x['ques_content'])
print(i_vec.shape)
t_vec = i2v.infer_token_vector(test_items, key=lambda x: x['ques_content'])
print(t_vec.shape)
print()

# Likewise, a single item can be vectorized on its own.
i_vec, t_vec = i2v(test_items[0], key=lambda x: x['ques_content'])
print(i_vec.shape)
print(t_vec.shape)