In [1]:
import torch
import numpy as np
import os
import json
from EduNLP.Pretrain import train_elmo, ElmoTokenizer
from EduNLP.Vector import ElmoModel, T2V
from EduNLP.I2V import Elmo, get_pretrained_i2v



# 训练自己的Elmo模型
## 1. 数据

In [2]:
# Set your data path and output path.
# BASE_DIR is a relative path, so it is resolved against the current working directory.
BASE_DIR = "../.."

# Directory that raw_data() reads OpenLUNA.json from.
data_dir = f"{BASE_DIR}/static/test_data"
# Directory passed to train_elmo; vocab.json is later loaded from here as well.
output_dir = f"{BASE_DIR}/examples/test_model/elmo"

In [3]:
def raw_data(data_path=None):
    """Load the raw items from a JSON Lines file (one JSON value per line).

    Parameters
    ----------
    data_path : str or None
        File to read. When omitted, ``OpenLUNA.json`` inside the module-level
        ``data_dir`` is used, which is what the zero-argument call has always
        loaded.

    Returns
    -------
    list
        One decoded JSON value per non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    if data_path is None:
        data_path = os.path.join(data_dir, "OpenLUNA.json")
    with open(data_path, encoding="utf-8") as f:
        # Iterate the file object directly instead of materialising every line
        # with readlines(), and skip blank lines (e.g. a trailing empty line),
        # which json.loads would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]

def stem_data(data):
    """Tokenize the ``'stem'`` field of every record in ``data``.

    A fresh ``ElmoTokenizer`` is created and each stem is tokenized with
    ``freeze_vocab=False``. Records whose tokenization is ``None`` are
    dropped. An ``AssertionError`` is raised if nothing survives.
    """
    tokenizer = ElmoTokenizer()
    tokenized = [
        tokenizer.tokenize(item=record['stem'], freeze_vocab=False)
        for record in data
    ]
    kept = [tokens for tokens in tokenized if tokens is not None]
    # An empty training set would be useless downstream; fail early.
    assert kept
    return kept

# Bind the loaded records to a new name instead of reusing `raw_data`:
# overwriting the loader function with its own result makes this cell fail
# with "TypeError: 'list' object is not callable" when it is run a second time.
raw_items = raw_data()
train_items = stem_data(raw_items)

## 2. 训练和评估

In [4]:
# Custom training parameters; the dict is unpacked into train_elmo as keyword arguments.
train_params = {
  "emb_dim": 128,
  "hid_dim": 256,
  "batch_size": 4,
  "epochs": 1,
  "lr": 5e-3,
  "device": None
}

# NOTE(review): the recorded output of this cell contains a cuda/cpu device-mismatch
# RuntimeError. Presumably device=None lets train_elmo pick the device itself --
# confirm against the train_elmo documentation and consider setting it explicitly.
train_elmo(train_items, output_dir, **train_params)

RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_index_select
[DEBUG]Sample idx: tensor([[   7,  112,  113,   91,   13,   93,   10,   13,   15,   16,   17,   18,
           19,   17,   13,   34,  125,   11,   91,  248,   15,   16,   13,   19,
           29,   18,   34,  248,   15,   16,   19,   13,   34,   29,   18,   24,
           93, 1181,  253,  125,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,  

'../../examples/test_model/elmo'


## 3. 使用模型

### 3.1 使用训练好的Elmo模型

In [5]:
# Two sample items. In the first one the trailing backslash continues the string
# literal onto the next source line, so that line's leading spaces are part of the stem.
item = [
        {'stem': '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$, \
        若$x,y$满足约束条件$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$'},
        {'stem': '已知圆$x^{2}+y^{2}-6 x=0$，过点(1,2)的直线被该圆所截得的弦的长度的最小值为'}
]

# Point the tokenizer at the vocabulary file stored under output_dir.
tokenizer_kwargs = {"path": os.path.join(output_dir, "vocab.json")}
i2v = Elmo('elmo', 'elmo', output_dir, tokenizer_kwargs=tokenizer_kwargs)

# A single item can be vectorized; the call returns an (item vector, token vectors) pair.
i_vec, t_vec = i2v(item[0]['stem'])
print(i_vec.shape) # torch.Size([dim]); recorded run: torch.Size([512])
print(t_vec.shape) # torch.Size([n_tokens, dim]); recorded run: torch.Size([15, 512])

# A list of items can be vectorized as well.
i_vec, t_vec = i2v([ item[0]['stem'], item[1]['stem'] ])
print(i_vec.shape) # torch.Size([2, dim]); recorded run: torch.Size([2, 512])
print(t_vec.shape) # torch.Size([2, n_tokens, dim]); recorded run: torch.Size([2, 25, 512])


torch.Size([512])
torch.Size([15, 512])
torch.Size([2, 512])
torch.Size([2, 25, 512])


### 3.2 使用Elmo Tokenizer

In [6]:
# Load the tokenizer of the previously trained model from its vocab.json.
tokenizer = ElmoTokenizer(os.path.join(output_dir, "vocab.json"))

# Item texts to tokenize. The backslash at the end of the first source line continues
# the string literal, so the next line's leading spaces are part of the text.
items = [
    "有公式$\\FormFigureID{wrong1?}$，如图$\\FigureID{088f15ea-xxx}$,\
    若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$",
    "已知圆$x^{2}+y^{2}-6 x=0$，过点(1,2)的直线被该圆所截得的弦的长度的最小值为"
]
# A single item can be tokenized; the recorded output is a (token ids, length) pair.
print(tokenizer(items[0], freeze_vocab=True))
print()

# A list of items can be tokenized as well; the recorded output holds one id list
# and one length per item.
print(tokenizer(items, freeze_vocab=True))
print()



([527, 231, 3, 13, 26, 79, 159, 527, 6, 33, 10, 13, 34, 133, 79, 168, 4], 17)

([[527, 231, 3, 13, 26, 79, 159, 527, 6, 33, 10, 13, 34, 133, 79, 168, 4], [7, 104, 13, 15, 16, 17, 18, 34, 79, 15, 16, 17, 18, 19, 105, 13, 10, 23, 106, 107, 104, 108, 109, 110, 111]], [17, 25])



In [7]:
# The tokenize method shows the tokenized text (token strings rather than ids),
# for a single item and for a list of items.
print(tokenizer.tokenize(items[0], freeze_vocab=True))
print(tokenizer.tokenize(items, freeze_vocab=True))

['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']
[['公式', '如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '公式', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]'], ['已知', '圆', 'x', '^', '{', '2', '}', '+', 'y', '^', '{', '2', '}', '-', '6', 'x', '=', '0', '过点', '直线', '圆', '截得', '弦', '长度', '最小值']]


### 3.3 使用EduNLP中公开的预训练模型

In [9]:
# Fetch a publicly released pretrained model.
# NOTE(review): this is the same path as output_dir, so the download lands next to
# the model trained above (the recorded log shows an elmo_test subfolder) -- confirm
# this is intended.
pretrained_dir = f"{BASE_DIR}/examples/test_model/elmo"
i2v = get_pretrained_i2v("elmo_test", model_dir=pretrained_dir)

EduNLP, INFO model_path: ..\..\examples\test_model\elmo\elmo_test
EduNLP, INFO Use pretrained t2v model elmo_test
downloader, INFO http://base.ustc.edu.cn/data/model_zoo/modelhub/elmo_pub/1/elmo_test.zip is saved as ..\..\examples\test_model\elmo\elmo_test.zip


Downloading ..\..\examples\test_model\elmo\elmo_test.zip 100.00%: 402KB | 402KB

downloader, INFO ..\..\examples\test_model\elmo\elmo_test.zip is unzip to ..\..\examples\test_model\elmo\elmo_test





In [10]:
# Same sample texts as in the tokenizer section, redefined here so the cell is self-contained.
items = [
    "有公式$\\FormFigureID{wrong1?}$，如图$\\FigureID{088f15ea-xxx}$,\
    若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$，则$z=x+7 y$的最大值为$\\SIFBlank$",
    "已知圆$x^{2}+y^{2}-6 x=0$，过点(1,2)的直线被该圆所截得的弦的长度的最小值为"
]
# Calling i2v on a list returns the item vectors and the token vectors together.
i_vec, t_vec = i2v(items)
print(i_vec.shape)
print(t_vec.shape)
print()

# The item vectors and the per-token vectors can also be obtained separately.
i_vec = i2v.infer_item_vector(items)
print(i_vec.shape)
t_vec = i2v.infer_token_vector(items)
print(t_vec.shape)
print()

# Likewise, a single item can be vectorized.
i_vec, t_vec = i2v(items[0])
print(i_vec.shape)
print(t_vec.shape)

torch.Size([2, 64])
torch.Size([2, 25, 64])

torch.Size([2, 64])
torch.Size([2, 25, 64])

torch.Size([64])
torch.Size([17, 64])
