In [1]:
import torch
import numpy as np
import os
import json
import codecs
from EduNLP.Pretrain import QuesNetTokenizer, pretrain_quesnet
from EduNLP.Vector import T2V
from EduNLP.I2V import QuesNet, get_pretrained_i2v

# Export WANDB_DISABLED=true for this process
# (presumably switches off Weights & Biases logging during training -- TODO confirm).
os.environ.update({"WANDB_DISABLED": "true"})



# 训练自己的QuesNet模型
## 1. 数据

In [2]:
# Set your data path and output path (both are built from BASE_DIR)
BASE_DIR = "../.."

data_dir = "{}/static/test_data".format(BASE_DIR)
output_dir = "{}/examples/test_model/quesnet".format(BASE_DIR)

In [3]:
def load_raw_data(data_path):
    """Read a JSON-Lines file and return its records as a list.

    Parameters
    ----------
    data_path : str
        Path to a UTF-8 encoded text file holding one JSON document per line.

    Returns
    -------
    list
        The decoded JSON value of every non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # The built-in open() only breaks lines on \n, \r and \r\n, so a record is
    # never cut in two by other Unicode line-boundary characters inside a JSON
    # string. Iterating the handle also avoids loading every line up front.
    with open(data_path, encoding="utf-8") as f:
        # Skip blank lines (e.g. an extra empty line at the end of the file),
        # which json.loads would reject.
        return [json.loads(line) for line in f if line.strip()]


# The loader has its own name so that it is not shadowed by its result:
# `raw_data` always refers to the list of records.
raw_data = load_raw_data(os.path.join(data_dir, "quesnet_data.json"))

## 2. 训练Tokenizer

In [4]:
# Create the tokenizer with `know_name` as its meta field, max_length 50 and
# the `quesnet_img` folder inside the data directory as img_dir.
quesnet_img_dir = os.path.join(data_dir, "quesnet_img")
tokenizer = QuesNetTokenizer(
    meta=['know_name'],
    max_length=50,
    img_dir=quesnet_img_dir,
)

# Set the vocabulary, reading each raw item through its `ques_content` key
tokenizer.set_vocab(
    raw_data,
    key=lambda x: x['ques_content'],
    trim_min_count=3,
    silent=False,
)

print("vocab_size: ", tokenizer.vocab_size)
print()


save words(3): 64/249 = 0.2570                  with frequency 696/927=0.7508
save meta information know_name: 48
vocab_size:  67



In [5]:
# Save the tokenizer (with the vocabulary set above) into output_dir
tokenizer.save_pretrained(output_dir)

## 3. 训练QuesNet

In [6]:
# Custom training parameters for the (commented-out) pretrain_quesnet call below
train_params = dict(
    # train params
    n_epochs=1,
    batch_size=1,
    lr=1e-3,
    save_every=1,
    log_steps=10,
    # device='cpu',
    max_steps=2,
    # model params
    emb_size=256,
    feat_size=256,
)

# Training is currently supported on Linux only
# pretrain_quesnet(os.path.join(os.path.abspath(data_dir), 'quesnet_data.json'),
#                  output_dir, tokenizer, True, train_params)

## 4. 使用模型

In [7]:
# Directory the tokenizer and model are loaded from in the cells below.
# NOTE(review): with the pretrain_quesnet call commented out and the tokenizer
# saved to output_dir itself, nothing earlier in this notebook visibly creates
# this sub-directory; the only visible producer is the get_pretrained_i2v
# download in section 4.3 -- confirm it exists before running the next cells.
pretrain_dir = os.path.join(output_dir, "quesnet_test_256")

### 4.1 使用训练好的QuesNet Tokenizer

In [8]:
# Load the saved tokenizer from pretrain_dir, using the same image folder as before
tokenizer = QuesNetTokenizer.from_pretrained(
    pretrain_dir,
    img_dir=os.path.join(data_dir, "quesnet_img"),
)

In [9]:
# tokenize
# A single item can be processed ...
single_tokens = tokenizer.tokenize(raw_data[0], key=lambda x: x['ques_content'])
print(single_tokens)
print()
# ... and so can a list of items
batch_tokens = tokenizer.tokenize(raw_data[:5], key=lambda x: x['ques_content'])
print(batch_tokens)

print()

# Convert tokens to indices by calling the tokenizer directly
single_encoded = tokenizer(
    raw_data[0],
    key=lambda x: x['ques_content'],
    return_text=True,
    padding=True,
)
print(single_encoded)
print()
batch_encoded = tokenizer(
    raw_data[:3],
    key=lambda x: x['ques_content'],
    padding=True,
)
print(batch_encoded)

['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', '\\quad', 'A', '\\cap', 'B', '=']

[['已知', '集合', 'A', '=', '\\left', '\\{', 'x', '\\mid', 'x', '^', '{', '2', '}', '-', '3', 'x', '-', '4', '<', '0', '\\right', '\\}', ',', '\\quad', 'B', '=', '\\{', '-', '4', ',', '1', ',', '3', ',', '5', '\\}', ',', '\\quad', 'A', '\\cap', 'B', '='], ['复数', 'z', '=', '1', '+', '2', 'i', '+', 'i', '^', '{', '3', '}', '|', 'z', '|', '='], ['埃及', '胡夫', '金字塔', '古代', '世界', '建筑', '奇迹', '形状', '视为', '正四', '棱锥', '以该', '四', '棱锥', '高为', '边长', '正方形', '面积', '等于', '四', '棱锥', '侧面', '三角形', '面积', '侧面', '三角形', '底边', '高', '底面', '正方形', '边长', '比值'], ['设', 'O', '正方形', 'ABCD', '中心', 'O', ',', 'A', ',', 'B', ',', 'C', ',', 'D', '中任取', '3', '点', '取到', '3', '点', '共线', '概率'], ['某校', '课外', '学习', '小组', '研究', '作物', '发芽率', 'y', '温度', 'x', '单位', '^', '{', '\\circ', '}',

### 4.2 使用训练好的QuesNet模型

In [10]:
# The tokenizer configuration is read from the same directory as the model
tokenizer_kwargs = dict(tokenizer_config_dir=pretrain_dir)

# Build the QuesNet I2V object from pretrain_dir, running on the CPU
i2v = QuesNet(
    'quesnet',
    'quesnet',
    pretrain_dir,
    tokenizer_kwargs=tokenizer_kwargs,
    device="cpu",
)

In [11]:
# Representation of a single item: calling i2v returns (item vector, token vectors)
i_vec, t_vec = i2v(raw_data[0], key=lambda x: x["ques_content"])
print(i_vec.shape, t_vec.shape, sep="\n")
print()

# The token-level and item-level representations can also be requested separately
t_vec, i_vec = (
    i2v.infer_token_vector(raw_data[0], key=lambda x: x["ques_content"]),
    i2v.infer_item_vector(raw_data[0], key=lambda x: x["ques_content"]),
)
print(t_vec.shape, i_vec.shape, sep="\n")
print()

# Representations for a list of items
t_vec, i_vec = (
    i2v.infer_token_vector(raw_data[:2], key=lambda x: x["ques_content"]),
    i2v.infer_item_vector(raw_data[:2], key=lambda x: x["ques_content"]),
)
print(t_vec.shape, i_vec.shape, sep="\n")

torch.Size([1, 256])
torch.Size([1, 43, 256])

torch.Size([1, 43, 256])
torch.Size([1, 256])

torch.Size([2, 43, 256])
torch.Size([2, 256])


### 4.3 使用EduNLP中公开的预训练模型

In [12]:
# Fetch the publicly released pretrained model by name. Per the log output
# below, the archive is stored under model_dir and the download is skipped
# when the file already exists.
i2v = get_pretrained_i2v("quesnet_test_256", model_dir=output_dir)

EduNLP, INFO model_path: ..\..\examples\test_model\quesnet\quesnet_test_256
EduNLP, INFO Use pretrained t2v model quesnet_test_256
downloader, INFO http://base.ustc.edu.cn/data/model_zoo/modelhub/quesnet_pub/1/quesnet_test_256.zip is saved as ..\..\examples\test_model\quesnet\quesnet_test_256.zip
downloader, INFO file existed, skipped


In [13]:
# Usage is the same as for the I2V object built in section 4.2

# Representation of a single item: calling i2v returns (item vector, token vectors)
i_vec, t_vec = i2v(raw_data[0], key=lambda x: x["ques_content"])
print(i_vec.shape, t_vec.shape, sep="\n")
print()

# The token-level and item-level representations can also be requested separately
t_vec, i_vec = (
    i2v.infer_token_vector(raw_data[0], key=lambda x: x["ques_content"]),
    i2v.infer_item_vector(raw_data[0], key=lambda x: x["ques_content"]),
)
print(t_vec.shape, i_vec.shape, sep="\n")
print()

# Representations for a list of items
t_vec, i_vec = (
    i2v.infer_token_vector(raw_data[:2], key=lambda x: x["ques_content"]),
    i2v.infer_item_vector(raw_data[:2], key=lambda x: x["ques_content"]),
)
print(t_vec.shape, i_vec.shape, sep="\n")

torch.Size([1, 256])
torch.Size([1, 43, 256])

torch.Size([1, 43, 256])
torch.Size([1, 256])

torch.Size([2, 43, 256])
torch.Size([2, 256])
