### Load the GPT-2 model

In [None]:
import torch
import torch.nn as nn
from GPT2 import GPT2Model, GPT2Tokenizer
import os
# Uncomment the next line to hide every GPU from this process:
# os.environ['CUDA_VISIBLE_DEVICES'] = ''
# Use the GPU when one is usable, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


def tokenize_input(inputStr, tokenizer, seq_length=1024):
    """Encode a string and right-pad it to a fixed-length batch of one.

    Args:
        inputStr: Text to encode.
        tokenizer: Object exposing ``encode(text)`` (returns a sequence of
            token ids) and an ``encoder`` mapping that contains ``'<pad>'``.
        seq_length: Length of the padded sequence that is returned.

    Returns:
        A ``(tokens, [token_length])`` pair: ``tokens`` is a ``torch.long``
        tensor of shape ``(1, seq_length)`` and ``token_length`` is the number
        of non-padding ids at its start.
    """
    pad_id = tokenizer.encoder['<pad>']
    # At most ``seq_length - 20`` ids are kept, so at least 20 trailing
    # positions are padding (callers in this file index position
    # ``token_length``). Clamp at zero: a negative slice bound would trim ids
    # from the end instead of limiting the length.
    max_tokens = max(seq_length - 20, 0)
    tokens = list(tokenizer.encode(inputStr)[:max_tokens])
    token_length = len(tokens)
    tokens.extend([pad_id] * (seq_length - token_length))
    tokens = torch.tensor(tokens, dtype=torch.long)
    # Reshape to the requested length rather than a hard-coded 1024.
    return tokens.reshape(1, seq_length), [token_length]

# Tokenizer built from the vocabulary files under GPT2/bpe.
tokenizer = GPT2Tokenizer('GPT2/bpe/vocab.json', 'GPT2/bpe/chinese_vocab.model', max_len=512)

# Model hyper-parameters; all dropout rates are set to zero.
model_config = dict(
    vocab_size=30000,
    layer_size=12,
    block_size=1024,
    embedding_dropout=0.0,
    embedding_size=768,
    num_attention_heads=12,
    attention_dropout=0.0,
    residual_dropout=0.0,
)
model = GPT2Model(**model_config)

# Read the checkpoint onto the CPU first; the model is moved to `device` below.
state_dict = torch.load('../models/model_pretrain_distill.pth', map_location='cpu')
model.load_state_dict(state_dict)

# Switch to evaluation mode, then move the weights to the target device.
model.eval()
model.to(device)

print('loaded success')

### Compute a vector for every document

In [None]:
import numpy as np
from tqdm import tqdm

# Load the stock records. The file is a pickle, so allow_pickle=True is needed
# for np.load to deserialize it. Later cells read the 'ticker_id',
# 'ticker_name', 'baike_summary' and 'baike_content' keys of each item.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
data = np.load('./data/eastmoney_full_stocks_list_nlu_tencent.pkl', allow_pickle=True)

In [None]:
# Map each ticker id to its text: the summary followed by the full content.
info_dict = {
    record['ticker_id']: record['baike_summary'] + record['baike_content']
    for record in data
}

In [None]:
# NOTE: ``all`` shadows the builtin of the same name. The name is kept because
# the cell that pickles the results reads exactly this variable.
all = []
# (index, error) pairs for the documents that could not be vectorised.
skipped = []

# Inference only: no_grad() stops autograd from recording a graph for every
# forward pass, which would otherwise cost memory for nothing.
with torch.no_grad():
    for idx, record in enumerate(tqdm(data)):
        try:
            content = record['baike_content'] + record['baike_summary']
            tokens, token_length = tokenize_input(content, tokenizer, seq_length=1024)
            output = model(tokens.to(device))
            # Use the model output at position ``token_length`` (the first
            # padding position) as the document vector.
            vector = output[0, token_length[0]].detach().cpu().numpy()

            one = {
                'ticker_id': record['ticker_id'],
                'ticker_name': record['ticker_name'],
                'vector': vector
            }
        except Exception as exc:
            # Best effort: keep going, but remember what failed instead of
            # silently dropping the document.
            skipped.append((idx, repr(exc)))
            continue
        all.append(one)

if skipped:
    print('skipped', len(skipped), 'of', len(data), 'documents; first failure:', skipped[0])

In [None]:
import pickle

# Persist the collected records (ticker id, ticker name, vector) as a pickle.
with open('./data_ignore/vectors_gpt2.pkl', mode='wb') as out_file:
    pickle.dump(all, out_file)

### Query by cosine similarity against the vector matrix

In [None]:
import numpy as np
from tqdm import tqdm

# Reload the pickled records written to this same path by the cell above; each
# one holds 'ticker_id', 'ticker_name' and 'vector'.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
data = np.load('./data_ignore/vectors_gpt2.pkl', allow_pickle=True)

In [None]:
# Display the first record to check what was loaded.
data[0]

In [None]:
# Parallel lists: position i of each list describes the same ticker.
ticker_names, ticker_ids, vectors = [], [], []

for record in tqdm(data):
    # Records without a vector are left out of all three lists.
    if 'vector' not in record:
        continue
    vectors.append(record['vector'])
    ticker_ids.append(record['ticker_id'])
    ticker_names.append(record['ticker_name'])


In [None]:
# Join the collected per-ticker vectors along a new first axis, giving one
# array with one row per ticker (row i matches ticker_names[i]).
vectors = np.stack(vectors)

In [None]:
def similarity_vector_matrix(arr, brr):
    """Return the cosine similarity between one vector and every row of a matrix.

    Args:
        arr: Query vector of shape ``(d,)``; its norm is taken over all of its
            elements.
        brr: Matrix of shape ``(n, d)``; one candidate vector per row.

    Returns:
        Array of ``n`` similarities. A pair in which either vector has zero
        norm scores ``0`` instead of ``NaN``/``inf``, because ``NaN`` entries
        end up first when the result is ranked with ``np.argsort(...)[::-1]``.
    """
    arr = np.asarray(arr)
    brr = np.asarray(brr)
    dots = arr.dot(brr.T)
    norms = np.sqrt(np.sum(arr * arr)) * np.sqrt(np.sum(brr * brr, axis=1))
    nonzero = norms > 0
    # Divide by 1 where the norm is zero to avoid 0/0, then mask those entries.
    safe_norms = np.where(nonzero, norms, 1.0)
    return np.where(nonzero, dots / safe_norms, 0.0)

In [None]:
def stock_search(query, topk=10):
    """Print the names of the tickers whose vectors best match a text query.

    The query is encoded with the same tokenizer, model and output position
    as the stored document vectors, compared with every row of the global
    ``vectors`` matrix, and the ``topk`` best ``ticker_names`` are printed in
    descending order of similarity.

    Args:
        query: Free-text query, e.g. ``'医疗保险，重大疾病保障'``.
        topk: Number of ticker names to print.
    """
    tokens, token_length = tokenize_input(query, tokenizer, seq_length=1024)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        output = model(tokens.to(device))
    vector = output[0, token_length[0]].detach().cpu().numpy()

    res = similarity_vector_matrix(vector, vectors)
    # argsort is ascending, so reverse it to put the best match first.
    topk_idxs = np.argsort(res)[::-1][:topk]

    names = [ticker_names[idx] for idx in topk_idxs]
    print(names)

- 自动驾驶，新能源汽车
- 电影，电视剧，文化艺术
- 啤酒，烧烤，朋友聚会
- 医疗保险，重大疾病保障
- 新冠肺炎

In [None]:
# Example query: autonomous driving, new-energy vehicles.
query = '自动驾驶，新能源汽车'
stock_search(query)

In [None]:
# Example query: films, TV series, culture and the arts.
query = '电影，电视剧，文化艺术'
stock_search(query)

In [None]:
# Example query: beer, barbecue, gatherings with friends.
query = '啤酒，烧烤，朋友聚会'
stock_search(query)

In [None]:
# Example query: medical insurance, critical-illness cover.
query = '医疗保险，重大疾病保障'
stock_search(query)

In [None]:
# Example query: COVID-19.
query = '新冠肺炎'
stock_search(query)