In [1]:
import torch
import torch.nn as nn
from GPT2 import GPT2Model, GPT2Tokenizer
import os
# os.environ['CUDA_VISIBLE_DEVICES'] = ''
# Use the GPU when one is visible to PyTorch; otherwise fall back to the CPU so
# the notebook still runs on GPU-less machines instead of failing in `.to(device)`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


def tokenize_input(inputStr, tokenizer, seq_length=1024):
    """Encode a string and right-pad it to a fixed-length batch of one.

    Args:
        inputStr: Text to encode.
        tokenizer: Object exposing ``encode(str)`` (returning a list of token
            ids) and an ``encoder`` mapping that contains the ``'<pad>'`` key.
        seq_length: Total length of the padded sequence.

    Returns:
        A tuple ``(tokens, lengths)`` where ``tokens`` is a ``torch.long``
        tensor of shape ``(1, seq_length)`` and ``lengths`` is a one-element
        list holding the number of real (non-pad) tokens.
    """
    pad_id = tokenizer.encoder['<pad>']

    # The text is truncated to leave 20 trailing pad positions (the reason for
    # exactly 20 is not visible here). Clamp at zero so a small seq_length does
    # not turn into a negative slice bound that trims from the wrong end.
    max_tokens = max(seq_length - 20, 0)
    tokens = list(tokenizer.encode(inputStr)[:max_tokens])
    token_length = len(tokens)

    padded = tokens + [pad_id] * (seq_length - token_length)

    # Reshape with seq_length rather than a hard-coded 1024 so that callers can
    # actually use the seq_length parameter.
    batch = torch.tensor(padded, dtype=torch.long).reshape(1, seq_length)
    return batch, [token_length]

# Build the tokenizer from the vocabulary JSON and the Chinese vocab model file.
# NOTE(review): max_len=512 here while tokenize_input pads to 1024 by default —
# confirm whether GPT2Tokenizer enforces max_len during encode().
tokenizer = GPT2Tokenizer(
    'GPT2/bpe/vocab.json',
    'GPT2/bpe/chinese_vocab.model',
    max_len=512)
    
class LayerNorm(nn.Module):
    r"""
    Layer normalization over the last dimension with a learnable scale and shift.

    The input is centered and divided by the square root of its (biased)
    variance plus ``eps``, then multiplied by ``weight`` and offset by ``bias``.

    NOTE(review): the checkpoint used by this notebook appears to be a fully
    pickled module, so the attribute names (``weight``, ``bias``,
    ``variance_epsilon``) should not be renamed — confirm before refactoring.
    """

    def __init__(self, hidden_size, eps=1e-5):
        super().__init__()
        self.variance_epsilon = eps
        # Scale starts at one and shift at zero, i.e. an identity affine map.
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))

    def forward(self, x):
        """Normalize ``x`` along its last dimension and apply the affine map."""
        centered = x - x.mean(-1, keepdim=True)
        variance = centered.pow(2).mean(-1, keepdim=True)
        normalized = centered / torch.sqrt(variance + self.variance_epsilon)
        return self.weight * normalized + self.bias


class MLP(nn.Module):
    """Single projection layer: linear -> layer normalization -> ReLU.

    NOTE(review): the checkpoint used by this notebook appears to be a fully
    pickled module, so the attribute names (``linear``, ``layer_norm``,
    ``relu``) should not be renamed — confirm before refactoring.
    """

    def __init__(self, n_in, n_out):
        super().__init__()
        self.linear = nn.Linear(n_in, n_out)
        self.layer_norm = LayerNorm(n_out)
        # In-place ReLU overwrites the layer-norm output instead of allocating.
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Project ``x`` from ``n_in`` to ``n_out`` features and rectify it."""
        return self.relu(self.layer_norm(self.linear(x)))


class GPT2_SIMILARITY(nn.Module):
    """GPT-2 backbone followed by an MLP head that yields a 256-d vector.

    For every sample the backbone output at one sequence position is selected
    and passed through ``self.mlp`` (``MLP(30000, 256)``).

    NOTE(review): the checkpoint used by this notebook appears to be a fully
    pickled module, so the attribute names (``GPT2model``, ``mlp``) should not
    be renamed — confirm before refactoring.
    """

    def __init__(self):
        super().__init__()

        self.GPT2model = GPT2Model(
            vocab_size=30000,
            layer_size=12,
            block_size=1024,
            embedding_dropout=0.0,
            embedding_size=768,
            num_attention_heads=12,
            attention_dropout=0.0,
            residual_dropout=0.0
        )

        # Head input size equals the backbone's vocab_size (30000).
        self.mlp = MLP(30000, 256)

    def forward(self, x, length):
        """Return the MLP-projected backbone output at ``length[i]`` per sample.

        Args:
            x: Token-id batch fed to the backbone.
            length: Per-sample position to read; ``length[i]`` indexes the
                second dimension of the backbone output for sample ``i``.
                In this notebook it is the token count from ``tokenize_input``,
                i.e. the index of the first padding position.

        Returns:
            The stacked, flattened per-sample vectors after ``self.mlp``.
        """
        outputs = self.GPT2model(x)
        selected = [outputs[row, pos].view(-1) for row, pos in enumerate(length)]
        return self.mlp(torch.stack(selected))

    def get_vector_0(self, x, length):
        """Return the raw backbone output of sample 0 at position ``length[0]``.

        Unlike ``forward``, the MLP head is not applied.
        """
        outputs = self.GPT2model(x)
        return outputs[0, length[0]]


# Load the saved model object onto the CPU first; it is moved to `device` below.
# NOTE(review): the loaded object is used directly as a module (.eval()/.to()),
# so the file presumably holds a fully pickled nn.Module. That requires the
# LayerNorm / MLP / GPT2_SIMILARITY classes above to be defined beforehand, and
# unpickling executes arbitrary code — only load checkpoints from a trusted source.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which rejects fully pickled modules — confirm against the installed version.
model = torch.load('../models/similarity_0.pth', map_location='cpu')

# Switch to inference mode before moving the model to the target device.
model.eval()
model.to(device)

print('loaded success')

loaded success


In [61]:
def is_chinese(string):
    """
    Check whether the string contains at least one Chinese character.

    A character counts as Chinese when its code point lies in the inclusive
    range U+4E00..U+9FA5. Code points from U+9FA6 up to U+9FFF are not matched.

    :param string: the string to check
    :return: bool, True if any character is in the range, otherwise False
    """
    return any(u'\u4e00' <= ch <= u'\u9fa5' for ch in string)

In [73]:
# Inspect which vocabulary entries score highest for one company's description.
# NOTE: this cell relies on `data` and `np`, which are created by the
# data-loading cell further down; run that cell first on a fresh kernel.

# Named `text` rather than `input` so the builtin input() is not shadowed.
text = '。'.join(data.iloc[11]['info'].split('\n')[1:3])

print(len(text))
print(text)

tokens, token_length = tokenize_input(text, tokenizer, seq_length=1024)
# Inference only: skip building the autograd graph.
with torch.no_grad():
    output = model.get_vector_0(tokens.to(device), token_length)
vector = output.detach().cpu().numpy()

# Vocabulary ids sorted by descending score (a full ranking, not just a top-k).
topk = np.argsort(-vector)

# Decode each id once, then keep only the entries containing Chinese characters.
decoded = (tokenizer.decode(i) for i in topk.tolist())
keywords = [word for word in decoded if is_chinese(word)]

keywords[0:20]

364
要点二:经营范围 生物技术开发、技术咨询、技术转让、技术服务;经济信息咨询(不含行政许可的项目);工程招标及代理;货物进出口、技术进出口、代理进出口;销售金属材料、化工产品(不含一类易制毒品及危险化学品)、玻璃容器;会议服务;承办展览展示活动;设计、制作、代理、发布广告;生产重组蛋白;生产培养基、填料;生产生物试剂盒。。要点三:重组蛋白等关键生物试剂产品及技术服务 公司是一家专业提供重组蛋白等关键生物试剂产品及技术服务的高新技术企业,助力全球生物医药公司、生物科技公司和科研机构等进行生物药、细胞免疫治疗及诊断试剂的研发与生产。公司主要产品及服务应用于肿瘤、自身免疫疾病、心血管病、传染病等疾病的药物早期发现及验证、药物筛选及优化、诊断试剂开发及优化、临床前实验及临床试验、药物生产过程及工艺控制(CMC)等研发及生产环节。


['口服',
 '肝',
 '抗',
 '异',
 '毒',
 '有效',
 '复',
 '对',
 '类',
 '糖',
 '单',
 '胶囊',
 '酸',
 '红',
 '滴',
 '多',
 '苦',
 '双',
 '和',
 '过敏']

In [2]:
import numpy as np
from tqdm import tqdm
import pandas as pd


# Load the stock list; later cells read the columns 'stockName', 'stockCode',
# 'indvInduName', 'indvInduCode' and 'info' from this frame.
data = pd.read_csv('../data/stock_list_info.csv')

In [4]:
# Preview the first rows to sanity-check the loaded columns.
data.head()

Unnamed: 0,stockName,stockCode,indvInduCode,indvInduName,info
0,明阳智能,601615,1032.0,风电设备,要点一:所属板块 风电设备 广东板块 标准普尔 富时罗素 MSCI中国 沪股通 上证380 ...
1,帅丰电器,605336,456.0,家电行业,要点一:所属板块 家电行业 浙江板块\n要点二:经营范围 制造、销售:集成灶、吸排油烟机、燃...
2,兆易创新,603986,1036.0,半导体,要点一:所属板块 半导体 北京板块 百元股 标准普尔 富时罗素 MSCI中国 沪股通 上证1...
3,鼎龙股份,300054,1039.0,电子化学品,要点一:所属板块 电子化学品 湖北板块 富时罗素 创业板综 深股通 融资融券 预盈预增 深成...
4,拓普集团,601689,481.0,汽车零部件,要点一:所属板块 汽车零部件 浙江板块 标准普尔 富时罗素 MSCI中国 沪股通 中证500...


In [4]:
# Compute one similarity vector per stock and collect the records in a list.
# NOTE(review): `all` shadows the builtin all(); the name is kept because the
# save cell below reads it. Rename both together if this is refactored.
all = []
# Inference only: disabling autograd avoids building a graph for every forward
# pass across the whole dataset.
with torch.no_grad():
    for i in tqdm(range(len(data))):
        item = data.iloc[i]
        # Lines 2 and 3 of the info text (key point 2 and key point 3).
        info = '。'.join(item['info'].split('\n')[1:3])
        tokens, token_length = tokenize_input(info, tokenizer, seq_length=1024)
        output = model(tokens.to(device), token_length)
        # The batch holds a single sample, so take row 0 of the model output.
        vector = output[0].detach().cpu().numpy()

        all.append(
            {
                'stockName': item['stockName'],
                # Left-pad with zeros to a 6-character code.
                'stockCode': str(item['stockCode']).zfill(6),
                'indvInduName': item['indvInduName'],
                # NOTE(review): int() raises on NaN — assumes every row has a code.
                'indvInduCode': int(item['indvInduCode']),
                'info': info,
                'vector': vector,
            }
        )

100%|██████████| 2158/2158 [04:44<00:00,  7.58it/s]


In [5]:
import pickle

# Persist the list of per-stock records (including the numpy vectors) as a pickle.
output_path = './data_ignore/vectors_siamese.pkl'
# Create the target directory if needed, so a missing folder does not discard
# the vectors computed by the previous cell. `os` is imported in the first cell.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'wb') as f:
    pickle.dump(all, f)