In [1]:
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset
import pandas as pd
import random
import torch
from GPT2 import Transformer
import torch.nn as nn

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
device = 'cuda' # cuda or cpu

In [9]:
class GPT2_word2vec(nn.Module):
    def __init__(self,
                 vocab_size,
                 layer_size,
                 block_size,
                 embedding_dropout,
                 embedding_size,
                 num_attention_heads,
                 attention_dropout,
                 residual_dropout):
        super(GPT2_word2vec, self).__init__()
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_size)
        self.position_embeddings = nn.Embedding(block_size, embedding_size)
        self.emb_drop = nn.Dropout(embedding_dropout)
        self.transformer = Transformer(
            layer_size,
            embedding_size, 
            num_attention_heads,
            attention_dropout,
            residual_dropout)

    def forward(self, x, kv_cache=None, use_cache=False):
        # position_ids 和外面计算的一样，
        if kv_cache is None:
            past_length = 0
        else:
            past_length = kv_cache[0][0].shape[-2]

        position_ids = torch.arange(past_length, x.shape[-1] + past_length, dtype=torch.int64, device=x.device)
        position_ids = position_ids.unsqueeze(0).expand_as(x)
        
        x = self.word_embeddings(x)  # input ids
        x = self.emb_drop(x + self.position_embeddings(position_ids))  # position
        
        x, cached_kvs = self.transformer(x, kv_cache)  # kv_cache 是 attention mask
        
        if use_cache:
            return x, cached_kvs
        return x

In [7]:
def load_data(data_path):
    data = pd.read_csv(data_path)
    all = []
    for i in range(len(data)):
        item = data.iloc[i]
        words = item['info'].split('\n')[0][9:].split(' ')
        if len(words) > 1:
            all.append(
                {
                    'stockName': item['stockName'],
                    'stockCode': str(item['stockCode']).zfill(6),
                    'indvInduName': item['indvInduName'],
                    'indvInduCode': int(item['indvInduCode']),
                    'words': words
                }
            )
    return all
    

class PreprocessDataset():
    
    def __init__(self, data_list):
        all = data_list
        dict_id2name = {}
        dict_name2id = {}
        dict_id2words = {}
        dict_name2words = {}
        words = []
        for item in all:
            dict_id2name[item['stockCode']] = item['stockName']
            dict_name2id[item['stockName']] = item['stockCode']
            dict_id2words[item['stockCode']] = item['words']
            dict_name2words[item['stockName']] = item['words']
            words.extend(item['words'])

        words = list(set(words))
        for item in all:
            words.append(item['stockName'])

        dict_encoder = {}
        dict_decoder = {}

        for i, word in enumerate(words):
            dict_encoder[word] = i
            dict_decoder[i] = word

        self.dict_id2name = dict_id2name
        self.dict_name2id = dict_name2id
        self.dict_id2words = dict_id2words
        self.dict_name2words = dict_name2words
        self.words = words

In [8]:
data_list = load_data('../data/stock_list_info.csv')
dataset = PreprocessDataset(data_list)

In [13]:
dataset.words

['中药',
 '航运港口',
 '浙江板块',
 '云南板块',
 '消毒剂',
 '氢能源',
 '水利建设',
 '股权激励',
 '光伏设备',
 '能源金属',
 '非金属材料',
 'CAR-T细胞疗法',
 '商业百货',
 '退税商店',
 '流感',
 '基本金属',
 '人造肉',
 '被动元件',
 '文化传媒',
 '换电概念',
 '医药商业',
 '船舶制造',
 '杭州亚运会',
 '统一大市场',
 '智能机器',
 '百元股',
 '汽车零部件',
 '天津板块',
 '体育产业',
 '工业互联',
 '车联网',
 '券商概念',
 'MSCI中国',
 'MLCC',
 '中证500',
 '中字头',
 '创业板综',
 '电子烟',
 '西藏板块',
 '无线耳机',
 '进口博览',
 '无线充电',
 'IPO受益',
 '电网设备',
 '宠物经济',
 'DRG/DIP',
 '陕西板块',
 '国产芯片',
 '抽水蓄能',
 '军工',
 '创业成份',
 '地热能',
 '工程机械',
 '医废处理',
 '上证380',
 '河南板块',
 '多元金融',
 'IGBT概念',
 '山西板块',
 '网络游戏',
 'ETC',
 '养老金',
 '茅指数',
 'GDR',
 '化学制品',
 '电子车牌',
 '包装材料',
 'EDR概念',
 '游戏',
 '海绵城市',
 '燃气',
 'HIT电池',
 '昨日连板_含一字',
 '中药概念',
 '铁路基建',
 '北京板块',
 '电商概念',
 '蚂蚁概念',
 '电源设备',
 '房地产服务',
 '核污染防治',
 '电机',
 '辅助生殖',
 '特斯拉',
 '大数据',
 'NFT概念',
 '消费电子',
 '医疗服务',
 '5G概念',
 '江西板块',
 '移动支付',
 '数字阅读',
 '壳资源',
 '海洋经济',
 '电子身份证',
 '万达概念',
 '手游概念',
 '国资云概念',
 '北交所概念',
 '土壤修复',
 '光学光电子',
 '专用设备',
 '环氧丙烷',
 '稀土永磁',
 'HS300_',
 '央视50_',
 '深圳特区',
 '教育',
 '贵州板块',
 '刀片电池',
 '新疆

In [15]:
model = torch.load('../models/word2vec_37.pth', map_location='cpu')
model.eval()
model.cuda()
print('loaded success')

all = []
for i, word in enumerate(tqdm(dataset.words)):
    input = torch.tensor(i).cuda().reshape(1,1)
    output = model(input)
    vector = output[0][0].detach().cpu().numpy()
    try:
        keywords = dataset.dict_name2words[word]
    except Exception:
        keywords = []
    all.append(
        {
            'word' : word,
            'keywords': ','.join(keywords),
            'vector': vector,
        }
    )

loaded success


100%|██████████| 2634/2634 [00:26<00:00, 97.73it/s] 


In [16]:
import pickle

with open('./data_ignore/vectors_word2vec.pkl','wb') as f:
    pickle.dump(all, f)