# 词向量探索之旅

### 环境准备

确保已将 `histwords` 目录加入 `PYTHONPATH`，并已安装依赖。


In [1]:
import sys
sys.path.append('/root/workspace/MicroDistanc-Word2Vec/histwords')


## 1. 从斯坦福大学HistWords项目中获取词向量

In [2]:
import pickle
import numpy as np

# 加载词表
with open('/root/workspace/MicroDistanc-Word2Vec/Chinese_sgns_basic/1990-vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

# 加载词向量
vecs = np.load('/root/workspace/MicroDistanc-Word2Vec/Chinese_sgns_basic/1990-w.npy')

# 获取某个词的向量
words = ['病毒', '电脑', '疾病', '计算机']
for word in words:
    if word in vocab:
        idx = vocab.index(word)
        vector = vecs[idx]
        print(vector.shape) # 输出向量维度
    else:
        print('词不在词表中')
        


(300,)
(300,)
(300,)
(300,)


In [3]:
from itertools import combinations
from numpy.linalg import norm

# 获取所有在词表中的词及其向量
word_vecs = {}
for word in words:
    if word in vocab:
        idx = vocab.index(word)
        word_vecs[word] = vecs[idx]

# 计算两两余弦相似度
def cosine_similarity(a, b):
    return np.dot(a, b) / (norm(a) * norm(b))

for w1, w2 in combinations(word_vecs.keys(), 2):
    sim = cosine_similarity(word_vecs[w1], word_vecs[w2])
    print(f"{w1} - {w2} 的余弦相似度: {sim:.4f}")


病毒 - 电脑 的余弦相似度: 0.2628
病毒 - 疾病 的余弦相似度: 0.3643
病毒 - 计算机 的余弦相似度: 0.3050
电脑 - 疾病 的余弦相似度: 0.1756
电脑 - 计算机 的余弦相似度: 0.4383
疾病 - 计算机 的余弦相似度: 0.1918


## 调用HistWords提供的API端口

In [4]:
from representations.sequentialembedding import SequentialEmbedding
from representations.embedding import Embedding
# SequentialEmbedding是Word2Vec（SGNS）词向量
# Embedding是SVD词向量

# 加载词向量：单一时间点
embedding = Embedding.load('/root/workspace/MicroDistanc-Word2Vec/Chinese_sgns_basic/1990')
# 获取词向量
vector = embedding.represent('病毒')
print(vector.shape)

(300,)


In [5]:
neighbors = embedding.closest('学者', n=5)
for score, word in neighbors:
    print(word, score)

学者 0.9999999999999999
专家 0.6019439454234679
名望 0.5160167174959029
业内 0.49601799256692675
与会 0.45654864865628536


## 历史模型

In [6]:
### 3.1 加载历史词向量序列

from representations.sequentialembedding import SequentialEmbedding
years = range(1950, 2000, 10)
semb = SequentialEmbedding.load('/root/workspace/MicroDistanc-Word2Vec/chi-sim-all/sgns', years)

In [7]:
### 3.2 获取某一年份的 Embedding

embed_1990 = semb.get_embed(1990)

In [17]:
### 3.3 获取某个词在各年份的向量

for year in years:
    vec = semb.get_embed(year).represent('主义')
    print(year, vec[:5])

1950 [-0.04487927  0.02573186 -0.00947995 -0.03877381  0.03205856]
1960 [ 0.00161559  0.04229061 -0.0445441  -0.02257752  0.06220378]
1970 [-0.00346575  0.0531096  -0.09491494 -0.03063736  0.01986805]
1980 [-0.00850022 -0.02292808 -0.06003177 -0.04335207  0.04974374]
1990 [-0.01324701  0.0365957   0.01875553 -0.0736503   0.09564648]


In [9]:
### 3.4 计算某两个词随时间的相似度变化

time_sims = semb.get_time_sims('学术', '反动')
for year, sim in time_sims.items():
    print(year, sim)

1950 0.4552571961323729
1960 0.4420168275879869
1970 0.1762702730146145
1980 0.12860420820157537
1990 0.1581925143538493


In [10]:
### 3.5 获取某个词在所有年份的邻居集合

neigh_set = semb.get_seq_neighbour_set('反动', n=2)
print(neigh_set)

{'资产', '腐朽', '流毒', '残酷', '猖狂', '反动'}


## 读取词性

In [11]:
import pickle

pos_file = '/root/workspace/MicroDistanc-Word2Vec/chi-sim-all/pos/1990-pos.pkl'
with open(pos_file, 'rb') as f:
    pos_dict = pickle.load(f)

# 示例：提取指定词的词性
target_words = ['文人', '作家', '反动']
for word in target_words:
    print(f"{word}: {pos_dict.get(word, '未知')}")

文人: NOUN
作家: NOUN
反动: ADJ


### 词性标签表

| 类别       | NOUN | VERB | ADV | ADJ | PRT           | NUM |
|------------|------|------|-----|------|---------|-----|
| 含义 | 名词 | 动词 | 副词 | 形容词 | 小品词| 数词 |



In [12]:
# 按照词性筛选邻居

def get_neighbor_with_pos(embed: Embedding, pos_dict: dict, target_word: str, target_pos: str, n=10):
    """
    获取与 target_word 最相近且词性为 target_pos 的前 n 个词。
    """
    # 获取所有邻居及分数
    neighbors = embed.closest(target_word, n=n*5)  # 先多取一些，防止词性过滤后不够
    filtered = []
    for score, word in neighbors:
        if pos_dict.get(word) == target_pos:
            filtered.append((score, word))
        if len(filtered) >= n:
            break
    return filtered

# 示例用法：
result = get_neighbor_with_pos(embedding, pos_dict, '反动', 'NOUN', n=5)
for score, word in result:
    print(word, score)
    


流毒 0.4868973151790801
统治 0.469506656018562
反动派 0.4544083060623153
山头 0.4390510906423339
势力 0.4244963574685242


In [13]:
# 获取历年的邻居集合（不筛选词性）

semb.get_seq_closest_by_year("钱", n=5)

{1950: [(0.9999999999999998, '钱'),
  (0.9038659420271383, '花'),
  (0.835149942616815, '人家'),
  (0.8337984704298391, '买'),
  (0.8294886079513678, '吃')],
 1960: [(1.0, '钱'),
  (0.7650870282192708, '花'),
  (0.7129815339097929, '一声'),
  (0.6971985912410859, '口气'),
  (0.6760362568782627, '这儿')],
 1970: [(0.9999999999999993, '钱'),
  (0.5850082522695432, '买'),
  (0.5524397898584145, '笔'),
  (0.5367441157010124, '花'),
  (0.5156797911524045, '力气')],
 1980: [(1.0, '钱'),
  (0.5349853096138412, '赚'),
  (0.5055068782570069, '花'),
  (0.47393366638954987, '袋'),
  (0.471501573268131, '琛')],
 1990: [(1.0, '钱'),
  (0.4587084023580492, '袋'),
  (0.45845257084407204, '掏'),
  (0.44759882378635524, '赚'),
  (0.4351914162230578, '抽屉')]}

In [14]:
def get_seq_neighbor_with_pos(seqembed: SequentialEmbedding, pos_dict: dict, target_word: str, target_pos: str, n=10):
    """
    获取目标词在每个年份中，最相近且词性为 target_pos 的前 n 个词。
    返回格式：{year: [(score, word), ...], ...}
    """
    result = {}
    for year, embed in seqembed.embeds.items():
        neighbors = embed.closest(target_word, n=n*100)  # 先多取一些
        filtered = []
        for score, word in neighbors:
            if pos_dict.get(word) == target_pos and word != target_word:
                filtered.append((score, word))
            if len(filtered) >= n:
                break
        result[year] = filtered
    return result

In [15]:
# 示例用法
get_seq_neighbor_with_pos(semb, pos_dict, "钱", "ADJ", n=5)

{1950: [(0.7484916568501956, '年轻'),
  (0.698449745027883, '女'),
  (0.5980052306407435, '亲爱'),
  (0.5614793955077362, '违法'),
  (0.5436568614143564, '间接')],
 1960: [(0.5316574658520359, '女'),
  (0.5078091283696169, '年轻'),
  (0.4924626804071737, '相关'),
  (0.4663310115925252, '亲爱'),
  (0.4641558590600444, '有限')],
 1970: [(0.4098309110734928, '男'),
  (0.3945802598299771, '女'),
  (0.3754188716391379, '年轻'),
  (0.3684000505355881, '优质'),
  (0.35792361677380874, '可怜')],
 1980: [(0.3748411423736513, '巨额')],
 1990: [(0.2683090578396913, '余下')]}

## 读取词频（如果你需要的话）

In [None]:
counts_file = '/root/workspace/MicroDistanc-Word2Vec/chi-sim-all/counts/1990-counts.pkl'
with open(counts_file, 'rb') as f:
    counts_dict = pickle.load(f, encoding='latin1') # 考虑到是python2时代保存的pickle文件，应该用latin1读取避免历史遗留问题

# 示例：提取指定词的词频
target_words = ['文人', '作家', '反动']
for word in target_words:
    print(f"{word}: {counts_dict.get(word, '未知')}")
    
# 如果你需要读取多个年代的词频，可自行编写相关代码

文人: 2076.0
作家: 210922.5
反动: 44264.5


## 加载数据集代码总结

In [None]:
# 如果读取单个时间点的数据：

# from representations.embedding import Embedding
# from representations.sequentialembedding import SequentialEmbedding

# embedding = Embedding.load('这里放你的路径')


# 如果读取多个时间点的数据：

# years = range(1950, 2000, 10) # 这里放年份列表
# semb = SequentialEmbedding.load('这里放你的文件夹路径', years)

## 英文数据集的加载也同理

In [1]:
from representations.sequentialembedding import SequentialEmbedding
from representations.embedding import Embedding

years = range(1800,2000,10)
seq_embedding_eng = SequentialEmbedding.load('/root/workspace/MicroDistanc-Word2Vec/eng-all/sgns',years)
# 加载比较慢，要忍一下

: 