In [1]:
import numpy as np
import pandas as pd
from pandas import read_excel

import gensim
from gensim import models
from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity


# 预训练的词向量语料库文件路径
tecent_embedding_path = (r'D:\embedding_zh\tencent-ailab-embedding-zh-d200-v0.2.0-s\tencent-ailab-embedding-zh-d200-v0.2.0-s.txt')

# 加载词向量文件
tencent_embedding = models.KeyedVectors.load_word2vec_format(tecent_embedding_path, binary=False)

In [27]:
from scipy.stats import pearsonr  # 导入 pearsonr 函数

In [10]:
df_APP = read_excel(r'data\SEV_wordlist.xlsx')

df_APP_positive = df_APP.loc[df_APP['Valence']=='Positive', 'Word'].to_frame()
APP_positive_seed_words = df_APP_positive.iloc[:,0].tolist()
df_APP_negative = df_APP.loc[df_APP['Valence']=='Negative', 'Word'].to_frame()
APP_negative_seed_words = df_APP_negative.iloc[:,0].tolist()

In [11]:
# 函数：获取词向量
def get_word_vectors(word_list, embedding_model):
    """
    获取词向量列表
    :param word_list: 要提取词向量的词汇列表
    :param embedding_model: 已加载的词向量模型
    :return: 包含词汇及其对应向量的字典
    """
    word_vectors = {}
    for word in word_list:
        if word in embedding_model:
            word_vectors[word] = embedding_model[word]
        else:
            print(f"Word '{word}' not found in embedding model.")
    return word_vectors

In [12]:
# 获取积极和消极种子词的词向量
APP_positive_vectors = get_word_vectors(APP_positive_seed_words, tencent_embedding)
APP_negative_vectors = get_word_vectors(APP_negative_seed_words, tencent_embedding)

Word '高挑儿' not found in embedding model.
Word '囚首垢面' not found in embedding model.
Word '慓悍' not found in embedding model.
Word '病病歪歪' not found in embedding model.


In [13]:
# 示例输出
print(f"Positive seed word vectors: {list(APP_positive_vectors.items())[:5]}")
print(f"Negative seed word vectors: {list(APP_negative_vectors.items())[:5]}")

Positive seed word vectors: [('美', array([-0.063265, -0.290808,  0.33485 ,  0.401058, -0.085513, -0.242435,
        0.038056,  0.347926, -0.118692,  0.113957, -0.024499,  0.091414,
       -0.007095, -0.124725,  0.036011,  0.058022, -0.227367,  0.214025,
       -0.2128  ,  0.063273,  0.241622,  0.22136 , -0.1249  , -0.048605,
       -0.139632, -0.204389, -0.074671,  0.311976, -0.223462, -0.020783,
       -0.003681, -0.036068, -0.140897,  0.243983,  0.490715, -0.316042,
        0.110241,  0.189536, -0.300561,  0.005994,  0.131898, -0.042457,
        0.151375,  0.050606,  0.145889,  0.134272, -0.02711 , -0.04741 ,
        0.031331,  0.211017,  0.170571,  0.328561, -0.109814,  0.080928,
       -0.320565,  0.157999,  0.093348, -0.206504,  0.30342 ,  0.490316,
       -0.069969,  0.176902, -0.337005, -0.427515,  0.152909,  0.448392,
        0.05524 ,  0.242363,  0.277323, -0.025935,  0.137497, -0.190485,
        0.555874,  0.384026, -0.390845,  0.026243,  0.210312,  0.162708,
        0.097209

In [22]:
# 提取词向量为二维数组
APP_positive_vectors = np.array(list(APP_positive_vectors.values()))
APP_negative_vectors = np.array(list(APP_negative_vectors.values()))

# 检查维度
print(f"Positive vectors shape: {APP_positive_vectors.shape}")
print(f"Negative vectors shape: {APP_negative_vectors.shape}")

Positive vectors shape: (157, 200)
Negative vectors shape: (95, 200)


In [17]:
External_df_APP = read_excel(r'APP_polarity_results.xlsx')

APP_positive_external_words = External_df_APP.loc[External_df_APP['Polarity']=='positive', 'Word'].tolist()
APP_negative_external_words = External_df_APP.loc[External_df_APP['Polarity']=='negative', 'Word'].tolist()

In [18]:
APP_positive_external_vectors = get_word_vectors(APP_positive_external_words, tencent_embedding)
APP_negative_external_vectors = get_word_vectors(APP_negative_external_words, tencent_embedding)

In [19]:
# 示例输出
print(f"Positive external word vectors: {list(APP_positive_external_vectors.items())[:5]}")
print(f"Negative external word vectors: {list(APP_negative_external_vectors.items())[:5]}")

Positive external word vectors: [('仙风道骨', array([ 0.216949,  0.248651, -0.139164,  0.144905,  0.222048, -0.342325,
        0.171736,  0.019075,  0.00168 ,  0.630787, -0.222363,  0.509805,
        0.034442, -0.53178 , -0.01322 , -0.132236,  0.350107,  0.159559,
       -0.12521 ,  0.124991,  0.200089,  0.267528, -0.004522,  0.099476,
       -0.124619, -0.240942,  0.685705,  0.510095, -0.172306,  0.356119,
       -0.397261,  0.176644, -0.296311, -0.017491,  0.003167,  0.76972 ,
        0.462957,  0.036609, -0.121001,  0.548066, -0.111989, -0.189734,
        0.037821, -0.200291, -0.11532 ,  0.559569,  0.474287, -0.036509,
       -0.256032,  0.171442, -0.467874, -0.098621,  0.050189,  0.114245,
        0.078036,  0.316675, -0.02796 ,  0.290333, -0.4083  ,  0.10135 ,
       -0.140622,  0.535116, -0.002902, -0.06789 ,  0.123552, -0.462062,
        0.426816, -0.043905,  0.120482,  0.350633,  0.286277, -0.719374,
        0.125254,  0.169549, -0.261571,  0.150737, -0.038593, -0.152229,
       -0

In [23]:
# 提取词向量为二维数组
APP_positive_external_vectors = np.array(list(APP_positive_external_vectors.values()))
APP_negative_external_vectors = np.array(list(APP_negative_external_vectors.values()))

# 检查维度
print(f"Positive vectors shape: {APP_positive_external_vectors.shape}")
print(f"Negative vectors shape: {APP_negative_external_vectors.shape}")

Positive vectors shape: (1088, 200)
Negative vectors shape: (829, 200)


In [25]:
# 计算每组词向量的平均值（中心词向量）
APP_mean_seed_positive = np.mean(APP_positive_vectors, axis=0)
APP_mean_seed_negative = np.mean(APP_negative_vectors, axis=0)
APP_mean_external_positive = np.mean(APP_positive_external_vectors, axis=0)
APP_mean_external_negative = np.mean(APP_negative_external_vectors, axis=0)


In [28]:
# 计算皮尔逊相关系数
positive_corr, _ = pearsonr(APP_mean_seed_positive, APP_mean_external_positive)
negative_corr, _ = pearsonr(APP_mean_seed_negative, APP_mean_external_negative)

# 打印输出结果
print(f"APP Positive correlation (seed vs expanded): {positive_corr:.4f}")
print(f"APP Negative correlation (seed vs expanded): {negative_corr:.4f}")


APP Positive correlation (seed vs expanded): 0.9694
APP Negative correlation (seed vs expanded): 0.9778


In [None]:
# from scipy.stats import pearsonr

# positive_corr, positive_p_value = pearsonr(APP_mean_seed_positive, APP_mean_external_positive)
# negative_corr, negative_p_value = pearsonr(APP_mean_seed_negative, APP_mean_external_negative)

# # 输出结果
# print(f"APP Positive correlation (seed vs expanded): {positive_corr:.4f}, P-value: {positive_p_value:.4f}")
# print(f"APP Negative correlation (seed vs expanded): {negative_corr:.4f}, P-value: {negative_p_value:.4f}")


APP Positive correlation (seed vs expanded): 0.9694, P-value: 0.0000
APP Negative correlation (seed vs expanded): 0.9778, P-value: 0.0000


### SES流程

In [30]:
df_SES = read_excel(r'data\SEV_wordlist.xlsx', sheet_name = 1)

df_SES_positive = df_SES.loc[df_SES['Valence']=='Positive', 'Word'].to_frame()
SES_positive_seed_words = df_SES_positive.iloc[:,0].tolist()
df_SES_negative = df_SES.loc[df_SES['Valence']=='Negative', 'Word'].to_frame()
SES_negative_seed_words = df_SES_negative.iloc[:,0].tolist()

SES_positive_vectors = get_word_vectors(SES_positive_seed_words, tencent_embedding)
SES_negative_vectors = get_word_vectors(SES_negative_seed_words, tencent_embedding)

SES_positive_vectors = np.array(list(SES_positive_vectors.values()))
SES_negative_vectors = np.array(list(SES_negative_vectors.values()))

Word '奢糜' not found in embedding model.


In [None]:
External_df_SES = read_excel(r'SES_polarity_results.xlsx')

SES_positive_external_words = External_df_SES.loc[External_df_SES['Polarity']=='positive', 'Word'].tolist()
SES_negative_external_words = External_df_SES.loc[External_df_SES['Polarity']=='negative', 'Word'].tolist()

SES_positive_external_vectors = get_word_vectors(SES_positive_external_words, tencent_embedding)
SES_negative_external_vectors = get_word_vectors(SES_negative_external_words, tencent_embedding)

SES_positive_external_vectors = np.array(list(SES_positive_external_vectors.values()))
SES_negative_external_vectors = np.array(list(SES_negative_external_vectors.values()))

In [32]:
# 计算每组词向量的平均值（中心词向量）
SES_mean_seed_positive = np.mean(SES_positive_vectors, axis=0)
SES_mean_seed_negative = np.mean(SES_negative_vectors, axis=0)
SES_mean_external_positive = np.mean(SES_positive_external_vectors, axis=0)
SES_mean_external_negative = np.mean(SES_negative_external_vectors, axis=0)

In [33]:
# 计算皮尔逊相关系数
positive_corr, _ = pearsonr(SES_mean_seed_positive, SES_mean_external_positive)
negative_corr, _ = pearsonr(SES_mean_seed_negative, SES_mean_external_negative)

# 打印输出结果
print(f"SES Positive correlation (seed vs expanded): {positive_corr:.4f}")
print(f"SES Negative correlation (seed vs expanded): {negative_corr:.4f}")

SES Positive correlation (seed vs expanded): 0.9690
SES Negative correlation (seed vs expanded): 0.9749


### Soc流程

In [34]:
df_Soc = read_excel(r'data\SEV_wordlist.xlsx', sheet_name = 2)

df_Soc_positive = df_Soc.loc[df_Soc['Valence']=='Positive', 'Word'].to_frame()
Soc_positive_seed_words = df_Soc_positive.iloc[:,0].tolist()
df_Soc_negative = df_Soc.loc[df_Soc['Valence']=='Negative', 'Word'].to_frame()
Soc_negative_seed_words = df_Soc_negative.iloc[:,0].tolist()

Soc_positive_vectors = get_word_vectors(Soc_positive_seed_words, tencent_embedding)
Soc_negative_vectors = get_word_vectors(Soc_negative_seed_words, tencent_embedding)

Soc_positive_vectors = np.array(list(Soc_positive_vectors.values()))
Soc_negative_vectors = np.array(list(Soc_negative_vectors.values()))

Word '直心眼儿' not found in embedding model.
Word '率然' not found in embedding model.
Word '笨口拙舌' not found in embedding model.
Word '贫嘴薄舌' not found in embedding model.
Word '狐朋狗党' not found in embedding model.


In [35]:
External_df_Soc = read_excel(r'Soc_polarity_results.xlsx')

Soc_positive_external_words = External_df_Soc.loc[External_df_Soc['Polarity']=='positive', 'Word'].tolist()
Soc_negative_external_words = External_df_Soc.loc[External_df_Soc['Polarity']=='negative', 'Word'].tolist()

Soc_positive_external_vectors = get_word_vectors(Soc_positive_external_words, tencent_embedding)
Soc_negative_external_vectors = get_word_vectors(Soc_negative_external_words, tencent_embedding)

Soc_positive_external_vectors = np.array(list(Soc_positive_external_vectors.values()))
Soc_negative_external_vectors = np.array(list(Soc_negative_external_vectors.values()))

In [36]:
Soc_mean_seed_positive = np.mean(Soc_positive_vectors, axis=0)
Soc_mean_seed_negative = np.mean(Soc_negative_vectors, axis=0)
Soc_mean_external_positive = np.mean(Soc_positive_external_vectors, axis=0)
Soc_mean_external_negative = np.mean(Soc_negative_external_vectors, axis=0)

In [37]:
positive_corr, _ = pearsonr(Soc_mean_seed_positive, Soc_mean_external_positive)
negative_corr, _ = pearsonr(Soc_mean_seed_negative, Soc_mean_external_negative)

print(f"Soc Positive correlation (seed vs expanded): {positive_corr:.4f}")
print(f"Soc Negative correlation (seed vs expanded): {negative_corr:.4f}")

Soc Positive correlation (seed vs expanded): 0.9756
Soc Negative correlation (seed vs expanded): 0.9830


### Com流程

In [39]:
df_Com = read_excel(r'data\SEV_wordlist.xlsx', sheet_name = 3)

df_Com_positive = df_Com.loc[df_Com['Valence']=='Positive', 'Word'].to_frame()
Com_positive_seed_words = df_Com_positive.iloc[:,0].tolist()
df_Com_negative = df_Com.loc[df_Com['Valence']=='Negative', 'Word'].to_frame()
Com_negative_seed_words = df_Com_negative.iloc[:,0].tolist()

Com_positive_vectors = get_word_vectors(Com_positive_seed_words, tencent_embedding)
Com_negative_vectors = get_word_vectors(Com_negative_seed_words, tencent_embedding)

Com_positive_vectors = np.array(list(Com_positive_vectors.values()))
Com_negative_vectors = np.array(list(Com_negative_vectors.values()))

Word '干才' not found in embedding model.
Word '聪悟' not found in embedding model.
Word '有谱儿' not found in embedding model.
Word '言必有中' not found in embedding model.
Word '骁悍' not found in embedding model.
Word '粗手笨脚' not found in embedding model.
Word '蠢头蠢脑' not found in embedding model.
Word '羽毛未丰' not found in embedding model.
Word '别无长物' not found in embedding model.
Word '迂拙' not found in embedding model.


In [40]:
External_df_Com = read_excel(r'Com_polarity_results.xlsx')

Com_positive_external_words = External_df_Com.loc[External_df_Com['Polarity']=='positive', 'Word'].tolist()
Com_negative_external_words = External_df_Com.loc[External_df_Com['Polarity']=='negative', 'Word'].tolist()

Com_positive_external_vectors = get_word_vectors(Com_positive_external_words, tencent_embedding)
Com_negative_external_vectors = get_word_vectors(Com_negative_external_words, tencent_embedding)

Com_positive_external_vectors = np.array(list(Com_positive_external_vectors.values()))
Com_negative_external_vectors = np.array(list(Com_negative_external_vectors.values()))

In [41]:
Com_mean_seed_positive = np.mean(Com_positive_vectors, axis=0)
Com_mean_seed_negative = np.mean(Com_negative_vectors, axis=0)
Com_mean_external_positive = np.mean(Com_positive_external_vectors, axis=0)
Com_mean_external_negative = np.mean(Com_negative_external_vectors, axis=0)

In [42]:
positive_corr, _ = pearsonr(Com_mean_seed_positive, Com_mean_external_positive)
negative_corr, _ = pearsonr(Com_mean_seed_negative, Com_mean_external_negative)

print(f"Soc Positive correlation (seed vs expanded): {positive_corr:.4f}")
print(f"Soc Negative correlation (seed vs expanded): {negative_corr:.4f}")

Soc Positive correlation (seed vs expanded): 0.9765
Soc Negative correlation (seed vs expanded): 0.9831


### Mor流程

In [43]:
df_Mor = read_excel(r'data\SEV_wordlist.xlsx', sheet_name = 4)

df_Mor_positive = df_Mor.loc[df_Mor['Valence']=='Positive', 'Word'].to_frame()
Mor_positive_seed_words = df_Mor_positive.iloc[:,0].tolist()
df_Mor_negative = df_Mor.loc[df_Mor['Valence']=='Negative', 'Word'].to_frame()
Mor_negative_seed_words = df_Mor_negative.iloc[:,0].tolist()

Mor_positive_vectors = get_word_vectors(Mor_positive_seed_words, tencent_embedding)
Mor_negative_vectors = get_word_vectors(Mor_negative_seed_words, tencent_embedding)

Mor_positive_vectors = np.array(list(Mor_positive_vectors.values()))
Mor_negative_vectors = np.array(list(Mor_negative_vectors.values()))

Word '谦顺' not found in embedding model.
Word '恳挚' not found in embedding model.
Word '轻诺寡信' not found in embedding model.
Word '险诈' not found in embedding model.
Word '鼠肚鸡肠' not found in embedding model.
Word '耍心眼儿' not found in embedding model.
Word '害人虫' not found in embedding model.
Word '卑俗' not found in embedding model.


In [44]:
External_df_Mor = read_excel(r'Mor_polarity_results.xlsx')

Mor_positive_external_words = External_df_Mor.loc[External_df_Mor['Polarity']=='positive', 'Word'].tolist()
Mor_negative_external_words = External_df_Mor.loc[External_df_Mor['Polarity']=='negative', 'Word'].tolist()

Mor_positive_external_vectors = get_word_vectors(Mor_positive_external_words, tencent_embedding)
Mor_negative_external_vectors = get_word_vectors(Mor_negative_external_words, tencent_embedding)

Mor_positive_external_vectors = np.array(list(Mor_positive_external_vectors.values()))
Mor_negative_external_vectors = np.array(list(Mor_negative_external_vectors.values()))

In [45]:
Mor_mean_seed_positive = np.mean(Mor_positive_vectors, axis=0)
Mor_mean_seed_negative = np.mean(Mor_negative_vectors, axis=0)
Mor_mean_external_positive = np.mean(Mor_positive_external_vectors, axis=0)
Mor_mean_external_negative = np.mean(Mor_negative_external_vectors, axis=0)

In [None]:
positive_corr, _ = pearsonr(Mor_mean_seed_positive, Mor_mean_external_positive)
negative_corr, _ = pearsonr(Mor_mean_seed_negative, Mor_mean_external_negative)

print(f"Mor Positive correlation (seed vs expanded): {positive_corr:.4f}")
print(f"Mor Negative correlation (seed vs expanded): {negative_corr:.4f}")

Soc Positive correlation (seed vs expanded): 0.9776
Soc Negative correlation (seed vs expanded): 0.9869


### 有效性检验

In [47]:
# 定义维度、性质和对应的数据
dimensions = ["APP", "SES", "Soc", "Com", "Mor"]
polarities = ["positive", "negative"]

# 初始化存储数据的列表
all_words = []
all_vectors = []
all_polarities = []
all_dimensions = []

# 定义所有的词汇、词向量和标签
for dim in dimensions:
    for pol in polarities:
        # 获取词汇和词向量的变量名
        seed_words_var = f"{dim}_{pol}_seed_words"
        external_words_var = f"{dim}_{pol}_external_words"
        seed_vectors_var = f"{dim}_{pol}_vectors"
        external_vectors_var = f"{dim}_{pol}_external_vectors"
        
        # 获取种子词和扩展词
        seed_words = globals()[seed_words_var]
        external_words = globals()[external_words_var]
        seed_vectors = globals()[seed_vectors_var]
        external_vectors = globals()[external_vectors_var]
        
        # 合并词汇和向量
        all_words.extend(seed_words)
        all_vectors.extend(seed_vectors)
        all_polarities.extend([pol] * len(seed_words))
        all_dimensions.extend([dim] * len(seed_words))
        
        all_words.extend(external_words)
        all_vectors.extend(external_vectors)
        all_polarities.extend([pol] * len(external_words))
        all_dimensions.extend([dim] * len(external_words))


In [48]:
# 将数据组织为DataFrame
data = pd.DataFrame({
    "Word": all_words,
    "Vector": all_vectors,
    "Polarity": all_polarities,
    "Dimension": all_dimensions
})

# 保存为Excel文件
output_file = "word_dataset_for_classification.xlsx"
data.to_excel(output_file, index=False)
print(f"数据集已保存为 {output_file}")


ValueError: All arrays must be of the same length

In [49]:
# 初始化存储数据的列表
all_words = []
all_vectors = []
all_polarities = []
all_dimensions = []

# 定义所有的维度和极性
dimensions = ["APP", "SES", "Soc", "Com", "Mor"]
polarities = ["positive", "negative"]

# 加载腾讯词向量（假设已加载到 tencent_embedding 中）

# 定义所有的词汇、词向量和标签
for dim in dimensions:
    for pol in polarities:
        # 获取词汇和词向量的变量名
        seed_words_var = f"{dim}_{pol}_seed_words"
        external_words_var = f"{dim}_{pol}_external_words"
        seed_vectors_var = f"{dim}_{pol}_vectors"
        external_vectors_var = f"{dim}_{pol}_external_vectors"
        
        # 获取种子词和扩展词
        seed_words = globals().get(seed_words_var, [])
        external_words = globals().get(external_words_var, [])
        seed_vectors = globals().get(seed_vectors_var, [])
        external_vectors = globals().get(external_vectors_var, [])
        
        # 过滤掉没有对应向量的词
        filtered_seed_words = [w for w in seed_words if w in tencent_embedding]
        filtered_seed_vectors = [tencent_embedding[w] for w in filtered_seed_words]
        
        filtered_external_words = [w for w in external_words if w in tencent_embedding]
        filtered_external_vectors = [tencent_embedding[w] for w in filtered_external_words]

        # 检查词汇和向量长度是否匹配
        if len(filtered_seed_words) != len(filtered_seed_vectors):
            print(f"种子词与词向量长度不匹配（过滤后）：{seed_words_var}")
        if len(filtered_external_words) != len(filtered_external_vectors):
            print(f"扩展词与词向量长度不匹配（过滤后）：{external_words_var}")
        
        # 合并种子词
        all_words.extend(filtered_seed_words)
        all_vectors.extend(filtered_seed_vectors)
        all_polarities.extend([pol] * len(filtered_seed_words))
        all_dimensions.extend([dim] * len(filtered_seed_words))
        
        # 合并扩展词
        all_words.extend(filtered_external_words)
        all_vectors.extend(filtered_external_vectors)
        all_polarities.extend([pol] * len(filtered_external_words))
        all_dimensions.extend([dim] * len(filtered_external_words))

# 检查数据长度是否一致
print(f"Words length: {len(all_words)}")
print(f"Vectors length: {len(all_vectors)}")
print(f"Polarities length: {len(all_polarities)}")
print(f"Dimensions length: {len(all_dimensions)}")

# 确保所有列表长度一致后，创建 DataFrame
if len(all_words) == len(all_vectors) == len(all_polarities) == len(all_dimensions):
    data = pd.DataFrame({
        "Word": all_words,
        "Vector": all_vectors,
        "Polarity": all_polarities,
        "Dimension": all_dimensions
    })

    # 保存为 Excel 文件
    output_file = "word_dataset_for_classification.xlsx"
    data.to_excel(output_file, index=False)
    print(f"数据集已保存为 {output_file}")
else:
    print("数据长度不一致，无法创建 DataFrame，请检查输入数据。")


Words length: 9693
Vectors length: 9693
Polarities length: 9693
Dimensions length: 9693
数据集已保存为 word_dataset_for_classification.xlsx
