In [21]:
# 导入读取数据所需要的库
import numpy as np
import pandas as pd
from pandas import read_excel

import gensim
from gensim import models
from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity


# Location of the pre-trained word-embedding file (word2vec text format).
tecent_embedding_path = r'D:\embedding_zh\tencent-ailab-embedding-zh-d200-v0.2.0-s\tencent-ailab-embedding-zh-d200-v0.2.0-s.txt'

# Read the embedding file into a gensim KeyedVectors object;
# binary=False selects the plain-text variant of the word2vec format.
tencent_embedding = KeyedVectors.load_word2vec_format(
    fname=tecent_embedding_path,
    binary=False,
)

In [22]:
# Read the seed-word workbook; sheet_name=1 selects the second sheet (zero-based index).
df = pd.read_excel(io=r'data\SEV_wordlist.xlsx', sheet_name=1)

In [23]:
# Split the seed list by its 'Valence' label. Each subset is kept as a
# one-column frame ('Word') and also flattened into a plain Python list.
df_positive = df.loc[df['Valence'].eq('Positive'), ['Word']]
df_negative = df.loc[df['Valence'].eq('Negative'), ['Word']]

positive_seed_words = df_positive['Word'].tolist()
negative_seed_words = df_negative['Word'].tolist()

In [24]:
# Read the two workbooks of similar words and take the first column of each as
# a plain list of words.
# The paths are raw strings on purpose: in a normal string literal "\S" is an
# invalid escape sequence, which newer Python versions warn about and will
# eventually reject. The raw form yields exactly the same path text.
positive_external_df = pd.read_excel(r'result\SES_unique_positive_similar_words.xlsx')
positive_external_words = positive_external_df.iloc[:, 0].tolist()

negative_external_df = pd.read_excel(r'result\SES_unique_negative_similar_words.xlsx')
negative_external_words = negative_external_df.iloc[:, 0].tolist()

# Positive candidates first, then negative ones; no de-duplication is performed here.
external_words = positive_external_words + negative_external_words
external_words

['富足的生活',
 '身价飙升',
 '鲜花着锦',
 '很气派',
 '土豪们',
 '富于',
 '家境殷实',
 '下流',
 '飞黄腾达',
 '家资',
 '叱咤风云的人物',
 '身份高贵',
 '很体面',
 '声名赫赫',
 '顶尖人物',
 '看不起别人',
 '华丽高贵',
 '一直默默无闻',
 '高贵的身份',
 '知名人物',
 '挣学费',
 '手眼通天',
 '最尊贵',
 '不可替代',
 '名不经传',
 '派头',
 '雍容华贵',
 '寒酸',
 '首富',
 '默默无闻的人',
 '高层们',
 '高贵无比',
 '人人皆知',
 '经济富裕',
 '鼎盛',
 '相对富足',
 '豪华气派',
 '辉煌',
 '传奇人物',
 '当红人物',
 '土豪啊',
 '雇农',
 '优越感十足',
 '家缠万贯',
 '高贵',
 '大富豪',
 '金钱',
 '极富',
 '贵气',
 '流着',
 '半工半读',
 '导师',
 '光耀',
 '丰足',
 '最富',
 '尊贵的身份',
 '富的',
 '富足',
 '上层',
 '名利双收',
 '小康之家',
 '可以说是家喻户晓',
 '壕气十足',
 '不富裕',
 '家财万贯',
 '富贵荣华',
 '富裕的生活',
 '有身份的人',
 '你是土豪',
 '非常体面',
 '很富',
 '煊赫',
 '殷实的家底',
 '最著名',
 '响当当的人物',
 '体体面面',
 '显赫的身世',
 '堆金积玉',
 '荣华富贵',
 '千万富翁',
 '脸面',
 '富贵之气',
 '身价高',
 '衣锦还乡',
 '贵族的气质',
 '两位教授',
 '教授们',
 '偶像级人物',
 '高人一等的感觉',
 '比较著名',
 '藉藉无名',
 '贫寒',
 '穷困',
 '赚取财富',
 '生活富足',
 '事业成功',
 '高贵优雅',
 '更权威',
 '高大气派',
 '大富翁',
 '兼职打工',
 '农村农民',
 '权威性',
 '高高在上的优越感',
 '坐拥金山',
 '很著名',
 '权威专业',
 '很有名气',
 '优雅贵气',
 '专业权威',
 '贵族气质',
 '家底殷实',
 '赫赫有名',
 '第一首富',
 '豪气冲天',
 '

In [25]:
# Compute the center (mean) vector of the positive and of the negative seed
# words, skipping any seed word that is missing from the embedding.


def _collect_seed_vectors(words, embedding):
    """Return the embedding vectors for ``words``, skipping unknown entries.

    Args:
        words: Iterable of seed words to look up.
        embedding: Mapping-like object indexed by word (here the loaded
            KeyedVectors); a missing word is expected to raise ``KeyError``.

    Returns:
        A list with one vector per word that was found, in input order.
        A notice is printed for every word that is skipped.
    """
    vectors = []
    for word in words:
        try:
            vectors.append(embedding[word])
        except KeyError:
            print(f"Skipping word: {word}, not found in embedding.")
    return vectors


def _center_vector(vectors, polarity):
    """Return the element-wise mean of ``vectors``, or ``None`` if there are none.

    Args:
        vectors: List of equally sized vectors.
        polarity: Label ("positive" / "negative") used in the printed notice.

    Returns:
        The mean vector along axis 0, or ``None`` (after printing a notice)
        when ``vectors`` is empty.
    """
    if vectors:
        return np.mean(vectors, axis=0)
    print(f"No {polarity} seed words found in embedding.")
    return None


positive_vectors = _collect_seed_vectors(positive_seed_words, tencent_embedding)
negative_vectors = _collect_seed_vectors(negative_seed_words, tencent_embedding)

# Downstream code checks these for None before using them.
positive_center_vector = _center_vector(positive_vectors, "positive")
negative_center_vector = _center_vector(negative_vectors, "negative")

Skipping word: 奢糜, not found in embedding.


In [26]:
# Accumulators for the labelling step: one result row per scored word,
# plus a running tally for each polarity.
results = list()
positive_count, negative_count = 0, 0

In [27]:
# Score every unlabeled word against the positive and negative seed centers
# (cosine similarity) and label it with the polarity of the closer center.
#
# The accumulators are (re)initialised here so that re-running this cell does
# not append duplicate rows or inflate the counters.
results = []
positive_count = 0
negative_count = 0

if positive_center_vector is not None and negative_center_vector is not None:
    # Resolve vocabulary membership once. The similarity matrices below have
    # one row per *in-vocabulary* word, so they must be indexed with their own
    # row counter and never with the position of a word in `external_words`.
    in_vocab = [W in tencent_embedding for W in external_words]
    known_words = [W for W, known in zip(external_words, in_vocab) if known]

    if known_words:
        external_vectors = np.array([tencent_embedding[W] for W in known_words])

        # Cosine similarity of each in-vocabulary word to each center; shape (n_known, 1).
        positive_similarity = cosine_similarity(external_vectors, positive_center_vector.reshape(1, -1))
        negative_similarity = cosine_similarity(external_vectors, negative_center_vector.reshape(1, -1))
    else:
        # Nothing to score: avoid calling cosine_similarity on an empty array.
        external_vectors = np.empty((0, positive_center_vector.shape[0]))
        positive_similarity = np.empty((0, 1))
        negative_similarity = np.empty((0, 1))

    row = 0  # index into the similarity matrices (in-vocabulary words only)
    for W, known in zip(external_words, in_vocab):
        if not known:
            print(f"Word: {W} not found in embedding, skipping.")
            continue

        pos_score = positive_similarity[row][0]
        neg_score = negative_similarity[row][0]
        row += 1

        print(f"Word: {W}, Positive Similarity: {pos_score}, Negative Similarity: {neg_score}")

        # Ties (pos_score == neg_score) fall through to "negative".
        if pos_score > neg_score:
            polarity = "positive"
            print(f"{W} is positive")
            positive_count += 1
        else:
            print(f"{W} is negative")
            polarity = "negative"
            negative_count += 1

        results.append([W, polarity, pos_score, neg_score])
else:
    # At least one seed center could not be computed, so nothing can be scored.
    print("Seed center vectors are unavailable; no words were scored.")

Word: 富足的生活, Positive Similarity: 0.6359830498695374, Negative Similarity: 0.6196398735046387
富足的生活 is positive
Word: 身价飙升, Positive Similarity: 0.4510631263256073, Negative Similarity: 0.3601765036582947
身价飙升 is positive
Word: 鲜花着锦, Positive Similarity: 0.5604444146156311, Negative Similarity: 0.5404638051986694
鲜花着锦 is positive
Word: 很气派, Positive Similarity: 0.5597915649414062, Negative Similarity: 0.3647834062576294
很气派 is positive
Word: 土豪们, Positive Similarity: 0.5225933790206909, Negative Similarity: 0.4276607036590576
土豪们 is positive
Word: 富于, Positive Similarity: 0.47312092781066895, Negative Similarity: 0.429778128862381
富于 is positive
Word: 家境殷实, Positive Similarity: 0.7062009572982788, Negative Similarity: 0.6216046810150146
家境殷实 is positive
Word: 下流, Positive Similarity: 0.4982437193393707, Negative Similarity: 0.5696237087249756
下流 is negative
Word: 飞黄腾达, Positive Similarity: 0.6246823072433472, Negative Similarity: 0.5807827711105347
飞黄腾达 is positive
Word: 家资, Positive S

In [28]:
# Turn the collected [word, polarity, pos_score, neg_score] rows into a table.
result_columns = ["Word", "Polarity", "Positive Similarity", "Negative Similarity"]
df_result = pd.DataFrame(data=results, columns=result_columns)

In [29]:
# Write the labelled words to an Excel workbook (relative path, so it lands in
# the current working directory); the DataFrame index is not written.
output_file = "SES_polarity_results.xlsx"
df_result.to_excel(excel_writer=output_file, index=False)
print(f"Results saved to {output_file}")

Results saved to SES_polarity_results.xlsx


In [30]:
# Report how many words were labelled with each polarity (one line per count).
print(
    f"Positive words count: {positive_count}",
    f"Negative words count: {negative_count}",
    sep="\n",
)

Positive words count: 382
Negative words count: 529
