In [1]:
# 导入读取数据所需要的库
import numpy as np
import pandas as pd
from pandas import read_excel

import gensim
from gensim import models
from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity


# Location of the pre-trained word-vector file (path taken from a local
# Tencent AI Lab embedding download).
# NOTE: the variable name is misspelled ("tecent"); it is kept as-is because
# other cells may reference it.
tecent_embedding_path = r'D:\embedding_zh\tencent-ailab-embedding-zh-d200-v0.2.0-s\tencent-ailab-embedding-zh-d200-v0.2.0-s.txt'

# Parse the file into a gensim KeyedVectors object; binary=False means the
# file is read as plain-text word2vec format.
tencent_embedding = KeyedVectors.load_word2vec_format(
    tecent_embedding_path,
    binary=False,
)

In [2]:
# Load the seed-word workbook. An integer sheet_name is a zero-indexed sheet
# position, so 3 selects the fourth sheet.
df = pd.read_excel(
    r'data\SEV_wordlist.xlsx',
    sheet_name=3,
)

In [3]:
# Split the seed list by its 'Valence' label. Each df_* is a one-column
# ('Word') DataFrame and each *_seed_words is the same column as a plain list.
df_positive = df.loc[df['Valence'].eq('Positive'), ['Word']]
df_negative = df.loc[df['Valence'].eq('Negative'), ['Word']]

positive_seed_words = df_positive['Word'].tolist()
negative_seed_words = df_negative['Word'].tolist()

In [4]:
# Load the candidate (unlabelled) words from the two result workbooks.
# The paths are raw strings: a plain 'result\Com_...' literal contains the
# invalid escape sequence "\C", which newer Python versions warn about and
# will eventually reject. The resulting path value is unchanged.
positive_external_df = pd.read_excel(r'result\Com_unique_positive_similar_words.xlsx')
positive_external_words = positive_external_df.iloc[:, 0].tolist()  # first column only

negative_external_df = pd.read_excel(r'result\Com_unique_negative_similar_words.xlsx')
negative_external_words = negative_external_df.iloc[:, 0].tolist()  # first column only

# Concatenated list: positive-file words first, then negative-file words.
external_words = positive_external_words + negative_external_words
external_words

['辛勤汗水',
 '持之以恒的毅力',
 '精妙',
 '坚忍不拔',
 '双巧手',
 '由得',
 '诸葛孔明',
 '光芒',
 '胆识',
 '痴爱',
 '高妙',
 '明察',
 '懂变通',
 '精明睿智',
 '才思敏捷',
 '慧眼识人',
 '没有高手',
 '狡兔三窟',
 '很稳健',
 '所谓的独立',
 '矫健',
 '崇',
 '娴熟地',
 '独树一帜的风格',
 '固执',
 '独具一格',
 '缺少主见',
 '缜密',
 '正值青年',
 '积极奋进',
 '全能型',
 '胸有成竹',
 '能吃苦',
 '一飞冲天',
 '执著',
 '初生牛犊',
 '真正的学霸',
 '顺应大势',
 '独辟',
 '一双慧眼',
 '周密的计划',
 '算无遗漏',
 '哲学思维',
 '辨是非',
 '手巧',
 '勇于拼搏',
 '高瞻远瞩',
 '果断决绝',
 '不懈地',
 '一往无前',
 '先进的',
 '稳打',
 '聪慧过人',
 '理性的思维',
 '你努力',
 '果决',
 '矫捷',
 '精悍',
 '把钱花在刀刃上',
 '终极',
 '出口成章',
 '自觉',
 '博士导师',
 '发奋图强',
 '耳不聋',
 '懂得审时度势',
 '好学上进',
 '什么是独立',
 '奋力拼搏',
 '极其谨慎',
 '学会举一反三',
 '思辨性',
 '非常细致',
 '防患于未然',
 '先进的技术',
 '塞翁失马',
 '慎密',
 '奋斗拼搏',
 '绸缪',
 '可以自制',
 '雄才大略',
 '拼力',
 '手脚勤快',
 '身手矫健',
 '眼力劲儿',
 '再坚持',
 '坚强的毅力',
 '药学博士',
 '思辩',
 '防患未然',
 '实力',
 '长远眼光',
 '老鬼',
 '金睛火眼',
 '一步一个脚印',
 '机智灵敏',
 '用功读书',
 '十分严谨',
 '稳健有力',
 '努力上进',
 '能用心',
 '善战',
 '抓住重点',
 '更加精细',
 '聪颖过人',
 '你要坚持',
 '仔细看',
 '相当有力',
 '会变通',
 '去拼搏',
 '非常机智',
 '金光',
 '很认真',
 '理智和冷静',
 '坚持自律',
 '恪守职责',
 '玩味',


In [5]:
# Compute one centre (mean) vector per polarity from the seed words, skipping
# seed words that the embedding does not contain.


def _collect_seed_vectors(seed_words, embedding):
    """Return the embedding vectors of the seed words that can be looked up.

    Args:
        seed_words: iterable of words to look up.
        embedding: object supporting ``embedding[word]`` that raises
            ``KeyError`` for unknown words.

    Returns:
        list of vectors, in the order of ``seed_words``, with unknown words
        omitted. A message is printed for every omitted word.
    """
    vectors = []
    for word in seed_words:
        try:
            vectors.append(embedding[word])
        except KeyError:
            print(f"Skipping word: {word}, not found in embedding.")
    return vectors


def _center_of(vectors, label):
    """Return the element-wise mean of ``vectors``, or ``None`` if it is empty.

    Args:
        vectors: list of equally sized vectors.
        label: polarity name used only in the message printed when
            ``vectors`` is empty.
    """
    if vectors:
        return np.mean(vectors, axis=0)
    print(f"No {label} seed words found in embedding.")
    return None


# Look up all positive seeds first, then all negative seeds, so the
# "Skipping word" messages appear in that order.
positive_vectors = _collect_seed_vectors(positive_seed_words, tencent_embedding)
negative_vectors = _collect_seed_vectors(negative_seed_words, tencent_embedding)

# A centre vector is None when no seed word of that polarity was found.
positive_center_vector = _center_of(positive_vectors, "positive")
negative_center_vector = _center_of(negative_vectors, "negative")

Skipping word: 干才, not found in embedding.
Skipping word: 聪悟, not found in embedding.
Skipping word: 有谱儿, not found in embedding.
Skipping word: 言必有中, not found in embedding.
Skipping word: 骁悍, not found in embedding.
Skipping word: 粗手笨脚, not found in embedding.
Skipping word: 蠢头蠢脑, not found in embedding.
Skipping word: 羽毛未丰, not found in embedding.
Skipping word: 别无长物, not found in embedding.
Skipping word: 迂拙, not found in embedding.


In [6]:
# Accumulators filled by the scoring step: one row per scored word, plus a
# running count for each polarity.
positive_count = negative_count = 0
results = []

In [7]:
# Score every unlabelled word against the two seed centre vectors and label it
# with the polarity whose centre it is more similar to.
#
# The accumulators are (re)initialised here so that re-running this cell does
# not append duplicate rows or inflate the counters.
results = []
positive_count = 0
negative_count = 0

if positive_center_vector is not None and negative_center_vector is not None:
    # Only words present in the embedding get a vector. Row k of the
    # similarity matrices therefore belongs to known_words[k], NOT to
    # external_words[k].
    known_words = [W for W in external_words if W in tencent_embedding]

    if known_words:
        external_vectors = np.array([tencent_embedding[W] for W in known_words])

        # Cosine similarity of each word vector to each centre vector;
        # each result has one row per known word and a single column.
        positive_similarity = cosine_similarity(external_vectors, positive_center_vector.reshape(1, -1))
        negative_similarity = cosine_similarity(external_vectors, negative_center_vector.reshape(1, -1))
    else:
        # Nothing to score; cosine_similarity rejects an empty input, so use
        # empty matrices of the same 2-D layout instead.
        external_vectors = np.empty((0, 0))
        positive_similarity = np.empty((0, 1))
        negative_similarity = np.empty((0, 1))

    # `row` indexes the similarity matrices and advances only for words that
    # were actually embedded. Indexing with the position in external_words
    # would shift the scores as soon as one word is missing from the embedding.
    row = 0
    for W in external_words:
        if W not in tencent_embedding:
            print(f"Word: {W} not found in embedding, skipping.")
            continue

        pos_score = positive_similarity[row][0]
        neg_score = negative_similarity[row][0]
        row += 1

        print(f"Word: {W}, Positive Similarity: {pos_score}, Negative Similarity: {neg_score}")

        # Ties (pos_score == neg_score) are labelled negative.
        if pos_score > neg_score:
            polarity = "positive"
            print(f"{W} is positive")
            positive_count += 1
        else:
            print(f"{W} is negative")
            polarity = "negative"
            negative_count += 1

        results.append([W, polarity, pos_score, neg_score])

Word: 辛勤汗水, Positive Similarity: 0.41083264350891113, Negative Similarity: 0.29220908880233765
辛勤汗水 is positive
Word: 持之以恒的毅力, Positive Similarity: 0.575130820274353, Negative Similarity: 0.4250966012477875
持之以恒的毅力 is positive
Word: 精妙, Positive Similarity: 0.6524255871772766, Negative Similarity: 0.5347509384155273
精妙 is positive
Word: 坚忍不拔, Positive Similarity: 0.6892150640487671, Negative Similarity: 0.4821857213973999
坚忍不拔 is positive
Word: 双巧手, Positive Similarity: 0.46630722284317017, Negative Similarity: 0.3557364344596863
双巧手 is positive
Word: 由得, Positive Similarity: 0.4308822453022003, Negative Similarity: 0.39977067708969116
由得 is positive
Word: 诸葛孔明, Positive Similarity: 0.5192629098892212, Negative Similarity: 0.48335620760917664
诸葛孔明 is positive
Word: 光芒, Positive Similarity: 0.4286801218986511, Negative Similarity: 0.3481287360191345
光芒 is positive
Word: 胆识, Positive Similarity: 0.7702984809875488, Negative Similarity: 0.5898198485374451
胆识 is positive
Word: 痴爱, Positive

In [8]:
# Tabulate the scored words; the column order mirrors the layout of each row
# appended to `results`.
result_columns = ["Word", "Polarity", "Positive Similarity", "Negative Similarity"]
df_result = pd.DataFrame(data=results, columns=result_columns)

In [9]:
# Write the result table to an Excel workbook (relative path, so it is
# resolved against the current working directory) without the index column.
output_file = "Com_polarity_results.xlsx"
df_result.to_excel(
    excel_writer=output_file,
    index=False,
)
print(f"Results saved to {output_file}")

Results saved to Com_polarity_results.xlsx


In [10]:
# Report how many words were assigned to each polarity.
print(
    f"Positive words count: {positive_count}",
    f"Negative words count: {negative_count}",
    sep="\n",
)

Positive words count: 1583
Negative words count: 698
