In [2]:
# 导入读取数据所需要的库
import numpy as np
import pandas as pd
from pandas import read_excel

import gensim
from gensim import models
from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity


# Seed lexicon workbook; later cells read its 'Word' and 'Valence' columns.
df = pd.read_excel(r'data\SEV_wordlist.xlsx')

# Pre-trained word-embedding file in word2vec text format.
# NOTE(review): this is a machine-specific absolute path - adjust it (or move
# it to a config value) before running on another machine.
tecent_embedding_path = r'D:\embedding_zh\tencent-ailab-embedding-zh-d200-v0.2.0-s\tencent-ailab-embedding-zh-d200-v0.2.0-s.txt'

# Parse the text-format embedding file into a KeyedVectors lookup table.
tencent_embedding = KeyedVectors.load_word2vec_format(
    tecent_embedding_path,
    binary=False,
)

In [3]:
# Re-read the seed lexicon (the same workbook the setup cell already loads),
# so this cell yields a fresh `df` without reloading the embeddings.
df = pd.read_excel(r'data\SEV_wordlist.xlsx')

In [4]:
# Split the seed lexicon by valence label and extract each group's words.
positive_mask = df['Valence'] == 'Positive'
negative_mask = df['Valence'] == 'Negative'

# One-column frames holding just the 'Word' column of each group.
df_positive = df.loc[positive_mask, ['Word']]
df_negative = df.loc[negative_mask, ['Word']]

positive_seed_words = df_positive['Word'].tolist()
negative_seed_words = df_negative['Word'].tolist()

In [5]:
# Load the positive and negative candidate word lists; the first column of
# each workbook is taken as the words.
#
# The paths are raw strings on purpose: in a normal string literal "\A" is an
# invalid escape sequence (SyntaxWarning on Python 3.12+, planned to become an
# error), and a file name starting with a lowercase letter such as "\a" or
# "\n" would silently turn into a control character.
positive_external_df = pd.read_excel(r'result\App_unique_positive_similar_words.xlsx')
positive_external_words = positive_external_df.iloc[:, 0].tolist()

negative_external_df = pd.read_excel(r'result\App_unique_negative_similar_words.xlsx')
negative_external_words = negative_external_df.iloc[:, 0].tolist()

# Positive candidates first, then negative ones; duplicates are not removed.
external_words = positive_external_words + negative_external_words
external_words

['厚实',
 '仙风道骨',
 '阳刚之美',
 '威猛',
 '精美',
 '优雅从容',
 '朴实憨厚',
 '漂亮精致',
 '端庄稳重',
 '红',
 '清秀可人',
 '白皙',
 '十分优美',
 '强壮',
 '美目',
 '年轻朝气',
 '威严无比',
 '雍容',
 '文静优雅',
 '青春可人',
 '花容月貌',
 '赶潮流',
 '清甜',
 '迷人',
 '炯炯',
 '上帅',
 '娴淑',
 '凶厉',
 '骠悍',
 '纯真可爱',
 '清俊',
 '精致漂亮',
 '雅致',
 '干净点',
 '玲珑剔透',
 '媚态',
 '肃然',
 '娇滴滴',
 '帅么',
 '端庄淑女',
 '柔情',
 '美丽',
 '诡异的',
 '目光炯炯有神',
 '如花似玉',
 '十分英俊',
 '青年',
 '短小',
 '圆润丰满',
 '无比诱人',
 '抚媚',
 '面目慈祥',
 '端庄',
 '圆润饱满',
 '娇艳',
 '仪表堂堂',
 '光鲜靓丽',
 '长得眉清目秀',
 '不上相',
 '风流儒雅',
 '健美身材',
 '凶横',
 '时尚新潮',
 '甜美可爱',
 '优美地',
 '年轻热血',
 '外表光鲜亮丽',
 '精瘦',
 '结实一点',
 '富态',
 '滑润',
 '柔婉',
 '硬朗',
 '美丽大方',
 '气质儒雅',
 '神态',
 '白嫩',
 '清纯',
 '娇柔',
 '水眸',
 '得意',
 '面目清秀',
 '身穿道袍',
 '威武雄壮',
 '文邹邹',
 '热血冲动',
 '精致无比',
 '迷人的',
 '书卷味',
 '经久耐看',
 '独到',
 '二十来岁',
 '小巧可爱',
 '慈祥和蔼',
 '白净',
 '很惊艳',
 '小淑女',
 '文静',
 '短小精致',
 '看着顺眼',
 '身姿',
 '体贴',
 '西服革履',
 '明亮的双眸',
 '柔和的',
 '只是',
 '身姿挺拔',
 '文气',
 '棱角分明',
 '纯',
 '时尚好看',
 '自然精神',
 '黑',
 '不相',
 '风仪',
 '太丰满',
 '威严感',
 '小孩健康',
 '眉善目',
 '不着四六',
 '正午的阳光',
 '乖巧温顺',
 '健康啊',
 '眉清

In [6]:
# Build one centre (mean) vector per polarity from the seed words, skipping
# any seed word the embedding does not contain.

def _lookup_seed_vectors(words):
    """Return the embedding vectors for ``words``.

    A word whose lookup raises ``KeyError`` is reported on stdout and left
    out of the returned list.
    """
    found = []
    for word in words:
        try:
            found.append(tencent_embedding[word])
        except KeyError:
            print(f"Skipping word: {word}, not found in embedding.")
    return found


positive_vectors = _lookup_seed_vectors(positive_seed_words)
negative_vectors = _lookup_seed_vectors(negative_seed_words)

# Average each group element-wise; an empty group gets None so that later
# cells can detect that no centre vector is available.
if not positive_vectors:
    print("No positive seed words found in embedding.")
    positive_center_vector = None
else:
    positive_center_vector = np.mean(positive_vectors, axis=0)

if not negative_vectors:
    print("No negative seed words found in embedding.")
    negative_center_vector = None
else:
    negative_center_vector = np.mean(negative_vectors, axis=0)

Skipping word: 高挑儿, not found in embedding.
Skipping word: 囚首垢面, not found in embedding.
Skipping word: 慓悍, not found in embedding.
Skipping word: 病病歪歪, not found in embedding.


In [7]:
# Accumulators filled by the classification cell: one row per labelled word,
# plus a running tally for each polarity.
results = list()
positive_count = negative_count = 0

In [8]:
# Score every unlabeled word against both seed centre vectors and label it
# with the polarity whose centre it is more similar to (ties -> negative).

# Reset the accumulators so that re-running this cell does not append
# duplicate rows to `results` or double-count the tallies.
results = []
positive_count = 0
negative_count = 0

if positive_center_vector is not None and negative_center_vector is not None:
    # Only words present in the embedding get a row in the similarity
    # matrices, so row k belongs to in_vocab_words[k] - NOT to
    # external_words[k]. Indexing by the position in external_words would
    # shift every score after the first out-of-vocabulary word.
    in_vocab_words = [W for W in external_words if W in tencent_embedding]
    external_vectors = np.array([tencent_embedding[W] for W in in_vocab_words])

    if in_vocab_words:
        # Cosine similarity of each word vector to each centre vector; the
        # centre is reshaped to a single row, giving one column per result.
        positive_similarity = cosine_similarity(external_vectors, positive_center_vector.reshape(1, -1))
        negative_similarity = cosine_similarity(external_vectors, negative_center_vector.reshape(1, -1))

    # `row` tracks the position inside the similarity matrices and advances
    # only for in-vocabulary words, keeping words and scores aligned while
    # the messages are still printed in the original word order.
    row = 0
    for W in external_words:
        if W not in tencent_embedding:
            print(f"Word: {W} not found in embedding, skipping.")
            continue

        pos_score = positive_similarity[row][0]
        neg_score = negative_similarity[row][0]
        row += 1

        print(f"Word: {W}, Positive Similarity: {pos_score}, Negative Similarity: {neg_score}")

        if pos_score > neg_score:
            polarity = "positive"
            print(f"{W} is positive")
            positive_count += 1
        else:
            polarity = "negative"
            print(f"{W} is negative")
            negative_count += 1

        results.append([W, polarity, pos_score, neg_score])

Word: 厚实, Positive Similarity: 0.6119920015335083, Negative Similarity: 0.6227110028266907
厚实 is negative
Word: 仙风道骨, Positive Similarity: 0.7023902535438538, Negative Similarity: 0.6991890668869019
仙风道骨 is positive
Word: 阳刚之美, Positive Similarity: 0.6339033246040344, Negative Similarity: 0.5175043344497681
阳刚之美 is positive
Word: 威猛, Positive Similarity: 0.7356791496276855, Negative Similarity: 0.7462306022644043
威猛 is negative
Word: 精美, Positive Similarity: 0.5967763662338257, Negative Similarity: 0.479477196931839
精美 is positive
Word: 优雅从容, Positive Similarity: 0.6861534714698792, Negative Similarity: 0.550076425075531
优雅从容 is positive
Word: 朴实憨厚, Positive Similarity: 0.5999985337257385, Negative Similarity: 0.5913994312286377
朴实憨厚 is positive
Word: 漂亮精致, Positive Similarity: 0.7088738679885864, Negative Similarity: 0.5855383276939392
漂亮精致 is positive
Word: 端庄稳重, Positive Similarity: 0.6960281729698181, Negative Similarity: 0.5481259822845459
端庄稳重 is positive
Word: 红, Positive Simila

In [9]:
# Turn the collected [word, polarity, pos_score, neg_score] rows into a table.
result_columns = ["Word", "Polarity", "Positive Similarity", "Negative Similarity"]
df_result = pd.DataFrame(results, columns=result_columns)

In [10]:
# Write the classification table to an Excel workbook in the working
# directory; the DataFrame's row index is not written.
output_file = "APP_polarity_results.xlsx"
df_result.to_excel(excel_writer=output_file, index=False)
print(f"Results saved to {output_file}")

Results saved to APP_polarity_results.xlsx


In [11]:
# Report how many words ended up with each polarity label, one line each.
print(
    f"Positive words count: {positive_count}",
    f"Negative words count: {negative_count}",
    sep="\n",
)

Positive words count: 1118
Negative words count: 864
