In [1]:
# 导入读取数据所需要的库
import numpy as np
import pandas as pd
from pandas import read_excel

import gensim
from gensim import models
from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity


# Path to the pre-trained word-embedding file (word2vec text format).
tecent_embedding_path = r'D:\embedding_zh\tencent-ailab-embedding-zh-d200-v0.2.0-s\tencent-ailab-embedding-zh-d200-v0.2.0-s.txt'

# Load the embedding as gensim KeyedVectors; binary=False because the file is plain text.
tencent_embedding = KeyedVectors.load_word2vec_format(
    tecent_embedding_path,
    binary=False,
)

In [2]:
# Read the seed-word table from the sheet at zero-based position 4 of the workbook.
df = pd.read_excel(r'data\SEV_wordlist.xlsx', sheet_name=4)

In [3]:
# Split the seed table by its 'Valence' label, keeping only the 'Word' column.
positive_mask = df['Valence'] == 'Positive'
negative_mask = df['Valence'] == 'Negative'

df_positive = df.loc[positive_mask, ['Word']]
df_negative = df.loc[negative_mask, ['Word']]

# Plain Python lists of seed words for each polarity.
positive_seed_words = df_positive['Word'].tolist()
negative_seed_words = df_negative['Word'].tolist()

In [4]:
# Load the candidate (unlabeled) words; the first column of each workbook holds the words.
# Raw strings are used for the Windows-style paths: in a normal string literal
# '\M' is an invalid escape sequence, which newer Python versions warn about.
positive_external_df = pd.read_excel(r'result\Mor_unique_positive_similar_words.xlsx')
positive_external_words = positive_external_df.iloc[:, 0].tolist()

negative_external_df = pd.read_excel(r'result\Mor_unique_negative_similar_words.xlsx')
negative_external_words = negative_external_df.iloc[:, 0].tolist()

# Candidates from both files, positive-file words first.
external_words = positive_external_words + negative_external_words
external_words

['憨直',
 '绝对公正',
 '尽量节约',
 '忠义两全',
 '菩萨心肠',
 '率性',
 '卑谦',
 '守道',
 '沉毅',
 '雪白',
 '缄口不言',
 '泰然自若',
 '很忠心',
 '光明正大地',
 '右军',
 '最纯洁',
 '诚实守信',
 '殚精竭虑',
 '浩然正气',
 '清清白白',
 '开诚布公',
 '孝敬父母',
 '不公允',
 '善良之心',
 '独善',
 '嫉恶如仇',
 '义气',
 '一股正气',
 '虚心好学',
 '恬淡',
 '公正处',
 '一脸正义',
 '孝顺母亲',
 '森然',
 '通情理',
 '献出',
 '勤勉敬业',
 '沉雄',
 '应该以身作则',
 '忠顺王',
 '窄小',
 '无比虔诚',
 '高尚品质',
 '保守秘密',
 '淳朴善良',
 '诚恳地',
 '淳厚',
 '爱情的忠贞',
 '披肝沥胆',
 '恩怨分明',
 '性格刚强',
 '不慕名利',
 '赤胆忠诚',
 '赤忱',
 '虔诚的信仰',
 '爱国情操',
 '高潓',
 '很耿直',
 '光明力量',
 '绝对的忠诚',
 '坚决',
 '纯良',
 '良好的家教',
 '崇高的精神',
 '明哲保身',
 '讲礼貌',
 '明目张胆地',
 '谦逊有礼',
 '忠君',
 '节省',
 '学法守法',
 '超逸',
 '堂堂正正',
 '忠贞不屈',
 '古道热肠',
 '爱国主义',
 '礼貌问题',
 '勤劳努力',
 '大无畏',
 '坚定',
 '老诚',
 '破规矩',
 '分内之事',
 '慷慨无私',
 '讨回公道',
 '好的教养',
 '慈悲为怀',
 '秉德',
 '崇高',
 '无法独善其身',
 '很敬业',
 '坚贞不屈',
 '坚毅的目光',
 '涵养',
 '守规',
 '社会奉献',
 '特别专一',
 '不真心',
 '坚贞不渝的爱情',
 '玉洁',
 '礼仪规矩',
 '很专一',
 '深情',
 '这时',
 '就在',
 '科技文明',
 '慎独',
 '心地善良',
 '完全公平',
 '刚毅',
 '尽职履责',
 '专二',
 '你真诚',
 '能够包容',
 '坦率地',
 '整体素质',
 '生死之交',
 '死忠',
 '时间观念',
 '

In [5]:
# Compute one centre (mean) vector per polarity from the seed words,
# skipping any seed word that the embedding does not contain.

def _lookup_seed_vectors(seed_words):
    """Return the embedding vectors of `seed_words`, reporting each word that is missing."""
    found = []
    for word in seed_words:
        try:
            found.append(tencent_embedding[word])
        except KeyError:
            print(f"Skipping word: {word}, not found in embedding.")
    return found


def _mean_vector_or_none(vectors, empty_message):
    """Return the element-wise mean of `vectors`, or print `empty_message` and return None if empty."""
    if not vectors:
        print(empty_message)
        return None
    return np.mean(vectors, axis=0)


positive_vectors = _lookup_seed_vectors(positive_seed_words)
negative_vectors = _lookup_seed_vectors(negative_seed_words)

# A centre vector is None when no seed word of that polarity was found.
positive_center_vector = _mean_vector_or_none(
    positive_vectors, "No positive seed words found in embedding."
)
negative_center_vector = _mean_vector_or_none(
    negative_vectors, "No negative seed words found in embedding."
)

Skipping word: 谦顺, not found in embedding.
Skipping word: 恳挚, not found in embedding.
Skipping word: 轻诺寡信, not found in embedding.
Skipping word: 险诈, not found in embedding.
Skipping word: 鼠肚鸡肠, not found in embedding.
Skipping word: 耍心眼儿, not found in embedding.
Skipping word: 害人虫, not found in embedding.
Skipping word: 卑俗, not found in embedding.


In [6]:
# Accumulators filled by the scoring step: one result row per word, plus per-polarity tallies.
results = list()
positive_count = negative_count = 0

In [7]:
# Label every unlabeled word by comparing its cosine similarity to the
# positive and negative seed centre vectors.

# Start from clean accumulators so that re-running this cell does not append
# duplicate rows or double-count.
results = []
positive_count = 0
negative_count = 0

if positive_center_vector is not None and negative_center_vector is not None:
    # Only words present in the embedding get a row in the similarity matrices.
    known_words = [W for W in external_words if W in tencent_embedding]

    if known_words:
        external_vectors = np.array([tencent_embedding[W] for W in known_words])

        # Cosine similarity of each known word to each centre vector; shape (len(known_words), 1).
        positive_similarity = cosine_similarity(external_vectors, positive_center_vector.reshape(1, -1))
        negative_similarity = cosine_similarity(external_vectors, negative_center_vector.reshape(1, -1))

    # `row` indexes the similarity matrices and advances only for words that
    # were actually embedded. Indexing by position in `external_words` would
    # shift every score after the first missing word.
    row = 0
    for W in external_words:
        if W not in tencent_embedding:
            print(f"Word: {W} not found in embedding, skipping.")
            continue

        pos_score = positive_similarity[row][0]
        neg_score = negative_similarity[row][0]
        row += 1

        print(f"Word: {W}, Positive Similarity: {pos_score}, Negative Similarity: {neg_score}")

        # Ties are labelled negative, as only a strictly larger positive score counts as positive.
        if pos_score > neg_score:
            polarity = "positive"
            print(f"{W} is positive")
            positive_count += 1
        else:
            print(f"{W} is negative")
            polarity = "negative"
            negative_count += 1

        results.append([W, polarity, pos_score, neg_score])
else:
    # Without both centre vectors there is nothing to compare against.
    print("Skipping polarity labelling: a seed centre vector is missing.")

Word: 憨直, Positive Similarity: 0.7405951619148254, Negative Similarity: 0.6656156182289124
憨直 is positive
Word: 绝对公正, Positive Similarity: 0.5134783387184143, Negative Similarity: 0.451236367225647
绝对公正 is positive
Word: 尽量节约, Positive Similarity: 0.30633875727653503, Negative Similarity: 0.21143364906311035
尽量节约 is positive
Word: 忠义两全, Positive Similarity: 0.695813775062561, Negative Similarity: 0.5913645029067993
忠义两全 is positive
Word: 菩萨心肠, Positive Similarity: 0.7176082730293274, Negative Similarity: 0.6463457942008972
菩萨心肠 is positive
Word: 率性, Positive Similarity: 0.6141777038574219, Negative Similarity: 0.5112205743789673
率性 is positive
Word: 卑谦, Positive Similarity: 0.7093167304992676, Negative Similarity: 0.634091317653656
卑谦 is positive
Word: 守道, Positive Similarity: 0.6181929707527161, Negative Similarity: 0.45611897110939026
守道 is positive
Word: 沉毅, Positive Similarity: 0.6609910726547241, Negative Similarity: 0.45372098684310913
沉毅 is positive
Word: 雪白, Positive Similarity

In [8]:
# Tabulate the scored words: one row per word with its label and both similarity scores.
result_columns = ["Word", "Polarity", "Positive Similarity", "Negative Similarity"]
df_result = pd.DataFrame(data=results, columns=result_columns)

In [9]:
# Write the labelled words to an Excel workbook, omitting the DataFrame index column.
output_file = "Mor_polarity_results.xlsx"
df_result.to_excel(excel_writer=output_file, index=False)

print(f"Results saved to {output_file}")

Results saved to Mor_polarity_results.xlsx


In [10]:
# Report how many words received each polarity label.
print(
    f"Positive words count: {positive_count}",
    f"Negative words count: {negative_count}",
    sep="\n",
)

Positive words count: 1224
Negative words count: 1335
