In [1]:
# Import the libraries used for data loading, word embeddings and similarity computation
import numpy as np
import pandas as pd
from pandas import read_excel

import gensim
from gensim import models
from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity


# Path to the pre-trained word-embedding file (loaded below as word2vec text format).
# NOTE(review): the variable name is spelled "tecent"; kept unchanged because
# other cells may reference it.
tecent_embedding_path = r'D:\embedding_zh\tencent-ailab-embedding-zh-d200-v0.2.0-s\tencent-ailab-embedding-zh-d200-v0.2.0-s.txt'

# Load the embedding file into a KeyedVectors lookup.
# binary=False: the file is read as text rather than as the binary word2vec format.
tencent_embedding = KeyedVectors.load_word2vec_format(
    tecent_embedding_path,
    binary=False,
)

In [2]:
# Load the seed word list. An integer sheet_name is a zero-based sheet position,
# so sheet_name=2 reads the third sheet of the workbook.
df = pd.read_excel(
    r'data\SEV_wordlist.xlsx',
    sheet_name=2,
)

In [3]:
# Split the seed lexicon by its 'Valence' label and keep only the 'Word' column.
# Each subset is kept both as a one-column DataFrame and as a plain Python list.
df_positive = df.loc[df['Valence'].eq('Positive'), 'Word'].to_frame()
df_negative = df.loc[df['Valence'].eq('Negative'), 'Word'].to_frame()

positive_seed_words = df_positive['Word'].tolist()
negative_seed_words = df_negative['Word'].tolist()

In [4]:
# Load the unlabelled candidate words: the first column of each result workbook.
# The paths are raw strings so the backslash separator stays literal; in a normal
# string '\S' is an invalid escape sequence, which Python warns about and plans
# to reject in a future version. The resulting path text is unchanged.
positive_external_df = pd.read_excel(r'result\Soc_unique_positive_similar_words.xlsx')
positive_external_words = positive_external_df.iloc[:,0].tolist()

negative_external_df = pd.read_excel(r'result\Soc_unique_negative_similar_words.xlsx')
negative_external_words = negative_external_df.iloc[:,0].tolist()

# Combined candidate list: words from the positive workbook first, then the negative one.
external_words = positive_external_words + negative_external_words
external_words

['随性不羁',
 '不会主动',
 '咋咋呼呼',
 '性格直爽',
 '嘻嘻哈哈',
 '狂放',
 '关系亲近',
 '十分豪爽',
 '口直心快',
 '口才了得',
 '吃的开',
 '阳光暖男',
 '随性潇洒',
 '从容淡定',
 '机智幽默',
 '待人和善',
 '言观',
 '亲昵',
 '通情达理',
 '妓女',
 '羞涩',
 '亲密',
 '很不合群',
 '满腔热忱',
 '能言善道',
 '无趣',
 '虚怀若谷',
 '十分幽默',
 '洒脱随性',
 '没眼色',
 '豪爽大气',
 '诙谐风趣',
 '很亲近',
 '冷幽默',
 '梗直',
 '和祥',
 '高谈',
 '怕羞',
 '不亲近',
 '圆滑处事',
 '够仗义',
 '非常吃得开',
 '知世故',
 '善谈',
 '混得风生水起',
 '幽默',
 '特别贴心',
 '风趣',
 '向外开放',
 '透亮',
 '直爽',
 '和蔼可亲',
 '热情的态度',
 '赤忱',
 '比较活跃',
 '体贴周到',
 '小鬼头',
 '圆形方孔',
 '外向',
 '温文有礼',
 '迎合别人',
 '特别亲近',
 '率直',
 '性格大大咧咧',
 '豪放洒脱',
 '性情耿直',
 '圆滑',
 '口齿伶俐',
 '外方内圆',
 '侃侃而谈',
 '豪放不羁',
 '开朗活泼',
 '大暖男',
 '阔论',
 '不走心',
 '最贴心',
 '通彻',
 '奔放',
 '豪迈',
 '能主动',
 '十分亲切',
 '夸夸其谈',
 '谈判',
 '率性自然',
 '开朗',
 '温柔体贴',
 '太合群',
 '有趣的',
 '关心体贴',
 '合作关系',
 '人小鬼大',
 '憨直',
 '知机',
 '亲切和蔼',
 '欲擒故纵',
 '吉庆',
 '鬼丫头',
 '察言观色的能力',
 '长期合作',
 '活波',
 '很爽快',
 '世故圆滑',
 '剔透',
 '大大咧咧',
 '太亲近',
 '活泼可爱',
 '有幽默',
 '自知之明',
 '比较好说话',
 '懂人情世故',
 '会来事',
 '豪爽大方',
 '识趣点',
 '贴心温暖',
 '体贴',
 '走了心',
 '淡然自若',
 '不摆架子',
 '难说话',
 '性情

In [5]:
# Compute one centre (mean) vector per polarity from the seed words,
# skipping seed words that are missing from the embedding.


def _lookup_vectors(words, embedding):
    """Return the embedding vectors of every word in `words` that `embedding` knows.

    Words whose lookup raises KeyError are reported on stdout and skipped,
    so the returned list may be shorter than `words` (or empty).
    """
    found = []
    for word in words:
        try:
            found.append(embedding[word])
        except KeyError:
            print(f"Skipping word: {word}, not found in embedding.")
    return found


def _center_of(vectors, label):
    """Return the element-wise mean of `vectors`, or None when the list is empty.

    `label` ("positive" / "negative") is only used in the message printed
    when no vector is available.
    """
    if vectors:
        return np.mean(vectors, axis=0)
    print(f"No {label} seed words found in embedding.")
    return None


# Look up both seed lists first so all "Skipping word" messages come before
# any "No ... seed words" message.
positive_vectors = _lookup_vectors(positive_seed_words, tencent_embedding)
negative_vectors = _lookup_vectors(negative_seed_words, tencent_embedding)

# A centre vector is None when none of that polarity's seed words were found.
positive_center_vector = _center_of(positive_vectors, "positive")
negative_center_vector = _center_of(negative_vectors, "negative")

Skipping word: 直心眼儿, not found in embedding.
Skipping word: 率然, not found in embedding.
Skipping word: 笨口拙舌, not found in embedding.
Skipping word: 贫嘴薄舌, not found in embedding.
Skipping word: 狐朋狗党, not found in embedding.


In [6]:
# Accumulators for the labelling step:
#   results         - one [word, polarity, positive score, negative score] row per labelled word
#   positive_count  - number of words labelled "positive"
#   negative_count  - number of words labelled "negative"
results = list()
positive_count = negative_count = 0

In [7]:
# Label every unlabelled candidate word by comparing its cosine similarity
# to the positive and the negative seed centre vectors.

# Start from empty accumulators so re-running this cell does not append
# duplicate rows or double the counters.
results = []
positive_count = 0
negative_count = 0

if positive_center_vector is not None and negative_center_vector is not None:
    # Decide vocabulary membership once per word; the same flags drive both the
    # vector matrix and the labelling loop so the two cannot drift apart.
    in_vocab = [W in tencent_embedding for W in external_words]
    known_words = [W for W, found in zip(external_words, in_vocab) if found]

    if known_words:
        # One row per in-vocabulary word, in the order of `known_words`.
        external_vectors = np.array([tencent_embedding[W] for W in known_words])

        # Cosine similarity of each candidate to each centre; each result has shape (n_known, 1).
        positive_similarity = cosine_similarity(external_vectors, positive_center_vector.reshape(1, -1))
        negative_similarity = cosine_similarity(external_vectors, negative_center_vector.reshape(1, -1))

    # `row` indexes the similarity matrices and advances only for in-vocabulary
    # words. Indexing with the position in `external_words` would misalign the
    # scores (or raise IndexError) after the first out-of-vocabulary word.
    row = 0
    for W, found in zip(external_words, in_vocab):
        if not found:
            print(f"Word: {W} not found in embedding, skipping.")
            continue

        pos_score = positive_similarity[row][0]
        neg_score = negative_similarity[row][0]
        row += 1

        print(f"Word: {W}, Positive Similarity: {pos_score}, Negative Similarity: {neg_score}")

        # Ties (equal scores) fall through to "negative".
        if pos_score > neg_score:
            polarity = "positive"
            print(f"{W} is positive")
            positive_count += 1
        else:
            print(f"{W} is negative")
            polarity = "negative"
            negative_count += 1

        results.append([W, polarity, pos_score, neg_score])
else:
    # Without both centre vectors no comparison is possible; say so instead of
    # silently producing an empty result set.
    print("Polarity labelling skipped: a seed centre vector is missing.")

Word: 随性不羁, Positive Similarity: 0.5077546834945679, Negative Similarity: 0.45164915919303894
随性不羁 is positive
Word: 不会主动, Positive Similarity: 0.5228650569915771, Negative Similarity: 0.5389181971549988
不会主动 is negative
Word: 咋咋呼呼, Positive Similarity: 0.7011270523071289, Negative Similarity: 0.7391164302825928
咋咋呼呼 is negative
Word: 性格直爽, Positive Similarity: 0.7583307027816772, Negative Similarity: 0.6400808095932007
性格直爽 is positive
Word: 嘻嘻哈哈, Positive Similarity: 0.6893005967140198, Negative Similarity: 0.6917827129364014
嘻嘻哈哈 is negative
Word: 狂放, Positive Similarity: 0.6482868194580078, Negative Similarity: 0.6260992288589478
狂放 is positive
Word: 关系亲近, Positive Similarity: 0.6205608248710632, Negative Similarity: 0.5928951501846313
关系亲近 is positive
Word: 十分豪爽, Positive Similarity: 0.7026607990264893, Negative Similarity: 0.534868597984314
十分豪爽 is positive
Word: 口直心快, Positive Similarity: 0.7686547040939331, Negative Similarity: 0.7044286727905273
口直心快 is positive
Word: 口才了得, Po

In [8]:
# Turn the collected [word, polarity, positive score, negative score] rows into a table.
df_result = pd.DataFrame(
    results,
    columns=[
        "Word",
        "Polarity",
        "Positive Similarity",
        "Negative Similarity",
    ],
)

In [9]:
# Write the labelled words to an Excel workbook (without the DataFrame index column)
# and confirm the destination on stdout.
output_file = "Soc_polarity_results.xlsx"
df_result.to_excel(
    output_file,
    index=False,
)
print(f"Results saved to {output_file}")

Results saved to Soc_polarity_results.xlsx


In [10]:
# Report how many words were labelled with each polarity, one line per polarity.
print(
    f"Positive words count: {positive_count}",
    f"Negative words count: {negative_count}",
    sep="\n",
)

Positive words count: 458
Negative words count: 397
