In [1]:
import pandas as pd
import numpy as np
import scipy.spatial.distance
from gensim.models import Word2Vec

In [2]:
# Cleaned tweets of group 0 (one of the 7 sub-groups, per the directory name).
# NOTE: this is an absolute, machine-specific path; adjust it before re-running elsewhere.
tweets_csv_path = '/Users/shiwei/Desktop/研究论文/一带一路回音室/7个子群的用户以及推文/（清洗后）第0组的推文.csv'
df = pd.read_csv(tweets_csv_path)

In [3]:
# Display the loaded frame to inspect its columns.
df

Unnamed: 0,username,text,sumpopular,neg,neu,pos,compound
0,ivbaoshi,build emerge asia fearful decline west,0.039527,0.000,0.859,0.141,0.3875
1,Winslow_R,read analysis investment trade innovation emer...,0.019764,0.000,0.860,0.140,0.3818
2,RyskeldiSatke,rid great pic kazakhstan centralasia,0.019764,0.000,0.709,0.291,0.6249
3,larouchepac,syria crisis postpone sco focus,0.037407,0.396,0.604,0.000,-0.7096
4,SebPerimony,jinping foreign policy energy security ideolog...,0.000000,0.000,0.757,0.243,0.5423
...,...,...,...,...,...,...,...
22865,Eusebius64,news eir daily alert service sit december 2022...,0.020393,0.000,0.776,0.224,0.8779
22866,deepmpolar,reject financial capitalism first focus infras...,0.247888,0.091,0.769,0.140,0.2023
22867,LBenabdallah,next chi speaker lina benabdallah deliver talk...,0.072693,0.000,0.928,0.072,0.4939
22868,YRanaraja,sometimes refer ambitious infrastructure proje...,0.153755,0.000,0.892,0.108,0.5256


In [4]:
# Load the saved Word2Vec model from disk (only needed if `model` is not already in memory).
word2vec_model_path = "500Dword2vec.model"
model = Word2Vec.load(word2vec_model_path)

In [5]:
# Turn one tweet into a single vector by averaging its word vectors.
def get_post_vector(post, model):
    """Return the unweighted mean of the word vectors of `post`.

    Args:
        post: Text of one post; it is split on whitespace (assumed to be
            already tokenized/cleaned).
        model: Object exposing `wv` (supports `word in wv` and `wv[word]`)
            and `vector_size`, e.g. a gensim Word2Vec model.

    Returns:
        The element-wise mean of the vectors of all in-vocabulary words, or
        an all-zero vector of length `model.vector_size` when none of the
        words is in the vocabulary (including an empty post).
    """
    collected = []
    for token in post.split():
        # Words missing from the vocabulary are silently skipped.
        if token in model.wv:
            collected.append(model.wv[token])

    if not collected:
        # No known word at all: fall back to the zero vector.
        return np.zeros(model.vector_size)

    return np.mean(collected, axis=0)

In [6]:
# Embed every post in the "text" column of df and store the result in "post_vector".
# Missing texts are replaced with "" BEFORE the string cast: astype(str) alone turns
# NaN into the literal token "nan", which would then be looked up in the vocabulary
# as if it were a real word. An empty string yields the zero vector instead.
df['post_vector'] = df['text'].fillna('').astype(str).apply(lambda x: get_post_vector(x, model))

In [7]:
# Display the frame again to check the newly added "post_vector" column.
df

Unnamed: 0,username,text,sumpopular,neg,neu,pos,compound,post_vector
0,ivbaoshi,build emerge asia fearful decline west,0.039527,0.000,0.859,0.141,0.3875,"[-0.19832146, 0.2352466, -0.22543992, -0.48205..."
1,Winslow_R,read analysis investment trade innovation emer...,0.019764,0.000,0.860,0.140,0.3818,"[-0.597564, -0.027605435, 0.076412424, -0.3042..."
2,RyskeldiSatke,rid great pic kazakhstan centralasia,0.019764,0.000,0.709,0.291,0.6249,"[-0.15182252, 0.0687624, -0.10182159, -0.05943..."
3,larouchepac,syria crisis postpone sco focus,0.037407,0.396,0.604,0.000,-0.7096,"[-0.17775272, 0.108363286, -0.0019052625, 0.01..."
4,SebPerimony,jinping foreign policy energy security ideolog...,0.000000,0.000,0.757,0.243,0.5423,"[-0.24528721, -0.078924894, -0.32499433, -0.59..."
...,...,...,...,...,...,...,...,...
22865,Eusebius64,news eir daily alert service sit december 2022...,0.020393,0.000,0.776,0.224,0.8779,"[-0.17919934, 0.51183695, 0.20959733, 0.086956..."
22866,deepmpolar,reject financial capitalism first focus infras...,0.247888,0.091,0.769,0.140,0.2023,"[-0.5059119, -0.0693976, -0.24547595, 0.151870..."
22867,LBenabdallah,next chi speaker lina benabdallah deliver talk...,0.072693,0.000,0.928,0.072,0.4939,"[-0.06509728, -0.21845277, 0.45346275, -0.2209..."
22868,YRanaraja,sometimes refer ambitious infrastructure proje...,0.153755,0.000,0.892,0.108,0.5256,"[-0.32556713, -0.34609717, -0.5087717, -0.0366..."


In [8]:
# Compute the average content similarity within each sub-community

In [9]:
def np_cosine_similarity(x, y):
    """Return the cosine similarity between two vectors.

    Args:
        x: First vector (array-like; assumed 1-D and the same length as `y`).
        y: Second vector.

    Returns:
        dot(x, y) / (||x|| * ||y||). If either vector has zero norm the
        similarity is undefined (0/0); 0.0 is returned in that case instead
        of NaN, so that a single all-zero embedding cannot turn an average
        over many similarities into NaN.
    """
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    if denominator == 0:
        # At least one vector is all zeros: define the similarity as 0.
        return 0.0

    return np.dot(x, y) / denominator

In [10]:
# Spot-check the helper on two posts, selected by index labels 51 and 450.
np_cosine_similarity(df.loc[51, 'post_vector'], df.loc[450, 'post_vector'])

0.023369452

In [11]:
# Start computing the community's average content similarity

In [12]:
from itertools import combinations

In [13]:
# Build every unordered pair of entries taken from the "username" column.
# `user` holds one entry per DataFrame row, so a username that occurs on several
# rows is repeated in the list and can end up paired with itself.
# NOTE(review): list(...) materializes all C(n, 2) pairs in memory (261,507,015
# tuples in the recorded run) although only a random sample of them is used
# afterwards; consider sampling pairs directly instead of building the full list.
user = df["username"].tolist()
user_pairs = list(combinations(user, 2))

In [14]:
# Total number of candidate pairs.
len(user_pairs)

261507015

# There are too many pairs, so a random subset is drawn for the content-similarity computation

In [15]:
import random

In [16]:
# Draw 50,000 pairs uniformly at random (with replacement) into a new list.
# A dedicated generator with a fixed seed is used so that the sample, and the
# mean similarity computed from it, is reproducible across kernel restarts and
# does not depend on the state of the global `random` module.
pair_rng = random.Random(42)
sample_data = pair_rng.choices(user_pairs, k=50000)

In [17]:
len(sample_data) # size of the random sample now stored in sample_data

50000

In [18]:
# Accumulator for the per-pair cosine similarities.
result_data = []

In [19]:
# Compute the cosine similarity of every sampled username pair.
#
# A username is resolved to the post vector of its FIRST row in df. The
# username -> vector mapping is built once here, so each pair costs two dict
# lookups instead of two boolean-mask scans over the whole DataFrame.
# NOTE(review): if a username occurs on several rows, only its first post is
# ever compared, and a pair made of the same username twice compares a vector
# with itself. Confirm this is the intended notion of "content similarity".
first_vector_by_user = {}
for name, vector in zip(df["username"], df["post_vector"]):
    # setdefault keeps the first vector seen for each username.
    first_vector_by_user.setdefault(name, vector)

# Start from an empty list so that re-running this cell does not append a
# second copy of the results to the previous ones.
result_data = []
for layer1, layer2 in sample_data:
    # Look up the post vectors of both members of the pair.
    vector_layer1 = first_vector_by_user[layer1]
    vector_layer2 = first_vector_by_user[layer2]

    # Text similarity of the two posts.
    similarity = np_cosine_similarity(vector_layer1, vector_layer2)

    # Collect the result.
    result_data.append(similarity)

In [20]:
# Mean cosine similarity over all sampled pairs.
np.mean(result_data)

0.17652969