In [1]:
import os
import sys
import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics as metrics
from matplotlib import pyplot as plt
import seaborn as sns
from collections import Counter

from scipy.stats import kstest, ks_2samp, mannwhitneyu
from scipy.stats import ttest_ind,ttest_rel

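# Analyse how the distribution of time perception ("Fast" vs "Slow") differs across emotions.
# Question addressed: does the time perception users express differ under different emotions?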

----
# Load data

In [3]:
# Prediction outputs consumed by this notebook (the multitask valence file is left disabled).
# multiv = pd.read_csv("pred_results/valence_multitask_pred.csv")

# Valence prediction results.
singlev = pd.read_csv("pred_results/valence_singletask8_pred.csv")

# Discrete-emotion results from the similarity-based method.
sim8 = pd.read_csv("./pred_results/8mood_w2v_simscore.csv")

# Discrete-emotion results from the count-based method.
cnt8 = pd.read_csv("./pred_results/8mood_cnt_normscore.csv")

In [167]:
# (rows, columns) of the valence table alongside the number of distinct users.
singlev.shape, singlev["user_id"].nunique()

((44568, 24), 42353)

In [169]:
# Number of posts per time-perception label.
singlev["time"].value_counts()

Fast    40723
Slow     3845
Name: time, dtype: int64

In [168]:
# Number of distinct users per time-perception label.
singlev.groupby("time")["user_id"].nunique()

time
Fast    38708
Slow     3806
Name: user_id, dtype: int64

In [5]:
# Keep only posts created on or after 2020-01-20 (the comparison is inclusive).
is_valid_date = singlev["created_at"] >= '2020-01-20'
valid_ids = singlev.loc[is_valid_date, "id"].to_numpy()

In [6]:
def _filter_to_valid_ids(frame):
    """Return a re-indexed copy of `frame` holding only rows whose `id` is in `valid_ids`."""
    keep = frame["id"].isin(valid_ids)
    return frame.loc[keep].reset_index(drop=True).copy()


# Apply the same id filter to all three prediction tables.
singlev = _filter_to_valid_ids(singlev)
sim8 = _filter_to_valid_ids(sim8)
cnt8 = _filter_to_valid_ids(cnt8)

In [119]:
# (rows, columns) of sim8 at this point, i.e. after the valid_ids filter cell above.
sim8.shape

(38869, 15)

In [121]:
# 38869 is the sim8 row count displayed just above; 44568 matches the singlev row count
# displayed elsewhere in this notebook. Presumably the share of posts kept by the date
# filter -- NOTE(review): hard-coded values, confirm they are still current after a re-run.
38869/44568

0.8721279842039131

In [8]:
# (rows, columns) of singlev at this point in the notebook.
singlev.shape

(44568, 21)

In [9]:
# Emotion score column names (Chinese) and their English labels, in matching order.
moods = [
    '喜爱',
    '快乐',
    '期望',
    '惊讶',
    '厌恶',
    '悲伤',
    '愤怒',
    '焦虑',
]
moods_eng = [
    "love",
    "joy",
    'hope',
    'surprise',
    "disgust",
    "sad",
    "angry",
    "anxiety",
]

In [10]:
# Predicted emotion = position (within `moods`) of the column with the highest score.
sim_scores = sim8[moods].to_numpy()
sim8["pred_mood"] = np.argmax(sim_scores, axis=1)

In [11]:
# With the count-based method several emotions can share the same top score.
# Keep only posts whose top score is shared by at most two emotions, and pick one of
# the tied emotions at random as the label for that post.
max_score = cnt8[moods].max(axis=1)

# Boolean matrix: True where an emotion's score equals the row maximum.
is_top = cnt8[moods].to_numpy() == max_score.to_numpy().reshape(-1, 1)
# Number of emotions tied for the maximum in each row.
n_top = is_top.sum(axis=1)

print((n_top == 1).mean())   # share of posts with a unique top emotion
print(Counter(n_top))        # distribution of tie sizes

cnt8["max_num"] = n_top

np.random.seed(0)
# For each row, the descending argsort of the boolean mask puts the tied positions first;
# slicing to the tie count leaves exactly those candidates, one of which is drawn at random.
pred_mood = [
    np.random.choice(row.argsort()[::-1][:row.sum()])
    for row in is_top
]

# Attach the label, then keep posts with two or fewer tied top emotions.
cnt8["pred_mood"] = pred_mood
cnt8 = cnt8.loc[cnt8.max_num < 3].reset_index(drop=True)

0.6788054209298151
Counter({1: 30253, 2: 6209, 8: 5885, 3: 1687, 4: 438, 5: 79, 6: 15, 7: 2})


In [10]:
# Output folder for figures; created if it does not exist yet.
# NOTE(review): `figdir` is reassigned by a later cell in this notebook, so on a
# top-to-bottom run this value is overwritten before any figure is saved.
figdir = "./figures_Febstart_0402/"
os.makedirs(figdir, exist_ok=True)

In [104]:
# Output folder for figures; created if it does not exist yet.
figdir = "./figures_Febstart_0702_percentage/"
os.makedirs(figdir, exist_ok=True)

In [11]:
# Number of posts per time-perception label.
singlev["time"].value_counts()

Fast    40723
Slow     3845
Name: time, dtype: int64

In [126]:
# Write the columns needed downstream to ./Final_data/ (created on demand).
os.makedirs("./Final_data/", exist_ok=True)

discrete_cols = ['id', 'time', 'pred_mood', 'month', 'date', 'week']
sim8[discrete_cols].to_csv("Final_data/discrete_emotion.csv", index=False)

valence_cols = [
    'id', 'user_id', 'timestamp', 'time', 'model_reweight_c',
    'month', 'date', 'week', 'template', 'keyword',
]
singlev[valence_cols].to_csv("Final_data/valence_emotion.csv", index=False)

------


In [12]:
# Agreement between the similarity-based (sim) and count-based (cnt) emotion labels.
# Pair the two labels for every post present in both tables (inner join on the post id).
all8 = cnt8[["id", "pred_mood"]].rename(columns={"pred_mood": "cnt_mood"}).merge(
        sim8[["id", "pred_mood"]].rename(columns={"pred_mood": "sim_mood"}), on="id")

# Cross-tabulate cnt labels (rows) against sim labels (columns).
# Both axes are reindexed to the full range of mood codes before relabelling: the labels
# below are assigned by position, which needs exactly len(moods_eng) rows and columns in
# code order and would otherwise raise if some mood were never predicted by a method.
mood_codes = list(range(len(moods_eng)))
eight_matrix = (
    all8.groupby("cnt_mood").sim_mood.value_counts()
    .unstack()
    .reindex(index=mood_codes, columns=mood_codes)
    .fillna(0)
)
eight_matrix.columns,eight_matrix.index = ["sim_"+m for m in moods_eng],["cnt_"+m for m in moods_eng]
display(eight_matrix.astype(int))
#sns.heatmap(eight_matrix)

Unnamed: 0,sim_love,sim_joy,sim_hope,sim_surprise,sim_disgust,sim_sad,sim_angry,sim_anxiety
cnt_love,4488,562,389,29,11,223,17,61
cnt_joy,512,6699,895,132,19,258,16,114
cnt_hope,759,1184,10094,91,47,748,47,430
cnt_surprise,36,69,64,1059,6,177,16,42
cnt_disgust,25,16,19,2,164,21,13,16
cnt_sad,113,143,144,15,10,2531,21,95
cnt_angry,20,26,40,2,5,35,150,18
cnt_anxiety,303,133,141,27,9,233,15,1885
