## Note

This notebook computes the sentiment distribution scores of the predicted emo-denoting words, based on their distance to the top words representing each cluster.

In [1]:
import pandas as pd 
import networkx as nx
import numpy as np
from scipy.special import softmax
from collections import defaultdict
from networkx.algorithms.shortest_paths.generic import shortest_path_length as spl

In [10]:
## Build the undirected synonym network.
##
## Inputs configured in this cell:
##   edge_list_path          - CSV holding the edge list
##   node_list_path          - CSV holding the node list
##   gelphi_output_csv_path  - CSV exported from Gephi
##   emo_pred_file_path      - prediction file from the Emo-denoting Prediction
##                             notebook (without aggregated + corpus)

### input / output locations
edge_list_path = "../source_data/second_iteration_edge_list_updated.csv"
node_list_path = "../source_data/second_iteration_nodes_updated.csv"
gelphi_output_csv_path = "../source_data/Gelphi_output_1.csv"
output_folder = "../output/"
emo_pred_file_path = "pred_with_rfc.csv"

### number of top words per modularity_class used to represent that class
top_k_words = 10

### load the edge and node tables, then build the network
### (nx.Graph is an undirected graph; the 'weight' column is kept as an edge attribute)
Graphtype = nx.Graph()
edge_list = pd.read_csv(edge_list_path)
node_list = pd.read_csv(node_list_path)
syn_G = nx.from_pandas_edgelist(
    edge_list,
    edge_attr='weight',
    create_using=Graphtype,
)

In [7]:
## Read the Gephi output table; later cells use its 'Id', 'Label',
## 'Authority' and 'modularity_class' columns.
network = pd.read_csv(gelphi_output_csv_path)
## Index the rows by node Id while keeping 'Id' available as a regular column.
network = network.set_index('Id', drop=False)
## Id -> Label lookup, used to rename the graph nodes from ids to words.
mapping = network['Label'].to_dict()
## copy=False relabels syn_G in place, so G and syn_G refer to the same graph.
G = nx.relabel_nodes(syn_G, mapping, copy=False)

In [8]:

## order all nodes by descending Authority, then keep the first top_k_words
## rows of every modularity_class
ranked_by_authority = network.sort_values('Authority', ascending=False)
selected_top = ranked_by_authority.groupby('modularity_class').head(top_k_words)
## dictionary: cluster number -> list of the words representing that cluster
labels_by_cluster = selected_top.groupby('modularity_class')['Label']
selected_top_dic = labels_by_cluster.agg(list).to_dict()
### load the emo_pred output written by the Emo-denoting prediction notebook
pred = pd.read_csv(output_folder + emo_pred_file_path)

In [12]:
## display the loaded prediction table (notebook cell output)
pred

Unnamed: 0,self_auth,self_deg,self_betcent,avg_pred_auth,avg_pred_deg,avg_pred_betcent,word,emo?,textid
0,0.013005,398.0,0.000000,0.001089,87.00,13923.030250,good_1,1.0,0
1,-1.000000,-1.0,-1.000000,0.000561,66.75,11691.404453,Stooge_2,0.0,0
2,0.101399,226.0,0.000000,0.030168,315.00,25551.916316,lovely_7,1.0,0
3,0.001230,130.0,776.623589,0.025934,102.50,29467.071750,evil_9,1.0,0
4,0.000318,10.0,0.000000,0.004475,131.75,27115.451270,time_13,0.0,0
...,...,...,...,...,...,...,...,...,...
111,0.016628,266.0,0.000000,0.000066,32.00,5451.389329,way_21,0.0,9
112,0.000010,6.0,3307.571201,0.004167,140.80,429.976794,movie_23,0.0,9
113,-1.000000,-1.0,-1.000000,0.001083,88.00,0.000000,randomly_27,0.0,9
114,0.001007,47.0,0.000000,0.001297,20.00,149.284191,silly_28,0.0,9


In [14]:
def emo_distribution_cal(top_cluster_word_dict=selected_top_dic, source=None, network_Graph=G, row=None):
    """Return the softmax distribution of a word's closeness to each cluster.

    For every cluster, the shortest-path length from the source word to each
    of the cluster's top words is averaged over the top words that could be
    reached.  The cluster score is the reciprocal of that average (shorter
    distance => closer relationship), or 0 when no top word was reachable.
    The per-cluster scores are then passed through ``softmax``.

    Parameters
    ----------
    top_cluster_word_dict : dict
        Maps a cluster key to the list of top words representing it.
    source : str, optional
        Node label to score.  Only used when ``row`` is None; when ``row`` is
        given, the source is derived from ``row['word']`` instead.
    network_Graph : networkx graph
        Graph in which the shortest paths are measured.
    row : mapping, optional
        Record providing ``'emo?'`` and ``'word'`` (e.g. a DataFrame row).

    Returns
    -------
    numpy.ndarray or None
        One probability per entry of ``top_cluster_word_dict`` (in the dict's
        iteration order), or None when ``row['emo?']`` is falsy.

    Raises
    ------
    TypeError
        If neither ``row`` nor ``source`` is provided.
    """
    if row is not None:
        if not row['emo?']:
            # Words not flagged as emotion-denoting get no distribution.
            return None
        # Strip only the trailing "_<suffix>" so that tokens which themselves
        # contain an underscore are kept intact.
        source = row['word'].rsplit('_', 1)[0]
    elif source is None:
        raise TypeError("either 'row' or 'source' must be provided")

    emo_score_dist = []
    for top_words in top_cluster_word_dict.values():
        # Distances from the source to every reachable top word of this cluster.
        distances = []
        for top_word in top_words:
            try:
                # spl retrieves the shortest distance from the source to the
                # top_word in the network.
                distances.append(spl(network_Graph, source, top_word))
            except (nx.NodeNotFound, nx.NetworkXNoPath):
                # Either the source or the top_word is not in the network, or
                # they are not connected => leave this top word out.
                continue

        if distances:
            avg_distance = sum(distances) / len(distances)
            # NOTE(review): avg_distance is 0 when the only reachable top word
            # is the source itself, which raises ZeroDivisionError here --
            # confirm which score that case should receive.
            emo_score_dist.append(1 / avg_distance)
        else:
            emo_score_dist.append(0)

    return softmax(emo_score_dist)


In [15]:
### emotion score distribution over all the clusters, computed row by row
total_emo_res = pred.apply(
    lambda record: emo_distribution_cal(row=record),
    axis=1,
)
pred["emo_dist_prob"] = total_emo_res

def emo_cluster_assignment(row):
    """Order the cluster positions of a row from most to least probable.

    Returns 0 when ``row['emo?']`` is not equal to 1.  Otherwise returns the
    positions of ``row['emo_dist_prob']`` sorted by descending probability.
    """
    if row['emo?'] != 1:
        return 0
    probabilities = np.array(row['emo_dist_prob'])
    ascending_positions = probabilities.argsort()
    # reverse the ascending order so the highest probability comes first
    return ascending_positions[::-1]

### rank the clusters of every row by probability
pred['emo_dist_cluster_order'] = pred.apply(emo_cluster_assignment, axis=1)
### write the results to disk (without the DataFrame index)
pred.to_csv('../output/emo_assignment.csv', index=False)

In [3]:
## re-load the saved assignment file to inspect what was written
pd.read_csv('../output/emo_assignment.csv')

Unnamed: 0,self_auth,self_deg,self_betcent,avg_pred_auth,avg_pred_deg,avg_pred_betcent,word,emo?,textid,emo_dist_prob,emo_dist_cluster_order
0,0.013005,398.0,0.000000,0.001089,87.00,13923.030250,good_1,1.0,0,[0.03834712 0.03869042 0.03869042 0.03905854 0...,[18 19 20 17 3 11 25 1 22 2 7 24 4 0 13 ...
1,-1.000000,-1.0,-1.000000,0.000561,66.75,11691.404453,Stooge_2,0.0,0,,0
2,0.101399,226.0,0.000000,0.030168,315.00,25551.916316,lovely_7,1.0,0,[0.03797827 0.03764128 0.0373263 0.03872806 0...,[18 11 4 25 6 8 17 3 14 22 23 19 20 16 13 ...
3,0.001230,130.0,776.623589,0.025934,102.50,29467.071750,evil_9,1.0,0,[0.03787994 0.03813641 0.03813641 0.03840861 0...,[17 16 8 19 6 11 18 13 25 3 7 9 10 5 4 ...
4,0.000318,10.0,0.000000,0.004475,131.75,27115.451270,time_13,0.0,0,,0
...,...,...,...,...,...,...,...,...,...,...,...
111,0.016628,266.0,0.000000,0.000066,32.00,5451.389329,way_21,0.0,9,,0
112,0.000010,6.0,3307.571201,0.004167,140.80,429.976794,movie_23,0.0,9,,0
113,-1.000000,-1.0,-1.000000,0.001083,88.00,0.000000,randomly_27,0.0,9,,0
114,0.001007,47.0,0.000000,0.001297,20.00,149.284191,silly_28,0.0,9,,0
