In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import sys
import os
sys.path.insert(0, os.path.abspath('../'))
from src.hypergraphs import HeterogeneousHyperGraph
from src.components import FiveWOneH
from src import components

In [2]:
# Dataset selection: Headlines.
# This cell and the other dataset cells assign the same names
# (dataset, dic_when, dic_where, dic_who), so the last one executed wins.
dataset = 'Headlines'

# Extractor built from the raw CSV. The meaning of the 0.2 argument is defined
# by FiveWOneH and is not visible in this notebook.
threew = FiveWOneH(
    dataset,
    pd.read_csv('../datasets/Headlines.csv'),
    0.2,
)

# 'Who' and 'Where' dictionaries are generated from the data here ...
dic_who_headlines = threew.generate_dict('Who')
dic_where_headlines = threew.generate_dict('Where')

# ... while the 'When' dictionary is taken ready-made from src.components.
dic_when = components.dic_when_headlines
dic_where = dic_where_headlines
dic_who = dic_who_headlines

num_node_types = 6

In [3]:
# Dataset selection: FinCausal.
# All three dictionaries are taken ready-made from src.components.
dataset = 'FinCausal'

dic_when = components.dic_when_fincausal
dic_where = components.dic_where_fincausal
dic_who = components.dic_who_fincausal

num_node_types = 6

In [17]:
# Dataset selection: Twitter.
dataset = 'Twitter'

# Extractor built from the raw CSV. The meaning of the 0.2 argument is defined
# by FiveWOneH and is not visible in this notebook.
threew = FiveWOneH(
    dataset,
    pd.read_csv('../datasets/Twitter.csv'),
    0.2,
)

# Only the 'Who' dictionary is generated from the data; 'When' and 'Where'
# are taken ready-made from src.components.
dic_who_twitter = threew.generate_dict('Who')

dic_when = components.dic_when_twitter
dic_where = components.dic_where_twitter
dic_who = dic_who_twitter

num_node_types = 6

In [18]:
# Sentence encoder that is later passed to HeterogeneousHyperGraph.
# Alternative encoder, kept for reference (disabled):
#   SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
m = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2'
)

In [19]:
# Load the CSV of the selected dataset, drop rows with missing values and
# renumber the remaining rows from 0.
df = (
    pd.read_csv('../datasets/' + dataset + '.csv')
    .dropna()
    .reset_index(drop=True)
)

# NOTE: the column names really do start with a space (' Cause', ' Effect').
het_hyperG = HeterogeneousHyperGraph(
    ' Cause',
    ' Effect',
    df,
    m,
    dic_who,
    dic_when,
    dic_where,
    dataset,
)

# Main part of the hypergraph: edges, then node labels, then node embeddings.
het_hyperG.add_main_edges()
het_hyperG.add_main_node_labels()
het_hyperG.add_main_node_embeddings()

# Secondary part, populated in the same edges -> labels -> embeddings order.
het_hyperG.add_secundary_edges()
het_hyperG.add_secundary_node_labels()
het_hyperG.add_secundary_node_embeddings()

# The cells below index this result and use element 0 only.
graphs_kfold = het_hyperG.generate_kfold_graphs()

In [20]:
from scipy.spatial.distance import cosine
def _get_most_similar_embedding(emb, embeddings, ids):
    menor = 1
    for i in range(len(embeddings)):
        if i in ids: continue
        c = cosine(emb, embeddings[i])
        if c < menor:
            menor = c
            index = i
    return index    

In [8]:
from sklearn.neighbors import kneighbors_graph
import networkx as nx

# Work on the first fold only.
graph_fold = graphs_kfold[0]

# For each split: position -> node name, plus the embeddings in that order.
#   train     : relation nodes with train == 1
#   test_out  : relation nodes with train == 0 whose label is not 'causal'
#   test_int  : relation nodes with train == 0 whose label is 'causal'
index_to_node_train, embeddings_relation_train = {}, []
index_to_node_test_int, embeddings_relation_test_int = {}, []
index_to_node_test_out, embeddings_relation_test_out = {}, []

for node in graph_fold.nodes():
    if 'relation:' not in node:
        continue
    if graph_fold.nodes[node]['train'] == 1:
        embeddings_relation_train.append(graph_fold.nodes[node]['embedding'])
        index_to_node_train[len(embeddings_relation_train) - 1] = node
    elif graph_fold.nodes[node]['train'] == 0:
        if graph_fold.nodes[node]['label'] != 'causal':
            embeddings_relation_test_out.append(graph_fold.nodes[node]['embedding'])
            index_to_node_test_out[len(embeddings_relation_test_out) - 1] = node
        elif graph_fold.nodes[node]['label'] == 'causal':
            embeddings_relation_test_int.append(graph_fold.nodes[node]['embedding'])
            index_to_node_test_int[len(embeddings_relation_test_int) - 1] = node

count_train = len(embeddings_relation_train)
count_test_int = len(embeddings_relation_test_int)
count_test_out = len(embeddings_relation_test_out)

# 3-nearest-neighbour connectivity (cosine metric) among the training relations.
A = kneighbors_graph(
    embeddings_relation_train,
    n_neighbors=3,
    metric='cosine',
    mode='connectivity',
    include_self=False,
).toarray()

g = nx.Graph()

# One undirected edge per non-zero cell, visited row by row.
for i, row in enumerate(A):
    for j, connected in enumerate(row):
        if connected == 1:
            g.add_edge(index_to_node_train[i], index_to_node_train[j])

# Non-causal test relations: link each one to its two nearest peers in the
# same group (itself excluded).
for i, query in enumerate(embeddings_relation_test_out):
    index_most_similar = _get_most_similar_embedding(
        query, embeddings_relation_test_out, [i])
    index_most_similar2 = _get_most_similar_embedding(
        query, embeddings_relation_test_out, [i, index_most_similar])
    for neighbour in (index_most_similar, index_most_similar2):
        g.add_edge(index_to_node_test_out[i], index_to_node_test_out[neighbour])

# Causal test relations: link each one to its two nearest TRAINING relations.
# [-1] matches no position, i.e. nothing is excluded on the first lookup.
for i, query in enumerate(embeddings_relation_test_int):
    index_most_similar = _get_most_similar_embedding(
        query, embeddings_relation_train, [-1])
    index_most_similar2 = _get_most_similar_embedding(
        query, embeddings_relation_train, [index_most_similar])
    for neighbour in (index_most_similar, index_most_similar2):
        g.add_edge(index_to_node_test_int[i], index_to_node_train[neighbour])

# node_type: 0 for nodes labelled 'causal', 1 for everything else.
for node in g:
    if graph_fold.nodes[node]['label'] == 'causal':
        g.nodes[node]['node_type'] = 0
    else:
        g.nodes[node]['node_type'] = 1

In [48]:
from sklearn.neighbors import kneighbors_graph
import networkx as nx
from sklearn.cluster import KMeans
import numpy as np

# Work on the first fold only.
graph_fold = graphs_kfold[0]

# Position -> node name, plus embeddings in the same order, per split.
index_to_node_train, embeddings_relation_train = {}, []
index_to_node_test, embeddings_relation_test = {}, []

g = nx.Graph()
for node in graph_fold.nodes():
    if 'relation:' not in node:
        continue
    if graph_fold.nodes[node]['train'] == 1:
        embeddings_relation_train.append(graph_fold.nodes[node]['embedding'])
        index_to_node_train[len(embeddings_relation_train) - 1] = node
    elif graph_fold.nodes[node]['train'] == 0:
        embeddings_relation_test.append(graph_fold.nodes[node]['embedding'])
        index_to_node_test[len(embeddings_relation_test) - 1] = node

    # Every relation node becomes a vertex, even if it never gets an edge.
    g.add_node(node)

count_train = len(embeddings_relation_train)
count_test = len(embeddings_relation_test)

# 3-nearest-neighbour connectivity (cosine metric) among the training relations.
A = kneighbors_graph(
    embeddings_relation_train,
    n_neighbors=3,
    metric='cosine',
    mode='connectivity',
    include_self=False,
).toarray()

# Disabled experiment: an analogous kNN graph among the test relations
# (n_neighbors=2) that would have been added to g the same way.
#A2 = kneighbors_graph(embeddings_relation_test, n_neighbors=2, metric='cosine', mode='connectivity', include_self=False).toarray()

for i, row in enumerate(A):
    for j, weight in enumerate(row):
        if weight > 0:
            g.add_edge(index_to_node_train[i], index_to_node_train[j])

# Cluster each split separately with the same number of centres.
k = 2

kmeans_train = KMeans(n_clusters=k, random_state=0, n_init="auto")
kmeans_train.fit(np.array(embeddings_relation_train))

kmeans_test = KMeans(n_clusters=k, random_state=0, n_init="auto")
kmeans_test.fit(np.array(embeddings_relation_test))

# A test relation is attached only if its closest centre is nearer than this.
th = 0.5

for i, query in enumerate(embeddings_relation_test):
    # Smallest cosine distance to a centre of each split, capped at 1.
    menor_train = min([1] + [cosine(center, query) for center in kmeans_train.cluster_centers_])
    menor_test = min([1] + [cosine(center, query) for center in kmeans_test.cluster_centers_])

    if menor_train < th and menor_train < menor_test:
        # Closer to the training clusters: attach to the two nearest training
        # relations ([-1] matches no position, so nothing is excluded).
        j = _get_most_similar_embedding(query, embeddings_relation_train, [-1])
        j2 = _get_most_similar_embedding(query, embeddings_relation_train, [j])
        for neighbour in (j, j2):
            g.add_edge(index_to_node_test[i], index_to_node_train[neighbour])
    elif menor_test < th and menor_test < menor_train:
        # Closer to the test clusters: attach to the two nearest other test
        # relations.
        j = _get_most_similar_embedding(query, embeddings_relation_test, [i])
        j2 = _get_most_similar_embedding(query, embeddings_relation_test, [i, j])
        for neighbour in (j, j2):
            g.add_edge(index_to_node_test[i], index_to_node_test[neighbour])
    # Otherwise (tie, or both distances >= th) the node stays unattached.

# node_type: 0 for nodes labelled 'causal', 1 for everything else.
for node in g:
    if graph_fold.nodes[node]['label'] == 'causal':
        g.nodes[node]['node_type'] = 0
    else:
        g.nodes[node]['node_type'] = 1

In [23]:
from sklearn.neighbors import kneighbors_graph
import networkx as nx
from sklearn.cluster import KMeans
import numpy as np

# Work on the first fold only.
graph_fold = graphs_kfold[0]

# Position -> node name, plus embeddings in the same order, per split.
index_to_node_train, embeddings_relation_train = {}, []
index_to_node_test, embeddings_relation_test = {}, []

g = nx.Graph()
for node in graph_fold.nodes():
    if 'relation:' not in node:
        continue
    if graph_fold.nodes[node]['train'] == 1:
        embeddings_relation_train.append(graph_fold.nodes[node]['embedding'])
        index_to_node_train[len(embeddings_relation_train) - 1] = node
    elif graph_fold.nodes[node]['train'] == 0:
        embeddings_relation_test.append(graph_fold.nodes[node]['embedding'])
        index_to_node_test[len(embeddings_relation_test) - 1] = node

    # Every relation node becomes a vertex, even if it never gets an edge.
    g.add_node(node)

count_train = len(embeddings_relation_train)
count_test = len(embeddings_relation_test)

# 3-nearest-neighbour connectivity (cosine metric) among the training relations.
A = kneighbors_graph(
    embeddings_relation_train,
    n_neighbors=3,
    metric='cosine',
    mode='connectivity',
    include_self=False,
).toarray()
for i, row in enumerate(A):
    for j, weight in enumerate(row):
        if weight > 0:
            g.add_edge(index_to_node_train[i], index_to_node_train[j])

# Per-relation classes read from the results file of the selected dataset;
# the 'llm_class' column is looked up by (' Cause', ' Effect') further down.
df_llm = pd.read_csv('../results/' + dataset + '_gemma2:27b.csv')

def return_llm_class(df_llm, relation):
    """Look up the ``llm_class`` value stored in ``df_llm`` for a relation node.

    Parameters
    ----------
    df_llm : pandas.DataFrame
        Table with the columns ``' Cause'``, ``' Effect'`` (both with a
        leading space) and ``'llm_class'``.
    relation : str
        Node name of the form
        ``'relation: First event: - <cause> || Second event: - <effect>'``.

    Returns
    -------
    The ``llm_class`` value of the first row whose cause and effect both
    match the two events parsed from ``relation``.

    Raises
    ------
    IndexError
        If ``relation`` contains no ``' || '`` separator, or if no row of
        ``df_llm`` matches the parsed cause/effect pair. The message names
        the offending relation so the mismatch can be traced.
    """
    # Split once; the first two pieces carry the two events.
    parts = relation.split(' || ')
    if len(parts) < 2:
        raise IndexError(
            f"relation node name has no ' || ' separator: {relation!r}"
        )
    e1 = parts[0].replace('relation: First event: - ', '')
    e2 = parts[1].replace('Second event: - ', '')

    matches = df_llm.loc[
        (df_llm[' Cause'] == e1) & (df_llm[' Effect'] == e2),
        'llm_class',
    ]
    if matches.empty:
        # Raised explicitly so the error names the pair instead of surfacing
        # as a bare positional-indexer failure from pandas.
        raise IndexError(
            f"no row in df_llm for cause {e1!r} and effect {e2!r}"
        )
    llm_class = matches.iloc[0]
    return llm_class

# Class of every test relation, looked up once. The neighbour search below
# needs the class of many candidates, and each lookup filters the DataFrame.
llm_classes_test = [
    return_llm_class(df_llm, index_to_node_test[t])
    for t in range(len(embeddings_relation_test))
]

for i in range(len(embeddings_relation_test)):
    llm_class = llm_classes_test[i]
    if llm_class == 'causal':
        # Classed as causal: attach to the two nearest TRAINING relations.
        j = _get_most_similar_embedding(embeddings_relation_test[i], embeddings_relation_train, [])
        j2 = _get_most_similar_embedding(embeddings_relation_test[i], embeddings_relation_train, [j])
        g.add_edge(index_to_node_test[i], index_to_node_train[j])
        g.add_edge(index_to_node_test[i], index_to_node_train[j2])
    elif llm_class == 'non_causal':
        # Classed as non-causal: attach to the two nearest other TEST relations
        # that are also classed as non-causal.
        l_out = [i]   # positions already examined (never revisited)
        l_in = []     # accepted neighbours
        # Stop after two neighbours OR once every test relation has been
        # examined; without the second condition the search runs out of
        # candidates and fails when fewer than two peers qualify.
        while len(l_in) < 2 and len(l_out) < len(embeddings_relation_test):
            j = _get_most_similar_embedding(embeddings_relation_test[i], embeddings_relation_test, l_out)
            llm_class2 = llm_classes_test[j]
            if llm_class2 == 'non_causal':
                l_in.append(j)
            l_out.append(j)

        # Zero, one or two edges, depending on how many peers qualified.
        for j in l_in:
            g.add_edge(index_to_node_test[i], index_to_node_test[j])

# node_type: 0 for nodes labelled 'causal', 1 for everything else.
for node in g:
    g.nodes[node]['node_type'] = 0 if graph_fold.nodes[node]['label'] == 'causal' else 1

In [24]:
from src.utils import show_graph

# Visualise whichever graph the last executed graph-building cell left in `g`.
g_rel = g

# Get vertex coordinates for the visualisation (fixed seed for a repeatable layout).
pos = nx.spring_layout(g_rel, seed=42)

# Store each vertex's coordinates on the node itself under the 'pos' key.
for node, coordinates in pos.items():
    g_rel.nodes[node]['pos'] = coordinates

show_graph(g_rel)