In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from src.hypergraphs import HeterogeneousHyperGraph
from src.components import FiveWOneH
from src import components

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configuration for the 'Headlines' dataset. Run exactly one of the dataset
# configuration cells before building the hypergraph.
dataset = 'Headlines'
num_node_types = 6

df = pd.read_csv('./datasets/' + dataset + '.csv')
threew = FiveWOneH(dataset, df, 0.2)

# 'Who' and 'Where' dictionaries are generated from the CSV; 'When' is taken
# from the dictionary shipped in src.components.
dic_who_headlines = threew.generate_dict('Who')
dic_where_headlines = threew.generate_dict('Where')

dic_when = components.dic_when_headlines
dic_where = dic_where_headlines
dic_who = dic_who_headlines

In [2]:
# Configuration for the 'Risk' dataset: all three dictionaries come from
# src.components.
dataset = 'Risk'
num_node_types = 5

dic_when = components.dic_when_risk
dic_where = components.dic_where_risk
dic_who = components.dic_who_risk

In [2]:
# Configuration for the 'FinCausal' dataset: all three dictionaries come from
# src.components.
dataset = 'FinCausal'
num_node_types = 6

dic_when = components.dic_when_fincausal
dic_where = components.dic_where_fincausal
dic_who = components.dic_who_fincausal

In [2]:
# Configuration for the 'Twitter' dataset. Run exactly one of the dataset
# configuration cells before building the hypergraph.
dataset = 'Twitter'
num_node_types = 6

df = pd.read_csv('./datasets/' + dataset + '.csv')
threew = FiveWOneH(dataset, df, 0.2)

# Only the 'Who' dictionary is generated from the CSV; 'When' and 'Where' are
# taken from the dictionaries shipped in src.components.
dic_who_twitter = threew.generate_dict('Who')

dic_when = components.dic_when_twitter
dic_where = components.dic_where_twitter
dic_who = dic_who_twitter

In [3]:
# Sentence encoder handed to the hypergraph builder.
m = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# (Re)load the CSV of the dataset selected in the configuration cell.
df = pd.read_csv('./datasets/' + dataset + '.csv')

# NOTE: the two column names really do start with a space (' Cause', ' Effect').
het_hyperG = HeterogeneousHyperGraph(
    ' Cause',
    ' Effect',
    df,
    m,
    dic_who,
    dic_when,
    dic_where,
    dataset,
)

# Populate the graph: "main" edges, labels and embeddings first ...
het_hyperG.add_main_edges()
het_hyperG.add_main_node_labels()
het_hyperG.add_main_node_embeddings()

# ... then their "secundary" counterparts (spelling follows the project API).
het_hyperG.add_secundary_edges()
het_hyperG.add_secundary_node_labels()
het_hyperG.add_secundary_node_embeddings()

# Collection of graphs that run() iterates over, one per fold.
graphs_kfold = het_hyperG.generate_kfold_graphs()

In [4]:
# Project model, its one-class loss, and a plotting helper.
from src.eCHOLGA import HeterogeneousGNNModel
from src.eCHOLGA import one_class_loss
from src.utils import plot_confusion_matrix
import torch
from torch import nn
import random
import warnings
from sklearn.exceptions import UndefinedMetricWarning
# Silence sklearn's undefined-metric warnings and all UserWarnings for the
# rest of the session. The filters are installed before the next import, so
# they also apply to anything that import emits.
warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)
warnings.filterwarnings(action='ignore', category=UserWarning)
from sklearn.metrics import classification_report

def run(ep, ra, of, rf, nt, ofi, rfi, nti, device=None):
    """Train and evaluate the heterogeneous GNN under three loss scenarios.

    For every scenario a fresh ``HeterogeneousGNNModel`` is trained on each
    graph of the notebook-level ``graphs_kfold``; the macro-average F1 of each
    fold is collected and the mean / standard deviation over folds is printed
    as ``scenario dataset mean std``.

    Scenarios differ only in which terms are summed into the training loss:

    * ``'1'``: one-class loss
    * ``'2'``: one-class loss + reconstruction loss
    * ``'3'``: one-class loss + reconstruction loss + node-type cross-entropy

    The term weights follow a schedule that is advanced after the optimizer
    step of every 100th epoch (epoch 0 included):

    * one-class weight  = ``min(of, ocl_factor)``, ``ocl_factor`` starts at 0
      and grows by ``ofi``;
    * reconstruction weight = ``min(rf, rec_factor)``, ``rec_factor`` starts
      at 0 and grows by ``rfi``;
    * node-type weight = ``max(nt, nt_factor)``, ``nt_factor`` starts at 1 and
      shrinks by ``nti`` (never below 0).

    Args:
        ep: Number of training epochs per fold (must be >= 1).
        ra: Radius passed to ``HeterogeneousGNNModel``.
        of: Upper bound of the one-class loss weight.
        rf: Upper bound of the reconstruction loss weight.
        nt: Lower bound of the node-type loss weight.
        ofi: Increment of the one-class weight per schedule step.
        rfi: Increment of the reconstruction weight per schedule step.
        nti: Decrement of the node-type weight per schedule step.
        device: Device string handed to the model and used for the label
            tensor. Defaults to ``'cuda:0'`` when CUDA is available, otherwise
            ``'cpu'``. Assumes ``HeterogeneousGNNModel`` accepts any torch
            device string; only ``'cuda:0'`` was exercised before. TODO confirm.

    Returns:
        None. Results are printed.

    Raises:
        ValueError: If ``ep`` is smaller than 1 (there would be no node
            representation to evaluate).

    Notes:
        Reads the notebook globals ``graphs_kfold``, ``num_node_types`` and
        ``dataset``; the dataset configuration and graph-building cells must
        have been executed first.
    """
    if ep < 1:
        raise ValueError(f"ep must be at least 1, got {ep!r}")
    if device is None:
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    lr, dim = 0.008, 3
    # Stateless, so a single instance can be shared by every fold and epoch.
    criterion = nn.CrossEntropyLoss()

    for scenario in ('1', '2', '3'):
        # Re-seed per scenario so each one starts from the same RNG state.
        np.random.seed(42)
        torch.manual_seed(42)
        random.seed(42)
        torch.cuda.manual_seed_all(42)

        l_f1 = []
        for g_obs in graphs_kfold:
            #het_hyperG.add_relation_edges_pseudo_labels(4, g_obs, llm_pseudo_label, system_prompt, user_prompt)
            # 384 is passed through unchanged; presumably the input feature
            # size of the sentence embeddings. TODO confirm.
            heterogeneous_model = HeterogeneousGNNModel(device, lr, ra, dim, num_node_types, g_obs, 384)

            node_to_index = {node: index for index, node in enumerate(g_obs.nodes())}

            # The targets never change during training: build and upload once
            # per fold instead of once per epoch.
            node_type_labels = [g_obs.nodes[node]['node_type'] for node in g_obs.nodes()]
            node_type_targets = torch.Tensor(node_type_labels).squeeze().long().to(device)

            ocl_factor = 0
            rec_factor = 0
            nt_factor = 1

            for epoch in range(ep):
                heterogeneous_model.get_gnn_model().train()
                heterogeneous_model.get_optimizer().zero_grad()

                node_representation, pred_node_type = heterogeneous_model.get_gnn_model().encode(
                    heterogeneous_model.get_graph_torch().embedding.float(),
                    heterogeneous_model.get_graph_torch().edge_index,
                )

                # All three terms are evaluated in every scenario, exactly as
                # before; only the weighted sum below depends on the scenario.
                loss1 = one_class_loss(heterogeneous_model.get_center(), heterogeneous_model.get_radius(), node_representation, heterogeneous_model.get_mask())
                loss2 = heterogeneous_model.get_gnn_model().recon_loss(node_representation, heterogeneous_model.get_graph_torch().edge_index)
                loss3 = criterion(pred_node_type, node_type_targets)

                loss = loss1 * min(of, ocl_factor)
                if scenario in ('2', '3'):
                    loss += loss2 * min(rf, rec_factor)
                if scenario == '3':
                    loss += loss3 * max(nt, nt_factor)

                loss.backward()
                heterogeneous_model.get_optimizer().step()
                # The per-epoch representations are intentionally not stored:
                # keeping grad-attached tensors alive for every epoch retains
                # their autograd graphs and steadily eats device memory.

                if epoch % 100 == 0:
                    ocl_factor = ocl_factor + ofi
                    rec_factor = rec_factor + rfi
                    nt_factor = max(0, (nt_factor - nti))
                    #f1 = heterogeneous_model.one_class_homogeneousGNN_prediction(node_representation, node_to_index, True)['macro avg']['f1-score']
                    #print(f'Ep {int(epoch)} | Ocl: {loss1.detach().cpu().numpy():.3f} | Rec: {loss2.detach().cpu().numpy():.3f} | NT: {loss3.detach().cpu().numpy():.3f} | F1: {f1*100:.2f}%')

            # Evaluation uses the representation from the last forward pass.
            # Both calls are kept in their original order: the first feeds the
            # (disabled) diagnostics below, the second yields the report dict.
            y_true, y_pred = heterogeneous_model.one_class_homogeneousGNN_prediction(node_representation, node_to_index, False)
            report = heterogeneous_model.one_class_homogeneousGNN_prediction(node_representation, node_to_index, True)
            l_f1.append(report['macro avg']['f1-score'])
            #print(f'F1-macro: {l_f1[-1]}')
            #plot_confusion_matrix(y_true, y_pred)
            #print(classification_report(y_true, y_pred))

        f1_macro_mean = np.mean(l_f1)
        f1_macro_std = np.std(l_f1)
        print(scenario, dataset, f1_macro_mean, f1_macro_std)

In [None]:
run(6000, 0.4, 0.5, 0, 0.5, 0.02, 0, 0.02) # Headlines: of/rf/nt = 0.5/0/0.5, steps ofi = nti = 0.02, 6k epochs, radius 0.4

1 Headlines 0.26458418783990195 0.0004885520090794726
2 Headlines 0.26458418783990195 0.0004885520090794726
3 Headlines 0.5722717342165249 0.012336220890890794


In [5]:
run(6000, 0.5, 0.7, 0.2, 0.1, 0.04, 0.01, 0.04) # FinCausal: of/rf/nt = 0.7/0.2/0.1, steps ofi = nti = 0.04, 6k epochs, radius 0.5

1 FinCausal 0.3125 0.0
2 FinCausal 0.3942045271150213 0.025607677736310976
3 FinCausal 0.4149257372459383 0.027879923249676454


In [5]:
run(6000, 0.3, 0.6, 0.3, 0.1, 0.02, 0.01, 0.02) # Risk: of/rf/nt = 0.6/0.3/0.1, steps ofi = nti = 0.02, 6k epochs, radius 0.3

1 Risk 0.32365579263350763 0.0011969437133096572
2 Risk 0.6572955932290797 0.04400006338909593
3 Risk 0.6191197601273872 0.13757701735184158


In [6]:
run(3000, 0.4, 0.8, 0.15, 0.05, 0.05, 0.01, 0.05) # Twitter: of/rf/nt = 0.8/0.15/0.05, steps ofi = nti = 0.05, 3k epochs, radius 0.4

1 Twitter 0.3338226249909558 0.000981975283743149
2 Twitter 0.4961669877880996 0.03163076679804815
3 Twitter 0.4882791044707675 0.032050628005783136
