In [1]:
import os
os.chdir('/sise/home/tommarz/hate_speech_detection/')
os.getcwd()
import pickle
import numpy as np
import igraph as ig
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd

In [2]:
from detection.diffusion_method.degroots_diffusion import degroots_diffusion

# Choose Dataset

In [293]:
dataset = 'echo_2'
seed = 0

In [294]:
network_output_dir = "/sise/home/tommarz/hate_speech_detection/data/networks_data"
raw_graphs_dict_path = os.path.join(network_output_dir, "raw_graphs_dict.p")
network_dataset_output_dir = os.path.join(network_output_dir, dataset)
raw_network_path  = os.path.join(network_dataset_output_dir, "raw_network.p")
largest_cc_path  = os.path.join(network_dataset_output_dir, "largest_cc.p")

# Load Dataset (Graph)

In [295]:
with open(largest_cc_path, 'rb') as f:
    largest_cc = pickle.load(f)
largest_cc.summary()

'IGRAPH DNW- 3746 20728 -- echo_2\n+ attr: name (g), doc2vec (v), label (v), name (v), predictions (v), weight (e)'

In [296]:
g = largest_cc.copy()
g.reverse_edges()

In [297]:
del largest_cc

In [298]:
labeled_nodes = g.vs.select(lambda v: v['label'] != -1)
len(labeled_nodes)

532

In [299]:
y = np.array(labeled_nodes['label'])
y.shape

(532,)

In [300]:
g.summary()

'IGRAPH DNW- 3746 20728 -- echo_2\n+ attr: name (g), doc2vec (v), label (v), name (v), predictions (v), weight (e)'

In [301]:
y_true = labeled_nodes['label']
len(labeled_nodes)

532

In [302]:
np.random.seed(seed)
seeds = np.random.randint(0, 2**32-1, 5)
seeds

array([2357136044, 2546248239, 3071714933, 3626093760, 2588848963])

In [303]:
def get_ego_subgraph(g, vertices, order_k = 1):
    ego_network = g.neighborhood(vertices=vertices, order=order_k, mode='out')
    set_of_tuples = set(tuple(inner_list) for inner_list in ego_network)
    flattened_set = list({element for tupl in set_of_tuples for element in tupl})
    ego_subgraph = g.subgraph(flattened_set)
    return ego_subgraph

In [304]:
def degroot_update(adj_matrix, opinions, iterations=10):
    n = len(opinions)
    for _ in range(iterations):
        # Calculate row sums
        row_sums = np.array(adj_matrix.sum(axis=1)).flatten()
        
        # Avoid division by zero by replacing zero sums with 1
        row_sums[row_sums == 0] = 1
        
        # Normalize each row to sum to 1
        norm_adj_matrix = adj_matrix.multiply(1 / row_sums[:, np.newaxis])
        
        # Update opinions
        opinions = norm_adj_matrix.dot(opinions)
        
    return opinions

In [305]:
def degroot_update_weighted(adj_matrix, opinions, alpha=0.5, iterations=10):
    """
    Update opinions in DeGroot's model with a weighting factor.
    
    :param adj_matrix: Sparse adjacency matrix of the graph.
    :param opinions: Initial array of opinions.
    :param alpha: Weighting factor (0 <= alpha <= 1), where alpha closer to 1 gives
                  more weight to the neighbors' opinions.
    :param iterations: Number of iterations to run the model.
    :return: Array of updated opinions.
    """
    n = len(opinions)
    for _ in range(iterations):
        # Normalize adjacency matrix
        row_sums = np.array(adj_matrix.sum(axis=1)).flatten()
        row_sums[row_sums == 0] = 1  # Safe division
        norm_adj_matrix = adj_matrix.multiply(1 / row_sums[:, np.newaxis])

        # Update opinions with weighting
        opinions = (1 - alpha) * opinions + alpha * norm_adj_matrix.dot(opinions)

    return opinions

In [306]:
from scipy.sparse import csr_matrix

In [307]:
A = g.get_adjacency_sparse(attribute='weight')
A.shape

(3746, 3746)

In [308]:
from sklearn.preprocessing import StandardScaler

In [326]:
metrics = []
frac = 0.05
initial_belief = 0
size = int(frac * len(labeled_nodes))
iterations = 10
alpha=1
for seed in seeds:
    np.random.seed(seed)
    seed_hate_users = np.random.choice(labeled_nodes.indices, size, replace=False)
    initial_beliefs = np.full(g.vcount(), initial_belief)   
    initial_beliefs[seed_hate_users] = 1
    opinions = degroot_update_weighted(A, initial_beliefs, alpha=alpha, iterations=iterations)
    labeled_nodes_opinions = opinions[labeled_nodes.indices]
    # scaler = StandardScaler()
    # scaled_opinions = scaler.fit_transform(labeled_nodes_opinions.reshape(-1, 1)).flatten()
    preds = labeled_nodes_opinions >= labeled_nodes_opinions.mean()
    metrics.append([s(y_true, preds) for s in[accuracy_score, precision_score, recall_score,  f1_score, roc_auc_score]])

In [327]:
results_df = pd.DataFrame(metrics, columns=['acccuracy', 'precision', 'recall', 'f1', 'roc_auc'], index=seeds)
results_df

Unnamed: 0,acccuracy,precision,recall,f1,roc_auc
2357136044,0.840226,0.77,0.553957,0.644351,0.747716
2546248239,0.723684,0.1,0.007194,0.013423,0.492147
3071714933,0.731203,0.0,0.0,0.0,0.494911
3626093760,0.834586,0.8,0.489209,0.607143,0.722976
2588848963,0.770677,0.688889,0.223022,0.336957,0.593699


In [328]:
df = pd.concat([results_df.mean(axis=0), results_df.std(axis=0)], axis=1, names=['mean', 'std'])
df

Unnamed: 0,0,1
acccuracy,0.780075,0.055332
precision,0.471778,0.388779
recall,0.254676,0.260616
f1,0.320375,0.309972
roc_auc,0.61029,0.121583


In [329]:
s = "& DeGroot's Diffusion"
for mean, std in df.values:
    s += (f' & ${mean:.3f} \pm {std:.3f}$')
s+= '\\\\'
print(s)

& DeGroot's Diffusion & $0.780 \pm 0.055$ & $0.472 \pm 0.389$ & $0.255 \pm 0.261$ & $0.320 \pm 0.310$ & $0.610 \pm 0.122$\\
