# Imports

In [1]:
!cat /proc/meminfo | grep Mem

MemTotal:       263906928 kB
MemFree:        131092968 kB
MemAvailable:   242471312 kB


In [2]:
import multiprocessing
multiprocessing.cpu_count()

40

In [3]:
import pandas as pd
import numpy as np
import igraph as ig
import seaborn as sns
import sys
import os
import pickle
import dill
import warnings
import json
from tqdm import tqdm

In [4]:
main_dir = '/sise/home/tommarz/hate_speech_detection/'
detection_dir = os.path.join(main_dir, 'detection')
# experiments_dir = os.path.join(detection_dir, 'experiments')
sna_dir = os.path.join(detection_dir, 'sna')
os.chdir(main_dir)
os.getcwd()

'/sise/home/tommarz/hate_speech_detection'

In [5]:
from config.data_config import path_confs
from config.detection_config import user_level_execution_config, user_level_conf, post_level_execution_config

# Globals

In [6]:
def get_doc_vectors(dataset):
    """Load the pickled document vectors configured for `dataset`.

    The file path is read from ``path_confs[dataset]['doc_vectors']``. Nothing
    is cached: every call re-reads the file.

    :param dataset: key into ``path_confs``.
    :return: whatever object was pickled at that path.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    # The context manager guarantees the handle is closed even if loading fails.
    with open(path_confs[dataset]['doc_vectors'], "rb") as f:
        return pickle.load(f)

In [7]:
def get_user_labels(dataset):
    """Return the user labels of `dataset`, reading them from disk on first use.

    Results are memoised in the notebook-level ``labeled_nodes_dict``
    (NOTE(review): that dict must exist before the first call; it is not
    created here). The file at ``user_level_conf[dataset]["data_path"]`` is
    parsed as tab-separated when its name ends with "tsv", otherwise as
    comma-separated; the first column becomes the index, cast to ``str``.

    :param dataset: key into ``user_level_conf``.
    :return: the squeezed result of ``pd.read_csv`` (presumably a Series when
        the file has a single value column).
    """
    if dataset not in labeled_nodes_dict:
        labels_path = user_level_conf[dataset]["data_path"]
        delimiter = "\t" if labels_path.endswith("tsv") else ","
        labels = pd.read_csv(labels_path, sep=delimiter, index_col=[0]).squeeze()
        labels.index = labels.index.astype('str')
        labeled_nodes_dict[dataset] = labels
    return labeled_nodes_dict[dataset]

In [8]:
def get_reposts_graph(dataset, min_weight=1, graphs_dict = {}):
    """Build (or fetch from cache) the directed, weighted reposts graph of `dataset`.

    The edge list is read from ``path_confs[dataset]['reposts']``; the reader is
    picked from the file extension (``.txt`` / ``.tsv`` / ``.csv``; anything else
    is unpickled and treated as a ``{(source, target): weight}`` mapping).
    Vertices receive a ``label`` attribute taken from `get_user_labels`
    (``-1`` for users without a label). Self-loops are removed and parallel
    edges are merged by summing their weights.

    :param dataset: key into ``path_confs`` / ``user_level_conf``.
    :param min_weight: only edges with ``weight >= min_weight`` are kept in the
        returned graph.
    :param graphs_dict: cache of unfiltered graphs keyed by dataset name. The
        mutable default is shared between calls and therefore acts as a memo.
    :return: the result of ``subgraph_edges`` on the cached graph, i.e. a new
        graph restricted to the edges that pass the weight filter.
    """
    if dataset not in graphs_dict:
        reposts_path = path_confs[dataset]['reposts']
        if reposts_path.endswith('.txt'):
            reposts_df = pd.read_csv(reposts_path, sep='\t', header=None, names=['source', 'target', 'weight'])
        elif reposts_path.endswith('.tsv'):
            if dataset == 'truth':
                # The result used to be discarded here, leaving `reposts_df` unbound below.
                reposts_df = pd.read_csv(reposts_path, sep='\t',
                                         names=['source', 'target', 'retruths_list', 'weight'], skiprows=1)
            else:
                reposts_df = pd.read_csv(reposts_path, sep='\t', names=['source', 'target', 'weight'], skiprows=1)
        elif reposts_path.endswith('.csv'):
            reposts_df = pd.read_csv(reposts_path, header=None, names=['source', 'target', 'weight'])
        else:
            # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
            with open(reposts_path, "rb") as f:
                reposts_edge_dict = pickle.load(f)
            reposts_edge_list = [[k[0], k[1], v] for k, v in tqdm(reposts_edge_dict.items())]
            reposts_df = pd.DataFrame(reposts_edge_list, columns=['source', 'target', 'weight'])

        # Vertex names are strings so they can be matched against the label index.
        reposts_df['source'] = reposts_df['source'].astype(str)
        reposts_df['target'] = reposts_df['target'].astype(str)
        # NOTE(review): `reposts_dict` is a notebook-level dict that must already exist.
        reposts_dict[dataset] = reposts_df

        # Select the three edge columns explicitly so extra columns never leak into the graph.
        edges = [tuple(x) for x in reposts_df[['source', 'target', 'weight']].values]
        g = ig.Graph.TupleList(edges, edge_attrs=['weight'], directed=True)
        y = get_user_labels(dataset)
        g.vs['label'] = [y.loc[e['name']] if e['name'] in y.index else -1 for e in g.vs]
        g.simplify(multiple=True, loops=True, combine_edges='sum')
        g['name'] = dataset
        graphs_dict[dataset] = g

    g = graphs_dict[dataset]
    filtered_edges = g.es.select(weight_ge=min_weight)
    return g.subgraph_edges(filtered_edges)

In [9]:
def get_largest_weak_cc(g):
    """Return the giant component among the weakly connected components of `g`.

    :param g: object exposing ``components(mode=...)`` whose result has a
        ``giant()`` method (an igraph ``Graph`` in this notebook).
    :return: whatever ``giant()`` returns for the weak clustering.
    """
    return g.components(mode='WEAK').giant()

In [10]:
def get_labeled_nodes(g):
    """Select the vertices of `g` whose ``label`` attribute is not ``-1``.

    :param g: graph whose ``vs`` sequence supports ``select(callable)``.
    :return: the selection produced by ``g.vs.select``.
    """
    def _has_label(vertex):
        return vertex['label'] != -1

    return g.vs.select(_has_label)

In [11]:
def write_results_latex(df, name):
    """Write the values of `df` as one LaTeX table row of ``$mean \\pm std$`` cells.

    The values are consumed two at a time (``reshape(-1, 2)``), joined with
    `` & `` and terminated by a LaTeX line break. The output file is
    ``<method_output_path>/<method_name>_<name>_results_latex.txt``;
    `method_output_path` and `method_name` are notebook-level globals.

    :param df: DataFrame whose values come in (mean, std) pairs.
    :param name: tag inserted into the output file name (e.g. 'train', 'test').
    """
    # `\\pm` instead of `\pm`: "\p" is an invalid escape sequence in a normal string literal.
    cells = [f'${m:.3f} \\pm {s:.3f}$' for m, s in df.values.reshape(-1, 2)]
    row = ' & '.join(cells) + '\\\\'
    out_path = os.path.join(method_output_path, f'{method_name}_{name}_results_latex.txt')
    with open(out_path, 'w') as f:
        f.write(row)

def get_best_results_from_gs(gs):
    """Collect the train/test mean and std scores of the best candidate of `gs`.

    For each split, the ``mean_<split>*`` / ``std_<split>*`` entries of
    ``gs.cv_results_`` are read at row ``gs.best_index_`` and reshaped into
    two columns. Rows are indexed by the notebook-level `scoring_names`
    (assumes the columns come in mean/std pairs in that metric order — confirm).

    :param gs: fitted search object exposing ``cv_results_`` and ``best_index_``.
    :return: ``{'train': DataFrame, 'test': DataFrame}`` with columns
        ``['mean', 'std']``.
    """
    cv_frame = pd.DataFrame.from_dict(gs.cv_results_)
    best_row = gs.best_index_

    def _summarise(split):
        wanted = [c for c in gs.cv_results_ if f'mean_{split}' in c or f'std_{split}' in c]
        pairs = cv_frame.loc[best_row, wanted].values.reshape(-1, 2)
        return pd.DataFrame(pairs, columns=['mean', 'std'], index=scoring_names)

    return {'train': _summarise('train'), 'test': _summarise('test')}

def get_best_results_from_cv_results(cv_results):
    """Reshape the mean/std train and test columns of `cv_results` into summary tables.

    Unlike `get_best_results_from_gs`, every row of `cv_results` is used
    (``.loc[:, ...]``), so the reshaped values must line up with the
    notebook-level `scoring_names` index (presumably a single-row
    `cv_results` — confirm against callers).

    :param cv_results: mapping accepted by ``pd.DataFrame.from_dict``.
    :return: ``{'train': DataFrame, 'test': DataFrame}`` with columns
        ``['mean', 'std']``.
    """
    frame = pd.DataFrame.from_dict(cv_results)

    def _summarise(split):
        wanted = [c for c in cv_results if f'mean_{split}' in c or f'std_{split}' in c]
        pairs = frame.loc[:, wanted].values.reshape(-1, 2)
        return pd.DataFrame(pairs, columns=['mean', 'std'], index=scoring_names)

    return {'train': _summarise('train'), 'test': _summarise('test')}

def write_best_results_and_params_from_gs(gs):
    """Persist the best hyper-parameters, learned parameters and scores of `gs`.

    Files written under the notebook-level `method_output_path`:
    ``best_hyperparams.csv`` (``get_params()`` of the best estimator),
    ``best_params.csv`` (its ``get_learned_params()``), and, per split, a LaTeX
    row via `write_results_latex` plus ``best_results_<split>.csv``.

    :param gs: fitted search object; its best estimator must provide
        ``get_learned_params()``.
    :return: the dict produced by `get_best_results_from_gs`.
    """
    results_dict = get_best_results_from_gs(gs)

    hyperparams = pd.Series(gs.best_estimator_.get_params()).rename('value')
    hyperparams.to_csv(os.path.join(method_output_path, 'best_hyperparams.csv'))

    learned = pd.Series(gs.best_estimator_.get_learned_params()).rename('value')
    learned.to_csv(os.path.join(method_output_path, 'best_params.csv'))

    for split_name in results_dict:
        split_df = results_dict[split_name]
        write_results_latex(split_df, name=split_name)
        split_df.to_csv(os.path.join(method_output_path, f'best_results_{split_name}.csv'))

    return results_dict

In [12]:
def get_ego_subgraph(g, vertices, order_k = 1):
    """Return the subgraph induced by the out-neighbourhoods of `vertices`.

    :param g: graph exposing ``neighborhood`` and ``subgraph`` (an igraph
        ``Graph`` in this notebook).
    :param vertices: a single vertex or a collection of vertices.
    :param order_k: neighbourhood order passed to ``g.neighborhood``.
    :return: ``g.subgraph`` of the union of all returned neighbourhoods.
    """
    neighborhoods = g.neighborhood(vertices=vertices, order=order_k, mode='out')
    # For a single vertex igraph returns one flat list instead of a list of
    # lists; normalise so both call styles are handled the same way.
    if neighborhoods and not isinstance(neighborhoods[0], (list, tuple)):
        neighborhoods = [neighborhoods]
    member_ids = list({vid for hood in neighborhoods for vid in hood})
    return g.subgraph(member_ids)

In [13]:
network_output_dir = "/sise/home/tommarz/hate_speech_detection/data/networks_data"
raw_graphs_dict_path = os.path.join(network_output_dir, "raw_graphs_dict.p")

In [14]:
parler_all_users_path =  "/sise/home/tommarz/hate_speech_detection/detection/outputs/parler/BertFineTuning/user_level/split_by_posts/no_text/"

# Load Reposts Graph

In [15]:
datasets = ['echo_2', 'gab', 'parler']

In [17]:
%%time
# Load the cached raw graphs when the pickle exists; otherwise build them once
# (unless they are already in the kernel) and write the cache.
if os.path.exists(raw_graphs_dict_path):
    # Context manager so the file handle is closed after loading.
    with open(raw_graphs_dict_path, 'rb') as f:
        raw_graphs_dict = pickle.load(f)
elif "raw_graphs_dict" not in globals():
    raw_graphs_dict = {d: get_reposts_graph(d) for d in datasets}
    with open(raw_graphs_dict_path, 'wb') as f:
        pickle.dump(raw_graphs_dict, f)

CPU times: user 10.5 s, sys: 2.76 s, total: 13.2 s
Wall time: 13.2 s


# Create Datasets Graphs

## Choose Dataset

In [124]:
dataset = 'echo_2'

## Load Graph

In [125]:
raw_g = raw_graphs_dict[dataset].copy()
print(raw_g.summary())
network_dataset_output_dir = os.path.join(network_output_dir, dataset)
raw_network_path  = os.path.join(network_dataset_output_dir, "raw_network.p")
network_with_singletons_path  = os.path.join(network_dataset_output_dir, "network_with_singletons.p")
largest_cc_path  = os.path.join(network_dataset_output_dir, "largest_cc.p")

IGRAPH DNW- 10274 196981 -- echo_2
+ attr: name (g), label (v), name (v), weight (e)


## Load Posts and Predictions

In [126]:
dataset_path_conf = path_confs[dataset]
if dataset == 'parler':
    preds_df = pd.read_parquet(parler_all_users_path)
else:
    preds_df = pd.read_parquet(dataset_path_conf['predictions'])

In [127]:
preds_df

Unnamed: 0,user_id,predictions
0,231597325,0.014254
1,231597325,0.003655
2,231597325,0.003499
3,231597325,0.003180
4,231597325,0.507950
...,...,...
17575990,2880670643,0.015104
17575991,2880670643,0.006758
17575992,2880670643,0.006917
17575993,2880670643,0.020877


## Load Doc2Vec (for GNN later on)

In [128]:
with open(dataset_path_conf['doc_vectors'], 'rb') as f:
    doc2vec = pickle.load(f)
docs_arr = np.array(list(doc2vec.values()))
print(docs_arr.shape)
# mean, std = docs_arr.mean(), docs_arr.std()
# mean, std 

(7073, 100)


## Load Labels

In [129]:
y = pd.read_csv(user_level_conf[dataset]['data_path'], sep='\t').set_index('user_id')['label']
y.index = y.index.astype(str)
print(y.mean())
y

0.154


user_id
231597325     0
2190420108    0
548007350     1
113526237     0
716664192     0
             ..
225298549     0
460453341     0
88994026      0
187450820     0
2974346781    1
Name: label, Length: 1000, dtype: int64

In [130]:
users_with_posts = preds_df['user_id'].unique()
len(users_with_posts)

7073

In [131]:
users_with_posts_and_docs = set(users_with_posts).intersection(set(doc2vec))
len(users_with_posts_and_docs)

7073

In [132]:
raw_g.summary()

'IGRAPH DNW- 10274 196981 -- echo_2\n+ attr: name (g), label (v), name (v), weight (e)'

In [133]:
singletons = raw_g.vs[[index for index, degree in enumerate(raw_g.vs.degree()) if degree == 0]]
print(f'# of Singletons: {len(singletons)}')

# of Singletons: 0


In [134]:
# Users that have posts and doc vectors but never appear in the reposts graph
# are added to a copy of the raw graph as isolated vertices.
singletons = list(set(users_with_posts_and_docs).difference(set(raw_g.vs['name'])))
# Label is -1 for users that are missing from `y`.
attributes = {'label': [y.get(user_id, default=-1) for user_id in singletons]}
print(f'# of Singletons: {len(singletons)}')

g_with_singletons = raw_g.copy()
# Reuse `attributes` instead of rebuilding the same label list inline.
g_with_singletons.add_vertices(singletons, attributes=attributes)
g_with_singletons.summary()

# of Singletons: 2211


'IGRAPH DNW- 12485 196981 -- echo_2\n+ attr: name (g), label (v), name (v), weight (e)'

## Define Labeled Nodes

In [135]:
labeled_nodes = g_with_singletons.vs.select(lambda v: v['label'] != -1)
len(labeled_nodes)

1000

In [136]:
nodes_with_posts_and_docs = g_with_singletons.vs.select(lambda v: v['name'] in users_with_posts_and_docs)
print(len(nodes_with_posts_and_docs))

7073


## Filter out users (nodes) without any posts

In [137]:
g = g_with_singletons.subgraph(nodes_with_posts_and_docs)
g.summary()

'IGRAPH DNW- 7073 21409 -- echo_2\n+ attr: name (g), label (v), name (v), weight (e)'

In [138]:
labeled_nodes = g.vs.select(lambda v: v['label'] != -1)
len(labeled_nodes), np.mean(labeled_nodes['label'])

(1000, 0.154)

In [139]:
labeled_singletons = labeled_nodes[[index for index, degree in enumerate(labeled_nodes.degree()) if degree == 0]]
len(labeled_singletons)

410

In [140]:
singletons = g.vs[[index for index, degree in enumerate(g.vs.degree()) if degree == 0]]
print(f'# of Singletons: {len(singletons)}')

# of Singletons: 2919


In [141]:
weakly_connected_components = g.components(mode="weak")
print("Number of Weakly Connected Components:", len(weakly_connected_components))

# Find strongly connected components
strongly_connected_components = g.components(mode="strong")
print("Number of Strongly Connected Components:", len(strongly_connected_components))

print('Total number of Connected Components:', len(weakly_connected_components) + len(strongly_connected_components))

# You can also explore the size of the largest component, or other properties
largest_weakly_component = max(weakly_connected_components, key=len)
print("Size of Largest Weakly Connected Component:", len(largest_weakly_component))

Number of Weakly Connected Components: 3067
Number of Strongly Connected Components: 6008
Total number of Connected Components: 9075
Size of Largest Weakly Connected Component: 3746


In [142]:
len(preds_df)

17575995

In [143]:
len(preds_df.query('`user_id` in @g.vs["name"]'))

17575995

In [144]:
node_preds_agg = preds_df.query('`user_id` in @g.vs["name"]').groupby('user_id')['predictions'].agg(list)

In [145]:
g.vs['doc2vec'] = [doc2vec[name] for name in g.vs['name']]
# g.vs['doc2vec'] = [doc2vec.get(v['name'],  np.zeros(100)) for v in g.vs]
g.vs['predictions'] = [np.array(node_preds_agg[name]) for name in g.vs['name']]

In [146]:
largest_cc = get_largest_weak_cc(g)
largest_cc.summary()

'IGRAPH DNW- 3746 20728 -- echo_2\n+ attr: name (g), doc2vec (v), label (v), name (v), predictions (v), weight (e)'

In [147]:
len(preds_df.query('`user_id` in @largest_cc.vs["name"]'))

9805943

In [148]:
labeled_nodes = largest_cc.vs.select(lambda v: v['label'] != -1)
len(labeled_nodes), np.mean(labeled_nodes['label'])

(532, 0.26127819548872183)

## Connected Components

In [149]:
def get_num_of_cc(g):
    """Print and return connected-component counts of `g`.

    :param g: graph exposing ``components(mode=...)``.
    :return: tuple ``(weak + strong, strong, weak)`` component counts. The
        first entry is the plain sum of the two counts, as printed.
    """
    weak_components = g.components(mode="weak")
    n_weak = len(weak_components)
    print("Number of Weakly Connected Components:", n_weak)

    strong_components = g.components(mode="strong")
    n_strong = len(strong_components)
    print("Number of Strongly Connected Components:", n_strong)

    combined = n_weak + n_strong
    print('Total number of Connected Components:', combined)

    # Largest weak component by number of members.
    biggest_weak = max(weak_components, key=len)
    print("Size of Largest Weakly Connected Component:", len(biggest_weak))
    return combined, n_strong, n_weak

In [150]:
num_cc, _, _ = get_num_of_cc(g)

Number of Weakly Connected Components: 3067
Number of Strongly Connected Components: 6008
Total number of Connected Components: 9075
Size of Largest Weakly Connected Component: 3746


In [151]:
num_cc, _, _ = get_num_of_cc(largest_cc)

Number of Weakly Connected Components: 1
Number of Strongly Connected Components: 2761
Total number of Connected Components: 2762
Size of Largest Weakly Connected Component: 3746


## Clustering Coefficient

In [152]:
g.transitivity_avglocal_undirected()

0.2088079566514251

In [153]:
largest_cc.transitivity_avglocal_undirected()

0.19628344812349038

## Save Raw (Full) Network and Largest (Weakly) Connected Component

In [154]:
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail when the directory already exists.
os.makedirs(network_dataset_output_dir, exist_ok=True)
# NOTE(review): `g` at this point is the graph restricted to users with posts
# and doc vectors, although the target file is called "raw_network.p".
with open(raw_network_path, 'wb') as f:
    pickle.dump(g, f)
with open(largest_cc_path, 'wb') as f:
    pickle.dump(largest_cc, f)

In [None]:
# Connected-component statistics for every graph in `filtered_igraph_dict`.
# NOTE(review): `filtered_igraph_dict` is not defined in the visible cells — confirm.
# The loop variable is deliberately not named `g`, so the notebook-level graph
# `g` used by later cells is not overwritten by the last graph of the dict.
d = {}
for name, graph in filtered_igraph_dict.items():
    print(name)
    weakly_connected_components = graph.components(mode="weak")
    print("Number of Weakly Connected Components:", len(weakly_connected_components))
    # Find strongly connected components
    strongly_connected_components = graph.components(mode="strong")
    print("Number of Strongly Connected Components:", len(strongly_connected_components))

    print('Total number of Connected Components:', len(weakly_connected_components) + len(strongly_connected_components))

    # Size of the largest weak component
    largest_weakly_component = max(weakly_connected_components, key=len)
    print("Size of Largest Weakly Connected Component:", len(largest_weakly_component))

    # Per graph: [weak count, strong count, weak + strong, size of largest weak component]
    d[name] = [len(weakly_connected_components), len(strongly_connected_components), len(weakly_connected_components) + len(strongly_connected_components), len(largest_weakly_component)]

In [None]:
pd.DataFrame.from_dict(d)

In [None]:
# Count isolated vertices (degree 0) in every raw graph.
# The loop variable is not named `g`, so the notebook-level graph `g` survives this cell.
for name, graph in raw_graphs_dict.items():
    print(name)
    degrees = graph.degree()

    # Count the number of vertices with degree 0 (singletons)
    singletons = degrees.count(0)

    print("Number of singletons:", singletons)

## Power Law

In [None]:
degrees = g.degree()
in_degree_lst = g.degree(mode='in')
out_degree_lst = g.degree(mode='out')

In [None]:
alpha, C = calc_power_law_exp(degrees)
C, alpha

## Centrality Measures

In [None]:
# Mean degree / betweenness / closeness centrality for every filtered graph.
# The loop variable is not named `g`, so the notebook-level graph `g` is not overwritten.
for name, graph in filtered_igraph_dict.items():
    print(name)
    degree_centrality = graph.degree()
    # These two calls used to be swapped, so each mean was printed under the other's name.
    betweenness_centrality = graph.betweenness()
    closeness_centrality = graph.closeness()

    mean_degree_centrality, mean_betweenness_centrality, mean_closeness_centrality = np.mean([degree_centrality, betweenness_centrality, closeness_centrality], axis=1)
    print(mean_degree_centrality, mean_betweenness_centrality, mean_closeness_centrality)

In [None]:
# Check if the graph is connected
if g.is_connected():
    avg_path_length = g.average_path_length()
    print("Average shortest path length:", avg_path_length)
else:
    print("Graph is not connected. Average path length is undefined for the whole graph.")
#     for c in g.components(mode="strong"):
#         h = g.subgraph(c)
#         if h.vcount() < 2:
#             continue
#         print(h.vcount(), h.ecount())
#         # Centrality Measures
#         degree_centrality = h.degree()
#         betweenness_centrality = h.closeness()
#         closeness_centrality = h.betweenness()

#         mean_degree_centrality, mean_betweenness_centrality, mean_closeness_centrality = np.mean([degree_centrality, betweenness_centrality, closeness_centrality], axis=1)
#         print(mean_degree_centrality, mean_betweenness_centrality, mean_closeness_centrality)

## Truth Social

In [None]:
dataset = 'truth'

In [None]:
min_weight = 1

In [None]:
# Read the Truth Social repost edges and drop self-reposts and light edges.
truth_reposts_path = path_confs['truth']['reposts']
# Read from `truth_reposts_path`; `reposts_path` is not defined at notebook level.
truth_reposts_df = pd.read_csv(truth_reposts_path, sep='\t', names=['source', 'target', 'retruths_list', 'weight'], skiprows=1)
# NOTE(review): the filter is strictly greater-than, while `get_reposts_graph`
# keeps weight >= min_weight — confirm which one is intended.
truth_filtered_reposts_df = truth_reposts_df.query('`weight`>@min_weight and source!=target')
truth_filtered_reposts_df

In [None]:
# Build the directed graph from (source, target, weight) only. Passing all four
# columns would map the third column (`retruths_list`) onto the `weight` attribute.
truth_edges = list(
    truth_filtered_reposts_df[['source', 'target', 'weight']].itertuples(index=False, name=None)
)
g = ig.Graph.TupleList(truth_edges, directed=True, edge_attrs=["weight"])
print(g.vcount(), g.ecount())
g.vs.select(_degree=0).delete()
# combine_edges='sum' keeps the weight attribute when parallel edges are merged;
# without it simplify() drops edge attributes.
g.simplify(multiple=True, loops=True, combine_edges='sum')
print(g.vcount(), g.ecount())

### Plot

In [None]:
# Draw the current graph `g` on a large matplotlib canvas.
save_dir = 'detection/experiments/sna'
title = f'{dataset} with min edge weight = {min_weight}'
# NOTE(review): `save_path` is computed but the figure is never written to it.
save_path = os.path.join(save_dir, f'{title}.pdf')

fig, ax = plt.subplots(figsize=(40,20))
ax.set_title(title)
ig.plot(
    g,
    target=ax,
    arrow_size=0.5,
    edge_size=1,
    vertex_size=7,
    vertex_color='lightblue',
    edge_color='gray',
    bbox=(0, 0, 600, 600),
)
plt.show()

### Clustering Coefficient

In [None]:
# Calculate clustering coefficient
# For directed graphs, you can use "average" to get the average of in and out coefficients
clustering_coefficient = g.transitivity_avglocal_undirected()
print("Average Clustering Coefficient:", clustering_coefficient)

### Connected Components

In [None]:
# Find weakly connected components
weakly_connected_components = g.components(mode="weak")
print("Number of Weakly Connected Components:", len(weakly_connected_components))

# Find strongly connected components
strongly_connected_components = g.components(mode="strong")
print("Number of Strongly Connected Components:", len(strongly_connected_components))

print('Total number of Connected Components:', len(weakly_connected_components) + len(strongly_connected_components))

# You can also explore the size of the largest component, or other properties
largest_weakly_component = max(weakly_connected_components, key=len)
print("Size of Largest Weakly Connected Component:", len(largest_weakly_component))

In [None]:
degrees = g.degree()

# Count the number of vertices with degree 0 (singletons)
singletons = degrees.count(0)

print("Number of singletons:", singletons)

### Power Law

In [None]:
# Total, in- and out-degree of every vertex of `g`.
degrees = g.degree()
in_degree_lst = g.degree(mode='in')
out_degree_lst = g.degree(mode='out')
# Show the first entry of each list (previously `degrees[0]` was shown three times).
degrees[0], in_degree_lst[0], out_degree_lst[0]

In [None]:
# Replace 'degrees' with your actual data containing node degrees
def calc_power_law_exp(degrees):
    """Fit `power_law` to the degree histogram, plot the fit and return its parameters.

    The plot is saved to ``detection/experiments/<dataset>_power_law.png``
    (`dataset` is a notebook-level global) and then shown.
    NOTE(review): `power_law`, `curve_fit` and `plt` must already be defined in
    the kernel; they are not created in this cell.

    :param degrees: sequence of non-negative integer node degrees.
    :return: ``(alpha, C)`` — the two fitted parameters, in the order
        `curve_fit` returns them for `power_law`.
    """
    # Histogram: counts_per_degree[k] = number of nodes with degree k
    counts_per_degree = np.bincount(np.array(degrees))

    # Keep only the degree values that actually occur (non-zero frequency)
    x = np.nonzero(counts_per_degree)[0]
    y = counts_per_degree[x]

    # Least-squares fit of power_law(x, alpha, C) to the observed frequencies
    fitted_params, _ = curve_fit(power_law, x, y)
    alpha, C = fitted_params

    # Log-log scatter of the data with the fitted curve on top
    plt.scatter(x, y, label="Data")
    plt.plot(x, power_law(x, alpha, C), color='red', label=f"Power-law fit (alpha={alpha:.2f})")
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel("Degree")
    plt.ylabel("Frequency")
    plt.legend()
    plt.title(f'{dataset} Power Law')
    plt.savefig(f'detection/experiments/{dataset}_power_law.png', dpi=300)
    plt.show()

    return alpha, C

In [None]:
alpha, C = calc_power_law_exp(degrees)
C, alpha

### Centrality Measures

In [None]:
np.mean(g.degree()), np.mean(g.closeness()), np.mean(g.betweenness())

In [None]:
# Mean degree, betweenness and closeness centrality of `g`.
degree_centrality = g.degree()
# These two calls used to be swapped, so each mean carried the other's name.
betweenness_centrality = g.betweenness()
closeness_centrality = g.closeness()

mean_degree_centrality, mean_betweenness_centrality, mean_closeness_centrality = np.mean([degree_centrality, betweenness_centrality, closeness_centrality], axis=1)
mean_degree_centrality, mean_betweenness_centrality, mean_closeness_centrality

In [None]:
# Average path length when `g` is connected; otherwise report mean centralities
# for every strongly connected component with at least two vertices.
if g.is_connected():
    avg_path_length = g.average_path_length()
    print("Average shortest path length:", avg_path_length)
else:
    print("Graph is not connected. Average path length is undefined for the whole graph.")
    for c in g.components(mode="strong"):
        h = g.subgraph(c)
        if h.vcount() < 2:
            continue
        print(h.vcount(), h.ecount())
        # Centrality Measures (betweenness/closeness used to be swapped here)
        degree_centrality = h.degree()
        betweenness_centrality = h.betweenness()
        closeness_centrality = h.closeness()

        mean_degree_centrality, mean_betweenness_centrality, mean_closeness_centrality = np.mean([degree_centrality, betweenness_centrality, closeness_centrality], axis=1)
        print(mean_degree_centrality, mean_betweenness_centrality, mean_closeness_centrality)

# Aggregative Methods

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate, cross_val_predict, cross_val_score, KFold, StratifiedKFold, GridSearchCV
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression

In [None]:
# Metric functions, their short names (function name without the trailing
# "_score") and a name -> function lookup.
scoring_list = [accuracy_score, precision_score, recall_score, f1_score, roc_auc_score]
scoring_names = [metric.__name__.rsplit('_', 1)[0] for metric in scoring_list]
scoring_dict = dict(zip(scoring_names, scoring_list))
# scoring_dict

In [None]:
y = np.array(labeled_nodes['label'])
y.shape

## Fixed Threshold

In [None]:
seed = 42

In [None]:
post_th = 0.5

In [None]:
def get_hs_count(v, th=0):
    """Count the entries of ``v['predictions']`` that are greater than or equal to `th`.

    :param v: mapping-like object with a ``'predictions'`` array.
    :param th: inclusive threshold.
    :return: the number of entries at or above the threshold.
    """
    at_or_above = v['predictions'] >= th
    return at_or_above.sum()

In [None]:
largest_cc.vs['hs_count'] = [get_hs_count(v, post_th) for v in largest_cc.vs]

In [None]:
X = pd.DataFrame(labeled_nodes['hs_count'], columns=['hs_count'], index=labeled_nodes['name'])
X

In [None]:
clf = LogisticRegression()

In [None]:
clf.fit(X, y)

In [None]:
probas = clf.predict_proba(X)[:, 1]

In [None]:
roc_auc_score(y, probas)

In [None]:
# pipe = make_pipeline(StandardScaler(), SimpleImputer(strategy='constant', fill_value=0), LogisticRegression(random_state=echo_seed))
pipe = make_pipeline(StandardScaler(), SimpleImputer(strategy='mean'), LogisticRegression(random_state=seed))
pipe.set_output(transform='pandas')
pipe

In [None]:
param_grid = {
    'logisticregression__C': np.logspace(0, -3, 20),
    'logisticregression__class_weight': ['balanced']
}

In [None]:
gs = GridSearchCV(pipe, param_grid, cv=StratifiedKFold(shuffle=True, random_state=seed), scoring=scoring_names, return_train_score=True, refit='f1', n_jobs=100, verbose=1)

In [None]:
gs.fit(X, y)

In [None]:
results_dict_fixed = get_best_results_from_gs(gs)
for name, df in results_dict_fixed.items():
    print(name)
    display(df)

In [None]:
for m, s in results_dict_fixed['test'].astype(float).values:
    print(f'{m:.3f}+-{s:.3f}')

## Relational Threshold

In [None]:
def get_relational_feats(v, post_th=0.5):
    """Return the vertex's own ``hs_count`` plus the mean ``hs_count`` of its neighbours.

    Neighbours are looked up in the notebook-level graph `largest_cc`.

    :param v: vertex of `largest_cc` carrying an ``hs_count`` attribute.
    :param post_th: not used by this function; kept for interface compatibility.
    :return: ``(hs_count, mean over in-neighbours, mean over out-neighbours)``;
        a mean is 0 when the vertex has no neighbour in that direction.
    """
    def _mean_neighbour_hs(direction):
        neighbour_ids = largest_cc.neighbors(v, mode=direction)
        counts = np.array(largest_cc.vs[neighbour_ids]['hs_count'])
        return counts.mean() if counts.size > 0 else 0

    own_count = v['hs_count']
    incoming_mean = _mean_neighbour_hs('in')
    outgoing_mean = _mean_neighbour_hs('out')
    return own_count, incoming_mean, outgoing_mean

In [None]:
X = pd.DataFrame(np.array([get_relational_feats(v) for v in labeled_nodes]), columns=['hs_count', 'following_mean_hs_count', 'followees_mean_hs_count'], index=labeled_nodes['name'])
X

In [None]:
y = np.array(labeled_nodes['label'])
y.shape

In [None]:
clf = LogisticRegression()

In [None]:
clf.fit(X, y)

In [None]:
probas = clf.predict_proba(X)[:, 1]

In [None]:
roc_auc_score(y, probas)

In [None]:
# pipe = make_pipeline(StandardScaler(), SimpleImputer(strategy='constant', fill_value=0), LogisticRegression(random_state=echo_seed))
pipe = make_pipeline(StandardScaler(), SimpleImputer(strategy='mean'), LogisticRegression(random_state=seed))
pipe.set_output(transform='pandas')
pipe

In [None]:
param_grid = {
    'logisticregression__C': np.logspace(0, -3, 20),
    'logisticregression__class_weight': ['balanced']
}

In [None]:
gs = GridSearchCV(pipe, param_grid, cv=StratifiedKFold(shuffle=True, random_state=seed), scoring=scoring_names, return_train_score=True, refit='f1', n_jobs=100, verbose=1)

In [None]:
gs.fit(X, y)

In [None]:
# Best train/test scores of the relational-features grid search.
results_dict_relat = get_best_results_from_gs(gs)
# Display the relational results (this loop used to iterate `results_dict_fixed`).
for name, df in results_dict_relat.items():
    print(name)
    display(df)

In [None]:
for m, s in results_dict_relat['test'].astype(float).values:
    print(f'{m:.3f}+-{s:.3f}')

## Dynamic Threshold

In [None]:
def get_hs_stats(v, percentiles=np.array([1,5,10,25,50,75,90,95,99])):
    """Summarise ``v['predictions']`` as mean, std and the requested percentiles.

    :param v: mapping-like object with a ``'predictions'`` array.
    :param percentiles: percentile ranks passed to ``np.percentile``.
    :return: 1-D array ``[mean, std, p_1, ..., p_k]``.
    """
    preds = v['predictions']
    summary = [preds.mean(), preds.std()]
    summary.extend(np.percentile(preds, q=percentiles).tolist())
    return np.array(summary)

In [None]:
percentiles=np.array([1,5,10,25,50,75,90,95,99])
# percentiles/100

In [None]:
X = pd.DataFrame(np.array([get_hs_stats(v) for v in labeled_nodes]), columns=['mean', 'std'] + [f'{p}%' for p in percentiles], index=labeled_nodes['name'])
X

In [None]:
# pipe = make_pipeline(StandardScaler(), SimpleImputer(strategy='constant', fill_value=0), LogisticRegression(random_state=echo_seed))
pipe = make_pipeline(StandardScaler(), LogisticRegression(random_state=seed))
pipe.set_output(transform='pandas')
pipe

In [None]:
param_grid = {
    'logisticregression__C': np.logspace(0, -3, 20),
    'logisticregression__class_weight': ['balanced']
}

In [None]:
gs = GridSearchCV(pipe, param_grid, cv=StratifiedKFold(shuffle=True, random_state=seed), scoring=scoring_names, return_train_score=True, refit='f1', n_jobs=100, verbose=1)

In [None]:
gs.fit(X, y)

In [None]:
results_dict_dynamic = get_best_results_from_gs(gs)
for name, df in results_dict_dynamic.items():
    print(name)
    display(df)

In [None]:
for m, s in results_dict_dynamic['test'].astype(float).values:
    print(f'{m:.3f}+-{s:.3f}')

In [None]:
# Build one LaTeX table row "$mean \pm std$ & ... \\" from the dynamic-threshold test results.
overleaf = ''
for m, s in results_dict_dynamic['test'].astype(float).values:
    # `\\pm` instead of `\pm`: "\p" is an invalid escape sequence in a normal string literal.
    overleaf += (f'${m:.3f} \\pm {s:.3f}$ & ')
# Drop the trailing " & " and terminate the row with a LaTeX line break.
print(overleaf[:-3] + '\\\\')

# DeGroot Diffusion

In [None]:
import numpy as np
import networkx as nx
from scipy import sparse


def fit_degroots_diffusion(nx_network, seed_hate_users, self_loops_dict,
                           iterations=2, initial_belief=1, fix_seed_haters_belief=False, verbose=False):
    """
    Run a DeGroot-style diffusion model, as suggested in
    https://github.com/manoelhortaribeiro/HatefulUsersTwitter/blob/master/preprocessing/5_get_diffusion_graph.py

    :param nx_network: networkX directed graph. The social network to run the model over
    :param seed_hate_users: list or set. The seed (original) hate users. Duplicates are ignored
    :param self_loops_dict: dict. Key is a username and value is the weight of its self-loop (this is
    the total number of posts per user). Users missing from the dict get a self-loop of weight 0
    :param iterations: int. Default: 2. Number of iterations to run the diffusion model (usually in the [1,5] range)
    :param initial_belief: float. Default: 1. The initial belief value of the seed hate users. In most cases this is 1
    :param fix_seed_haters_belief: bool. Default: False. Whether or not to reset the belief of the seed hate users to
    initial_belief after every iteration. This "helps" propagate the hate and keeps the seed hate users as haters over
    all iterations
    :param verbose: bool. Whether to print information along the process
    :return: dict. Key is the username and value is the belief value of the user at the end of the
    process, sorted by belief in descending order

    Example:
    input_for_networkx = [('A', 'B', 4),
                          ('B', 'C', 3),
                          ('C', 'A', 1), ('C', 'B', 6),
                          ('D', 'B', 2), ('D', 'C', 4)]
    social_network = nx.DiGraph()
    social_network.add_weighted_edges_from(input_for_networkx)
    seed_hate_users = ('A', 'B')
    self_loops_dict = {'A': 6, 'B': 7, 'C': 0, 'D': 2}
    iterations=2
    initial_belief = 1.0
    fix_seed_haters_belief = False
    final_belief_dict = fit_degroots_diffusion(social_network, seed_hate_users, self_loops_dict, iterations,
                                               initial_belief, fix_seed_haters_belief)
    (with verbose=True this prints "Out of the 2 seed hate users, 2 are in the network")
    final_belief_dict
    {'A': 0.8800000000000001, 'B': 0.7900000000000001, 'C': 0.7428571428571429, 'D': 0.7375}
    """
    # NOTE!!! There are 1994 hate-users which ARE NOT in the network (since they only commented and did not echo anyone)

    # dict.fromkeys de-duplicates while keeping first-seen order; a duplicated
    # seed user would otherwise end up twice in the node list.
    hate_users_in_network = list(dict.fromkeys(h for h in seed_hate_users if h in nx_network.nodes()))
    not_hate_users = list(set(nx_network.nodes()).difference(hate_users_in_network))
    # Seed users come first so their beliefs occupy the leading slots of the vector.
    node_list = hate_users_in_network + not_hate_users

    # Reverse the network, since diffusion moves from the writer to the reader, and add self-loops.
    social_network_reversed = nx_network.reverse(copy=True)
    for n in list(social_network_reversed.nodes()):
        social_network_reversed.add_edge(n, n, weight=self_loops_dict.get(n, 0))

    # The (i, j) entry holds the weight of the edge from node i to node j.
    adjacency = nx.adjacency_matrix(social_network_reversed, nodelist=node_list)
    # Convert explicitly to a float CSR matrix so the code does not depend on
    # whether networkx hands back a sparse matrix or a sparse array.
    # Transposed so that each row holds the INPUT weights of a node (see the example above).
    transition_matrix = sparse.csr_matrix(adjacency, dtype=float).transpose(copy=True).tocsr()

    # Row-normalise; rows that sum to 0 are left as all zeros.
    row_sums = np.asarray(transition_matrix.sum(axis=1)).ravel()
    inverse_row_sum = np.zeros_like(row_sums)
    has_weight = row_sums > 0
    inverse_row_sum[has_weight] = 1.0 / row_sums[has_weight]
    transition_matrix = sparse.diags(inverse_row_sum).dot(transition_matrix).tocsr()

    beliefs = np.zeros(len(node_list))
    beliefs[:len(hate_users_in_network)] = initial_belief

    for _ in range(iterations):
        beliefs = transition_matrix.dot(beliefs)
        # When fix_seed_haters_belief is True, the seed users' beliefs are reset after every cycle.
        if fix_seed_haters_belief:
            beliefs[:len(hate_users_in_network)] = initial_belief

    final_beliefs_dict = {node: float(belief) for node, belief in zip(node_list, beliefs)}
    # Sort by belief, highest first.
    final_beliefs_dict = dict(sorted(final_beliefs_dict.items(), key=lambda x: x[1], reverse=True))
    if verbose:
        print(f"Out of the {len(seed_hate_users)} seed hate users, {len(hate_users_in_network)} are in the network")
    return final_beliefs_dict


In [None]:
import pandas as pd
import numpy as np
import igraph as ig
import pickle

def get_doc_vectors(dataset):
    """Load (and memoize) the pickled document vectors of `dataset`.

    Parameters
    ----------
    dataset : str
        Key into `path_confs`; the pickle path is read from
        `path_confs[dataset]['doc_vectors']`.

    Returns
    -------
    The unpickled object. It is cached in the notebook-level
    `doc_vectors_dict`, so repeated calls return the same object without
    touching the disk again.
    """
    if dataset in doc_vectors_dict:
        return doc_vectors_dict[dataset]
    # NOTE(review): pickle.load can execute arbitrary code -- only point this
    # at files produced by this project.
    # The context manager closes the handle, which the previous
    # `pickle.load(open(...))` form never did.
    with open(path_confs[dataset]['doc_vectors'], "rb") as fh:
        doc_vectors = pickle.load(fh)
    doc_vectors_dict[dataset] = doc_vectors
    return doc_vectors

def get_user_labels(dataset):
    """Return the user-label table of `dataset`, reading it from disk only once.

    The file location comes from `user_level_conf[dataset]["data_path"]`.
    Paths ending in "tsv" are parsed as tab-separated, everything else as
    comma-separated. The first column becomes the index (cast to `str`) and
    the remaining data is squeezed, so a single label column yields a Series.
    The result is stored in the notebook-level `labeled_nodes_dict` cache.
    """
    if dataset not in labeled_nodes_dict:
        labels_path = user_level_conf[dataset]["data_path"]
        delimiter = "\t" if labels_path.endswith("tsv") else ","
        labels = pd.read_csv(labels_path, sep=delimiter, index_col=[0]).squeeze()
        # User ids are compared against string vertex names elsewhere.
        labels.index = labels.index.astype('str')
        labeled_nodes_dict[dataset] = labels
    return labeled_nodes_dict[dataset]

def get_reposts_graph(dataset, min_weight=1):
    """Build (or fetch from cache) the directed, weighted repost graph of `dataset`.

    Parameters
    ----------
    dataset : str
        Key into `path_confs`; the edge list is read from
        `path_confs[dataset]['reposts']`. Supported inputs: `.txt`
        (tab-separated, no header), `.tsv` (tab-separated, first row skipped),
        `.csv` (comma-separated, no header); anything else is treated as a
        pickled dict mapping `(source, target)` to a weight.
    min_weight : int, default 1
        Only edges with `weight >= min_weight` are kept in the returned graph.

    Returns
    -------
    igraph.Graph
        The result of `subgraph_edges` on the cached full graph, restricted to
        the edges that pass the weight filter.

    Side effects
    ------------
    On a cache miss the raw edge DataFrame is stored in `reposts_dict[dataset]`
    and the full (unfiltered) graph in `graphs_dict[dataset]`.
    """
    if dataset not in graphs_dict:
        reposts_path = path_confs[dataset]['reposts']
        if reposts_path.endswith('.txt'):
            reposts_df = pd.read_csv(reposts_path, sep='\t', header=None, names=['source', 'target', 'weight'])
        elif reposts_path.endswith('.tsv'):
            if dataset == 'truth':
                # Bug fix: the parsed frame used to be discarded here, leaving
                # `reposts_df` unbound for the 'truth' dataset.
                reposts_df = pd.read_csv(reposts_path, sep='\t',
                                         names=['source', 'target', 'retruths_list', 'weight'], skiprows=1)
            else:
                reposts_df = pd.read_csv(reposts_path, sep='\t', names=['source', 'target', 'weight'], skiprows=1)
        elif reposts_path.endswith('.csv'):
            reposts_df = pd.read_csv(reposts_path, header=None, names=['source', 'target', 'weight'])
        else:
            # NOTE(review): pickle.load can execute arbitrary code -- only use
            # on files produced by this project.
            with open(reposts_path, "rb") as fh:
                reposts_edge_dict = pickle.load(fh)
            reposts_edge_list = [[k[0], k[1], v] for k, v in tqdm(reposts_edge_dict.items())]
            reposts_df = pd.DataFrame(reposts_edge_list, columns=['source', 'target', 'weight'])

        # Vertex names are strings so they match the string index of the labels.
        reposts_df['source'] = reposts_df['source'].astype(str)
        reposts_df['target'] = reposts_df['target'].astype(str)
        reposts_dict[dataset] = reposts_df

        edges = [tuple(x) for x in reposts_df[['source', 'target', 'weight']].values]
        g = ig.Graph.TupleList(edges, edge_attrs=['weight'], directed=True)

        # Vertices without a known label get -1.
        y = get_user_labels(dataset)
        g.vs['label'] = [y.loc[e['name']] if e['name'] in y.index else -1 for e in g.vs]

        # Collapse parallel edges (summing their attributes) and drop self-loops.
        g.simplify(multiple=True, loops=True, combine_edges='sum')
        g['name'] = dataset
        graphs_dict[dataset] = g

    g = graphs_dict[dataset]
    filtered_edges = g.es.select(weight_ge=min_weight)
    return g.subgraph_edges(filtered_edges)

def get_largest_weak_cc(g):
    """Return the giant component of `g` under weak connectivity.

    `g` must expose `components(mode=...)` whose result has a `giant()`
    method (as an igraph Graph does).
    """
    return g.components(mode='WEAK').giant()

def get_graph_with_docs(dataset):
    """Return the repost graph of `dataset` restricted to the entries of its doc vectors.

    The result is memoized in the notebook-level `graphs_with_docs_dict`.

    Parameters
    ----------
    dataset : str
        Dataset key understood by `get_reposts_graph` and `get_doc_vectors`.

    Returns
    -------
    The graph produced by `G.subgraph(doc_vectors).copy()`.
    """
    if dataset in graphs_with_docs_dict:
        return graphs_with_docs_dict[dataset]

    # `d.get(key, f())` evaluates `f()` even when `key` is present, so the
    # previous one-liners rebuilt/re-filtered the graph and reloaded the
    # vectors on every cache hit. The explicit membership checks below select
    # exactly the same objects without the redundant work.
    if dataset in graphs_dict:
        G = graphs_dict[dataset]
    else:
        built = get_reposts_graph(dataset)
        # Prefer the graph the loader cached (if it cached one) over its
        # weight-filtered return value, as the original expression did.
        G = graphs_dict.get(dataset, built)

    if dataset in doc_vectors_dict:
        doc_vectors = doc_vectors_dict[dataset]
    else:
        doc_vectors = get_doc_vectors(dataset)

    # NOTE(review): presumably iterating `doc_vectors` yields user ids that
    # match vertex names; ids missing from the graph may make `subgraph`
    # fail -- verify against the data.
    H = G.subgraph(doc_vectors).copy()
    graphs_with_docs_dict[dataset] = H
    return H

def get_label_count_and_percent(dataset):
    """Count, for every node, how many of its neighbors carry each label.

    Parameters
    ----------
    dataset : str
        Dataset key passed to `get_graph_with_docs` and `get_user_labels`.

    Returns
    -------
    pandas.DataFrame
        One row per node, one column per label value, integer counts.
        Neighbors that are absent from the label table are counted under
        label 2. Rows and columns are sorted by index.

    Notes
    -----
    The previous body used the networkx API (`G.nodes()`, name-based
    `G.neighbors(n)`), but the graphs built in this notebook are igraph
    graphs, which have no `nodes()` method. igraph graphs are now handled
    explicitly; the networkx-style path is kept as a fallback.
    Out-neighbors are used for igraph, mirroring what `neighbors` means on a
    networkx DiGraph -- NOTE(review): confirm this is the intended direction.
    """
    # Local import: `Counter` is not imported in the visible import cells.
    from collections import Counter

    G = get_graph_with_docs(dataset)
    y = get_user_labels(dataset)

    if hasattr(G, "vs"):
        # igraph: vertices are integer ids, names live in the 'name' attribute.
        names = G.vs["name"]
        neighbor_names = {
            names[i]: [names[j] for j in G.neighbors(i, mode="out")]
            for i in tqdm(range(G.vcount()))
        }
    else:
        # networkx-like graph: nodes are already identified by name.
        neighbor_names = {n: list(G.neighbors(n)) for n in tqdm(G.nodes())}

    ngbrs_labels_count = {
        node: Counter(np.array([y[ngbr] if ngbr in y.index else 2 for ngbr in ngbrs]))
        for node, ngbrs in neighbor_names.items()
    }
    label_count_df = (
        pd.DataFrame.from_dict(ngbrs_labels_count, orient='index')
        .fillna(0)
        .astype(int)
        .sort_index(axis=0)
        .sort_index(axis=1)
    )
    return label_count_df

def power_law(x, alpha, C):
    """Evaluate the power-law model `C * x ** (-alpha)`.

    Works element-wise when `x` is a numpy array.
    """
    decay = x ** (-alpha)
    return C * decay

def get_labeled_nodes(dataset: str = '', g: "ig.Graph" = None):
    """Return the vertices of a graph whose 'label' attribute is not -1.

    Parameters
    ----------
    dataset : str, optional
        Dataset key; used to look the graph up in the notebook-level
        `graphs_dict` when `g` is not given (the graph must already be cached
        there).
    g : igraph.Graph, optional
        Graph to inspect. Takes precedence over `dataset` when both are given.

    Returns
    -------
    The result of `g.vs.select(...)` keeping vertices with `label != -1`.

    Raises
    ------
    ValueError
        If neither `dataset` nor `g` is provided.

    Notes
    -----
    Passing both arguments used to raise the "can't be empty" error even
    though nothing was empty; it is now accepted. The unused label-table load
    was removed, since the 'label' vertex attribute is all that is read.
    """
    if g is None:
        if not dataset:
            raise ValueError("Both @dataset and @g can't be empty")
        g = graphs_dict[dataset]
    labeled_nodes = g.vs.select(lambda v: v['label'] != -1)
    return labeled_nodes

def calc_power_law_exp(degrees, dataset=None):
    """Fit `frequency = C * degree ** (-alpha)` to a degree sequence and plot the fit.

    Parameters
    ----------
    degrees : array-like of non-negative ints
        Node degrees (one entry per node).
    dataset : str, optional
        Name used in the plot title and in the output file name. When omitted,
        a notebook-level global named `dataset` is used if one exists (this is
        what the function implicitly relied on before), otherwise 'unknown'.

    Returns
    -------
    (alpha, C) : tuple of float
        The fitted exponent and scale.

    Side effects
    ------------
    Draws a log-log scatter of the degree histogram with the fitted curve,
    saves it to `detection/experiments/{dataset}_power_law.png` and shows it.
    """
    if dataset is None:
        # Backward compatibility with the former hidden dependency on a global.
        dataset = globals().get('dataset', 'unknown')

    # Histogram of degrees: degree_counts[d] == number of nodes with degree d.
    degree_values = np.array(degrees)
    degree_counts = np.bincount(degree_values)

    # Keep only degrees that actually occur ...
    x = np.nonzero(degree_counts)[0]
    # ... and drop degree 0: 0 ** (-alpha) is infinite, which breaks the fit,
    # and 0 cannot be drawn on a log axis anyway.
    x = x[x > 0]
    y = degree_counts[x]

    # Perform the curve fitting
    popt, _ = curve_fit(power_law, x, y)
    alpha, C = popt

    plt.scatter(x, y, label="Data")
    plt.plot(x, power_law(x, alpha, C), color='red', label=f"Power-law fit (alpha={alpha:.2f})")
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel("Degree")
    plt.ylabel("Frequency")
    plt.legend()
    plt.title(f'{dataset} Power Law')
    plt.savefig(f'detection/experiments/{dataset}_power_law.png', dpi=300)
    plt.show()

    return alpha, C

def get_ego_subgraph(g, vertices, order_k = 1):
    """Return the subgraph induced by the out-neighborhoods of `vertices`.

    Parameters
    ----------
    g : igraph.Graph
        Graph to extract from.
    vertices : int or iterable of int
        Seed vertex (or vertices). A single vertex index is accepted as well
        as a list; previously a single index crashed because igraph then
        returns one flat neighborhood list instead of a list of lists.
    order_k : int, default 1
        Neighborhood order passed to `g.neighborhood` as `order`.

    Returns
    -------
    The result of `g.subgraph` on the union of all neighborhoods.
    """
    neighborhoods = g.neighborhood(vertices=vertices, order=order_k, mode='out')
    # Normalize the single-vertex return shape (flat list of ids) to a list of lists.
    if neighborhoods and not isinstance(neighborhoods[0], (list, tuple)):
        neighborhoods = [neighborhoods]
    # Union of all neighborhoods; sorted only to make the vertex list deterministic.
    members = sorted({v for hood in neighborhoods for v in hood})
    return g.subgraph(members)