# Embedding and Classification Methods

This notebook contains several graph-embedding methods and one classification method (SVM).

In [1]:
import csv
import random

import snap
import pandas as pd
import networkx as nx
from node2vec import Node2Vec
#from deepwalk import DeepWalk
#from graphembedding.sdne import SDNE
#from gem.embedding.hope import HOPE

import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
#from GraphEmbedding.ge import LINE, SDNE, Struc2Vec
from joblib import parallel_config
from karateclub import SINE, AE, HOPE



%run Fairness.ipynb



**Embedding Node2Vec:**
- creates an embedding of a graph using the Node2Vec algorithm
- returns an embedding matrix of the node vectors

In [2]:
def emebedding_node2Vec(graph, dimensions, walk_length, num_walks, workers):
    """Embed ``graph`` with Node2Vec and return the node vectors as a matrix.

    Parameters
    ----------
    graph
        Graph handed to ``Node2Vec``; must expose ``nodes()``.
    dimensions, walk_length, num_walks, workers
        Forwarded unchanged to the ``Node2Vec`` constructor.

    Returns
    -------
    numpy.ndarray
        One row per distinct ``str(node)`` key, in ``graph.nodes()`` order.
    """
    walker = Node2Vec(
        graph,
        dimensions=dimensions,
        walk_length=walk_length,
        num_walks=num_walks,
        workers=workers,
    )

    # fit() hands back the trained model whose ``wv`` holds the vectors.
    trained = walker.fit(window=10, min_count=1, batch_words=4)

    # Vectors are looked up by the string form of each node.
    vectors_by_key = {}
    for graph_node in graph.nodes():
        key = str(graph_node)
        vectors_by_key[key] = trained.wv[key]

    return np.array(list(vectors_by_key.values()))

**DeepWalk:**
- calculates the embedding of each node from random walks started at every node
- returns: the embeddings

In [3]:
def deepWalk_(graph, number_of_random_walks, walk_length, window):
    """Learn node embeddings by training Word2Vec on random walks over ``graph``.

    Parameters
    ----------
    graph
        Graph exposing ``nodes()`` (and ``neighbors()``, used by
        ``get_random_walk``).
    number_of_random_walks : int
        Number of walks started from every node.
    walk_length : int
        Maximum number of nodes per walk.
    window : int
        Context window size passed to ``Word2Vec``.

    Returns
    -------
    The ``wv`` attribute of the trained ``Word2Vec`` model.

    NOTE(review): the return value is the model's vector store, not a matrix
    ordered like ``graph.nodes()``. Callers that index it by integer position
    should confirm that rows line up with their labels.
    """
    start_nodes = list(graph.nodes())

    # Echo the hyper-parameters of this run.
    print(number_of_random_walks)
    print(walk_length)
    print(window)

    # ``number_of_random_walks`` walks per start node, grouped by start node.
    random_walks = [
        get_random_walk(graph, start, walk_length)
        for start in tqdm(start_nodes)
        for _ in range(number_of_random_walks)
    ]

    # The walks serve as the "sentences" of the Word2Vec corpus.
    w2v = Word2Vec(
        window=window,
        sg=1,
        hs=0,
        negative=10,
        alpha=0.03,
        min_alpha=0.0007,
    )
    w2v.build_vocab(random_walks, progress_per=2)
    w2v.train(
        random_walks,
        total_examples=w2v.corpus_count,
        epochs=20,
        report_delay=1,
    )

    return w2v.wv
    
def get_random_walk(graph, node, walk_length):
    """Generate one self-avoiding random walk that starts at ``node``.

    At every step the walk moves to a randomly chosen neighbour of the current
    node that has not been visited yet. It stops early when no such neighbour
    is left.

    Parameters
    ----------
    graph
        Object exposing ``neighbors(node)``.
    node
        Start node of the walk.
    walk_length : int
        Maximum number of nodes in the walk, including the start node.

    Returns
    -------
    list
        The visited nodes in visiting order; always contains the start node.
    """
    walk = [node]
    current = node

    for _ in range(walk_length - 1):
        adjacent = list(graph.neighbors(current))
        # Drop every neighbour that is already part of the walk.
        candidates = list(set(adjacent) - set(walk))
        if not candidates:
            break
        current = random.choice(candidates)
        walk.append(current)

    return walk

**SINE**:
- creates an attributed embedding by learning a low-dimensional vector representation
- returns an embedding matrix of the node vectors

In [4]:
def sine(graph, coo_matrix, **params):
    """Fit a karateclub ``SINE`` model and return the embedding it learned.

    Parameters
    ----------
    graph
        Graph passed as the first argument to ``SINE.fit``.
    coo_matrix
        Node attributes passed as the second argument to ``SINE.fit``.
    **params
        Keyword arguments forwarded unchanged to the ``SINE`` constructor.

    Returns
    -------
    The result of ``SINE.get_embedding()`` after fitting.
    """
    sine_model = SINE(**params)
    sine_model.fit(graph, coo_matrix)
    return sine_model.get_embedding()

**Attributed Embedding**
- creates a low-dimensional representation of the nodes by considering node attributes (coo_matrix)
- returns an embedding matrix of the node vectors

In [5]:
def ae(graph, coo_matrix, **params):
    """Fit a karateclub ``AE`` model and return the embedding it learned.

    Parameters
    ----------
    graph
        Graph passed as the first argument to ``AE.fit``.
    coo_matrix
        Node attributes passed as the second argument to ``AE.fit``.
    **params
        Keyword arguments forwarded unchanged to the ``AE`` constructor.

    Returns
    -------
    The result of ``AE.get_embedding()`` after fitting.
    """
    ae_model = AE(**params)
    ae_model.fit(graph, coo_matrix)
    return ae_model.get_embedding()

In [6]:
# def deepWalk_embedding(graph):
#     num_walks = 10
#     walk_length = 30

#     # Train DeepWalk
#     model = DeepWalk(G, num_walks=num_walks, walk_length=walk_length, workers=4)
#     model.train(window_size=5, iter=1)  # You can adjust window_size and iter based on your needs

#     # Get embeddings for nodes
#     embeddings = model.get_embeddings()

In [7]:
def hope_embedding(graph, **params):
    """Fit a karateclub ``HOPE`` model on ``graph`` and return its node embedding.

    Parameters
    ----------
    graph
        Graph passed to ``HOPE.fit``.
    **params
        Keyword arguments forwarded unchanged to the ``HOPE`` constructor.

    Returns
    -------
    The result of ``HOPE.get_embedding()`` after fitting.
    """
    hope_model = HOPE(**params)
    hope_model.fit(graph)
    return hope_model.get_embedding()

**Classification SVM**
- separates the dataset into a training set and a test set
- creates an SVM classifier
- trains the classifier on the training set
- makes predictions on the test set
- returns: predictions, true labels, indices of the test set

In [16]:
def classification_SVM(embedding_matrix, labels):
    """Train an RBF-kernel SVM on one part of the embeddings and predict the rest.

    The train/test partition is produced by ``splitdataset``.

    Parameters
    ----------
    embedding_matrix
        Per-sample feature vectors, handed to ``splitdataset`` unchanged.
    labels
        Per-sample labels; converted to a NumPy array before splitting.

    Returns
    -------
    tuple
        ``(predictions, y_test, test_indices)``: the classifier's predictions
        for the test samples, their true labels, and their indices.
    """
    labels_array = np.array(labels)
    print(labels_array)

    # Only the test indices are needed later; the training indices are discarded.
    (X_train, X_test,
     y_train, y_test,
     _train_indices, test_indices) = splitdataset(embedding_matrix, labels_array)

    # Fixed random_state, as in every run of this notebook.
    classifier = SVC(kernel='rbf', random_state=42)
    classifier.fit(X_train, y_train)

    return classifier.predict(X_test), y_test, test_indices

**Splitdataset:**
- splits the dataset into a training set and a test set
- returns: lists of training data/labels and test data/labels, plus the corresponding indices

In [15]:
def splitdataset(embeddings, labels, train_fraction=0.7):
    """Randomly split embeddings and labels into a training and a test part.

    Parameters
    ----------
    embeddings
        Indexable container; ``embeddings[i]`` is taken as the feature vector
        that belongs to ``labels[i]``.
    labels
        Sequence with one label per sample; its length defines the number of
        samples.
    train_fraction : float, default 0.7
        Share of the samples placed in the training set (rounded down).

    Returns
    -------
    tuple
        ``(train_embeddings, test_embeddings, train_labels, test_labels,
        train_indices, test_indices)``. The first four entries are Python
        lists, the last two are NumPy index arrays. Six ``None`` values are
        returned if the two index sets do not add up to the number of samples.

    Raises
    ------
    ValueError
        If ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError("train_fraction must be between 0 and 1")

    num_samples = len(labels)
    num_train_samples = int(train_fraction * num_samples)

    # Draw the training indices without replacement; every index that was not
    # drawn goes to the test set.
    train_indices = np.random.choice(num_samples, size=num_train_samples, replace=False)
    test_indices = np.setdiff1d(np.arange(num_samples), train_indices)

    # Sanity check: both index sets together must account for every sample.
    if len(train_indices) + len(test_indices) != num_samples:
        return None, None, None, None, None, None

    train_embeddings = [embeddings[idx] for idx in train_indices]
    train_labels = [labels[idx] for idx in train_indices]
    test_embeddings = [embeddings[idx] for idx in test_indices]
    test_labels = [labels[idx] for idx in test_indices]

    return train_embeddings, test_embeddings, train_labels, test_labels, train_indices, test_indices
        
    
    

**HOPE grid search**:
- applies a grid search to the HOPE algorithm to find the parameters that give the best result
- returns the best parameters, the best FPR for the female and male groups, and a DataFrame with the results for each fairness definition

In [10]:
def hope_grid_search(G_nx, parameter_grid, df_target, gender):
    """Grid-search the HOPE embedding parameters by false-positive rate.

    For every parameter combination the graph is embedded with
    ``hope_embedding``, an SVM is trained via ``classification_SVM`` and the
    false-positive rates of the two gender groups are computed. A combination
    becomes the new best only if BOTH rates are strictly lower than the best
    ones seen so far.

    Parameters
    ----------
    G_nx
        Graph passed to ``hope_embedding``.
    parameter_grid
        Grid definition passed to ``ParameterGrid``; each combination is
        forwarded to ``hope_embedding`` as keyword arguments.
    df_target
        Iterable of target labels (converted with ``list``).
    gender
        Iterable of group membership values (converted with ``list``).

    Returns
    -------
    tuple
        ``(best_params, best_fpr_f, best_fpr_m, dataframe)``. If no combination
        is accepted this is ``(None, 2.0, 2.0, <empty DataFrame>)``.

    Notes
    -----
    ``seperate_female_male``, ``false_positive_error_rate_balance`` and
    ``return_frame_with_evaluations`` are not defined in this notebook;
    presumably they come from the ``%run Fairness.ipynb`` cell (verify).
    """
    best_params = None
    # 2.0 is the starting sentinel; presumably above any real rate (confirm).
    best_fpr_f = 2.0
    best_fpr_m = 2.0
    dataframe = pd.DataFrame()

    for params in ParameterGrid(parameter_grid):
        # Embed the graph with the current HOPE parameters.
        embedding_hope = hope_embedding(G_nx, **params)

        predictions, y_test, test_indices = classification_SVM(embedding_hope, list(df_target))
        y_pred_f, y_pred_m, y_true_f, y_true_m = seperate_female_male(
            predictions, y_test, test_indices, list(gender)
        )
        fpF, fp_m = false_positive_error_rate_balance(y_true_f, y_pred_f, y_true_m, y_pred_m)
        print('fpF:', fpF)
        print('fpM:', fp_m)
        evaluation_frame = return_frame_with_evaluations(predictions, y_test, test_indices, list(gender))

        # Accept only if both groups improve at the same time.
        if best_fpr_f > fpF and best_fpr_m > fp_m:
            best_fpr_f = fpF
            best_fpr_m = fp_m
            best_params = params
            dataframe = evaluation_frame

    return best_params, best_fpr_f, best_fpr_m, dataframe


**Perform grid search DeepWalk**:
- applies a grid search to the DeepWalk algorithm to find the parameters that give the best result
- returns the best parameters, the best FPR for the female and male groups, and a DataFrame with the results for each fairness definition

In [11]:
def perform_grid_search_DeepWalk(G_nx, parameter_grid, df_target, gender):
    """Grid-search the DeepWalk parameters by false-positive rate.

    For every parameter combination the graph is embedded with ``deepWalk_``,
    an SVM is trained via ``classification_SVM`` and the false-positive rates
    of the two gender groups are computed. A combination becomes the new best
    only if BOTH rates are strictly lower than the best ones seen so far.

    Parameters
    ----------
    G_nx
        Graph passed to ``deepWalk_``.
    parameter_grid
        Grid definition passed to ``ParameterGrid``; each combination is
        forwarded to ``deepWalk_`` as keyword arguments.
    df_target
        Iterable of target labels (converted with ``list``).
    gender
        Iterable of group membership values (converted with ``list``).

    Returns
    -------
    tuple
        ``(best_params, best_fpr_f, best_fpr_m, dataframe)``. If no combination
        is accepted this is ``(None, 2.0, 2.0, <empty DataFrame>)``.

    Notes
    -----
    ``seperate_female_male``, ``false_positive_error_rate_balance`` and
    ``return_frame_with_evaluations`` are not defined in this notebook;
    presumably they come from the ``%run Fairness.ipynb`` cell (verify).

    NOTE(review): ``deepWalk_`` returns the Word2Vec vector store, which is
    handed to the classifier as-is and indexed by integer position there.
    Confirm that this ordering matches the order of ``df_target``.
    """
    best_params = None
    # 2.0 is the starting sentinel; presumably above any real rate (confirm).
    best_fpr_f = 2.0
    best_fpr_m = 2.0
    dataframe = pd.DataFrame()

    for params in ParameterGrid(parameter_grid):
        # Embed the graph with the current DeepWalk parameters.
        embedding_deepwalk = deepWalk_(G_nx, **params)

        predictions, y_test, test_indices = classification_SVM(embedding_deepwalk, list(df_target))
        y_pred_f, y_pred_m, y_true_f, y_true_m = seperate_female_male(
            predictions, y_test, test_indices, list(gender)
        )
        fpF, fp_m = false_positive_error_rate_balance(y_true_f, y_pred_f, y_true_m, y_pred_m)
        print('fpF:', fpF)
        print('fpM:', fp_m)
        evaluation_frame = return_frame_with_evaluations(predictions, y_test, test_indices, list(gender))

        # Accept only if both groups improve at the same time.
        if best_fpr_f > fpF and best_fpr_m > fp_m:
            best_fpr_f = fpF
            best_fpr_m = fp_m
            best_params = params
            dataframe = evaluation_frame

    return best_params, best_fpr_f, best_fpr_m, dataframe



**Perform grid search**:
- applies a grid search to the Node2Vec algorithm to find the parameters that give the best result
- returns the best parameters, the best FPR for the female and male groups, and a DataFrame with the results for each fairness definition

In [12]:
def perform_grid_search(G_nx, parameter_grid, df_target, gender):
    """Grid-search the Node2Vec parameters by false-positive rate.

    For every parameter combination the graph is embedded with
    ``emebedding_node2Vec``, an SVM is trained via ``classification_SVM`` and
    the false-positive rates of the two gender groups are computed. A
    combination becomes the new best only if BOTH rates are strictly lower
    than the best ones seen so far.

    Parameters
    ----------
    G_nx
        Graph passed to ``emebedding_node2Vec``.
    parameter_grid
        Grid definition passed to ``ParameterGrid``; each combination is
        forwarded to ``emebedding_node2Vec`` as keyword arguments.
    df_target
        Iterable of target labels (converted with ``list``).
    gender
        Iterable of group membership values (converted with ``list``).

    Returns
    -------
    tuple
        ``(best_params, best_fpr_f, best_fpr_m, dataframe)``. If no combination
        is accepted this is ``(None, 2, 2, [])``.

    Notes
    -----
    ``seperate_female_male``, ``false_positive_error_rate_balance`` and
    ``return_frame_with_evaluations`` are not defined in this notebook;
    presumably they come from the ``%run Fairness.ipynb`` cell (verify).
    """
    best_params = None
    # 2 is the starting sentinel; presumably above any real rate (confirm).
    best_fpr_f = 2
    best_fpr_m = 2
    dataframe = []

    for params in ParameterGrid(parameter_grid):
        # Embed the graph with the current Node2Vec parameters.
        embedding_node2vec = emebedding_node2Vec(G_nx, **params)
        print('done create embeding vectors')

        predictions, y_test, test_indices = classification_SVM(embedding_node2vec, list(df_target))
        y_pred_f, y_pred_m, y_true_f, y_true_m = seperate_female_male(
            predictions, y_test, test_indices, list(gender)
        )

        fpF, fp_m = false_positive_error_rate_balance(y_true_f, y_pred_f, y_true_m, y_pred_m)
        print('done iteration')
        evaluation_frame = return_frame_with_evaluations(predictions, y_test, test_indices, list(gender))
        print('fpF:', fpF)
        print('fpM:', fp_m)

        # Accept only if both groups improve at the same time.
        if best_fpr_f > fpF and best_fpr_m > fp_m:
            best_fpr_f = fpF
            best_fpr_m = fp_m
            best_params = params
            dataframe = evaluation_frame

    return best_params, best_fpr_f, best_fpr_m, dataframe

**Grid search SINE:**
- applies a grid search to the SINE algorithm to find the parameters that give the best result
- returns the best parameters, the best FPR for the female and male groups, and a DataFrame with the results for each fairness definition

In [13]:
def grid_search_SINE(G_nx, features, parameter_grid, df_target, gender):
    """Grid-search the SINE parameters by false-positive rate.

    For every parameter combination the graph is embedded with ``sine``, an
    SVM is trained via ``classification_SVM`` and the false-positive rates of
    the two gender groups are computed. A combination becomes the new best
    only if BOTH rates are strictly lower than the best ones seen so far.

    Parameters
    ----------
    G_nx
        Graph passed to ``sine``.
    features
        Node attributes passed to ``sine`` as its ``coo_matrix`` argument.
    parameter_grid
        Grid definition passed to ``ParameterGrid``; each combination is
        forwarded to ``sine`` as keyword arguments.
    df_target
        Iterable of target labels (converted with ``list``).
    gender
        Iterable of group membership values (converted with ``list``).

    Returns
    -------
    tuple
        ``(best_params, best_fpr_f, best_fpr_m, dataframe)``. If no combination
        is accepted this is ``(None, 2, 2, [])``.

    Notes
    -----
    ``seperate_female_male``, ``false_positive_error_rate_balance`` and
    ``return_frame_with_evaluations`` are not defined in this notebook;
    presumably they come from the ``%run Fairness.ipynb`` cell (verify).
    """
    best_params = None
    # 2 is the starting sentinel; presumably above any real rate (confirm).
    best_fpr_f = 2
    best_fpr_m = 2
    dataframe = []

    for params in ParameterGrid(parameter_grid):
        # Use the caller-supplied attributes, not a global named ``coo_matrix``.
        embeddingMA = sine(G_nx, features, **params)
        print('done create embeding vectors')

        predictions, y_test, test_indices = classification_SVM(embeddingMA, list(df_target))
        y_pred_f, y_pred_m, y_true_f, y_true_m = seperate_female_male(
            predictions, y_test, test_indices, list(gender)
        )

        fpF, fp_m = false_positive_error_rate_balance(y_true_f, y_pred_f, y_true_m, y_pred_m)
        print('done iteration')
        evaluation_frame = return_frame_with_evaluations(predictions, y_test, test_indices, list(gender))
        print('fpF:', fpF)
        print('fpM:', fp_m)

        # Accept only if both groups improve at the same time.
        if best_fpr_f > fpF and best_fpr_m > fp_m:
            best_fpr_f = fpF
            best_fpr_m = fp_m
            best_params = params
            dataframe = evaluation_frame

    return best_params, best_fpr_f, best_fpr_m, dataframe

**Grid search AE**:
- applies a grid search to the AE algorithm to find the parameters that give the best result
- returns the best parameters, the best FPR for the female and male groups, and a DataFrame with the results for each fairness definition

In [14]:
def grid_search_AE(G_nx, features, parameter_grid, df_target, gender):
    """Grid-search the AE parameters by false-positive rate.

    For every parameter combination the graph is embedded with ``ae``, an SVM
    is trained via ``classification_SVM`` and the false-positive rates of the
    two gender groups are computed. A combination becomes the new best only if
    BOTH rates are strictly lower than the best ones seen so far.

    Parameters
    ----------
    G_nx
        Graph passed to ``ae``.
    features
        Node attributes passed to ``ae`` as its ``coo_matrix`` argument.
    parameter_grid
        Grid definition passed to ``ParameterGrid``; each combination is
        forwarded to ``ae`` as keyword arguments.
    df_target
        Iterable of target labels (converted with ``list``).
    gender
        Iterable of group membership values (converted with ``list``).

    Returns
    -------
    tuple
        ``(best_params, best_fpr_f, best_fpr_m, dataframe)``. If no combination
        is accepted this is ``(None, 2, 2, [])``.

    Notes
    -----
    ``seperate_female_male``, ``false_positive_error_rate_balance`` and
    ``return_frame_with_evaluations`` are not defined in this notebook;
    presumably they come from the ``%run Fairness.ipynb`` cell (verify).
    """
    best_params = None
    # 2 is the starting sentinel; presumably above any real rate (confirm).
    best_fpr_f = 2
    best_fpr_m = 2
    dataframe = []

    for params in ParameterGrid(parameter_grid):
        # Use the caller-supplied attributes, not a global named ``coo_matrix``.
        embeddingMA = ae(G_nx, features, **params)
        print('done create embeding vectors')

        predictions, y_test, test_indices = classification_SVM(embeddingMA, list(df_target))
        y_pred_f, y_pred_m, y_true_f, y_true_m = seperate_female_male(
            predictions, y_test, test_indices, list(gender)
        )

        fpF, fp_m = false_positive_error_rate_balance(y_true_f, y_pred_f, y_true_m, y_pred_m)
        print('done iteration')
        evaluation_frame = return_frame_with_evaluations(predictions, y_test, test_indices, list(gender))
        print('fpF:', fpF)
        print('fpM:', fp_m)

        # Accept only if both groups improve at the same time.
        if best_fpr_f > fpF and best_fpr_m > fp_m:
            best_fpr_f = fpF
            best_fpr_m = fp_m
            best_params = params
            dataframe = evaluation_frame

    return best_params, best_fpr_f, best_fpr_m, dataframe