In [1]:
import networkx as nx
# https://networkx.github.io/documentation/stable/reference/index.html
import matplotlib.pyplot as plt
from matplotlib import pylab
import numpy as np
import pandas as pd

In [2]:
nx.__version__

'3.1'

# Common files : 

In [3]:
# Open the groundtruth_files
groundtruth_college = pd.read_csv('./groundtruth/college.csv', sep='\t', header='infer')
groundtruth_employer = pd.read_csv('./groundtruth/employer.csv', sep='\t', header='infer')
groundtruth_location = pd.read_csv('./groundtruth/location.csv', sep='\t', header='infer')

# Open the files with only a part of the data
current_knowledge_college = pd.read_csv('college_with_60percent_of_nodes_remoded.csv', sep='\t', header='infer')
current_knowledge_employer = pd.read_csv('employer_with_60percent_of_nodes_remoded.csv', sep='\t', header='infer')
current_knowledge_location = pd.read_csv('location_with_60percent_of_nodes_remoded.csv', sep='\t', header='infer')

# open the graph file
graph = nx.read_gexf("mediumLinkedin.gexf")

# Global variables for predictions 
prediction_college = {}
prediction_employer = {}
prediction_location = {}

# Get the list of filled nodes 
filled_nodes_college = current_knowledge_college['name'].drop_duplicates().tolist()
filled_nodes_employer = current_knowledge_employer['name'].drop_duplicates().tolist()
filled_nodes_location = current_knowledge_location['name'].drop_duplicates().tolist()
filled_nodes = pd.concat([current_knowledge_college['name'], 
                               current_knowledge_employer['name'], 
                               current_knowledge_location['name']]).drop_duplicates().tolist()
                               

# Get the list of nodes that have no information
empty_nodes = set(graph.nodes())-set(filled_nodes)

cliques = nx.enumerate_all_cliques(graph)

# Data understanding : 

Here, we will check the following assertion : 
- In the current knowledge, if a someone has a location for example but no college, then it means that he did not went to college. 

Basically, this assertion will check if the names with missing information in the data are the same for the three current knwoledge. 

In [4]:
# count the number of time a value is present in the current knwoledge of a file but not in another while it is present in the groundtruth
count_anomaly = 0 

for name in filled_nodes_college:
    if name not in filled_nodes_employer and name in groundtruth_employer['name'].tolist():
        count_anomaly += 1
    if name not in filled_nodes_location and name in groundtruth_location['name'].tolist():
        count_anomaly += 1

for name in filled_nodes_employer:
    if name not in filled_nodes_college and name in groundtruth_college['name'].tolist():
        count_anomaly += 1
    if name not in filled_nodes_location and name in groundtruth_location['name'].tolist():
        count_anomaly += 1

for name in filled_nodes_location:
    if name not in filled_nodes_employer and name in groundtruth_employer['name'].tolist():
        count_anomaly += 1
    if name not in filled_nodes_college and name in groundtruth_college['name'].tolist():
        count_anomaly += 1

print(f"{count_anomaly} anomalies detected")

0 anomalies detected


0 anomalies were detected. It means that if a value is missing in a current_knowledge file while it is not in another, then the value trully does not exist. 

# Data preparation

### Data Cleaning

This part aims at cleaning the data we have at our disposal. an observation was made that some rows have the same value but with a slight difference. For example, the same name is written in two different ways. So, to counter this, we will process each file in order to format the names that are similar both in the missing data files and in the ground truth. This will be done by creating new files in order to keep the original data files untouched. We will only clean the college dataset since the other two datasets seem to be the clean. 

To clean the college data, we will only go through each row and remove the terms : 
- college
- university
- institute
- technology
- science
- of
- at 
- -/&
- numbers 

The idea will be to create a new file which is a copy of the previous file and to add a filtered column and a new name one. For each row analysis, we will see if the same filtered name was already found. if not, then the new name will be the original one. If it was found then the new name will be the new name of the previous row that was dealing with the same filtered name. 


In [5]:
import shutil

# Create a copy of each groundtruth files
new_groundtruth_college = shutil.copy("groundtruth/college.csv", "cleaned_groundtruth/cleaned_college.csv")

# Create a copy of each missing values files
new_current_knowledge_college = shutil.copy("college_with_60percent_of_nodes_remoded.csv", "cleaned_current_knwoledge/cleaned_college_with_60percent_of_nodes_remoded.csv")


In [6]:
import re
import pandas as pd
from html import unescape  
from difflib import SequenceMatcher  

def calculate_similarity(str1, str2):
    matcher = SequenceMatcher(None, str1, str2)
    return matcher.ratio()

def clean_college_data(input_file, output_file):
    # Regular expression pattern to match terms to be removed
    remove_pattern = re.compile(r'\b(college|university|institute|technology|science|business|of|at|and|finance|economics|\d+|[-&])\b', re.IGNORECASE)

    # Read the input file
    df = pd.read_csv(input_file, sep='\t', header='infer')

    # Replace HTML entities with their corresponding characters
    df['college'] = df['college'].apply(unescape)

    # Create a new DataFrame to store the cleaned data
    cleaned_df = df.copy()

    # Create a dictionary to store mappings from filtered names to new names
    name_mappings = {}

    # Create a counter to track the number of modifications
    modification_counter = 0

    # Iterate through each row
    for index, row in cleaned_df.iterrows():
        # Get the original name
        original_name = row['college']

        # Remove the specified terms from the original name
        filtered_name = remove_pattern.sub('', original_name).strip()

        # Check if the filtered name has been encountered before
        if filtered_name in name_mappings:
            # If yes, assign the new name from the previous occurrence
            new_name = name_mappings[filtered_name]
        else:
            # If not, assign the original name as the new name
            new_name = original_name
            # Store the mapping for future occurrences
            name_mappings[filtered_name] = original_name

            # Calculate similarity with all existing filtered names
            for existing_filtered_name in name_mappings.keys():
                similarity_score = calculate_similarity(filtered_name, existing_filtered_name)
                # If similarity score is above 0.80, use existing name
                if similarity_score > 0.80:
                    new_name = name_mappings[existing_filtered_name]
                    break  # Exit loop if a similar name is found

        if new_name != original_name:
            modification_counter += 1
            print('old name mismatch:', original_name, 'new name:', new_name)

        # Update the 'Filtered Name' column with the filtered name
        cleaned_df.at[index, 'Filtered Name'] = filtered_name
        # Update the 'New Name' column with the new name
        cleaned_df.at[index, 'New Name'] = new_name

    # Remove the 'college' and 'Filtered Name' columns
    cleaned_df.drop(columns=['college', 'Filtered Name'], inplace=True)

    # Rename the new name column to college
    cleaned_df = cleaned_df.rename(columns={'New Name': 'college'})

    # Write the cleaned data to the output file
    cleaned_df.to_csv(output_file, sep='\t', index=False)

    # Print the number of modifications made
    print(f"Number of modifications made: {modification_counter}")


In [7]:
current_knowledge_college_filename = 'college_with_60percent_of_nodes_remoded.csv'
groundtruth_college_filename = 'groundtruth/college.csv'
cleaned_current_knowledge_college_filename = "cleaned_current_knwoledge/cleaned_college_with_60percent_of_nodes_remoded.csv"
cleaned_groundtruth_college_filename = "cleaned_groundtruth/cleaned_college.csv"

clean_college_data(current_knowledge_college_filename, cleaned_current_knowledge_college_filename)
clean_college_data(groundtruth_college_filename, cleaned_groundtruth_college_filename)

old name mismatch: university of illinois at urbana-champaign - college of business new name: university of illinois at urbana-champaign
old name mismatch: bangladesh university of engineering and technology (buet) new name: bangladesh university of engineering and technology
old name mismatch: athens university of economics and business new name: university of athens
old name mismatch: beijing university of technology new name: university of science and technology beijing
old name mismatch: athens university of economics and business new name: university of athens
old name mismatch: beijing institute of technology new name: university of science and technology beijing
old name mismatch: shanghai jiaotong university new name: shanghai jiao tong university
old name mismatch: university of isfahan new name: isfahan university of technology
old name mismatch: esfahan university new name: isfahan university of technology
old name mismatch: national taiwan university of science and technolo

old name mismatch: birla institute of technology and science new name: birla institute of technology
old name mismatch: birla institute of technology and science new name: birla institute of technology
old name mismatch: university of illinois - urbana champaign new name: university of illinois at urbana-champaign
old name mismatch: birla institute of technology and science new name: birla institute of technology
old name mismatch: birla institute of technology and science new name: birla institute of technology
Number of modifications made: 15
old name mismatch: university of illinois at urbana-champaign - college of business new name: university of illinois at urbana-champaign
old name mismatch: birla institute of technology new name: birla institute of technology and science
old name mismatch: shanghai jiaotong university new name: shanghai jiao tong university
old name mismatch: bangladesh university of engineering and technology (buet) new name: bangladesh university of engineerin

In [8]:
# Open the groundtruth_files
groundtruth_college = pd.read_csv('cleaned_groundtruth/cleaned_college.csv', sep='\t', header='infer')

# Open the files with only a part of the data
current_knowledge_college = pd.read_csv('cleaned_current_knwoledge/cleaned_college_with_60percent_of_nodes_remoded.csv', sep='\t', header='infer')


# Accuracy Definitions : 

## First definition : 

The first definition of the accuracy is based on the number of good prediciton over all the predictions.

From a mathematical point of view, we could represent it like this : 
\begin{equation*}
\text{Accuracy} = \frac{\text{Number of correct predictions}}{\text{Total number of predictions}}
\end{equation*}

In [9]:
def accuracy_good_predictions_over_number_of_predictions(groundtruth, predictions):
    true_positive_prediction = 0
    predicted_values = 0 
    if not predictions:
        return 0
    for name, value in predictions.items():
        groundtruth_values = groundtruth.loc[groundtruth.name == name, groundtruth.columns[1]].values
        if not value and groundtruth_values.size == 0:
            true_positive_prediction += 1
            predicted_values += 1
        true_positive_prediction += len([item for item in value if item in groundtruth_values])
        predicted_values += len([item for item in value])

    return true_positive_prediction*100/predicted_values

## Second definition :

The second definition of the accuracy is based on the number of good values recovered compared to the groundtruth. 

From a mathematical point of view, we could represent it like this : 

\begin{equation*}
\text{Accuracy} = \frac{\text{Number of correct values recovered}}{\text{Total number of expected values in ground truth}}
\end{equation*}

In [10]:
def accuracy_correct_predictions_over_expected_number_of_predictions(groundtruth, predictions):
    true_positive_prediction = 0
    expected_predictions = 0 
    if not predictions:
        return 0
    for name, value in predictions.items():
        groundtruth_values = groundtruth.loc[groundtruth.name == name, groundtruth.columns[1]].values
        if not value and groundtruth_values.size == 0:
            print('a')
            true_positive_prediction += 1
        true_positive_prediction += len([item for item in value if item in groundtruth_values])
    for name in empty_nodes:
        groundtruth_values = groundtruth.loc[groundtruth.name == name, groundtruth.columns[1]].values
        expected_predictions += len(groundtruth_values)
    return true_positive_prediction*100/expected_predictions    

# Definition of generic functions : 

- reset_predictions : 

This function aims at resetting the prediction variables.

- global_evaluation :

This function aims at giving the accuracy of the prediction related to college, employer and location.

In [11]:
def reset_predictions():
    global prediction_college, prediction_employer, prediction_location
    prediction_college = {}
    prediction_employer = {}
    prediction_location = {}

In [12]:
def global_evaluation():
    global groundtruth_college, groundtruth_employer, groundtruth_location
    global current_knowledge_college, current_knowledge_employer, current_knowledge_location
    global prediction_college, prediction_employer, prediction_location
    global graph

    # Evaluation of the college predictions 
    accuracy = accuracy_good_predictions_over_number_of_predictions(groundtruth_college, prediction_college)
    print("%f%% of the college predictions are true" % accuracy)
    accuracy = accuracy_correct_predictions_over_expected_number_of_predictions(groundtruth_college, prediction_college)
    print("%f%% of the college values were found" % accuracy)

    # Evaluation of the employer predictions 
    accuracy = accuracy_good_predictions_over_number_of_predictions(groundtruth_employer, prediction_employer)
    print("%f%% of the employer predictions are true" % accuracy)
    accuracy = accuracy_correct_predictions_over_expected_number_of_predictions(groundtruth_employer, prediction_employer)
    print("%f%% of the employer values were found" % accuracy)

    # Evaluation of the location predictions 
    accuracy = accuracy_good_predictions_over_number_of_predictions(groundtruth_location, prediction_location)
    print("%f%% of the location predictions are true" % accuracy)
    accuracy = accuracy_correct_predictions_over_expected_number_of_predictions(groundtruth_location, prediction_location)
    print("%f%% of the location values were found" % accuracy)    

# Filling empty nodes

# Strategy Number 1 : Predict college based on college of the majority of neighbors

## Example : 

If you have two neighbors with the IMT College and one with the centrale Nantes college, then you predict that the college of the last person will be IMT.

### Implementation of the method : 

In [13]:
def predict_college_from_first_neighbors(graph, empty_nodes, current_knowledge):
    predicted_value = {}
    for node in empty_nodes:
        college_counter = {}
        for neighbor in graph.neighbors(node):
            neighbor_row = current_knowledge.loc[current_knowledge['name'] == neighbor]
            if not neighbor_row.empty:
                colleges = neighbor_row.iloc[:]['college']
                for college in colleges:
                    if college in college_counter:
                        college_counter[college] += 1
                    else:
                        college_counter[college] = 1
        if college_counter: 
            most_common_college = max(college_counter, key=college_counter.get)
            predicted_value[node] = [most_common_college]
    return predicted_value

### Testing of the method accuracy : 

In [14]:
reset_predictions()

# Prediction of missing values using the first method. 
prediction_college = predict_college_from_first_neighbors(graph, empty_nodes, current_knowledge_college)

global_evaluation()

28.404669% of the college predictions are true
21.987952% of the college values were found
0.000000% of the employer predictions are true
0.000000% of the employer values were found
0.000000% of the location predictions are true
0.000000% of the location values were found


As we can see, the predictions of college for users are not very accurate. This method seems to be too simple to be efficient.

# Strategy Number 2 : Detection of community common characteristics

## Example : 

If a node with missing values is part of a community with a common characteristics regarding the college, then this node is likely to have studied in the same college.

In [15]:
def predict_college_from_community(empty_nodes, current_knowledge, communities):
    predicted_value = {}
    for community in communities:
        college_counter = {}
        for node in community:
            node_rows = current_knowledge.loc[current_knowledge['name'] == node]
            if not node_rows.empty:
                colleges = node_rows.iloc[:]['college']
                for college in colleges:
                    if college in college_counter:
                        college_counter[college] += 1
                    else:
                        college_counter[college] = 1
        if college_counter:
            most_common_college = max(college_counter, key=college_counter.get)
            for node in community:
                if node in empty_nodes:
                    predicted_value[node] = [most_common_college]
    return predicted_value

### Testing of the method accuracy : 

In [16]:
reset_predictions()

# create the communities 
communities = nx.community.louvain_communities(graph)

# Prediction of missing values using the community method. 
prediction_college = predict_college_from_community(empty_nodes, current_knowledge_college, communities)

global_evaluation()

23.789474% of the college predictions are true
34.036145% of the college values were found
0.000000% of the employer predictions are true
0.000000% of the employer values were found
0.000000% of the location predictions are true
0.000000% of the location values were found


This strategy does not seem to work very well. This might be due to the fact that the reason why people are part of a community is not necessarely because of the college they did but it might also be because of their location, or their work. 

Now, let's try to keep this last algorithm but improve it by predicting a college if and only if it is the most common factor within the community compared to the location and employer : 

### Strategy 2 improved : predicting only if it is the most common factor within the community is college or employer or location

In [17]:
def predict_college_from_community_iff_community_is_most_common_factor(empty_nodes, current_knowledge_college, current_knowledge_employer, current_knowledge_location, communities):
    prediction_college = {}
    prediction_employer = {}
    prediction_location = {}

    for community in communities:
        college_counter = {}
        employer_counter = {}
        location_counter = {}
        most_common_college = None
        most_common_employer = None
        most_common_location = None
        most_common_college_counter = 0 
        most_common_employer_counter = 0
        most_common_location_counter = 0

        for node in community:
            node_rows_college = current_knowledge_college.loc[current_knowledge_college['name'] == node]
            node_rows_employer = current_knowledge_employer.loc[current_knowledge_employer['name'] == node]
            node_rows_location = current_knowledge_location.loc[current_knowledge_location['name'] == node]

            if not node_rows_college.empty:
                colleges = node_rows_college.iloc[:]['college']
                for college in colleges:
                    if college in college_counter:
                        college_counter[college] += 1
                    else:
                        college_counter[college] = 1
            if not node_rows_location.empty:
                locations = node_rows_location.iloc[:]['location']
                for location in locations:
                    if location in location_counter:
                        location_counter[location] += 1
                    else:
                        location_counter[location] = 1
                    
            if not node_rows_employer.empty:
                employers = node_rows_employer.iloc[:]['employer']
                for employer in employers:
                    if employer in employer_counter:
                        employer_counter[employer] += 1
                    else:
                        employer_counter[employer] = 1
                        
        if college_counter:
            most_common_college = max(college_counter, key=college_counter.get)
            most_common_college_counter = college_counter.get(most_common_college)
        if employer_counter:
            most_common_employer = max(employer_counter, key=employer_counter.get)
            most_common_employer_counter = employer_counter.get(most_common_employer)
        if location_counter:
            most_common_location = max(location_counter, key=location_counter.get)
            most_common_location_counter = location_counter.get(most_common_location)
        
        for node in community:
            if node in empty_nodes:
                if most_common_college_counter >= most_common_employer_counter and most_common_college_counter >= most_common_location_counter:
                    prediction_college[node] = [most_common_college]
                elif most_common_employer_counter >= most_common_college_counter and most_common_employer_counter >= most_common_location_counter:
                    prediction_employer[node] = [most_common_employer]
                else:
                    prediction_location[node] = [most_common_location]
    return prediction_college, prediction_employer, prediction_location

### Testing of the method accuracy : 

In [18]:
reset_predictions()

# create the communities 
communities = nx.community.louvain_communities(graph)

# Prediction of missing values using the community method. 
prediction_college, prediction_employer, prediction_location = predict_college_from_community_iff_community_is_most_common_factor(empty_nodes, current_knowledge_college, current_knowledge_employer, current_knowledge_location, communities)

print(prediction_college, prediction_employer, prediction_location)
global_evaluation()

{'U18557': ['university of florida'], 'U18514': ['university of florida'], 'U18586': ['university of florida'], 'U18518': ['university of florida'], 'U18597': ['university of florida'], 'U18543': ['university of florida'], 'U18560': ['university of florida'], 'U18520': ['university of florida'], 'U18545': ['university of florida'], 'U18516': ['university of florida'], 'U18564': ['university of florida'], 'U18549': ['university of florida'], 'U15318': ['bangladesh university of engineering and technology'], 'U8702': ['bangladesh university of engineering and technology'], 'U15308': ['bangladesh university of engineering and technology'], 'U15368': ['bangladesh university of engineering and technology'], 'U15296': ['bangladesh university of engineering and technology'], 'U15335': ['bangladesh university of engineering and technology'], 'U15350': ['bangladesh university of engineering and technology'], 'U15267': ['bangladesh university of engineering and technology'], 'U15317': ['banglade

The results are way better than before. In addition, it predicted some values for the employer and the location

# Strategy number 3 : cliques analysis version 1

Now, let's try to find some analysis that could improve the existant analysis. Here, we will analyse each clique in the datasets and for each clique, we will try to find out what is the common value that is connecting thoses nodes.

Precisely, here is what we are going to do : 

- Go through each node for which we don't know the data 
- Initialize a dictionnary that will contain the best scores for college, employer and location
- Go through each clique containing the node 
- Create a counter of occurrences for each college, employer, location that will be present in the information of the nodes in the clique
- For each node of the clique, get the universities, employer and location if it exists and increment the counter for these informations 
- once it is done, get the information (college, employer, location) that has the best score within the clique and compare it to the existing dictionnary of the node.  - If the score is better, then replace the value in the dictionnary.

In [19]:
def analyse_cliques_version_1(graph,empty_nodes, current_knowledge_college, current_knowledge_employer, current_knowledge_location):
    prediction_college = {}
    prediction_employer = {}
    prediction_location = {}
    i = 0 

    for node in empty_nodes: 
        best_college_score = [None, 0]
        best_employer_score = [None, 0]
        best_location_score = [None, 0]
            
        cliques = nx.enumerate_all_cliques(graph)
        for clique in cliques: 
            if node in clique:
                college_counter = {}
                employer_counter = {}
                location_counter = {}
                most_common_college = None
                most_common_employer = None
                most_common_location = None
                most_common_college_counter = 0 
                most_common_employer_counter = 0
                most_common_location_counter = 0
                for member in clique:
                    if member not in empty_nodes:
                        member_rows_college = current_knowledge_college.loc[current_knowledge_college['name'] == member]
                        member_rows_employer = current_knowledge_employer.loc[current_knowledge_employer['name'] == member]
                        member_rows_location = current_knowledge_location.loc[current_knowledge_location['name'] == member]

                        if not member_rows_college.empty:
                            colleges = member_rows_college.iloc[:]['college']
                            for college in colleges:
                                if college in college_counter:
                                    college_counter[college] += 1
                                else:
                                    college_counter[college] = 1
                        if not member_rows_location.empty:
                            locations = member_rows_location.iloc[:]['location']
                            for location in locations:
                                if location in location_counter:
                                    location_counter[location] += 1
                                else:
                                    location_counter[location] = 1

                        if not member_rows_employer.empty:
                            employers = member_rows_employer.iloc[:]['employer']
                            for employer in employers:
                                if employer in employer_counter:
                                    employer_counter[employer] += 1
                                else:
                                    employer_counter[employer] = 1

                if college_counter:
                    most_common_college = max(college_counter, key=college_counter.get)
                    most_common_college_counter = college_counter.get(most_common_college)
                if employer_counter:
                    most_common_employer = max(employer_counter, key=employer_counter.get)
                    most_common_employer_counter = employer_counter.get(most_common_employer)
                if location_counter:
                    most_common_location = max(location_counter, key=location_counter.get)
                    most_common_location_counter = location_counter.get(most_common_location)
                

                if most_common_college_counter > best_college_score[1]:
                    best_college_score[0] = most_common_college
                    best_college_score[1] = most_common_college_counter
                if most_common_employer_counter > best_employer_score[1]:
                    best_employer_score[0] = most_common_employer
                    best_employer_score[1] = most_common_employer_counter
                if most_common_location_counter > best_location_score[1]:
                    best_location_score[0] = most_common_location
                    best_location_score[1] = most_common_location_counter
        if best_college_score[0] and best_college_score[1]>1:
            prediction_college[node] = [best_college_score[0]]
        if best_employer_score[0] and best_employer_score[1]>1:
            prediction_employer[node] = [best_employer_score[0]]
        if best_location_score[0] and best_location_score[1]>1:
            prediction_location[node] = [best_location_score[0]]
                    

    return prediction_college, prediction_employer, prediction_location


In [20]:
reset_predictions()

# Prediction of missing values using the community method. 
prediction_college, prediction_employer, prediction_location = analyse_cliques_version_1(graph,empty_nodes, current_knowledge_college, current_knowledge_employer, current_knowledge_location)
print(prediction_employer, prediction_location, prediction_college)
global_evaluation()

{'U11559': ['university of illinois at urbana-champaign'], 'U18514': ['ultimate software'], 'U22825': ['bhabha atomic research center'], 'U8670': ['university of illinois at urbana-champaign'], 'U14520': ['vineyard church'], 'U4456': ['university of illinois at urbana-champaign'], 'U15336': ['university of illinois at urbana-champaign'], 'U25630': ['measured progress'], 'U24045': ['university of illinois at urbana-champaign'], 'U18549': ['ultimate software'], 'U18518': ['ultimate software'], 'U14072': ['university of illinois at urbana-champaign'], 'U12020': ['measured progress'], 'U27287': ['university of delaware'], 'U27588': ['yelp.com'], 'U13029': ['university of illinois at urbana-champaign'], 'U18516': ['ultimate software'], 'U15350': ['university of illinois at urbana-champaign'], 'U13969': ['university of illinois at urbana-champaign'], 'U7972': ['vineyard church'], 'U2656': ['university of illinois at urbana-champaign'], 'U3955': ['university of illinois at urbana-champaign'],

This solution seems to be quite accurate but does not find a lot of missing values. 

# Strategy number 4 : Hybrid iteration


This is the last strategy we will use to predict missing values. It involves making predictions using different strategies, starting with the most accurate and then moving to the least accurate, while setting prediction acceptance rates that we will change at each iteration. The prediction function sequences we will use at each iteration are as follows:

- Clique analysis (Keep only max 1 college, 5 employers, 1 location):
- Unknown node analysis with on degree (Keep only max 2 colleges, 5 employers, 1 location):
- Community analysis:

The idea is to make predictions starting with the most important nodes and decreasing the precision requirement with each iteration.

Three prediction dictionaries (employer, location, college) will be used to add information to unknown nodes, but also to add information when necessary and possible (for example, if an employer is predicted in the first iteration and in the second iteration, a second one needs to be predicted, it will be possible. However, this will not be possible for location and college).

In [21]:
# Definition of a function that takes a list of nodes and count the number of occurences of each attribute college, employer and location from the current knowledge

def count_attributes_occurences_current_knowledge(nodes_list, current_knowledge_college, current_knowledge_employer, current_knowledge_location):
    college_counter = {}
    employer_counter = {}
    location_counter = {}
    for node in nodes_list:
        node_rows_college = current_knowledge_college.loc[current_knowledge_college['name'] == node]
        node_rows_employer = current_knowledge_employer.loc[current_knowledge_employer['name'] == node]
        node_rows_location = current_knowledge_location.loc[current_knowledge_location['name'] == node]

        if not node_rows_college.empty:
            colleges = node_rows_college.iloc[:]['college']
            for college in colleges:
                if college in college_counter:
                    college_counter[college] += 1
                else:
                    college_counter[college] = 1
        if not node_rows_location.empty:
            locations = node_rows_location.iloc[:]['location']
            for location in locations:
                if location in location_counter:
                    location_counter[location] += 1
                else:
                    location_counter[location] = 1

        if not node_rows_employer.empty:
            employers = node_rows_employer.iloc[:]['employer']
            for employer in employers:
                if employer in employer_counter:
                    employer_counter[employer] += 1
                else:
                    employer_counter[employer] = 1

    return college_counter, employer_counter, location_counter

In [22]:
# Definition of a function that takes a list of nodes and count the number of occurences of each attributes college, employer and location from prediction dictionnaries
def count_attributes_occurences_predictions(nodes_list, prediction_college, prediction_employer, prediction_location):
    college_counter = {}
    employer_counter = {}
    location_counter = {}
    for node in nodes_list:
        if node in prediction_college:
            colleges = prediction_college[node][:]
            for college in colleges:
                if college in college_counter:
                    college_counter[college] += 1
                else:
                    college_counter[college] = 1
        
        if node in prediction_employer:
            employers = prediction_employer[node][:]
            for employer in employers:
                if employer in employer_counter:
                    employer_counter[employer] += 1
                else:
                    employer_counter[employer] = 1

        if node in prediction_location:
            locations = prediction_location[node][:]
            for location in locations:
                if location in location_counter:
                    location_counter[location] += 1
                else:
                    location_counter[location] = 1
    return college_counter, employer_counter, location_counter


In [23]:
def merge_dicts(*dict_args):
    result = {}
    for dictionary in dict_args:
        for key, value in dictionary.items():
            if key in result:
                result[key] += value
            else:
                result[key] = value
    return result

In [24]:
# Definition of a function that takes a list of nodes and count the number of occurences of each attributes college, employer and location from prediction dictionnaries and from current knowledge
def count_attributes_occurences_global(nodes_list, current_knowledge_college, current_knowledge_employer, current_knowledge_location, prediction_college, prediction_employer, prediction_location):

    current_knowledge_counts = count_attributes_occurences_current_knowledge(nodes_list, current_knowledge_college, current_knowledge_employer, current_knowledge_location)

    prediction_counts = count_attributes_occurences_predictions(nodes_list, prediction_college, prediction_employer, prediction_location)

    college_counter = merge_dicts(current_knowledge_counts[0], prediction_counts[0])
    employer_counter = merge_dicts(current_knowledge_counts[1], prediction_counts[1])
    location_counter = merge_dicts(current_knowledge_counts[2], prediction_counts[2])
    return college_counter, employer_counter, location_counter

In [25]:
# Generic function to assign predictions
def assign_prediction(node, prediction_dictionnary, counter_dictionnary, value_to_compare_with_counter_value, max_predictions):
    if (node in prediction_dictionnary and len(prediction_dictionnary[node]) < max_predictions) or node not in prediction_dictionnary:
        if counter_dictionnary:
            if node in prediction_dictionnary:
                items = [(key, value) for key, value in counter_dictionnary.items() if key not in prediction_dictionnary[node]]
                if items : 
                    sorted_items = sorted(items, key=lambda x: x[1], reverse=True)
                    best_key, best_value = sorted_items[0]
                    if counter_dictionnary[best_key] >= value_to_compare_with_counter_value:
                        prediction_dictionnary[node].append(best_key)
            else: 
                items = [(key, value) for key, value in counter_dictionnary.items()]
                sorted_items = sorted(items, key=lambda x: x[1], reverse=True)
                best_key, best_value = sorted_items[0]
                if counter_dictionnary[best_key] >= value_to_compare_with_counter_value:
                    prediction_dictionnary[node] = [best_key]
    return prediction_dictionnary

In [26]:
# Definition of the function that predict on cliques 
def cliques_predictions(graph, empty_nodes, current_knowledge_college, current_knowledge_employer, current_knowledge_location, prediction_college, prediction_employer, prediction_location, number_of_iteration, values_to_compare_with):
    cliques = nx.enumerate_all_cliques(graph)
    large_cliques = [clique for clique in cliques if len(clique) >= 3]
    counter = 1
    empty_nodes_list = list(empty_nodes)
    for node in empty_nodes_list:
        # print(f"clique prediction : {counter}/{len(empty_nodes)} nodes processed")
        counter +=1
        cliques_with_node = [clique for clique in large_cliques if node in clique]
        if cliques_with_node : 
            max_clique_size = max(cliques_with_node, key=len)
            college_counter_clique, employer_counter_clique, location_counter_clique = count_attributes_occurences_global(max_clique_size, current_knowledge_college, current_knowledge_employer, current_knowledge_location, prediction_college, prediction_employer, prediction_location)
            max_predictions_college = 1
            max_predictions_employer = 5
            max_predictions_location = 1
            value_to_compare_with_counter_value = (values_to_compare_with[number_of_iteration] * len(max_clique_size))/100
            prediction_college = assign_prediction(node, prediction_college, college_counter_clique,value_to_compare_with_counter_value,  max_predictions_college)
            prediction_employer = assign_prediction(node, prediction_employer, employer_counter_clique,value_to_compare_with_counter_value,  max_predictions_employer )
            prediction_location = assign_prediction(node, prediction_location, location_counter_clique,value_to_compare_with_counter_value,  max_predictions_location)
    return prediction_college, prediction_employer, prediction_location

In [27]:
# Definition of the function that predict on high degree nodes 
def degree_predictions(graph, empty_nodes, current_knowledge_college, current_knowledge_employer, current_knowledge_location, prediction_college, prediction_employer, prediction_location, number_of_iteration, values_to_compare_with):
    degrees_empty_nodes = {node: graph.degree(node) for node in empty_nodes}
    sorted_nodes_by_degree = sorted(degrees_empty_nodes.keys(), key=lambda x: degrees_empty_nodes[x], reverse=True)
    sorted_nodes_by_degree = [node for node in sorted_nodes_by_degree if graph.degree(node)>20]
    counter = 1
    for node in sorted_nodes_by_degree:
        # print(f"degree prediction : {counter}/{len(empty_nodes)} nodes processed")
        counter +=1
        neighbors = list(graph.neighbors(node))
        college_counter_clique, employer_counter_clique, location_counter_clique = count_attributes_occurences_global(neighbors, current_knowledge_college, current_knowledge_employer, current_knowledge_location, prediction_college, prediction_employer, prediction_location)
        max_predictions_college = 1
        max_predictions_employer = 5
        max_predictions_location = 1
        
        value_to_compare_with_counter_value = (values_to_compare_with[number_of_iteration] * len(neighbors))/100
        prediction_college = assign_prediction(node, prediction_college, college_counter_clique,value_to_compare_with_counter_value, max_predictions_college)
        prediction_employer = assign_prediction(node, prediction_employer, employer_counter_clique,value_to_compare_with_counter_value, max_predictions_employer )
        prediction_location = assign_prediction(node, prediction_location, location_counter_clique,value_to_compare_with_counter_value, max_predictions_location)
    return prediction_college, prediction_employer, prediction_location

In [28]:
# Definition of the function that predict on communities
def community_predictions(communities,graph, empty_nodes, current_knowledge_college, current_knowledge_employer, current_knowledge_location, prediction_college, prediction_employer, prediction_location, number_of_iteration, values_to_compare_with):
    counter = 1
    for node in empty_nodes:
        community = list([community for community in communities if node in community][0])
        # print(f"degree prediction : {counter}/{len(empty_nodes)} nodes processed")
        counter +=1
        college_counter_clique, employer_counter_clique, location_counter_clique = count_attributes_occurences_global(community, current_knowledge_college, current_knowledge_employer, current_knowledge_location, prediction_college, prediction_employer, prediction_location)
        max_predictions_college = 1
        max_predictions_employer = 5
        max_predictions_location = 1
        value_to_compare_with_counter_value = (values_to_compare_with[number_of_iteration] * len(community))/100
        prediction_college = assign_prediction(node, prediction_college, college_counter_clique,value_to_compare_with_counter_value, max_predictions_college)
        prediction_employer = assign_prediction(node, prediction_employer, employer_counter_clique,value_to_compare_with_counter_value, max_predictions_employer )
        prediction_location = assign_prediction(node, prediction_location, location_counter_clique,value_to_compare_with_counter_value, max_predictions_location)
    return prediction_college, prediction_employer, prediction_location

In [29]:
def update_current_knowledge(current_knowledge_college, current_knowledge_employer, current_knowledge_location, prediction_college, prediction_employer, prediction_location):
    for node, colleges in prediction_college.items():
        if node in current_knowledge_college['name'].values:
            current_knowledge_college.loc[current_knowledge_college['name'] == node, 'college'] = ', '.join(colleges)
        else:
            current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
    for node, employers in prediction_employer.items():
        if node in current_knowledge_employer['name'].values:
            current_knowledge_employer.loc[current_knowledge_employer['name'] == node, 'employer'] = ', '.join(employers)
        else:
            current_knowledge_employer = current_knowledge_employer.append({'name': node, 'employer': ', '.join(employers)}, ignore_index=True)
    
    for node, locations in prediction_location.items():
        if node in current_knowledge_location['name'].values:
            current_knowledge_location.loc[current_knowledge_location['name'] == node, 'location'] = ', '.join(locations)
        else:
            current_knowledge_location = current_knowledge_location.append({'name': node, 'location': ', '.join(locations)}, ignore_index=True)
    return current_knowledge_college, current_knowledge_employer, current_knowledge_location


In [30]:
# Definition of the strategy's function

def multi_strategies_prediction(graph, empty_nodes, current_knowledge_college, current_knowledge_employer, current_knowledge_location):
    prediction_college = {}
    prediction_employer = {}
    prediction_location = {}
    number_of_iteration = 0 
    max_iterations = 7
    communities = nx.community.louvain_communities(graph)
    values_to_compare_with_cliques = [80, 70, 60, 50, 70, 60, 35, 60]
    values_to_compare_with_degree = [80, 70, 60, 50, 70, 60, 35, 60]
    values_to_compare_with_community = [80, 70, 60, 50, 70, 60, 35, 60]
    while number_of_iteration < max_iterations:
        print(f"{number_of_iteration} round out of {max_iterations}")
        prediction_college, prediction_employer, prediction_location = cliques_predictions(graph, empty_nodes, current_knowledge_college, current_knowledge_employer, current_knowledge_location, prediction_college, prediction_employer, prediction_location, number_of_iteration, values_to_compare_with_cliques)
        current_knowledge_college, current_knowledge_employer, current_knowledge_location = update_current_knowledge(current_knowledge_college, current_knowledge_employer, current_knowledge_location, prediction_college, prediction_employer, prediction_location)
        prediction_college, prediction_employer, prediction_location = degree_predictions(graph, empty_nodes, current_knowledge_college, current_knowledge_employer, current_knowledge_location, prediction_college, prediction_employer, prediction_location, number_of_iteration, values_to_compare_with_degree)
        current_knowledge_college, current_knowledge_employer, current_knowledge_location = update_current_knowledge(current_knowledge_college, current_knowledge_employer, current_knowledge_location, prediction_college, prediction_employer, prediction_location)
        prediction_college, prediction_employer, prediction_location = community_predictions(communities, graph, empty_nodes, current_knowledge_college, current_knowledge_employer, current_knowledge_location, prediction_college, prediction_employer, prediction_location, number_of_iteration, values_to_compare_with_community)
        current_knowledge_college, current_knowledge_employer, current_knowledge_location = update_current_knowledge(current_knowledge_college, current_knowledge_employer, current_knowledge_location, prediction_college, prediction_employer, prediction_location)
        number_of_iteration += 1
    return prediction_college, prediction_employer, prediction_location

In [31]:
# reset predictions : 
reset_predictions()
prediction_college, prediction_employer, prediction_location = multi_strategies_prediction(graph, empty_nodes, current_knowledge_college, current_knowledge_employer, current_knowledge_location)
global_evaluation()


0 round out of 7


1 round out of 7


  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)


2 round out of 7


  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': '

3 round out of 7


  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': '

4 round out of 7


  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)


5 round out of 7


  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_employer = current_knowledge_employer.append({'name': node, 'employer': ', '.join(employers)}, ignore_index=True)
  current_knowledge_employer = current_knowledge_employer.append({'name': node, 'employer': ', '.join(employers)}, ignore_index=True)
  current_knowledge_employer = current_knowledge_employer.append({'name': node, 'employer': ', '.join(employers)}, ignore_index=True)
  current_knowledge_employer = current_knowledge_employer.append({'name': node, 'employer': ', '.join(employers)}, ignore_index=True)
  current_knowledge_location = current_knowledge_location.append({'name': node, 'location': ', '.join(locations)}, ignore_index=True)
  current_knowledge_location = current_knowledge_location.append({'name': node, 'location': ', '.join(locations)}, ignore_index=True)
  current_knowledge_location = current_knowledge_location.append({

6 round out of 7


  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_college = current_knowledge_college.append({'name': node, 'college': ', '.join(colleges)}, ignore_index=True)
  current_knowledge_employer = current_knowledge_employer.append({'name': node, 'employer': ', '.join(employers)}, ignore_index=True)
  current_knowledge_employer = current_knowledge_employer.append({'name': node, 'emplo

50.769231% of the college predictions are true
19.879518% of the college values were found
40.677966% of the employer predictions are true
1.943320% of the employer values were found
50.000000% of the location predictions are true
14.736842% of the location values were found


As we can see, the accuracy is quite ok but there is only a tiny number of predictions that are actually made compare to the total number of predictions that should be made. The results being quite deceving, we will take the groundtruth to predict the 5 best influencers. 

# Top 5 influencers : 

In [32]:
# Open the groundtruth_files

groundtruth_college = pd.read_csv('cleaned_groundtruth/cleaned_college.csv', sep='\t', header='infer')
groundtruth_employer = pd.read_csv('./groundtruth/employer.csv', sep='\t', header='infer')
groundtruth_location = pd.read_csv('./groundtruth/location.csv', sep='\t', header='infer')

# open the graph file
graph = nx.read_gexf("mediumLinkedin.gexf")

## a) Searching for individuals with the most connections in the san francisco bay area 

In [33]:
keywords = ['san francisco', 'sacramento']
def contains_keywords(location):
    for keyword in keywords:
        if keyword in location:
            return True
    return False

matching_nodes = []

for node in graph.nodes():
    neighbors = list(graph.neighbors(node))
    neighbor_count = 0
    neighbors_list = []
    for neighbor in neighbors:
        neighbor_location = groundtruth_location.loc[groundtruth_location['name'] == neighbor, 'location'].values
        if len(neighbor_location) > 0:
            neighbor_location = neighbor_location[0]
            if contains_keywords(neighbor_location):
                neighbor_count += 1
                neighbors_list.append(neighbor)
    
    if neighbor_count > 0:
        matching_nodes.append((node, neighbor_count,neighbors_list ))

matching_nodes.sort(key=lambda x: x[1], reverse=True)

if matching_nodes:
    print("Top 10 nodes with the most neighbors matching the location of San Francisco or Sacramento:")
    for i, (node, count, list) in enumerate(matching_nodes[:10], 1):
        print(f"{i}. Node: {node}, Number of matching neighbors: {count}")
else:
    print("No matching nodes found.")

Top 10 nodes with the most neighbors matching the location of San Francisco or Sacramento:
1. Node: U27287, Number of matching neighbors: 22
2. Node: U7024, Number of matching neighbors: 21
3. Node: U11566, Number of matching neighbors: 7
4. Node: U3955, Number of matching neighbors: 5
5. Node: U22747, Number of matching neighbors: 5
6. Node: U27475, Number of matching neighbors: 5
7. Node: U27460, Number of matching neighbors: 4
8. Node: U4562, Number of matching neighbors: 4
9. Node: U27613, Number of matching neighbors: 4
10. Node: U27588, Number of matching neighbors: 3


In [34]:
sf_nodes = 0
for node in graph.nodes():
    node_location = groundtruth_location.loc[groundtruth_location['name'] == node, 'location'].values
    if len(node_location) > 0:
        node_location = node_location[0]
        if contains_keywords(node_location):
            sf_nodes += 1

print(f"There are {sf_nodes} nodes in San Francisco.")    

There are 82 nodes in San Francisco.


In [35]:
max_diff_count = 0
max_diff_nodes = None

for node, count, neighbors_list in matching_nodes:
        for node_2, count_2, neighbors_list_2 in matching_nodes:
            if node_2 != node:
                if len(neighbors_list) <= len(neighbors_list_2):
                    smaller_list = neighbors_list
                    larger_list = neighbors_list_2
                else:
                    smaller_list = neighbors_list_2
                    larger_list = neighbors_list

                different_neighbors = [neighbor for neighbor in smaller_list if neighbor not in larger_list]
                diff_count = len(different_neighbors)

                if diff_count > max_diff_count:
                    max_diff_count = diff_count
                    max_diff_nodes = (node, count, node_2, count_2, different_neighbors)

if max_diff_nodes is not None:
    node_1, count_1, node_2, count_2, different_neighbors = max_diff_nodes
    print(f"The pair of nodes with the largest number of different neighbors is:")
    print(f"{node_1} (number of neighbors: {count_1}) and {node_2} (number of neighbors: {count_2}):")
    print(f"Number of different neighbors: {max_diff_count}")
    print("Different neighbors:", different_neighbors)
else:
    print("No pair of nodes with the number of neighbors 22 was found.")



The pair of nodes with the largest number of different neighbors is:
U27287 (number of neighbors: 22) and U7024 (number of neighbors: 21):
Number of different neighbors: 20
Different neighbors: ['U7136', 'U7151', 'U7194', 'U7307', 'U7077', 'U7132', 'U7056', 'U7030', 'U7049', 'U7188', 'U7159', 'U7092', 'U7288', 'U7180', 'U7203', 'U7082', 'U7074', 'U7262', 'U7156', 'U7189']



The results are quite promising, and these two individuals will be selected for the marketing campaign. However, they are from Illinois and not from San Francisco. This likely means that they are very influential in the LinkedIn network and can have influence for the restaurant. However, to make the campaign more credible, people from San Francisco should also be selected.

## b) Searching for individuals from California with the highest mixed centrality



This time, we will focus on individuals whose mixed centrality is the highest within California and who are from California in order to bring credibility to the campaign.

In [36]:
keywords_california = ['san francisco', 'sacramento', 'san diego', 'california']
def contains_keywords(location):
    for keyword in keywords_california:
        if keyword in location:
            return True
    return False

california_nodes = [node for node in graph.nodes() if contains_keywords(groundtruth_location.loc[groundtruth_location['name'] == node, 'location'].values[0])]
california_subgraph = nx.Graph()
california_subgraph.add_nodes_from(california_nodes)
for u, v in graph.edges():
    if u in california_nodes and v in california_nodes:
        california_subgraph.add_edge(u, v)

centralities_list = []


degree_centralities = nx.degree_centrality(california_subgraph)
closeness_centralities = nx.closeness_centrality(california_subgraph)
betweenness_centralities = nx.betweenness_centrality(california_subgraph)


for node in california_subgraph.nodes():
    node_name = node
    
    degree_centrality = degree_centralities[node]
    closeness_centrality = closeness_centralities[node]
    betweenness_centrality = betweenness_centralities[node]
    weighted_centrality = 0.33 * degree_centrality + 0.33 * closeness_centrality + 0.33 * betweenness_centrality

    centralities_list.append((node_name, degree_centrality, closeness_centrality, betweenness_centrality,weighted_centrality))

centralities_list.sort(key=lambda x: x[4], reverse=True)
for node_name, degree_centrality, closeness_centrality, betweenness_centrality, weighted_centrality in centralities_list:
    print(f"Node: {node_name}, Degree Centrality: {degree_centrality}, Closeness Centrality: {closeness_centrality}, Betweenness Centrality: {betweenness_centrality}, Weighted Centrality: {weighted_centrality}")




Node: U27460, Degree Centrality: 0.0449438202247191, Closeness Centrality: 0.0449438202247191, Betweenness Centrality: 0.0002553626149131767, Weighted Centrality: 0.029747191011235956
Node: U27613, Degree Centrality: 0.0449438202247191, Closeness Centrality: 0.0449438202247191, Betweenness Centrality: 0.0002553626149131767, Weighted Centrality: 0.029747191011235956
Node: U2691, Degree Centrality: 0.033707865168539325, Closeness Centrality: 0.04044943820224719, Betweenness Centrality: 0.0028089887640449437, Weighted Centrality: 0.025398876404494383
Node: U27588, Degree Centrality: 0.033707865168539325, Closeness Centrality: 0.035955056179775284, Betweenness Centrality: 0.0, Weighted Centrality: 0.02298876404494382
Node: U27614, Degree Centrality: 0.033707865168539325, Closeness Centrality: 0.035955056179775284, Betweenness Centrality: 0.0, Weighted Centrality: 0.02298876404494382
Node: U8670, Degree Centrality: 0.033707865168539325, Closeness Centrality: 0.033707865168539325, Betweennes