# Single Graph Drawing

#### Load packages

In [1]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#### Load data

In [2]:
# This is a sampling for the entire analysis: two sheets of the Astronomy workbook
data_astronomy1 = pd.read_excel('data/Astronomy.xlsx', sheet_name='강민철')
data_astronomy2 = pd.read_excel('data/Astronomy.xlsx', sheet_name='강지헌')

# Drop the unused 'Name' column
data_astronomy1.drop('Name', axis=1, inplace=True)
data_astronomy2.drop('Name', axis=1, inplace=True)

# Merge all dataframes into one dataframe.
# ignore_index=True gives every row a unique label. Without it the row labels of
# the individual sheets repeat, and the label-based drop below would also delete
# any row of the other sheet that happens to carry the same label.
data_astronomy = pd.concat([data_astronomy1, data_astronomy2], axis=0, ignore_index=True)

# Remove the first row recorded for ID 53 (raises IndexError if ID 53 is absent).
# NOTE(review): only this single row is removed, as in the original expression;
# confirm whether every row of ID 53 was meant to be excluded instead.
data_astronomy.drop(data_astronomy.loc[data_astronomy['ID'] == 53].index[0], inplace=True)
# data_astronomy

In [3]:
# Read three sheets of the Sampling workbook, discarding the 'Name' column of each
data_sampling1, data_sampling2, data_sampling3 = (
    pd.read_excel('data/Sampling.xlsx', sheet_name=sheet).drop(columns='Name')
    for sheet in ('강지헌', '신아현', '신수연')
)

# Stack the sheets row-wise into a single dataframe
data_sampling = pd.concat([data_sampling1, data_sampling2, data_sampling3], axis=0)
# data_sampling

In [4]:
# Read three sheets of the Database workbook, discarding the 'Name' column of each
data_database1, data_database2, data_database3 = (
    pd.read_excel('data/Database.xlsx', sheet_name=sheet).drop(columns='Name')
    for sheet in ('신수연', '양연선', '김나영')
)

# Stack the sheets row-wise into a single dataframe
data_database = pd.concat([data_database1, data_database2, data_database3], axis=0)
# data_database

#### Process graphs
- 이름 형식: `<Domain>_<Modality>_<ID>`
- Domain: `Astronomy`, `Sampling`, `Database`

In [5]:
# Maps graph name -> nx.DiGraph, one graph per distinct value of the 'ID' column
graphs_astronomy = {}

for _group_id, sub_df in data_astronomy.groupby('ID'):
    # New graph object, named "Astronomy_<Mod.>_<ID>" from the first row of the group
    graph_name = f"Astronomy_{sub_df['Mod.'].iloc[0]}_{sub_df['ID'].iloc[0]}"
    G = nx.DiGraph()

    # Add nodes and edges
    for _row_idx, row in sub_df.iterrows():
        start_node = row['Start']

        # 'End' is a comma-separated list of target nodes; a missing value means
        # the start node has no outgoing edge in this row.
        if pd.notna(row['End']):
            for end_node in row['End'].split(','):
                # Strip BOTH sides: "A, B" must yield "B", not " B"
                end_node = end_node.strip()
                # Skip empty tokens produced by stray or trailing commas
                if end_node:
                    G.add_edge(start_node, end_node)

        # Add p.knowledge labels:  O -> 1(true)  |  anything else -> 0(false)
        # add_node creates the node when it has no edges yet and only updates the
        # attribute otherwise, so isolated nodes keep their real label as well.
        G.add_node(start_node, **{'P.Knowledge': 1 if row['P.Knowledge'] == 'O' else 0})

    # Save the graph
    graphs_astronomy[graph_name] = G

In [6]:
# Maps graph name -> nx.DiGraph, one graph per distinct value of the 'ID' column
graphs_sampling = {}

for _group_id, sub_df in data_sampling.groupby('ID'):
    # New graph object, named "Sampling_<Mod.>_<ID>" from the first row of the group
    graph_name = f"Sampling_{sub_df['Mod.'].iloc[0]}_{sub_df['ID'].iloc[0]}"
    G = nx.DiGraph()

    # Add nodes and edges
    for _row_idx, row in sub_df.iterrows():
        start_node = row['Start']

        # 'End' is a comma-separated list of target nodes; a missing value means
        # the start node has no outgoing edge in this row.
        if pd.notna(row['End']):
            for end_node in row['End'].split(','):
                # Strip BOTH sides: "A, B" must yield "B", not " B"
                end_node = end_node.strip()
                # Skip empty tokens produced by stray or trailing commas
                if end_node:
                    G.add_edge(start_node, end_node)

        # Add p.knowledge labels:  O -> 1(true)  |  anything else -> 0(false)
        # add_node creates the node when it has no edges yet and only updates the
        # attribute otherwise, so isolated nodes keep their real label as well.
        G.add_node(start_node, **{'P.Knowledge': 1 if row['P.Knowledge'] == 'O' else 0})

    # Save the graph
    graphs_sampling[graph_name] = G

In [7]:
# Maps graph name -> nx.DiGraph, one graph per distinct value of the 'ID' column
graphs_database = {}

for _group_id, sub_df in data_database.groupby('ID'):
    # New graph object, named "Database_<Mod.>_<ID>" from the first row of the group
    graph_name = f"Database_{sub_df['Mod.'].iloc[0]}_{sub_df['ID'].iloc[0]}"
    G = nx.DiGraph()

    # Add nodes and edges
    for _row_idx, row in sub_df.iterrows():
        start_node = row['Start']

        # 'End' is a comma-separated list of target nodes; a missing value means
        # the start node has no outgoing edge in this row.
        if pd.notna(row['End']):
            for end_node in row['End'].split(','):
                # Strip BOTH sides: "A, B" must yield "B", not " B"
                end_node = end_node.strip()
                # Skip empty tokens produced by stray or trailing commas
                if end_node:
                    G.add_edge(start_node, end_node)

        # Add p.knowledge labels:  O -> 1(true)  |  anything else -> 0(false)
        # add_node creates the node when it has no edges yet and only updates the
        # attribute otherwise, so isolated nodes keep their real label as well.
        G.add_node(start_node, **{'P.Knowledge': 1 if row['P.Knowledge'] == 'O' else 0})

    # Save the graph
    graphs_database[graph_name] = G

#### Process multi-graphs
params:

- `graphs_dict`: 도메인별 그래프 딕셔너리
- `selected_G`: 멀티그래프 생성 시 제외할 그래프의 이름 (`graphs_dict`의 키)

In [8]:
def create_multigraph(graphs_dict, selected_G = None):
    """Merge the graphs of ``graphs_dict`` into one ``nx.MultiDiGraph``.

    Params:
    - graphs_dict: mapping of graph name -> graph whose nodes and edges are merged.
    - selected_G: name (a key of ``graphs_dict``) of a graph to leave out of the
      merge. With the default ``None`` only an entry keyed ``None`` would be skipped.

    Returns the merged multigraph. Only node identities and edge endpoints are
    copied; node and edge attributes of the source graphs are not carried over.
    An edge that occurs in several source graphs is added once per occurrence,
    i.e. as parallel edges.
    """
    merged = nx.MultiDiGraph()
    for name, source in graphs_dict.items():
        # Leave out the graph the caller asked to exclude
        if name == selected_G:
            continue
        merged.add_nodes_from(source.nodes())
        merged.add_edges_from(source.edges())
    return merged

In [9]:
# Build one full multigraph per domain (no graph is excluded from the merge)
(
    multigraph_astronomy_full,
    multigraph_sampling_full,
    multigraph_database_full,
) = map(create_multigraph, (graphs_astronomy, graphs_sampling, graphs_database))

#### Methods for centrality metrics

1. `cal_node_betweenness`
2. `cal_node_closeness`
3. `cal_node_degree`

*(For each individual graph, look up the centrality values that its nodes have in the domain-wide multi-graph, then average them by dividing their sum by the number of nodes in the individual graph.)*

In [10]:
def calculate_centrality_average_for_subgraph(multi_graph: nx.MultiGraph, indiv_graph: nx.DiGraph, centrality_type: str) -> float:
    """Average a centrality metric of ``multi_graph`` over the nodes of ``indiv_graph``.

    The metric is computed on a ``nx.DiGraph`` copy of ``multi_graph``. That
    conversion collapses parallel edges, so edge multiplicity does not influence
    the result.

    Params:
    - multi_graph: merged graph on which the centrality is computed.
    - indiv_graph: graph whose nodes select the values to average.
    - centrality_type: one of 'betweenness', 'closeness' or 'degree'.

    Returns the sum of the centrality values of ``indiv_graph``'s nodes divided
    by the number of nodes in ``indiv_graph``. Nodes that are missing from
    ``multi_graph`` contribute 0 but still count in the divisor. For an
    ``indiv_graph`` without nodes the mean is undefined and NaN is returned
    (instead of raising ZeroDivisionError).

    Raises ValueError if ``centrality_type`` is not one of the supported names.
    """
    centrality_functions = {
        'betweenness': nx.betweenness_centrality,
        'closeness': nx.closeness_centrality,
        'degree': nx.degree_centrality,
    }

    # Validate the metric name first so a typo fails before any graph work is done
    try:
        centrality_function = centrality_functions[centrality_type]
    except (KeyError, TypeError):
        raise ValueError("Invalid centrality type. Choose from 'betweenness', 'closeness', or 'degree'.") from None

    # An empty individual graph has no mean; avoid dividing by zero
    node_count = len(indiv_graph.nodes())
    if node_count == 0:
        return float('nan')

    # Convert the MultiGraph to a DiGraph to calculate centrality for all of its nodes
    centrality_values = centrality_function(nx.DiGraph(multi_graph))

    # Sum the values of the individual graph's nodes; unknown nodes add nothing
    centrality_sum = sum(centrality_values.get(node, 0) for node in indiv_graph.nodes())

    return centrality_sum / node_count

#### Analysis for centrality metrics

In [11]:
# Column layout of the result tables; "ID" becomes the index and will hold the graph name
columns = ["ID", "Betweenness", "Closeness", "Degree"]

# One independent, initially empty result table per domain
cent_astronomy, cent_sampling, cent_database = (
    pd.DataFrame(columns=columns).set_index("ID") for _ in range(3)
)

In [12]:
# Fill one result row per Astronomy graph: the mean betweenness, closeness and
# degree centrality (in that order) of its nodes within the full Astronomy multigraph
for graph_name, G in graphs_astronomy.items():
    try:
        cent_astronomy.loc[graph_name] = [
            calculate_centrality_average_for_subgraph(multigraph_astronomy_full, G, metric)
            for metric in ("betweenness", "closeness", "degree")
        ]
    except nx.NetworkXNoPath as e:
        # Report the graph and continue with the next one; no row is written for it
        print(f"Path error at {graph_name}: {e}")

# Write the table to an Excel file
# ❗WARNING❗ An existing file at this path is overwritten without asking!!!
cent_astronomy.to_excel("result/Astronomy_central.xlsx", index=True)

In [13]:
# Fill one result row per Sampling graph: the mean betweenness, closeness and
# degree centrality (in that order) of its nodes within the full Sampling multigraph
for graph_name, G in graphs_sampling.items():
    try:
        cent_sampling.loc[graph_name] = [
            calculate_centrality_average_for_subgraph(multigraph_sampling_full, G, metric)
            for metric in ("betweenness", "closeness", "degree")
        ]
    except nx.NetworkXNoPath as e:
        # Report the graph and continue with the next one; no row is written for it
        print(f"Path error at {graph_name}: {e}")

# Write the table to an Excel file
# ❗WARNING❗ An existing file at this path is overwritten without asking!!!
cent_sampling.to_excel("result/Sampling_central.xlsx", index=True)

In [14]:
# Fill one result row per Database graph: the mean betweenness, closeness and
# degree centrality (in that order) of its nodes within the full Database multigraph
for graph_name, G in graphs_database.items():
    try:
        cent_database.loc[graph_name] = [
            calculate_centrality_average_for_subgraph(multigraph_database_full, G, metric)
            for metric in ("betweenness", "closeness", "degree")
        ]
    except nx.NetworkXNoPath as e:
        # Report the graph and continue with the next one; no row is written for it
        print(f"Path error at {graph_name}: {e}")

# Write the table to an Excel file
# ❗WARNING❗ An existing file at this path is overwritten without asking!!!
cent_database.to_excel("result/Database_central.xlsx", index=True)