# Single Graph Drawing

#### Load packages

In [1]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

#### Load data

In [2]:
# Load the two annotator sheets of the Astronomy workbook.
# (Only these sheets are read here; this is a sample of the entire analysis.)
data_astronomy1 = pd.read_excel('data/Astronomy.xlsx', sheet_name='강민철')
data_astronomy2 = pd.read_excel('data/Astronomy.xlsx', sheet_name='강지헌')

# Drop the unused 'Name' column
data_astronomy1.drop('Name', axis=1, inplace=True)
data_astronomy2.drop('Name', axis=1, inplace=True)

# Merge all dataframes into one dataframe.
# ignore_index=True gives every row a unique label. Without it each sheet
# keeps its own 0..n index, so the label-based drop below would also delete
# the row of the *other* sheet that happens to share the same label.
data_astronomy = pd.concat(
    [data_astronomy1, data_astronomy2], axis=0, ignore_index=True
)

# Remove the first row whose ID is 53 (raises IndexError if there is none).
# NOTE(review): only the first matching row is removed — confirm whether
# every row with ID 53 should be excluded instead.
first_id_53_label = data_astronomy.index[data_astronomy['ID'] == 53][0]
data_astronomy.drop(first_id_53_label, inplace=True)
# data_astronomy

In [3]:
# Read the three annotator sheets of the Sampling workbook, discarding the
# unused 'Name' column from each one.
sampling_frames = [
    pd.read_excel('data/Sampling.xlsx', sheet_name=sheet).drop(columns='Name')
    for sheet in ('강지헌', '신아현', '신수연')
]
data_sampling1, data_sampling2, data_sampling3 = sampling_frames

# Stack the per-sheet frames row-wise into a single dataframe.
data_sampling = pd.concat(sampling_frames, axis=0)
# data_sampling

In [4]:
# Read the three annotator sheets of the Database workbook, discarding the
# unused 'Name' column from each one.
database_frames = [
    pd.read_excel('data/Database.xlsx', sheet_name=sheet).drop(columns='Name')
    for sheet in ('신수연', '양연선', '김나영')
]
data_database1, data_database2, data_database3 = database_frames

# Stack the per-sheet frames row-wise into a single dataframe.
data_database = pd.concat(database_frames, axis=0)
# data_database

#### Process graphs
- 이름 형식: `<Domain>_<Modality>_<ID>`
- Domain: `Astronomy`, `Sampling`, `Database`

In [5]:
# Build one directed graph per participant ID from the Astronomy rows.
# Keys of `graphs_astronomy` follow the pattern "Astronomy_<Mod.>_<ID>".
graphs_astronomy = {}

for group_id, sub_df in data_astronomy.groupby('ID'):
    # New graph object
    graph_name = f"Astronomy_{sub_df['Mod.'].iloc[0]}_{sub_df['ID'].iloc[0]}"
    G = nx.DiGraph()

    # Add nodes and edges
    for _, row in sub_df.iterrows():
        start_node = row['Start']
        if pd.notna(row['End']):
            # 'End' holds a comma-separated list of target nodes. Strip both
            # sides so "A, B" yields "B" (not " B"), and skip empty pieces
            # produced by a trailing or doubled comma.
            end_nodes = [
                end_node.strip()
                for end_node in row['End'].split(',')
                if end_node.strip()
            ]
            for end_node in end_nodes:
                G.add_edge(start_node, end_node)
        # Add p.knowledge labels:  O -> 1(true)  |  X -> 0(false)
        # add_node creates the node when the row has no edges and otherwise
        # just updates the attribute, so the row's own label is always stored
        # (an isolated start node is no longer forced to 0).
        G.add_node(
            start_node,
            **{'P.Knowledge': 1 if row['P.Knowledge'] == 'O' else 0},
        )

    # Save the graph
    graphs_astronomy[graph_name] = G

In [6]:
# Build one directed graph per participant ID from the Sampling rows.
# Keys of `graphs_sampling` follow the pattern "Sampling_<Mod.>_<ID>".
graphs_sampling = {}

for group_id, sub_df in data_sampling.groupby('ID'):
    # New graph object
    graph_name = f"Sampling_{sub_df['Mod.'].iloc[0]}_{sub_df['ID'].iloc[0]}"
    G = nx.DiGraph()

    # Add nodes and edges
    for _, row in sub_df.iterrows():
        start_node = row['Start']
        if pd.notna(row['End']):
            # 'End' holds a comma-separated list of target nodes. Strip both
            # sides so "A, B" yields "B" (not " B"), and skip empty pieces
            # produced by a trailing or doubled comma.
            end_nodes = [
                end_node.strip()
                for end_node in row['End'].split(',')
                if end_node.strip()
            ]
            for end_node in end_nodes:
                G.add_edge(start_node, end_node)
        # Add p.knowledge labels:  O -> 1(true)  |  X -> 0(false)
        # add_node creates the node when the row has no edges and otherwise
        # just updates the attribute, so the row's own label is always stored
        # (an isolated start node is no longer forced to 0).
        G.add_node(
            start_node,
            **{'P.Knowledge': 1 if row['P.Knowledge'] == 'O' else 0},
        )

    # Save the graph
    graphs_sampling[graph_name] = G

In [7]:
# Build one directed graph per participant ID from the Database rows.
# Keys of `graphs_database` follow the pattern "Database_<Mod.>_<ID>".
graphs_database = {}

for group_id, sub_df in data_database.groupby('ID'):
    # New graph object
    graph_name = f"Database_{sub_df['Mod.'].iloc[0]}_{sub_df['ID'].iloc[0]}"
    G = nx.DiGraph()

    # Add nodes and edges
    for _, row in sub_df.iterrows():
        start_node = row['Start']
        if pd.notna(row['End']):
            # 'End' holds a comma-separated list of target nodes. Strip both
            # sides so "A, B" yields "B" (not " B"), and skip empty pieces
            # produced by a trailing or doubled comma.
            end_nodes = [
                end_node.strip()
                for end_node in row['End'].split(',')
                if end_node.strip()
            ]
            for end_node in end_nodes:
                G.add_edge(start_node, end_node)
        # Add p.knowledge labels:  O -> 1(true)  |  X -> 0(false)
        # add_node creates the node when the row has no edges and otherwise
        # just updates the attribute, so the row's own label is always stored
        # (an isolated start node is no longer forced to 0).
        G.add_node(
            start_node,
            **{'P.Knowledge': 1 if row['P.Knowledge'] == 'O' else 0},
        )

    # Save the graph
    graphs_database[graph_name] = G

#### Process multi-graphs
params:

- `graphs_dict`: 도메인별 그래프 딕셔너리
- `selected_G`: 멀티그래프 생성 시 제외할 그래프

In [8]:
def create_multigraph(graphs_dict, selected_G = None):
    """Merge the graphs of one domain into a single ``nx.MultiDiGraph``.

    Args:
        graphs_dict: Mapping of graph name -> graph for one domain.
        selected_G: Name of a graph to leave out of the merge. With the
            default ``None`` no graph is skipped (unless a key is ``None``).

    Returns:
        A ``nx.MultiDiGraph`` holding every node and every edge of the
        included graphs. Each source edge is added as its own edge, so an
        edge present in several graphs appears several times. Node and edge
        attributes are not copied.
    """
    merged = nx.MultiDiGraph()
    for name, member in graphs_dict.items():
        if name == selected_G:
            continue
        merged.add_nodes_from(member.nodes())
        merged.add_edges_from(member.edges())
    return merged

In [9]:
# Build the full (nothing excluded) multigraph of each domain.
(
    multigraph_astronomy_full,
    multigraph_sampling_full,
    multigraph_database_full,
) = map(create_multigraph, (graphs_astronomy, graphs_sampling, graphs_database))

#### Methods for centrality metrics
(Multi-graph를 digraph로 변환하여 계산)

1. `cal_node_betweenness`
2. `cal_node_closeness`
3. `cal_node_degree`

*(각각 상위 3개의 항목을 추출함)*

In [10]:
def cal_node_betweenness(G):
    """Return the three nodes of ``G`` with the highest betweenness centrality.

    ``G`` is first converted to a plain ``nx.DiGraph`` and the centrality is
    computed on that copy.

    Returns:
        dict mapping node -> betweenness score, ordered from highest to
        lowest; it has fewer than three entries when the graph has fewer
        than three nodes.
    """
    scores = nx.betweenness_centrality(nx.DiGraph(G))
    ranked_nodes = sorted(scores, key=scores.get, reverse=True)
    return {node: scores[node] for node in ranked_nodes[:3]}

def cal_node_closeness(G):
    """Return the three nodes of ``G`` with the highest closeness centrality.

    ``G`` is first converted to a plain ``nx.DiGraph`` and the centrality is
    computed on that copy.

    Returns:
        dict mapping node -> closeness score, ordered from highest to
        lowest; it has fewer than three entries when the graph has fewer
        than three nodes.
    """
    scores = nx.closeness_centrality(nx.DiGraph(G))
    ranked_nodes = sorted(scores, key=scores.get, reverse=True)
    return {node: scores[node] for node in ranked_nodes[:3]}

def cal_node_degree(G):
    """Return the three nodes of ``G`` with the highest degree centrality.

    ``G`` is first converted to a plain ``nx.DiGraph`` and the centrality is
    computed on that copy.

    Returns:
        dict mapping node -> degree-centrality score, ordered from highest
        to lowest; it has fewer than three entries when the graph has fewer
        than three nodes.
    """
    scores = nx.degree_centrality(nx.DiGraph(G))
    ranked_nodes = sorted(scores, key=scores.get, reverse=True)
    return {node: scores[node] for node in ranked_nodes[:3]}

#### Analysis for centrality metrics

In [11]:
# Filling method
def fill_df(graph_name, graph):
    """Append the top-3 centrality results of ``graph`` to ``df_centrality``.

    The module-level ``df_centrality`` dataframe is modified in place: one
    row is added at position ``len(df_centrality)``.

    Args:
        graph_name: Label stored in the row's 'Graph' column.
        graph: Graph passed to the three ``cal_node_*`` helpers.
    """
    # Dict order fixes both the call order and the column order of the row.
    top_nodes_by_metric = {
        'Betweenness': cal_node_betweenness(graph),
        'Closeness': cal_node_closeness(graph),
        'Degree': cal_node_degree(graph),
    }

    record = {'Graph': graph_name}
    for metric, top_nodes in top_nodes_by_metric.items():
        # Ranks start at 1 to match the node1/value1 ... column names.
        for rank, node in enumerate(top_nodes, start=1):
            record[(metric, f'node{rank}')] = node
            record[(metric, f'value{rank}')] = top_nodes[node]

    df_centrality.loc[len(df_centrality)] = record

In [12]:
# Result table layout: a 'Graph' label column followed by one
# (metric, field) column for each of the top-3 nodes and their scores.
metric_names = ['Betweenness', 'Closeness', 'Degree']
rank_fields = ['node1', 'value1', 'node2', 'value2', 'node3', 'value3']
columns = pd.MultiIndex.from_product([metric_names, rank_fields])
df_centrality = pd.DataFrame(columns=['Graph', *columns])

# One row per domain, computed on the full multigraph of that domain.
for domain_label, domain_multigraph in (
    ('Astronomy', multigraph_astronomy_full),
    ('Sampling', multigraph_sampling_full),
    ('Database', multigraph_database_full),
):
    fill_df(domain_label, domain_multigraph)

# Write the table to an Excel file.
# WARNING: an existing file at this path is overwritten without asking.
df_centrality.to_excel("result/All_centrality.xlsx", index=False)