# Single Graph Analysis
### *(Individual Metrics)*

#### Load packages

In [11]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt

#### Load data

In [13]:
# Only a sample of the workbook's sheets is loaded here for the analysis.
# The 'Name' column is not used afterwards, so it is dropped right after reading.
data_astronomy1 = pd.read_excel('data/Astronomy.xlsx', sheet_name='강민철').drop(columns='Name')
data_astronomy2 = pd.read_excel('data/Astronomy.xlsx', sheet_name='강지헌').drop(columns='Name')

# Stack the per-sheet frames row-wise into a single dataframe.
data_astronomy = pd.concat([data_astronomy1, data_astronomy2], axis=0)
# data_astronomy

In [14]:
# Read the selected sheets of the Sampling workbook; the 'Name' column is not
# used afterwards, so it is dropped right after reading.
data_sampling1 = pd.read_excel('data/Sampling.xlsx', sheet_name='강지헌').drop(columns='Name')
data_sampling2 = pd.read_excel('data/Sampling.xlsx', sheet_name='신아현').drop(columns='Name')
data_sampling3 = pd.read_excel('data/Sampling.xlsx', sheet_name='신수연').drop(columns='Name')

# Stack the per-sheet frames row-wise into a single dataframe.
data_sampling = pd.concat([data_sampling1, data_sampling2, data_sampling3], axis=0)
# data_sampling

In [15]:
# Read the selected sheets of the Database workbook; the 'Name' column is not
# used afterwards, so it is dropped right after reading.
data_database1 = pd.read_excel('data/Database.xlsx', sheet_name='신수연').drop(columns='Name')
data_database2 = pd.read_excel('data/Database.xlsx', sheet_name='양연선').drop(columns='Name')
data_database3 = pd.read_excel('data/Database.xlsx', sheet_name='김나영').drop(columns='Name')

# Stack the per-sheet frames row-wise into a single dataframe.
data_database = pd.concat([data_database1, data_database2, data_database3], axis=0)
# data_database

#### Process graphs
- 이름 형식: `<Domain>_<Modality>_<ID>`
- Domain: `Astronomy`, `Sampling`, `Database`

***참고사항**:
    혹시 몰라 노드에 P.Knowledge를 추가로 라벨링해두었지만, 분석할 때는 그냥 df상에서도 충분히 가능해보임*

In [16]:
graphs_astronomy = {}

# Build one directed graph per 'ID' group of the Astronomy data.
for _group_id, sub_df in data_astronomy.groupby('ID'):
    # Graph key follows "<Domain>_<Modality>_<ID>", read from the group's first row.
    graph_name = f"Astronomy_{sub_df['Mod.'].iloc[0]}_{sub_df['ID'].iloc[0]}"
    G = nx.DiGraph()

    for _, row in sub_df.iterrows():
        start_node = row['Start']
        if isinstance(start_node, str):
            # Surrounding blanks must not create a second, distinct node.
            start_node = start_node.strip()

        # P.Knowledge label:  O -> 1(true)  |  anything else -> 0(false).
        # add_node() creates the node if it is missing and otherwise just updates
        # the attribute, so a start node without any 'End' keeps its real label.
        G.add_node(start_node, **{'P.Knowledge': 1 if row['P.Knowledge'] == 'O' else 0})

        # 'End' holds a comma-separated list of target nodes (or is empty).
        if pd.notna(row['End']):
            for end_node in str(row['End']).split(','):
                # Strip both sides ("A, B" -> "A", "B") and skip empty pieces
                # produced by stray or trailing commas.
                end_node = end_node.strip()
                if end_node:
                    G.add_edge(start_node, end_node)

    # Save the graph
    graphs_astronomy[graph_name] = G

In [17]:
graphs_sampling = {}

# Build one directed graph per 'ID' group of the Sampling data.
for _group_id, sub_df in data_sampling.groupby('ID'):
    # Graph key follows "<Domain>_<Modality>_<ID>", read from the group's first row.
    graph_name = f"Sampling_{sub_df['Mod.'].iloc[0]}_{sub_df['ID'].iloc[0]}"
    G = nx.DiGraph()

    for _, row in sub_df.iterrows():
        start_node = row['Start']
        if isinstance(start_node, str):
            # Surrounding blanks must not create a second, distinct node.
            start_node = start_node.strip()

        # P.Knowledge label:  O -> 1(true)  |  anything else -> 0(false).
        # add_node() creates the node if it is missing and otherwise just updates
        # the attribute, so a start node without any 'End' keeps its real label.
        G.add_node(start_node, **{'P.Knowledge': 1 if row['P.Knowledge'] == 'O' else 0})

        # 'End' holds a comma-separated list of target nodes (or is empty).
        if pd.notna(row['End']):
            for end_node in str(row['End']).split(','):
                # Strip both sides ("A, B" -> "A", "B") and skip empty pieces
                # produced by stray or trailing commas.
                end_node = end_node.strip()
                if end_node:
                    G.add_edge(start_node, end_node)

    # Save the graph
    graphs_sampling[graph_name] = G

In [18]:
graphs_database = {}

# Build one directed graph per 'ID' group of the Database data.
for _group_id, sub_df in data_database.groupby('ID'):
    # Graph key follows "<Domain>_<Modality>_<ID>", read from the group's first row.
    graph_name = f"Database_{sub_df['Mod.'].iloc[0]}_{sub_df['ID'].iloc[0]}"
    G = nx.DiGraph()

    for _, row in sub_df.iterrows():
        start_node = row['Start']
        if isinstance(start_node, str):
            # Surrounding blanks must not create a second, distinct node.
            start_node = start_node.strip()

        # P.Knowledge label:  O -> 1(true)  |  anything else -> 0(false).
        # add_node() creates the node if it is missing and otherwise just updates
        # the attribute, so a start node without any 'End' keeps its real label.
        G.add_node(start_node, **{'P.Knowledge': 1 if row['P.Knowledge'] == 'O' else 0})

        # 'End' holds a comma-separated list of target nodes (or is empty).
        if pd.notna(row['End']):
            for end_node in str(row['End']).split(','):
                # Strip both sides ("A, B" -> "A", "B") and skip empty pieces
                # produced by stray or trailing commas.
                end_node = end_node.strip()
                if end_node:
                    G.add_edge(start_node, end_node)

    # Save the graph
    graphs_database[graph_name] = G

#### Methods for individual metrics
1. `num_of_concepts`: 노드 개수와 동일
2. `num_of_relationships`: 엣지 개수와 동일
3. `num_of_hierarchies`
4. `num_of_crosslinks`
5. `cal_aspl`: Average shortest path length
6. `cal_cc`: Clustering coefficient
7. `cal_network_density`
8. `cal_complexity`
9. `get_level_deepest_hierarchy`

In [None]:
def num_of_concepts(G):
    """Return the number of concepts, i.e. the node count reported by ``G``."""
    node_count = G.number_of_nodes()
    return node_count

def num_of_relationships(G):
    """Return the number of relationships, i.e. the edge count reported by ``G``."""
    edge_count = G.number_of_edges()
    return edge_count

def num_of_hierarchies(G):
    """Return the number of hierarchies, counted as connected pieces of the map.

    Edge direction is ignored: two concepts belong to the same hierarchy when
    they are linked by any chain of relationships. Strongly connected
    components are deliberately not used here, because in an acyclic directed
    map every single node is its own strongly connected component, which would
    make this count equal to the number of nodes.

    Args:
        G: A networkx graph (directed or undirected).

    Returns:
        int: Number of (weakly) connected components; 0 for an empty graph.
    """
    if G.is_directed():
        return nx.number_weakly_connected_components(G)
    return nx.number_connected_components(G)

def num_of_crosslinks(G):
    """Return relationships minus concepts plus hierarchies for ``G``."""
    relationships = num_of_relationships(G)
    concepts = num_of_concepts(G)
    hierarchies = num_of_hierarchies(G)
    return relationships - concepts + hierarchies

def cal_aspl(G):
    """Return the average shortest path length (ASPL) of ``G``, or 0 if undefined.

    Args:
        G: A networkx graph.

    Returns:
        The value of ``nx.average_shortest_path_length(G)``, or 0 when the
        graph is empty or networkx rejects it as not connected.
    """
    # networkx raises NetworkXPointlessConcept (not a NetworkXError) for a graph
    # without nodes, so that case is handled up front.
    if G.number_of_nodes() == 0:
        return 0
    try:
        return nx.average_shortest_path_length(G)
    except nx.NetworkXError:
        # Raised when the graph is not connected (for a directed graph: not
        # strongly connected).
        # NOTE(review): an acyclic directed map with more than one node is never
        # strongly connected, so it ends up here — confirm that 0 is the
        # intended ASPL for such maps.
        return 0

def cal_cc(G):
    """Return the average clustering coefficient of ``G``.

    Args:
        G: A networkx graph.

    Returns:
        The value of ``nx.average_clustering(G)``, or 0 for a graph without
        nodes (networkx would divide by zero there).
    """
    if G.number_of_nodes() == 0:
        return 0
    return nx.average_clustering(G)

def cal_network_density(G):
    """Return the density of ``G`` as computed by ``nx.density``."""
    density = nx.density(G)
    return density

def cal_complexity(G):
    """Return the edge-to-node ratio of ``G``; 0 when the graph has no nodes."""
    edge_count = G.number_of_edges()
    node_count = G.number_of_nodes()
    # Guard against dividing by zero on an empty graph.
    if not node_count > 0:
        return 0
    return edge_count / node_count

def get_level_deepest_hierarchy(G):
    """Return the number of levels of the deepest hierarchy in ``G``.

    The depth is the number of nodes on the longest shortest path between any
    two nodes where one is reachable from the other (edge direction is
    respected). A single isolated node therefore counts as depth 1.

    Only reachable pairs are considered. Querying a shortest path for every
    ordered pair raises ``NetworkXNoPath`` as soon as one pair is unreachable,
    which is the normal case in a directed, acyclic map and would otherwise
    discard the depth of the whole component.

    Args:
        G: A networkx graph.

    Returns:
        int: Depth in levels (nodes on the path); 0 for an empty graph.
    """
    max_depth = 0
    for source in G:
        # Maps every node reachable from `source` (including itself, at
        # distance 0) to its hop distance.
        distances = nx.single_source_shortest_path_length(G, source)
        # +1 converts the hop count into the number of nodes on the path.
        depth = max(distances.values()) + 1
        if depth > max_depth:
            max_depth = depth
    return max_depth

#### Analysis for individual metrics

In [19]:
# Result tables for the analysis: one empty dataframe per domain, indexed by
# "ID", with one column per individual metric.
columns = [
    "ID",
    "num_of_concepts",
    "num_of_relationships",
    "num_of_hierarchies",
    "num_of_crosslinks",
    "ASPL",
    "CC",
    "network_density",
    "complexity",
    "level_of_deepest_hierarchy",
]
df_astronomy, df_sampling, df_database = (
    pd.DataFrame(columns=columns).set_index("ID") for _ in range(3)
)

In [21]:
import os

# Compute every individual metric for each Astronomy graph. The list order
# must match the order of the metric columns of df_astronomy.
for graph_name, G in graphs_astronomy.items():
    try:
        df_astronomy.loc[graph_name] = [
            num_of_concepts(G),
            num_of_relationships(G),
            num_of_hierarchies(G),
            num_of_crosslinks(G),
            cal_aspl(G),
            cal_cc(G),
            cal_network_density(G),
            cal_complexity(G),
            get_level_deepest_hierarchy(G),
        ]
    except nx.NetworkXNoPath as e:
        # The graph gets no row in the table; report which one was skipped.
        print(f"Path error at {graph_name}: {e}")

# Save the result to excel file
# ❗WARNING❗ It will automatically overwrite the existing file!!!
# to_excel() does not create missing folders, so make sure "result/" exists.
os.makedirs("result", exist_ok=True)
df_astronomy.to_excel("result/Astronomy_indiv.xlsx", index=True)

In [None]:
import os

# Compute every individual metric for each Sampling graph. The list order
# must match the order of the metric columns of df_sampling.
for graph_name, G in graphs_sampling.items():
    try:
        df_sampling.loc[graph_name] = [
            num_of_concepts(G),
            num_of_relationships(G),
            num_of_hierarchies(G),
            num_of_crosslinks(G),
            cal_aspl(G),
            cal_cc(G),
            cal_network_density(G),
            cal_complexity(G),
            get_level_deepest_hierarchy(G),
        ]
    except nx.NetworkXNoPath as e:
        # The graph gets no row in the table; report which one was skipped.
        print(f"Path error at {graph_name}: {e}")

# Save the result to excel file
# ❗WARNING❗ It will automatically overwrite the existing file!!!
# to_excel() does not create missing folders, so make sure "result/" exists.
os.makedirs("result", exist_ok=True)
df_sampling.to_excel("result/Sampling_indiv.xlsx", index=True)

In [None]:
import os

# Compute every individual metric for each Database graph. The list order
# must match the order of the metric columns of df_database.
for graph_name, G in graphs_database.items():
    try:
        df_database.loc[graph_name] = [
            num_of_concepts(G),
            num_of_relationships(G),
            num_of_hierarchies(G),
            num_of_crosslinks(G),
            cal_aspl(G),
            cal_cc(G),
            cal_network_density(G),
            cal_complexity(G),
            get_level_deepest_hierarchy(G),
        ]
    except nx.NetworkXNoPath as e:
        # The graph gets no row in the table; report which one was skipped.
        print(f"Path error at {graph_name}: {e}")

# Save the result to excel file
# ❗WARNING❗ It will automatically overwrite the existing file!!!
# to_excel() does not create missing folders, so make sure "result/" exists.
os.makedirs("result", exist_ok=True)
df_database.to_excel("result/Database_indiv.xlsx", index=True)