In [16]:
import networkx as nx
import igraph
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations

import os
import sys

# Psioriasis Data Formatting

In [12]:
base_dir = 'data'
file_name = 'psoriasis.xlsx'
file_path = os.path.join(base_dir, 'psoriasis', file_name)

label_df = pd.read_excel(file_path, sheet_name='Labels')
dataset_df = pd.read_excel(file_path, sheet_name='Dataset')

In [13]:
print("Label Sheet:")
print(label_df.head())

print("\nDataset Sheet:")
print(dataset_df.head())

Label Sheet:
  PMID: PubMed Identifier for indexed documents in Pubmed used for the study. PMIDs do not change over time or during processing and are never reused.
0           DP: Date that the article was published.                                                                                                  
1  FAU: Author name for articles published. From ...                                                                                                  
2  FAU-Revised: Author name after cleansing proce...                                                                                                  

Dataset Sheet:
       PMID    DP             FAU        FAU-Revised
0  18913797  1942     COMEL, M, M           COMEL, M
1  20981993  1945     DEGOS, R, R           Degos, R
2  21020988  1945  GOMEZ ORBANEJA  Gómez Orbaneja, J
3  21020988  1945    GARCIA PEREZ       GARCIA PEREZ
4  20983377  1945  NIEMEYER, A, A        NIEMEYER, A


In [17]:
dataset_df['author_id'] = dataset_df['FAU-Revised'].astype('category').cat.codes
author_id_map = dataset_df[['FAU-Revised', 'author_id']].drop_duplicates().reset_index(drop=True)

# Group by PMID and get list of author IDs for each paper
pmid_author_groups = dataset_df.groupby('PMID')['author_id'].apply(list)

# Create edge list by finding all combinations of authors per paper
edges = set()
for author_list in pmid_author_groups:
    if len(author_list) > 1:
        for pair in combinations(sorted(set(author_list)), 2):
            edges.add(pair)

# Convert to DataFrame
edge_df = pd.DataFrame(list(edges), columns=['source', 'target'])

# Show results
print("Author ID Map:")
print(author_id_map.head())

print("\nEdge List:")
print(edge_df.head())

Author ID Map:
         FAU-Revised  author_id
0           COMEL, M       6148
1           Degos, R       9786
2  Gómez Orbaneja, J      16218
3       GARCIA PEREZ      13723
4        NIEMEYER, A      30123

Edge List:
   source  target
0   16252   44038
1    6327   36473
2   25709   45496
3    7197   46909
4    4388   14647


In [19]:
print(f"Number of edges in the graph: {len(edge_df)}")

Number of edges in the graph: 211540


In [20]:
G = nx.Graph()

# Add nodes with author name as an attribute
for _, row in author_id_map.iterrows():
    G.add_node(row['author_id'], name=row['FAU-Revised'])
G.add_edges_from(edges)

In [22]:
degrees = dict(G.degree())
max_degree = max(degrees.values())
top_authors = [node for node, deg in degrees.items() if deg == max_degree]

# Get their names
top_author_names = [G.nodes[node]['name'] for node in top_authors]

print(f"Maximum degree: {max_degree}")
print("Author with maximum degree:")
for name in top_author_names:
    print("-", name)

Maximum degree: 655
Author with maximum degree:
- Griffiths, Christopher E M
