In [1]:
# Imports
import json
import networkx as nx
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from joblib import Parallel, delayed
from IPython.display import Image, display

In [2]:
# Load DataFrames
df_articles = pd.read_csv("df_articles.csv")  # Replace with actual filename
df_authors = pd.read_csv("df_authors.csv")    # Replace with actual filename

In [None]:
import pandas as pd
import networkx as nx
from itertools import combinations


In [11]:

# Create an empty graph
G = nx.Graph()

# Iterate over each row in the dataframe
for index, row in df_articles.iterrows():
    authors = eval(row['author_ids'])
    # Add edges between each pair of authors with weight 1
    for author1, author2 in combinations(authors, 2):
        if G.has_edge(author1, author2):
            G[author1][author2]['weight'] += 1
        else:
            G.add_edge(author1, author2, weight=1)

# Print the weighted edgelist
weighted_edgelist = nx.to_pandas_edgelist(G)
print(weighted_edgelist)

            source       target  weight
0      A5014647140  A5082953212       2
1      A5014647140  A5067142016       4
2      A5014647140  A5008033989       5
3      A5014647140  A5069948947       1
4      A5014647140  A5078253058       1
...            ...          ...     ...
57512  A5083702049  A5004273745       1
57513  A5083702049  A5109934253       1
57514  A5100452647  A5004273745       1
57515  A5100452647  A5109934253       1
57516  A5004273745  A5109934253       1

[57517 rows x 3 columns]


In [13]:
# make undirected graph
G = nx.Graph()

# add weighted edges
G.add_weighted_edges_from(weighted_edgelist.itertuples(index=False, name=None))

G

<networkx.classes.graph.Graph at 0x7fbff91b36f0>

In [14]:
# Convert authors_df into a dictionary for quick lookup
author_metadata = {
    row["id"]: {
        "display_name": row["display_name"],
        "country_code": row["country_code"]
    }
    for _, row in df_authors.iterrows()
}

# Convert df_articles into a dictionary for first_pub_year & citation_count
author_publication_info = {}
for _, row in df_articles.iterrows():
    for author in row["author_ids"]:
        if author not in author_publication_info:
            author_publication_info[author] = {
                "first_pub_year": int(row["publication_year"]),  # Convert numpy.int64 to int
                "citation_count": int(row["cited_by_count"])  # Convert numpy.int64 to int
            }


In [15]:
for node in G.nodes():
    # Get author metadata from authors_df
    G.nodes[node]["display_name"] = author_metadata.get(node, {}).get("display_name", "Unknown")
    G.nodes[node]["country_code"] = author_metadata.get(node, {}).get("country_code", "Unknown")

    # Get publication info from df_articles
    G.nodes[node]["first_pub_year"] = author_publication_info.get(node, {}).get("first_pub_year", None)
    G.nodes[node]["citation_count"] = author_publication_info.get(node, {}).get("citation_count", 0)


In [16]:
import json
from networkx.readwrite import json_graph

# Convert NetworkX graph to JSON format
graph_data = json_graph.node_link_data(G)

# Save to a JSON file
with open("coauthorship_network_new.json", "w") as f:
    json.dump(graph_data, f, indent=4)

The default value will be `edges="edges" in NetworkX 3.6.


  nx.node_link_data(G, edges="links") to preserve current behavior, or
  nx.node_link_data(G, edges="edges") for forward compatibility.


In [12]:
import itertools

# generate co-author pairs and count occurrences
edge_list = []
for authors in df_articles['author_ids']:
    pairs = list(itertools.combinations(authors, 2))  # make all possible author A-B pairs
    edge_list.extend(pairs)

# count occurrences of each pair
edge_weights = {}
for pair in edge_list:
    if pair in edge_weights:
        edge_weights[pair] += 1
    else:
        edge_weights[pair] = 1

# convert to DataFrame (Weighted Edge List)
weighted_edge_list = pd.DataFrame(
    [(a, b, w) for (a, b), w in edge_weights.items()],
    columns=["source", "target", "weight"]
)

weighted_edge_list

Unnamed: 0,source,target,weight
0,[,',87146
1,[,A,43573
2,[,5,77915
3,[,0,77277
4,[,1,42138
...,...,...,...
220,3,3,60372
221,3,9,57225
222,3,8,62847
223,9,8,54568
