<a href="https://colab.research.google.com/github/Jeet009/Institute-Disambiguation-using-Author-Institution-Co-Occurence/blob/main/institute_disambiguation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q cupy-cuda12x

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.2/113.2 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h

The `fetch_all_works` function retrieves publication data from the Crossref API based on a specified filter and the number of rows per request. It implements a retry strategy for handling transient errors and uses cursor-based pagination to fetch up to 50,000 records. The fetched data is returned as a list of dictionaries and also saved to a CSV file named "items.csv".

In [None]:
import time
import requests
from requests.adapters import HTTPAdapter
from requests.exceptions import ChunkedEncodingError
from urllib3.util.retry import Retry
import pandas as pd

def fetch_all_works(filter_str: str, rows: int = 1000, max_records: int = 50000) -> list:
    """Download Crossref works with cursor pagination and save them to items.csv.

    The request always carries ``query.affiliation=India``; ``filter_str`` is
    passed through as the Crossref ``filter`` parameter.

    Args:
        filter_str: Crossref filter expression, e.g.
            ``"from-pub-date:2025-01-01,until-pub-date:2025-12-31"``.
        rows: Number of records requested per page.
        max_records: Upper bound on the number of records kept (default 50000,
            the value that used to be hard-coded in the loop condition).

    Returns:
        The fetched records as a list of dicts (one per work). The same records
        are also written to ``items.csv`` in the working directory.

    Raises:
        requests.HTTPError: if a page still fails after the adapter-level retries.
    """
    base_url = "https://api.crossref.org/works"

    # Retry strategy for robustness (transient HTTP errors, retried by urllib3)
    retry_strategy = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)

    # API request parameters
    params = {
        "query.affiliation": "India",
        "filter": filter_str,
        "rows": rows,
        "cursor": "*"
    }

    all_items = []
    # A truncated response body is not covered by the adapter retries, so it is
    # retried here -- but only a bounded number of times in a row.
    max_chunk_retries = 5
    chunk_failures = 0

    # The context manager closes the pooled connections when the download ends.
    with requests.Session() as session:
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        # Custom User-Agent for polite access
        session.headers.update({"User-Agent": "MyScript/1.0"})

        while len(all_items) < max_records:
            try:
                resp = session.get(base_url, params=params, timeout=30)
                resp.raise_for_status()
                data = resp.json()["message"]
            except ChunkedEncodingError as ex:
                chunk_failures += 1
                if chunk_failures > max_chunk_retries:
                    print(f"ChunkedEncodingError persisted after {max_chunk_retries} retries; "
                          f"stopping with partial results: {ex}")
                    break
                print(f"ChunkedEncodingError encountered—retrying: {ex}")
                time.sleep(2)
                continue
            chunk_failures = 0

            items = data.get("items", [])
            if not items:
                break
            all_items.extend(items)
            print(len(all_items))

            # Move to next page. Without a cursor the request would silently
            # restart from the first page and append duplicates, so stop instead.
            next_cursor = data.get("next-cursor")
            if not next_cursor:
                break
            params["cursor"] = next_cursor

    # The last page can overshoot when rows does not divide max_records.
    del all_items[max_records:]

    # Save results
    df = pd.DataFrame(all_items)
    df.to_csv("items.csv", index=False)
    print(f"Fetched {len(all_items)} total records.")
    return all_items


In [None]:
# Fetch works whose publication date falls in 2025; fetch_all_works also writes
# the records to items.csv and returns them as a list of dicts.
all_items = fetch_all_works(filter_str="from-pub-date:2025-01-01,until-pub-date:2025-12-31")

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
Fetched 50000 total records.


In [None]:
# fetch_all_works already wrote items.csv; here the returned records are turned
# into a DataFrame for the cells below and written to the copy that the reload
# cell reads back.
all_items = pd.DataFrame(all_items)
# index=False keeps the row index out of the file (as fetch_all_works does), so
# reading it back does not add a spurious "Unnamed: 0" column.
all_items.to_csv('items-2.csv', index=False)

### To continue in a later session, load the data from the CSV file if it already exists.
### *Otherwise, proceed with the data fetched from the API (recommended for first-time use).*

In [None]:
import pandas as pd
# Relative path, matching the cell that writes 'items-2.csv', so the reload also
# works when the working directory is not /content.
# Nested fields such as 'author' come back as strings and are parsed later.
all_items = pd.read_csv('items-2.csv')

  all_items = pd.read_csv('/content/items-2.csv')


In [None]:
# all_items.head()['indexed'].values
len(all_items)

50000

### Experiments performed for metadata (IGNORE)

In [None]:
# total_count = 0
# firstname_missing_count = 0
# lastname_missing_count = 0
# both_missing_count = 0
# total_missing_list = []

In [None]:
# from collections import defaultdict
# name_affiliation_count = defaultdict(lambda: defaultdict(int))
# counter=0
# import re
# import ast

# def clean_string(input_str):
#     # Remove leading numbers
#     input_str = re.sub(r'^\d+', '', input_str).strip()

#     # Check if the string starts with 'department' or 'division' and remove up to the first comma
#     if re.match(r'^(department|division)\s', input_str, flags=re.IGNORECASE):
#         print(input_str)
#         input_str = re.sub(r'^(department|division)\s[^,]*,', '', input_str, flags=re.IGNORECASE).strip()
#         print(input_str)
#     return input_str

# for i, item in all_items.iterrows():
#   try:
#     authors = item['author']
#     print('New Paper' + ' ' + str(counter))
#     counter+=1
#     for author in ast.literal_eval(authors):
#       try:
#         first_name = author['given']
#       except:
#         firstname_missing_count += 1
#         total_count += 1
#         total_missing_list.append(item)
#         continue
#       try:
#         last_name = author['family']
#       except:
#         lastname_missing_count += 1
#         total_count += 1
#         total_missing_list.append(item)
#         continue
#       if first_name and last_name:
#         full_name = first_name + ' ' + last_name

#         try:
#           affiliation = author['affiliation'][0]['name']
#           if affiliation[0] != ',':
#             affiliation_cleaned = clean_string(affiliation)
#             if affiliation_cleaned[0:5]!='India':
#               name_affiliation_count[full_name][affiliation_cleaned] += 1
#         except:
#               # print(author['affiliation'])
#             both_missing_count += 1
#             total_count += 1
#             total_missing_list.append(item)
#             continue

#   except:
#     print('NOT Worked')
#     total_count += 1
#     total_missing_list.append(item)
#     pass

In [None]:
# print(total_count)
# print(firstname_missing_count)
# print(lastname_missing_count)
# print(both_missing_count)
# len(total_missing_list)

In [None]:
# df_new = pd.DataFrame(total_missing_list)

In [None]:
# df_new['author'][2]

### Institute Disambiguation using Author-Institution Co-Occurrence

In [None]:
from collections import defaultdict
name_affiliation_count = defaultdict(lambda: defaultdict(int))
counter=0
import re
import ast
def clean_string(input_str):
    """Normalise a raw affiliation string.

    A leading run of digits is dropped and the result is stripped. If the
    string then starts with "department" or "division" (any case) followed by
    whitespace, everything up to and including the first comma is removed as
    well; when there is no comma the string is left as it is.
    """
    # Leading digits, then surrounding whitespace
    leading_digits = re.match(r'^\d+', input_str)
    if leading_digits is not None:
        input_str = input_str[leading_digits.end():]
    input_str = input_str.strip()

    # "Department ..." / "Division ..." prefix: cut through the first comma
    starts_with_unit = re.match(r'^(department|division)\s', input_str, flags=re.IGNORECASE)
    if starts_with_unit is not None:
        unit_prefix = re.match(r'^(department|division)\s[^,]*,', input_str, flags=re.IGNORECASE)
        if unit_prefix is not None:
            input_str = input_str[unit_prefix.end():].strip()
    return input_str

# Fill name_affiliation_count[full_name][affiliation] with the number of author
# entries in which that (name, first listed affiliation) pair appears.
for _, item in all_items.iterrows():
    counter += 1
    # Progress marker every 5000 rows; printing every row floods the output.
    if counter % 5000 == 0:
        print(counter)

    # After a CSV round trip 'author' is the repr of a list of dicts; records
    # that come straight from the API are already a list.
    raw_authors = item.get('author')
    if isinstance(raw_authors, str):
        try:
            authors = ast.literal_eval(raw_authors)
        except (ValueError, SyntaxError):
            print('NOT Worked')
            continue
    else:
        authors = raw_authors
    if not isinstance(authors, (list, tuple)):
        # Missing author field (e.g. NaN) or an unexpected payload.
        print('NOT Worked')
        continue

    for author in authors:
        if not isinstance(author, dict):
            continue

        # Read both name parts from *this* author. Falling through on a missing
        # key would silently reuse the previous author's name.
        first_name = author.get('given')
        last_name = author.get('family')
        if not (first_name and last_name):
            continue
        full_name = first_name + ' ' + last_name

        # Only the first listed affiliation is used.
        author_affs = author.get('affiliation')
        if not isinstance(author_affs, list) or not author_affs:
            continue
        first_aff = author_affs[0]
        affiliation = first_aff.get('name') if isinstance(first_aff, dict) else None
        if not isinstance(affiliation, str) or not affiliation or affiliation[0] == ',':
            continue

        affiliation_cleaned = clean_string(affiliation)
        if not affiliation_cleaned:
            # Nothing left after cleaning; an empty label is not an institute.
            continue
        # Skip entries that are only the country name. A prefix test on 'India'
        # would also discard every "Indian Institute of ..." affiliation.
        if affiliation_cleaned.strip(' ,.').lower() == 'india':
            continue

        name_affiliation_count[full_name][affiliation_cleaned] += 1

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
45015
45016
45017
45018
45019
45020
45021
45022
45023
45024
45025
45026
45027
45028
45029
45030
45031
45032
45033
45034
45035
45036
45037
45038
45039
45040
45041
45042
45043
45044
45045
45046
45047
45048
45049
45050
45051
45052
45053
45054
45055
45056
45057
45058
45059
45060
45061
45062
45063
45064
45065
45066
45067
45068
45069
45070
45071
45072
45073
45074
45075
45076
45077
45078
45079
45080
45081
45082
45083
45084
45085
45086
45087
45088
45089
45090
45091
45092
45093
45094
45095
45096
45097
45098
45099
45100
45101
45102
45103
45104
45105
45106
45107
45108
45109
45110
45111
45112
45113
45114
45115
45116
45117
45118
45119
45120
45121
45122
45123
45124
45125
45126
45127
45128
45129
45130
45131
45132
45133
45134
45135
45136
45137
45138
45139
45140
45141
45142
45143
45144
45145
45146
45147
45148
45149
45150
45151
45152
45153
45154
45155
45156
45157
45158
45159
45160
45161
45162
45163
45164
45165
45166
45167
45168
45169
45170

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix

# Step 1: Map names and affiliations to integer indices (saves memory)
names = list(name_affiliation_count.keys())
name_to_idx = {name: i for i, name in enumerate(names)}

# Collect affiliations in first-seen order. Going through a set would make the
# column order depend on string hashing, which changes between kernel restarts
# and makes positional results downstream non-reproducible.
affiliations = list(dict.fromkeys(
    aff
    for aff_counts in name_affiliation_count.values()
    for aff in aff_counts
))
aff_to_idx = {aff: i for i, aff in enumerate(affiliations)}

# Step 2: Prepare data for sparse matrix (COO format)
data = []
row_idx = []
col_idx = []

for name, aff_counts in name_affiliation_count.items():
    for aff, count in aff_counts.items():
        row_idx.append(name_to_idx[name])
        col_idx.append(aff_to_idx[aff])
        data.append(count)

# Step 3: Build sparse matrix (rows = author names, columns = affiliations)
sparse_matrix = coo_matrix((data, (row_idx, col_idx)),
                           shape=(len(names), len(affiliations)))

# Step 4: Convert to pandas DataFrame with sparse types, then move the author
# names from the index into a leading 'Name' column. Chained rather than
# in-place so the cell produces sparse_df in one expression.
sparse_df = (
    pd.DataFrame.sparse.from_spmatrix(sparse_matrix,
                                      index=names,
                                      columns=affiliations)
    .reset_index()
    .rename(columns={'index': 'Name'})
)

# Done!
sparse_df.head()



Unnamed: 0,Name,Unnamed: 2,"Accenture, Warsaw, Poland","Lovely Professional University Phagwara,India","A.J. Institute of Medical Sciences and Research Centre, Mangalore, Karnataka, India","Datta Meghe College of Engineering, Airoli, India","School of Electronics Engineering, VIT-AP University,Inavolu, Amaravati, Guntur,Andhra Pradesh,India,522241","IBM Research India, Bengaluru, Karnataka, India","KCG College of Technology,Department of Electrical and Electronics Engineering,Chennai,India","Audiology Program, College of Applied Medical Sciences, King Khalid University, Kingdom of Saudi Arabia",...,"Faculty of Engineering and Technology, SRM Institute of Science and Technology, Kattankulathur, Tamil Nadu, India","Government College of Engineering and Textile Technology, Serampore 712201, Calcutta, India","Dr. D. Y. Patil Institute of Technology,Department of Electronics and Telecommunication,Pimpri, Pune,India","University Institute of Pharma Sciences (UIPS), Chandigarh University, Gharuan, Mohali, 140413, Punjab, India","Indira Gandhi Delhi Technical University for Women,IT Department,Delhi,India","King Khalid University,College of Computer Science,Department of Computer Science,Asir-Abha,Saudi Arabia,61421","Postgraduate Student, Department of Oral and Maxillofacial Surgery, Sinhgad Dental College and Hospital, Pune, Maharashtra, India","Dr. YS Parmar University of Horticulture and Forestry, Solan, Himachal Pradesh, India","Aarupadai Veedu Institute of Technology,Department of EEE,Chennai,India",ICMR‐National Institute of Traditional Medicine Nehru Nagar Belagavi Karnataka 590010 India
0,VIJAY VISWANATHAN,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,RAJESH KESARI,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ANUBHA SRIVASTAVA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AMIT GUPTA,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AJOY TEWARI,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# To remove the numeric values from the institute name
# NOTE(review): stripping digits can make previously distinct column labels
# identical. compute_cooccurrence_on_tpu later filters with
# `~df.columns.duplicated()`, which keeps only the first column of each label;
# confirm that discarding (rather than summing) the others is intended.
sparse_df.columns = sparse_df.columns.str.replace(r'\d+', '', regex=True)
sparse_df.shape

(150604, 77484)

In [None]:
# EXPERIMENTS WITH BATCH PROCESSING
import math
import gc
import numpy as np
import pandas as pd
import jax
import jax.numpy as jnp
from scipy.sparse import csr_matrix

# ----------------------------------------------------------------------------
# TPU‐based Cooccurrence Computation using JAX
# ----------------------------------------------------------------------------

def compute_batch_cooccurrence(mask: jnp.ndarray) -> jnp.ndarray:
    """
    Co-occurrence product for one batch: mask.T @ mask.

    mask:    [batch_size, n_cols] indicator matrix (0 / non-zero entries)
    returns: [n_cols, n_cols] matrix; entry (i, j) sums mask[:, i] * mask[:, j]

    The print below runs whenever Python executes this body. Once wrapped in
    jax.jit that is presumably only during tracing, which matches the recorded
    output (the message appears for the first batch only).
    NOTE(review): the caller in this notebook passes an int8 mask; the product
    presumably keeps that dtype, so counts above 127 could wrap. Confirm.
    """
    print("MATRIX MULTIPLICATION")
    mask_transposed = mask.T
    return mask_transposed @ mask

# JIT‐compile the batch function for speed on TPU.
# The `device=` argument of jax.jit is deprecated. The computation follows its
# input instead, and compute_cooccurrence_on_tpu commits every batch to the TPU
# with jax.device_put before calling this. Defining the function therefore no
# longer requires a TPU to be attached.
compute_batch_cooccurrence_jit = jax.jit(compute_batch_cooccurrence)



In [None]:

def compute_cooccurrence_on_tpu(
    df: pd.DataFrame,
    name_col: str = "Name",
    batch_size: int = 5000,
    accum_dtype=None,
):
    """
    Compute the feature-by-feature co-occurrence matrix of ``df`` in row batches.

    1) Drop columns whose label starts with "Unnamed" and duplicated labels
       (only the first column of each duplicated label is kept)
    2) Slice the remaining frame into batches of ``batch_size`` rows
    3) Densify each batch to an int8 0/1 mask
    4) Transfer to TPU and compute mask.T @ mask via the JIT-compiled helper
    5) Accumulate the per-batch results on CPU

    Args:
        df: Frame with one row per name and one column per feature, plus ``name_col``.
        name_col: Column excluded from the feature set.
        batch_size: Number of rows densified and sent to the device at once.
        accum_dtype: dtype of the CPU accumulator. ``None`` (default) keeps the
            dtype of the first batch result, as before. Pass e.g. ``np.int32``
            if totals may exceed what that dtype can hold.

    Returns:
        ``(co_df, df)``: the co-occurrence DataFrame indexed by feature label on
        both axes, and the pruned input frame from step 1.

    NOTE(review): the mask is int8, so each per-batch product is presumably int8
    too (the recorded allocation size is about n_cols**2 bytes). Counts above
    127 within one batch would then wrap on the device regardless of
    ``accum_dtype``. Confirm, and lower ``batch_size`` if that can happen.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # ─── Prune unwanted columns ──────────────────────────────────────────
    df = df.loc[:, ~df.columns.str.startswith("Unnamed")]
    df = df.loc[:, ~df.columns.duplicated()]

    # ─── Feature selection ──────────────────────────────────────────────
    feature_cols = [c for c in df.columns if c != name_col]
    n_feats = len(feature_cols)
    n_rows  = len(df)
    total_co = None

    # Look the device up once instead of on every iteration.
    tpu_device = jax.devices('tpu')[0]

    # ─── Batch loop ─────────────────────────────────────────────────────
    for start in range(0, n_rows, batch_size):
        end = min(start + batch_size, n_rows)
        sub_df = df.iloc[start:end][feature_cols]
        print(f"Processing rows {start} to {end}, shape={sub_df.shape}")

        # Densify to an int8 0/1 mask. The comparison runs on the original
        # counts: casting to int8 first would turn a multiple of 256 into 0.
        mask_np = (sub_df.values != 0).astype(np.int8)

        # Transfer to TPU and compute
        mask_jax = jax.device_put(mask_np, device=tpu_device)
        print('TPU CALCULATION')
        batch_co = compute_batch_cooccurrence_jit(mask_jax)
        print(tpu_device.memory_stats())

        # Bring result back to CPU (np.array makes a writable host copy)
        batch_co_cpu = np.array(batch_co)

        # Accumulate in place so no extra n_feats x n_feats temporary is built
        if total_co is None:
            total_co = batch_co_cpu if accum_dtype is None else batch_co_cpu.astype(accum_dtype)
        else:
            total_co += batch_co_cpu

        # Cleanup
        del sub_df, mask_np, mask_jax, batch_co, batch_co_cpu
        gc.collect()

    if total_co is None:
        # No rows at all: return an explicit all-zero matrix rather than NaNs.
        total_co = np.zeros((n_feats, n_feats),
                            dtype=np.int8 if accum_dtype is None else accum_dtype)

    # Wrap into DataFrame
    co_df = pd.DataFrame(total_co, index=feature_cols, columns=feature_cols)
    print("Final co‑matrix shape:", co_df.shape)
    return co_df, df


In [None]:
# NOTE(review): this notebook-level feature_cols is not passed to the call below
# (the function derives its own list after pruning columns) and no later visible
# cell reads it. Confirm whether it is still needed.
feature_cols = [c for c in sparse_df.columns if c != "Name"]

# Returns the co-occurrence frame (corr_mat) and the pruned input frame (san_df).
corr_mat, san_df = compute_cooccurrence_on_tpu(
        sparse_df,
        name_col    = "Name",
        batch_size  = 3000, # Reduced batch size
    )

Processing rows 0 to 3000, shape=(3000, 77305)
TPU CALCULATION
MATRIX MULTIPLICATION
{'num_allocs': 4, 'bytes_in_use': 6209537536, 'peak_bytes_in_use': 6209537536, 'largest_alloc_size': 5977145344, 'bytes_limit': 16909336064, 'bytes_reserved': 0, 'peak_bytes_reserved': 0, 'bytes_reservable_limit': 16909336064, 'largest_free_block_bytes': 10699798528}
Processing rows 3000 to 6000, shape=(3000, 77305)
TPU CALCULATION
{'num_allocs': 4, 'bytes_in_use': 6209537536, 'peak_bytes_in_use': 6209537536, 'largest_alloc_size': 5977145344, 'bytes_limit': 16909336064, 'bytes_reserved': 0, 'peak_bytes_reserved': 0, 'bytes_reservable_limit': 16909336064, 'largest_free_block_bytes': 10699798528}
Processing rows 6000 to 9000, shape=(3000, 77305)
TPU CALCULATION
{'num_allocs': 4, 'bytes_in_use': 6209537536, 'peak_bytes_in_use': 6209537536, 'largest_alloc_size': 5977145344, 'bytes_limit': 16909336064, 'bytes_reserved': 0, 'peak_bytes_reserved': 0, 'bytes_reservable_limit': 16909336064, 'largest_free_block_

In [None]:
corr_mat.shape

(77305, 77305)

In [None]:
# corr_mat is already a DataFrame labelled with the feature columns on both axes
# (compute_cooccurrence_on_tpu builds it that way). Re-wrapping it with
# san_df.columns[1:] reindexes the full square matrix and depends on 'Name'
# being the first column. The values are large raw co-occurrence counts, not
# normalised correlations.
correlation_df = corr_mat

In [None]:
correlation_df

Unnamed: 0,Unnamed: 1,"Accenture, Warsaw, Poland","Lovely Professional University Phagwara,India","A.J. Institute of Medical Sciences and Research Centre, Mangalore, Karnataka, India","Datta Meghe College of Engineering, Airoli, India","School of Electronics Engineering, VIT-AP University,Inavolu, Amaravati, Guntur,Andhra Pradesh,India,","IBM Research India, Bengaluru, Karnataka, India","KCG College of Technology,Department of Electrical and Electronics Engineering,Chennai,India","Audiology Program, College of Applied Medical Sciences, King Khalid University, Kingdom of Saudi Arabia","American Physical Society, College\nPark, MD , United States of America",...,"Faculty of Engineering and Technology, SRM Institute of Science and Technology, Kattankulathur, Tamil Nadu, India","Government College of Engineering and Textile Technology, Serampore , Calcutta, India","Dr. D. Y. Patil Institute of Technology,Department of Electronics and Telecommunication,Pimpri, Pune,India","University Institute of Pharma Sciences (UIPS), Chandigarh University, Gharuan, Mohali, , Punjab, India","Indira Gandhi Delhi Technical University for Women,IT Department,Delhi,India","King Khalid University,College of Computer Science,Department of Computer Science,Asir-Abha,Saudi Arabia,","Postgraduate Student, Department of Oral and Maxillofacial Surgery, Sinhgad Dental College and Hospital, Pune, Maharashtra, India","Dr. YS Parmar University of Horticulture and Forestry, Solan, Himachal Pradesh, India","Aarupadai Veedu Institute of Technology,Department of EEE,Chennai,India",ICMR‐National Institute of Traditional Medicine Nehru Nagar Belagavi Karnataka India
,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Accenture, Warsaw, Poland",0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Lovely Professional University Phagwara,India",0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"A.J. Institute of Medical Sciences and Research Centre, Mangalore, Karnataka, India",0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Datta Meghe College of Engineering, Airoli, India",0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"King Khalid University,College of Computer Science,Department of Computer Science,Asir-Abha,Saudi Arabia,",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
"Postgraduate Student, Department of Oral and Maxillofacial Surgery, Sinhgad Dental College and Hospital, Pune, Maharashtra, India",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
"Dr. YS Parmar University of Horticulture and Forestry, Solan, Himachal Pradesh, India",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5,0,0
"Aarupadai Veedu Institute of Technology,Department of EEE,Chennai,India",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### Graph Visualization

In [None]:
import numpy as np
import networkx as nx
import pandas as pd

# Set threshold to filter edges
threshold = 0  # only correlations above this (absolute) are considered

# Mask self-correlations on an explicit array copy. Writing through
# `DataFrame.values` only works while pandas returns a writable view of the
# frame's data, which is not the case under Copy-on-Write.
corr_values = correlation_df.to_numpy(copy=True)
np.fill_diagonal(corr_values, 0)
corr_matrix = pd.DataFrame(corr_values,
                           index=correlation_df.index,
                           columns=correlation_df.columns,
                           copy=False)

# Create mask of where abs(correlation) > threshold
mask = (corr_matrix.abs() > threshold)

# Get upper triangle only to avoid duplicates (symmetric matrix)
mask_upper = np.triu(mask.values, k=1)

# Get index pairs where mask is True
rows, cols = np.where(mask_upper)

# Create edge list with weights. Labels and values are looked up once here,
# not once per edge. Off-diagonal entries of corr_values equal correlation_df's.
row_labels = correlation_df.index
col_labels = correlation_df.columns
edges = [
    (row_labels[i], col_labels[j], corr_values[i, j])
    for i, j in zip(rows, cols)
]

# Build graph
G = nx.Graph()
G.add_nodes_from(correlation_df.columns)  # Add nodes
G.add_weighted_edges_from(edges)   # Add edges with weights


In [None]:
import networkx as nx
import plotly.graph_objects as go
import numpy as np

# --- Step 1: Threshold edges ---
threshold = 5

# Work on a copy of G and drop every edge whose weight is not above the threshold
G_threshold = G.copy()
edges_to_remove = [
    (u, v)
    for u, v, weight in G_threshold.edges(data='weight', default=0)
    if weight <= threshold
]
G_threshold.remove_edges_from(edges_to_remove)

# Nodes left without any edge are dropped as well (optional, cleaner)
isolated_nodes = list(nx.isolates(G_threshold))
G_threshold.remove_nodes_from(isolated_nodes)

# --- Step 2: Find Top 5 Largest Connected Components ---
# Largest first; ties keep the order networkx yields them in
components = list(nx.connected_components(G_threshold))
components.sort(key=len, reverse=True)
top_5_components = components[:5]

# --- Step 3: Plotting function ---
def plot_component(subgraph, title="Graph Component"):
    """Draw one graph component with plotly and return the figure.

    Nodes are placed with a seeded spring layout, labelled with their name and
    coloured by degree centrality. Each edge is its own trace so that its line
    width can follow the edge weight.

    Args:
        subgraph: networkx graph to draw; edges may carry a 'weight' attribute
            (1 is used when it is missing).
        title: Figure title.

    Returns:
        The plotly Figure. It is also displayed via ``fig.show()`` before returning.
    """
    pos = nx.spring_layout(subgraph, seed=42, k=0.3)

    # Edges
    edge_traces = []
    for u, v, data in subgraph.edges(data=True):
        x0, y0 = pos[u]
        x1, y1 = pos[v]
        weight = data.get('weight', 1)
        edge_width = weight  # scale edge width

        edge_trace = go.Scatter(
            x=[x0, x1, None],
            y=[y0, y1, None],
            line=dict(width=edge_width, color='#888'),
            hoverinfo='none',
            mode='lines'
        )
        edge_traces.append(edge_trace)

    # Nodes: coordinates, labels and colours are collected in the same pass so
    # they stay aligned with each other.
    centrality = nx.degree_centrality(subgraph)

    node_x = []
    node_y = []
    node_text = []
    node_colors = []
    for node in subgraph.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        node_text.append(f"{node}")
        node_colors.append(centrality[node])
    node_colors = np.array(node_colors)

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        text=node_text,
        textposition='top center',
        textfont_size=10,
        marker=dict(
            showscale=True,
            colorscale='YlGnBu',
            reversescale=True,
            color=node_colors,
            size=8,
            colorbar=dict(
                thickness=15,
                # `titleside` is a deprecated colorbar property; the side
                # belongs to the title object.
                title=dict(text='Degree Centrality', side='right'),
                xanchor='left'
            ),
            line=dict(width=2)
        ),
        hoverinfo='text'
    )

    fig = go.Figure(data=edge_traces + [node_trace],
                    layout=go.Layout(
                        # `titlefont` is likewise deprecated in favour of title.font.
                        title=dict(text=title, font=dict(size=16)),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
                    ))

    fig.show()
    return fig

# --- Step 4: Plot Top 5 Components ---
# plot_component displays each figure itself (it calls fig.show()); the figure it
# returns is not kept here.
for i, component_nodes in enumerate(top_5_components):
    # .copy() detaches the component from G_threshold before plotting
    subgraph = G_threshold.subgraph(component_nodes).copy()
    plot_component(subgraph, title=f"Top {i+1} Connected Component (Threshold > {threshold})")


In [None]:
components[1155]

{'CSIR‐Institute of Minerals and Materials Technology  Bhubaneswar  India',
 'Materials Chemistry and Interfacial Engineering Department, CSIR-Institute of Minerals and Materials Technology, Acharya Vihar, Bhubaneswar-, Odisha, India'}

### Save as a JSON File

In [None]:
import json
import networkx as nx
import numpy as np # Import numpy

def _to_builtin(value):
    """Turn a numpy scalar (integer or float) into the matching Python scalar for json."""
    return value.item() if isinstance(value, np.generic) else value


json_dict = {}
# Get memory stats once
memory_stats = jax.devices('tpu')[0].memory_stats()

# Convert numpy scalars to Python scalars; tolerate a backend that reports no stats
json_dict["memory_stats"] = {key: _to_builtin(value) for key, value in (memory_stats or {}).items()}


# Iterate through every component (largest first), not only the top 10
for i, component_nodes in enumerate(components):
    subgraph = G_threshold.subgraph(component_nodes)
    # Calculate centrality (degree centrality is used in plotting)
    centrality = nx.degree_centrality(subgraph)
    # Find the node with the highest centrality (most central)
    central_node = max(centrality, key=centrality.get)

    # Collect the component's edges with JSON-serialisable weights
    subgraph_edges = [
        {
            "source": u,
            "target": v,
            "weight": _to_builtin(data.get('weight', 0))
        }
        for u, v, data in subgraph.edges(data=True)
    ]

    json_dict[str(i)] = {
        "nodes": sorted(list(component_nodes)),
        "num_nodes": len(component_nodes),
        "central_node": central_node,
        "edges": subgraph_edges
    }


# NOTE(review): the file name says "th2" (and "instututes") while the threshold
# cell sets threshold = 5. Left unchanged in case something consumes this exact
# name; confirm and rename.
with open('components50kinstututesth2.json', 'w') as f:
    json.dump(json_dict, f, indent=2)

### Download as HTML File

In [None]:
import plotly.io as pio
top_10_components = components[:10]

# Institute names contain non-ASCII characters (e.g. the '‐' hyphen), so the
# encoding is fixed here and does not depend on the platform default.
with open("50k_components_th5.html", "w", encoding="utf-8") as f:
    for i, component_nodes in enumerate(top_10_components):
        subgraph = G_threshold.subgraph(component_nodes).copy()
        fig = plot_component(subgraph, title=f"Top {i+1} Connected Component (Threshold > {threshold})")
        # Reference plotly.js from the CDN in the first fragment only; the later
        # fragments live in the same page and reuse it.
        f.write(pio.to_html(fig, full_html=False,
                            include_plotlyjs='cdn' if i == 0 else False))


### Store all components as JSON

In [None]:
import json

# Map each component's rank (as a string key) to its alphabetically sorted members
indexed_data = {
    str(position): sorted(members)
    for position, members in enumerate(components)
}
# Save to a JSON file
with open('institutes.json', 'w') as f:
    json.dump(indexed_data, f, indent=4)