In [None]:
"""
IMPORTANT:
This project is designed to run exclusively on Google Colab.

It relies on Google Drive being mounted at:
    /content/drive/MyDrive/

Local execution is not supported.
"""


**INSTALLING LIBRARIES**

In [None]:
!pip install igraph faiss-cpu leidenalg

Collecting igraph
  Downloading igraph-0.11.9-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting leidenalg
  Downloading leidenalg-0.10.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting texttable>=1.6.2 (from igraph)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading igraph-0.11.9-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading leidenalg-0.10.2-cp38-abi3-manylinux_2_17_x86_64.manylinux201

**MOUNTING DRIVE**

In [None]:
from google.colab import drive
drive.mount('/content/drive')  # Mount Google Drive into the Colab environment to access the project files

# Change the current working directory to the NeuroScape project folder in Google Drive
# (presumably so that the later `src.*` imports resolve relative to it — verify).
# NOTE(review): the recorded cell output shows a different directory
# (/content/drive/MyDrive/Trabalhos/TJ/NeuroScape) — confirm which path is current.
%cd /content/drive/MyDrive/NeuroScape

Mounted at /content/drive
/content/drive/MyDrive/Trabalhos/TJ/NeuroScape


**IMPORTING LIBRARIES**

In [None]:
import os
import igraph as ig
import psutil
import sys
import leidenalg
import numpy as np
import pandas as pd
from collections import deque
from glob import glob
from src.utils.clustering import *
from src.utils.parsing import parse_directories

from src.utils.load_and_save import load_embedding_shards
from dotenv import load_dotenv, find_dotenv

**Graph construction**

In [None]:
# Build a k-nearest-neighbour similarity graph over the article embeddings and
# store it as GraphML so the community-detection cell can load it.
#
# Reads : every '*.h5' embedding shard in the configured shard directory.
# Writes: <graph_directory>/article_similarity.graphml
# Raises: KeyError          if BASEPATH is not defined in the environment,
#         FileNotFoundError if no embedding shards are found,
#         MemoryError       if the k-NN search is not expected to fit in memory.
from src.utils.clustering import (check_memory_constraints,
                                  construct_knn_graph, load_configurations)
from src.utils.parsing import parse_directories

from src.utils.load_and_save import load_embedding_shards
from dotenv import load_dotenv, find_dotenv

# Only extend sys.path once, so re-running the cell does not pile up duplicates.
if '/content/drive/MyDrive/NeuroScape/src' not in sys.path:
    sys.path.append('/content/drive/MyDrive/NeuroScape/src')

load_dotenv("/content/drive/MyDrive/NeuroScape/keys.env")
if 'BASEPATH' not in os.environ:
    # Same exception type as the bare lookup, but with an actionable message.
    raise KeyError(
        "BASEPATH is not set; expected it to be defined in "
        "/content/drive/MyDrive/NeuroScape/keys.env")
BASEPATH = os.environ['BASEPATH']

configurations = load_configurations()['graph_construction']
directories = parse_directories()

shard_directory = os.path.join(
    BASEPATH, directories['internal']['intermediate']['hdf5']['neuro'])
graph_directory = os.path.join(
    BASEPATH, directories['internal']['intermediate']['graphs'])

# glob returns matches in arbitrary order; sorting makes the shard order (and
# therefore the vertex order of the graph) the same on every run.
files = sorted(glob(os.path.join(shard_directory, '*.h5')))
if not files:
    raise FileNotFoundError(
        f"No '*.h5' embedding shards found in {shard_directory}")
graph_file = os.path.join(graph_directory, 'article_similarity.graphml')

print('Loading embeddings...')
embeddings, pmids = load_embedding_shards(files)

num_points = embeddings.shape[0]
num_neighbors = configurations['num_neighbors']
available_memory_gb = psutil.virtual_memory().available / (1024**3)

# Check if the selected k will fit into memory
if not check_memory_constraints(num_points, num_neighbors,
                                available_memory_gb):
    raise MemoryError(
        "Not enough memory for the selected k. Please reduce k or upgrade your hardware."
    )

print('Constructing k-NN graph...')
edges, weights = construct_knn_graph(embeddings, num_neighbors)

print('Creating igraph Graph object...')
# Pass the vertex count explicitly: without it igraph infers the count from the
# largest index in `edges`, which would break the pmid assignment below if the
# last points happened to have no edges.
G = ig.Graph(n=num_points, edges=edges, directed=False)
G.vs['pmid'] = pmids
G.es['weight'] = weights

print('Saving graph...')
os.makedirs(os.path.dirname(graph_file), exist_ok=True)
G.write(graph_file, format='graphml')

Loading embeddings...


100%|██████████| 12/12 [00:07<00:00,  1.59it/s]


Estimated memory usage: 0.00 GB
Constructing k-NN graph...
Performing k-NN search...
Constructing edge list...


100%|██████████| 2298/2298 [00:00<00:00, 7985.26it/s]


Symmetrizing the graph...


100%|██████████| 2298/2298 [00:00<00:00, 23126.52it/s]


Creating igraph Graph object...
Saving graph...


**Community detection**

In [None]:
# Sweep the Leiden (CPM) resolution parameter over the article-similarity graph,
# keep the partition with the highest weighted modularity, and write the cluster
# label of every article to a '*_clustered.csv' file next to the input CSV.
#
# Reads : <graph_directory>/article_similarity.graphml and the article CSV.
# Writes: articles_merged_cleaned_filtered_clustered.csv in the CSV directory.
# Raises: KeyError   if BASEPATH is not defined in the environment,
#         ValueError if the sweep produced no usable partition.
from src.utils.clustering import load_configurations
from src.utils.parsing import parse_directories

# Only extend sys.path once, so re-running the cell does not pile up duplicates.
if '/content/drive/MyDrive/NeuroScape/src' not in sys.path:
    sys.path.append('/content/drive/MyDrive/NeuroScape/src')

load_dotenv("/content/drive/MyDrive/NeuroScape/keys.env")
if 'BASEPATH' not in os.environ:
    # Same exception type as the bare lookup, but with an actionable message.
    raise KeyError(
        "BASEPATH is not set; expected it to be defined in "
        "/content/drive/MyDrive/NeuroScape/keys.env")
BASEPATH = os.environ['BASEPATH']

# Leiden is a randomised optimiser; a fixed seed makes the sweep repeatable.
LEIDEN_SEED = 42
# The sweep stops once modularity has dropped this many times in a row.
PATIENCE = 5

configurations = load_configurations()
community_config = configurations['community_detection']
directories = parse_directories()

csv_directory = '/content/drive/MyDrive/NeuroScape/output/tratados/neuroscience'
graph_directory = os.path.join(
    BASEPATH, directories['internal']['intermediate']['graphs'])

csv_file = os.path.join(csv_directory,
                        'articles_merged_cleaned_filtered.csv')

graph_file = os.path.join(graph_directory, 'article_similarity.graphml')

print('loading graph')
G = ig.Graph.Read(graph_file, format='graphml')
pmids = G.vs['pmid']

print(
    'running Leiden community detection for different resolution parameters'
)
num_resolution_parameter = community_config['num_resolution_parameter']
max_resolution_parameter = community_config['max_resolution_parameter']
min_resolution_parameter = max_resolution_parameter / num_resolution_parameter

resolution_parameters = np.linspace(min_resolution_parameter,
                                    max_resolution_parameter,
                                    num_resolution_parameter)

modularity_values = np.zeros(num_resolution_parameter)
# Integer dtype so the counts print as "6" rather than "6.0".
num_unique_clusters = np.zeros(num_resolution_parameter, dtype=int)

# Sliding window over the last PATIENCE "did modularity drop?" flags;
# maxlen makes append() discard the oldest flag automatically.
decreasing = deque([False] * PATIENCE, maxlen=PATIENCE)

# Track the best partition while sweeping. This avoids (a) taking the argmax
# over slots that were never evaluated after an early break and (b) re-running
# the randomised optimiser, which could return a different partition than the
# one whose modularity was reported.
best_modularity = -np.inf
best_resolution_parameter = None
best_partition = None

for i, resolution_parameter in enumerate(resolution_parameters):
    partition = leidenalg.find_partition(
        G,
        leidenalg.CPMVertexPartition,
        weights='weight',
        resolution_parameter=resolution_parameter,
        seed=LEIDEN_SEED)

    modularity_values[i] = G.modularity(partition.membership,
                                        weights='weight')
    num_unique_clusters[i] = len(np.unique(partition.membership))
    print(f'number of unique clusters: {num_unique_clusters[i]}')
    print(f'modularity values: {modularity_values[i]}')

    # Strict '>' keeps the first maximum, matching np.argmax tie-breaking.
    if modularity_values[i] > best_modularity:
        best_modularity = modularity_values[i]
        best_resolution_parameter = resolution_parameter
        best_partition = partition

    if i > 0:
        decreasing.append(modularity_values[i] < modularity_values[i - 1])

    if all(decreasing):
        print('modularity is decreasing, breaking')
        break

if best_partition is None:
    raise ValueError(
        'no partition was evaluated; check the community_detection configuration')

partition = best_partition

# NOTE(review): the lookup below only matches when the 'pmid' values read back
# from GraphML compare equal to the CSV 'Pmid' values (e.g. not str vs int) —
# confirm; unmatched rows are reported after the mapping.
pmid_cluster = dict(zip(pmids, partition.membership))

print('saving cluster')
df = pd.read_csv(csv_file)
df['Cluster ID'] = df['Pmid'].map(pmid_cluster)

num_unmapped = int(df['Cluster ID'].isna().sum())
if num_unmapped:
    print(f'warning: {num_unmapped} of {len(df)} articles have no cluster '
          '(Pmid not found in graph)')

# Insert the suffix before the extension only; str.replace would also rewrite
# any other '.csv' occurrence in the path.
csv_root, csv_extension = os.path.splitext(csv_file)
new_csv_file = f'{csv_root}_clustered{csv_extension}'

df.to_csv(new_csv_file, index=False)

loading graph
running Leiden community detection for different resolution parameters
number of unique clusters: 1.0
modularity values: 2.220446049250313e-16
number of unique clusters: 1.0
modularity values: 2.220446049250313e-16
number of unique clusters: 2.0
modularity values: 0.4383308009084261
number of unique clusters: 2.0
modularity values: 0.4439423512722538
number of unique clusters: 3.0
modularity values: 0.4709014707588849
number of unique clusters: 3.0
modularity values: 0.5524550631472556
number of unique clusters: 4.0
modularity values: 0.5995278702743098
number of unique clusters: 5.0
modularity values: 0.6183030926340735
number of unique clusters: 6.0
modularity values: 0.6607716276665074
number of unique clusters: 6.0
modularity values: 0.6596201635592188
number of unique clusters: 6.0
modularity values: 0.6641362937071349
number of unique clusters: 7.0
modularity values: 0.668476775516933
number of unique clusters: 8.0
modularity values: 0.6780492633346127
number of uni