In [None]:
"""
IMPORTANT:
This project is designed to run exclusively on Google Colab.

It relies on Google Drive being mounted at:
    /content/drive/MyDrive/

Local execution is not supported.
"""


**INSTALLING LIBRARIES**

In [None]:
!pip install igraph

Collecting igraph
  Downloading igraph-0.11.9-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting texttable>=1.6.2 (from igraph)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading igraph-0.11.9-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Installing collected packages: texttable, igraph
Successfully installed igraph-0.11.9 texttable-1.7.0


**MOUNTING GOOGLE DRIVE**

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab environment to access project files.
# force_remount=True is passed so that re-running this cell mounts again
# even if a mount already exists.
drive.mount('/content/drive', force_remount=True)

# Change the current working directory to the NeuroScape project folder in Google Drive.
# NOTE(review): the recorded output of this cell shows the directory resolving to a
# `.shortcut-targets-by-id/...` path, so the folder appears to be a Drive shortcut.
# Later `from src...` imports presumably rely on this working directory -- confirm.
%cd /content/drive/MyDrive/NeuroScape

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1vM2NZYPQBx0CCXgmkPKl1lguMSLh85MO/NeuroScape


**IMPORTING LIBRARIES**

In [None]:
import os
import sys
import pandas as pd
import igraph as ig

**Cluster Density**

In [None]:

# Add the 'src' directory to the system path *before* the project-specific
# imports below run (it was previously appended only after them), and only
# once, so re-running this cell does not keep growing sys.path.
if '/content/drive/MyDrive/NeuroScape/src' not in sys.path:
    sys.path.append('/content/drive/MyDrive/NeuroScape/src')

from glob import glob
from tqdm import tqdm
# NOTE(review): the star import presumably provides load_configurations,
# node_analysis and edge_analysis used below -- confirm and import them by name.
from src.utils.cluster_graph import *
from src.utils.parsing import parse_directories
from src.utils.load_and_save import load_articles_from_hdf5

from dotenv import load_dotenv, find_dotenv

# Load environment variables from the .env file
load_dotenv(find_dotenv())
BASEPATH = os.environ['BASEPATH']

if __name__ == '__main__':
    # Cluster-density pipeline:
    #   1. read the article and cluster CSV tables,
    #   2. collect per-article citation links from the HDF5 shards,
    #   3. run node_analysis per cluster and edge_analysis per ordered cluster pair,
    #   4. write the enriched cluster table to '<name>_density.csv',
    #   5. write the weighted, directed cluster graph as GraphML.

    # `configurations` is not used in this cell; the call is kept in case
    # load_configurations() has side effects -- TODO confirm and remove.
    configurations = load_configurations()
    directories = parse_directories()

    print('Loading csv files...')

    csv_directory = '/content/drive/MyDrive/NeuroScape/output/tratados/neuroscience'

    article_csv_file = os.path.join(
        csv_directory, 'articles_with_citation_rate.csv')

    cluster_csv_file = os.path.join(
        csv_directory,
        'clusters_defined_distinguished_questions_trends_assessed.csv')

    graph_directory = os.path.join(
        BASEPATH, directories['internal']['intermediate']['graphs'])
    graph_file = os.path.join(graph_directory, 'citation_density.graphml')
    article_df = pd.read_csv(article_csv_file)
    cluster_df = pd.read_csv(cluster_csv_file)

    print('Loading articles...')
    shard_directory = os.path.join(
        BASEPATH, directories['internal']['intermediate']['hdf5']['neuro'])
    article_files = glob(os.path.join(shard_directory, '*.h5'))

    # Build the Pmid -> Cluster ID lookup once instead of scanning article_df
    # for every article. keep='first' mirrors the previous `.values[0]`
    # behaviour when a Pmid occurs more than once.
    cluster_by_pmid = (
        article_df.drop_duplicates(subset='Pmid', keep='first')
        .set_index('Pmid')['Cluster ID']
        .to_dict())

    article_graph = {
        'Pmid': [],
        'Cluster ID': [],
        'Year': [],
        'Age': [],
        'In_links': [],
        'Out_links': []
    }

    for file in tqdm(article_files):
        articles = load_articles_from_hdf5(file, disable_tqdm=True)
        for article in articles:
            # A Pmid absent from the article CSV raises KeyError naming that Pmid.
            cluster_id = cluster_by_pmid[article.pmid]
            article_graph['Pmid'].append(article.pmid)
            article_graph['Cluster ID'].append(cluster_id)
            article_graph['Year'].append(article.year)
            article_graph['Age'].append(article.age)
            article_graph['In_links'].append(article.in_links)
            article_graph['Out_links'].append(article.out_links)

    article_graph_df = pd.DataFrame(article_graph)

    cluster_df['Reference Krackhardt'] = 0.0
    cluster_df['Citation Krackhardt'] = 0.0
    cluster_df['Most Cited Cluster'] = ''
    cluster_df['Most Citing Cluster'] = ''

    # Cluster IDs in table order; reused for both loops and for the graph.
    cluster_ids = cluster_df['Cluster ID'].tolist()

    edge_list = []
    weights_list = []

    print('Performing density analyses...')
    for source_cluster in tqdm(cluster_ids):
        krackhardt, frequent_clusters = node_analysis(article_graph_df,
                                                      source_cluster)

        # Row mask for this cluster, computed once for all four assignments.
        is_source = cluster_df['Cluster ID'] == source_cluster
        cluster_df.loc[is_source,
                       'Reference Krackhardt'] = krackhardt['References']
        cluster_df.loc[is_source,
                       'Citation Krackhardt'] = krackhardt['Citations']
        cluster_df.loc[is_source,
                       'Most Cited Cluster'] = frequent_clusters['References']
        cluster_df.loc[is_source,
                       'Most Citing Cluster'] = frequent_clusters['Citations']

        # One weighted edge per ordered (source, destination) pair,
        # self-pairs included.
        for destination_cluster in cluster_ids:
            edge = (source_cluster, destination_cluster)
            edge_list.append(edge)
            weights_list.append(edge_analysis(article_graph_df, edge))

    # Most Cited Cluster and Most Citing Cluster before Most Similar Cluster
    # The four columns added above sit at the end of the frame and are moved
    # to start at position 6.
    # NOTE(review): assumes 'Most Similar Cluster' is the 7th column of the
    # input CSV -- confirm against the file.
    columns = cluster_df.columns.tolist()
    columns = columns[:6] + columns[-4:] + columns[6:-4]
    cluster_df = cluster_df[columns]

    print('Saving csv file...')
    cluster_csv_file = cluster_csv_file.replace('.csv', '_density.csv')
    cluster_df.to_csv(cluster_csv_file, index=False)

    print('Creating igraph Graph object')
    # igraph addresses vertices by consecutive indices 0..n-1. Map every
    # distinct Cluster ID to such an index explicitly instead of using the raw
    # ID as the index, so that no phantom vertices are created and the labels
    # stay aligned even when the IDs are not exactly 0..n-1 in table order.
    vertex_ids = list(dict.fromkeys(cluster_ids))
    vertex_index = {
        cluster_id: index for index, cluster_id in enumerate(vertex_ids)
    }
    indexed_edges = [(vertex_index[source], vertex_index[destination])
                     for source, destination in edge_list]

    reference_density_graph = ig.Graph(
        n=len(vertex_ids), edges=indexed_edges, directed=True)
    # Edge order is unchanged, so weights_list still lines up with the edges.
    reference_density_graph.es['weight'] = weights_list
    reference_density_graph.vs['label'] = vertex_ids

    print('Saving graphs...')
    reference_density_graph.save(graph_file, format='graphml')

Loading csv files...
Loading articles...


100%|██████████| 12/12 [00:07<00:00,  1.55it/s]


Performing density analyses...


100%|██████████| 9/9 [00:29<00:00,  3.26s/it]

Saving csv file...
Creating igraph Graph object
Saving graphs...



