In [None]:
"""
IMPORTANT:
This project is designed to run exclusively on Google Colab.

It relies on Google Drive being mounted at:
    /content/drive/MyDrive/

Local execution is not supported.
"""


**INSTALLING LIBRARIES**

In [None]:
!pip install biopython          # Provides access to biological databases, including PubMed, through the Entrez API

!pip install sentence-transformers  # Generates text embeddings (vector representations that capture semantic meaning)

!pip install faiss-cpu          # Library for efficient vector indexing, similarity search, and clustering

!pip install habanero           # Interface to the CrossRef API (retrieves article metadata, DOIs, and citation info)

!pip install crossrefapi        # Alternative CrossRef client for metadata and citation retrieval

Collecting biopython
  Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting leidenalg
  Downloading leidenalg-0.10.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting python-igraph
  Downloading python_igraph-0.11.9-py3-none-any.whl.metadata (3.1 kB)
Collecting igraph<0.12,>=0.10.0 (from leidenalg)
  Downloading igraph-0.11.9-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting texttable>=1.6.2 (from igraph<0.12,>=0.10.0->leidenalg)
  Downl

**MOUNTS DRIVE**

In [None]:
from google.colab import drive
# Mount the user's Google Drive under /content/drive so the project files become visible to this runtime.
drive.mount('/content/drive')  # Mounts Google Drive into the Colab environment to access project files

# Changes the current working directory to the NeuroScape project folder in Google Drive.
# NOTE(review): presumably this is what makes the `from src...` imports in the following cells
# resolvable (the project root becomes the working directory) - confirm.
%cd /content/drive/MyDrive/NeuroScape

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1vM2NZYPQBx0CCXgmkPKl1lguMSLh85MO/NeuroScape


**IMPORTING LIBRARIES**

In [None]:
# --- Core Libraries ---
from Bio import Entrez                      # Provides access to PubMed through the NCBI Entrez API for data ingestion
import pandas as pd                         # Used for handling tabular data (titles, abstracts, metadata, etc.)
from time import sleep                      # Adds pauses between API requests to avoid rate limiting
from sentence_transformers import SentenceTransformer  # Used to generate text embeddings for scientific abstracts
import numpy as np                           # Fundamental library for numerical operations and matrix manipulation
import torch                                 # Deep learning framework used for tensor operations and model inference
import os, glob, sys                         # Standard Python libraries for file, path, and system-level operations

# --- Project Utility Modules (Custom scripts under /src/utils) ---
from src.utils.parsing import *              # Functions for parsing and structuring raw article data (XML/JSON from PubMed)
from src.utils.checkpoints import *          # Handles saving and loading intermediate progress during ingestion
from src.utils.scraping import *             # Contains web-scraping utilities for additional metadata retrieval
from src.utils.load_and_save import determine_output_filename  # Utility to dynamically define output file names
from src.utils.cleaning import *             # Text normalization and cleaning functions (remove punctuation, brackets, etc.)
from src.utils.initial_embedding import *    # Handles initial embedding generation (text-to-vector conversion)

# --- Project Class Definitions ---
from src.classes.article_metadata import ArticleMetadata  # Data structure to store and organize metadata for each article
from src.classes.data_types import Embeddings             # Class representing and managing text embeddings

# --- Environment & Configuration ---
from dotenv import load_dotenv, find_dotenv   # Loads environment variables (e.g., API keys, paths) from a .env file

# --- Progress Visualization ---
from tqdm import tqdm                         # Progress bar for loops and long-running ingestion or embedding tasks

# --- Transformer Models (optional alternative to SentenceTransformers) ---
from transformers import BertModel, AutoTokenizer  # Used for direct embedding extraction or fine-tuning using BERT models

**NEUROSCIENCE SCRAPING**

In [None]:
"""
This script is designed to automate the process of scraping journal article metadata and abstracts from PubMed.
"""
from src.utils.parsing import *
from src.utils.checkpoints import *
from src.utils.scraping import *
from src.utils.load_and_save import determine_output_filename

# Add the 'src' directory to the Python path.
# NOTE(review): the `from src...` imports at the top of this cell run before this line, so the
# append only affects imports executed afterwards - confirm whether it is still required.
sys.path.append('/content/drive/MyDrive/NeuroScape/src')

# Prevent argparse-related errors when running notebooks in Google Colab
# Colab automatically injects command-line arguments that can interfere with scripts using argparse
sys.argv = [sys.argv[0]]

# Read BASEPATH and EMAIL from the project's env file; a missing key raises KeyError here.
load_dotenv("/content/drive/MyDrive/NeuroScape/keys.env")
BASEPATH = os.environ['BASEPATH']
EMAIL = os.environ['EMAIL']

if __name__ == '__main__':
    # Scrapes PubMed metadata for every journal of the selected quartile listed in the Scimago
    # CSV files, writing the results as CSV shards and keeping a checkpoint of handled PubMed IDs.

    # Client used for every per-article metadata request below
    articlemetadata = ArticleMetadata()

    # Load the configurations and unpack the scraping parameters
    directories = parse_directories()
    scraping_parameters = load_configurations()
    prefix, suffix, sleep_time, num_attempts, items_per_shard = unpack_scraping_parameters(
        scraping_parameters)
    discipline = parse_discipline()
    quartile = parse_quartile()
    # Cap on the number of PubMed IDs requested per journal/year query.
    # Hard-coded here instead of being read through parse_max_results().
    max_results = 50

    print(f'Scraping data for {discipline}...')

    # Define the input, checkpoint, and output locations
    input_folder = "/content/drive/MyDrive/NeuroScape/scimago/neuroscience"
    input_files = glob.glob(os.path.join(input_folder, "*.csv"))

    checkpoints_folder = "/content/drive/MyDrive/NeuroScape/output/checkpoints/neuroscience"
    lut_file = "/content/drive/MyDrive/NeuroScape/data/internal/reference/neuroscience/journal_lut.csv"

    output_folder = "/content/drive/MyDrive/NeuroScape/output/neuroscience"
    os.makedirs(output_folder, exist_ok=True)

    # Determine the output file
    output_file, shard_id = determine_output_filename(output_folder, 'csv')

    # Initialize the data dictionary
    data = reset_data()

    # Load the lookup table for relating Scimago and PubMed journal names
    lut = pd.read_csv(lut_file)

    # Load the processed articles
    processed_file = os.path.join(checkpoints_folder, 'scraped_articles.json')
    processed_articles = load_processed_articles(processed_file)

    # Initialize the number of items in the current shard
    num_items = 0
    print('Searching for articles...')

    print("Input folder:", input_folder)
    print("Files found:", input_files)
    print("Discipline:", discipline)
    print("Quartile filter:", quartile)

    # Loop through each input (scimago) file
    for file in sorted(input_files):
        # The year is the text between the configured prefix and suffix of the file name
        year = file.split(prefix)[1].split(suffix)[0].strip()
        print(f' Year: {year}')

        # Load the dataframe from the input file and keep only the requested quartile
        scimago_df = pd.read_csv(file, sep=';')
        scimago_df = scimago_df[scimago_df['SJR Best Quartile'] == quartile]

        # Loop through each journal falling within the specified quartile
        for journal in scimago_df['Title']:
            print(f' Journal: {journal}')

            # Get the disciplines this journal falls under
            disciplines = scimago_df[scimago_df['Title'] ==
                                     journal]['Areas'].values[0].replace(
                                         ';', ' /')

            # Check if the journal has an alternate name
            query_journal = check_alternate_journal_names(journal, lut)

            # Define the PubMed query
            query = f"""
            ("{query_journal}"[Journal]) AND  (("{year}/01/01"[Date - Publication] : "{year}/12/31"[Date - Publication]))
            """

            # Try to get the PubMed IDs for the query.
            # Reset first so that a query failing on every attempt cannot silently reuse the
            # IDs of the previous journal (or raise NameError on the very first journal).
            pubmed_ids = None
            for _ in range(num_attempts):
                try:
                    pubmed_ids = get_id_list(query,
                                             EMAIL,
                                             max_results=max_results)
                    break
                except Exception:
                    # `Exception` (not a bare except) so that interrupting the kernel still stops the run
                    sleep(sleep_time)

            # If no PubMed IDs were found, continue to the next journal
            if pubmed_ids is None:
                continue

            # Initialize the number of obtained articles
            total_articles = len(pubmed_ids)
            obtained_articles = 0
            print(f' Total articles found: {total_articles}')

            # Loop through each PubMed ID
            for article_id in pubmed_ids:

                # If the article has already been processed, continue to the next article
                if article_id in processed_articles:
                    obtained_articles += 1
                    continue

                # Add the article to the set of processed articles.
                # This happens before the fetch, so an article whose fetch fails is not retried later.
                processed_articles.add(article_id)

                # Try to fetch the metadata for the article.
                # Reset first so that a failed fetch cannot store the previous article's metadata
                # under this article's ID.
                metadata = None
                for _ in range(num_attempts):
                    try:
                        metadata = articlemetadata.fetch(article_id)
                        break
                    except Exception:
                        sleep(sleep_time)

                # If metadata was found, add it to the data dictionary
                if metadata is not None:
                    data = update_data(article_id, data, metadata, disciplines)
                    obtained_articles += 1
                    num_items += 1

                # If the number of items in the current shard is equal to the items per shard,
                # save the data and update the output file (new shard)
                if num_items == items_per_shard:
                    save_data(data, output_file)
                    save_processed_articles(processed_file, processed_articles)

                    data = reset_data()
                    shard_id = shard_id + 1
                    output_file = os.path.join(output_folder,
                                               f'shard_{shard_id:04d}.csv')
                    num_items = 0
            print(f' Articles obtained: {obtained_articles}')

    # Flush the last, partially filled shard: without this, every article collected after the
    # final full shard would be dropped when the loops finish.
    if num_items > 0:
        save_data(data, output_file)
        save_processed_articles(processed_file, processed_articles)

Scraping data for Neuroscience...
Searching for articles...
Input folder: /content/drive/MyDrive/NeuroScape/scimago/neuroscience
Files found: ['/content/drive/MyDrive/NeuroScape/scimago/neuroscience/scimagojr 2023  Subject Area - Neuroscience.csv', '/content/drive/MyDrive/NeuroScape/scimago/neuroscience/scimagojr 2022  Subject Area - Neuroscience.csv', '/content/drive/MyDrive/NeuroScape/scimago/neuroscience/scimagojr 2020  Subject Area - Neuroscience.csv', '/content/drive/MyDrive/NeuroScape/scimago/neuroscience/scimagojr 2021  Subject Area - Neuroscience.csv']
Discipline: Neuroscience
Quartile filter: Q1
 Year: 2020
 Journal: Nature Neuroscience
Query sendo enviada para PubMed: 
            ("Nature Neuroscience"[Journal]) AND  (("2020/01/01"[Date - Publication] : "2020/12/31"[Date - Publication]))
            
IDs retornados: ['33230329', '31959936', '32341542', '32632286', '32778792', '33288908', '32661395', '32341540', '32632287', '33169032', '32066983', '32112058', '32989295', '333

KeyboardInterrupt: 

**SCRAPING OTHERDISCIPLINES**

In [None]:
"""
This script is designed to automate the process of scraping journal article metadata and abstracts from PubMed.
"""
from src.utils.parsing import *
from src.utils.checkpoints import *
from src.utils.scraping import *
from src.utils.load_and_save import determine_output_filename

%cd /content/drive/MyDrive/NeuroScape

# Add the 'src' directory to the Python path
# This allows importing custom project modules (e.g., src.utils, src.classes) without import errors
sys.path.append('/content/drive/MyDrive/NeuroScape/src')

# Prevent argparse-related errors when running notebooks in Google Colab
# Colab automatically injects command-line arguments that can interfere with scripts using argparse
sys.argv = [sys.argv[0]]

articlemetadata = ArticleMetadata()

load_dotenv("/content/drive/MyDrive/NeuroScape/keys.env")
BASEPATH = os.environ['BASEPATH']
EMAIL = os.environ['EMAIL']

if __name__ == '__main__':

    # Initialize the ArticleMetadata
    articlemetadata = ArticleMetadata()

    # Load the configurations and unpack the scraping parameters
    directories = parse_directories()
    scraping_parameters = load_configurations()
    prefix, suffix, sleep_time, num_attempts, items_per_shard = unpack_scraping_parameters(
        scraping_parameters)
    discipline = 'Computer Science'
    quartile = parse_quartile()
    #max_results = parse_max_results()
    max_results = 50

    print(f'Scraping data for {discipline}...')

    # Define the base directory and the input, processed, and output folders
    input_folder = "/content/drive/MyDrive/NeuroScape/scimago/otherdisciplines"
    input_files = glob.glob(os.path.join(input_folder, "*.csv"))

    checkpoints_folder = "/content/drive/MyDrive/NeuroScape/output/checkpoints/otherdisciplines"
    lut_file = "/content/drive/MyDrive/NeuroScape/data/internal/reference/otherdisciplines/journal_lut_otherdisciplines.csv"

    output_folder = "/content/drive/MyDrive/NeuroScape/output/otherdisciplines"
    os.makedirs(output_folder, exist_ok=True)

    # Determine the output file
    output_file, shard_id = determine_output_filename(output_folder, 'csv')

    # Initialize the data dictionary
    data = reset_data()

    # Load the lookup table for relating Scimago and PubMed journal names
    lut = pd.read_csv(lut_file)

    # Load the processed articles
    processed_file = os.path.join(checkpoints_folder, 'scraped_articles.json')
    processed_articles = load_processed_articles(processed_file)

    # Initialize the number of items in the current shard
    num_items = 0
    print('Searching for articles...')

    print("Input folder:", input_folder)
    print("Files found:", input_files)
    print("Discipline:", discipline)
    print("Quartile filter:", quartile)

    # Loop through each input (scimago) file
    for file in sorted(input_files):
        year = file.split(prefix)[1].split(suffix)[0].strip()
        print(f' Year: {year}')

        # Load the dataframe from the input file
        scimago_df = pd.read_csv(file, sep=';')
        scimago_df = scimago_df[scimago_df['SJR Best Quartile'] == quartile]

        # Loop through each journal falling within the specified quartile
        for journal in scimago_df['Title']:
            print(f' Journal: {journal}')

            # Get the disciplines this journal falls under
            disciplines = scimago_df[scimago_df['Title'] ==
                                     journal]['Areas'].values[0].replace(
                                         ';', ' /')

            # Check if the journal has an alternate name
            query_journal = check_alternate_journal_names(journal, lut)

            # Define the PubMed query
            query = f"""
            ("{query_journal}"[Journal]) AND  (("{year}/01/01"[Date - Publication] : "{year}/12/31"[Date - Publication]))
            """

            # Try to get the PubMed IDs for the query
            for _ in range(num_attempts):
                try:
                    pubmed_ids = get_id_list(query,
                                             EMAIL,
                                             max_results=max_results)
                    break
                except:
                    sleep(sleep_time)

            # If no PubMed IDs were found, continue to the next journal
            if pubmed_ids is None:
                continue

            # Initialize the number of obtained articles
            total_articles = len(pubmed_ids)
            obtained_articles = 0
            print(f' Total articles found: {total_articles}')

            # Loop through each PubMed ID
            for article_id in pubmed_ids:

                # If the article has already been processed, continue to the next article
                if article_id in processed_articles:
                    obtained_articles += 1
                    continue

                # Add the article to the set of processed articles
                processed_articles.add(article_id)

                # Try to fetch the metadata for the article
                for _ in range(num_attempts):
                    try:
                        metadata = articlemetadata.fetch(article_id)
                        break
                    except:
                        sleep(sleep_time)

                # If metadata was found, add it to the data dictionary
                if metadata is not None:
                    data = update_data(article_id, data, metadata, disciplines)
                    obtained_articles += 1
                    num_items += 1

                # If the number of items in the current shard is equal to the items per shard,
                # save the data and update the output file (new shard)
                if (num_items == items_per_shard):
                    save_data(data, output_file)
                    save_processed_articles(processed_file, processed_articles)

                    data = reset_data()
                    shard_id = shard_id + 1
                    output_file = os.path.join(output_folder,
                                               f'shard_{shard_id:04d}.csv')
                    num_items = 0
            print(f' Articles obtained: {obtained_articles}')

/content/drive/MyDrive/Trabalhos/TJ/NeuroScape
Scraping data for Computer Science...
Searching for articles...
Input folder: /content/drive/MyDrive/NeuroScape/scimago/otherdisciplines
Files found: ['/content/drive/MyDrive/NeuroScape/scimago/otherdisciplines/scimagojr 2019  Subject Area - Computer Science.csv', '/content/drive/MyDrive/NeuroScape/scimago/otherdisciplines/scimagojr 2018  Subject Area - Computer Science.csv', '/content/drive/MyDrive/NeuroScape/scimago/otherdisciplines/scimagojr 2017  Subject Area - Computer Science.csv']
Discipline: Computer Science
Quartile filter: Q1
 Year: 2017
 Journal: Journal of Statistical Software
 Total articles found: 8
 Articles obtained: 8
 Journal: Molecular Systems Biology
 Total articles found: 50
 Articles obtained: 50
 Journal: Bioinformatics
 Total articles found: 50


KeyboardInterrupt: 

**NEUROSCIENCE MERGE AND CLEAN**

In [None]:
from src.utils.cleaning import *
from src.utils.parsing import parse_directories, parse_discipline

# Add the 'src' directory to the system path.
# NOTE(review): the `from src...` imports at the top of this cell run before this line, so the
# append only affects imports executed afterwards - confirm whether it is still required.
sys.path.append('/content/drive/MyDrive/NeuroScape/src')

# Same guard as the scraping cells: drop the command-line arguments of the notebook kernel
# before the parse_* helpers are called, so this cell does not depend on a scraping cell having run.
sys.argv = [sys.argv[0]]

# Load environment variables.
# find_dotenv() only looks for a file named `.env`; the scraping cells read the project's
# `keys.env` explicitly, so it is loaded here as well. load_dotenv() does not override
# variables that are already set, so values loaded earlier keep precedence.
load_dotenv(find_dotenv())
load_dotenv("/content/drive/MyDrive/NeuroScape/keys.env")
BASEPATH = os.environ['BASEPATH']

if __name__ == '__main__':
    # Merges the raw neuroscience CSV shards, cleans and sorts the result, and writes a single CSV.

    # NOTE(review): `directories` and `discipline` are not used further in this cell; the calls
    # are kept because the helpers' side effects are not visible from here.
    directories = parse_directories()
    config = load_configurations()

    # Filtering thresholds handed to clean_dataframe, taken from the configuration
    cutoffs = (
        config['word_limit']['lower'],       # Lower word limit
        config['word_limit']['upper'],       # Upper word limit
        config['year_cutoff']                # Year cutoff
    )

    discipline = parse_discipline()

    raw_directory = "/content/drive/MyDrive/NeuroScape/output/neuroscience"       # Folder containing raw CSV files from PubMed ingestion
    cleaned_directory = "/content/drive/MyDrive/NeuroScape/output/tratados/neuroscience"  # Folder to store cleaned and processed files

    # Retrieve all CSV files from the raw directory.
    # glob returns files in arbitrary order; sorting makes the merge order reproducible.
    raw_files = sorted(glob.glob(os.path.join(raw_directory, '*.csv')))

    print(f'Merging {len(raw_files)} files.')
    if not raw_files:
        # Fail with a clear message instead of an opaque error further down
        raise FileNotFoundError(f'No raw CSV files found in {raw_directory}.')

    df = concatenate_files(raw_files)
    df['Pmid'] = df['Pmid'].astype(int)      # Cast PubMed IDs to integers (raises if a value is missing)

    print(f'Cleaning the merged dataframe with {len(df)} articles.')
    df = clean_dataframe(df, cutoffs)

    print(f'Number of articles after cleaning: {len(df)}')

    print('Sorting the clean dataframe.')
    df = sort_dataframe(df)

    print(f'Saving the clean dataframe to {cleaned_directory}.')
    os.makedirs(cleaned_directory, exist_ok=True)  # Creates the output directory if it does not exist

    # Save the final cleaned dataset to CSV format
    df.to_csv(
        os.path.join(cleaned_directory, 'articles_merged_cleaned.csv'),
        index=False
    )

Merging 84 files.


 74%|███████▍  | 62/84 [00:14<00:05,  4.26it/s]


KeyboardInterrupt: 

**OTHERDISCIPLINES MERGE AND CLEAN**

In [None]:
from src.utils.cleaning import *
from src.utils.parsing import parse_directories, parse_discipline

# Add the root 'src' directory to the system path.
# NOTE(review): the `from src...` imports at the top of this cell run before this line, so the
# append only affects imports executed afterwards - confirm whether it is still required.
sys.path.append('/content/drive/MyDrive/NeuroScape/src')

# Same guard as the scraping cells: drop the command-line arguments of the notebook kernel
# before the parse_* helpers are called, so this cell does not depend on a scraping cell having run.
sys.argv = [sys.argv[0]]

# Load environment variables.
# find_dotenv() only looks for a file named `.env`; the scraping cells read the project's
# `keys.env` explicitly, so it is loaded here as well. load_dotenv() does not override
# variables that are already set, so values loaded earlier keep precedence.
load_dotenv(find_dotenv())
load_dotenv("/content/drive/MyDrive/NeuroScape/keys.env")
BASEPATH = os.environ['BASEPATH']

if __name__ == '__main__':
    # Merges the raw other-disciplines CSV shards, cleans and sorts the result, and writes a single CSV.

    # NOTE(review): `directories` and `discipline` are not used further in this cell; the calls
    # are kept because the helpers' side effects are not visible from here.
    directories = parse_directories()
    config = load_configurations()

    # Filtering thresholds handed to clean_dataframe, taken from the configuration
    cutoffs = (
        config['word_limit']['lower'],       # Lower word limit
        config['word_limit']['upper'],       # Upper word limit
        config['year_cutoff']                # Year cutoff
    )

    discipline = parse_discipline()

    # Define input (raw) and output (cleaned) directories
    raw_directory = "/content/drive/MyDrive/NeuroScape/output/otherdisciplines"
    cleaned_directory = "/content/drive/MyDrive/NeuroScape/output/tratados/otherdisciplines"

    # Retrieve all CSV files from the raw data folder.
    # glob returns files in arbitrary order; sorting makes the merge order reproducible.
    raw_files = sorted(glob.glob(os.path.join(raw_directory, '*.csv')))

    # --- Merge raw CSV files into one DataFrame ---
    print(f'Merging {len(raw_files)} files.')
    if not raw_files:
        # Fail with a clear message instead of an opaque error further down
        raise FileNotFoundError(f'No raw CSV files found in {raw_directory}.')

    df = concatenate_files(raw_files)
    df['Pmid'] = df['Pmid'].astype(int)      # Cast PubMed IDs to integers (raises if a value is missing)

    # --- Clean the merged dataset ---
    print(f'Cleaning the merged dataframe with {len(df)} articles.')
    df = clean_dataframe(df, cutoffs)

    print(f'Number of articles after cleaning: {len(df)}')

    # --- Sort and save the cleaned DataFrame ---
    print('Sorting the clean dataframe.')
    df = sort_dataframe(df)

    print(f'Saving the clean dataframe to {cleaned_directory}.')
    os.makedirs(cleaned_directory, exist_ok=True)  # Creates output directory if it does not exist

    # Save the final cleaned dataset to CSV format
    df.to_csv(
        os.path.join(cleaned_directory, 'articles_merged_cleaned.csv'),
        index=False
    )

Merging 72 files.


 22%|██▏       | 16/72 [00:03<00:12,  4.63it/s]


KeyboardInterrupt: 

**NEUROSCIENCE EMBEDDING WITH all-MiniLM-L6-v2**

In [None]:
from src.utils.initial_embedding import *
from src.classes.data_types import Embeddings
from src.utils.parsing import parse_directories, parse_discipline
from src.utils.load_and_save import determine_output_filename
from src.utils.checkpoints import save_processed_articles, load_processed_articles

# Add the root 'src' directory to the system path.
# NOTE(review): the `from src...` imports at the top of this cell run before this line, so the
# append only affects imports executed afterwards - confirm whether it is still required.
sys.path.append('/content/drive/MyDrive/NeuroScape/src')


class SentenceTransformerEmbeddings:
    """Small adapter exposing a SentenceTransformer model through ``embed_documents``."""

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=32):
        """Load the model and remember the batch size.

        Args:
            model_name: identifier handed to ``SentenceTransformer``.
            batch_size: value forwarded as ``batch_size`` on every ``encode`` call.
        """
        self.model = SentenceTransformer(model_name)
        self.batch_size = batch_size

    def embed_documents(self, texts):
        """Encode ``texts`` with the wrapped model.

        Args:
            texts: the documents to embed; passed to ``encode`` unchanged.

        Returns:
            Whatever ``self.model.encode`` returns when called with the progress bar
            disabled and ``convert_to_numpy=True``.
        """
        encode_options = {
            "batch_size": self.batch_size,
            "show_progress_bar": False,
            "convert_to_numpy": True,
        }
        return self.model.encode(texts, **encode_options)


if __name__ == "__main__":
    # Embeds the cleaned neuroscience abstracts shard by shard, saving each shard of embeddings
    # and recording the embedded PubMed IDs so that a re-run skips them.

    # The loop below calls `time.sleep`, but the notebook's import cell only provides
    # `from time import sleep`; import the module here so the call cannot fail with NameError.
    import time

    # Same guard as the scraping cells: drop the kernel's command-line arguments before the
    # parse_* helpers are called.
    sys.argv = [sys.argv[0]]

    # Load project directories and embedding configuration parameters
    directories = parse_directories()
    embedding_parameters = load_configurations()

    # Extract model and batching parameters from configuration.
    # NOTE(review): `model_name` from the configuration is not used below; the model is hard-coded.
    model_name, sleep_time, batch_size, items_per_shard = unpack_embedding_parameters(
        embedding_parameters
    )

    # NOTE(review): `BASEPATH` is not defined in this cell, it must come from a cell executed
    # earlier. `checkpoints_folder` and `df_dir` are not used further in this cell.
    checkpoints_folder = os.path.join(BASEPATH, directories['internal']['checkpoints'])
    discipline = parse_discipline()

    df_dir = os.path.join(
        BASEPATH,
        directories['internal']['intermediate']['csv'],
        discipline
    )

    embedding_dir = '/content/drive/MyDrive/NeuroScape/output/embeddings/neuroscience'

    # Load the cleaned dataset (previously processed in the ingestion stage)
    file = '/content/drive/MyDrive/NeuroScape/output/tratados/neuroscience/articles_merged_cleaned.csv'
    df = pd.read_csv(file)

    # Initialize the embedding model
    embedding_model = SentenceTransformerEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        batch_size=batch_size
    )

    # Load the list of already embedded articles to skip redundant processing
    embedded_articles_file = os.path.join(
        '/content/drive/MyDrive/NeuroScape/data/internal/checkpoints/neuroscience',
        'embedded_articles.json'
    )
    embedded_articles = load_processed_articles(embedded_articles_file)

    # Remove already embedded articles from the dataframe
    df = df[~df['Pmid'].isin(embedded_articles)]

    # Ensure the embedding output directory and the checkpoint directory exist, so neither
    # the first shard nor the first checkpoint write can fail on a missing folder.
    os.makedirs(embedding_dir, exist_ok=True)
    os.makedirs(os.path.dirname(embedded_articles_file), exist_ok=True)

    # Determine initial output filename for embedding shards
    output_file, shard_id = determine_output_filename(embedding_dir, 'pkl')

    # Iterate through the dataframe in chunks (shards) to generate embeddings
    for start in tqdm(range(0, len(df), items_per_shard)):
        end = start + items_per_shard

        abstract_embeddings = Embeddings(pmids=[], embeddings=[])

        selection = df.iloc[start:end]
        abstracts = selection['Abstract'].tolist()

        # Generate embeddings for abstracts
        embedded_abstracts = embedding_model.embed_documents(abstracts)

        # Store PMIDs and embeddings
        abstract_embeddings.pmids = selection['Pmid'].tolist()
        abstract_embeddings.embeddings = embedded_abstracts

        # Save the current shard of embeddings to disk
        save_embeddings(abstract_embeddings, output_file)

        # Update the list of processed articles only after the shard was written
        embedded_articles.update(abstract_embeddings.pmids)
        save_processed_articles(embedded_articles_file, embedded_articles)

        # Prepare next shard filename
        shard_id += 1
        output_file = os.path.join(embedding_dir, f'shard_{shard_id:04d}.pkl')

        # Pause between shards for the configured number of seconds
        time.sleep(sleep_time)


KeyboardInterrupt: 

**NEUROSCIENCE TEST EMBEDDING WITH MIReAD**

In [None]:
from src.utils.initial_embedding import *
from src.classes.data_types import Embeddings
from src.utils.parsing import parse_directories, parse_discipline
from src.utils.load_and_save import determine_output_filename
from src.utils.checkpoints import save_processed_articles, load_processed_articles

# Add the root 'src' directory to the system path.
# NOTE(review): the `from src...` imports at the top of this cell run before this line, so the
# append only affects imports executed afterwards - confirm whether it is still required.
sys.path.append('/content/drive/MyDrive/NeuroScape/src')

class MIReADEmbeddings:
    """Embeds texts with a Hugging Face BERT checkpoint, using the first-token hidden state."""

    def __init__(self, model_name="arazd/miread", batch_size=8, device="cuda"):
        """Load tokenizer and model and move the model to ``device``.

        Args:
            model_name: checkpoint identifier passed to ``from_pretrained``.
            batch_size: number of texts tokenized and run through the model at once.
            device: target device, e.g. "cuda" or "cpu". When a CUDA device is requested
                but none is available, the model is placed on the CPU instead.
        """
        if str(device).startswith("cuda") and not torch.cuda.is_available():
            # Fall back instead of failing when the runtime has no GPU attached
            print(f"CUDA is not available; running {model_name} on CPU instead.")
            device = "cpu"

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name).to(device)
        # Inference only: make evaluation mode (dropout disabled) explicit
        self.model.eval()
        self.batch_size = batch_size
        self.device = device

    def embed_documents(self, texts):
        """Return one embedding row per text, stacked into a single NumPy array.

        Args:
            texts: sequence of strings; processed in slices of ``self.batch_size``.

        Returns:
            2-D NumPy array with ``len(texts)`` rows. For empty input an array with zero
            rows is returned (instead of ``np.vstack`` raising on an empty list).
        """
        if len(texts) == 0:
            # Width taken from the model config when available, 0 otherwise
            hidden_size = getattr(getattr(self.model, "config", None), "hidden_size", 0)
            return np.empty((0, hidden_size), dtype=np.float32)

        embeddings = []
        with torch.no_grad():  # no gradients are needed for inference
            for i in range(0, len(texts), self.batch_size):
                batch_texts = texts[i:i + self.batch_size]
                inputs = self.tokenizer(batch_texts,
                                        padding=True,
                                        truncation=True,
                                        max_length=512,
                                        return_tensors="pt")
                # Move every tokenizer output onto the model's device
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                outputs = self.model(**inputs)
                batch_embeddings = outputs.last_hidden_state[:, 0, :]  # Use [CLS] token embedding
                embeddings.append(batch_embeddings.cpu().numpy())
        return np.vstack(embeddings)


if __name__ == "__main__":
    # Embeds title + abstract of the cleaned neuroscience articles with MIReAD, shard by shard,
    # saving each shard of embeddings and recording the embedded PubMed IDs.

    # The loop below calls `time.sleep`, but the notebook's import cell only provides
    # `from time import sleep`; import the module here so the call cannot fail with NameError.
    import time

    # Same guard as the scraping cells: drop the kernel's command-line arguments before the
    # parse_* helpers are called.
    sys.argv = [sys.argv[0]]

    # Load directories and embedding parameters.
    # NOTE(review): `model_name` from the configuration is not used below; the model is hard-coded.
    directories = parse_directories()
    embedding_parameters = load_configurations()
    model_name, sleep_time, batch_size, items_per_shard = unpack_embedding_parameters(embedding_parameters)

    # NOTE(review): `BASEPATH` is not defined in this cell, it must come from a cell executed
    # earlier. `df_dir` is not used further in this cell.
    discipline = parse_discipline()
    df_dir = os.path.join(BASEPATH, directories['internal']['intermediate']['csv'], discipline)

    # NOTE(review): this folder and the checkpoint file below are the same ones used by the
    # all-MiniLM-L6-v2 neuroscience cell. Articles embedded by either cell are skipped by the
    # other, and shards from both models land in one folder - confirm this is intended.
    embedding_dir = '/content/drive/MyDrive/NeuroScape/output/embeddings/neuroscience'

    # Load cleaned dataset
    file = '/content/drive/MyDrive/NeuroScape/output/tratados/neuroscience/articles_merged_cleaned.csv'
    df = pd.read_csv(file)

    # Initialize MIReAD embedding model on the GPU when one is available, otherwise on the CPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    embedding_model = MIReADEmbeddings(model_name="arazd/miread", batch_size=batch_size, device=device)

    # Load list of already embedded articles
    embedded_articles_file = os.path.join(
        '/content/drive/MyDrive/NeuroScape/data/internal/checkpoints/neuroscience',
        'embedded_articles.json'
    )
    embedded_articles = load_processed_articles(embedded_articles_file)

    # Remove articles already embedded
    df = df[~df['Pmid'].isin(embedded_articles)]
    print("Artigos a processar agora:", len(df))

    # Ensure the embedding directory and the checkpoint directory exist, so neither the first
    # shard nor the first checkpoint write can fail on a missing folder.
    os.makedirs(embedding_dir, exist_ok=True)
    os.makedirs(os.path.dirname(embedded_articles_file), exist_ok=True)

    # Determine initial output filename and shard index
    output_file, shard_id = determine_output_filename(embedding_dir, 'pkl')

    # Iterate through the dataset in shards to generate embeddings
    for start in tqdm(range(0, len(df), items_per_shard)):
        end = start + items_per_shard
        abstract_embeddings = Embeddings(pmids=[], embeddings=[])
        selection = df.iloc[start:end]

        # Concatenate Title + Abstract for richer representation.
        # Missing values are replaced by empty strings: otherwise the concatenation yields NaN
        # and the tokenizer receives a non-string input.
        abstracts = (selection['Title'].fillna('') + " " + selection['Abstract'].fillna('')).tolist()
        embedded_abstracts = embedding_model.embed_documents(abstracts)

        # Store PMIDs and embeddings
        abstract_embeddings.pmids = selection['Pmid'].tolist()
        abstract_embeddings.embeddings = embedded_abstracts

        # Save current shard
        save_embeddings(abstract_embeddings, output_file)

        # Update processed articles only after the shard was written
        embedded_articles.update(abstract_embeddings.pmids)
        save_processed_articles(embedded_articles_file, embedded_articles)

        # Prepare next shard filename
        shard_id += 1
        output_file = os.path.join(embedding_dir, f'shard_{shard_id:04d}.pkl')

        # Pause between shards for the configured number of seconds
        time.sleep(sleep_time)


tokenizer_config.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

KeyboardInterrupt: 

**OTHERDISCIPLINES EMBEDDING WITH all-MiniLM-L6-v2**

In [None]:
from src.utils.initial_embedding import *
from src.classes.data_types import Embeddings
from src.utils.parsing import parse_directories, parse_discipline
from src.utils.load_and_save import determine_output_filename
from src.utils.checkpoints import save_processed_articles, load_processed_articles

sys.path.append('/content/drive/MyDrive/NeuroScape/src')

class SentenceTransformerEmbeddings:
    """Small adapter that exposes a SentenceTransformer through ``embed_documents``."""

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=32):
        """Instantiate the model named ``model_name`` and keep the encoding batch size."""
        self.batch_size = batch_size
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Encode ``texts`` with the wrapped model.

        Parameters:
        - texts: list
            Texts to encode; passed unchanged to the model's ``encode``.

        Returns:
        - Whatever ``self.model.encode`` returns when called with the stored
          batch size, the progress bar disabled and ``convert_to_numpy=True``.
        """
        encode_options = {
            "batch_size": self.batch_size,
            "show_progress_bar": False,
            "convert_to_numpy": True,
        }
        return self.model.encode(texts, **encode_options)

if __name__ == "__main__":
    # When this cell was run, the name `tqdm` resolved to the tqdm *module*
    # and the shard loop failed with "TypeError: 'module' object is not
    # callable". Import the progress-bar callable explicitly, under its own
    # name so the notebook-wide `tqdm` binding is left untouched.
    from tqdm.auto import tqdm as progress_bar

    # Project directory layout and embedding settings from the configuration.
    directories = parse_directories()

    embedding_parameters = load_configurations()
    # NOTE(review): `model_name` from the configuration is unpacked but not
    # used; the model is hard-coded where `embedding_model` is built below.
    model_name, sleep_time, batch_size, items_per_shard = unpack_embedding_parameters(
        embedding_parameters
    )

    # NOTE(review): `checkpoints_folder`, `discipline` and `df_dir` are not
    # referenced again in this cell; the paths actually used are the
    # hard-coded Drive paths below.
    checkpoints_folder = os.path.join(BASEPATH,
                                      directories['internal']['checkpoints'])

    discipline = parse_discipline()
    df_dir = os.path.join(
        BASEPATH,
        directories['internal']['intermediate']['csv'],
        discipline
    )
    embedding_dir = '/content/drive/MyDrive/NeuroScape/output/embeddings/otherdisciplines'

    # Load the cleaned article table.
    file = '/content/drive/MyDrive/NeuroScape/output/tratados/otherdisciplines/articles_merged_cleaned.csv'
    df = pd.read_csv(file)

    embedding_model = SentenceTransformerEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        batch_size=batch_size
    )

    # Checkpoint holding the PMIDs that already have embeddings.
    embedded_articles_file = os.path.join(
        '/content/drive/MyDrive/NeuroScape/data/internal/checkpoints/otherdisciplines',
        'embedded_articles.json'
    )
    embedded_articles = load_processed_articles(embedded_articles_file)

    # Drop articles that were embedded in an earlier run.
    df = df[~df['Pmid'].isin(embedded_articles)]

    # Make sure the output directory exists.
    os.makedirs(embedding_dir, exist_ok=True)

    output_file, shard_id = determine_output_filename(embedding_dir, 'pkl')

    # Embed the remaining articles one shard (items_per_shard rows) at a time.
    for start in progress_bar(range(0, len(df), items_per_shard)):
        end = start + items_per_shard

        abstract_embeddings = Embeddings(pmids=[], embeddings=[])

        selection = df.iloc[start:end]

        # NOTE(review): only the abstract is embedded here, whereas the
        # neuroscience cell embeds Title + " " + Abstract -- confirm this
        # difference is intended.
        abstracts = selection['Abstract'].tolist()
        embedded_abstracts = embedding_model.embed_documents(abstracts)

        abstract_embeddings.pmids = selection['Pmid'].tolist()
        abstract_embeddings.embeddings = embedded_abstracts

        # Write the shard first, then record its PMIDs in the checkpoint.
        save_embeddings(abstract_embeddings, output_file)

        embedded_articles.update(abstract_embeddings.pmids)
        save_processed_articles(embedded_articles_file, embedded_articles)

        # The next shard goes to the next numbered file.
        shard_id = shard_id + 1
        output_file = os.path.join(embedding_dir, f'shard_{shard_id:04d}.pkl')

        # Pause between shards.
        time.sleep(sleep_time)


TypeError: 'module' object is not callable. Did you mean: 'tqdm.tqdm(...)'?

**OTHERDISCIPLINES TEST EMBEDDING WITH MIREAD**

In [None]:
from src.utils.initial_embedding import *
from src.classes.data_types import Embeddings
from src.utils.parsing import parse_directories, parse_discipline
from src.utils.load_and_save import determine_output_filename
from src.utils.checkpoints import save_processed_articles, load_processed_articles

# Add the root 'src' directory to the system path
sys.path.append('/content/drive/MyDrive/NeuroScape/src')


class SentenceTransformerEmbeddings:
    """Wraps a SentenceTransformer model behind an ``embed_documents`` method."""

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=32):
        """Build the SentenceTransformer for ``model_name`` and store ``batch_size``."""
        encoder = SentenceTransformer(model_name)
        self.model = encoder
        self.batch_size = batch_size

    def embed_documents(self, texts):
        """Return the model's encoding of ``texts``.

        The call uses the stored batch size, disables the progress bar and
        requests ``convert_to_numpy=True``.
        """
        encoder = self.model
        vectors = encoder.encode(
            texts,
            convert_to_numpy=True,
            show_progress_bar=False,
            batch_size=self.batch_size,
        )
        return vectors


if __name__ == "__main__":
    # This cell has the same imports as the all-MiniLM-L6-v2 cell, whose run
    # failed with "TypeError: 'module' object is not callable" on `tqdm(...)`.
    # Import the progress-bar callable explicitly, under its own name so the
    # notebook-wide `tqdm` binding is left untouched.
    from tqdm.auto import tqdm as progress_bar

    # Load project directories and embedding configuration
    directories = parse_directories()
    embedding_parameters = load_configurations()
    # NOTE(review): `model_name` from the configuration is unpacked but not
    # used; the model is hard-coded where `embedding_model` is built below.
    model_name, sleep_time, batch_size, items_per_shard = unpack_embedding_parameters(
        embedding_parameters
    )

    # NOTE(review): `checkpoints_folder`, `discipline` and `df_dir` are not
    # referenced again in this cell.
    checkpoints_folder = os.path.join(BASEPATH,
                                      directories['internal']['checkpoints'])

    discipline = parse_discipline()
    df_dir = os.path.join(
        BASEPATH,
        directories['internal']['intermediate']['csv'],
        discipline
    )
    embedding_dir = '/content/drive/MyDrive/NeuroScape/output/embeddings/otherdisciplines'

    # Load cleaned dataset
    file = '/content/drive/MyDrive/NeuroScape/output/tratados/otherdisciplines/articles_merged_cleaned.csv'
    df = pd.read_csv(file)

    # Initialize embedding model
    # NOTE(review): the cell is titled as a MIReAD test, yet the model loaded
    # here is all-MiniLM-L6-v2, and `embedding_dir` / the checkpoint file are
    # the same paths the all-MiniLM-L6-v2 cell uses, so PMIDs checkpointed
    # there are filtered out below -- confirm the intended model and paths.
    embedding_model = SentenceTransformerEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        batch_size=batch_size
    )

    # Load already processed articles
    embedded_articles_file = os.path.join(
        '/content/drive/MyDrive/NeuroScape/data/internal/checkpoints/otherdisciplines',
        'embedded_articles.json'
    )
    embedded_articles = load_processed_articles(embedded_articles_file)

    # Remove already embedded articles from dataframe
    df = df[~df['Pmid'].isin(embedded_articles)]

    # Ensure output directory exists
    os.makedirs(embedding_dir, exist_ok=True)

    # Determine initial output filename and shard index
    output_file, shard_id = determine_output_filename(embedding_dir, 'pkl')

    # Iterate through dataframe in shards to generate embeddings
    for start in progress_bar(range(0, len(df), items_per_shard)):
        end = start + items_per_shard

        abstract_embeddings = Embeddings(pmids=[], embeddings=[])
        selection = df.iloc[start:end]

        abstracts = selection['Abstract'].tolist()
        embedded_abstracts = embedding_model.embed_documents(abstracts)

        abstract_embeddings.pmids = selection['Pmid'].tolist()
        abstract_embeddings.embeddings = embedded_abstracts

        # Save embeddings shard
        save_embeddings(abstract_embeddings, output_file)

        # Update list of already embedded articles
        embedded_articles.update(abstract_embeddings.pmids)
        save_processed_articles(embedded_articles_file, embedded_articles)

        # Prepare next shard file
        shard_id += 1
        output_file = os.path.join(embedding_dir, f'shard_{shard_id:04d}.pkl')

        # Sleep to avoid resource overload
        time.sleep(sleep_time)


pytorch_model.bin:   0%|          | 0.00/448M [00:00<?, ?B/s]

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

**BUILD ADJACENCY MATRIX**

In [None]:
from glob import glob

from src.utils.adjacency import *
from src.utils.parsing import parse_directories
from src.utils.checkpoints import load_processed_articles, save_processed_articles
from src.utils.load_and_save import save_articles_to_hdf5, load_articles_from_hdf5, determine_output_filename

# Add the project's `src` directory to the module search path.
# NOTE(review): this runs after the `src.*` imports above, so it cannot be
# what makes those imports resolve.
sys.path.append('/content/drive/MyDrive/NeuroScape/src')

# Read settings from the project's keys.env file (presumably python-dotenv's
# load_dotenv, which exports the file's entries as environment variables --
# confirm). A missing variable raises KeyError on the lookups below.
load_dotenv("/content/drive/MyDrive/NeuroScape/keys.env")
BASEPATH = os.environ['BASEPATH']  # root joined with configured sub-paths to locate the checkpoints folder
EMAIL = os.environ['EMAIL']  # assigned to Entrez.email before links are fetched


def _fetch_with_retries(fetch, key, num_attempts, sleep_time):
    """Call ``fetch(key)`` up to ``num_attempts`` times, sleeping after each failure.

    Returns ``(result, True)`` for the first call that succeeds, or
    ``([], False)`` when every attempt raised.
    """
    for _ in range(num_attempts):
        try:
            return fetch(key), True
        except Exception:
            # Deliberately not a bare ``except``: KeyboardInterrupt must still
            # be able to stop a long-running fetch loop.
            sleep(sleep_time)
    return [], False


def fetch_links(articles, all_pubmed_ids, config, items_per_shard,
                article_directory, utils_folder):
    """
    Fetch the in-links and out-links for the given articles.

    Articles whose PubMed ID is already in the checkpoint file
    (``linked_articles.json`` inside ``utils_folder``) are skipped and keep
    the link attributes they were loaded with. Every other article gets its
    ``in_links`` / ``out_links`` attributes replaced by the fetched
    candidates restricted to the articles in this collection.

    Successfully fetched articles are backed up in shards of
    ``items_per_shard`` under ``<article_directory>/backup`` and recorded in
    the checkpoint; the last, partially filled shard is written as well.

    If a fetch keeps failing after ``num_attempts`` tries, the affected link
    list is left empty for this run (it is never filled with another
    article's candidates) and the article is neither backed up nor
    checkpointed, so a later run retries it.

    Parameters:
    - articles: list
        List of Article objects.
    - all_pubmed_ids: list
        List of all PubMed IDs.
    - config: dict
        Configuration for fetching the links; must provide 'num_attempts'
        and 'sleep_time'.
    - items_per_shard: int
        Number of items per shard.
    - article_directory: str
        Directory for the articles.
    - utils_folder: str
        Directory for the utility files.

    Returns:
    - articles: list
        The same list of Article objects with in-links and out-links added.
    """

    num_attempts = config['num_attempts']
    sleep_time = config['sleep_time']

    all_dois = [article.doi for article in articles]
    doi_to_pubmed_id = {article.doi: article.pmid for article in articles}

    output_directory = os.path.join(article_directory, 'backup')
    os.makedirs(output_directory, exist_ok=True)
    output_file, shard_id = determine_output_filename(output_directory, 'h5')

    processed_file = os.path.join(utils_folder, 'linked_articles.json')
    processed_articles = load_processed_articles(processed_file)

    # Fetching the in-links and out-links
    shard = []
    for article in tqdm(articles, total=len(articles)):

        pubmed_id = article.pmid

        if pubmed_id in processed_articles:
            continue

        in_link_candidates, in_ok = _fetch_with_retries(
            fetch_in_links, pubmed_id, num_attempts, sleep_time)
        out_link_candidates, out_ok = _fetch_with_retries(
            fetch_out_links, article.doi, num_attempts, sleep_time)

        # Keep only links that point at articles in this collection.
        article.in_links = get_intersection(in_link_candidates, all_pubmed_ids)
        out_dois = get_intersection(out_link_candidates, all_dois)
        article.out_links = [doi_to_pubmed_id[out_doi] for out_doi in out_dois]

        if not (in_ok and out_ok):
            # Leave the article out of the backup and the checkpoint so that
            # the next run tries again.
            print(f'Warning: could not fetch all links for PMID {pubmed_id} '
                  f'after {num_attempts} attempts; it will be retried on the '
                  f'next run.')
            continue

        shard.append(article)
        processed_articles.add(pubmed_id)

        if len(shard) == items_per_shard:
            save_articles_to_hdf5(shard, output_file, disable_tqdm=True)
            save_processed_articles(processed_file, processed_articles)
            shard = []
            shard_id += 1
            output_file = os.path.join(output_directory,
                                       f'shard_{shard_id:04d}.h5')

    # Flush the last, partially filled shard together with its checkpoint.
    if shard:
        save_articles_to_hdf5(shard, output_file, disable_tqdm=True)
        save_processed_articles(processed_file, processed_articles)

    return articles


def update_links(articles, all_pubmed_ids):
    """
    Mirror every recorded link onto the article at its other end.

    For each out-link of an article, the linked article's ``in_links`` gains
    this article's PubMed ID; for each in-link, the linking article's
    ``out_links`` gains it. Link lists are rebuilt through a set, so
    duplicates are removed and the order of the resulting lists is not
    preserved.

    Parameters:
    - articles: list
        List of Article objects; ``articles[i]`` is looked up through the
        position of a PubMed ID in ``all_pubmed_ids``.
    - all_pubmed_ids: list
        List of all PubMed IDs.

    Returns:
    - articles: list
        The same list of Article objects with updated in-links and out-links.
    """
    def with_pmid(existing_links, pmid):
        # Set union removes duplicates before converting back to a list.
        return list(set(existing_links) | {pmid})

    # Position of each PubMed ID inside `all_pubmed_ids`.
    position_of = {}
    for position, pubmed_id in enumerate(all_pubmed_ids):
        position_of[pubmed_id] = position

    for source in tqdm(articles, total=len(articles)):
        for target_pmid in source.out_links:
            target = articles[position_of[target_pmid]]
            target.in_links = with_pmid(target.in_links, source.pmid)

        for origin_pmid in source.in_links:
            origin = articles[position_of[origin_pmid]]
            origin.out_links = with_pmid(origin.out_links, source.pmid)

    return articles


if __name__ == '__main__':
    # Configuration: PubMed request settings and backup shard size.
    configurations = load_configurations()
    Entrez.email = EMAIL
    fetch_config = configurations['pubmed_requests']
    items_per_shard = configurations['storage']['items_per_shard']

    directories = parse_directories()
    # Directory holding the article .h5 files (read here, overwritten at the end).
    article_directory = '/content/drive/MyDrive/NeuroScape/output/filtrados'

    checkpoints_folder = os.path.join(BASEPATH,
                                      directories['internal']['checkpoints'])

    # Loading the articles
    print('Loading articles...')
    file_names = glob(os.path.join(article_directory, '*.h5'))
    all_articles = []

    # Remember which PubMed IDs came from which file so the articles can be
    # written back to their original files.
    file_pmid_dict = {}
    for file_name in tqdm(file_names):
        articles = load_articles_from_hdf5(file_name, disable_tqdm=True)
        all_articles.extend(articles)
        file_pmid_dict[file_name] = [article.pmid for article in articles]

    all_pubmed_ids = [article.pmid for article in all_articles]

    # Fetching the in-links and out-links
    print('Fetching links...')
    all_articles = fetch_links(all_articles, all_pubmed_ids, fetch_config,
                               items_per_shard, article_directory,
                               checkpoints_folder)
    print('Updating links...')
    all_articles = update_links(all_articles, all_pubmed_ids)

    print('Calculating citation rates...')
    # Citations are averaged over the article's age relative to this year.
    reference_year = 2025
    for article in all_articles:
        num_citations = len(article.in_links)  # total citations received
        age_in_years = max(1, reference_year - article.year)  # avoids division by zero
        article.citation_rate = num_citations / age_in_years

    # Saving the articles with in-links and out-links
    print('Saving articles...')

    for file_name in tqdm(file_names):
        # Build the membership set once per file: testing every article
        # against a list would cost a linear scan per article.
        pmids_in_file = set(file_pmid_dict[file_name])
        articles = [
            article for article in all_articles
            if article.pmid in pmids_in_file
        ]

        # replace the old file with the new one
        save_articles_to_hdf5(articles, file_name, disable_tqdm=True)

Loading articles...



  0%|          | 0/12 [00:00<?, ?it/s][A
  8%|▊         | 1/12 [00:00<00:07,  1.51it/s][A
 17%|█▋        | 2/12 [00:01<00:06,  1.58it/s][A
 25%|██▌       | 3/12 [00:02<00:06,  1.40it/s][A
 33%|███▎      | 4/12 [00:02<00:05,  1.42it/s][A
 42%|████▏     | 5/12 [00:03<00:04,  1.47it/s][A
 50%|█████     | 6/12 [00:03<00:03,  1.56it/s][A
 58%|█████▊    | 7/12 [00:04<00:03,  1.46it/s][A
 67%|██████▋   | 8/12 [00:05<00:02,  1.38it/s][A
 75%|███████▌  | 9/12 [00:06<00:02,  1.20it/s][A
 83%|████████▎ | 10/12 [00:07<00:01,  1.24it/s][A
 92%|█████████▏| 11/12 [00:08<00:00,  1.29it/s][A
100%|██████████| 12/12 [00:08<00:00,  1.40it/s]


Fetching links...



  0%|          | 0/2298 [00:00<?, ?it/s][A
 96%|█████████▌| 2201/2298 [00:00<00:00, 4189.43it/s][A
 96%|█████████▌| 2201/2298 [00:13<00:00, 4189.43it/s][A
 97%|█████████▋| 2231/2298 [00:13<00:00, 113.79it/s] [A
 97%|█████████▋| 2236/2298 [00:19<00:00, 117.33it/s]


KeyboardInterrupt: 