In [None]:
"""
IMPORTANT:
This project is designed to run exclusively on Google Colab.

It relies on Google Drive being mounted at:
    /content/drive/MyDrive/

Local execution is not supported.
"""


**Mount Google Drive**

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab runtime so the project files are reachable
# under /content/drive.
drive.mount('/content/drive')

# Switch the working directory to the NeuroScape project folder in Google Drive.
# The recorded cell output shows the directory resolving under
# .shortcut-targets-by-id, so the folder is presumably reached via a Drive shortcut.
%cd /content/drive/MyDrive/NeuroScape

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1vM2NZYPQBx0CCXgmkPKl1lguMSLh85MO/NeuroScape


**Import Libraries**

In [None]:
import os                      # Operating system interactions (paths, directories, environment variables)
import glob                    # File pattern matching to find datasets or shards
import time                    # Timing and sleep operations
import pandas as pd            # Data manipulation and analysis (DataFrames)
import torch                   # Core PyTorch library for deep learning
import numpy as np             # Numerical computing (arrays, matrix operations)
import torch.nn as nn          # PyTorch module for building neural networks
import sys                     # System-specific parameters and functions (e.g., path manipulation)
import pickle                  # Save/load Python objects (like trained models or embeddings)
import tomllib                 # Parse TOML configuration files

from sklearn.model_selection import train_test_split  # Split dataset into train/test sets

# Project-specific utilities for the discipline classifier
from src.utils.parsing import parse_directories
from src.utils.classifier import (
    load_configurations,         # Load hyperparameters and settings from config
    save_and_create_dataset,     # Preprocess data and save as structured dataset
    load_shard,                  # Load a shard of the dataset (for large data)
    extract_data,                # Extract features and labels from shards
    get_disciplines,             # List the disciplines present in the dataset
    generate_n_hot_vector,       # Convert labels to multi-hot vectors
    get_unique_disciplines_and_count, # Get all unique disciplines and their counts
    train_one_epoch,             # Train the model for one epoch
    validate,                    # Validate model performance on test set
    save_model,                  # Save trained model to disk
    data_loader,                 # Create PyTorch DataLoader from dataset
    to_device,                   # Move tensors or models to GPU/CPU
    compute_expected_accuracy,   # Compute accuracy metric
    compute_kappa,               # Compute Cohen's Kappa metric
    drop_class                   # Remove classes with insufficient examples
)

# Data type definitions
from src.classes.data_types import EmbeddingsWithLabels  # Holds embeddings along with corresponding labels
from src.classes.discipline_classifier import DisciplineClassifier  # Neural network model for classifying disciplines

# Environment management
from dotenv import load_dotenv, find_dotenv  # Load environment variables for sensitive keys or paths

from copy import deepcopy  # Deep copy objects to prevent mutation


**Prepare Classifier Training Data**

In [None]:
from src.utils.parsing import parse_directories
from src.classes.data_types import EmbeddingsWithLabels
from src.utils.classifier import load_configurations, save_and_create_dataset, \
    load_shard, extract_data, get_disciplines, generate_n_hot_vector, get_unique_disciplines_and_count

# Load environment variables from the keys.env file
load_dotenv("/content/drive/MyDrive/NeuroScape/keys.env")
BASEPATH = os.environ['BASEPATH']

# The seed is derived from the wall clock, so it differs on every run.
# Print it so that a particular train/val/test shard split can be reproduced
# later by hard-coding the reported value here.
seed = int(time.time())
print(f"Random seed for shard splits: {seed}")

def process_shards(shards,
                   dataframe,
                   unique_disciplines,
                   shard_ids,
                   directories,
                   set_type,
                   threshold,
                   delete_shards=False):
    """
    Process a list of shards and split the data into single-label (monolabel)
    and multi-label (multilabel) datasets.

    Parameters:
    - shards (list): List of embedding shard files to process.
    - dataframe (pd.DataFrame): DataFrame containing articles and their disciplines.
    - unique_disciplines (list): All unique disciplines in the dataset.
    - shard_ids (tuple): Current IDs for the mono and multi datasets.
    - directories (tuple): Directories to save monolabel and multilabel datasets.
    - set_type (str): Type of dataset ('Train', 'Val', 'Test').
    - threshold (int): Number of buffered items that triggers a save. The check
      runs before each shard is loaded, so a buffer may exceed the threshold by
      the contents of one shard.
    - delete_shards (bool): Whether to delete shard files after processing.
      Files are removed only after every buffer has been written, so an
      interrupted run never deletes a shard whose data has not been saved yet.

    Returns:
    - id_mono (int): Updated ID for monolabel dataset.
    - id_multi (int): Updated ID for multilabel dataset.
    """

    id_mono, id_multi = shard_ids
    monolabel_directory, multilabel_directory = directories

    # Buffers for mono (single-label) and multi (multi-label) data
    mono_data = EmbeddingsWithLabels(pmids=[], embeddings=[], labels=[])
    multi_data = EmbeddingsWithLabels(pmids=[], embeddings=[], labels=[])
    num_mono = 0
    num_multi = 0

    # Shard files that were loaded successfully; deleted at the very end if requested
    loaded_shards = []

    for shard in shards:
        # Flush a buffer once it has reached the threshold.
        # save_and_create_dataset returns the buffer to keep filling and the next ID
        # (presumably a fresh, empty dataset -- see src.utils.classifier).
        if num_mono >= threshold:
            mono_data, id_mono = save_and_create_dataset(
                mono_data, id_mono, monolabel_directory, set_type)
            num_mono = 0

        if num_multi >= threshold:
            multi_data, id_multi = save_and_create_dataset(
                multi_data, id_multi, multilabel_directory, set_type)
            num_multi = 0

        # Load embeddings from the shard
        abstracts = load_shard(shard)
        loaded_shards.append(shard)

        # Retrieve disciplines for each article in the shard
        shard_disciplines = [
            get_disciplines(dataframe, pmid) for pmid in abstracts.pmids
        ]

        # Convert disciplines into n-hot vectors for multilabel classification
        n_hot_vector, num_hot = generate_n_hot_vector(shard_disciplines, unique_disciplines)

        # ------------------------
        # MONOLABEL: articles with exactly one discipline
        # ------------------------
        mono_pmids, mono_embeddings, mono_labels = extract_data(
            abstracts, n_hot_vector, num_hot, lambda x: x == 1)
        mono_data.pmids.extend(mono_pmids)
        mono_data.embeddings.extend(mono_embeddings)
        mono_data.labels.extend(mono_labels)
        num_mono += len(mono_pmids)

        # ------------------------
        # MULTILABEL: articles with more than one discipline
        # ------------------------
        multi_pmids, multi_embeddings, multi_labels = extract_data(
            abstracts, n_hot_vector, num_hot, lambda x: x > 1)
        multi_data.pmids.extend(multi_pmids)
        multi_data.embeddings.extend(multi_embeddings)
        multi_data.labels.extend(multi_labels)
        num_multi += len(multi_pmids)

    # Save any remaining articles after processing all shards
    if len(mono_data.pmids) > 0:
        mono_data, id_mono = save_and_create_dataset(mono_data, id_mono, monolabel_directory, set_type)

    if len(multi_data.pmids) > 0:
        multi_data, id_multi = save_and_create_dataset(multi_data, id_multi, multilabel_directory, set_type)

    # Everything has been written to disk: only now is it safe to drop the sources
    if delete_shards:
        for shard in loaded_shards:
            os.remove(shard)

    return id_mono, id_multi


# Data-preparation driver: splits the embedding shards of both corpora into
# Train/Val/Test subsets and writes monolabel / multilabel classifier datasets.
# NOTE(review): this cell is not idempotent -- every run re-globs the shard
# directories, restarts the dataset IDs at (0, 0) and (for neuroscience) deletes
# the shards it consumed.
if __name__ == "__main__":

    configurations = load_configurations()  # Load hyperparameters and other settings

    data_directories = parse_directories()  # Retrieve project directory paths

    # Paths for cleaned CSV data and embeddings
    # (os.path.join with a single argument returns that argument unchanged)
    dataframe_directory = os.path.join('/content/drive/MyDrive/NeuroScape/output/tratados/')
    embedding_directory = os.path.join('/content/drive/MyDrive/NeuroScape/output/embeddings')

    other_dataframe_directory = os.path.join(dataframe_directory, 'otherdisciplines')
    neuro_dataframe_directory = os.path.join(dataframe_directory, 'neuroscience')

    other_embedding_directory = os.path.join(embedding_directory, 'otherdisciplines')
    neuro_embedding_directory = os.path.join(embedding_directory, 'neuroscience')

    # Directories to store monolabel and multilabel datasets
    multilabel_directory = os.path.join(
        BASEPATH, data_directories['internal']['intermediate']['classifier'], 'Multilabel')
    monolabel_directory = os.path.join(
        BASEPATH, data_directories['internal']['intermediate']['classifier'], 'Monolabel')

    # Ensure all directories exist for Train, Validation, and Test splits
    os.makedirs(os.path.join(multilabel_directory, 'Train'), exist_ok=True)
    os.makedirs(os.path.join(multilabel_directory, 'Val'), exist_ok=True)
    os.makedirs(os.path.join(multilabel_directory, 'Test'), exist_ok=True)
    os.makedirs(os.path.join(monolabel_directory, 'Train'), exist_ok=True)
    os.makedirs(os.path.join(monolabel_directory, 'Val'), exist_ok=True)
    os.makedirs(os.path.join(monolabel_directory, 'Test'), exist_ok=True)

    # Load CSVs
    other_dataframe = pd.read_csv(os.path.join(other_dataframe_directory, 'articles_merged_cleaned.csv'))
    neuro_dataframe = pd.read_csv(os.path.join(neuro_dataframe_directory, 'articles_merged_cleaned.csv'))

    # Load embedding shards
    other_shards = glob.glob(os.path.join(other_embedding_directory, '*.pkl'))
    neuro_shards = glob.glob(os.path.join(neuro_embedding_directory, '*.pkl'))

    # Get all unique disciplines and their count.
    # The label vocabulary is taken from the "other disciplines" frame only and is
    # reused for the neuroscience frame below; num_classes is not used in this cell.
    unique_disciplines, num_classes = get_unique_disciplines_and_count(other_dataframe)

    directories = (monolabel_directory, multilabel_directory)
    threshold = configurations['preparation']['item_threshold']

    # Split "other disciplines" into Train, Val, and Test
    # (10% of shards for Test, then 10% of the remainder for Val; split is per shard file)
    train_val_shards, test_shards = train_test_split(other_shards, test_size=0.1, random_state=seed)
    train_shards, val_shards = train_test_split(train_val_shards, test_size=0.1, random_state=seed)

    # Each split starts from the same IDs; the splits are distinguished by set_type
    # (presumably written to the Train/Val/Test subfolders created above -- TODO confirm).
    shard_ids = (0, 0)  # Initial IDs for mono and multi datasets

    print('Processing training shards of other disciplines...')
    other_train_ids = process_shards(train_shards, other_dataframe, unique_disciplines, shard_ids,
                                     directories, 'Train', threshold)

    print('Processing validation of other disciplines...')
    other_val_ids = process_shards(val_shards, other_dataframe, unique_disciplines, shard_ids,
                                   directories, 'Val', threshold)

    print('Processing test shards of other disciplines...')
    other_test_ids = process_shards(test_shards, other_dataframe, unique_disciplines, shard_ids,
                                    directories, 'Test', threshold)

    # Use a small portion of neuroscience shards to augment datasets
    # (test_size=0.9 discards 90% of the shards; the kept 10% is split like above)
    used_shards, _ = train_test_split(neuro_shards, test_size=0.9, random_state=seed)
    train_val_shards, test_shards = train_test_split(used_shards, test_size=0.1, random_state=seed)
    train_shards, val_shards = train_test_split(train_val_shards, test_size=0.1, random_state=seed)

    # The IDs returned for the other-disciplines splits are passed on so numbering continues.
    # NOTE(review): delete_shards=True removes the consumed neuroscience shard files
    # from neuro_embedding_directory. The "Filter Data" cell later globs the same
    # directory path, so these articles will not be seen by the filter -- confirm
    # this exclusion is intended.
    print('Processing training shards of neuroscience...')
    neuro_train_ids = process_shards(train_shards, neuro_dataframe, unique_disciplines, other_train_ids,
                                     directories, 'Train', threshold, delete_shards=True)

    print('Processing validation shards of neuroscience...')
    neuro_val_ids = process_shards(val_shards, neuro_dataframe, unique_disciplines, other_val_ids,
                                   directories, 'Val', threshold, delete_shards=True)

    print('Processing test shards of neuroscience...')
    neuro_test_ids = process_shards(test_shards, neuro_dataframe, unique_disciplines, other_test_ids,
                                    directories, 'Test', threshold, delete_shards=True)

    print('Data preparation completed.')


**Train Discipline Classifier**

In [None]:
# Ensure the project path is correct *before* importing project modules, so this
# cell also works when the kernel's working directory is not the project folder.
# The membership check keeps sys.path from growing when the cell is re-run.
if '/content/drive/MyDrive/NeuroScape/' not in sys.path:
    sys.path.append('/content/drive/MyDrive/NeuroScape/')
os.chdir("/content/drive/MyDrive/NeuroScape")
print("Working directory:", os.getcwd())

from src.utils.parsing import parse_directories
from src.classes.discipline_classifier import DisciplineClassifier
from src.utils.classifier import load_configurations, train_one_epoch, validate, \
    save_model, data_loader, to_device, compute_expected_accuracy, compute_kappa, \
    drop_class

# Load environment variables from .env file
load_dotenv("/content/drive/MyDrive/NeuroScape/keys.env")
BASEPATH = os.environ['BASEPATH']

# TRAINING FUNCTION
def train_model(model, filter_model, data_directory, model_directory,
                model_name, configurations, device, loss_function, optimizer):
    """
    Train a discipline classifier model with optional filtering on multi-label datasets.

    Parameters:
    - model: PyTorch model to train (updated in place by the optimizer)
    - filter_model: Optional pretrained model passed to drop_class to adjust labels
    - data_directory: Directory containing 'Train' and 'Val' sub-folders of .pkl files
    - model_directory: Directory to save the trained model
    - model_name: Name of the saved model file (without the '.pth' extension)
    - configurations: Dict with 'confidence_cutoff', 'epochs', 'buffer_size', 'batch_size'
    - device: 'cuda' or 'cpu'
    - loss_function: Loss function for training (BCELoss)
    - optimizer: Optimizer (Adam)

    Returns:
    - best_model: Independent snapshot of the model at the epoch with the lowest
      validation loss (None if no epoch improved on infinity, e.g. epochs == 0).

    Raises:
    - RuntimeError: if an epoch finds no valid training samples.
    """

    confidence_cutoff = configurations['confidence_cutoff']
    epochs = configurations['epochs']
    buffer_size = configurations['buffer_size']
    batch_size = configurations['batch_size']
    save_path = os.path.join(model_directory, f'{model_name}.pth')
    best_loss = float('inf')
    best_model = None

    # Load training and validation shards
    train_files = glob.glob(os.path.join(data_directory, 'Train/*.pkl'))
    val_files = glob.glob(os.path.join(data_directory, 'Val/*.pkl'))
    X_val, Y_val = data_loader(val_files)

    # If using a filter model, let drop_class adjust the validation labels
    if filter_model is not None:
        Y_val = drop_class(filter_model, X_val, Y_val, device, confidence_cutoff)

    # Expected (chance) accuracy is computed on the labels before they are moved to the device
    expected_accuracy = compute_expected_accuracy(Y_val)
    X_val, Y_val = to_device(X_val, Y_val, device)

    print(f"Expected Accuracy: {expected_accuracy:.4f}")
    print('---' * 10)

    for epoch in range(epochs):
        average_loss = 0
        total_samples = 0
        np.random.shuffle(train_files)  # Shuffle the shards for each epoch

        # Only buffer_size shard files are held in memory at a time
        for i in range(0, len(train_files), buffer_size):
            files = train_files[i:i + buffer_size]

            X, Y = data_loader(files)

            # Skip invalid shards
            if X is None or len(X) == 0 or X.shape[1] == 0:
                print(f"[WARN] Ignoring invalid shard: {files} (X shape = {None if X is None else X.shape})")
                continue

            # Apply filter_model to the labels of this buffer if applicable
            if filter_model is not None:
                Y = drop_class(filter_model, X, Y, device, confidence_cutoff)

            total_samples += len(X)

            # Train in mini-batches (train_one_epoch is called once per mini-batch)
            for j in range(0, len(X), batch_size):
                X_batch = X[j:j + batch_size]
                Y_batch = Y[j:j + batch_size]

                X_batch, Y_batch = to_device(X_batch, Y_batch, device)

                loss = train_one_epoch(model, X_batch, Y_batch, loss_function, optimizer)
                average_loss += loss

        # Without any sample the division below would fail with a bare
        # ZeroDivisionError; report the actual problem instead.
        if total_samples == 0:
            raise RuntimeError(
                f"No valid training samples found in {os.path.join(data_directory, 'Train')}")

        # Accumulated batch losses divided by the number of samples seen this epoch
        average_loss /= total_samples

        # Validation after each epoch
        val_loss, val_accuracy = validate(model, X_val, Y_val, loss_function)
        kappa = compute_kappa(val_accuracy, expected_accuracy)

        # Save the best model based on validation loss
        if val_loss < best_loss:
            best_loss = val_loss
            # Snapshot the weights: `model` keeps training in later epochs, so a
            # plain reference would silently end up being the last-epoch model.
            best_model = deepcopy(model)
            save_model(best_model, save_path)

        print(f"Epoch {epoch + 1:03d}/{epochs:03d} - "
              f"Training Loss: {average_loss:.4f}, "
              f"Validation Loss: {val_loss:.4f}, "
              f"Validation Accuracy: {val_accuracy:.4f}, "
              f"Cohen's Kappa: {kappa:.4f}")

    return best_model

# TESTING FUNCTION
def test_model(model, data_directory, model_directory, file_name, device,
               loss_function):
    """
    Score a trained model on the held-out test split and persist a one-line report.

    Parameters:
    - model: Trained PyTorch model
    - data_directory: Directory whose 'Test' sub-folder holds the .pkl test files
    - model_directory: Directory in which the report file is written
    - file_name: Name of the report file
    - device: 'cuda' or 'cpu'
    - loss_function: Loss function used for evaluation

    Outputs:
    - Prints the report (Loss, Accuracy, Expected Accuracy, Cohen's Kappa) and
      writes the same text to model_directory/file_name
    """
    test_pattern = os.path.join(data_directory, 'Test/*.pkl')
    X_test, Y_test = data_loader(glob.glob(test_pattern))

    # Chance-level accuracy is derived from the labels before they go to the device
    expected_accuracy = compute_expected_accuracy(Y_test)
    X_test, Y_test = to_device(X_test, Y_test, device)

    test_loss, test_accuracy = validate(model, X_test, Y_test, loss_function)
    kappa = compute_kappa(test_accuracy, expected_accuracy)

    report = (
        f"Test Loss: {test_loss:.4f}, "
        f"Test Accuracy: {test_accuracy:.4f}, "
        f"Test Expected Accuracy: {expected_accuracy:.4f}, "
        f"Cohen's Kappa: {kappa:.4f}"
    )
    print(report)

    # Overwrites any earlier report with the same name
    with open(os.path.join(model_directory, file_name), 'w') as report_handle:
        report_handle.write(report)

# Training driver: pretrain on monolabel data, train on multilabel data (labels
# adjusted by a frozen copy of the pretrained model), finetune on monolabel data,
# then write test reports for both datasets.
if __name__ == '__main__':
    configurations = load_configurations()
    directories = parse_directories()

    # Directories for embeddings and model saving
    # NOTE(review): data_directory is hard-coded while model_directory is derived
    # from BASEPATH -- confirm both point at the same project tree.
    data_directory = os.path.join(
        '/content/drive/MyDrive/NeuroScape/data/internal/intermediate/embeddings/classifier')
    model_directory = os.path.join(
        BASEPATH, directories['internal']['intermediate']['models'])
    # Checkpoints and test reports are written here; make sure it exists up front
    # rather than failing after the first training epoch.
    os.makedirs(model_directory, exist_ok=True)

    device = configurations['model']['device']
    # A CUDA device configured on a CPU-only runtime would fail at the first .to(device)
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        print(f"[WARN] Configured device '{device}' is not available; falling back to 'cpu'.")
        device = 'cpu'
    layer_sizes = configurations['model']['layer_sizes']
    num_classes = configurations['model']['num_classes']

    # Initialize the classifier
    model = DisciplineClassifier(layer_sizes, num_classes).to(device)
    loss_function = nn.BCELoss()

    pretrain_configurations = configurations['pretraining']
    train_configurations = configurations['training']
    tune_configurations = configurations['finetuning']

    learning_rate = pretrain_configurations['learning_rate']
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Define Mono (single-label) and Multi (multi-label) directories
    mono_directory = os.path.join(data_directory, 'Monolabel')   # Monolabel = single-discipline articles
    multi_directory = os.path.join(data_directory, 'Multilabel') # Multilabel = multi-discipline articles

    # PRETRAINING ON MONOLABEL
    print("Pretraining the model...")
    model = train_model(model, None, mono_directory, model_directory,
                        'discipline_classification_model_pretrained',
                        pretrain_configurations, device, loss_function,
                        optimizer)
    print("Pretraining completed.")
    print('---' * 10)

    # TRAINING ON MULTILABEL
    filter_model = deepcopy(model)  # Copy pretrained model for filtering
    filter_model.eval()
    # A fresh optimizer is built for each phase because `model` is rebound to the
    # object returned by train_model and each phase has its own learning rate.
    learning_rate = train_configurations['learning_rate']
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    print("Training the model...")
    model = train_model(model, filter_model, multi_directory, model_directory,
                        'discipline_classification_model_trained',
                        train_configurations, device, loss_function, optimizer)
    print("Training completed.")
    print('---' * 10)

    # FINETUNING ON MONOLABEL
    learning_rate = tune_configurations['learning_rate']
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    print("Finetuning the model...")
    model = train_model(model, None, mono_directory, model_directory,
                        'discipline_classification_model_finetuned',
                        tune_configurations, device, loss_function, optimizer)
    print("Finetuning completed.")
    print('---' * 10)

    # TESTING PHASE
    test_model(model, multi_directory, model_directory,
               'multi_label_report.txt', device, loss_function)
    test_model(model, mono_directory, model_directory,
               'mono_label_report.txt', device, loss_function)


**Filter Data**

In [None]:
# Make the project root importable *before* the project imports below run.
# The membership check keeps sys.path from growing when the cell is re-run.
if '/content/drive/MyDrive/NeuroScape/' not in sys.path:
    sys.path.append('/content/drive/MyDrive/NeuroScape/')

# NOTE(review): the wildcard import presumably supplies the helpers this cell
# uses without importing them explicitly (tqdm, remove, update_keep_index,
# fill_article, load_model) -- TODO replace with explicit names once confirmed.
# get_disciplines is imported afterwards on purpose so the classifier version
# wins over any same-named object exported by the wildcard.
from src.utils.filtering import *
from src.utils.parsing import parse_directories
from src.utils.classifier import get_disciplines
from src.utils.load_and_save import save_articles_to_hdf5

# Load environment variables from .env
load_dotenv("/content/drive/MyDrive/NeuroScape/keys.env")
BASEPATH = os.environ['BASEPATH']

def filter(embedding_files, dataframe, device, model, class_index, cutoff):
    """
    Filters articles based on predictions of a neural classification model.

    For each embedding file:
    - Load article embeddings and their PubMed IDs
    - Move embeddings to the target device (CPU/GPU)
    - Compute predicted probabilities for each class (without tracking gradients)
    - Calculate confidence for the target class as the ratio of its predicted
      probability to the maximum predicted probability
    - Retrieve the disciplines corresponding to the PubMed IDs
    - Identify articles to remove via remove(disciplines, confidence, cutoff)
    - Exclude embeddings containing NaN or Inf values
    - Update a global list of kept articles and store filtered articles
    - Finally, drop removed articles from the dataframe

    Parameters:
    - embedding_files: Iterable of pickle file paths; each object exposes
      .pmids and .embeddings
    - dataframe: Article metadata; rows not kept are dropped in the returned copy
    - device: Device on which the model is evaluated
    - model: Callable mapping an embedding tensor to per-class probabilities
    - class_index: Column of the target class in the model output
    - cutoff: Confidence threshold forwarded to remove()

    Returns:
    - filtered_data: List of article objects built by fill_article
    - dataframe: The input frame without the rows that were not kept

    Note: the function name shadows the built-in filter() within this notebook.
    """
    global_keep_index = []
    filtered_data = []

    for file in tqdm(embedding_files, total=len(embedding_files)):
        # pickle.load can execute arbitrary code: only open shards produced by
        # this project's own embedding step.
        with open(file, 'rb') as f:
            data = pickle.load(f)

        pubmed_ids = data.pmids
        embeddings = data.embeddings

        # Nothing to classify; the array operations below need at least one row
        if len(pubmed_ids) == 0:
            continue

        # Inference only: disable autograd so no graph is built for the forward pass
        with torch.no_grad():
            embeddings_on_device = torch.tensor(embeddings).to(device)
            probabilities = model(embeddings_on_device).cpu().detach().numpy()

        # Compute class-specific confidence scores
        class_probabilities = probabilities[:, class_index]
        max_probabilities = np.max(probabilities, axis=1)
        confidence = class_probabilities / max_probabilities

        # Get article disciplines and decide which to remove based on confidence
        disciplines = get_disciplines(dataframe, pubmed_ids)
        remove_indices = remove(disciplines, confidence, cutoff)

        # Rows whose embedding contains NaN or Inf
        invalid_rows = ~np.isfinite(embeddings).all(axis=1)

        # Work with positions rather than looking PMIDs up again with list.index():
        # that lookup is quadratic per shard and would pick the first occurrence's
        # embedding if a PMID appeared twice.
        keep_positions = [
            i for i in range(len(pubmed_ids))
            if not remove_indices[i] and not invalid_rows[i]
        ]
        keep_index = [pubmed_ids[i] for i in keep_positions]

        # Update global list of kept indices
        global_keep_index = update_keep_index(global_keep_index, keep_index, dataframe)

        # Convert kept articles to structured objects for later processing
        for i in keep_positions:
            article = fill_article(pubmed_ids[i], dataframe, embeddings[i])
            filtered_data.append(article)

    # Drop rows from dataframe corresponding to removed articles
    drop_index = list(set(dataframe.index) - set(global_keep_index))
    dataframe = dataframe.drop(drop_index)

    return filtered_data, dataframe


# Filtering driver: loads the finetuned classifier, filters both corpora with
# filter(), merges the surviving rows and writes a CSV plus HDF5 shards.
if __name__ == '__main__':
    configurations = load_configurations()
    items_per_shard = configurations['filtering']['shard_size']
    neuro_class_index = configurations['filtering']['class_index']
    confidence_cutoff = configurations['filtering']['confidence_cutoff']

    # Parse directories for data and embeddings
    # (the result is not used further in this cell; all paths below are hard-coded)
    data_directories = parse_directories()

    # Load the pre-trained discipline classification model
    model_file = os.path.join(
        '/content/drive/MyDrive/NeuroScape/data/internal/intermediate/models',
        'discipline_classification_model_finetuned.pth'
    )
    # NOTE(review): the output recorded for this cell is a RuntimeError
    # ("Attempting to deserialize object on a CUDA device but
    # torch.cuda.is_available() is False"). It presumably originates in load_model:
    # the checkpoint was saved from a GPU and is being loaded on a CPU-only runtime.
    # Either run on a GPU runtime or have load_model pass map_location to
    # torch.load -- TODO confirm against src.utils.filtering.
    model = load_model(configurations['model'], model_file)
    device = configurations['model']['device']

    # Set up directories for multidisciplinary and neuroscience data
    multidisciplinary_dataframe_dir = os.path.join(
        '/content/drive/MyDrive/NeuroScape/output/tratados', 'otherdisciplines'
    )
    neuroscience_dataframe_dir = os.path.join(
        '/content/drive/MyDrive/NeuroScape/output/tratados', 'neuroscience'
    )
    multidisciplinary_embedding_dir = os.path.join(
        '/content/drive/MyDrive/NeuroScape/output/embeddings', 'otherdisciplines'
    )
    neuroscience_embedding_dir = os.path.join(
        '/content/drive/MyDrive/NeuroScape/output/embeddings', 'neuroscience'
    )

    filtered_data = []

    # Load dataframes with article metadata
    multi_dataframe = pd.read_csv(
        os.path.join(multidisciplinary_dataframe_dir, 'articles_merged_cleaned.csv')
    )
    neuro_dataframe = pd.read_csv(
        os.path.join(neuroscience_dataframe_dir, 'articles_merged_cleaned.csv')
    )

    # Gather embedding files to process
    multi_embedding_files = glob.glob(os.path.join(multidisciplinary_embedding_dir, '*.pkl'))
    neuro_embedding_files = glob.glob(os.path.join(neuroscience_embedding_dir, '*.pkl'))

    # Filter multidisciplinary articles based on classification confidence
    print('Filtering multidisciplinary data...')
    multi_filtered_data, multi_dataframe = filter(
        multi_embedding_files, multi_dataframe, device, model, neuro_class_index, confidence_cutoff
    )

    # Filter neuroscience articles
    print('Filtering neuroscience data...')
    neuro_filtered_data, neuro_dataframe = filter(
        neuro_embedding_files, neuro_dataframe, device, model, neuro_class_index, confidence_cutoff
    )

    # Merge the filtered dataframes into a single dataframe
    # (ignore_index=True renumbers the rows of the merged frame from 0)
    print('Merging data...')
    dataframe = pd.concat([multi_dataframe, neuro_dataframe], ignore_index=True)
    filtered_data.extend(multi_filtered_data)
    filtered_data.extend(neuro_filtered_data)

    # Prepare output directory
    output_directory = os.path.join('/content/drive/MyDrive/NeuroScape/output/filtrados')
    os.makedirs(output_directory, exist_ok=True)

    # Save the filtered dataframe as CSV
    # NOTE(review): the merged CSV is written into neuroscience_dataframe_dir, not
    # into output_directory, and emb_output_file is never used below -- confirm
    # both are intentional.
    df_output_file = os.path.join(neuroscience_dataframe_dir, 'articles_merged_cleaned_filtered.csv')
    emb_output_file = os.path.join(output_directory, 'articles_merged_cleaned_filtered.h5')
    print('Saving data...')
    dataframe.to_csv(df_output_file, index=False)

    # Save filtered articles as HDF5 shards for further processing
    # (consecutive slices of items_per_shard articles; the last shard may be shorter)
    print('Saving articles...')
    for i, start in tqdm(enumerate(range(0, len(filtered_data), items_per_shard))):
        file_name = os.path.join(output_directory, f'shard_{i:04d}.h5')
        end = start + items_per_shard
        save_articles_to_hdf5(filtered_data[start:end], file_name, disable_tqdm=True)


RuntimeError: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.