In [1]:
import datasets
import numpy as np

In [3]:
data_path = '../data/arxiv-metadata-oai-snapshot.json'

# Read the JSON snapshot, then keep only the columns the later cells use.
data = datasets.load_dataset("json", data_files=data_path)
data = data.select_columns(['title', 'abstract', 'categories'])

# Carve a train/test pair out of the loaded 'train' split. The float sizes are
# fractions of the full split, and the fixed seed keeps the split reproducible.
data = data['train'].train_test_split(
  train_size=0.125,
  test_size=0.1,
  seed=42,
)
data

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'abstract', 'categories'],
        num_rows: 340675
    })
    test: Dataset({
        features: ['title', 'abstract', 'categories'],
        num_rows: 272541
    })
})

In [4]:
def preprocess_categories(batch):
  """
  Reduce every category code of every example to its top-level archive.

  Each entry of batch['categories'] is a whitespace-separated string of
  category codes. Every code is cut at its first '.', so 'math.CO' becomes
  'math', while undotted codes such as 'hep-ph' are kept unchanged. Archives
  repeated within one example are collapsed, keeping first-seen order.

  Returns a dict holding the rewritten 'categories' column. The input batch
  is not modified.
  """
  return {
    'categories': [
      # Treat each code on its own: cutting the whole string at its first '.'
      # would drop every category listed after the first dotted one.
      # dict.fromkeys de-duplicates while preserving insertion order.
      ' '.join(dict.fromkeys(code.split('.', 1)[0] for code in categories_str.split()))
      for categories_str in batch['categories']
    ]
  }

# Apply the category clean-up to each split, batch by batch, using 4 worker processes.
data = data.map(
  preprocess_categories,
  batched=True,
  num_proc=4,
  keep_in_memory=True,
)

Map (num_proc=4):   0%|          | 0/340675 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/272541 [00:00<?, ? examples/s]

In [5]:
# Every whitespace-separated category token that occurs in the training split.
all_categories = [
  token
  for categories_str in data['train']['categories']
  for token in categories_str.split()
]
unique_categories = sorted(set(all_categories))

# Two-way lookup between a category name and its index in the sorted list.
id2class = dict(enumerate(unique_categories))
class2id = {name: index for index, name in id2class.items()}

print(f"Unique categories: {len(unique_categories)}")

Unique categories: 38


In [6]:
def preprocess_categories(batch):
  """
  Build the 'labels' and 'text' columns for one batch.

  Each category string is split on whitespace and looked up in the
  'class2id' dictionary, which must be available in the enclosing scope.
  The result is a multi-hot list of floats with one slot per known class;
  tokens missing from 'class2id' are ignored. 'text' is the title and the
  abstract joined by a blank line.
  """
  width = len(class2id)

  multi_hot = []
  for raw in batch['categories']:
    # Indices of the classes present in this example.
    hits = {class2id[token] for token in raw.split() if token in class2id}
    multi_hot.append([1.0 if position in hits else 0.0 for position in range(width)])

  joined = [
    title + '\n\n' + abstract
    for title, abstract in zip(batch['title'], batch['abstract'])
  ]
  return {'labels': multi_hot, 'text': joined}


# Add the 'labels' and 'text' columns to each split, using 4 worker processes.
data = data.map(
  preprocess_categories,
  batched=True,
  keep_in_memory=True,
  num_proc=4,
)

Map (num_proc=4):   0%|          | 0/340675 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/272541 [00:00<?, ? examples/s]

In [12]:
# Display the dataset summary to confirm that 'labels' and 'text' were added.
data

DatasetDict({
    train: Dataset({
        features: ['title', 'abstract', 'categories', 'labels', 'text'],
        num_rows: 340675
    })
    test: Dataset({
        features: ['title', 'abstract', 'categories', 'labels', 'text'],
        num_rows: 272541
    })
})

In [16]:
# Persist only the model inputs ('text') and targets ('labels').
(
  data
  .select_columns(['text', 'labels'])
  .save_to_disk('../data/preprocessed')
)

Saving the dataset (0/1 shards):   0%|          | 0/340675 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/272541 [00:00<?, ? examples/s]