This notebook is where the json file downloaded from Kaggle is split into training, testing and validation sets and where some initial exploration is done with the HuggingFace transformers library.

In [1]:
# Installing packages (required only once since using Google Drive to install and save packages https://colab.research.google.com/drive/1KpMDi9CjImudrzXsyTDAuRjtbahzIVjq#scrollTo=trAxoCkMLgF1)
# No longer viable since importing packages from Drive takes too long. Installing every time is quicker.
!pip install transformers datasets huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.13.0-py3-none-any.whl (485 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m28.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface_hub
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m62.8 MB/s[0m eta [36m0:00:0

In [4]:
#Logging into HuggingFace CLI
!git config --global credential.helper store
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
#Importing packages
import numpy as np
import pandas as pd
import re

from datasets import load_dataset

In [None]:
# Mounting Google Drive so data files can be saved
import os, sys
from google.colab import drive
drive.mount('/content/mnt')
nb_path = '/content/notebooks'
os.symlink('/content/mnt/My Drive/Colab Notebooks', nb_path)
sys.path.insert(0, nb_path)

In [None]:
# Load arxiv data json file from Google Drive
dataset = load_dataset("json", data_files='mnt/MyDrive/Colab Notebooks/arxiv-metadata-oai-snapshot.json')

In [None]:
dataset

In [None]:
# Sample entry
dataset["train"][0]

In [None]:
# Looking at all the possible categories - too many to list
all_categories = np.array(dataset["train"]["categories"])
all_unique_categories = np.unique(all_categories)

In [None]:
all_unique_categories.shape #70981

(70981,)

In [None]:
all_unique_categories

array(['acc-phys hep-ex physics.acc-ph', 'acc-phys physics.acc-ph',
       'adap-org astro-ph cond-mat nlin.AO', ...,
       'supr-con cond-mat.supr-con hep-th',
       'supr-con cond-mat.supr-con mtrl-th',
       'supr-con cond-mat.supr-con quant-ph'], dtype='<U125')

In [None]:
# Looking at all unique astronomy categories
all_astro_categories = np.array([s for s in all_unique_categories if re.search(r'astro-ph.', s)])

In [None]:
all_astro_categories

array(['adap-org astro-ph cond-mat nlin.AO',
       'astro-ph adap-org gr-qc nlin.AO',
       'astro-ph adap-org hep-ph nlin.AO physics.plasm-ph', ...,
       'stat.ML astro-ph.IM cs.LG physics.data-an stat.ME',
       'stat.OT astro-ph.IM physics.data-an',
       'stat.OT astro-ph.IM physics.data-an stat.AP'], dtype='<U115')

In [None]:
sub_cat_arr = []
for cat in all_astro_categories:
  for sub_cat in cat.split():
    if re.search(r'astro-ph\.', sub_cat):
      sub_cat_arr.append(sub_cat)
sub_cat_arr = np.array(sub_cat_arr)

In [None]:
unique_astro_categories = np.unique(sub_cat_arr)
unique_astro_categories

array(['astro-ph.CO', 'astro-ph.EP', 'astro-ph.GA', 'astro-ph.HE',
       'astro-ph.IM', 'astro-ph.SR'], dtype='<U11')

In [None]:
# Cosmology and Galaxies only
categories_of_interest = ['astro-ph.CO', 'astro-ph.GA']

In [None]:
co_ga_dataset = dataset.filter(lambda x: any(n in x["categories"] for n in categories_of_interest))

Filter:   0%|          | 0/2064797 [00:00<?, ? examples/s]

In [None]:
co_ga_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'report-no', 'categories', 'license', 'abstract', 'versions', 'update_date', 'authors_parsed'],
        num_rows: 99659
    })
})

In [None]:
# Removing all information except title and abstract
keys_to_remove = [key for key in co_ga_dataset['train'].features.keys() if key not in ['title', 'abstract']]

In [None]:
keys_to_remove

['id',
 'submitter',
 'authors',
 'comments',
 'journal-ref',
 'doi',
 'report-no',
 'categories',
 'license',
 'versions',
 'update_date',
 'authors_parsed']

In [None]:
subset_co_ga_dataset = co_ga_dataset['train'].remove_columns(keys_to_remove)

In [None]:
# Sample entry
subset_co_ga_dataset[0]

{'title': 'Gravitational particle production in braneworld cosmology',
 'abstract': '  Gravitational particle production in time variable metric of an expanding\nuniverse is efficient only when the Hubble parameter $H$ is not too small in\ncomparison with the particle mass. In standard cosmology, the huge value of the\nPlanck mass $M_{Pl}$ makes the mechanism phenomenologically irrelevant. On the\nother hand, in braneworld cosmology the expansion rate of the early universe\ncan be much faster and many weakly interacting particles can be abundantly\ncreated. Cosmological implications are discussed.\n'}

In [None]:
# Splitting dataset into training and test sets
astro_dataset_train_test_valid = subset_co_ga_dataset.train_test_split(train_size=0.8, seed=42)
astro_dataset_train_test_valid



DatasetDict({
    train: Dataset({
        features: ['title', 'abstract'],
        num_rows: 79727
    })
    test: Dataset({
        features: ['title', 'abstract'],
        num_rows: 19932
    })
})

In [None]:
# Splitting test set from above further into validation and test (50-50)
astro_dataset_test_valid = astro_dataset_train_test_valid['test'].train_test_split(train_size=0.5)
astro_dataset_test_valid

DatasetDict({
    train: Dataset({
        features: ['title', 'abstract'],
        num_rows: 9966
    })
    test: Dataset({
        features: ['title', 'abstract'],
        num_rows: 9966
    })
})

In [None]:
astro_dataset_train_test_valid.pop('test') # Removing test set from first 80-20 split

Dataset({
    features: ['title', 'abstract'],
    num_rows: 19932
})

In [None]:
astro_dataset_train_test_valid

DatasetDict({
    train: Dataset({
        features: ['title', 'abstract'],
        num_rows: 79727
    })
})

In [None]:
# Adding final test and validation sets to original train split so that final ratio is 80-10-10
astro_dataset_train_test_valid["validation"] = astro_dataset_test_valid["train"]
astro_dataset_train_test_valid["test"] = astro_dataset_test_valid["test"]

In [None]:
astro_dataset_train_test_valid

DatasetDict({
    train: Dataset({
        features: ['title', 'abstract'],
        num_rows: 79727
    })
    validation: Dataset({
        features: ['title', 'abstract'],
        num_rows: 9966
    })
    test: Dataset({
        features: ['title', 'abstract'],
        num_rows: 9966
    })
})

In [None]:
# Saving dataset and json files to disk (google drive)
astro_dataset_train_test_valid.save_to_disk("gdrive/MyDrive/Colab Notebooks/astro_co_ga")
astro_dataset_train_test_valid["train"].to_json("gdrive/MyDrive/Colab Notebooks/astro_co_ga_train.json")
astro_dataset_train_test_valid["test"].to_json("gdrive/MyDrive/Colab Notebooks/astro_co_ga_test.json")
astro_dataset_train_test_valid["validation"].to_json("gdrive/MyDrive/Colab Notebooks/astro_co_ga_validation.json")

Saving the dataset (0/1 shards):   0%|          | 0/79727 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9966 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/9966 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/80 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

14158650

In [12]:
# Playing around to see how model, tokenizer and output works
from transformers import AutoTokenizer, T5ForConditionalGeneration

In [13]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [14]:
inputs = tokenizer("summarize: Halo based models that begin with the assumption that\
galaxies form and live inside dark mater halos provided the\
breakthrough that made it possible to quantitatively model\
galaxy clustering on small scales. These models rely on the\
fact that Spathe statistical properties of dark matter halos are\
easy to predict with collision-less N-body simulations where\
the only important physical process is gravity. Halo models then adopt a parameterisation to connect galaxies to halos, thus bypassing the need to understand galaxy formation\
physics. In the halo model framework, there is a convenient\
conceptual and operational division between the roles of cosmology and galaxy formation: cosmology dictates the dark\
matter halo distribution while galaxy formation determines\
how exactly galaxies occupy halos. This division is not perfect since gas physics can affect the properties of halos however, this is a second order effect.", return_tensors="pt")

In [16]:
inputs.input_ids

tensor([[21603,    10,  5648,    32,     3,   390,  2250,    24,  1731,    28,
             8, 20662,    24,  6191,  8606,    15,     7,   607,    11,   619,
          1096,  2164,  6928,    49,  4244,  2298,   937,     8, 14577, 11258,
            24,   263,    34,   487,    12, 18906,   120,   825,  6191,     9,
           226,    63,  9068,    53,    30,   422,  2643,     7,     5,   506,
          2250,     3,  4610,    30,     8,  8717,    24,  5641,   532, 11775,
          2605,    13,  2164,  1052,  4244,  2298,    33, 20905,    12,  9689,
            28, 16345,    18,   924,   445,    18,  6965, 11108,     7,   213,
           532,   163,   359,  1722,   433,    19, 18076,     5,  5648,    32,
          2250,   258,  4693,     3,     9, 15577,  2121,    12,  1979,  7466,
          8606,    15,     7,    12,  4244,  2298,     6,  2932, 20720,    53,
             8,   174,    12,   734, 24856,  3239, 11599,     5,    86,     8,
             3,  3828,    32,   825,  4732,     6,  

In [17]:
tokenizer.convert_ids_to_tokens(inputs.input_ids[0])

['▁summarize',
 ':',
 '▁Hal',
 'o',
 '▁',
 'based',
 '▁models',
 '▁that',
 '▁begin',
 '▁with',
 '▁the',
 '▁assumption',
 '▁that',
 'gal',
 'axi',
 'e',
 's',
 '▁form',
 '▁and',
 '▁live',
 '▁inside',
 '▁dark',
 '▁mat',
 'er',
 '▁ha',
 'los',
 '▁provided',
 '▁the',
 'break',
 'through',
 '▁that',
 '▁made',
 '▁it',
 '▁possible',
 '▁to',
 '▁quantitative',
 'ly',
 '▁model',
 'gal',
 'a',
 'x',
 'y',
 '▁cluster',
 'ing',
 '▁on',
 '▁small',
 '▁scale',
 's',
 '.',
 '▁These',
 '▁models',
 '▁',
 'rely',
 '▁on',
 '▁the',
 'fact',
 '▁that',
 '▁Spa',
 'the',
 '▁statistical',
 '▁properties',
 '▁of',
 '▁dark',
 '▁matter',
 '▁ha',
 'los',
 '▁are',
 'easy',
 '▁to',
 '▁predict',
 '▁with',
 '▁collision',
 '-',
 'less',
 '▁N',
 '-',
 'body',
 '▁simulation',
 's',
 '▁where',
 'the',
 '▁only',
 '▁important',
 '▁physical',
 '▁process',
 '▁is',
 '▁gravity',
 '.',
 '▁Hal',
 'o',
 '▁models',
 '▁then',
 '▁adopt',
 '▁',
 'a',
 '▁parameter',
 'isation',
 '▁to',
 '▁connect',
 '▁gal',
 'axi',
 'e',
 's',
 '▁to',
 '▁

In [21]:
outputs = model.generate(inputs.input_ids)



In [22]:
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

halo models rely on the fact that Spathe statistical properties of dark matter halos
