### Import

In [1]:
import json
import random
from transformers import DistilBertTokenizer, DistilBertModel
from typing import Optional
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    Trainer,
    HfArgumentParser,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    MODEL_FOR_MASKED_LM_MAPPING,
    )
import torch
from datasets import load_dataset
from dataclasses import dataclass, field
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
import nltk
import torch
import os
import tqdm
from typing import List
import re


### Load data and remove grants not appropriate for text analysis

In [2]:
# load data
with open("data\\projects.json", "r") as f:
  dataPrep = json.load(f)

# remove unwanted grants
def remove_from_dict(
    key_name: str,
    values: List[str],
    dictionary: dict,
    use_regex: Optional[bool] = False,
):
    values = [value.casefold() for value in values]

    regular_comparator = lambda string: string.casefold() not in values
    regex_comparator = lambda string: not any(
        re.match(value, string.casefold()) for value in values
    )

    comparator = regex_comparator if use_regex else regular_comparator

    filtered_dict = {
        key: inner_dict
        for key, inner_dict in dictionary.items()
        if comparator(inner_dict[key_name])
    }

    return filtered_dict


def remove_from_dict_int(key_name: str, values: int, dictionary: dict):

    filtered_dict = {
        key: inner_dict
        for key, inner_dict in dictionary.items()
        if inner_dict[key_name] not in values
    }

    return filtered_dict

def remove_from_dict_small(key_name: str, value: int, dictionary: dict):

    filtered_dict = {
        key: inner_dict
        for key, inner_dict in dictionary.items()
        if len(inner_dict[key_name]) > value
    }

    return filtered_dict


dataPrep = remove_from_dict("title", ["dtp"], dataPrep, use_regex=True)
dataPrep = remove_from_dict("abstract", ["Summary: Equipment only", "Abstracts are not currently available in GtR", "As per advice from EPSRC, please see the attachments",
                        "Refer to ATLAS-UK", "Abstract from the telescope proposal", "Doctoral Training Partnerships: a range of postgraduate training", "See Je-S application", "As agreed with AHRC please see", "Awaiting Public Project Summary", "no public description", "The public description for this project has been requested but has not yet been received", "No abstract available", "Equipment only, agreed in relation to the previously issued GridPP4 grant", "Equipment only, agreed in relation to the previously issued GridPP5 grant"], dataPrep, use_regex=True)
dataPrep = remove_from_dict("abstract", ["none"], dataPrep, use_regex=False)

dataPrep = remove_from_dict_small("abstract", 75, dataPrep)

# remove anything with 0 funding value (mostly studentships)
dataPrep = remove_from_dict_int("funding", [0], dataPrep)

# remove duplicates in abstract/title. An argument could be made for keeping them in, but I think it would skew the results due to transferred grants etc
seen = set()
data = {}
for k, v in dataPrep.items():
    if (v['title'], v['abstract']) not in seen:
        data[k] = v
        seen.add((v['title'], v['abstract']))

df = pd.DataFrame(data).transpose()

# final bit of tidying. data also goes back to 1991 which will skew language (maybe), and too much data.
df = df.dropna()
df['start'] = pd.to_datetime(df['start'])
df['end'] = pd.to_datetime(df['end'])
df['funding'] = pd.to_numeric(df['funding'])

df = df[(df["start"] >"2013-01-01")]


49602

In [3]:
# I want to group up the unique project titles/abstracts
uniques = df.groupby(['title', 'abstract'], as_index=False).agg({'ref': 'sum',
                                                                         'funding': 'sum',
                                                                         'start': lambda x: x.iloc[0],
                                                                         'end': lambda x: x.iloc[0],
                                                                         'funder': lambda x: x.iloc[0],
                                                                         'category': lambda x: x.iloc[0]})

### Clean text + merge title/abstract

In [4]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')

#Clean abstracts
lang_map = {
    "\n": "|",
    "&quot;": '"',
    "&amp;": "&",
    "|": "",
}

def clean_up(text, lang_map=lang_map):
    text = BeautifulSoup(text, 'html.parser').get_text()
    for key in lang_map:
        text = text.replace(key, lang_map[key])
    return text.strip()

uniques['abstract'] = uniques['abstract'].apply(lambda x: clean_up(x))


In [5]:

#Embedding Functions
def mean_pooling(token_embeddings, attention_mask):
    """
    Effectively averages the embeddings of tokens across the vocabulary dimension
    to calculate the vocab-weighted latent representations (embeddings).

    :param token_embeddings: torch.float tensor of size (n_examples, n_vocab, n_latent)
    :param attention_mask: torch.byte tensor of size (n_examples, n_vocab)
    :return: torch.float tensor of size (n_examples, n_latent)
    """
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    out = sum_embeddings / sum_mask
    del input_mask_expanded
    del sum_embeddings
    del sum_mask
    return out.squeeze()

def embed(text, model, tokenizer):
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(torch.device("cuda"))
    embeddings = model(**tokens, output_hidden_states=True).hidden_states[-1]
    pooled = mean_pooling(embeddings, tokens['attention_mask']).detach().cpu()
    dim_size = len(text) if isinstance(text, list) else 1
    del tokens
    del embeddings
    return pooled.reshape(dim_size, -1)


In [6]:
uniques.to_json(path_or_buf="data\\metadata.json", orient='index')

In [7]:
with open("data\\metadata.json", "r") as f:
  data = json.load(f)

for entry in data:
  data[entry]['id'] = entry

# join abstract + title together
fields = ["abstract", "title"]
target = "abstract_title_merge"
for k, v in data.items():
    v[target] = " ".join(v[n] for n in fields)

dictlist = {"data": [{'abstract_title_merge': data[id]['abstract_title_merge']} for id in data.keys()]}

with open("data\\datalist.json", "w") as f:
  json.dump(dictlist, f)

data = load_dataset('json', data_files="data\\datalist.json", field="data")

Using custom data configuration default-c45134bc7162ebad


Downloading and preparing dataset json/default to C:\Users\n3hoo\.cache\huggingface\datasets\json\default-c45134bc7162ebad\0.0.0\a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to C:\Users\n3hoo\.cache\huggingface\datasets\json\default-c45134bc7162ebad\0.0.0\a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

### Import distilbert models + create training dataset

In [8]:
model = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [9]:
#tokenization
def tokenize_function(examples):
    return tokenizer(examples[text_column_name], return_special_tokens_mask=True, truncation=True)  # careful of NaNs in your metadata (Nick H), it will throw errors

data = data['train'].train_test_split(test_size=0.25)

text_column_name = "abstract_title_merge"
NUM_WORKERS = None
overwrite_cache = False
tokenized_datasets = data.map(
    tokenize_function,
    batched=True,
    num_proc=NUM_WORKERS,
    remove_columns=[text_column_name],
    load_from_cache_file=not overwrite_cache,
)

# data collation
pad_to_multiple_of_8 = True
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
    pad_to_multiple_of=8 if pad_to_multiple_of_8 else None,
)


  0%|          | 0/38 [00:00<?, ?ba/s]

  0%|          | 0/13 [00:00<?, ?ba/s]

In [10]:
# set up trainer

training_args = TrainingArguments(do_train=True,
                                  do_eval=True,
                                  per_device_train_batch_size=8,
                                  output_dir="tmp_trainer",
                                  num_train_epochs=10,
                                  learning_rate=5e-5,
                                  logging_steps=5000,
                                  eval_steps=5000,
                                  evaluation_strategy="steps")

train_dataset = tokenized_datasets['train']
eval_dataset = tokenized_datasets['test']

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

### Run model

!WARNING!... the next part takes a long time. Also, the temp folders for fine-tuning needs AT least 185Gb of space when pulling all GTR data as of September 2022.

In [11]:
# create data folder for data
if not os.path.exists('model'):
    os.makedirs('model')

trainer.train()
trainer.save_model("model\\distilbert_ukri")

The following columns in the training set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: special_tokens_mask.
***** Running training *****
  Num examples = 37201
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 46510


Step,Training Loss,Validation Loss


Saving model checkpoint to tmp_trainer\checkpoint-500
Configuration saved in tmp_trainer\checkpoint-500\config.json
Model weights saved in tmp_trainer\checkpoint-500\pytorch_model.bin
tokenizer config file saved in tmp_trainer\checkpoint-500\tokenizer_config.json
Special tokens file saved in tmp_trainer\checkpoint-500\special_tokens_map.json
Saving model checkpoint to tmp_trainer\checkpoint-1000
Configuration saved in tmp_trainer\checkpoint-1000\config.json
Model weights saved in tmp_trainer\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in tmp_trainer\checkpoint-1000\tokenizer_config.json
Special tokens file saved in tmp_trainer\checkpoint-1000\special_tokens_map.json
Saving model checkpoint to tmp_trainer\checkpoint-1500
Configuration saved in tmp_trainer\checkpoint-1500\config.json
Model weights saved in tmp_trainer\checkpoint-1500\pytorch_model.bin
tokenizer config file saved in tmp_trainer\checkpoint-1500\tokenizer_config.json
Special tokens file saved in tmp_traine

### Import fine-tunel model

In [12]:
tokenizer = AutoTokenizer.from_pretrained("model\\distilbert_ukri/")
model = AutoModelForMaskedLM.from_pretrained("model\\distilbert_ukri/")
model = model.to(torch.device('cuda'))

Didn't find file model\distilbert_ukri/added_tokens.json. We won't load it.
loading file model\distilbert_ukri/vocab.txt
loading file model\distilbert_ukri/tokenizer.json
loading file None
loading file model\distilbert_ukri/special_tokens_map.json
loading file model\distilbert_ukri/tokenizer_config.json
loading configuration file model\distilbert_ukri/config.json
Model config DistilBertConfig {
  "_name_or_path": "model\\distilbert_ukri/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "vocab_size": 30522
}

loading weights file model\distilbert_ukri/pytorch_

### Create embeddings

In [13]:
%%time

M = torch.zeros([1, 768])

for _batch in tqdm.tqdm(range(0, uniques.shape[0]), position=0, leave=True):  # force restart
    text = uniques.abstract.iloc[_batch]
    embeddings = embed(text, model, tokenizer)
    M = torch.cat((M, embeddings.detach().cpu()), dim=0)
    # torch.save(embeddings.half(), DIR + f"/batch_{last_max}.pt")
    del embeddings
    del text

torch.save(M[1:], "data\\distilbert_ukri_tensor.pt")

100%|██████████| 49602/49602 [18:14<00:00, 45.31it/s]


CPU times: total: 2h 24min 57s
Wall time: 18min 15s
