In [1]:
# Own Packages
from Masterarbeit_utils.model_utils import get_tokenizer, load_and_modify_model, load_pretrained_Tokenizer

# Site-Packages
import dask.dataframe as dd
import torch
import psutil
import os
import sys
import pickle as pk
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, OPTForCausalLM
from tokenizers.processors import TemplateProcessing
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from torch.utils.data import Dataset
sys.version, sys.executable

2023-08-21 11:34:20.039738: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-21 11:34:20.066497: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


('3.10.0 (default, Jul 12 2023, 08:49:30) [GCC 12.2.0]',
 '/home/worker/.pyenv/versions/3.10.0/bin/python3.10')

In [2]:
# Controls how already serialized artefacts (e.g. the tokenizer) are treated further down:
#   'calculate all'         -> always recalculate
#   'ask for userinput'     -> ask interactively before recalculating
#   'just calculate needed' -> reuse what is already stored and only calculate what is missing
choices = ['calculate all', 'ask for userinput', 'just calculate needed']
calculation_profile =  choices[2]
calculation_profile

'just calculate needed'

In [3]:
"""
The Paths to important folders have to be changed for your system.
"""

# Name of this experiment
model_name = 'gal_125_aug_1'#
checkpoint = 148000

# This folder will be created and filled with txt.files for each sample after you run the Pytorch Dataset Notebook
dataset_folder = f'data/dataset_samples'

# The folder at which the model will be saved. This folder has to be created for your system 
model_folder = f'data/models/{model_name}'
os.makedirs(model_folder, exist_ok=True)


# Folder in which the tokenizer will be saved
tokenizer_folder = f'data/tokenizers/{model_name}'
os.makedirs(tokenizer_folder, exist_ok=True)

# Folder at which all pickle files are stored. This folder is fixed for this project and should not be changed
dump_dir = r'PK_DUMP'

# Model parameters 
'''
mini	125 M
base	1.3 B
standard	6.7 B
large	30 B
huge	120 B'''
# Size of the base model, one of the names listed above.
# NOTE(review): not read anywhere in the visible part of this notebook - confirm it is still needed.
base_model_name = 'mini'

# All new Torch-objects will be by default in this dtype
# if default_dtype = float16, fp16 must be False
# NOTE(review): 'fp16' presumably refers to the TrainingArguments flag of the training notebook - confirm.
default_dtype = torch.bfloat16
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_default_dtype(default_dtype)

# Default device on which the model will be loaded
default_device = 'cpu'

# Number of GPUs the model will be parallelised to
num_gpus = 1
# On the CPU no GPU is used, so num_gpus is forced to zero in that case.
if default_device == 'cpu':
    num_gpus = 0

# NOTE(review): not read anywhere in the visible part of this notebook - confirm it is still needed.
tensor_parallel = False


# Creating the Tokenizer

In [4]:
# Translate the calculation profile into a yes/no decision; 'y' forces the creation of a new tokenizer.
if calculation_profile == choices[1]:
    i = input("This creates a new tokenizer instance and saves it, if you want to proceed write y: ")
else:
    i = 'y' if calculation_profile == choices[0] else 'n'

if i == 'y' or not os.path.isfile(f'{tokenizer_folder}/tokenizer.json'):
    # Loads a pretrained Tokenizer for the galactica model, adds an additional token for each F-Term
    # and serializes the result so that it can be reused the next time.
    tokenizer = get_tokenizer(dump_dir)
    tokenizer.save_pretrained(tokenizer_folder)
else:
    # A serialized tokenizer exists and no recalculation was requested.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_folder)
    print('Loaded Tokenizer from serialized instance!')

# The Tokenizer contained initially 50000 Tokens which are stored as the vocab-size.
# The vocab_size attribute is not updated when the additional tokens are added to the tokenizer,
# so the difference to the total length is the number of F-Term tokens.
n_f_terms = len(tokenizer) - tokenizer.vocab_size
print(f'There are {n_f_terms} different F-Terms in the whole Dataset!')

Loaded Tokenizer from serialized instance!
There are 378166 different F-Terms in the whole Dataset!


# Loading The Model

In [5]:
# Memory budget per device which is handed over to from_pretrained.
max_memory = {}
device_map = None
if num_gpus > 0:
    # based on https://github.com/huggingface/accelerate/blob/5315290b55ea9babd95a281a27c51d87b89d7c85/src/accelerate/utils/modeling.py#L274
    # A tiny tensor is created on every GPU first (presumably to initialise the device),
    # only afterwards the free memory of each GPU is queried.
    for i in range(num_gpus):
        _ = torch.tensor([0], device=i)
    max_memory.update((i, torch.cuda.mem_get_info(i)[0]) for i in range(num_gpus))
    device_map = "auto"
# The currently available main memory is always added as the budget of the CPU.
max_memory["cpu"] = psutil.virtual_memory().available


# Load the weights of the selected checkpoint of this experiment.
model = OPTForCausalLM.from_pretrained(
    f'{model_folder}/checkpoint-{checkpoint}',
    torch_dtype=default_dtype,
    low_cpu_mem_usage=True,
    device_map=device_map,
    max_memory=max_memory,
)



# Loading the Datasets and the F-Term Descriptions

In [6]:
class JapPatDataset(Dataset):
    """
    Dataset containing Japanese patents and their F-Term classification.

    Every sample is stored as '<data_folder>/<idx>.txt' and is read and tokenized when it is accessed.
    """
    def __init__(self, data_folder, tokenizer):
        """
        data_folder: path to folder containing the text samples
        tokenizer: tokenizer instance with added additional Tokens for F-Terms
        """
        # The zero-argument form is needed here: 'super(Dataset)' only creates an unbound
        # super object and does not run the initializer of the parent class.
        super().__init__()
        self.data_folder = data_folder
        # The number of samples is derived from the number of entries in the folder minus one.
        # NOTE(review): the '- 1' presumably accounts for one entry that is not a sample - confirm.
        # A too low number leads to samples missing from the dataset.
        # A too high number raises a FileNotFoundError when the missing sample is accessed.
        # max() keeps the length valid (not negative) for an empty folder.
        self.l = max(len(os.listdir(data_folder)) - 1, 0)
        self.tokenizer = tokenizer

    def __len__(self):
        """Returns the number of samples of this dataset."""
        return self.l

    def __getitem__(self, idx):
        """
        Reads the sample with the index idx and returns it tokenized.

        idx: index of the sample, 0 <= idx < len(dataset)
        returns: output of the tokenizer without the entry 'token_type_ids'
        raises: IndexError if idx is outside of the dataset,
                FileNotFoundError (containing the path) if the file of a sample is missing
        """
        # An IndexError is what ends a plain 'for sample in dataset' loop. Without this check
        # such a loop runs beyond the last sample and stops with a FileNotFoundError.
        if not 0 <= idx < self.l:
            raise IndexError(f'sample index {idx} is out of range for a dataset with {self.l} samples')

        # A missing file is not caught here, so that the error keeps the path of the file.
        with open(f'{self.data_folder}/{idx}.txt', 'r', encoding='utf-8') as f:
            item = f.read()

        # Tokenizing the item
        # The Tokenizer will return a dict with the encoded text as 'input_ids',
        # a mask which shows the token types, which is not needed for our application,
        # and a mask for the attention mechanism as 'attention_mask'. The attention mask is needed
        # to indicate that the model should not attend to <pad> tokens.
        output = self.tokenizer(item)
        # The default prevents a KeyError for tokenizers that do not return token types.
        output.pop('token_type_ids', None)
        return output

In [7]:

# Each split is read from its own sub-folder of 'dataset_folder'.
train_dataset = JapPatDataset(f'{dataset_folder}/train', tokenizer)
validation_dataset = JapPatDataset(f'{dataset_folder}/validation', tokenizer)

# Loading a dict that contains the definitions of the f-terms
# NOTE(review): unpickling can execute arbitrary code - only load dumps that were created by this project.
with open(f'{dump_dir}/full_descriptions.pk', 'rb') as f:
    full_descriptions_dict = pk.load(f)

In [11]:
# Take one sample of the validation split and convert its token ids back to text.
test_sample = validation_dataset[10]
test_text= tokenizer.decode(test_sample['input_ids'])
# The text in front of '<START F-TERMS>' is the abstract, the text behind it contains the target F-Terms.
# The unpacking only works if the marker occurs exactly once in the decoded text.
test_abstract, f_term_text = test_text.split('<START F-TERMS>')
# Display both parts; the string of blanks only separates them visually in the output.
test_abstract, '                         ',f_term_text

('<s>To provide a vehicle riding-together support system that encourages a user who desires to ride in a vehicle to adjust desired time of boarding and a desired location of the boarding, and improve formability of matching between the user and a riding-together vehicle.SOLUTION: A vehicle riding-together support system, which supports allocation of a user and a riding-together vehicle, includes: a condition information acquisition unit that acquires, with respect to the riding-together vehicle, desired condition information including a desired boarding location and a desired boarding time zone desired by the user from a user terminal device; a reservation information acquisition unit that acquires and stores reservation reception records indicating a reserved boarding location, a reserved boarding date, and a reserved boarding time zone for each of allocated reservation requests transmitted from the user terminal device; and a difficulty calculation unit that calculates, based on the 

In [24]:
def generate(prompt, model, tokenizer, max_pred_tokens=10, decode=True, token_offset=50000):
    """
    Greedy generation of F-Term tokens for the text of a single abstract.

    prompt: text of the abstract, '<START F-TERMS>' is appended inside of this function
    model: model which is called as model(input_ids, attention_mask, ...) and returns a mapping with 'logits'
    tokenizer: tokenizer used to encode the prompt and to decode the predictions
    max_pred_tokens: maximum number of tokens which are generated
    decode: if True the predictions are returned as decoded text, otherwise as list of token ids
    token_offset: value added to a predicted index to get the matching id of the tokenizer
                  (the output of the model does not contain the 50000 text tokens)

    returns: tokenizer.decode(predictions) if decode is True, otherwise a list of 0-dim tensors.
             The generation stops after the eos token of the tokenizer was predicted
             or after max_pred_tokens predictions.
    """
    # adding the Start F-Term Token to the prompt to begin the prediction of F-Terms
    prompt += '<START F-TERMS>'

    # Converting the prompt to tokens
    eos_token_id = tokenizer.eos_token_id
    tokenized = tokenizer(prompt, return_tensors='pt')
    # The last token of the tokenized prompt is dropped, so that the sequence ends with the prompt itself.
    prompt_tokens = tokenized['input_ids'][:, :-1]
    attention_mask = tokenized['attention_mask'][:, :-1]

    # Generating the F-Terms
    predictions = []
    # No gradients are needed for inference; without no_grad every model call would build an autograd graph.
    with torch.no_grad():
        while len(predictions) < max_pred_tokens:
            # Model Call
            output = model(prompt_tokens, attention_mask, output_attentions=False, output_hidden_states=False, return_dict=True)
            logits = output['logits']
            # Greedy choice for the last position. The offset converts the index of the output
            # into the id of the tokenizer. It is added to a new tensor, so the token that is
            # fed back into the model no longer depends on an in-place change of a view.
            next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True) + token_offset
            current_token = next_token[0, 0]
            predictions.append(current_token)
            if current_token.item() == eos_token_id:
                break
            # Adding the prediction to the input sequence to predict the next token.
            prompt_tokens = torch.cat([prompt_tokens, next_token], -1)
            # The attention mask has to be extended as well
            attention_mask = torch.cat([attention_mask, attention_mask[:, -1:]], -1)

    if decode:
        return tokenizer.decode(predictions)
    return predictions
        
    
        
    
    

# Generate at most 10 tokens for the abstract of the sample selected above.
predictions = generate(test_abstract, model, tokenizer, 10)

# Print predicted and target F-Terms next to each other; zip stops at the shorter of both lists.
# NOTE(review): splitting at ',' assumes that the decoded F-Terms are separated by commas - confirm.
for p, t in zip(predictions.split(','), f_term_text.split(',')):
    print('Prediction', p, 'Target', t)

Prediction 5L049/CC42 Target 5L049/CC42
Prediction <END F-TERMS> Target <END F-TERMS>


In [25]:
def generation_accuracy(sample, model):
    """
    Generates F-Terms for the abstract of one tokenized sample and prints the generated
    token ids next to the token ids of the target F-Terms.

    sample: tokenized sample, the token ids are read from the key 'input_ids'
    model: model which is handed over to generate()

    Nothing is returned; an accuracy value is not calculated by this function.
    """
    # The first and the last token (<s> and <END F-TERMS>) are cut off before the sample is decoded
    inner_ids = sample['input_ids'][1:-1]
    abstract, f_terms_text = tokenizer.decode(inner_ids).split('<START F-TERMS>')
    # The target text is tokenized again; the first and the last id of the result are dropped as well
    target_ids = tokenizer(f_terms_text)['input_ids'][1:-1]

    # As many tokens are requested from the model as there are target tokens
    generated = generate(abstract, model, tokenizer, len(target_ids), decode=False)
    print(generated, target_ids)

# Print generated and target token ids for the samples of the validation split, one sample per line.
# 'i' is only used by the deactivated progress output below.
for i, test_batch in enumerate(validation_dataset):
    #print(i, end='\r')
    generation_accuracy(test_batch, model)

[tensor(280161)] [280161]
[tensor(60800), tensor(60250), tensor(60800)] [155058, 153792, 105462]
[tensor(63153)] [63153]
[tensor(169129)] [169129]
[tensor(69303), tensor(235553), tensor(308035)] [69303, 341263, 142136]
[tensor(63153)] [63153]
[tensor(51833), tensor(51833), tensor(51833), tensor(50001)] [108983, 51414, 53081, 134214, 51417, 51420]
[tensor(63153)] [212834]
[tensor(243157), tensor(210845), tensor(68055)] [150502, 150503, 172404]
[tensor(60193), tensor(67618), tensor(119354), tensor(50001)] [60193, 67618, 219924, 73794, 67621, 73795, 119354, 113993, 160767, 160768]
[tensor(293105)] [293105]
[tensor(74052)] [74052]
[tensor(89206)] [108382]
[tensor(55000)] [66673]
[tensor(212567), tensor(61641)] [85713, 81345]
[tensor(280161)] [280161]
[tensor(100471), tensor(100471)] [60207, 100471]
[tensor(139556), tensor(112353), tensor(50001)] [126494, 139556, 178896, 265603, 112353]
[tensor(96758), tensor(96758), tensor(96758), tensor(115131)] [85159, 53466, 72205, 53468]
[tensor(70887)

KeyboardInterrupt: 

In [None]:

# Qualitative check: print predicted and target F-Terms for the first samples of the validation split.
for i, test_sample in enumerate(validation_dataset):
    # Decode the sample and split it into the abstract and the target F-Terms.
    test_text= tokenizer.decode(test_sample['input_ids'])
    test_abstract, f_term_text = test_text.split('<START F-TERMS>')

    # Let the model generate at most 10 tokens for the abstract.
    predictions = generate(test_abstract, model, tokenizer, 10)

    # zip stops at the shorter list, surplus predictions or targets are not printed.
    for p, t in zip(predictions.split(','), f_term_text.split(',')):
        print(p, t)

    # The check runs after the sample was processed, so the samples 0 to 10 (11 in total) are shown.
    if i == 10:
        break

In [None]:
def classic_accurracy(batch, model, top_k=1, token_threshold=50000):
    """
    Classic prediction accuracy metric.
    This function should be applied to a batch of samples,
    which were tokenized by a tokenizer instance.
    This function returns the percentage accuracy metric as well as the total number of correct predictions
    and the total number of predictions in this batch.

    :batch: batch of samples from the validation dataset, a mapping which contains 'input_ids'
            and is handed over to the model as keyword arguments
    :model: model which should be tested (it is switched to eval mode by this function)
    :top_k: top k predictions which should be investigated for a correct result.
    :token_threshold: only labels with an id above this value are evaluated. The same value is added
                      to the predicted indices, because the output of the model does not contain
                      the 50000 text tokens.
    :returns: (accuracy in percent, number of correct predictions, number of predictions);
              (0.0, 0, 0) if the batch does not contain a label above the threshold
    """
    with torch.no_grad():
        model.eval()
        logits = model(**batch, output_hidden_states=False, return_dict=True)['logits']

    input_ids = batch['input_ids']
    # Removing first label, the prediction at position t belongs to the token at position t+1
    labels = input_ids[:, 1:]
    # removing last prediction, there is no label for it
    logits = logits[:, :-1]

    # Dropping all text predictions and labels, keeping just the predictions and labels for f-terms
    f_term_mask = labels > token_threshold
    labels = labels[f_term_mask]
    n_preds = labels.shape[0]  # number of values that are to be predicted
    if n_preds == 0:
        # Nothing to evaluate in this batch; avoids a division by zero below.
        return 0.0, 0, 0

    # Selecting the relevant positions first and using topk avoids sorting the whole vocabulary
    # for every position of the batch.
    f_term_logits = logits[f_term_mask]
    # top_k can not be larger than the number of output classes
    k = min(top_k, f_term_logits.shape[-1])
    if k < 1:
        return 0.0, 0, n_preds
    # the predictions are missing the text tokens so the predicted indices have to be increased
    preds = torch.topk(f_term_logits, k, dim=-1).indices + token_threshold

    # A label is counted as correct if it is among its k predictions
    n_correct = int((preds == labels.unsqueeze(dim=-1)).sum())
    accuracy = 100*n_correct/n_preds
    return accuracy, n_correct, n_preds

class Batch_DataLoader():
    """
    This class converts a dataset to an iterable dataloader, which loads padded batches of data.

    The samples are read in order; the last batch is smaller if the number of samples
    is not divisible by the batch size.
    """
    def __init__(self,
                 dataset,
                 batchsize=10,
                 datacollator=None):
        """
        dataset: object which supports len() and access by an integer index
        batchsize: number of samples per batch, has to be at least 1
        datacollator: callable which converts a list of samples to a batch. If None, a
                      DataCollatorWithPadding for the global tokenizer is created.
        """
        if batchsize < 1:
            raise ValueError(f'batchsize has to be at least 1, got {batchsize}')
        if datacollator is None:
            # Created here and not as default argument, otherwise the collator would be bound
            # to the tokenizer which existed when the class was defined.
            datacollator = DataCollatorWithPadding(tokenizer, return_tensors='pt')

        self.dataset = dataset
        self.batchsize = batchsize
        # Number of batches, rounded up so that a smaller last batch is counted
        # and no extra batch is counted if the division has no remainder.
        self.l = -(-len(dataset) // batchsize)
        self.datacollator = datacollator
        self.current = 0

    def __len__(self):
        """Returns the number of batches."""
        return self.l

    def __iter__(self):
        """Restarts the iteration at the first sample."""
        self.current = 0
        return self

    def __next__(self):
        """Returns the next collated batch, raises StopIteration after the last sample."""
        n_samples = len(self.dataset)
        if self.current >= n_samples:
            raise StopIteration
        # The end of the last batch is limited to the end of the dataset
        stop = min(self.current + self.batchsize, n_samples)
        batch = [self.dataset[i] for i in range(self.current, stop)]
        batch = self.datacollator(batch)
        self.current = stop
        return batch

In [None]:
def accuracy_on_dataset(dataset, model, top_k=1, batch_size=10):
    """
    Calculates the top-k accuracy of the model on a whole dataset.

    dataset: dataset which is split into batches by Batch_DataLoader
    model: model which should be tested
    top_k: number of predictions which are investigated for a correct result
    batch_size: number of samples per batch

    returns: (total number of predictions, total number of correct predictions)
    """
    loader = Batch_DataLoader(dataset, batch_size)
    n_pred = 0
    n_corr = 0
    for i, batch in enumerate(loader):
        acc, corr, pred = classic_accurracy(batch, model, top_k)
        n_pred += pred
        n_corr += corr
        # As long as nothing was predicted the total accuracy is reported as 0 instead of dividing by zero
        total_acc = 100*n_corr/n_pred if n_pred else 0.0
        print(f'batch_acc: {acc:.2f}%, total_acc: {total_acc:.2f}% batch {i}/{len(loader)}', end ='\r')
    # The progress output ends with '\r', this finishes the line for following output
    print()
    return n_pred, n_corr
    

# Top-5 accuracy on the complete validation split, calculated in batches of 100 samples.
n_pred, n_corr = accuracy_on_dataset(validation_dataset, model, top_k=5, batch_size=100)