In [1]:
# Own Packages
from Masterarbeit_utils.model_utils import get_tokenizer, load_and_modify_model, load_pretrained_Tokenizer

# Site-Packages
import dask.dataframe as dd
import torch
import psutil
import os
import sys
import pickle as pk
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, OPTForCausalLM
from tokenizers.processors import TemplateProcessing
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from torch.utils.data import Dataset
# Display the Python version and the interpreter path of the running kernel,
# so the environment used for this run is recorded together with the notebook.
sys.version, sys.executable

  from .autonotebook import tqdm as notebook_tqdm


('3.10.0 (default, Jul 12 2023, 08:49:30) [GCC 12.2.0]',
 '/home/worker/.pyenv/versions/3.10.0/bin/python')

# Parameters

In [2]:
"""
The Paths to important folders have to be changed for your system.
"""

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------

# Filled with one .txt file per sample after the "Pytorch Dataset" notebook has been run.
dataset_folder = 'data/dataset_samples'

# Folder in which the model is saved (created here if it does not exist yet).
model_folder = 'data/models/gal_125_1'
os.makedirs(model_folder, exist_ok=True)

# Folder in which the training progress is logged.
# Derived from `model_folder` so that both stay in sync when the model folder is changed.
log_folder = f'{model_folder}/logs'
os.makedirs(log_folder, exist_ok=True)

# Folder in which all pickle files are stored. Fixed for this project; should not be changed.
dump_dir = r'PK_DUMP'

# ---------------------------------------------------------------------------
# Model parameters
# ---------------------------------------------------------------------------

# Size key of the pretrained base model (name -> parameter count, as noted by the author):
#   mini      125 M
#   base      1.3 B
#   standard  6.7 B
#   large     30 B
#   huge      120 B
base_model_name = 'mini'

# All new torch objects are created in this dtype by default.
# If default_dtype is torch.float16, fp16 (below) must stay False.
default_dtype = torch.bfloat16
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_default_dtype(default_dtype)

# Default device on which the model will be loaded.
default_device = 'cuda:0'

# Number of GPUs the model will be parallelised to.
num_gpus = 1
# Running on the CPU implies that no GPU is used at all.
if default_device == 'cpu':
    num_gpus = 0

tensor_parallel = False
n_f_terms = None  # Calculated in the tokenizer cell.

# ---------------------------------------------------------------------------
# Training parameters
# ---------------------------------------------------------------------------
output_dir = model_folder
num_train_epochs = 3
per_device_train_batch_size = 15
per_device_eval_batch_size = 15
save_strategy = "epoch"
logging_strategy = "steps"
evaluation_strategy = "epoch"
logging_steps = 1000
gradient_accumulation_steps = 5
logging_first_step = True
logging_nan_inf_filter = True
learning_rate = 2e-4
weight_decay = 0.0
seed = 42

# Settings that could improve performance.
dataloader_num_workers = 4
# System variable that must be set for the tokenizer.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
torch_compile = False

# Mixed-precision flags handed to the TrainingArguments.
# fp16: VRAM reduction, per the original notes only intended for default_dtype == float32.
fp16 = False
# Guard: keeps fp16 switched off for a float16 default dtype, even if the line above is edited.
if default_dtype == torch.float16:
    fp16 = False
bf16 = False
tf32 = True

# Creating the Tokenizer

In [3]:
# Loads a pretrained tokenizer for the Galactica model and adds one additional token per F-Term
# (as described by the original notes; the work is done inside the project helper `get_tokenizer`).
tokenizer = get_tokenizer(dump_dir)

# According to the original notes the tokenizer initially contained 50000 tokens, which is what
# `vocab_size` reports; that attribute is not updated when additional tokens are added.
# The difference to `len(tokenizer)` is therefore taken as the number of added F-Term tokens.
n_f_terms = len(tokenizer) - tokenizer.vocab_size
print(f'There are {n_f_terms} different F-Terms in the whole Dataset!')

There are 378164 different F-Terms in the whole Dataset!


# Creating the dataset

In [4]:
# Samples in train 6385601
# Samples in val 1596401

class JapPatDataset(Dataset):
    """Dataset containing Japanese patents and their F-Term classification.

    Every sample is read from its own text file ``<idx>.txt`` inside
    ``data_folder`` and tokenized when it is accessed.
    """

    def __init__(self, data_folder, tokenizer):
        """
        data_folder: path to folder containing the text samples
        tokenizer: tokenizer instance with added additional tokens for F-Terms
        """
        # Zero-argument super() initialises the Dataset base class. The former
        # `super(Dataset).__init__()` only created an unbound super object and
        # never reached the base-class initialiser.
        super().__init__()
        self.data_folder = data_folder
        # The dataset length is the number of entries in the folder, listed once here.
        # __getitem__ opens '<idx>.txt', so this presumably relies on the folder holding
        # exactly the files 0.txt ... (l-1).txt -- verify against the notebook that
        # writes the samples.
        self.l = len(os.listdir(data_folder))
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.l

    def __getitem__(self, idx):
        """Read sample ``idx`` from disk and return its tokenized form.

        Raises:
            FileNotFoundError: if ``<data_folder>/<idx>.txt`` does not exist. The
                error is not re-wrapped, so it still names the missing file.
        """
        with open(f'{self.data_folder}/{idx}.txt', 'r', encoding='utf-8') as f:
            item = f.read()

        # The tokenizer returns a dict-like object with the encoded text as 'input_ids'
        # and the mask for the attention mechanism as 'attention_mask'. The attention
        # mask indicates that the model should not attend to <pad> tokens.
        output = self.tokenizer(item)
        # The token-type mask is not needed for this application. A default is passed
        # so that tokenizers which do not emit the key do not raise a KeyError.
        output.pop('token_type_ids', None)
        return output

In [5]:
# One dataset per split; each split lives in its own sub-folder of `dataset_folder`.
train_dataset = JapPatDataset(
    data_folder=f'{dataset_folder}/train',
    tokenizer=tokenizer,
)
validation_dataset = JapPatDataset(
    data_folder=f'{dataset_folder}/validation',
    tokenizer=tokenizer,
)

In [6]:
class JapPatDataset(Dataset):
    """Size-limited variant of the dataset of Japanese patents and their F-Term classification.

    NOTE: this definition shadows the `JapPatDataset` class defined in the earlier
    cell. Instead of counting the files in the folder it exposes a fixed number of
    samples (100 by default).
    """

    def __init__(self, data_folder, tokenizer, n_samples=100):
        """
        data_folder: path to folder containing the text samples
        tokenizer: tokenizer instance with added additional tokens for F-Terms
        n_samples: number of samples the dataset reports as its length.
            A too low number leaves samples out of the dataset; a too high number
            raises a FileNotFoundError once a missing index is accessed.
        """
        # Zero-argument super() initialises the Dataset base class. The former
        # `super(Dataset).__init__()` only created an unbound super object and
        # never reached the base-class initialiser.
        super().__init__()
        self.data_folder = data_folder
        self.l = n_samples
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the configured number of samples."""
        return self.l

    def __getitem__(self, idx):
        """Read sample ``idx`` from disk and return its tokenized form.

        Raises:
            FileNotFoundError: if ``<data_folder>/<idx>.txt`` does not exist. The
                error is not re-wrapped, so it still names the missing file.
        """
        with open(f'{self.data_folder}/{idx}.txt', 'r', encoding='utf-8') as f:
            item = f.read()

        # The tokenizer returns a dict-like object with the encoded text as 'input_ids'
        # and the mask for the attention mechanism as 'attention_mask'. The attention
        # mask indicates that the model should not attend to <pad> tokens.
        output = self.tokenizer(item)
        # The token-type mask is not needed for this application. A default is passed
        # so that tokenizers which do not emit the key do not raise a KeyError.
        output.pop('token_type_ids', None)
        return output

In [7]:
# Rebuild the validation split with the `JapPatDataset` class defined directly above,
# replacing the previously created `validation_dataset`.
validation_dataset = JapPatDataset(
    data_folder=f'{dataset_folder}/validation',
    tokenizer=tokenizer,
)

In [8]:
# Load the pretrained model through the project helper `load_and_modify_model`.
# According to the original notes, the model comes from Hugging Face, its token embedding is
# expanded for all F-Terms and its output embedding is completely replaced by an F-Term
# classification head -- the helper itself is not visible in this notebook.
model = load_and_modify_model(base_model_name, default_dtype, tensor_parallel, num_gpus, n_f_terms, default_device)
# Report which token indices the model configuration uses as sequence start and end.
print(f'The model interprets token-index {model.config.bos_token_id} as the beginning of a sequence and {model.config.eos_token_id} as the end')

cuda:0
The model interprets token-index 0 as the beginning of a sequence and 2 as the end


# Quick generation check of the loaded model on a short prompt.
text = 'Good morning Mr.'

# Encode the prompt; only the token ids are needed for generation.
tokens = tokenizer(text, return_tensors='pt').input_ids
print(f'Output of Tokenizer: {tokens}')

# Let the model continue the prompt (at most 30 tokens in total) and decode the
# first (and only) generated sequence back into text.
out = tokenizer.decode(
    model.generate(tokens.to(default_device), max_length=30)[0]
)
out

# Creating the Trainer Class by Subclassing from Huggingface-Trainer

In [9]:
# Subclass of the Hugging Face Trainer that uses custom code to calculate the loss.
# The labels are generated from the input ids; the labels of text tokens are set to -100 so
# that their loss is ignored, because the modified model can't predict text tokens.
class CustomTrainer(Trainer):
    """Trainer whose loss is computed on F-Term tokens only."""

    # Offset between token ids and output indices: per the notebook, the first 50000 token
    # ids are text tokens and the maximum token id is 50000 higher than the maximum output index.
    text_vocab_size = 50000
    # Label value used to exclude a position from the loss.
    ignore_index = -100

    def _forward_with_labels(self, model, inputs):
        """Build the F-Term labels from `inputs` and run the model once with them."""
        # The token_type_ids are not needed by the model.
        inputs.pop('token_type_ids', None)
        # The subtraction creates a new tensor, so `input_ids` itself stays untouched.
        labels = inputs['input_ids'] - self.text_vocab_size
        # Text tokens end up negative after the shift and are excluded from the loss.
        labels[labels < 0] = self.ignore_index
        # The output is a dict of 'loss', 'logits' and 'past_key_values'.
        return model(**inputs, output_attentions=False, output_hidden_states=False, return_dict=True, labels=labels)

    def compute_loss(self, model, inputs, return_outputs: bool=False):
        """
        model: model which should be trained.
        inputs: A padded batch of samples from the dataset.
        return_outputs: Indicates if the whole output of the model is returned or not.

        Returns the loss, or the tuple (loss, outputs) if `return_outputs` is True.
        """
        outputs = self._forward_with_labels(model, inputs)
        loss = outputs['loss']

        # Single-line progress display, overwritten in place via the carriage return.
        message = f'loss: {loss.item()}'
        sys.stdout.write('\r'+ message)
        return (loss, outputs) if return_outputs else loss

    def prediction_step(
        self,
        model: torch.nn.Module,
        inputs: dict,
        prediction_loss_only: bool,
        ignore_keys: list = None,
    ) -> tuple:
        """
        Perform an evaluation step on `model` using `inputs`.

        This class previously defined `prediction_step` twice; the second definition
        shadowed the first and ran the forward pass with gradient tracking enabled.
        There is now one definition, and it runs under `torch.no_grad()`.

        Args:
            model (`nn.Module`):
                The model to evaluate.
            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                A padded batch of samples. The labels are derived from its `input_ids`.
            prediction_loss_only (`bool`):
                Accepted for interface compatibility; only the loss is ever returned.
            ignore_keys (`List[str]`, *optional*):
                Accepted for interface compatibility; unused.

        Return:
            Tuple[torch.Tensor, None, None]: the loss; logits and labels are always None.
        """
        # The stock prediction_step prepares the batch (e.g. device placement) before the
        # forward pass. This override has to do the same.
        inputs = self._prepare_inputs(inputs)
        # No autograd graph is needed during evaluation.
        with torch.no_grad():
            outputs = self._forward_with_labels(model, inputs)
        loss = outputs['loss'].detach()

        # Single-line progress display, overwritten in place via the carriage return.
        message = f'eval_loss: {loss.item()}'
        sys.stdout.write('\r'+ message)
        return loss, None, None
        

# Training the Model

In [None]:
# TrainingArguments bundles the parameters for the custom trainer of the model.
# All values come from the parameter cell at the top of the notebook.
training_args = TrainingArguments(
    output_dir=output_dir,
    logging_dir=log_folder,

    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,    # batch size per device during training
    per_device_eval_batch_size=per_device_eval_batch_size,
    save_strategy=save_strategy,
    evaluation_strategy=evaluation_strategy,
    # These two were configured in the parameter cell but never handed over, so the
    # trainer silently fell back to the library defaults.
    logging_strategy=logging_strategy,
    logging_steps=logging_steps,
    gradient_accumulation_steps=gradient_accumulation_steps,
    logging_first_step=logging_first_step,
    logging_nan_inf_filter=logging_nan_inf_filter,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    seed=seed,
    dataloader_num_workers=dataloader_num_workers,
    fp16=fp16,
    bf16=bf16,
    tf32=tf32,
    torch_compile=torch_compile,
)

# Pads every batch to the length of its longest sample and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer, return_tensors='pt')

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator,
)

train_results = trainer.train()


You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


loss: 13.0625

Epoch,Training Loss,Validation Loss


loss: 8.56255

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



loss: 3.796875

In [None]:
# Display the value returned by `trainer.train()` in the training cell.
train_results