In [1]:
# Own Packages
from Masterarbeit_utils.model_utils_seq_class import load_and_modify_model, get_tokenizer

# Site-Packages
import dask.dataframe as dd
import torch
import psutil
import os
import sys
import pickle as pk
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display
import tensorflow as tf
%load_ext tensorboard


from transformers import AutoTokenizer, AutoConfig, OPTForSequenceClassification
from tokenizers.processors import TemplateProcessing
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from torch.utils.data import Dataset
sys.version, sys.executable

2023-08-28 10:57:11.974061: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-28 10:57:11.994395: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


('3.10.0 (default, Jul 12 2023, 08:49:30) [GCC 12.2.0]',
 '/home/worker/.pyenv/versions/3.10.0/bin/python')

In [2]:
choices = ['calculate all', 'ask for userinput', 'just calculate needed']
calculation_profile =  choices[2]
calculation_profile

'just calculate needed'

# Parameters

In [3]:
"""
The Paths to important folders have to be changed for your system.
"""

# Name of this experiment
model_name = 'gal_125_seq_3'

# This folder will be created and filled with txt.files for each sample after you run the Pytorch Dataset Notebook
dataset_folder = f'data/dataset_samples'

# The folder at which the model will be saved. This folder has to be created for your system 
model_folder = f'data/models/{model_name}'
os.makedirs(model_folder, exist_ok=True)


# Folder in which the tokenizer will be saved
tokenizer_folder = f'data/tokenizers/{model_name}'
os.makedirs(tokenizer_folder, exist_ok=True)

# Folder at which all pickle files are stored. This folder is fixed for this project and should not be changed
dump_dir = r'PK_DUMP'

# Model parameters 
'''
mini	125 M
base	1.3 B
standard	6.7 B
large	30 B
huge	120 B'''
base_model_name = 'mini'

# All new Torch-objects will be by default in this dtype
# if default_type = float16 fp16 must be False
default_dtype = torch.bfloat16
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_default_dtype(default_dtype)

# Default device on which the model will be loaded
default_device = 'cuda:0'

# Number of GPUs the model will be parallelised to 
num_gpus = 1
# If you change 'default_device' to 'cpu', make sure to set num_gpus to zero.
if default_device == 'cpu':
    num_gpus = 0

tensor_parallel = False
n_f_terms = None # Will be calculated

# Training parameters!
output_dir = model_folder
num_train_epochs = 4
per_device_train_batch_size = 25
per_device_eval_batch_size = 25
gradient_accumulation_steps = 10
save_strategy = "steps"
logging_strategy = "steps"
evaluation_strategy = "steps"
logging_steps = 10
evaluation_steps = 10000
save_steps = 8000
logging_first_step = True
logging_nan_inf_filter = False


learning_rate = 2e-4 
weight_decay = 0.0  # Parameter from first model run
seed = 42
resume_from_checkpoint = False

# This that could improve performance
dataloader_num_workers = 8
# sytem varables that must be set for the tokenizer
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
torch_compile = False
# V-Ram reduction only if default_dtype= float32
fp16=False
if default_dtype == torch.float16:
    fp16=False
bf16=False
tf32=True

# Creating the Tokenizer

In [4]:
if calculation_profile == choices[0]:
    i = 'y'
elif calculation_profile == choices[1]:  
    i = input("This creates a new tokenizer instance and saves it, if you want to proceed write y: ")
else:
    i = 'n'

if i != 'y' and os.path.isfile(f'{tokenizer_folder}/tokenizer.json'):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_folder)
    n_f_terms = len(tokenizer) - tokenizer.vocab_size
    print('Loadede Tokenizer from serialized instance!')    
    print(f'There are {n_f_terms} different F-Terms in the whole Dataset!')
    tokenizer.padding_side = 'left'
    
else:
    print('generating new tokenizer')
    # Loads a pretrained Tokenizer for the galactica model and adds an additional token for each F-Term
    tokenizer = get_tokenizer(dump_dir)
    
    # The Tokenizer contained initially 50000 Tokens which are stored as the vocab-size.
    # The vocab_size attribute is not updated when the additional tokens are added to the tokenizer
    n_f_terms = len(tokenizer) - tokenizer.vocab_size
    tokenizer.save_pretrained(tokenizer_folder)
    print(f'There are {n_f_terms} different F-Terms in the whole Dataset!')


#!!!! Important
tokenizer.padding_side = 'left'

Loadede Tokenizer from serialized instance!
There are 378166 different F-Terms in the whole Dataset!


# Creating the dataset

In [5]:
class JapPatDataset(Dataset):
    """Dataset containing Japanese patents and their F-Term classification.
    This variant is adapted for sequence classification and returns the f_terms as a list of labels"""
    def __init__(self, data_folder, tokenizer):
        """
        data_folder: path to folder containing the text samples
        tokenizer: tokenizer instance with added additional Tokens for F-Terms
        """
        super(Dataset).__init__()
        self.data_folder = data_folder
        # This has to be manually set to the ammount of files in the 'dataset_samples' folder. Calculating the number of files in this folder would take forever.
        # A to low number would lead to samples missing from the dataset.
        # A to high number would raise a FileNotFound error.
        self.l = len(os.listdir(data_folder))
        self.start_f_term_token = '<START F-TERMS>'
        self.tokenizer = tokenizer
        
    def __len__(self):
        return self.l
    
    def __getitem__(self, idx):
        try:
            with open(f'{self.data_folder}/{idx}.txt', 'r', encoding='utf-8') as f:
                item = f.read()
        except FileNotFoundError:
            raise FileNotFoundError

        #tokenizing the whole sample which will be later split into tokens and labels
        tokenized = self.tokenizer(item)
        tokenized.pop('token_type_ids')
        attention_mask = tokenized.pop('attention_mask')
        tokens = tokenized.pop('input_ids')
        
        tokens = torch.tensor(tokens)
        # separating the abstract text tokens from the f_terms
        input_ids = tokens[tokens < 50000].tolist()
        f_term_ids = tokens[tokens >= 50002] - 50000
        # rescaling the attention_mask to the shorter sequence
        attention_mask = attention_mask[:len(input_ids)]

        # creating a multi hot vector as the label 
        n_f_terms = len(self.tokenizer) - self.tokenizer.vocab_size
        labels = torch.zeros([n_f_terms])
        labels[f_term_ids] = 1
        return {'input_ids': input_ids,
                'attention_mask': attention_mask,  
                'labels':labels.tolist()}

In [6]:
train_dataset = JapPatDataset(f'{dataset_folder}/train', tokenizer)
validation_dataset = JapPatDataset(f'{dataset_folder}/validation', tokenizer)

##### Debugging remove later
#validation_dataset.l = 1500
#train_dataset.l = 1500

In [None]:
# The pretrained model is loaded from Huggingface.
# The token-embedding is expanded for all f-terms and the output embeddings is compleatly replaced by a F-Term classification head.
model = load_and_modify_model(base_model_name, default_dtype, tensor_parallel, num_gpus, n_f_terms, default_device)
print(f'The model interprets token-index {model.config.bos_token_id} as the beginning of a sequence and {model.config.eos_token_id} as the end')

In [7]:
###########################
# Loading the Model ##### Debugging remove later
###########################
device_map=None
max_memory = {}
if num_gpus > 0:
    # based on https://github.com/huggingface/accelerate/blob/5315290b55ea9babd95a281a27c51d87b89d7c85/src/accelerate/utils/modeling.py#L274
    for i in range(num_gpus):
        _ = torch.tensor([0], device=i)
    for i in range(num_gpus):
        max_memory[i] = torch.cuda.mem_get_info(i)[0]
    device_map = "auto"
max_memory["cpu"] = psutil.virtual_memory().available

model = OPTForSequenceClassification.from_pretrained(f'{model_folder}/checkpoint-3', torch_dtype=default_dtype, low_cpu_mem_usage=True,
                                               device_map=device_map, max_memory=max_memory)

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


In [None]:
# Input Text
text = 'Good morning Mr'
# Convert text to tokens
tokens  = tokenizer(text, return_tensors='pt').input_ids
print(f'Output of Tokenizer: {tokens}')

# creating one forward pass
tokens = tokens.to(default_device)
tokens = tokens[:,:-1]

labels = torch.zeros([2, 378166])
labels[:, 100] = 1
labels[:, 1233] = 1
print(labels.view(-1).shape, tokens.shape)
model_output = model(tokens)

In [None]:
f'The model has {model_output["logits"].shape[-1]} output-features, the tokenizer has {len(tokenizer)} tokens'

# Creating the Trainer Class by Subclassing from Huggingface-Trainer

In [11]:
"""
Subclassing the Huggingface Trainer class to use custome code to calculate the loss
The labels used for the loss are generated and the labels for the text tokens are set to -100 to ignore their loss,
because the modified model can't predict text-tokens
Also changing the log method to save the logs in a tensorboard format.
"""


def generate_log_function():
    """
    This function returns a logging-function that can be used as a method for the CustomTrainer class

    :log_dir:  path to folder in which the logs will be saved
    """
    writer = torch.utils.tensorboard.SummaryWriter()

    def log(self, logs) -> None:
        """
        Log `logs` on the various objects watching training.

        Subclass and override this method to inject custom behavior.

        Args:
            logs (`Dict[str, float]`):
                The values to log.
        """
        # logging is printed after each - logging step but no update on the screen
        if self.state.epoch is not None:
            logs["epoch"] = round(self.state.epoch, 2)

        output = {**logs, **{"step": self.state.global_step}}
        self.state.log_history.append(output)
        self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)
        for key, value in output.items():
            writer.add_scalar(key, value)
        writer.flush()
    return log


log_function = generate_log_function()
# Just beeing save and checking the right padding position of the tokenizer
tokenizer.padding_side = 'left'
cel = torch.nn.CrossEntropyLoss()

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs: bool=False, loss_fc=torch.nn.CrossEntropyLoss()):
        """
        model: model which should be trained.
        inputs: A padded batch of samples from the dataset.
        return_outputs: Indicates if the whole output of the model is returned or not.
        loss_fc: Instance of a loss function which should be used for the loss calculation
        """
        def cross_entropy_loss(logits, target):
            """
            This function applies softmax and than cross entropy loss to a logits, target pair
        
            A custom CrossEntropy-Loss function that can be applied to multi Label Problems
            """
            # Durch 1.0 Teilen erhält den Gradienten
            predicted_distribution = torch.nn.functional.softmax(logits, -1)/1.0
            # Scaling the predicted distribution to match the target distribution
                    
            n_targ = target.sum(-1).unsqueeze(-1)
            predicted_distribution *= n_targ
                    
            epsilon = 1e-10
            cross_entropy = target * torch.log(predicted_distribution + epsilon)
            cross_entropy = cross_entropy.sum(-1)
            return -torch.mean(cross_entropy)
            
        # Removing the token_type_ids because we don't need them
        try:
            inputs.pop('token_type_ids')
        except KeyError:
            pass
        
        # extracting and normalizing the labels
        labels = inputs.pop('labels')
        labels = torch.nn.functional.normalize(labels, p=1 , dim = -1)
        
        # Forward pass
        model.train()
        outputs = model(**inputs, output_attentions=False, output_hidden_states=False, return_dict=True)
        logits = outputs['logits']

        # calculating the loss


        # uncomment if you want to use customeloss
        #loss = cross_entropy_loss(logits, labels)
        # comment out if you want to use custome crossentorpy loss
        loss = cel(logits, labels)
        

        #message = f'loss: {loss.item()}'
        message = f'loss: {loss.item():.5f} max logit: {torch.max(logits, dim=-1).values[0]:.5f} min logit: {torch.min(logits, dim =-1).values[0]:.5f}, max_label: {torch.max(labels, dim=-1).values[0]:.5f}, min_label: {torch.max(labels, dim=-1).values[0]:.5f}' 
        sys.stdout.write('\r'+ message)
        
        return (loss, outputs) if return_outputs else loss

    def prediction_step(
        self,
        model: torch.nn.Module,
        inputs: dict,
        prediction_loss_only: bool,
        ignore_keys: list = None,
        ) -> tuple:
        #torch.cuda.empty_cache()
        model = model.eval()
        with torch.no_grad():
            with self.compute_loss_context_manager():
                loss, outputs = self.compute_loss(model, inputs, return_outputs=True)

        return loss, None, None

    def log(self, logs) -> None:
        """
        Log `logs` on the various objects watching training.

        Subclass and override this method to inject custom behavior.

        Args:
            logs (`Dict[str, float]`):
                The values to log.
        """
        log_function(self, logs)


# Training the Model

In [12]:
# The TrainingArguments class is a class which stores multiple parameters for the Custom-trainer of the model.

training_args = TrainingArguments(
    output_dir=output_dir,              
    num_train_epochs=num_train_epochs,             
    per_device_train_batch_size=per_device_train_batch_size,    # batch size per device during training
    per_device_eval_batch_size=per_device_eval_batch_size,
    save_strategy=save_strategy,
    evaluation_strategy=evaluation_strategy,
    eval_steps=evaluation_steps,
    gradient_accumulation_steps=gradient_accumulation_steps,
    logging_first_step=logging_first_step,
    logging_steps=logging_steps,
    save_steps=save_steps,
    logging_nan_inf_filter=logging_nan_inf_filter,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    seed=seed,
    dataloader_num_workers=dataloader_num_workers, 
    fp16=fp16,
    bf16=bf16,
    tf32=tf32,
    torch_compile=torch_compile
    #,
    #adam_beta1=adam_beta1,
    #adam_beta2=adam_beta2,
    #warmup_steps=warmup_steps

)
# Allow the training of the input embeddings
model.enable_input_require_grads()
model.score.to(default_device)
trainer = CustomTrainer(model=model,
                        args=training_args, 
                        train_dataset=train_dataset, 
                        eval_dataset=validation_dataset,
                        data_collator=DataCollatorWithPadding(tokenizer,
                                                              return_tensors='pt'))

#trainer.save_model(f'{output_dir}/checkpoint-0')
#train_results = trainer.train(resume_from_checkpoint=resume_from_checkpoint)

trainer.evaluate(validation_dataset)

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with 

loss: 12.81250 max logit: 2.46875 min logit: -2.51562, max_label: 1.00000, min_label: 1.00000

loss: 13.06250 max logit: 2.79688 min logit: -2.62500, max_label: 0.02942, min_label: 0.02942

{'eval_loss': 12.98929214477539,
 'eval_runtime': 408.3475,
 'eval_samples_per_second': 183.145,
 'eval_steps_per_second': 7.327}

In [None]:
def cross_entropy_loss(logits, target):
            """
            This function applies softmax and than cross entropy loss to a logits, target pair
        
            A custom CrossEntropy-Loss function that can be applied to multi Label Problems
            """
            # Durch 1.0 Teilen erhält den Gradienten
            predicted_distribution = torch.nn.functional.softmax(logits, -1)/1.0
            print('bevore', predicted_distribution, target)
            predicted_distribution = torch.nn.functional.normalize(predicted_distribution, p=1, dim=-1)
            # Scaling the predicted distribution to match the target distribution
            print('after', predicted_distribution)
            n_targ = target.sum(-1).unsqueeze(-1)
            predicted_distribution *= n_targ
                    
            epsilon = 1e-10
            cross_entropy = target * torch.log(predicted_distribution + epsilon)
            cross_entropy = cross_entropy.sum(-1)
            return -torch.mean(cross_entropy)


l = torch.tensor([[-1000, 10, -1000, 10]])/1.0
t = torch.tensor([[0, 1, 0, 1]])/1.0
t = torch.nn.functional.normalize(t, p=1, dim=-1)
cross_entropy_loss(l, t)