# Imports

In [1]:
# Own Packages
from Masterarbeit_utils.model_utils import get_tokenizer, load_and_modify_model, load_pretrained_Tokenizer

# Site-Packages
import torch
import psutil
import os
import pickle as pk

from transformers import AutoTokenizer, OPTForCausalLM
from tokenizers.processors import TemplateProcessing
from transformers import AutoTokenizer, OPTForCausalLM, OPTForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from torch.utils.data import Dataset

2023-07-03 14:40:27.669118: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Parameters

In [6]:
"""
The Paths to important folders have to be changed for your system.
"""

# This folder will be created and filled with txt.files for each sample after you run the Pytorch Dataset Notebook
dataset_folder = f'data/dataset_samples'

# The folder at which the model will be saved. This folder has to be created for your system 
model_folder = f'data/models/gal_125_1'
os.makedirs(model_folder, exist_ok=True)

# Folder at which all pickle files are stored. This folder is fixed for this project and should not be changed
dump_dir = r'PK_DUMP'

# Model parameters 
'''
mini	125 M
base	1.3 B
standard	6.7 B
large	30 B
huge	120 B'''
base_model_name = 'mini'

# All new Torch-objects will be by default in this dtype
default_dtype = torch.float16
torch.set_default_dtype(default_dtype)

# Default device on which the model will be loaded
default_device = 'cuda:0'

# Number of GPUs the model will be parallelised to 
num_gpus = 1
# If you change 'default_device' to 'cpu', make sure to set num_gpus to zero.
if default_device == 'cpu':
    num_gpus = 0

tensor_parallel = False
n_f_terms = None # Will be calculated

# Training parameters!

output_dir=model_folder
num_train_epochs=10
per_device_train_batch_size=2
save_strategy="epoch"
logging_strategy="epoch"
evaluation_strategy="epoch"
learning_rate=2e-4
weight_decay=0.01

# Creating the Tokenizer

In [7]:
# Loads a pretrained Tokenizer for the galactica model and adds an additional token for each F-Term
tokenizer = get_tokenizer(dump_dir)

# The Tokenizer contained initially 50000 Tokens which are stored as the vocab-size.
# The vocab_size attribute is not updated when the additional tokens are added to the tokenizer
n_f_terms = len(tokenizer) - tokenizer.vocab_size
print(f'There are {n_f_terms} different F-Terms in the whole Dataset!')

There are 391452 different F-Terms in the whole Dataset!


# Creating the dataset

In [8]:
# Samples in train 6385601
# Samples in val 1596401

class JapPatDataset(Dataset):
    """Dataset containing Japanese patents and their F-Term classification"""
    def __init__(self, data_folder, tokenizer):
        """
        data_folder: path to folder containing the text samples
        tokenizer: tokenizer instance with added additional Tokens for F-Terms
        """
        super(Dataset).__init__()
        self.data_folder = data_folder
        # This has to be manually set to the ammount of files in the 'dataset_samples' folder. Calculating the number of files in this folder would take forever.
        # A to low number would lead to samples missing from the dataset.
        # A to high number would raise a FileNotFound error.
        self.l = 7984000 
        self.tokenizer = tokenizer
        
    def __len__(self):
        return self.l
    
    def __getitem__(self, idx):
        try:
            with open(f'{self.data_folder}/{idx}.txt', 'r', encoding='utf-8') as f:
                item = f.read()
        except FileNotFoundError:
            raise FileNotFoundError
        
        # Tokenizing the item 
        # The Tokenizer will return a dict with the encoded text as 'input_ids', 
        # a mask which shows the tokens types this will not be needed for our applications
        # and a mask for the attention mechanism as 'attention_mask' The attention mask will be needed to indicate, that the 
        # model should not attend to <pad> tokens.
        return self.tokenizer(item)  

In [10]:
dataset = JapPatDataset(dataset_folder, tokenizer)
print(f'Example: {dataset[100]}')

Example: {'input_ids': [0, 70, 6314, 48486, 48, 2294, 1008, 286, 921, 353, 559, 345, 281, 1308, 1596, 404, 7445, 315, 281, 35044, 2481, 281, 20198, 312, 6862, 286, 20198, 321, 1262, 1336, 748, 281, 5675, 2194, 15508, 36, 41024, 2383, 21314, 48, 68, 305, 632, 1487, 10180, 243, 45, 343, 13542, 301, 286, 5675, 2194, 15508, 243, 40, 6068, 3039, 301, 281, 1470, 243, 44, 34, 835, 18774, 377, 286, 3577, 1311, 299, 286, 5675, 2194, 15508, 243, 40, 36, 381, 35044, 243, 42, 343, 7884, 2481, 286, 20198, 243, 39, 312, 286, 1336, 343, 7312, 1262, 748, 388, 299, 286, 5675, 2194, 15508, 243, 40, 34, 891, 286, 10180, 243, 45, 343, 6165, 18774, 36, 2263, 34, 286, 5675, 2194, 15508, 243, 40, 4159, 18214, 7270, 34, 891, 491, 6165, 17426, 2644, 36, 2093, 34, 286, 20198, 243, 39, 6165, 17426, 2644, 363, 286, 10180, 243, 45, 34, 891, 1308, 35, 9671, 2656, 2831, 343, 1616, 36, 50, 387, 401, 35, 11689, 2022, 52, 53060, 53061, 53062, 53063, 53064, 53065, 53066, 53067, 53068, 53069, 53070, 53071, 53072, 53073, 

# Creating the Model

In [11]:
from transformers.utils import replace_return_docstrings
from transformers.modeling_outputs import CausalLMOutputWithPast
_CONFIG_FOR_DOC = "OPTConfig"
from typing import List, Optional, Tuple, Union
from transformers.generation_logits_process import (
    EncoderNoRepeatNGramLogitsProcessor,
    ExponentialDecayLengthPenalty,
    ForcedBOSTokenLogitsProcessor,
    ForcedEOSTokenLogitsProcessor,
    ForceTokensLogitsProcessor,
    HammingDiversityLogitsProcessor,
    InfNanRemoveLogitsProcessor,
    LogitNormalization,
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    NoBadWordsLogitsProcessor,
    NoRepeatNGramLogitsProcessor,
    PrefixConstrainedLogitsProcessor,
    RepetitionPenaltyLogitsProcessor,
    SuppressTokensAtBeginLogitsProcessor,
    SuppressTokensLogitsProcessor,
    TemperatureLogitsWarper,
    TopKLogitsWarper,
    TopPLogitsWarper,
    TypicalLogitsWarper,
)


class CustomOPTForCausalLM(OPTForCausalLM):
    def greedy_search(
        self,
        input_ids: torch.LongTensor,
        logits_processor: Optional[LogitsProcessorList] = None,
        stopping_criteria: Optional[StoppingCriteriaList] = None,
        max_length: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        eos_token_id: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_scores: Optional[bool] = None,
        return_dict_in_generate: Optional[bool] = None,
        synced_gpus: Optional[bool] = False,
        **model_kwargs,) -> Union[GreedySearchOutput, torch.LongTensor]:
            r"""
            Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be
            used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

            Parameters:
                input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                    The sequence used as a prompt for the generation.
                logits_processor (`LogitsProcessorList`, *optional*):
                    An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                    used to modify the prediction scores of the language modeling head applied at each generation step.
                stopping_criteria (`StoppingCriteriaList`, *optional*):
                    An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                    used to tell if the generation loop should stop.

                max_length (`int`, *optional*, defaults to 20):
                    **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated
                    tokens. The maximum length of the sequence to be generated.
                pad_token_id (`int`, *optional*):
                    The id of the *padding* token.
                eos_token_id (`int`, *optional*):
                    The id of the *end-of-sequence* token.
                output_attentions (`bool`, *optional*, defaults to `False`):
                    Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                    returned tensors for more details.
                output_hidden_states (`bool`, *optional*, defaults to `False`):
                    Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                    for more details.
                output_scores (`bool`, *optional*, defaults to `False`):
                    Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
                return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                    Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
                synced_gpus (`bool`, *optional*, defaults to `False`):
                    Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
                model_kwargs:
                    Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                    If model is an encoder-decoder model the kwargs should include `encoder_outputs`.

            Return:
                [`~generation_utils.GreedySearchDecoderOnlyOutput`], [`~generation_utils.GreedySearchEncoderDecoderOutput`]
                or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
                [`~generation_utils.GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
                `return_dict_in_generate=True` or a [`~generation_utils.GreedySearchEncoderDecoderOutput`] if
                `model.config.is_encoder_decoder=True`.

            Examples:

            ```python
            >>> from transformers import (
            ...     AutoTokenizer,
            ...     AutoModelForCausalLM,
            ...     LogitsProcessorList,
            ...     MinLengthLogitsProcessor,
            ...     StoppingCriteriaList,
            ...     MaxLengthCriteria,
            ... )

            >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
            >>> model = AutoModelForCausalLM.from_pretrained("gpt2")

            >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
            >>> model.config.pad_token_id = model.config.eos_token_id

            >>> input_prompt = "It might be possible to"
            >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids

            >>> # instantiate logits processors
            >>> logits_processor = LogitsProcessorList(
            ...     [
            ...         MinLengthLogitsProcessor(10, eos_token_id=model.config.eos_token_id),
            ...     ]
            ... )
            >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)])

            >>> outputs = model.greedy_search(
            ...     input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria
            ... )

            >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
            ["It might be possible to get a better understanding of the nature of the problem, but it's not"]
            ```"""
            # init values
            logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
            stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
            if max_length is not None:
                warnings.warn(
                    "`max_length` is deprecated in this function, use"
                    " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
                    UserWarning,
                )
                stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
            pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
            eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
            output_scores = output_scores if output_scores is not None else self.config.output_scores
            output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
            output_hidden_states = (
                output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
            )
            return_dict_in_generate = (
                return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate
            )

            # init attention / hidden states / scores tuples
            scores = () if (return_dict_in_generate and output_scores) else None
            decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
            cross_attentions = () if (return_dict_in_generate and output_attentions) else None
            decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None

            # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
            if return_dict_in_generate and self.config.is_encoder_decoder:
                encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
                encoder_hidden_states = (
                    model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
                )

            # keep track of which sequences are already finished
            unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)

            this_peer_finished = False  # used by synced_gpus only
            while True:
                if synced_gpus:
                    # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
                    # The following logic allows an early break if all peers finished generating their sequence
                    this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
                    # send 0.0 if we finished, 1.0 otherwise
                    dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
                    # did all peers finish? the reduced sum will be 0.0 then
                    if this_peer_finished_flag.item() == 0.0:
                        break

                # prepare model inputs
                model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)

                # forward pass to get next token
                outputs = self(
                    **model_inputs,
                    return_dict=True,
                    output_attentions=output_attentions,
                    output_hidden_states=output_hidden_states,
                )

                if synced_gpus and this_peer_finished:
                    continue  # don't waste resources running the code we don't need

                next_token_logits = outputs.logits[:, -1, :]

                # pre-process distribution
                next_tokens_scores = logits_processor(input_ids, next_token_logits)
                print(f'next_token_scores: {next_token_scores}')

                # Store scores, attentions and hidden_states when required
                if return_dict_in_generate:
                    if output_scores:
                        scores += (next_tokens_scores,)
                    if output_attentions:
                        decoder_attentions += (
                            (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
                        )
                        if self.config.is_encoder_decoder:
                            cross_attentions += (outputs.cross_attentions,)

                    if output_hidden_states:
                        decoder_hidden_states += (
                            (outputs.decoder_hidden_states,)
                            if self.config.is_encoder_decoder
                            else (outputs.hidden_states,)
                        )

                # argmax
                next_tokens = torch.argmax(next_tokens_scores, dim=-1)

                # finished sentences should have their next token be a padding token
                if eos_token_id is not None:
                    if pad_token_id is None:
                        raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
                    next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)

                # update generated ids, model inputs, and length for next step
                input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
                model_kwargs = self._update_model_kwargs_for_generation(
                    outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
                )

                # if eos_token was found in one sentence, set sentence to finished
                if eos_token_id is not None:
                    unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long())

                # stop when each sentence is finished, or if we exceed the maximum length
                if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
                    if not synced_gpus:
                        break
                    else:
                        this_peer_finished = True

            if return_dict_in_generate:
                if self.config.is_encoder_decoder:
                    return GreedySearchEncoderDecoderOutput(
                        sequences=input_ids,
                        scores=scores,
                        encoder_attentions=encoder_attentions,
                        encoder_hidden_states=encoder_hidden_states,
                        decoder_attentions=decoder_attentions,
                        cross_attentions=cross_attentions,
                        decoder_hidden_states=decoder_hidden_states,
                    )
                else:
                    return GreedySearchDecoderOnlyOutput(
                        sequences=input_ids,
                        scores=scores,
                        attentions=decoder_attentions,
                        hidden_states=decoder_hidden_states,
                    )
            else:
                return input_ids

    
    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`OPTTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
                tensors are only required when the model is used as a decoder in a Sequence to Sequence model.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        Returns:

        Example:

        ```python
        >>> from transformers import GPT2Tokenizer, OPTForCausalLM

        >>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m")
        >>> tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350m")

        >>> prompt = "Hey, are you consciours? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
        ```"""
        print('Forward-Call !!!!!')
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        logits = self.lm_head(outputs[0]).contiguous()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

ModuleNotFoundError: No module named 'transformers.generation_logits_process'

In [None]:
def get_tokenizer(dump_dir):
    
    # Loading the dict containing all unique f-terms in the datase
    with open(f'{dump_dir}/f_terms_in_ds_dir.pk', 'rb') as f:
        f_term_dict = pk.load(f)
        
    # Loading a dict, which contains all uniqe f-terms with crawled definitions
    with open(f'{dump_dir}/f_term_dict.pk', 'rb') as f:
        definitions = pk.load(f)
        
    # Loading the original tokenizer for the galactica model
    tokenizer = load_pretrained_Tokenizer('mini')
    
    # Checking for which f-term form the dataset a f-term definition is present
    exceptions = {}
    exceptions_l = 0
    for i, key in enumerate(f_term_dict.keys()):
        try: 
            _ = definitions[key]
            exceptions[key] = 0
        except KeyError:
            exceptions[key] = 1
            exceptions_l += 1
    
    unique_tokens = [key +',' for key, value in exceptions.items() if value ==0] 
    tokenizer.add_tokens(unique_tokens)
    # Adding the start_sequence, end_sequence and padding tokens to the tokenizer
    tokenizer.pad_token = '<pad>'
    tokenizer.bos_token = '<s>'
    tokenizer.eos_token = '</s>'
    tokenizer.bos_token_id = 0
    tokenizer.eos_token_id = 2
    tokenizer.pad_token_id = 1
    tokenizer._tokenizer.post_processor = TemplateProcessing(
    	single=tokenizer.bos_token + " $A " + tokenizer.eos_token,
    	special_tokens=[(tokenizer.eos_token, tokenizer.eos_token_id), (tokenizer.bos_token, tokenizer.bos_token_id)],
	)
    return tokenizer


def load_pretrained_model(model_name: str, dtype: torch.dtype, tensor_parallel: bool, num_gpus: int) -> OPTForCausalLM:
    """
    Loads a pretrained model in the OPT structure

    :return: OPTForCausalLM with pretrained weights
    """
    if num_gpus > 1:
        tensor_parallel = True

    # will probably never need a device map
    device_map=None

    # A dict to map the correct model urls
    HF_MAPPING = {
        "mini": ("facebook/galactica-125m", torch.float32),
        "base": ("facebook/galactica-1.3b", torch.float32),
        "standard": ("facebook/galactica-6.7b", torch.float32),
        "large": ("facebook/galactica-30b", torch.float32),
        "huge": ("facebook/galactica-120b", torch.float16)}

    # Analyzing the system (code by huggingface)
    max_memory = {}
    if num_gpus > 0 and not tensor_parallel:
        # based on https://github.com/huggingface/accelerate/blob/5315290b55ea9babd95a281a27c51d87b89d7c85/src/accelerate/utils/modeling.py#L274
        for i in range(num_gpus):
            _ = torch.tensor([0], device=i)
        for i in range(num_gpus):
            max_memory[i] = torch.cuda.mem_get_info(i)[0]
        device_map = "auto"
    max_memory["cpu"] = psutil.virtual_memory().available

    # Loading the model form web / from cache
    model = CustomOPTForCausalLM.from_pretrained(HF_MAPPING[model_name][0], torch_dtype=dtype, low_cpu_mem_usage=True,
                                           device_map=device_map, max_memory=max_memory)

    return model


def load_pretrained_Tokenizer(model_name):
    """
    :param model_name:  Name of the matching pretrained model
    :return:            Tokenizer matching to the pretrained model
    """

    # A dict to map the correct model urls
    HF_MAPPING = {
        "mini": ("facebook/galactica-125m", torch.float32),
        "base": ("facebook/galactica-1.3b", torch.float32),
        "standard": ("facebook/galactica-6.7b", torch.float32),
        "large": ("facebook/galactica-30b", torch.float32),
        "huge": ("facebook/galactica-120b", torch.float16)}

    return AutoTokenizer.from_pretrained(HF_MAPPING[model_name][0])


def extract_embedding(model):
    """
    :param model:  Loaded Pretrained model
    :return:       Token embeddings
    """
    return model.get_input_embeddings()


def create_embedding(original_embedding: torch.nn.Embedding, n_f_terms: int, device: str) -> torch.nn.Embedding:
    """
    This function takes the original_embedding instance of an OPT model,
        (nn.Embedding instance).
    and the number of f-terms it should embedd (n_f_terms) and creates a new embedding which has
    new weights for all f_terms stacked ontop of the old weigths used for the original tokens

    returns: torch.nn.Embedding
    """
    # calculating parameters for the new embedding instance
    embedding_dim = original_embedding.embedding_dim
    num_embeddings = original_embedding.num_embeddings + n_f_terms
    padding_idx = original_embedding.padding_idx

    # creating new embedding (compleately untrained)
    embedding = torch.nn.Embedding(num_embeddings, embedding_dim, padding_idx)
    # extracting the weigths of the original pretrained embeddign
    old_weights = original_embedding.weight
    new_weights = embedding.weight

    # replacing a chunk of the new parameters with te old parameters
    # to retain the ability to encode natrual language tokens
    embedding.weight = torch.nn.Parameter(
        torch.cat([old_weights.clone().to(device),
                   new_weights[original_embedding.num_embeddings:].clone().to(device)],
                  0))
    return embedding


def modify_embeddings(model: OPTForCausalLM, n_f_terms: int, device: str) -> OPTForCausalLM:
    original_embeddings = extract_embedding(model)
    new_embeddings = create_embedding(original_embeddings, n_f_terms, device)
    # Replacing the old embedding instance with the new embedding instance in the model instance
    model.set_input_embeddings(new_embeddings)
    return model


def create_new_classification_head(n_f_terms: int, model_dim: int, dtype: torch.dtype, device: str) -> torch.nn.Linear:
    """
    Creates a new classification head for the model

    This classification head will be a new linear layer with 'model_dim' input features and 'n_f_terms' output features
    """
    print(device)
    return torch.nn.Linear(in_features=model_dim, out_features=n_f_terms, bias=False).to(device)


def add_classification_head(_model: OPTForCausalLM, classification_head: torch.nn.Linear) -> OPTForCausalLM:
    """
    This function implements the new classification head to the pretrained model.

    _model: Instanciated OPTForCausalLM model
    classificaiton_head: New classification head for the model
    """

    # changing the configuration of the model
    vocab_size = classification_head.out_features
    _model.config.vocab_size = vocab_size
    _model.model.decoder.vocab_size = vocab_size
    _model.num_labels = vocab_size
    _model.config.num_labels = vocab_size

    # adding the classification head to the model
    _model.set_output_embeddings(classification_head)
    return _model


def change_classification_head(model: OPTForCausalLM, n_f_terms: int, dtype: torch.dtype, device: str):
    """
    :param model:       Model which classification head should be changed
    :param n_f_terms:   Number of different F-terms in dataset
    :param dtype:       dtype of the model
    :return:            OPTForCausalLM with changed classification head
    """
    emb = extract_embedding(model)
    model_dim = emb.embedding_dim
    classification_head = create_new_classification_head(n_f_terms, model_dim, dtype, device)
    return add_classification_head(model, classification_head)


def load_and_modify_model(model_name: str,
                          dtype: torch.dtype,
                          tensor_parallel: bool,
                          num_gpus: int,
                          n_f_terms,
                          device: str) -> OPTForCausalLM:
    """
    This function loads a pretrained OPT model and modifies it for F-Term prediction

    :param model_name:      Name of the pretrained model to download
    :param dtype:           DType of the model parameters
    :param tensor_parallel: Switch to turn on model paralelization
    :param num_gpus:        Number of GPUs the model should run on
    :param n_f_terms:       Number of F-terms the model should be able to encode and predict
    :param device:          Device on which the model should be loaded
    :return:                Modified OPT model 
    """
    model = load_pretrained_model(model_name, dtype, tensor_parallel, num_gpus)
    model = modify_embeddings(model, n_f_terms, device)
    model = change_classification_head(model, n_f_terms, dtype, device)
    return model

if __name__=='__main__':
    pass

In [None]:
# The pretrained model is loaded from Huggingface.
# The token-embedding is expanded for all f-terms and the output embeddings is compleatly replaced by a F-Term classification head.
model = load_and_modify_model(base_model_name, default_dtype, tensor_parallel, num_gpus, n_f_terms, default_device)
print(f'The model interprets token-index {model.config.bos_token_id} as the beginning of a sequence and {model.config.eos_token_id} as the end')

In [None]:
i = 'A novel method to '
inputs = tokenizer(i, return_tensors='pt')
inputs.pop('token_type_ids')
out = model.generate(**inputs)

In [None]:
model.config.num_beams, model.config.length_penalty, model.config.early_stopping, model.config.num_beam_groups, model.config.do_sample, model.config.num_return_sequences, model.config.output_scores

In [None]:
model.logits_processer

# Creating the Trainer Class by Subclassing from Huggingface-Trainer

In [None]:
# Subclassing the Huggingface Trainer class to use custome code to calculate the loss
# The labels used for the loss are generated and the labels for the text tokens are set to -100 to ignore their loss,
# because the modified model can't predict text-tokens
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs: bool=False):
        """
        model: model which should be trained.
        inputs: A padded batch of samples from the dataset.
        return_outputs: Indicates if the whole output of the model is returned or not.
        """
        # Removing the token_type_ids because we don't need them
        inputs.pop('token_type_ids')
        labels = inputs['input_ids'].clone()
        # Generating the labels, because the model can only predict F-Terms but also can interpret Text-Tokens as input, 
        # The maximum token idx is 50000 higher than the maximum output_idx
        labels = labels - 50000
        # All text tokens have token_idx below 50000 after substracting 50000 they are negative and 
        # are now set to -100 to ignore them when the loss is computed
        labels[labels<0] = -100
        # generating the output of the model
        # It is a dict of 'loss', 'logits' and 'past_key_values'
        outputs = model(**inputs, output_attentions=False, output_hidden_states=False, return_dict=True, labels=labels)
        loss = outputs['loss']
        return (loss, outputs) if return_outputs else loss
        

# Training the Model

In [None]:
# The TrainingArguments class is a class which stores multiple parameters for the Custom-trainer of the model.

training_args = TrainingArguments(
    output_dir=output_dir,          # output directory
    num_train_epochs=num_train_epochs,              # total # of training epochs
    per_device_train_batch_size=per_device_train_batch_size,    # batch size per device during training
    save_strategy=save_strategy,
    logging_strategy=logging_strategy,
    evaluation_strategy=evaluation_strategy,
    learning_rate=learning_rate,
    weight_decay=weight_decay
)


In [None]:
trainer = CustomTrainer(model=model, args=training_args, train_dataset=dataset, data_collator=DataCollatorWithPadding(tokenizer, return_tensors='pt'))

In [None]:
trainer.train()