In [None]:
import torch, torch.nn as nn
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, GPT2LMHeadModel, AutoConfig
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
import evaluate
from functools import partial
import numpy as np
from datetime import datetime
import os, json
from transformers import set_seed
from argparse import ArgumentParser


# Fix the global random seed through transformers.set_seed so repeated runs start
# from the same RNG state; the same value (42) is also passed to TrainingArguments.
set_seed(42) 

In [1]:
# NOTE: unused -- kept for reference only; the model is built from a Hugging Face GPT-2 config instead (see set_config).
# class PriceExpector(nn.Module):
#     def __init__(self, input_dim, output_dim, hidden_dim):
#         super(PriceExpector, self).__init__()
#         self.embedding = nn.Embedding(input_dim, hidden_dim)
#         self.fc1 = nn.Linear(hidden_dim, hidden_dim, bias = True)
#         self.fc2 = nn.Linear(hidden_dim, output_dim, bias = True)

#     def forward(self, inputs):
#         x = self.embedding(inputs['input_ids'])

#         x = self.fc1(x)
#         x = self.fc2(x)
#         return x



In [2]:
# trainer setter
def set_trainer(model, tokenizer, dataset, output_dir, args):
    """Build a Hugging Face ``Trainer`` for causal language-model training.

    The 'train' and 'valid' splits of ``dataset`` are tokenized from their
    '0' text column, a timestamped run directory is created under
    ``output_dir``, the resolved training arguments are dumped there as
    ``trainingargs.json``, and a ``Trainer`` that tracks next-token accuracy
    on the validation split is returned.

    Args:
        model: model to train (main() passes a GPT2LMHeadModel).
        tokenizer: tokenizer used both for preprocessing and by the collator;
            it must define a pad token because padding is requested.
        dataset: mapping with at least 'train' and 'valid' splits, each
            holding the raw text in a column named '0'. A 'test' split is
            not used by this function.
        output_dir: parent directory for the timestamped run directory.
        args: object exposing ``lr`` (learning rate) and ``decay``
            (weight decay).

    Returns:
        The configured ``Trainer``; training is not started here.
    """

    # load metrics
    accuracy = evaluate.load('accuracy')

    def preprocess_logits(logits, labels):
        # Reduce the logits to predicted token ids before the Trainer
        # accumulates them, so evaluation does not have to keep a full
        # (num_examples, sequence length, vocab size) array in memory.
        if isinstance(logits, tuple):
            logits = logits[0]
        return logits.argmax(dim = -1)

    # set metric function
    def metric(eval_pred, func):
        predictions, labels = eval_pred
        if isinstance(predictions, tuple):
            predictions = predictions[0]
        # Raw logits are (batch, sequence length, vocab size); collapse the
        # vocab axis if they have not already been reduced to token ids.
        if predictions.ndim == 3:
            predictions = np.argmax(predictions, axis = -1)

        # A causal LM's output at position i is its guess for token i + 1,
        # so drop the last prediction and the first label to line them up
        # (the model applies the same shift internally when computing loss).
        predictions = predictions[:, :-1]
        labels = labels[:, 1:]

        # keep only positions that carry a real label (-100 marks ignored/PAD positions)
        filters = labels != -100

        # 'func' will be 'accuracy' above
        return func.compute(predictions = predictions[filters], references = labels[filters])

    # tokenizing, using 4 processes
    # NOTE(review): truncation=True relies on the tokenizer's own maximum
    # length; confirm it does not exceed the model's n_positions.
    def tokenize_func(examples):
        return tokenizer(examples['0'], truncation=True, padding=True)

    # drop the raw text column afterwards so only the tokenizer outputs
    # (e.g. 'input_ids', 'attention_mask') reach the data collator
    training_data = dataset['train'].map(tokenize_func, batched=True, num_proc = 4).remove_columns(['0'])
    valid_data = dataset['valid'].map(tokenize_func, batched=True, num_proc = 4).remove_columns(['0'])

    # make a run directory named after the current time, where weights are saved;
    # makedirs also creates missing parents instead of failing silently
    od = os.path.join(output_dir, datetime.strftime(datetime.now(), "%m-%d-%H-%M-%S"))
    os.makedirs(od, exist_ok = True)

    # maximum batch, other hyperparameters except learning rate and weight decay are of defaults
    trainingarguments = TrainingArguments(
        do_train = True,
        output_dir = od,
        evaluation_strategy = "steps", # necessary: evaluate every `eval_steps`
        save_strategy = "steps",
        eval_steps = 50, # necessary: set step
        save_steps = 50,
        save_total_limit = 1,
        load_best_model_at_end = True, # necessary: must be True to use EarlyStoppingCallback
        metric_for_best_model = "accuracy",
        greater_is_better = True, # necessary: higher metric means better performance # default = True when metric_for_best_model is set
        num_train_epochs = 3,
        seed = 42,
        per_device_train_batch_size = 512,
        per_device_eval_batch_size = 512,

        # learning rate and weight decay are controlled externally
        learning_rate = args.lr,
        weight_decay = args.decay,
        remove_unused_columns = False
    )

    # save the training arguments next to the checkpoints
    with open(os.path.join(od, "trainingargs.json"), "w", encoding = "utf-8") as f:
        json.dump(trainingarguments.to_dict(), f, indent = 2, ensure_ascii = False)

    # set trainer with autoregressive tasks (mlm=False -> labels are the input ids)
    trainer = Trainer(
        model = model,
        args = trainingarguments,
        tokenizer = tokenizer,
        train_dataset = training_data,
        eval_dataset = valid_data,
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
        compute_metrics = partial(metric, func = accuracy),
        preprocess_logits_for_metrics = preprocess_logits
    )

    return trainer

# build the model configuration from a base checkpoint via AutoConfig
def set_config(ggangtong_model_checkpoint, tokenizer):
    """Return a config loaded from a base checkpoint, resized for ``tokenizer``.

    The checkpoint only supplies the starting configuration; the vocabulary
    size and BOS/EOS ids are taken from ``tokenizer`` and the architecture
    sizes are overridden with the fixed values below.

    Args:
        ggangtong_model_checkpoint: checkpoint name or path handed to
            ``AutoConfig.from_pretrained``.
        tokenizer: tokenizer whose length and special-token ids define the
            vocabulary-related fields.

    Returns:
        The configuration object produced by ``AutoConfig.from_pretrained``.
    """
    vocab_total = len(tokenizer)
    print(vocab_total - 1)

    # NOTE: head 2, layer 22, max_token_len = 1024
    overrides = dict(
        vocab_size = vocab_total,
        n_ctx = 1024,
        bos_token_id = tokenizer.bos_token_id,
        eos_token_id = tokenizer.eos_token_id,
        n_embd = 64,
        n_head = 2,
        n_layer = 22,
        n_positions = 1024,
    )

    return AutoConfig.from_pretrained(ggangtong_model_checkpoint, **overrides)


In [3]:
# train one model variant for a given tokenizer checkpoint
def _train_variant(tokenizer_checkpoint, output_dir, base_checkpoint, dataset, args):
    """Load a tokenizer, build a fresh GPT-2 LM head model for it, and train.

    Args:
        tokenizer_checkpoint: path/name passed to ``AutoTokenizer.from_pretrained``.
        output_dir: directory under which set_trainer creates the run folder.
        base_checkpoint: checkpoint whose config is used as the template.
        dataset: dataset dict forwarded to set_trainer.
        args: parsed CLI arguments (``lr`` and ``decay``) forwarded to set_trainer.

    Returns:
        The ``Trainer`` after ``train()`` has finished.
    """
    # make sure the output root exists before a timestamped run folder is created in it
    os.makedirs(output_dir, exist_ok = True)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
    # padding is requested during tokenization, so a pad token must be defined
    tokenizer.add_special_tokens({"pad_token": "<pad>"})

    # initialize model config
    model_config = set_config(base_checkpoint, tokenizer)
    # initialize model from the config alone (no pretrained weights are loaded here)
    model = GPT2LMHeadModel(model_config)
    # initialize trainer
    trainer = set_trainer(model, tokenizer, dataset, output_dir, args)
    # run
    trainer.train()
    return trainer


# main operations wrapper
def main():
    """Parse ``--lr``/``--decay``, load the CSV splits and train both variants.

    Two models are trained one after the other with identical settings: one
    with the 'indiv' tokenizer and one with the 'joint' tokenizer.
    """

    # get arguments
    parser = ArgumentParser()
    parser.add_argument("--lr", type = float, required = True)
    parser.add_argument("--decay", type = float, required = True)
    args = parser.parse_args()

    print(args.lr, args.decay)

    # load data
    data_path = {"train": "data_processed/training_data.csv", "valid": "data_processed/valid_data.csv", "test": "data_processed/test_data.csv"}
    dataset = load_dataset("csv", data_files = data_path)

    # "empty shell" checkpoint: used only as the template for config initialization
    ggangtong_model_checkpoint = "openai-community/gpt2"

    # (tokenizer checkpoint, output directory) for each variant, trained in this order
    variants = (
        ("tokenizers/tokenizer_indiv_jaeyoon", "weights/model_indiv"),
        ("tokenizers/tokenizer_joint_jaeyoon", "weights/model_joint"),
    )
    for tokenizer_checkpoint, output_dir in variants:
        _train_variant(tokenizer_checkpoint, output_dir, ggangtong_model_checkpoint, dataset, args)


In [None]:
# runner: start training only when this file is executed as a script,
# not when it is imported as a module
if __name__ == "__main__":
    main()