# readme.ipynb
# proejct elements(all included in hw2.ipynb)
- readme.ipynb: reports about methods, goal, results
- preprocess.ipynb: preprocessing, initialize tokenizer, prepare dataset
- train_model.ipynb: initialize model, trainingArguments and trainer, train and save model
- run.sh: execution codes, and its result(best steps in terms of accuracy) as annotation

# Methods
- convert stock price data with korean string
- to initialize model and train. 
- variable hyperparameters: learning rate, weight decay
- fixed hyperparameters: layers, attention heads, sequence length
- two controls: tokenizers
    - first(indiv): digits and units are separatedly tokenized(예: 일/천/오/백)
    - second(joint): digits and units are jointedly tokenized(예: 일천/오백)

# Goal(what to see?)
## As a homework
1. preprocess the stock data
    - convert the number into string
    - preprocess the string fit to the tokenizer
2. initialize GPT2 model from huggingface, using config with custom hyperparameters
3. train and compare results

## as a toy project
1. the price data fluctuate more on low units then on the higher units
2. according to this inductive bias, when the tokens are splitted jointedly, only the digits + lower_units varies much
3. on the other hand, when the tokens are splitted individually, as the lower level units vary much, its digits as well as lower units would varies much, which makes model to predict the subsequent tokens(values) much harder
4. so, I expect the accuracies of 'indiv' tokenzer would be more lower than that of 'joint' tokenizer. 


# result: 
- as we can check the evaluation result in annotation, except for the large learning rate which both model scores low, the individual tokenizer model performs much better than the joint tokenizer, which the hypothesis is rejected(note that this is without verification of statisticallity). 
- i did not decode the generated output and check how much it differs(e.g. Gen: 일십만오천이백, True: 일십만오천삼백사십, Diff: 140) so the results take more chance to be explained.


## run.sh
- I run the code with shell scripts, which includes the following lines
- this is due to the parameter controls
- the first annotation is the accuracy of individual tokenizer, while the second is that of joint tokenizer

python3 train_model.py --lr 1e-6 --decay 0.0001 # 0.7097, 0.4603

python3 train_model.py --lr 1e-6 --decay 0.001 # 0.7097, 0.4603

python3 train_model.py --lr 1e-6 --decay 0.01 # 0.7097, 0.4603

python3 train_model.py --lr 1e-5 --decay 0.0001 # 0.2189 0.2169

python3 train_model.py --lr 1e-5 --decay 0.001 # 0.2189, 0.2169

python3 train_model.py --lr 1e-5 --decay 0.01 # 0.2189, 0.2169

python3 train_model.py --lr 1e-4 --decay 0.0001 # 7.2611e-06, 0.0219

python3 train_model.py --lr 1e-4 --decay 0.001 # 7.2611e-06, 0.0219

python3 train_model.py --lr 1e-4 --decay 0.01 # 7.2611e-06, 0.0219

python3 train_model.py --lr 1e-3 --decay 0.0001 # 2.4203e-07, 8.9623e-05

python3 train_model.py --lr 1e-3 --decay 0.001 # 9.6815e-07, 4.1564e-05

python3 train_model.py --lr 1e-3 --decay 0.01 # 7.9872e-06, 1.8184e-05?

python3 train_model.py --lr 1e-7 --decay 0.0001 # 0.6116, 0.5519

python3 train_model.py --lr 1e-7 --decay 0.001 # 0.6116, 0.5519

python3 train_model.py --lr 1e-7 --decay 0.01 # 0.6116, 0.5519

python3 train_model.py --lr 6e-7 --decay 0.0001 # 0.7329, 0.4923

python3 train_model.py --lr 6e-7 --decay 0.001 # 0.7329, 0.4923

python3 train_model.py --lr 6e-7 --decay 0.01 # 0.7329, 0.4923

python3 train_model.py --lr 3e-7 --decay 0.0001 # 0.6276, 0.5177

python3 train_model.py --lr 3e-7 --decay 0.001 # 0.6276, 0.5177

python3 train_model.py --lr 3e-7 --decay 0.01 # 0.6276, 0.5177

In [None]:
# preprocess.ipynb

import pandas as pd, numpy as np
from pathlib import Path
import os, shutil, random
from KoreanNumber import num2kr
import torch, torch.nn as nn
from transformers import AutoModel, DataCollatorForLanguageModeling, AutoTokenizer
import json, re

# Assume the price data have already been crawled.
# Data format: ['date', 'time', 'data1', 'data2', ...]
# We will use only the first three columns.
# Stock price data at 5-minute intervals.
# Input ("data") and output ("data_processed") directories, both built from
# the resolved current working directory.
data_raw = str(Path().resolve()) + os.sep + "data"
data_processed = str(Path().resolve()) + os.sep + "data_processed"

# gather all data into ['date' 'time' 'data1' 'data2' ... ] dataframe
def agg_data(full_cols, target_cols, merge_cols):
    """Merge every raw file under ``data_raw`` into one wide DataFrame.

    Each file is read as tab-separated text, its row order is reversed, and
    its columns are named from ``full_cols``. The business name is the part
    of the file name before the first ``_`` and is substituted into every
    ``{}`` placeholder of the column templates.

    Args:
        full_cols: Column-name templates for all columns of a raw file.
        target_cols: Column-name templates of the columns to keep.
        merge_cols: Column names used as the join key between files.

    Returns:
        A ``(df, meta)`` tuple: ``df`` is the merge of the target columns of
        all files on ``merge_cols`` (an empty DataFrame when the directory
        has no files) and ``meta`` lists the file names in processing order.
    """
    df = pd.DataFrame()
    meta = []
    is_first_file = True

    # os.listdir returns entries in arbitrary order; sorting keeps the column
    # order and the meta file reproducible across machines and runs.
    for file in sorted(os.listdir(data_raw)):
        business_name = file.split("_")[0]
        full_cols_file = [col.format(business_name) for col in full_cols]
        target_cols_file = [col.format(business_name) for col in target_cols]

        with open(f"{data_raw}/{file}", "r") as f:
            # Lines keep their trailing newline, so it stays on the last field.
            rows = [line.split("\t") for line in f]

        x = pd.DataFrame(rows)
        # Reverse the row order of the file before naming the columns.
        x = x.loc[x.index[::-1]].reset_index(drop=True)
        x.columns = full_cols_file

        # Track "first file" explicitly: testing len(df) == 0 would also fire
        # after a merge that produced no rows and silently replace everything
        # merged so far with the current file.
        if is_first_file:
            df = x[target_cols_file]
            is_first_file = False
        else:
            df = df.merge(x[target_cols_file], on=merge_cols)

        meta.append(file)

    return df, meta


# use num2kr library, to convert number to korean(e.g. 152307 -> 일십만오만이천삼백칠)
# for the ease of processing, use some weird expressions(e.g. 일십만오만)
def num_to_str(out_file_name, meta_file_name, value_cols, merge_cols):
    """Convert the aggregated price columns into Korean number strings.

    Args:
        out_file_name: CSV file holding the aggregated numeric data.
        meta_file_name: Text file with one raw file name per line; the part
            before the first ``_`` is used as the business name.
        value_cols: Column-name templates; only the first one is used.
        merge_cols: Key columns copied unchanged into the result.

    Returns:
        A DataFrame made of ``merge_cols`` followed by the converted columns.
    """
    aggregated = pd.read_csv(out_file_name)

    with open(meta_file_name, "r") as meta_file:
        business_names = [
            line.strip("\n").split("_")[0] for line in meta_file.readlines()
        ]

    # Only a single "end_{}" template exists for now, hence index 0.
    template = value_cols[0]
    value_cols_file = [template.format(name) for name in business_names]

    def parse_int(raw):
        # Drop thousands separators before converting to int.
        return int(str(raw).replace(",", ""))

    def to_korean(number):
        return num2kr.num2kr(number, 1)

    def patch_units(text):
        # TODO: clean up this patchwork code.
        # Inserts "만" after every "십" that is followed by a character other
        # than "만".
        # NOTE(review): this looks like it would also rewrite a "십" in the
        # tens place, and it skips a "십" at the very end of the string -
        # confirm against the actual num2kr output.
        return re.sub(r"십([^만])", r"십만\1", text)

    converted = aggregated[value_cols_file].map(parse_int)
    converted = converted.map(to_korean)
    converted = converted.map(patch_units)

    return pd.concat([aggregated[merge_cols], converted], axis = 1)


# # make tokenizer: gpt2 tokenizer를 그대로 따와서 바꾸는 코드를 구현하려고 했지만 실패해서 손으로 직접 만듦. 추후 보완 예정.
# def make_custom_tokenizer(old_path, new_path, new_vocab):
#     try: os.mkdir(new_path)
#     except: pass

#     # tokenizer_config.json
#     gpt_special_token_idx = "50256"
#     special_tokens = dic['added_tokens_decoder'][gpt_special_token_idx]['content']
#     print(special_tokens)

#     with open(f"{old_path}/tokenizer_config.json", "r") as f:
#         dic = json.load(f)
#     f.close()
#     key = gpt_special_token_idx
#     val = dic['added_tokens_decoder'][gpt_special_token_idx]
#     dic['added_tokens_decoder'] = {len(new_vocab): val}

#     with open(f"{new_path}/tokenizer_config.json", "w") as f:
#         f.write(json.dumps(dic, indent = 2, ensure_ascii = False))
#     f.close()
    
#     # special_tokens_map.json
#     shutil.copy(f"{old_path}/special_tokens_map.json", f"{new_path}/special_tokens_map.json")

#     # tokenizer.json
#     with open(f"{old_path}/tokenizer.json", "r") as f:
#         dic = json.load(f)
#     f.close()

#     dic['added_tokens'] = [
#         {
#             key: (len(new_vocab) if key == "id" else value) \
#                 for key, value in dic['added_tokens'][0].items()
#         }
#     ] # only one token이니까 일단 0

#     new_vocab_eos_added = new_vocab[len(new_vocab)] = special_tokens

#     dic['model']['vocab'] = new_vocab_eos_added
#     with open(f"{new_path}/tokenizer.json", "w") as f:
#         f.write(json.dumps(dic, indent = 2, ensure_ascii = False))
#     f.close()

#     # vocabs
#     with open(f"{new_path}/vocab.json", "w") as f:
#         f.write(json.dumps(new_vocab_eos_added, indent = 2, ensure_ascii = False))
#     f.close()

#     # merge rule
#     shutil.copy(f"{old_path}/merges.txt", f"{new_path}/merges.txt")
    

#     return 

# dataframe to sentences. 한 줄에 78개(=1일치 데이터)
def convert_raw_data_to_training_data(str_file_name, training_file_name, valid_file_name, test_file_name, meta_file_name, value_cols):
    """Turn the string-converted price table into shuffled train/valid/test CSVs.

    Every window of 80 consecutive rows is joined with spaces, one sentence
    per value column. The sentences are shuffled and split 80% / 10% / 10%
    into the three output files.

    Args:
        str_file_name: CSV with the Korean-string price columns.
        training_file_name: Output path for the training split.
        valid_file_name: Output path for the validation split.
        test_file_name: Output path for the test split.
        meta_file_name: Text file with one raw file name per line; the part
            before the first ``_`` is used as the business name.
        value_cols: Column-name templates; only the first one is used.
    """
    with open(meta_file_name, "r") as meta_file:
        business_names = [
            line.strip("\n").split("_")[0] for line in meta_file.readlines()
        ]

    # Only a single "end_{}" template exists for now, hence index 0.
    value_cols_file = [value_cols[0].format(name) for name in business_names]

    frame = pd.read_csv(str_file_name)
    values = frame[value_cols_file]
    n_rows = len(frame)

    def join_window(start, stop):
        # One space-joined sentence per value column for rows [start, stop).
        window = values.loc[range(start, stop)]
        return window.apply(lambda col: " ".join(col), axis = 0).tolist()

    # Full 80-row windows starting at every row that leaves room for one.
    windows = [join_window(start, start + 80) for start in range(n_rows - 80)]

    # Windows starting in the last 80 rows, clipped at the end of the table.
    # NOTE(review): these are built and shuffled below but never written out;
    # the test file is cut from the shuffled full windows instead - confirm
    # whether a held-out tail was intended.
    tail_windows = [
        join_window(start, min(start + 80, n_rows))
        for start in range(n_rows - 80, n_rows)
    ]

    print(len(windows))
    print(len(tail_windows))

    sentences = [sentence for window in windows for sentence in window]
    tail_sentences = [sentence for window in tail_windows for sentence in window]

    shuffled = pd.Series(sentences).sample(frac = 1)
    _shuffled_tail = pd.Series(tail_sentences).sample(frac = 1)  # unused, see note above

    # NOTE(review): consecutive windows overlap, so the random split below
    # puts shared rows into train, valid and test.
    total = len(shuffled)
    train_end = round(total * 0.8)
    valid_end = round(total * 0.9)

    shuffled.iloc[:train_end].to_csv(training_file_name, index = False)
    shuffled.iloc[train_end:valid_end].to_csv(valid_file_name, index = False)
    shuffled.iloc[valid_end:].to_csv(test_file_name, index = False)


    # with open(training_file_name+"test", "w") as f:
    #     for i in lst1d_test:
    #         f.write(f"{i}\n")
    # f.close()



def main_preprocess():
    """Run the whole preprocessing pipeline and write its outputs to ``data_processed``.

    Steps:
        1. Aggregate the raw files into one table and save it together with
           a meta file listing the processed file names.
        2. Convert the numeric closing prices into Korean number strings.
        3. Build the train / valid / test CSV files from 80-row windows.
    """
    full_cols = ["date", "time", "init_{}", "high_{}", "low_{}", "end_{}"] + [
        str(i) for i in range(12)
    ]
    target_cols = ["date", "time", "end_{}"]
    merge_cols = ["date", "time"]
    value_cols = ["end_{}"]
    out_file_name = f"{data_processed}/data_aggregated.csv"
    meta_file_name = f"{data_processed}/meta.txt"
    str_file_name = f"{data_processed}/data_string_converted.csv"
    training_file_name = f"{data_processed}/training_data.csv"
    valid_file_name = f"{data_processed}/valid_data.csv"
    test_file_name = f"{data_processed}/test_data.csv"

    # Every output below is written into this directory; create it up front
    # so a fresh checkout does not fail on the first to_csv call.
    os.makedirs(data_processed, exist_ok=True)

    ###### parse data and preprocess then save
    df, meta = agg_data(full_cols, target_cols, merge_cols)
    df.to_csv(out_file_name, index=False)
    with open(meta_file_name, "w") as f:
        for i in meta:
            f.write(str(i) + "\n")

    ###### preprocess(convert number to string)
    df = num_to_str(out_file_name, meta_file_name, value_cols, merge_cols)
    df.to_csv(str_file_name, index = False)


    # ###### prepare tokenizer: tried to do this in code but failed, so the tokenizers were built by hand
    # digit_name = ["일", "이", "삼", "사", "오", "육", "칠", "팔", "구"]
    # unit = ["십", "백", "천", "만", "십만"]
    # # zero_point = ["영", "점"] # num2kr does not support decimals (int only), so add this later and try again.

    # joint_tokens = [i+j for j in unit for i in digit_name]
    # indiv_tokens = digit_name + unit
    # # digit_only_tokens = digit_name + zero_point

    # joint_indeces = {j: i for i, j in enumerate(tuple(joint_tokens))}
    # indiv_indeces = {j: i for i, j in enumerate(tuple(indiv_tokens))}

    # checkpoint = "openai-community/gpt2"
    # tokenizer_checkpoint = "tokenizers/tokenizer_checkpoint"
    # tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    # tokenizer.save_pretrained(tokenizer_checkpoint)

    # joint_indeces_checkpoint = "tokenizers/tokenizer_joint"
    # indiv_indeces_checkpoint = "tokenizers/tokenizer_indiv"

    # make_custom_tokenizer(tokenizer_checkpoint, joint_indeces_checkpoint, joint_indeces)
    # make_custom_tokenizer(tokenizer_checkpoint, indiv_indeces_checkpoint, indiv_indeces)


    # convert data into training data format (space-joined windows of rows, one sentence per line)
    convert_raw_data_to_training_data(str_file_name, training_file_name, valid_file_name, test_file_name, meta_file_name, value_cols)
    






In [None]:
# train_model.ipynb

import torch, torch.nn as nn
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, GPT2LMHeadModel, AutoConfig
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
import evaluate
from functools import partial
import numpy as np
from datetime import datetime
import os, json
from transformers import set_seed
from argparse import ArgumentParser


# Fix the global random seed once at import time, using the set_seed helper
# imported from transformers.
set_seed(42) 

# trainer setter
def set_trainer(model, tokenizer, dataset, output_dir, args):
    """Build a ``Trainer`` for causal-LM training on the price sentences.

    Args:
        model: The language model to train.
        tokenizer: Tokenizer used to encode the ``'0'`` text column; it must
            have a pad token because batches are padded.
        dataset: Dataset dict with ``'train'``, ``'valid'`` and ``'test'``
            splits, each holding the raw text in column ``'0'``.
        output_dir: Parent directory; a timestamped sub-directory is created
            in it for checkpoints and the dumped training arguments.
        args: Parsed command-line arguments providing ``lr`` and ``decay``.

    Returns:
        The configured ``Trainer`` (training is not started here).
    """
    # load metrics
    accuracy = evaluate.load('accuracy')

    def metric(eval_pred, func):
        """Next-token accuracy over the positions that carry a real label."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis = -1)  # (batch, sequence length)

        # A causal LM's logits at position t score the token at position t+1,
        # so align prediction t with label t+1 before comparing. Without the
        # shift the metric would measure how often the model repeats the
        # current token instead of how often it predicts the next one.
        predictions = predictions[:, :-1]
        labels = labels[:, 1:]

        # -100 marks positions the collator excluded from the loss (padding).
        keep = labels != -100

        # 'func' is the 'accuracy' metric loaded above.
        return func.compute(predictions = predictions[keep], references = labels[keep])

    # tokenizing, using 4 processes
    def tokenize_func(examples):
        return tokenizer(examples['0'], truncation=True, padding=True)

    # Tokenize every split and drop the raw text column, leaving only the
    # tokenizer outputs. The test split is prepared but not handed to the
    # Trainer below.
    tokenized = {
        split: dataset[split]
        .map(tokenize_func, batched=True, num_proc = 4)
        .remove_columns(['0'])
        for split in ("train", "valid", "test")
    }
    training_data = tokenized["train"]
    valid_data = tokenized["valid"]

    # Output directory named after the current time; weights are saved here.
    # makedirs also creates missing parents and tolerates an existing
    # directory, so a missing "weights/..." tree no longer surfaces later as
    # a confusing failure when the arguments file is written.
    od = output_dir + os.sep + datetime.strftime(datetime.now(), "%m-%d-%H-%M-%S")
    os.makedirs(od, exist_ok=True)

    # maximum batch, other hyperparameters except learning rate and weight decay are of defaults
    trainingarguments = TrainingArguments(
        do_train = True,
        output_dir = od,
        evaluation_strategy = "steps", # necessary: change to step
        save_strategy = "steps",
        eval_steps = 50, # necessary: set step
        save_steps = 50,
        save_total_limit = 1,
        load_best_model_at_end = True, # necessary: must be True to use EarlyStoppingCallback
        metric_for_best_model = "accuracy",
        greater_is_better = True, # necessary: a higher metric means better performance
        num_train_epochs = 3,
        seed = 42,
        per_device_train_batch_size = 512,
        per_device_eval_batch_size = 512,

        # learning rate and weight decay are controlled from the command line
        learning_rate = args.lr,
        weight_decay = args.decay,
        remove_unused_columns = False
    )

    # save the training arguments next to the checkpoints
    with open(od + os.sep + "trainingargs.json", "w") as f:
        f.write(json.dumps(trainingarguments.to_dict(), indent = 2, ensure_ascii = False))

    # set trainer for the autoregressive task (mlm=False -> causal LM labels)
    trainer = Trainer(
        model = model,
        args = trainingarguments,
        tokenizer = tokenizer,
        train_dataset = training_data,
        eval_dataset = valid_data,
        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
        compute_metrics = partial(metric, func = accuracy)
    )

    return trainer

# initialize gpt model with AutoConfig
def set_config(ggangtong_model_checkpoint, tokenizer):
    """Create the model config from a base checkpoint with custom sizes.

    Args:
        ggangtong_model_checkpoint: Checkpoint whose configuration is loaded
            as the starting point.
        tokenizer: Tokenizer supplying the vocabulary size and the BOS/EOS
            token ids.

    Returns:
        The config loaded from the checkpoint with the overrides below applied.
    """
    vocab_size = len(tokenizer)
    print(vocab_size - 1)

    # NOTE: 2 heads, 22 layers, maximum token length 1024
    overrides = {
        "vocab_size": vocab_size,
        "n_ctx": 1024,
        "bos_token_id": tokenizer.bos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
        "n_embd": 64,
        "n_head": 2,
        "n_layer": 22,
        "n_positions": 1024,
    }

    return AutoConfig.from_pretrained(ggangtong_model_checkpoint, **overrides)

# main operations wrapper
def _train_with_tokenizer(tokenizer_checkpoint, output_dir, config_checkpoint, dataset, args):
    """Train one freshly initialised GPT-2 model with the given tokenizer."""
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
    # Batches are padded during tokenization, so a pad token must exist
    # (presumably the hand-made tokenizers ship without one - confirm).
    tokenizer.add_special_tokens({"pad_token": "<pad>"})

    # Reset the seed before building each model so that its initial weights
    # do not depend on how much randomness an earlier training run consumed.
    set_seed(42)

    # initialize model config, then the model itself (random weights)
    model_config = set_config(config_checkpoint, tokenizer)
    model = GPT2LMHeadModel(model_config)

    # initialize trainer and run
    trainer = set_trainer(model, tokenizer, dataset, output_dir, args)
    trainer.train()


# main operations wrapper
def main_train():
    """Train the 'indiv' and 'joint' tokenizer models with the CLI hyperparameters.

    Reads ``--lr`` and ``--decay`` from the command line, loads the
    preprocessed CSV splits and trains one model per tokenizer, the
    individual tokenizer first and the joint tokenizer second.
    """
    # get arguments
    parser = ArgumentParser()
    parser.add_argument("--lr", type = float, required = True)
    parser.add_argument("--decay", type = float, required = True)
    args = parser.parse_args()

    print(args.lr, args.decay)

    # load data
    data_path = {"train": "data_processed/training_data.csv", "valid": "data_processed/valid_data.csv", "test": "data_processed/test_data.csv"}
    dataset = load_dataset("csv", data_files = data_path)

    # stock checkpoint, used only to initialise the config
    ggangtong_model_checkpoint = "openai-community/gpt2"

    # (tokenizer checkpoint, output directory) for each experimental condition
    runs = [
        ("tokenizers/tokenizer_indiv_jaeyoon", "weights/model_indiv"),
        ("tokenizers/tokenizer_joint_jaeyoon", "weights/model_joint"),
    ]
    for tokenizer_checkpoint, output_dir in runs:
        _train_with_tokenizer(
            tokenizer_checkpoint,
            output_dir,
            ggangtong_model_checkpoint,
            dataset,
            args,
        )


In [None]:
# Script entry point: run preprocessing first, then training.
if __name__ == "__main__":
    main_preprocess()
    main_train()