In [None]:
import os

# Set before any tokenizer is created. Presumably this avoids the fork
# warning/deadlock from the tokenizers library once `datasets.map(num_proc=4)`
# starts worker processes below -- TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


from datasets import concatenate_datasets, load_dataset
from huggingface_hub import notebook_login
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

In [None]:
# Workaround for when you can't log in using notebook_login

# from huggingface_hub import interpreter_login
# interpreter_login()

In [None]:
# Log in to the Hugging Face Hub from the notebook; the training arguments
# further down set push_to_hub=True and the final cells call push_to_hub().
notebook_login()

In [None]:
# A 16 GB Tesla T4 is not enough to train EleutherAI/gpt-neo-1.3B,
# so switch to distilgpt2.
model_checkpoint = "distilgpt2"

# Load the pretrained tokenizer and causal-LM model for the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

In [None]:
# Fetch the four finance corpora from the Hugging Face Hub, in this order.
finance_corpus_ids = (
    "gbharti/finance-alpaca",
    "PaulAdversarial/all_news_finance_sm_1h2023",
    "winddude/reddit_finance_43_250k",
    "causal-lm/finance",
)
dataset_1, dataset_2, dataset_3, dataset_4 = (
    load_dataset(corpus_id) for corpus_id in finance_corpus_ids
)

def join_text_fields(example, fields):
    """Build the ``text`` column by joining selected fields of one row.

    Args:
        example: One dataset row, as a mapping from column name to value.
        fields: Column names to concatenate, in order, separated by a space.

    Returns:
        ``{"text": <joined string>}``, which ``map`` merges into the row.

    A missing value (``None``) is treated as an empty string; plain ``+``
    concatenation would raise ``TypeError`` on it. For string values the
    result is identical to ``a + " " + b``.
    """
    return {"text": " ".join(example[field] or "" for field in fields)}


# Create a single "text" column per corpus, then drop every other column so
# all four corpora share the same schema and can be concatenated.
dataset_1 = dataset_1.map(
    join_text_fields,
    fn_kwargs={"fields": ["instruction", "output"]},
    num_proc=4,
).remove_columns(["input", "instruction", "output"])

dataset_2 = dataset_2.map(
    join_text_fields,
    fn_kwargs={"fields": ["title", "description"]},
    num_proc=4,
).remove_columns(["_id", "main_domain", "title", "description", "created_at"])

dataset_3 = dataset_3.map(
    join_text_fields,
    fn_kwargs={"fields": ["title", "selftext", "body"]},
    num_proc=4,
).remove_columns(
    [
        "id",
        "title",
        "selftext",
        "z_score",
        "normalized_score",
        "subreddit",
        "body",
        "comment_normalized_score",
        "combined_score",
    ]
)

dataset_4 = dataset_4.map(
    join_text_fields,
    fn_kwargs={"fields": ["instruction", "output"]},
    num_proc=4,
).remove_columns(["input", "instruction", "output"])

# Merge every available split of the four corpora into one dataset.
combined_dataset = concatenate_datasets(
    [
        dataset_1["train"],
        dataset_2["train"],
        dataset_3["train"],
        dataset_4["train"],
        dataset_4["validation"],
    ]
)

# Hold out 20% for evaluation. The seed is fixed so the split -- and therefore
# eval_loss, early stopping and the reported perplexity -- is reproducible
# across kernel restarts.
datasets = combined_dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
def tokenize_function(examples):
    """Run the notebook's tokenizer over the ``text`` column of a batch."""
    texts = examples["text"]
    return tokenizer(texts)


# Tokenize both splits in batches over 4 worker processes; the raw "text"
# column is removed so only the tokenizer's outputs remain.
tokenize_options = {"batched": True, "num_proc": 4, "remove_columns": ["text"]}
tokenized_datasets = datasets.map(tokenize_function, **tokenize_options)

In [None]:
# Number of tokens in each training example produced by group_texts.
block_size = 1024

def group_texts(examples):
    """Concatenate a batch of tokenized texts and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name (must include ``"input_ids"``) to a
            list of per-text token lists, as passed by a batched ``map``.

    Returns:
        A dict with the same keys plus ``"labels"``. Every value is a list of
        chunks of exactly ``block_size`` items; trailing tokens that do not
        fill a whole block are dropped.
    """
    # Flatten each column in one linear pass. ``sum(lists, [])`` builds a new
    # list on every addition, which is quadratic in the batch's token count.
    concatenated_examples = {
        k: [token for text in examples[k] for token in text]
        for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; padding could be added instead if the model
    # supported it. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split every column into consecutive chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels are a separate list holding the same chunks as input_ids.
    result["labels"] = result["input_ids"].copy()
    return result


# Re-chunk the tokenized corpus: each batch of 1000 texts is merged and cut
# into block_size-token examples, using 4 worker processes.
grouping_options = {"batched": True, "batch_size": 1000, "num_proc": 4}
lm_datasets = tokenized_datasets.map(group_texts, **grouping_options)

In [None]:
# Keep only the part after the last "/" so the output directory is named
# after the bare model (e.g. "org/name" -> "name").
model_name = model_checkpoint.rsplit("/", 1)[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-finance",
    # Evaluate, checkpoint and log once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Batching: 4 sequences per device, accumulated over 64 steps per update.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=64,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=50,
    fp16=True,
    # Checkpoint selection: lower eval_loss is better.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    push_to_hub=True,
)

In [None]:
# Early-stopping callback with a patience of 3, passed to the Trainer below.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# The "test" split produced by train_test_split serves as the evaluation set.
train_split = lm_datasets["train"]
eval_split = lm_datasets["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    callbacks=[early_stopping],
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
import math

# Evaluate on the held-out split and report perplexity = exp(eval_loss).
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")

In [None]:
# Upload via the Trainer, then push the tokenizer separately to a repo whose
# name matches the training output directory.
trainer.push_to_hub()
tokenizer.push_to_hub(f"{model_name}-finetuned-finance")

#### Inference

In [None]:
from transformers import pipeline

# Build a pipeline from the published fine-tuned checkpoint, reusing the
# tokenizer loaded earlier in the notebook.
generator = pipeline(model="lxyuan/distilgpt2-finetuned-finance", tokenizer=tokenizer)

prompt = "Tesla is"
# NOTE(review): num_return_sequences=2 needs sampling or beam search;
# presumably the checkpoint/pipeline defaults enable sampling -- confirm, or
# pass do_sample=True explicitly.
generation_kwargs = {
    "pad_token_id": generator.tokenizer.eos_token_id,  # use EOS as the pad id
    "max_new_tokens": 200,
    "num_return_sequences": 2,
}
generator(prompt, **generation_kwargs)