## Part 1: Data Prep

### 1.1 Initialize Directories
This block checks whether the output paths already exist. If they do, it removes them so the notebook starts from a clean state.

In [None]:
import os
import shutil

# Number of disjoint partitions the dataset is split into later in the notebook.
partition_size = 5

# Artifacts produced by this notebook.
directory = "./data"
jsonl_path = './data/dataset_new.jsonl'
save_path = './data/dataset_new'
partition_path = './data/partition'

# Remove leftovers from a previous run so stale data cannot leak into this one.
if os.path.exists(jsonl_path):
    os.remove(jsonl_path)

# The partition directory is cleaned together with the main dataset directory;
# both are regenerated further down.
for stale_dir in (save_path, partition_path):
    if os.path.exists(stale_dir):
        shutil.rmtree(stale_dir)

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(directory, exist_ok=True)

### 1.2 Load and Prepare Dataset:
- Import necessary libraries from the datasets package: https://huggingface.co/docs/datasets/index
- Load the Twitter Financial News Sentiment (TFNS) dataset and convert it to a Pandas dataframe. https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment
- Map numerical labels to their corresponding sentiments (negative, positive, neutral).
- Add an instruction to each data entry, which is crucial for Instruction Tuning.
- Convert the Pandas dataframe back to a Hugging Face Dataset object.

In [None]:
from datasets import load_dataset
import datasets

# Integer class id -> sentiment word used as the training target.
dic = {0: "negative", 1: 'positive', 2: 'neutral'}

# Pull the TFNS train split into a DataFrame, replace the numeric labels with
# words, attach the fixed instruction text, and rename the columns
# (positionally) to the input/output/instruction layout used below.
tfns = load_dataset('zeroshot/twitter-financial-news-sentiment')['train'].to_pandas()
tfns['label'] = tfns['label'].apply(lambda label_id: dic[label_id])
tfns['instruction'] = 'What is the sentiment of this tweet? Please choose an answer from {negative/neutral/positive}.'
tfns.columns = ['input', 'output', 'instruction']

# Back to a Hugging Face Dataset for the following steps.
tfns = datasets.Dataset.from_pandas(tfns)

### 1.3 Concatenate and Shuffle the data
- Concatenate the TFNS dataset with itself to form the training set
- Shuffle the training set


In [None]:
# Duplicate the dataset (two copies back to back) to increase the example count.
tmp_dataset = datasets.concatenate_datasets([tfns, tfns])
train_dataset = tmp_dataset
print(train_dataset.num_rows)

# Shuffle with a fixed seed so the order is reproducible.
all_dataset = train_dataset.shuffle(seed=42)
all_dataset.shape

## Part 2: Dataset Formatting and Tokenization

### 2.1 Dataset Formatting

In [None]:
import json
# The notebook-aware progress bar lives in tqdm.notebook (there is no notebook.tqdm module).
from tqdm.notebook import tqdm

In [None]:
# Turn an instruction/input/output record into a "context"/"target" pair.
def format_example(example: dict) -> dict:
    """Build the prompt ("context") and expected answer ("target") for one record.

    The context always starts with the instruction line and ends with
    "Answer: "; an "Input: ..." line is inserted between them only when the
    record has a truthy "input" value. The target is the record's "output".
    """
    pieces = [f"Instruction: {example['instruction']}\n"]
    if example.get("input"):
        pieces.append(f"Input: {example['input']}\n")
    pieces.append("Answer: ")
    return {"context": "".join(pieces), "target": example["output"]}

In [None]:
# Materialize the shuffled dataset as a list of plain dicts, one per row,
# keeping only the three fields that format_example reads.
data_list = [
    {
        "instruction": row.instruction,
        "input": row.input,
        "output": row.output,
    }
    for row in all_dataset.to_pandas().itertuples()
]

In [None]:
# Split the examples into `partition_size` disjoint subsets by striding:
# subset k receives items k, k + partition_size, k + 2 * partition_size, ...
partition_size = 5
partition = [data_list[offset::partition_size] for offset in range(partition_size)]

In [None]:
def _write_jsonl(records, path):
    """Format each record with format_example and write it as one JSON line to `path`."""
    with open(path, 'w') as out:
        for record in tqdm(records, desc="formatting.."):
            out.write(json.dumps(format_example(record)) + '\n')


# Full dataset -> a single JSONL file.
_write_jsonl(data_list, "./data/dataset_new.jsonl")

# Each partition -> its own JSONL file.
for i in range(partition_size):
    _write_jsonl(partition[i], f"./data/dataset_new_partition_{i}.jsonl")

### 2.2 Tokenization
Tokenization is the process of converting input text into tokens that can be fed into the model.

In [None]:
import datasets
from transformers import AutoModelForCausalLM, AutoTokenizer


# Checkpoint whose tokenizer/config are used for tokenization and which is
# loaded as the base model later in the notebook.
model_name = 'meta-llama/Llama-2-7b-hf'

# Locations of the formatted JSONL file and of the tokenized datasets.
jsonl_path = './data/dataset_new.jsonl'
save_path = './data/dataset_new'
partition_path = './data/partition'

# Length limit for tokenized examples, and whether examples over the limit
# are dropped (True) rather than truncated.
max_seq_length, skip_overlength = 512, True

In [None]:
def preprocess(tokenizer, config, example, max_seq_length):
    """Tokenize one context/target example into a single id sequence.

    The context and the target are encoded separately, each truncated to
    ``max_seq_length`` tokens; the target is encoded without special tokens.
    The result is context ids + target ids + ``config.eos_token_id``. No
    padding is applied here, and the combined sequence may be longer than
    ``max_seq_length``.

    Returns:
        dict with "input_ids" (the combined id list) and "seq_len" (the
        number of context ids, i.e. where the target starts).
    """
    context_text, answer_text = example["context"], example["target"]
    context_ids = tokenizer.encode(
        context_text, max_length=max_seq_length, truncation=True)
    answer_ids = tokenizer.encode(
        answer_text,
        add_special_tokens=False,
        truncation=True,
        max_length=max_seq_length)
    return {
        "input_ids": [*context_ids, *answer_ids, config.eos_token_id],
        "seq_len": len(context_ids),
    }

def read_jsonl(path, max_seq_length, skip_overlength=False):
    """Yield tokenized features for every example in a JSONL file.

    Each non-blank line of ``path`` is parsed as JSON and passed to
    ``preprocess`` together with the tokenizer and config of ``model_name``.

    Args:
        path: JSONL file with one {"context", "target"} object per line.
        max_seq_length: maximum number of token ids kept per example.
        skip_overlength: when True, examples whose combined length exceeds
            ``max_seq_length`` are dropped; when False they are truncated
            to ``max_seq_length`` ids.

    Yields:
        dict with "input_ids" and "seq_len", as returned by ``preprocess``.
    """
    # AutoConfig is not part of the notebook's transformers imports, so it is
    # imported here to keep this function self-contained.
    from transformers import AutoConfig

    tokenizer = AutoTokenizer.from_pretrained(
        model_name, trust_remote_code=True)
    # Only the config (for eos_token_id) is needed; no weights are loaded here,
    # so no device placement argument is passed.
    config = AutoConfig.from_pretrained(
        model_name, trust_remote_code=True)
    with open(path, "r", encoding="utf-8") as f:
        # readlines() gives tqdm a length, so the progress bar shows a total.
        for line in tqdm(f.readlines()):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline) instead of
                # failing inside json.loads.
                continue
            example = json.loads(line)
            feature = preprocess(tokenizer, config, example, max_seq_length)
            if skip_overlength and len(feature["input_ids"]) > max_seq_length:
                continue
            feature["input_ids"] = feature["input_ids"][:max_seq_length]
            yield feature

### 2.3 Saving the dataset

In [None]:
# Tokenize the full JSONL file into a Hugging Face Dataset and store it on disk.
save_path = './data/dataset_new'


def _full_dataset_examples():
    """Return the generator of tokenized examples for the full JSONL file."""
    return read_jsonl(jsonl_path, max_seq_length, skip_overlength)


dataset = datasets.Dataset.from_generator(_full_dataset_examples)
dataset.save_to_disk(save_path)

In [None]:
# Tokenize each partition's JSONL file and save it as its own on-disk dataset.
for i in range(partition_size):
    partition_path = f'./data/partition/partition_{i}'
    # Pass the arguments through gen_kwargs rather than a lambda closing over
    # the loop variable: this avoids the late-binding pitfall and makes the
    # per-partition input path an explicit argument of the generator call.
    dataset = datasets.Dataset.from_generator(
        read_jsonl,
        gen_kwargs={
            "path": f'./data/dataset_new_partition_{i}.jsonl',
            "max_seq_length": max_seq_length,
            "skip_overlength": skip_overlength,
        },
    )
    dataset.save_to_disk(partition_path)

## Part 3: Setup FinGPT Training configuration with LoRA

### 3.1 Training Argument Setup
Initialize and set the training arguments.

In [None]:
from typing import List, Dict, Optional
import torch
from loguru import logger
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import (
    TaskType,
    LoraConfig,
    get_peft_model,
    set_peft_model_state_dict,
    prepare_model_for_kbit_training,
    prepare_model_for_int8_training,
)
from peft.utils import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING

In [None]:
# Hyper-parameters shared by every Trainer created in this notebook.
_training_kwargs = dict(
    output_dir='./finetuned_model',    # saved model path
    # Schedule (alternative: max_steps=10000 instead of an epoch count)
    num_train_epochs=3,
    warmup_steps=1000,
    # Batching
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    # Optimizer
    learning_rate=1e-4,
    weight_decay=0.01,
    # Precision (alternative: bf16=True instead of fp16)
    fp16=True,
    torch_compile=False,
    # Logging, evaluation and checkpointing cadence
    logging_steps=500,
    save_steps=500,
    evaluation_strategy="steps",
    load_best_model_at_end=True,
    # Keep every dataset column so the collator still sees "seq_len".
    remove_unused_columns=False,
)
training_args = TrainingArguments(**_training_kwargs)

### 3.2 Quantization Config Setup

In [None]:
# 4-bit quantization settings: NF4 quantization type, double quantization,
# float16 as the compute dtype.
compute_dtype = torch.float16
q_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

### 3.3 Model Loading & Preparation
Load the base model and tokenizer, and prepare the model for k-bit (quantized) training.

In [None]:
# Tokenizer for the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
# The data collator pads with tokenizer.pad_token_id; if the tokenizer does not
# define a pad token, reuse EOS so that id is never None.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the quantized base model. Device placement is controlled by
# `device_map`; `device` is not a from_pretrained argument and would be
# forwarded to the model constructor, which rejects it.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    quantization_config=q_config,
    device_map='auto',
)

# The second parameter of prepare_model_for_kbit_training is
# `use_gradient_checkpointing`, not a quantization config. The original call
# passed a truthy object there, so gradient checkpointing stays enabled.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)


### 3.4 LoRA Config & Setup
Implement LoRA and print trainable params

In [None]:
def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()`` once and reports the element count of
    parameters with ``requires_grad`` set, the element count of all
    parameters, and the trainable share as a percentage.
    """
    counts = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(size for size, _ in counts)
    trainable_params = sum(size for size, trainable in counts if trainable)
    summary = (
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
    print(summary)

In [None]:
# LoRA: wrap the prepared model with adapters on the module names PEFT lists
# for the 'llama' architecture.
target_modules = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING['llama']
_lora_kwargs = dict(
    task_type=TaskType.CAUSAL_LM,
    target_modules=target_modules,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias='none',
    inference_mode=False,
)
lora_config = LoraConfig(**_lora_kwargs)
model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

In [None]:
# Optional warm start: set `resume_from_checkpoint` to a checkpoint directory
# to load previously saved weights into the PEFT model. None skips the step.
resume_from_checkpoint = None
if resume_from_checkpoint is not None:
    full_weights_file = os.path.join(resume_from_checkpoint, 'pytorch_model.bin')
    adapter_weights_file = os.path.join(resume_from_checkpoint, 'adapter_model.bin')
    if os.path.exists(full_weights_file):
        checkpoint_name = full_weights_file
    else:
        # Fall back to the adapter-only file; the flag is set to False in this
        # case (NOTE(review): presumably so no full resume is attempted later - confirm).
        checkpoint_name = adapter_weights_file
        resume_from_checkpoint = False
    if not os.path.exists(checkpoint_name):
        logger.info(f'Checkpoint {checkpoint_name} not found')
    else:
        logger.info(f'Restarting from {checkpoint_name}')
        adapters_weights = torch.load(checkpoint_name)
        set_peft_model_state_dict(model, adapters_weights)

In [None]:
# Call the model object's own print_trainable_parameters() method (as opposed
# to the notebook-level helper function of the same name).
model.print_trainable_parameters()

In [None]:
import copy
# Create one independent deep copy of the model per partition
# (`partition_size` copies in total) ...
model_list = []
for _ in range(partition_size):
    model_list.append(copy.deepcopy(model))

# ... then print the trainable-parameter summary of each copy.
for replica in model_list:
    replica.print_trainable_parameters()

## Part 4: Loading Data and Training Llama2
In this segment, we'll delve into the loading of your pre-processed data, and finally, launch the training of your FinGPT model. 

*Note: The per-partition (federated) models are also trained in this step.*

### 4.1 Loading your Data

In [None]:
# load data
from datasets import load_from_disk
import datasets

# Full tokenized dataset: 80/20 train/test split with a fixed seed.
dataset = datasets.load_from_disk("./data/dataset_new").train_test_split(
    0.2, shuffle=True, seed=42
)

# Load every partition from disk and split it the same way.
partitioned_dataset = []
for i in range(partition_size):
    partition_path = f'./data/partition/partition_{i}'
    partition_split = datasets.load_from_disk(partition_path).train_test_split(
        0.2, shuffle=True, seed=42
    )
    partitioned_dataset.append(partition_split)

### 4.2 Training Configuration and Launch:
- Customize the Trainer class for specific loss computation, prediction step, and model-saving methods.
- Define a data collator function to process batches of data during training.
- Set up TensorBoard for logging, instantiate your modified trainer, and begin training.

In [None]:
class ModifiedTrainer(Trainer):
    """Trainer variant that feeds only ``input_ids``/``labels`` to the model
    and saves only the parameters that require gradients."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the model's loss.

        ``return_outputs`` is honoured by returning ``(loss, outputs)``.
        ``num_items_in_batch`` is accepted (and ignored) so the override
        stays callable by Trainer versions that pass it.
        """
        outputs = model(
            input_ids=inputs["input_ids"],
            labels=inputs["labels"],
        )
        if return_outputs:
            return outputs.loss, outputs
        return outputs.loss

    def prediction_step(self, model: torch.nn.Module, inputs, prediction_loss_only: bool, ignore_keys = None):
        """Evaluate one batch without gradients; only the loss is returned.

        Returns:
            ``(loss, None, None)`` - no logits and no labels are reported.
        """
        with torch.no_grad():
            res = model(
                input_ids=inputs["input_ids"].to(model.device),
                labels=inputs["labels"].to(model.device),
            ).loss
        return (res, None, None)

    def save_model(self, output_dir=None, _internal_call=False):
        """Save the training arguments and the trainable parameters only.

        Writes the training args under ``TRAINING_ARGS_NAME`` and a state
        dict of all parameters with ``requires_grad`` (moved to CPU) to
        ``adapter_model.bin`` inside ``output_dir``.
        """
        from transformers.trainer import TRAINING_ARGS_NAME

        # The signature allows output_dir=None; fall back to the configured
        # output directory instead of failing in os.makedirs(None).
        if output_dir is None:
            output_dir = self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
        saved_params = {
            k: v.to("cpu") for k, v in self.model.named_parameters() if v.requires_grad
        }
        torch.save(saved_params, os.path.join(output_dir, "adapter_model.bin"))

def data_collator(features: list, pad_token_id: Optional[int] = None, label_pad_token_id: int = -100) -> dict:
    """Collate tokenized features into right-padded ``input_ids``/``labels`` tensors.

    Features are ordered longest first and padded to the longest sequence in
    the batch. In ``labels`` the first ``seq_len - 1`` positions (prompt) and
    all padding positions are set to ``label_pad_token_id``; tokens from index
    ``seq_len - 1`` onward are kept as targets.

    Args:
        features: list of dicts with "input_ids" (list of ids) and "seq_len".
        pad_token_id: id used to pad ``input_ids``. Defaults to the global
            tokenizer's pad token id, or its EOS id when no pad token is set.
        label_pad_token_id: value written into masked label positions. The
            default -100 is the index ignored by PyTorch's cross-entropy loss,
            so masked positions do not contribute to the loss.

    Returns:
        dict with LongTensors "input_ids" and "labels" of shape (batch, longest).

    Raises:
        ValueError: if ``features`` is empty.
    """
    if not features:
        raise ValueError("data_collator received an empty batch")

    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is None:
            # Some tokenizers define no pad token; a None id cannot be
            # stored in a LongTensor, so fall back to EOS.
            pad_token_id = tokenizer.eos_token_id

    longest = max(len(feature["input_ids"]) for feature in features)
    input_rows = []
    label_rows = []
    # Stable sort, longest first.
    for feature in sorted(features, key=lambda item: -len(item["input_ids"])):
        ids = list(feature["input_ids"])
        pad_len = longest - len(ids)
        # Clamp at 0 so seq_len == 0 cannot produce a negative slice start.
        masked_prefix = max(feature["seq_len"] - 1, 0)
        labels = (
            [label_pad_token_id] * masked_prefix
            + ids[masked_prefix:]
            + [label_pad_token_id] * pad_len
        )
        input_rows.append(torch.LongTensor(ids + [pad_token_id] * pad_len))
        label_rows.append(torch.LongTensor(labels))

    return {
        "input_ids": torch.stack(input_rows),
        "labels": torch.stack(label_rows),
    }

In [None]:
from torch.utils.tensorboard import SummaryWriter
from transformers.integrations import TensorBoardCallback

In [None]:
# Fine-tune the model on the full dataset, logging through a TensorBoard writer.
writer = SummaryWriter()
_trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    callbacks=[TensorBoardCallback(writer)],
)
trainer = ModifiedTrainer(**_trainer_kwargs)
trainer.train()
writer.close()

# Save the trained model into the Trainer's output directory.
model.save_pretrained(training_args.output_dir)

# Evaluation loss of each partition model, in partition order; used afterwards
# to weight the adapters during aggregation.
validation_loss = []

# Train one model copy per partition on that partition's data.
for i in range(partition_size):
    writer = SummaryWriter()
    partition_path = f'./finetuned_model/partitioned/partition_{i}'
    partitioned_trainer = ModifiedTrainer(
        model=model_list[i],
        args=training_args,
        train_dataset=partitioned_dataset[i]["train"],
        eval_dataset=partitioned_dataset[i]["test"],
        data_collator=data_collator,
        callbacks=[TensorBoardCallback(writer)],
    )
    partitioned_trainer.train()
    writer.close()

    # Save the partitioned model
    model_list[i].save_pretrained(partition_path)

    # Record this partition's validation loss. A separate name is used so the
    # scalar does not overwrite the `validation_loss` list.
    eval_loss = partitioned_trainer.evaluate()["eval_loss"]
    validation_loss.append(eval_loss)


In [None]:
from peft import PeftModel

# Inverse-loss weighting: a partition with a lower validation loss gets a
# larger share; the shares are normalized to sum to 1.
weights = [1 / loss for loss in validation_loss]
total_weight = sum(weights)
normalized_weights = [share / total_weight for share in weights]

# Names under which the partition adapters are registered.
adapter_list = [f'Institute_{i}' for i in range(partition_size)]

# Load the first partition adapter via from_pretrained, then attach the
# remaining ones to the same PeftModel.
partition_path = './finetuned_model/partitioned/partition_0'
aggregated_model = PeftModel.from_pretrained(model, partition_path, adapter_name='Institute_0')
for i in range(1, partition_size):
    partition_path = f'./finetuned_model/partitioned/partition_{i}'
    aggregated_model.load_adapter(partition_path, adapter_name=f'Institute_{i}')

# Combine all partition adapters linearly into a new adapter named 'merge'.
adapter_name = 'merge'
density = 0.2  # not passed to the 'linear' combination below
aggregated_model.add_weighted_adapter(
    adapters=adapter_list,
    weights=normalized_weights,
    adapter_name=adapter_name,
    combination_type='linear',
)

In [None]:
# Make the merged adapter the active one.
aggregated_model.set_adapter(adapter_name)

# Save the aggregated model. save_pretrained requires the target directory,
# so the path defined here is passed to it.
aggregated_path = './finetuned_model/aggregated'
aggregated_model.save_pretrained(aggregated_path)


## Part 5: Inference and Benchmarks using FinGPT
Now that your model is trained, let’s understand how to use it to infer and run benchmarks.

### 5.1 Load the model

In [None]:
import sys
sys.path.append('/content/FinNLP')
