In [None]:
import json
import os

import torch
from datasets import Dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

# Authenticate with Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable instead of
# being hard-coded, so the secret is never stored in the notebook or committed
# to version control. When HF_TOKEN is unset, `token=None` is passed and
# `login()` is expected to fall back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

# **Step 1: Load Pretrained Mistral Model & Tokenizer**
model_name = "mistralai/Mistral-7B-v0.1"

# Load tokenizer and make sure it can pad batches.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Remember whether the vocabulary grows, so the embedding matrix is only
# resized when it actually has to be.
pad_token_added = tokenizer.pad_token is None
if pad_token_added:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Define BitsAndBytesConfig for 4-bit quantization with CPU offloading
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True,  # Enable CPU offloading for 32-bit parts
)

# Load model with updated quantization config and device_map
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map="auto",  # Let Transformers automatically handle device placement
)

if pad_token_added:
    # Grow the embeddings only when a new token was added. An unconditional
    # resize to len(tokenizer) could shrink an embedding matrix that is
    # larger than the tokenizer's vocabulary.
    model.resize_token_embeddings(len(tokenizer))

# Keep the model config in sync with the tokenizer so code that reads the pad
# id from the model sees the same value as the tokenizer uses.
model.config.pad_token_id = tokenizer.pad_token_id

# **Step 2: Load Merged Finance Dataset**
def load_json_dataset(json_path):
    """Load a JSON array of records into a ``datasets.Dataset``.

    Args:
        json_path: Path to a JSON file whose top-level value is a list of
            objects, one object per example.

    Returns:
        The ``Dataset`` produced by ``Dataset.from_list`` for those records.

    Raises:
        ValueError: If the top-level JSON value is not a list of objects.
    """
    # JSON text is UTF-8 by specification; being explicit avoids depending on
    # the platform's locale encoding when the data contains non-ASCII text.
    with open(json_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Fail early with a readable message instead of an opaque error from
    # inside Dataset.from_list.
    if not isinstance(data, list) or not all(isinstance(row, dict) for row in data):
        raise ValueError(
            f"{json_path} must contain a JSON array of objects, "
            f"got top-level {type(data).__name__}"
        )

    return Dataset.from_list(data)

# Read the instruction/response records from the JSON file on disk.
dataset = load_json_dataset(json_path="/content/fixed_dataset.json")

# **Step 3: Tokenize Data**
def _to_text(value):
    """Coerce one dataset field to a string (None -> "", sequences space-joined)."""
    if isinstance(value, str):
        return value
    if value is None:
        return ""
    if isinstance(value, (list, tuple)):
        # str() each part so non-string items do not break the join.
        return " ".join(str(part) for part in value)
    return str(value)


def tokenize_function(examples, tok=None, max_length=512):
    """Tokenize a batch of instruction/response pairs for causal-LM fine-tuning.

    Each example becomes ``"<instruction> <response><eos>"``. The EOS token is
    appended so the model sees an explicit end-of-answer signal during
    training.

    Sequences are truncated to ``max_length`` but deliberately NOT padded
    here; padding is left to the data collator at batch time. Padding every
    row to ``max_length`` up front wastes compute on short examples and bakes
    pad tokens into ``input_ids``.

    Args:
        examples: Batch mapping with parallel ``"instruction"`` and
            ``"response"`` lists (the shape ``Dataset.map(batched=True)``
            passes in). Values may be strings, lists of parts, or ``None``.
        tok: Tokenizer-like callable to use. Defaults to the module-level
            ``tokenizer``.
        max_length: Maximum number of tokens kept per example.

    Returns:
        Whatever the tokenizer returns for the batch of texts.
    """
    active_tokenizer = tokenizer if tok is None else tok
    # Tokenizers without an EOS token simply get no suffix.
    eos = getattr(active_tokenizer, "eos_token", None) or ""

    texts = [
        f"{_to_text(instr)} {_to_text(resp)}{eos}"
        for instr, resp in zip(examples["instruction"], examples["response"])
    ]

    return active_tokenizer(texts, truncation=True, max_length=max_length)

# Tokenize batch-wise; the raw text columns are dropped once they are encoded.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["instruction", "response"],
)

# **Step 4: Configure LoRA Parameters**
# Prepare the quantized base model for training before adapters are attached.
# This is PEFT's documented step for k-bit models: it freezes the base
# weights, upcasts the remaining half-precision parameters (e.g. layer norms)
# to fp32 for numerical stability, and turns on gradient checkpointing to
# reduce activation memory (at the cost of slower training steps).
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

lora_config = LoraConfig(
    r=16,                # Rank (Adjust for memory/performance tradeoff)
    lora_alpha=32,       # Scaling factor
    lora_dropout=0.05,   # Dropout for regularization
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the prepared model so only the LoRA adapter weights are trainable.
model = get_peft_model(model, lora_config)

# **Step 5: Define Training Arguments**
# Decide the mixed-precision mode once: bf16 when the GPU supports it,
# fp16 otherwise.
prefer_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    # Where checkpoints are written, and no external experiment tracking.
    output_dir="mistral_lora_finance",
    report_to="none",
    # One sample per step, accumulated over 8 steps before each update.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=2e-4,
    weight_decay=0.01,
    # Exactly one of the two half-precision modes is enabled.
    bf16=prefer_bf16,
    fp16=not prefer_bf16,
    # Logging and checkpoint cadence, in optimizer steps.
    logging_steps=100,
    save_steps=500,
)


# **Step 6: Train the Model**
# Pass the tokenizer configured above explicitly, so batching pads with the
# same pad token and the checkpoints written during training carry the same
# tokenizer as the final save. If it is omitted, SFTTrainer has to obtain a
# tokenizer on its own, which would not include any pad token added above.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer,
)

trainer.train()

# **Step 7: Save the Fine-Tuned LoRA Adapter**
# The adapter weights and the tokenizer files go into the same directory.
adapter_dir = "mistral_lora_finance_adapter"
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

# Research Document: Fine-tuning Mistral-7B for Financial Applications using LoRA

## Abstract
This document details the methodology for fine-tuning the `Mistral-7B-v0.1` large language model for the financial domain using the Low-Rank Adaptation (LoRA) technique. The objective is to adapt the pre-trained model so that it better understands, generates, and responds to queries within a specialized financial context. The process leverages 4-bit quantization for efficient resource utilization and employs `trl`'s `SFTTrainer` for supervised fine-tuning on a custom JSON-formatted financial dataset. The outcome is a LoRA adapter that can be merged with the base model for deployment in financial NLP tasks.

## 1. Introduction
Large Language Models (LLMs) have demonstrated remarkable capabilities across a wide range of natural language processing (NLP) tasks. However, their general-purpose nature often necessitates domain-specific adaptation for optimal performance in specialized fields. This research outlines a practical approach to fine-tuning the `Mistral-7B-v0.1` model, a powerful open-source LLM, for financial applications. The chosen fine-tuning strategy, LoRA, is selected for its computational efficiency, allowing for effective adaptation with limited resources.

## 2. Methodology

### 2.1 Model Selection and Quantization

The base model selected for fine-tuning is `mistralai/Mistral-7B-v0.1`. This model is known for its strong performance at a comparatively small size (roughly 7 billion parameters), making it suitable for fine-tuning on consumer-grade hardware or cloud instances with limited GPU memory.

To further optimize memory usage and computational speed, the model is loaded with **4-bit quantization** using `BitsAndBytesConfig`. This configuration includes:
*   `load_in_4bit=True`: Enables 4-bit quantization.
*   `bnb_4bit_compute_dtype=torch.float16`: Specifies `float16` as the compute data type for 4-bit tensors.
*   `bnb_4bit_use_double_quant=True`: Activates double quantization, which quantizes the quantization constants, leading to further memory savings.
*   `llm_int8_enable_fp32_cpu_offload=True`: Allows offloading of specific 32-bit components to the CPU, freeing up GPU memory.

The model is loaded with `device_map="auto"` to enable automatic device placement by the `transformers` library.

### 2.2 Tokenizer Configuration

The `AutoTokenizer` from the `transformers` library is used to load the tokenizer corresponding to the `Mistral-7B-v0.1` model. A crucial step involves ensuring a `pad_token` is defined. If the default tokenizer lacks one, a special token `[PAD]` is added, and the model's token embeddings are resized accordingly to accommodate this new token.

### 2.3 Dataset Preparation

**Dataset Source:** A custom financial dataset, expected to be located at `/content/fixed_dataset.json`, is loaded. The dataset is anticipated to be a JSON array where each element is an object containing `"instruction"` and `"response"` keys, representing question-answer pairs or prompts and their desired completions in a financial context.

**Tokenization Function:** A `tokenize_function` is implemented to process the dataset. This function concatenates the `instruction` and `response` fields, ensuring they are string types, and then tokenizes the combined text. The tokenization process includes:
*   `padding="max_length"`: Pads sequences to the `max_length`.
*   `truncation=True`: Truncates sequences longer than `max_length`.
*   `max_length=512`: Sets the maximum sequence length for tokenization.

The dataset is processed in batches to improve efficiency, and the original `instruction` and `response` columns are removed after tokenization.

### 2.4 LoRA Configuration

Low-Rank Adaptation (LoRA) is utilized to efficiently fine-tune the large model. LoRA injects trainable rank decomposition matrices into the transformer layers, significantly reducing the number of trainable parameters. The `LoraConfig` is set with the following parameters:
*   `r=16`: The rank of the update matrices, controlling the expressiveness of the adaptation.
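*   `lora_alpha=32`: The LoRA scaling factor; the low-rank update is scaled by `lora_alpha / r` (here 32 / 16 = 2) before being added to the frozen weights' output.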
*   `lora_dropout=0.05`: Dropout applied to the LoRA layers for regularization.
*   `bias="none"`: Specifies that bias parameters will not be trained.
*   `task_type="CAUSAL_LM"`: Defines the task as causal language modeling.

### 2.5 Training Arguments

Training parameters are defined using `TrainingArguments` from the `transformers` library:
*   `per_device_train_batch_size=1`: Small batch size due to memory constraints with large models.
*   `gradient_accumulation_steps=8`: Compensates for the small batch size by accumulating gradients over multiple steps.
*   `num_train_epochs=3`: The number of passes over the training dataset.
*   `learning_rate=2e-4`: The initial learning rate for the optimizer.
*   `weight_decay=0.01`: Regularization parameter.
*   `bf16`/`fp16`: Mixed precision training is enabled, preferring `bf16` if supported by the hardware, otherwise defaulting to `fp16` for faster training and reduced memory footprint.
*   `logging_steps=100`: Frequency of logging training metrics.
*   `save_steps=500`: Frequency of saving model checkpoints.
*   `output_dir="mistral_lora_finance"`: Directory to save model outputs.
*   `report_to="none"`: Disables automatic reporting to external services.

### 2.6 Trainer Initialization and Execution

The `SFTTrainer` from the `trl` library is used to handle the supervised fine-tuning process. It is initialized with the LoRA-adapted model, the tokenized training dataset, and the defined training arguments. The `trainer.train()` method then commences the fine-tuning process.

## 3. Results and Artifacts

Upon successful completion of the training process, the fine-tuned LoRA adapter weights and the tokenizer configuration will be saved to the directory `mistral_lora_finance_adapter`. These artifacts can then be loaded to run inference with the adapted model, potentially by merging the LoRA weights back into the base model.

## 4. Conclusion and Future Work

This notebook provides a robust framework for domain-adapting large language models using efficient fine-tuning techniques. The fine-tuned Mistral-7B model is expected to exhibit enhanced performance on financial NLP tasks. Future work could involve evaluating the model on specific financial benchmarks, experimenting with different LoRA parameters, integrating more diverse financial datasets, or exploring alternative fine-tuning methods.

In [5]:
# Upgrade the `trl` package (it provides the `SFTTrainer` imported by the training cell) to its latest release.
!pip install -U trl
# After this cell runs, please restart the Colab runtime (Runtime -> Restart runtime...) and then run all cells from the beginning.

Collecting trl
  Downloading trl-0.26.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.26.0-py3-none-any.whl (517 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.2/517.2 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.26.0
