# RAFT

Retrieval Augmented Fine-Tuning

## Chunking and Question Generation

### 1. Setup the Environment

In [None]:
import pandas as pd

# Name of the dataset to generate; the output locations below derive from it.
ds_name = "output_sample_data"
# Folder with the source documents passed to raft.py via --datapath.
doc_path = "../../sample_data/"
# Folder passed to raft.py via --output.
ds_path = "../../" + ds_name
print(f"Creating dataset: {ds_name}")

### (Optional) Clean up the DEMO folder

In [None]:
# Clean up demo folder only if it's a DEMO dataset
if ds_path.endswith("demo"):
    import shutil

    # Sibling folder that is removed together with the dataset folder.
    checkpoints_path = ds_path + "-checkpoints"

    print(f"Cleaning demo folder {ds_path}")
    # ignore_errors=True: a folder that does not exist yet is not a failure.
    shutil.rmtree(ds_path, ignore_errors=True)
    # Report the folder that is actually being removed (not ds_path again).
    print(f"Cleaning demo checkpoints folder {checkpoints_path}")
    shutil.rmtree(checkpoints_path, ignore_errors=True)

### 2. Generate a Q/A/CoT fine-tuning dataset using RAFT from the domain-specific documents

In [None]:
! python3 ../raft.py \
    --datapath "$doc_path" \
    --output $ds_path \
    --distractors 1 \
    --doctype pdf \
    --chunk_size 2000 \
    --questions 3 \
    --workers 1 \
    --system-prompt-key gpt \
    --completion_model gpt-4 \
    --embedding_model text-embedding-ada-002 \
    --templates "../templates/" 

## Prepare training, validation and evaluation splits

### 3. Setup file locations for splits

In [None]:
# Arrow file expected inside the raft.py output folder.
raft_arrow_file = ds_path + "/data-00000-of-00001.arrow"

# Every exported JSONL file shares the prefix <ds_path>-files/<ds_name>.
files_prefix = f"{ds_path}-files/{ds_name}"

# Full dataset exports.
dataset_path = files_prefix + "-full.jsonl"
dataset_path_hf = files_prefix + "-hf.full.jsonl"

# Hugging Face format splits.
dataset_path_hf_train = files_prefix + "-hf.train.jsonl"
dataset_path_hf_valid = files_prefix + "-hf.valid.jsonl"
dataset_path_hf_eval = files_prefix + "-hf.eval.jsonl"

# Fine-tuning format splits, plus the token-filtered variants.
dataset_path_ft_train = files_prefix + "-ft.train.jsonl"
dataset_path_ft_train_filtered = files_prefix + "-ft.train.filtered.jsonl"
dataset_path_ft_valid = files_prefix + "-ft.valid.jsonl"
dataset_path_ft_valid_filtered = files_prefix + "-ft.valid.filtered.jsonl"
dataset_path_ft_eval = files_prefix + "-ft.eval.jsonl"

print(f"Reading arrow file {raft_arrow_file}")

### 4. Export the full Hugging Face dataset to JSONL

In [None]:
! python ../format.py \
    --input $raft_arrow_file \
    --output $dataset_path_hf \
    --output-format hf

In [None]:
# Load the exported full dataset (one JSON object per line) and preview its first five rows.
hf_full_df = pd.read_json(dataset_path_hf, lines=True)
hf_full_df.head(5)

### 5. Do the splitting
    - 80% Training Data / 10% Evaluation Data / 10% Validation Data

In [None]:
# split dataset into 80%/10%/10%
import numpy as np

samples_count = len(hf_full_df)

# Row positions at which the training and the validation parts end.
train_end = int(.8 * samples_count)
valid_end = int(.9 * samples_count)

# Slice by position rather than calling np.split on the DataFrame: np.split
# goes through DataFrame.swapaxes, which pandas has deprecated. The rows are
# taken in file order (no shuffling), exactly as np.split did.
hf_train_df = hf_full_df.iloc[:train_end]
hf_valid_df = hf_full_df.iloc[train_end:valid_end]
hf_eval_df = hf_full_df.iloc[valid_end:]

# Persist each split as JSON Lines (one record per line).
hf_train_df.to_json(dataset_path_hf_train, orient="records", lines=True)
hf_valid_df.to_json(dataset_path_hf_valid, orient="records", lines=True)
hf_eval_df.to_json(dataset_path_hf_eval, orient="records", lines=True)

## Export training and validation datasets into JSONL format

### 6. Export training dataset for a chat model

In [None]:
! python ../format.py \
    --input $dataset_path_hf_train \
    --input-type jsonl \
    --output $dataset_path_ft_train \
    --output-format chat 

In [None]:
# Load the exported training file (one JSON object per line) and preview its first five rows.
dataset_path_ft_train_df = pd.read_json(dataset_path_ft_train, lines=True)
dataset_path_ft_train_df.head(5)

### 7. Export the validation dataset for a chat model

In [None]:
! python ../format.py \
    --input $dataset_path_hf_valid \
    --input-type jsonl \
    --output $dataset_path_ft_valid \
    --output-format chat 

In [None]:
# Load the exported validation file (one JSON object per line) and preview its first five rows.
dataset_path_ft_valid_df = pd.read_json(dataset_path_ft_valid, lines=True)
dataset_path_ft_valid_df.head(5)

### 8. Export evaluation dataset into JSONL format

In [None]:
! python ../format.py \
    --input $dataset_path_hf_eval \
    --input-type jsonl \
    --output $dataset_path_ft_eval \
    --output-format eval

In [None]:
# Load the exported evaluation file (one JSON object per line) and preview its first five rows.
dataset_path_ft_eval_df = pd.read_json(dataset_path_ft_eval, lines=True)
dataset_path_ft_eval_df.head(5)

## Optional Steps

### Filter the data based on the token limit

In [None]:
import json
import openai
from transformers import GPT2Tokenizer

# Initialize the tokenizer
# NOTE(review): this loads the GPT-2 vocabulary, while the dataset above is
# generated with --completion_model gpt-4; the counts are presumably only an
# approximation of the target model's tokenization — confirm before relying
# on the token limit.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Function to count tokens
def count_tokens(text):
    """Return how many token ids the module-level ``tokenizer`` yields for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def transform_entry(entry):
    """Flatten a chat entry into a ``(prompt, completion)`` pair.

    Args:
        entry: Mapping with a ``'messages'`` list; each message is a mapping
            with a ``'role'`` and, for system/user/assistant roles, a
            ``'content'`` string.

    Returns:
        A tuple ``(prompt, completion)``. ``prompt`` joins, with newlines, the
        system contents and the user contents (each user content prefixed
        with ``"User: "``) in message order. ``completion`` is the content of
        the last assistant message prefixed with ``"Assistant: "``, or
        ``None`` when there is no assistant message. Messages with any other
        role are ignored.
    """
    lines = []
    reply = None

    for msg in entry['messages']:
        role = msg['role']
        if role == 'system':
            lines.append(msg['content'])
        elif role == 'user':
            lines.append("User: " + msg['content'])
        elif role == 'assistant':
            # A later assistant message replaces an earlier one.
            reply = "Assistant: " + msg['content']

    return "\n".join(lines), reply

def filter_data(entries, max_tokens=8192):
    """Keep only the entries whose prompt plus completion fit the token limit.

    Args:
        entries: Iterable of chat entries accepted by ``transform_entry``.
        max_tokens: Inclusive upper bound on the token count of an entry, as
            measured by ``count_tokens``.

    Returns:
        A new list with the entries that are within the limit, in their
        original order.
    """
    filtered_entries = []

    for entry in entries:
        prompt, completion = transform_entry(entry)

        # An entry without an assistant message has no completion; count the
        # prompt alone instead of the literal text "None".
        if completion is None:
            total_text = prompt
        else:
            total_text = f"{prompt} {completion}"  # Combine prompt and completion

        # count_tokens uses the module-level tokenizer, so no tokenizer is
        # loaded here on every call.
        if count_tokens(total_text) <= max_tokens:
            filtered_entries.append(entry)
    return filtered_entries

def load_chat_completions(file_path):
    """Load chat entries from a JSON Lines file.

    Args:
        file_path: Path of the file to read; one JSON document per line.

    Returns:
        A list with the decoded entries in file order. Blank lines are
        skipped silently; lines that are not valid JSON are skipped with a
        printed notice. If the file does not exist, a notice is printed and
        an empty list is returned.
    """
    entries = []
    try:
        # Read as UTF-8 explicitly rather than with the platform default
        # encoding, so non-ASCII content decodes the same everywhere.
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) carry no record.
                    continue
                try:
                    entries.append(json.loads(line))
                except json.JSONDecodeError:
                    print("Skipping a line that could not be decoded.")
    except FileNotFoundError:
        print("The specified file was not found.")
        return []
    return entries

# Function to write filtered entries to a JSONL file
def write_to_jsonl(filtered_entries, output_file_path):
    """Write each entry as one JSON document per line, overwriting the file.

    Args:
        filtered_entries: Iterable of JSON-serializable entries.
        output_file_path: Path of the file to create or overwrite.
    """
    with open(output_file_path, 'w') as sink:
        # One serialized record per line, each terminated by a newline.
        sink.writelines(json.dumps(record) + '\n' for record in filtered_entries)

# Apply the token filter to the training file, then to the validation file,
# writing each result to its "filtered" counterpart.
for source_path, target_path in (
    (dataset_path_ft_train, dataset_path_ft_train_filtered),
    (dataset_path_ft_valid, dataset_path_ft_valid_filtered),
):
    unfiltered_entries = load_chat_completions(source_path)
    filtered_entries = filter_data(unfiltered_entries)
    write_to_jsonl(filtered_entries, target_path)

In [None]:
# Load the filtered training file (one JSON object per line) and preview its first five rows.
dataset_path_ft_train_filtered_df = pd.read_json(dataset_path_ft_train_filtered, lines=True)
dataset_path_ft_train_filtered_df.head(5)

### OpenAI tool to validate the completions file

In [None]:
! openai tools fine_tunes.prepare_data -f ../../output_data_1000.completion.jsonl