## Library

In [1]:
import json
import random
from datasets import load_dataset, Dataset
from typing import List, Dict
import pandas as pd
from pathlib import Path
import gc, torch, time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from unsloth.chat_templates import get_chat_template

NotImplementedError: Unsloth: No NVIDIA GPU found? Unsloth currently only supports GPUs!

In [None]:
from unsloth.chat_templates import standardize_sharegpt

### Load Model

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 30000 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# fourbit_models = [
#     "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
#     "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
#     "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
#     "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 4bit for 405b!
#     "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
#     "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
#     "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
#     "unsloth/Phi-3-medium-4k-instruct",
#     "unsloth/gemma-2-9b-bnb-4bit",
#     "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!

#     "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
#     "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
#     "unsloth/Llama-3.2-3B-bnb-4bit",
#     "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
# ] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct", # or choose "unsloth/Llama-3.2-1B-Instruct"
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [None]:
# Attach LoRA adapters to the base model loaded above; `model` is rebound to
# the adapter-wrapped model returned by FastLanguageModel.get_peft_model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",
                      "embed_tokens", "lm_head",], # embed_tokens / lm_head: add for continual pretraining
    lora_alpha = 32, # NOTE(review): alpha / r = 32 / 128 = 0.25 -- confirm this scaling is intended
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

## Data Preparation

### Functions

In [None]:
def free_mem(rounds: int = 10, delay: float = 0.1):
    """
    Run repeated garbage-collection passes and release cached CUDA memory.

    Each pass calls ``gc.collect()``, empties the CUDA caching allocator when
    a CUDA device is available, and then sleeps for ``delay`` seconds.

    Args:
        rounds: Number of collect/empty-cache passes to run (default 10).
        delay: Seconds to sleep after each pass (default 0.1).

    Returns:
        None
    """
    for _ in range(rounds):
        gc.collect()
        # empty_cache() allocates no tensors, so no torch.no_grad() wrapper is
        # needed; the guard keeps the helper usable on CPU-only machines.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        time.sleep(delay)
      
def load_jsonl_dataset(file_path: str) -> Dataset:
    """
    Load a JSONL file into a Dataset object with a 'conversations' feature.

    Every non-blank line is parsed as one JSON document and stored under the
    'conversations' key of its own row. Blank lines (for example a trailing
    newline at the end of the file) are skipped instead of being handed to
    ``json.loads``, which would raise on an empty string.

    Args:
        file_path: Path to the JSONL file

    Returns:
        Dataset object with 'conversations' feature

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            payload = raw_line.strip()
            if not payload:
                continue
            records.append({'conversations': json.loads(payload)})

    return Dataset.from_list(records)

In [None]:
def xlsum_convert_to_sharegpt(example):
    """
    Build a two-message ShareGPT exchange from one XLSum record.

    The human turn is a fixed instruction preamble followed by the record's
    'title' and 'text'; the gpt turn is the record's 'summary', unchanged.

    Args:
        example: Mapping with 'title', 'text' and 'summary' keys.

    Returns:
        A list ``[human_message, gpt_message]`` of ``{"from", "value"}`` dicts.
    """
    preamble = (
        "You are an expert summarization assistant trained to generate concise, clear, and accurate summaries of "
        "given texts. Your goal is to understand the core ideas of the provided article and create a summary and related keywords.\n\n"
    )
    # Each task line keeps its trailing space; the lines are joined by "\n".
    task_lines = [
        "Provide a summary for the following article in its original language: ",
        f"Title: {example['title']} ",
        f"Content: {example['text']} ",
        "Summary:",
    ]
    prompt = preamble + "\n".join(task_lines)

    return [
        {"from": "human", "value": prompt},
        {"from": "gpt", "value": example['summary']},
    ]


def create_extended_conversation(examples: List[Dict], conversation_extension: int, converter=None) -> List[Dict]:
    """
    Create an extended conversation by combining multiple examples.

    Args:
        examples: List of dataset examples to sample from
        conversation_extension: Number of examples to combine into one conversation
        converter: Optional callable mapping one example to its list of
            ShareGPT messages. Defaults to ``xlsum_convert_to_sharegpt``.

    Returns:
        List of messages forming a single extended conversation
    """
    if converter is None:
        # The previous body called an undefined name here, so any
        # conversation_extension > 1 raised NameError.
        converter = xlsum_convert_to_sharegpt

    # Randomly sample the requested number of examples (capped at what is available)
    sample_size = min(conversation_extension, len(examples))
    selected_examples = random.sample(examples, sample_size)

    # Convert each example and append its messages in sampled order
    extended_conversation = []
    for example in selected_examples:
        extended_conversation.extend(converter(example))

    return extended_conversation

def process_dataset(dataset, conversation_extension: int = 1):
    """
    Process the dataset and convert to ShareGPT format with optional conversation extension.

    Every split returned by ``dataset.keys()`` is walked in consecutive,
    non-overlapping chunks of ``conversation_extension`` examples. Examples
    left over after the last full chunk of a split are dropped.

    Args:
        dataset: The input dataset; must expose ``keys()`` and be indexable by split name
        conversation_extension: Number of examples to combine into one conversation (>= 1)

    Returns:
        Dataset object with conversations column

    Raises:
        ValueError: If ``conversation_extension`` is smaller than 1.
    """
    if conversation_extension < 1:
        # 0 would divide by zero below; negative values would silently yield nothing.
        raise ValueError(
            f"conversation_extension must be >= 1, got {conversation_extension}"
        )

    processed_data = []

    # Process each split in the dataset
    for split in dataset.keys():
        print(f"Processing {split} split...")
        examples = list(dataset[split])

        # Only full chunks are converted
        usable = len(examples) - len(examples) % conversation_extension

        for start_idx in range(0, usable, conversation_extension):
            chunk = examples[start_idx:start_idx + conversation_extension]

            if conversation_extension > 1:
                conversation = create_extended_conversation(chunk, conversation_extension)
            else:
                conversation = xlsum_convert_to_sharegpt(chunk[0])

            processed_data.append({"conversations": conversation})

    # Convert to Dataset object
    return Dataset.from_list(processed_data)

def save_sharegpt_format(dataset, output_path):
    """
    Write each row's 'conversations' value as one JSON line in a UTF-8 file.

    Args:
        dataset: Iterable of rows that each provide a 'conversations' entry.
        output_path: Destination file; it is created or overwritten.
    """
    with open(output_path, 'w', encoding='utf-8') as sink:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        sink.writelines(
            json.dumps(row['conversations'], ensure_ascii=False) + '\n'
            for row in dataset
        )


### Download Data

#### Download Data - wikipedia-10k-id

In [None]:
# Define the dataset and subset
dataset_name = "indonesian-nlp/wikipedia-10k"
output_dir = Path("dataset/wikipedia-10k")

# Create the output directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)

# Load the dataset
dataset = load_dataset(dataset_name, "wikipedia-id")

In [None]:
print(dataset.column_names)
print(dataset['test'])

In [None]:
columns_to_keep = ['id', 'url', 'title', 'text']
standalone_dataset = dataset['test']
standalone_dataset = standalone_dataset.select_columns(columns_to_keep)

In [None]:
print(standalone_dataset.column_names)

In [None]:
from unsloth import to_sharegpt
dataset = to_sharegpt(
    standalone_dataset,
    merged_prompt = "Anda adalah sebuah asisten wikipedia. Jelaskan mengenai topik berikut: {title}",
    output_column_name = "text",
)

In [None]:
print(dataset[0])

In [None]:
output_path = f"{output_dir}/sharegpt_wikipedia-10k.jsonl"
save_sharegpt_format(dataset, output_path)

#### Download Data - alpaca-cleaned

In [None]:
# Define the dataset and subset
dataset_name = "yahma/alpaca-cleaned"
output_dir = Path("dataset/alpaca-cleaned")

# Create the output directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)

# Load the dataset
dataset = load_dataset(dataset_name, split = "train")

In [None]:
print(dataset.column_names)
print(dataset[1])

In [None]:
print(dataset[0])

In [None]:
from unsloth import to_sharegpt
dataset = to_sharegpt(
    dataset,
    merged_prompt = "{instruction}[[\nYour input is:\n{input}]]",
    output_column_name = "output",
    conversation_extension = 3, # Select more to handle longer conversations
)

In [None]:
print(dataset[0])

In [None]:
output_path = f"{output_dir}/sharegpt_alpaca-cleaned.jsonl"
save_sharegpt_format(dataset, output_path)

#### Download Data - alpaca-gpt4-indonesian

In [None]:
# Define the dataset and subset
dataset_name = "FreedomIntelligence/alpaca-gpt4-indonesian"
output_dir = Path("dataset/alpaca-gpt4-indonesian")

# Create the output directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)

# Load the dataset
dataset = load_dataset(dataset_name, split = "train")

In [None]:
print(dataset.column_names)
print(dataset[0])

In [None]:
output_path = f"{output_dir}/sharegpt_alpaca-gpt4-indonesian.jsonl"
save_sharegpt_format(dataset, output_path)

#### Download Data - finetome100k

In [None]:
# Define the dataset and subset
dataset_name = "mlabonne/FineTome-100k"
output_dir = Path("dataset/FineTome-100k")

# Create the output directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)

# Load the dataset
dataset = load_dataset(dataset_name, split = "train")

In [None]:
print(dataset.column_names)
print(dataset[0])

In [None]:
print(dataset.column_names)

In [None]:
output_path = f"{output_dir}/sharegpt_FineTome-100k.jsonl"
save_sharegpt_format(dataset, output_path)

#### Download Data - xlsum-english

In [None]:
# Define the dataset and subset
dataset_name = "csebuetnlp/xlsum"
language_subset = "english"
output_dir = Path("dataset/xlsum-english")

# Create the output directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)

# Load the dataset
dataset = load_dataset(dataset_name, language_subset, split = "test")

In [None]:
dataset
dataset.column_names

In [None]:
dataset 

In [None]:
def xlsum_en_convert_to_sharegpt(dataset):
    """
    Turn XLSum rows into ShareGPT-style conversations.

    Parameters:
    dataset: Iterable of rows with 'title', 'text' and 'summary' entries.

    Returns:
    list: One conversation per row, each a two-element list holding a
    'human' message and a 'gpt' message. This is a plain Python list, not a
    Dataset.
    """
    def _to_conversation(row):
        # Prompt text for the human turn
        prompt = (
            "Provide a summary for the following article in its original language: \n"
            f"Title: {row['title']} \n"
            f"Content: {row['text']} \n"
            "Summary:"
        )
        # The summary is formatted into a string for the gpt turn
        answer = f"{row['summary']}"
        return [
            {"from": "human", "value": prompt},
            {"from": "gpt", "value": answer},
        ]

    return [_to_conversation(row) for row in dataset]

In [None]:
dataset = xlsum_en_convert_to_sharegpt(dataset)

In [None]:
dataset = Dataset.from_dict({"conversations": dataset})

In [None]:
dataset

In [None]:
output_path = f"{output_dir}/sharegpt_xlsum_english.jsonl"
save_sharegpt_format(dataset, output_path)

#### Download Data - xlsum-indonesian

##### Download 

In [None]:
# Define the dataset and subset
dataset_name = "csebuetnlp/xlsum"
language_subset = "indonesian"
output_dir = Path("dataset/xlsum-indonesian")

# Create the output directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)

# Load every split at once (no `split=` argument) so that `dataset` can be
# indexed by split name. The cells below read dataset['train'] and loop over
# the "train", "test" and "validation" splits, which fails when only the
# train split is loaded as a single Dataset.
dataset = load_dataset(dataset_name, language_subset)

In [None]:
print(dataset.column_names)

In [None]:
print(dataset['train'][0])

Save to CSV

In [None]:
for split in ["train", "test", "validation"]:
    df = pd.DataFrame(dataset[split])  # Convert to Pandas DataFrame
    output_file = output_dir / f"xlsum_indonesian_{split}.csv"
    df.to_csv(output_file, index=False, encoding="utf-8")
    print(f"Saved {split} split to {output_file}")

##### Load Dataset

In [None]:
train_path = "dataset/xlsum-indonesian/xlsum_indonesian_train.csv"
test_path = "dataset/xlsum-indonesian/xlsum_indonesian_test.csv"
validation_path = "dataset/xlsum-indonesian/xlsum_indonesian_validation.csv"

dataset = load_dataset("csv", data_files={
    "train": train_path,
    "test": test_path,
    "validation": validation_path
})

In [None]:
dataset['train'][0]

##### Convert to sharegpt

###### Unsloth 

In [None]:
# Only fragments that contain a {column} placeholder are wrapped in [[...]];
# the fixed instruction text stays outside the optional markers.
# NOTE(review): the earlier version wrapped every fragment in [[...]], including
# ones with no {column}; that appears to be what triggered the index error
# inside to_sharegpt -- TODO confirm against the installed Unsloth version.

from unsloth import to_sharegpt
train_dataset = to_sharegpt(
    dataset['validation'],
    merged_prompt = (
        "Provide a summary for the following article in its original language:"
        "[[\nTitle: {title}]]"
        "[[\nContent: {text}]]"
        "\nSummarize below:"
    ),
    conversation_extension=2,  # Randomly combines conversations
    output_column_name="summary",  # Use the "summary" column as the target
)


###### Manual

In [None]:
# Set conversation extension (e.g., 3 for combining 3 examples into one conversation)
conversation_extension = 1

# Convert to ShareGPT format with conversation extension
dataset = process_dataset(dataset, conversation_extension)

In [None]:
# Print a sample conversation
print("\nSample conversation:")
print(json.dumps(dataset[0]['conversations'], ensure_ascii=False, indent=2))

In [None]:
# Print dataset info
print("\nDataset info:")
print(f"Number of conversations: {len(dataset)}")
print(f"Features: {dataset.features}")
print(f"Datatype: {type(dataset)}")

In [None]:
# Save the converted data
output_path = "dataset/xlsum-indonesian/sharegpt_xlsum_indonesian.jsonl"
save_sharegpt_format(dataset, output_path)
print(f"Converted data saved to {output_path}")

#### Download Data - scientific_lay_summarisation-plos-norm

In [None]:
# Define the dataset and subset
dataset_name = "pszemraj/scientific_lay_summarisation-plos-norm"
output_dir = Path("dataset/scientific_lay_summarisation-plos-norm")

# Create the output directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)

# Load the dataset
dataset = load_dataset(dataset_name, split = "train")

In [None]:
print(dataset)
print(dataset[0])

In [None]:
def scientific_convert_to_sharegpt(dataset):
    """
    Turn scientific lay-summary rows into ShareGPT-style conversations.

    Parameters:
    dataset: Iterable of rows with 'title', 'article', 'summary' and
    'keywords' entries.

    Returns:
    list: One conversation per row, each a two-element list holding a
    'human' message (instruction, title, article) and a 'gpt' message
    (summary followed by a keywords line). This is a plain Python list, not a
    Dataset.
    """
    instruction = (
        "You are an expert summarization assistant trained to generate concise, clear, and accurate summaries of "
        "given texts. Your goal is to understand the core ideas of the provided article and create a summary and related keywords.\n\n"
    )

    def _to_conversation(row):
        # Human turn: instruction, then title and article separated by blank lines
        prompt = instruction + "\n\n".join([
            f"Title: {row['title']}",
            f"Content: {row['article']}",
            "Summary:\n",
        ])
        # Gpt turn: summary, blank line, keywords
        answer = f"{row['summary']}\n\nkeywords: {row['keywords']}"
        return [
            {"from": "human", "value": prompt},
            {"from": "gpt", "value": answer},
        ]

    return [_to_conversation(row) for row in dataset]

In [None]:
sharegpt_dataset = scientific_convert_to_sharegpt(dataset)
print(sharegpt_dataset[0])

In [None]:
dataset = Dataset.from_dict({"conversations": sharegpt_dataset})

# Check the first example
print(dataset[0])

In [None]:
dataset

In [None]:
output_path = f"{output_dir}/sharegpt_scientific_lay_summarisation-plos-norm.jsonl"
save_sharegpt_format(dataset, output_path)

#### Download Data - govreport-summarization-8192

In [None]:
# Define the dataset and subset
dataset_name = "pszemraj/govreport-summarization-8192"
output_dir = Path("dataset/govreport-summarization-8192")

# Create the output directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)

# Load the dataset
dataset = load_dataset(dataset_name, split = "train")

In [None]:
print(dataset.column_names)
print(dataset[0])

In [None]:
from unsloth import to_sharegpt
dataset = to_sharegpt(
    dataset,
    merged_prompt = "You are an expert summarization assistant trained to generate concise, clear, and accurate summaries of given texts. Your goal is to understand the core ideas of the provided article and create a summary. [[\nHere is the article:\n{report}]]",
    output_column_name = "summary",
    # conversation_extension = 3, # Select more to handle longer conversations
)

In [None]:
print(dataset.column_names)
print(dataset[0])

In [None]:
output_path = f"{output_dir}/sharegpt_govreport-summarization-8192.jsonl"
save_sharegpt_format(dataset, output_path)

## Dataset Preparation

### Combine Dataset

In [None]:
# Selects up to `count` evenly strided rows from a dataset.
# The stride is rounded down to an integer, so the picked rows may not span
# the whole dataset evenly.
def subset(dataset : Dataset, count : int) -> Dataset:
  """
  Return a strided sample of ``dataset`` holding at most ``count`` rows.

  Args:
    dataset: Source dataset; must support ``len()`` and strided slicing.
    count: Maximum number of rows to keep (>= 1).

  Returns:
    A new Dataset built from the sampled rows, with every column trimmed to
    the same length.

  Raises:
    ValueError: If ``count`` is smaller than 1.
  """
  if count < 1:
    raise ValueError(f"count must be >= 1, got {count}")
  # max(1, ...) keeps the slice step valid when count exceeds the dataset size
  stride = max(1, len(dataset) // count)
  sampled = dataset[::stride]
  # Trim every column, not just 'conversations', so column lengths stay equal
  trimmed = {column: values[:count] for column, values in sampled.items()}
  return Dataset.from_dict(trimmed)

# Picks exactly `count` evenly spaced conversations (or all of them when the
# dataset is smaller than `count`).
def subset_slow_exact(dataset : Dataset, count : int) -> Dataset:
  """
  Return exactly ``min(count, len(dataset))`` evenly spaced conversations.

  Args:
    dataset: Source dataset with a 'conversations' column.
    count: Number of rows wanted (>= 1).

  Returns:
    A new Dataset containing only the 'conversations' column of the selected rows.

  Raises:
    ValueError: If ``count`` is smaller than 1.
  """
  if count < 1:
    raise ValueError(f"count must be >= 1, got {count}")
  total = len(dataset)
  count = min(count, total)
  # Read the column once instead of on every iteration
  conversations = dataset['conversations']
  # Integer arithmetic avoids float drift; the index is < total because k < count
  picked = [conversations[(k * total) // count] for k in range(count)]
  return Dataset.from_dict({'conversations': picked})

In [None]:
data_path = Path("dataset")

In [None]:
# Path of the combined fine-tuning JSONL under data_path. That file is only
# written near the end of this notebook (save_sharegpt_format on
# finetune_dataset), so it exists only after an earlier complete run.
finetune_dataset = data_path / 'sharegpt_finetune.jsonl'

In [None]:
# Rebinds finetune_dataset from a Path to the loaded Dataset; re-running this
# cell without re-running the previous one passes a Dataset to str().
# NOTE(review): finetune_dataset is reassigned again by the concatenate cell below.
finetune_dataset = load_jsonl_dataset(str(finetune_dataset))

In [None]:
# alpaca_cleaned = data_path / 'alpaca-cleaned/sharegpt_alpaca-cleaned.jsonl'
# alpaca_id = data_path / 'alpaca-gpt4-indonesian/sharegpt_alpaca-gpt4-indonesian.jsonl'
# finetome_100k = data_path / 'FineTome-100k/sharegpt_FineTome-100k.jsonl'
# wikipedia_10k = data_path / 'wikipedia-10k/sharegpt_wikipedia-10k.jsonl'
xlsum_en = data_path / 'xlsum-english/sharegpt_xlsum_english.jsonl'
xlsum_id = data_path / 'xlsum-indonesian/sharegpt_xlsum_indonesian.jsonl'
govreport = data_path / 'govreport-summarization-8192/sharegpt_govreport-summarization-8192.jsonl'
# Point at the file the scientific_lay_summarisation cell actually writes;
# no cell in this notebook produces a 'sharegpt_scientific_3k.jsonl' file.
scientific_sum = data_path / 'scientific_lay_summarisation-plos-norm/sharegpt_scientific_lay_summarisation-plos-norm.jsonl'

# Load the dataset for each path
# alpaca_cleaned = load_jsonl_dataset(str(alpaca_cleaned))
# alpaca_id = load_jsonl_dataset(str(alpaca_id))
# finetome_100k = load_jsonl_dataset(str(finetome_100k))
# wikipedia_10k = load_jsonl_dataset(str(wikipedia_10k))
xlsum_en = load_jsonl_dataset(str(xlsum_en))
xlsum_id = load_jsonl_dataset(str(xlsum_id))
govreport = load_jsonl_dataset(str(govreport))
scientific_sum = load_jsonl_dataset(str(scientific_sum))
# Down-sample to 3000 conversations inside the notebook so the run is reproducible.
# NOTE(review): the 3000-row target and the seed are taken from the old "3k"
# file name and the commented-out shuffle/subset lines further down -- confirm.
scientific_sum = subset_slow_exact(scientific_sum.shuffle(seed=0), 3000)

# Print the number of rows in each dataset as a quick verification
# print(f"Alpaca Cleaned: {len(alpaca_cleaned)} examples")
# print(f"Alpaca Indonesian: {len(alpaca_id)} examples")
# print(f"FineTome 100k: {len(finetome_100k)} examples")
# print(f"Wikipedia 10k: {len(wikipedia_10k)} examples")
print(f"XLSum English: {len(xlsum_en)} examples")
print(f"XLSum Indonesian: {len(xlsum_id)} examples")
print(f"GovReport: {len(govreport)} examples")
print(f"Scientific Summary: {len(scientific_sum)} examples")

In [None]:
from datasets import concatenate_datasets

# Combine datasets together
# pretrain_dataset = concatenate_datasets([alpaca_cleaned, alpaca_id, finetome_100k, wikipedia_10k])
finetune_dataset = concatenate_datasets([xlsum_id, xlsum_en, govreport, scientific_sum])
# finetune_dataset = concatenate_datasets([xlsum_id, govreport, scientific_sum])

# Shuffle dataset (optional)
# pretrain_dataset = pretrain_dataset.shuffle(seed=0)
# finetune_dataset = finetune_dataset.shuffle(seed=0)
# scientific_sum = scientific_sum.shuffle(seed=0)
# scientific_sum = subset_slow_exact(scientific_sum, 3000)

In [None]:
# pretrain_dataset

In [None]:
finetune_dataset

In [None]:
def count_tokens_in_conversation(conversation, tokenizer):
    """
    Return the token count of one conversation rendered as plain text.

    Each message becomes a ``"<from>: <value>"`` line; the lines are
    concatenated and tokenized in a single call without truncation.

    Args:
        conversation: Iterable of messages with 'from' and 'value' keys.
        tokenizer: Callable accepting the text plus ``return_tensors`` and
            ``truncation`` keywords, returning an object with ``input_ids``.

    Returns:
        Length of the first sequence in the tokenizer's ``input_ids``.
    """
    transcript = "".join(
        f"{turn['from']}: {turn['value']}\n" for turn in conversation
    )
    encoded = tokenizer(transcript, return_tensors="pt", truncation=False)
    first_sequence = encoded.input_ids[0]
    return len(first_sequence)

def analyze_dataset_tokens(dataset, tokenizer, token_limit=None):
    """
    Analyze token counts across the entire dataset.

    Args:
        dataset: Sized iterable of rows that each provide 'conversations'.
        tokenizer: Tokenizer forwarded to ``count_tokens_in_conversation``.
        token_limit: Threshold for counting over-long conversations. Defaults
            to the notebook-level ``max_seq_length`` when omitted.

    Returns:
        Dict with total / average / max / min token counts, the number of
        conversations, and the count and percentage of conversations above
        ``token_limit``. Every statistic is 0 for an empty dataset.
    """
    if token_limit is None:
        token_limit = max_seq_length

    total_rows = len(dataset)
    token_counts = []

    # Process each conversation
    for idx, item in enumerate(dataset):
        if idx % 1000 == 0:  # Progress indicator
            print(f"Processing conversation {idx}/{total_rows}")
        token_counts.append(count_tokens_in_conversation(item['conversations'], tokenizer))

    n_conversations = len(token_counts)
    total_tokens = sum(token_counts)
    over_limit = sum(1 for count in token_counts if count > token_limit)

    # Guard every ratio and extreme so an empty dataset yields zeros instead
    # of raising ZeroDivisionError / ValueError.
    return {
        "total_tokens": total_tokens,
        "average_tokens": total_tokens / n_conversations if n_conversations else 0.0,
        "max_tokens": max(token_counts, default=0),
        "min_tokens": min(token_counts, default=0),
        "total_conversations": n_conversations,
        "conversations_over_limit": over_limit,
        "percent_over_limit": (over_limit / n_conversations) * 100 if n_conversations else 0.0,
    }

In [None]:
# # Use the functions
# stats = analyze_dataset_tokens(pretrain_dataset, tokenizer)

In [None]:
# # Print results
# print("\nPretrain Dataset Token Statistics:")
# print(f"Total Tokens: {stats['total_tokens']:,}")
# print(f"Average Tokens per Conversation: {stats['average_tokens']:.2f}")
# print(f"Max Tokens in a Conversation: {stats['max_tokens']:,}")
# print(f"Min Tokens in a Conversation: {stats['min_tokens']:,}")
# print(f"Total Conversations: {stats['total_conversations']:,}")
# print(f"Conversations Over {max_seq_length:,} tokens: {stats['conversations_over_limit']:,} ({stats['percent_over_limit']:.2f}%)")

In [None]:
# Analyze the combined fine-tuning set, which is what the next cell reports
# under "Finetune Dataset Token Statistics" (previously only scientific_sum was measured).
stats = analyze_dataset_tokens(finetune_dataset, tokenizer)

In [None]:
# Print results
print("\nFinetune Dataset Token Statistics:")
print(f"Total Tokens: {stats['total_tokens']:,}")
print(f"Average Tokens per Conversation: {stats['average_tokens']:.2f}")
print(f"Max Tokens in a Conversation: {stats['max_tokens']:,}")
print(f"Min Tokens in a Conversation: {stats['min_tokens']:,}")
print(f"Total Conversations: {stats['total_conversations']:,}")
print(f"Conversations Over {max_seq_length:,} tokens: {stats['conversations_over_limit']:,} ({stats['percent_over_limit']:.2f}%)")

### Standardize

In [None]:
from unsloth import standardize_sharegpt
# pretrain_dataset = standardize_sharegpt(pretrain_dataset)
finetune_dataset = standardize_sharegpt(finetune_dataset)

In [None]:
# Free the per-source datasets now that they are merged into finetune_dataset.
# pop(..., None) keeps the cell re-runnable (a second `del` would raise
# NameError), and xlsum_en is released together with the others.
for _name in ("xlsum_id", "xlsum_en", "govreport", "scientific_sum"):
    globals().pop(_name, None)

# Optionally, force garbage collection
import gc
gc.collect()


In [None]:
# print("\nSample conversation (pretrain):")
# print(json.dumps(pretrain_dataset[1]['conversations'], ensure_ascii=False, indent=2))

In [None]:
print("\nSample conversation (finetune):")
print(json.dumps(finetune_dataset[1]['conversations'], ensure_ascii=False, indent=2))

## Training

### Chat Template

In [None]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3.1",
)

# def formatting_prompts_func(examples):
#     convos = examples["conversations"]
#     texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
#     return { "text" : texts, }
# pass

def formatting_prompts_func(examples):
    """
    Render each conversation in a batch to chat-template text.

    Messages that are not dicts, have a falsy 'role', or have a ``None``
    'content' are dropped before rendering. A conversation that is empty, is
    not a list, has no usable messages, or raises while being processed
    yields an empty string.

    Args:
        examples: Batch mapping with a 'conversations' list.

    Returns:
        ``{"text": [...]}`` with one string per input conversation.
    """
    rendered = []

    for convo in examples["conversations"]:
        prompt = ""
        try:
            if convo and isinstance(convo, list):
                messages = []
                for msg in convo:
                    if not isinstance(msg, dict):
                        continue
                    if not msg.get("role") or msg.get("content") is None:
                        continue
                    messages.append({
                        "role": msg["role"],
                        "content": msg["content"] or "",  # falsy content becomes ""
                    })

                if messages:
                    prompt = tokenizer.apply_chat_template(
                        messages,
                        tokenize=False,
                        add_generation_prompt=False
                    )
        except Exception as e:
            print(f"Error processing conversation: {e}")
            prompt = ""
        rendered.append(prompt)

    return {"text": rendered}

def clean_dataset(dataset):
    """
    Keep only rows whose 'conversations' value is a well-formed message list.

    A row is kept when its conversation is a non-empty list and every message
    is a dict whose 'role' and 'content' are both strings. The sizes before
    and after filtering are printed.

    Args:
        dataset: Object exposing ``filter(function, num_proc=...)`` and ``len()``.

    Returns:
        The filtered dataset returned by ``dataset.filter``.
    """
    def _message_ok(msg):
        return (
            isinstance(msg, dict)
            and isinstance(msg.get('role'), str)
            and isinstance(msg.get('content'), str)
        )

    def _row_ok(row):
        conv = row['conversations']
        if not conv or not isinstance(conv, list):
            return False
        for msg in conv:
            if not _message_ok(msg):
                return False
        return True

    # num_proc=4: adjust based on your CPU cores
    kept = dataset.filter(_row_ok, num_proc=4)

    print(f"Original dataset size: {len(dataset)}")
    print(f"Cleaned dataset size: {len(kept)}")

    return kept

In [None]:
# pretrain_dataset = pretrain_dataset.map(formatting_prompts_func, batched = True,)

In [None]:
finetune_dataset = finetune_dataset.map(formatting_prompts_func, batched = True,)

In [None]:
# pretrain_dataset[5]["conversations"]

In [None]:
# pretrain_dataset[5]["text"]

In [None]:
finetune_dataset[37269]["conversations"]

In [None]:
finetune_dataset[37269]["text"]

In [None]:
# Clean the dataset before training
finetune_dataset = clean_dataset(finetune_dataset)

In [None]:
output_path = f"dataset/sharegpt_finetune.jsonl"
save_sharegpt_format(finetune_dataset, output_path)

### Continued Pretraining

In [None]:
# Build the TRL SFTTrainer that trains `model` on the pre-rendered "text"
# column of finetune_dataset.
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
# NOTE(review): UnslothTrainer / UnslothTrainingArguments are imported but not
# used in this cell (only the commented-out alternative below uses them).
from unsloth import UnslothTrainer, UnslothTrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = finetune_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 8, # 2 x 8 = 16 sequences per optimizer step on each device
        # warmup_steps = 10,
        warmup_ratio = 0.01,
        num_train_epochs = 2, # Two passes over the training set.
        # max_steps =None,
        # learning_rate = 2e-4,
        learning_rate = 2e-4,
        # embedding_learning_rate = 1e-5,
        fp16 = not is_bfloat16_supported(), # exactly one of fp16 / bf16 is enabled
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs", # checkpoints are written here every save_steps steps
        save_strategy = "steps",
        save_steps = 100,
        report_to = "none", # Use this for WandB etc
    ),
)

In [None]:
# from trl import SFTTrainer
# from transformers import TrainingArguments, DataCollatorForSeq2Seq
# from unsloth import is_bfloat16_supported
# from unsloth import UnslothTrainer, UnslothTrainingArguments

# trainer = UnslothTrainer(
#     model = model,
#     tokenizer = tokenizer,
#     # train_dataset = pretrain_dataset,
#     train_dataset = finetune_dataset,
#     dataset_text_field = "text",
#     max_seq_length = max_seq_length,
#     data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
#     dataset_num_proc = 2,
#     packing = False, # Can make training 5x faster for short sequences.
#     args = UnslothTrainingArguments(
#         per_device_train_batch_size = 2,
#         gradient_accumulation_steps = 8,
#         # warmup_steps = 10,
#         warmup_ratio = 0.05,
#         num_train_epochs = 2, # Set this for 1 full training run.
#         # max_steps =None,
#         # learning_rate = 2e-4,
#         learning_rate = 5e-5,
#         embedding_learning_rate = 1e-5,
#         fp16 = not is_bfloat16_supported(),
#         bf16 = is_bfloat16_supported(),
#         logging_steps = 1,
#         optim = "adamw_8bit",
#         weight_decay = 0.01,
#         lr_scheduler_type = "linear",
#         seed = 3407,
#         output_dir = "outputs",
#         save_strategy = "steps",
#         save_steps = 100,
#         report_to = "none", # Use this for WandB etc
#     ),
# )

In [None]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

In [None]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

In [None]:
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

#### Show memory stats

In [None]:
#@title Show current memory stats
free_mem()
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
 model.config.use_cache = False

#### Train

In [None]:
# Resume from the newest checkpoint in the trainer's output directory when one
# exists; otherwise train from scratch. Passing resume_from_checkpoint=True
# unconditionally fails on a fresh run because there is no checkpoint to load.
from transformers.trainer_utils import get_last_checkpoint

checkpoint_dir = Path(trainer.args.output_dir)
# get_last_checkpoint lists the folder, so only call it when the folder exists
last_checkpoint = get_last_checkpoint(str(checkpoint_dir)) if checkpoint_dir.is_dir() else None
print(f"Resuming from: {last_checkpoint}" if last_checkpoint else "No checkpoint found; training from scratch.")

trainer_stats = trainer.train(resume_from_checkpoint = last_checkpoint)


#### Final Memory and Time Stats

In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")