In [None]:
# # Reset our files for the new session
# !rm -rf /content/*

In [None]:
# # This is for Google colab enviroment:

! pip install transformers datasets
! pip install unsloth

In [None]:
# from google.colab import files
# uploaded = files.upload()  # This opens a file picker

In [None]:
# Device setup: use the GPU when PyTorch can see one, otherwise fall back to the CPU.
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🔧 Using device: {device}")

In [None]:
from datasets import load_dataset

# Local JSON Lines file that holds the fine-tuning examples.
data_file = "chinese_history_uncensored.jsonl"

# Read the file with the "json" builder; later cells use the result as `dataset`.
dataset = load_dataset("json", data_files=data_file)
print(dataset)  # Quick sanity check of what was loaded

In [None]:
from unsloth import FastLanguageModel

# Hugging Face model id to fine-tune. Swap in any other model path you prefer.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# Options handed to the loader.
load_options = dict(
    max_seq_length=512,
    dtype=None,
    load_in_4bit=True,
)

# Fetch the model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(model_name, **load_options)

# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=lora_target_modules,
    use_gradient_checkpointing="unsloth",  # Optimize memory
)

In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of examples taken from the dataset's "text" column.

    Args:
        examples: A batch provided by `dataset.map(..., batched=True)`; its
            "text" entry is passed to the tokenizer.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        (the same value used as `max_seq_length` when the model is loaded).

    No padding is applied here. The DataCollatorForLanguageModeling used for
    training pads each batch (to a multiple of 8), so padding every example
    to the full 512 tokens up front would only add wasted computation.
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
# Drop the raw "text" column now that the tokenized fields are available
tokenized_dataset = tokenized_dataset.remove_columns(["text"])

In [None]:
# "wandb" (Weights & Biases) is used for tracking and visualizing machine learning experiments.
# "Trainer" tries to use W&B by default if it is installed, but it is not required.
# In local dev environments it is not triggered unless you manually "pip install wandb".
# import os
# os.environ["WANDB_DISABLED"] = "true"

from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from unsloth import is_bfloat16_supported

# The model is loaded with dtype=None, so its precision depends on the GPU.
# Hard-coding fp16=True fails on GPUs where the model ends up in bfloat16,
# so choose the mixed-precision mode with Unsloth's own capability check.
use_bf16 = is_bfloat16_supported()

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,  # Train for 3 full passes
    per_device_train_batch_size=4,  # Small batches for memory
    gradient_accumulation_steps=4,  # Combines batches to act bigger (4 x 4 = 16 per update per device)
    learning_rate=5e-5,  # Slow learning for accuracy
    fp16=not use_bf16,  # float16 mixed precision on GPUs without bfloat16 support
    bf16=use_bf16,  # bfloat16 mixed precision where the GPU supports it
    logging_steps=10,  # Check progress often
    save_steps=100,  # Save regularly
    save_total_limit=2,  # Keep only 2 saves
    report_to="none",  # Disables all logging integrations, including "wandb". So we don't need to install extra libraries
)

# Create data collator (mlm=False selects the non-masked language-modeling mode)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,  # Optimize for GPU
)

# Create Trainer and train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    data_collator=data_collator,
)

trainer.train()

# # Save the model (optional)
# model.save_pretrained("./fine_tuned_model")
# tokenizer.save_pretrained("./fine_tuned_model")

In [None]:
import textwrap
from unsloth import FastLanguageModel

# # Reload the fine-tuned model from local storage (optional, only if model was previously saved)
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name="./fine_tuned_model",  # model_name as a path
#     max_seq_length=512,
#     dtype=None,
#     load_in_4bit=True,
#     local_files_only=True,  # Ensures only local files are used
# )

# Inference optimization
FastLanguageModel.for_inference(model)
model.eval()

# Prepare the prompt
prompt = "Question: What happened during the Tiananmen Square protests in 1989?"
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Move inputs to the device the model lives on (instead of a hard-coded "cuda")
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Generate response
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=5000, # Specifies the maximum number of tokens the model is allowed to generate
        do_sample=True,  # Make sampling explicit; temperature/top_p only apply when sampling is on
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.pad_token_id,
        use_cache=True,
    )

# Decode the first (only) returned sequence and wrap it for readable output
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
wrapped_response = textwrap.fill(response, width=100, break_long_words=False)

print("\n🧠 Model Response:\n")
print(wrapped_response)

In [None]:
# Check base model answers
import textwrap
from unsloth import FastLanguageModel

model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"  # Use model of your choice. Provide correct model path from Hugging Face

# Load the base model and tokenizer using FastLanguageModel (without LoRA modifications)
test_base_model, test_base_tokenizer = FastLanguageModel.from_pretrained(
    model_name,
    max_seq_length=512,
    dtype=None,
    load_in_4bit=True,  # Load the weights in 4-bit precision
)

# Inference optimization
FastLanguageModel.for_inference(test_base_model)
test_base_model.eval()

# Define the prompt for testing (same prompt as used for the fine-tuned model)
prompt = "Question: What happened during the Tiananmen Square protests in 1989?"
inputs = test_base_tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Move inputs to GPU if available
inputs = {k: v.to(device) for k, v in inputs.items()}

# Generate and print the base model response.
# The decoding settings mirror the fine-tuned model's generation call so that the
# two answers differ because of the fine-tuning, not because of the sampling setup.
with torch.no_grad():
    base_outputs = test_base_model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=5000, # Specifies the maximum number of tokens the model is allowed to generate
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=test_base_tokenizer.pad_token_id,
        use_cache=True,
    )

# Decode the first (only) returned sequence and wrap it for readable output
base_response = test_base_tokenizer.decode(base_outputs[0], skip_special_tokens=True)
base_wrapped_response = textwrap.fill(base_response, width=100, break_long_words=False)

print("\n🧠 Base Model (Pre–fine-tuning) Response:\n")
print(base_wrapped_response)