In [80]:
import requests
from bs4 import BeautifulSoup

url = "https://www.figma.com/resource-library/graphic-design-principles/"
response = requests.get(url)
print(f"Status code: {response.status_code}")
soup = BeautifulSoup(response.text, "html")

import re
def remove_html_tags(text):
  pattern = re.compile('<.*?>')
  return pattern.sub(r'', text)
cleaned_text = remove_html_tags(soup.get_text())
with open("output.txt", "w", encoding="utf-8") as file:
  file.write(cleaned_text)

Status code: 200


In [84]:
import json

def convert_qa_to_llama_format(input_file, output_file):
    # Specify UTF-8 encoding explicitly
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    converted_data = []
    for item in data:
        formatted_text = f"<s>[INST] {item['human']} [/INST] {item['assistant']}</s>"
        converted_data.append({"text": formatted_text})
    
    # Also specify UTF-8 for output
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(converted_data, f, indent=2, ensure_ascii=False)

# Usage
convert_qa_to_llama_format('content.json', 'llama.json')

In [88]:
from datasets import load_dataset

# Load the dataset
import os
os.environ["HF_TOKEN"] = "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"

dataset = load_dataset("sahil2801/CodeAlpaca-20k")

# Save as JSON
dataset['train'].to_json('CodeAlpaca-20k.json')

# Or access the data directly
train_data = dataset['train']
print(f"Number of examples: {len(train_data)}")
print(train_data[0])  # First example

README.md:   0%|          | 0.00/147 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


code_alpaca_20k.json: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/20022 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

Number of examples: 20022
{'output': 'arr = [2, 4, 6, 8, 10]', 'instruction': 'Create an array of length 5 which contains all even numbers between 1 and 10.', 'input': ''}


In [90]:
import json

def convert_codealpaca_to_llama(input_file, output_file):
    converted_data = []
    
    # Read JSONL file (one JSON object per line)
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():  # Skip empty lines
                item = json.loads(line)
                
                instruction = item['instruction'].strip()
                input_text = item.get('input', '').strip()
                output_text = item['output'].strip()
                
                # Combine instruction and input if input exists
                if input_text and input_text not in ['', '< noinput >', '<noinput>']:
                    user_prompt = f"{instruction}\n\n{input_text}"
                else:
                    user_prompt = instruction
                
                # Format for Llama 7B Chat
                formatted_text = f"<s>[INST] {user_prompt} [/INST] {output_text}</s>"
                converted_data.append({"text": formatted_text})
    
    # Save as proper JSON array
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(converted_data, f, indent=2, ensure_ascii=False)
    
    print(f"Converted {len(converted_data)} examples")

# Usage
convert_codealpaca_to_llama('CodeAlpaca-20k.json', 'llama_codealpaca.json')

Converted 20022 examples


In [92]:
import json
from datasets import load_dataset

def convert_oasst2_to_llama_fast(output_file="llama_oasst2.json", max_examples=10000):
   print("Loading dataset...")
   dataset = load_dataset("OpenAssistant/oasst2")
   
   converted_data = []
   count = 0
   
   # Create a simple lookup for faster access
   print("Building message lookup...")
   msg_lookup = {item['message_id']: item for item in dataset['train']}
   
   print("Converting conversations...")
   for item in dataset['train']:
       if count >= max_examples:
           break
           
       # Only process assistant messages that are direct replies to prompters
       if (item['role'] == 'assistant' and 
           item['parent_id'] and 
           item.get('review_result', False) == True):  # Only quality responses
           
           parent = msg_lookup.get(item['parent_id'])
           
           if (parent and 
               parent['role'] == 'prompter' and 
               item['lang'] == 'en'):  # English only
               
               user_text = parent['text'].strip()
               assistant_text = item['text'].strip()
               
               # Skip if either text is too short/long
               if 10 < len(user_text) < 2000 and 10 < len(assistant_text) < 4000:
                   conversation = f"<s>[INST] {user_text} [/INST] {assistant_text}</s>"
                   converted_data.append({"text": conversation})
                   count += 1
                   
                   if count % 1000 == 0:
                       print(f"Processed {count} conversations...")
   
   print(f"Saving {len(converted_data)} conversations...")
   with open(output_file, 'w', encoding='utf-8') as f:
       json.dump(converted_data, f, indent=2, ensure_ascii=False)
   
   print(f"Done! Saved {len(converted_data)} conversations to {output_file}")
   return len(converted_data)

# Usage - should complete in under 2 minutes
count = convert_oasst2_to_llama_fast()

Loading dataset...
Building message lookup...
Converting conversations...
Processed 1000 conversations...
Processed 2000 conversations...
Processed 3000 conversations...
Processed 4000 conversations...
Processed 5000 conversations...
Processed 6000 conversations...
Processed 7000 conversations...
Processed 8000 conversations...
Processed 9000 conversations...
Processed 10000 conversations...
Saving 10000 conversations...
Done! Saved 10000 conversations to llama_oasst2.json


In [95]:
import json
from datasets import load_dataset

def download_langchain_alternatives():
    """Try multiple LangChain datasets"""
    
    datasets_to_try = [
        "LangChainDatasets/langchain-howto-queries",
        "antonioibars/langchain-docs", 
        "clue2solve/langchain-additional-resources"
    ]
    
    all_data = []
    
    for dataset_name in datasets_to_try:
        try:
            print(f"Trying {dataset_name}...")
            dataset = load_dataset(dataset_name)
            
            # Check dataset structure
            if dataset['train']:
                sample = dataset['train'][0]
                print(f"Sample keys: {list(sample.keys())}")
                
                for item in dataset['train']:
                    # Adapt based on actual structure
                    text_content = ""
                    
                    if 'text' in item:
                        text_content = item['text']
                    elif 'content' in item:
                        text_content = item['content']
                    elif 'question' in item and 'answer' in item:
                        text_content = f"Question: {item['question']}\nAnswer: {item['answer']}"
                    elif 'instruction' in item and 'output' in item:
                        text_content = f"<s>[INST] {item['instruction']} [/INST] {item['output']}</s>"
                    
                    if text_content:
                        # Convert to Llama format if not already
                        if '[INST]' not in text_content:
                            # Create instruction from content
                            instruction = f"Explain this LangChain concept"
                            text_content = f"<s>[INST] {instruction} [/INST] {text_content}</s>"
                        
                        all_data.append({
                            "text": text_content                    
                        })
                
                print(f"Found {len([x for x in all_data if x['source'] == dataset_name])} examples from {dataset_name}")
                
        except Exception as e:
            print(f"Failed to load {dataset_name}: {e}")
            continue
    
    # If still no data, create from LangChain documentation scraping
    if not all_data:
        print("No datasets found, creating synthetic LangChain examples...")
        all_data = create_synthetic_langchain_data()
    
    # Save results
    with open("langchain_combined.json", 'w', encoding='utf-8') as f:
        json.dump(all_data, f, indent=2, ensure_ascii=False)
    
    print(f"Total saved: {len(all_data)} LangChain examples")
    return len(all_data)

def create_synthetic_langchain_data():
    """Create basic LangChain Q&A pairs"""
    
    langchain_qa = [
        {
            "question": "How do I create a simple LLM chain in LangChain?",
            "answer": "You can create an LLM chain using: from langchain.llms import OpenAI; from langchain.chains import LLMChain; from langchain.prompts import PromptTemplate; llm = OpenAI(); prompt = PromptTemplate(template='Question: {question}', input_variables=['question']); chain = LLMChain(llm=llm, prompt=prompt)"
        },
        {
            "question": "What is a PromptTemplate in LangChain?",
            "answer": "A PromptTemplate is a class that helps you create prompts with variables. It allows you to define a template with placeholders like {variable_name} and then fill them dynamically with actual values."
        },
        {
            "question": "How do I use vector stores in LangChain?",
            "answer": "Vector stores in LangChain allow you to store and search embeddings. You can use them with: from langchain.vectorstores import FAISS; from langchain.embeddings import OpenAIEmbeddings; embeddings = OpenAIEmbeddings(); vectorstore = FAISS.from_texts(texts, embeddings)"
        },
        {
            "question": "What are LangChain agents?",
            "answer": "LangChain agents are entities that can use tools to interact with the world. They combine LLMs with tools like search engines, calculators, or APIs to perform complex tasks that require multiple steps."
        },
        {
            "question": "How do I implement RAG with LangChain?",
            "answer": "To implement RAG (Retrieval Augmented Generation): 1) Create embeddings of your documents, 2) Store them in a vector database, 3) Use a retriever to find relevant documents, 4) Combine retrieved context with your query in a prompt to the LLM."
        }
    ]
    
    converted_data = []
    for qa in langchain_qa:
        text = f"<s>[INST] {qa['question']} [/INST] {qa['answer']}</s>"
        converted_data.append({
            "text": text,
            "source": "synthetic"
        })
    
    return converted_data

# Run the fixed script
count = download_langchain_alternatives()

Trying LangChainDatasets/langchain-howto-queries...
Sample keys: ['inputs']
Found 0 examples from LangChainDatasets/langchain-howto-queries
Trying antonioibars/langchain-docs...
Sample keys: ['id', 'text', 'source']
Failed to load antonioibars/langchain-docs: 'source'
Trying clue2solve/langchain-additional-resources...
Sample keys: ['id', 'text', 'source']
Failed to load clue2solve/langchain-additional-resources: 'source'
Total saved: 108 LangChain examples


In [98]:
from datasets import load_dataset

# Load LangChain documentation dataset
dataset = load_dataset("hudsongeorge/langchain-docs")

# Load LangChain how-to queries
howto_dataset = load_dataset("LangChainDatasets/langchain-howto-queries")

# Convert to Llama format for fine-tuning
def convert_langchain_docs_to_llama(dataset):
    converted = []
    for item in dataset['train']:
        if 'question' in item and 'answer' in item:
            text = f"<s>[INST] {item['question']} [/INST] {item['answer']}</s>"
            converted.append({"text": text})
    with open("lang.json", 'w', encoding='utf-8') as f:
       json.dump(converted, f, indent=2, ensure_ascii=False)
converted = convert_langchain_docs_to_llama(dataset)

In [99]:
import json
from datasets import load_dataset

def download_ai_principles_dataset(output_file="ai_principles.json"):
    print("Loading AI principles datasets...")
    
    all_data = []
    
    # Dataset 1: Airoboros (contains AI reasoning)
    try:
        print("Loading airoboros dataset...")
        dataset = load_dataset("jondurbin/airoboros-gpt4-1.4.1")
        
        for item in dataset['train']:
            if any(keyword in item.get('instruction', '').lower() 
                  for keyword in ['ai', 'artificial intelligence', 'machine learning', 'reasoning', 'logic']):
                
                instruction = item['instruction'].strip()
                response = item['response'].strip()
                
                text = f"<s>[INST] {instruction} [/INST] {response}</s>"
                all_data.append({
                    "text": text,
                    "source": "airoboros",
                    "category": "ai_principles"
                })
        
        print(f"Found {len([x for x in all_data if x['source'] == 'airoboros'])} AI-related examples")
        
    except Exception as e:
        print(f"Error with airoboros: {e}")
    
    # Dataset 2: Filter OASST2 for AI topics
    try:
        print("Loading OpenAssistant for AI topics...")
        dataset = load_dataset("OpenAssistant/oasst2")
        msg_lookup = {item['message_id']: item for item in dataset['train']}
        
        ai_keywords = ['artificial intelligence', 'machine learning', 'ai', 'neural network', 
                      'deep learning', 'algorithm', 'data science', 'model training']
        
        for item in dataset['train']:
            if (item['role'] == 'assistant' and 
                item['parent_id'] and 
                item['lang'] == 'en' and
                item.get('review_result', False)):
                
                parent = msg_lookup.get(item['parent_id'])
                if parent and parent['role'] == 'prompter':
                    
                    user_text = parent['text'].lower()
                    if any(keyword in user_text for keyword in ai_keywords):
                        
                        conversation = f"<s>[INST] {parent['text'].strip()} [/INST] {item['text'].strip()}</s>"
                        all_data.append({
                            "text": conversation,
                            "source": "oasst2",
                            "category": "ai_functionality"
                        })
        
        print(f"Found {len([x for x in all_data if x['source'] == 'oasst2'])} AI conversations from OASST2")
        
    except Exception as e:
        print(f"Error with OASST2: {e}")
    
    # Save combined dataset
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_data, f, indent=2, ensure_ascii=False)
    
    print(f"Total saved: {len(all_data)} AI principles examples")
    return len(all_data)

# Usage
download_ai_principles_dataset()

Loading AI principles datasets...
Loading airoboros dataset...


README.md:   0%|          | 0.00/281 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


as_conversations.json:   0%|          | 0.00/50.0M [00:00<?, ?B/s]

instructions.jsonl:   0%|          | 0.00/45.9M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Error with airoboros: An error occurred while generating the dataset

All the data files must have the same columns, but at some point there are 4 new columns ({'category', 'question_id', 'instruction', 'response'}) and 2 missing columns ({'conversations', 'id'}).

This happened while the json dataset builder was generating data using

hf://datasets/jondurbin/airoboros-gpt4-1.4.1/instructions.jsonl (at revision 433c04038d724bf29a193bc3c1a48b600cc417a1)

Please either edit the data files to have matching columns, or separate them into different configurations (see docs at https://hf.co/docs/hub/datasets-manual-configuration#multiple-configurations)
Loading OpenAssistant for AI topics...
Found 7804 AI conversations from OASST2
Total saved: 7804 AI principles examples


7804

In [100]:
import json
from datasets import load_dataset

# 1. Load dataset
dataset = load_dataset("Pavithrars/AI_dataset")

# 2. Conversion function: dataset -> LLaMA-2 chat template
def to_llama_format(example):
    instruction = example.get("Question") or example.get("instruction") or "Explain AI."
    output = example.get("Answer") or example.get("output") or "Artificial Intelligence is..."
    
    # LLaMA 2 Chat format
    text = f"<s>[INST] <<SYS>>\nYou are a helpful assistant that explains how AI works.\n<</SYS>>\n\n{instruction} [/INST] {output} </s>"
    return {"text": text}

# 3. Apply conversion
train_dataset = dataset["train"].map(to_llama_format)

# 4. Save as JSON (list of objects)
data_list = [row for row in train_dataset]
with open("llama_ai_dataset.json", "w") as f:
    json.dump(data_list, f, indent=2)

print("Saved llama_ai_dataset.json")

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


AI_Tutor.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/173 [00:00<?, ? examples/s]

Map:   0%|          | 0/173 [00:00<?, ? examples/s]

Saved llama_ai_dataset.json


In [101]:
import json

# Load your original JSON
with open("llama_ai_dataset.json", "r") as f:
    data = json.load(f)

# Keep only the "text" field for fine-tuning
cleaned = [{"text": item["text"]} for item in data if "text" in item]

# Save cleaned dataset
with open("llama_ai_dataset_clean.json", "w") as f:
    json.dump(cleaned, f, indent=2)

print(f"Cleaned dataset size: {len(cleaned)} examples")


Cleaned dataset size: 173 examples


In [103]:
import json
from datasets import load_dataset

# Load dataset (replace with the one you actually picked)
dataset = load_dataset("lucadiliello/How_AI_Works")

records = []

for item in dataset["train"]:
    # Extract fields safely
    question = item.get("title") or item.get("question") or "Explain this AI concept."
    answer = item.get("document") or item.get("content") or item.get("summary") or "No answer available."

    # Format in LLaMA style
    text = f"<s>[INST] <<SYS>>\nYou are a helpful assistant that explains how AI works.\n<</SYS>>\n\n{question} [/INST] {answer} </s>"

    records.append({"text": text})

# Save as JSON (not JSONL)
with open("llama_ai_dataset2.json", "w") as f:
    json.dump(records, f, indent=2)

print(f"âœ… Saved {len(records)} examples in llama_ai_dataset.json")


ConnectionError: Couldn't reach 'lucadiliello/How_AI_Works' on the Hub (LocalEntryNotFoundError)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
from trl import SFTTrainer

# -----------------------------
# 1. Model in 4-bit (QLoRA)
# -----------------------------
use_4bit = True
bnb_4bit_compute_dtype="float16"
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False
)

model_name = "meta-llama/Llama-2-7b-hf"  # HF model repo

if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

model.config.use_cache=False
model.config.pretraining_tp=1

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # avoids pad errors
tokenizer.padding_side = "right"

# -----------------------------
# 2. Dataset
# -----------------------------
# Example: local JSON file in Colab
dataset = load_dataset("json", data_files="final.json", split="train")
# Split the dataset into training and validation sets
train_ds, val_ds = dataset.train_test_split(test_size=0.1).values()

# -----------------------------
# 3. LoRA Config
# -----------------------------
peft_config = LoraConfig(
    r=32,                 # rank (bottleneck size)
    lora_alpha=16,        # scaling factor
    lora_dropout=0.05,    # dropout probability
    bias="none",
    task_type="CAUSAL_LM"
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

# -----------------------------
# 4. Training Arguments
# -----------------------------
training_args = TrainingArguments(
    output_dir="./qlora-llama7b-finetuned",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,      # Effective batch size = 4 * 4 = 16
    optim="paged_adamw_32bit",
    save_steps=500,  # Changed from 0 to save checkpoints
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,                          
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="tensorboard",
    save_total_limit=2            # Keep only 2 latest checkpoints
)

# -----------------------------
# 5. Trainer (FIXED)
# -----------------------------
trainer = SFTTrainer(
    model=model,
    train_dataset=train_ds,      # Fixed: use train_ds instead of dataset
    eval_dataset=val_ds,         # Fixed: this was correct
    peft_config=peft_config,
    args=training_args,
    # If your dataset has a specific text field, add:
    # dataset_text_field="text"  # Only add this if your JSON has a "text" field
)

# -----------------------------
# 6. Train
# -----------------------------
print("Starting training...")
trainer.train()
print("Training finished.")

# -----------------------------
# 7. Save Adapter
# -----------------------------
model.save_pretrained("./qlora-llama7b-adapter")
tokenizer.save_pretrained("./qlora-llama7b-adapter")

print("Model and tokenizer saved successfully!")

In [None]:
from huggingface_hub import notebook_login

# ---------------------------------------------------------------------------
# 2. AUTHENTICATE & UPLOAD DATA (CRITICAL FIX)
# ---------------------------------------------------------------------------
# Login to Hugging Face to download the gated Llama 2 model
notebook_login()

In [None]:
from google.colab import files
uploaded = files.upload()  # pick llama_ai_dataset.json

In [None]:
!pip install -q -U accelerate peft bitsandbytes transformers trl datasets huggingface_hub

In [110]:
import requests
from bs4 import BeautifulSoup

url = "https://www.figma.com/resource-library/portfolio-website-examples/"
response = requests.get(url)
print(f"Status code: {response.status_code}")
soup = BeautifulSoup(response.text, "html")

import re
def remove_html_tags(text):
  pattern = re.compile('<.*?>')
  return pattern.sub(r'', text)
cleaned_text = remove_html_tags(soup.get_text())
with open("output.txt", "w", encoding="utf-8") as file:
  file.write(cleaned_text)

Status code: 200
