Use the `venv` virtual environment for generating the dataset.


Use the `.venv` virtual environment for training the LLM.

In [3]:
# %pip installs into the environment of the running kernel; the version is the one this notebook was run with.
%pip install PyPDF2==3.0.1

Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Run the download with the kernel's own interpreter so the model lands in the same environment.
import sys
!"{sys.executable}" -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


### Generate dataset

In [None]:
import re
import json
import random
import PyPDF2
import spacy
import requests
from spacy.matcher import Matcher
import os

# Load spaCy model
# (a single shared pipeline; the extract_* helpers that call nlp(...) all reuse it)
nlp = spacy.load("en_core_web_sm")

# Endpoint and model name that ask_llm sends in every request.
# NOTE(review): port 11434 with /api/generate looks like a local Ollama server — confirm.
URL = "http://localhost:11434/api/generate"
MODEL = "mistral"

# One HTTP session shared by all ask_llm calls.
session = requests.Session()

def extract_text_from_pdf(pdf_path):
    """
    Reads a PDF file and returns the text of all its pages.

    Parameters:
    - pdf_path (str): Path of the PDF file to open.

    Returns:
    - str: The extracted text of every page, each page followed by a newline.
    """
    print("extract_text_from_pdf here")
    with open(pdf_path, 'rb') as pdf_file:
        pdf = PyPDF2.PdfReader(pdf_file)
        # Extraction happens while the file is still open.
        page_texts = [page.extract_text() + "\n" for page in pdf.pages]
    return "".join(page_texts)

def ask_llm(prompt):
    """
    Sends a prompt to the local language model API and returns the response.

    Parameters:
    - prompt (str): The prompt text placed in the request payload.

    Returns:
    - The value of the "response" field of the JSON reply, or None when the request
      fails, the reply is not valid JSON, or the reply has no "response" field.
    """
    print("ask_llm here")
    payload = {"model": MODEL, "prompt": prompt, "stream": False}
    try:
        response = session.post(URL, json=payload)
        response.raise_for_status()
        body = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures that are not RequestException subclasses.
        print(f"Error in API request: {e}")
        return None

    # A reply without "response" (e.g. an error object) is reported instead of raising KeyError.
    answer = body.get("response") if isinstance(body, dict) else None
    if answer is None:
        print("Error in API request: reply has no 'response' field")
    return answer

def extract_email(text):
    """
    Finds the first e-mail address in a piece of text.

    Parameters:
    - text (str): Text to search; None or an empty string yields None.

    Returns:
    - str or None: The first address found, without trailing punctuation, or None.
    """
    print("extract_email here")
    if not text:
        return None
    # The domain is one or more dot-separated labels, so a sentence-ending "." or "-"
    # directly after the address is not captured as part of it.
    pattern = r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+"
    match = re.search(pattern, text)
    return match.group() if match else None

def extract_phone_number(text):
    """
    Finds the first phone-number-like sequence in a piece of text.

    Parameters:
    - text (str): Text to search; None or an empty string yields None.

    Returns:
    - str or None: The first run of at least ten digits/separators that starts (after an
      optional "+" or "(") and ends with a digit, or None when there is none.
    """
    print("extract_phone_number here")
    if not text:
        return None
    # The first digit may be 0 so that national formats such as "07123 456789" or
    # "(020) 7946 0958" keep their leading zero instead of being matched from the second digit.
    pattern = r"[\+\(]?[0-9][0-9 .\-\(\)]{8,}[0-9]"
    match = re.search(pattern, text)
    return match.group() if match else None

def extract_name(text):
    """
    Finds a name written as two upper-case words at the start of a line.

    Parameters:
    - text (str): Text to search; None or an empty string yields None.

    Returns:
    - str or None: The first two consecutive all-capital words (A-Z only) that begin a
      line, or None when no line starts that way.
    """
    print("extract_name here")
    if not text:
        return None
    # Only spaces/tabs may separate the two words: with \s the match could span a line
    # break and join a one-word heading with the first word of the following line.
    pattern = r"^[ \t]*([A-Z]+[ \t]+[A-Z]+)"
    match = re.search(pattern, text, re.MULTILINE)
    return match.group(1) if match else None

def extract_portfolio_linkedin(text):
    """
    Finds the first web link in a piece of text.

    Parameters:
    - text (str): Text to search.

    Returns:
    - str or None: The first http(s) URL or "linkedin.com/..." link, up to the next
      whitespace character, or None when there is none.
    """
    print("extract_portfolio_linkedin here")
    link_re = re.compile(r"(https?://[^\s]+)|(linkedin\.com/[^\s]+)")
    hit = link_re.search(text)
    if hit is None:
        return None
    return hit.group()

def extract_categories(text):
    """
    Splits resume text into sections based on heading keywords.

    Each heading is located with a spaCy Matcher (case-insensitive token match). A section
    runs from its heading token up to the nearest later heading of another section, or to
    the end of the document, and includes the heading words themselves.

    Parameters:
    - text (str): The resume text.

    Returns:
    - dict: Maps every section name to its stripped text, or "" when the heading was not found.
    """
    print("extract_categories here")

    section_rules = {
        "Education": [[{"LOWER": "education"}]],
        "Experience": [[{"LOWER": "experience"}]],
        "Skills": [[{"LOWER": "skills"}]],
        "Interests": [[{"LOWER": "interests"}]],
        "Extracurricular Activities": [[{"LOWER": "extracurricular"}, {"LOWER": "activities"}]],
        "Key Achievements": [[{"LOWER": "key"}, {"LOWER": "achievements"}]],
        "Personal Statement": [[{"LOWER": "personal"}, {"LOWER": "statement"}]],
    }

    parsed = nlp(text)
    heading_matcher = Matcher(nlp.vocab)
    for label, rule in section_rules.items():
        heading_matcher.add(label, rule)

    # When a heading keyword matches more than once, the match reported last wins.
    heading_at = dict.fromkeys(section_rules)
    for match_id, start, _end in heading_matcher(parsed):
        heading_at[nlp.vocab.strings[match_id]] = start

    known_starts = [pos for pos in heading_at.values() if pos is not None]
    sections = {}
    for label, begin in heading_at.items():
        if begin is None:
            sections[label] = ""
            continue
        # The section stops at the closest heading that starts after this one.
        stop = min((pos for pos in known_starts if pos > begin), default=len(parsed))
        sections[label] = parsed[begin:stop].text.strip()

    return sections

def _extract_dated_entries(text, org_key):
    """
    Groups the sentences of a text into dated entries.

    A sentence containing both a DATE and an ORG entity opens a new entry; every following
    sentence without both is appended to that entry's description. Sentences that come
    before the first such sentence are ignored.

    Parameters:
    - text (str): Section text to analyse with the shared spaCy pipeline.
    - org_key (str): Dictionary key under which the ORG entity text is stored.

    Returns:
    - list[dict]: Entries with the keys "date", org_key and "description", in document order.
    """
    doc = nlp(text)
    entries = []
    current = None

    for sent in doc.sents:
        # First DATE / ORG entity of the sentence, or None when the sentence has none.
        date = next((ent.text for ent in sent.ents if ent.label_ == "DATE"), None)
        org = next((ent.text for ent in sent.ents if ent.label_ == "ORG"), None)
        if date is not None and org is not None:
            current = {"date": date, org_key: org, "description": sent.text}
            entries.append(current)
        elif current is not None:
            current["description"] += " " + sent.text

    return entries

def extract_experiences(text):
    """
    Extracts work-experience entries from the text of the Experience section.

    Parameters:
    - text (str): Text of the Experience section.

    Returns:
    - list[dict]: One dict per entry with the keys "date", "organization" and "description".
    """
    print("extract_experiences here")
    return _extract_dated_entries(text, "organization")

def extract_education(text):
    """
    Extracts education entries from the text of the Education section.

    Parameters:
    - text (str): Text of the Education section.

    Returns:
    - list[dict]: One dict per entry with the keys "date", "institution" and "description".
    """
    print("extract_education here")
    return _extract_dated_entries(text, "institution")

def extract_resume_data(text):
    """
    Builds the structured resume record from the raw resume text.

    Parameters:
    - text (str): Full text of the resume.

    Returns:
    - dict: Contact fields ("name", "email", "phone_number", "portfolio_linkedin"), the parsed
      "education" and "experience" entries, and the raw text of the remaining sections.
    """
    print("extract_resume_data here")
    email = extract_email(text)
    phone_number = extract_phone_number(text)
    name = extract_name(text)
    link = extract_portfolio_linkedin(text)
    sections = extract_categories(text)

    resume = {
        "name": name,
        "email": email,
        "phone_number": phone_number,
        "portfolio_linkedin": link,
    }
    # These two sections are parsed further into lists of entries.
    resume["education"] = extract_education(sections.get("Education", ""))
    resume["experience"] = extract_experiences(sections.get("Experience", ""))

    # The remaining sections are passed through as plain text.
    plain_sections = (
        ("skills", "Skills"),
        ("interests", "Interests"),
        ("extracurricular_activities", "Extracurricular Activities"),
        ("key_achievements", "Key Achievements"),
        ("personal_statement", "Personal Statement"),
    )
    for field, heading in plain_sections:
        resume[field] = sections.get(heading, "")

    return resume

def generate_question(category, data):
    """
    Asks the language model for a question about one resume category.

    Parameters:
    - category (str): Name of the resume category, e.g. "skills".
    - data: The resume content of that category; it is interpolated into the prompt.

    Returns:
    - The model's reply as returned by ask_llm, or None when the request failed.
    """
    print("generate_question here")
    prompt = f"Write a question about the person's {category} based on this information: {data}"
    return ask_llm(prompt)

def generate_answer(category, data, question=None):
    """
    Asks the language model for a third-person answer about one resume category.

    Parameters:
    - category (str): Name of the resume category, e.g. "skills".
    - data: The resume content of that category; it is interpolated into the prompt.
    - question (str, optional): The question to answer. When omitted, the prompt only names
      the category, which is the behaviour of the two-argument call.

    Returns:
    - The model's reply as returned by ask_llm, or None when the request failed.
    """
    print("generate_answer here")
    if question is None:
        prompt = f"Generate a concise answer in the third person to the following question about {category}, based on this information: {data}"
    else:
        # The question has to be part of the prompt, otherwise the answer is unrelated to it.
        prompt = (
            f"Generate a concise answer in the third person to the following question about {category}: {question}\n"
            f"Base the answer only on this information: {data}"
        )
    return ask_llm(prompt)

def generate_training_data(resume_data, num_samples=5):
    """
    Generates question/answer training samples from the structured resume data.

    For each sample a category is drawn at random among those that have content, a question
    is generated for it, and then an answer to that question. Samples for which either call
    fails are skipped, so fewer than num_samples items may be returned.

    Parameters:
    - resume_data (dict): Resume record keyed by category name.
    - num_samples (int): Number of generation attempts.

    Returns:
    - list[dict]: Items of the form {"text": "<s>[INST] question [/INST] answer</s>"}.
    """
    print("generate_training_data here")
    results = []
    categories = ["education", "experience", "skills", "interests", "key_achievements", "personal_statement"]

    # Categories without content would make the model invent facts, so they are not sampled.
    usable_categories = [name for name in categories if resume_data.get(name)]
    if not usable_categories:
        print("No resume category has content; no training samples generated.")
        return results

    for _ in range(num_samples):
        category = random.choice(usable_categories)
        data = resume_data[category]
        question = generate_question(category, data)
        if not question or not question.strip():
            continue
        question = question.strip()
        answer = generate_answer(category, data, question)
        if answer and answer.strip():
            results.append({
                "text": f"<s>[INST] {question} [/INST] {answer.strip()}</s>"
            })

    return results

def shuffle_and_split_data(data, train_ratio=0.8, seed=None):
    """
    Shuffles the samples and splits them into a training and a validation part.

    The input list is left untouched; a shuffled copy is split.

    Parameters:
    - data (list): The samples to split.
    - train_ratio (float): Fraction of the samples, between 0 and 1, that goes to training.
    - seed (int, optional): When given, the shuffle is reproducible for that seed;
      otherwise the module-level random generator is used.

    Returns:
    - tuple[list, list]: (training samples, validation samples).

    Raises:
    - ValueError: If train_ratio is outside the range [0, 1].
    """
    print("shuffle_and_split_data here")
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio}")

    shuffled = list(data)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    split_index = int(len(shuffled) * train_ratio)
    return shuffled[:split_index], shuffled[split_index:]

def save_data(data, filepath):
    """
    Writes the samples to a file, one JSON document per line.

    Parameters:
    - data (iterable): JSON-serialisable items to write.
    - filepath (str): Destination file; it is created or overwritten, encoded as UTF-8.

    Returns:
    - None
    """
    print("save_data here")
    with open(filepath, "w", encoding="utf-8") as out:
        out.writelines(json.dumps(record) + "\n" for record in data)

def main(pdf_path, output_dir, num_samples=5):
    """
    Runs the dataset pipeline: PDF text -> resume record -> Q/A samples -> train/valid files.

    Any exception raised along the way is caught and reported with a printed message.

    Parameters:
    - pdf_path (str): Path of the resume PDF.
    - output_dir (str): Directory that receives train.jsonl and valid.jsonl; created if missing.
    - num_samples (int): Number of samples requested from generate_training_data.

    Returns:
    - None
    """
    print("main here")
    try:
        os.makedirs(output_dir, exist_ok=True)

        resume_data = extract_resume_data(extract_text_from_pdf(pdf_path))
        samples = generate_training_data(resume_data, num_samples)
        train_split, valid_split = shuffle_and_split_data(samples)

        train_file, valid_file = (
            os.path.join(output_dir, filename) for filename in ("train.jsonl", "valid.jsonl")
        )
        for split, destination in ((train_split, train_file), (valid_split, valid_file)):
            save_data(split, destination)

        print(f"Successfully generated and split {len(samples)} samples.")
        print(f"Training data: {len(train_split)} samples saved to {train_file}")
        print(f"Validation data: {len(valid_split)} samples saved to {valid_file}")
    except Exception as e:
        print(f"Error processing PDF or generating data: {str(e)}")

if __name__ == "__main__":
    # Joined with os.path.join so the path also resolves on systems that do not use "\" as separator.
    pdf_path = os.path.join("raw_data", "UK_Resume_Alexis_BALAYRE.pdf")  # Replace with the path to your PDF file
    output_dir = "train_data"
    num_samples = 500  # Increased for a more meaningful split
    main(pdf_path, output_dir, num_samples)
    

main here
extract_text_from_pdf here
extract_resume_data here
extract_email here
extract_phone_number here
extract_name here
extract_portfolio_linkedin here
extract_categories here
extract_education here
extract_experiences here
generate_training_data here
generate_question here
ask_llm here


### Training 

In [3]:
# %pip installs into the environment of the running kernel. The pinned versions are the ones
# pip resolved when this notebook was run; the training cells use that trl release's SFTTrainer arguments.
%pip install torch transformers==4.45.2 datasets==3.0.1 peft==0.13.2 trl==0.11.4 bitsandbytes

Collecting transformers
  Using cached transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
Collecting datasets
  Using cached datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting peft
  Using cached peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting trl
  Using cached trl-0.11.4-py3-none-any.whl.metadata (12 kB)
Using cached transformers-4.45.2-py3-none-any.whl (9.9 MB)
Using cached datasets-3.0.1-py3-none-any.whl (471 kB)
Using cached peft-0.13.2-py3-none-any.whl (320 kB)
Using cached trl-0.11.4-py3-none-any.whl (316 kB)
Installing collected packages: transformers, datasets, trl, peft


ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'C:\\Users\\gunes\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python311\\site-packages\\transformers\\models\\deprecated\\trajectory_transformer\\convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py'



In [2]:
"""
This script demonstrates the process of fine-tuning a large language model using the Hugging Face Transformers library,
BitsAndBytes for memory optimization, and PEFT (Parameter Efficient Fine Tuning) techniques for efficient training.
It involves loading and preparing a model and tokenizer, configuring training parameters, performing the training process,
and saving the fine-tuned model. It specifically focuses on causal language modeling with a custom dataset.
"""

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, PeftModel
from datasets import load_dataset
from trl import SFTTrainer

# Initial configuration for model and tokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
new_model = "AlexisBalayre/who-am-I_mistral_7b"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Pad with the EOS token, on the right-hand side of each sequence.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Configuration for BitsAndBytes to reduce memory footprint
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Load the base model in 4-bit. The BitsAndBytes settings only take effect when they are
# passed as quantization_config; without it the 7B model is loaded unquantized.
# NOTE(review): 4-bit loading needs a CUDA build of bitsandbytes — confirm the target machine has one.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
)
model.config.use_cache = False

# prepare_model_for_kbit_training freezes the base weights itself, and quantized weights
# cannot require gradients, so requires_grad is not set by hand here. The trainable
# parameters are the LoRA adapters added by get_peft_model below.
model = prepare_model_for_kbit_training(model)

# Configure the pad token in the model to match the tokenizer
model.config.pad_token_id = tokenizer.pad_token_id

# PEFT-specific configuration for the model to introduce parameter-efficient techniques
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],
)
# peft_config = LoraConfig(
#     r=32,
#     lora_alpha=64,
#     lora_dropout=0.1,
#     bias="none",
#     task_type="CAUSAL_LM",
#     target_modules=[
#         "q_proj",
#         "k_proj",
#         "v_proj",
#         "o_proj",
#         "gate_proj",
#         "up_proj",
#         "down_proj",
#         "lm_head",
#     ],
# )
model = get_peft_model(model, peft_config)

# Load the training and validation splits written by the dataset-generation cell,
# which saves them under train_data/.
dataset = load_dataset(
    "json", data_files={"train": "train_data/train.jsonl", "validation": "train_data/valid.jsonl"}
)

def tokenize_function(examples):
    """
    Tokenizes a batch of examples with the module-level tokenizer.

    Every text is truncated to at most 512 tokens and padded to exactly that length.

    Parameters:
    - examples (dict): A batch with a "text" entry holding the texts to tokenize.

    Returns:
    - The tokenizer's output for the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=512, padding="max_length")
    return encoded

# Tokenize both splits in batches; the original columns of the train split are dropped,
# so only the tokenizer's outputs remain in the mapped dataset.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)

# Configuration of training arguments
# NOTE(review): save_strategy is "epoch", so save_steps=1000 presumably has no effect — confirm.
# NOTE(review): fp16=True is combined with a model loaded in torch.float16 — verify this
# trains without gradient-scaling errors on the target hardware.
training_arguments = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    optim="adamw_torch",
    save_steps=1000,
    logging_steps=100,
    learning_rate=2e-5,
    fp16=True,
    gradient_checkpointing=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    max_grad_norm=1.0,
    num_train_epochs=3.0,
    weight_decay=0.01,
    warmup_steps=100,
    lr_scheduler_type="cosine",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# Initialize the trainer with model, datasets, PEFT configuration, and training arguments
# NOTE(review): the datasets passed here are already tokenized and no longer have a "text"
# column, while dataset_text_field="text" is still given — confirm the trainer accepts this.
# NOTE(review): model was already wrapped by get_peft_model and peft_config is passed again —
# confirm the adapter is not applied twice.
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

# Start the training process
trainer.train()

# Save the fine-tuned model to a specified path
# (new_model is used as the local output path here)
trainer.model.save_pretrained(new_model)

# Clear the memory footprint after training is complete
del model, trainer
torch.cuda.empty_cache()

# Example for reloading the base model with memory optimization settings and merging PEFT parameters
# device_map={"": 0} places the whole model on device 0.
base_model_reload = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
    trust_remote_code=True,
)
# Attach the saved adapter to the reloaded base model, then merge it into the base weights.
model = PeftModel.from_pretrained(base_model_reload, new_model)
model = model.merge_and_unload()

# Save the merged model and the tokenizer side by side in the same directory
model_dir = "./merged_model"
model.save_pretrained(model_dir, safe_serialization=True)
tokenizer.save_pretrained(model_dir)

CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend


RuntimeError: CUDA is required but not available for bitsandbytes. Please consider installing the multi-platform enabled version of bitsandbytes, which is currently a work in progress. Please check currently supported platforms and installation instructions at https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend

In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from trl import SFTTrainer

def main():
    """
    Fine-tunes TinyLlama-1.1B-Chat with LoRA adapters on train_data/train.jsonl and
    train_data/valid.jsonl, then saves the trained PEFT model with save_pretrained.

    Returns:
    - None
    """
    # Initial configuration
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    new_model = "gunzzz24/fine-tuned-tinyllama-1.1b"  # used below as the save_pretrained output path

    # Tokenizer setup
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # Pad with the EOS token, on the right-hand side of each sequence.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Model setup
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )
    model.config.use_cache = False
    
    # Enable gradient checkpointing
    # (done before the model is wrapped by get_peft_model below)
    model.gradient_checkpointing_enable()

    # PEFT configuration
    peft_config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    )
    model = get_peft_model(model, peft_config)

    # Load and tokenize datasets
    dataset = load_dataset("json", data_files={"train": "train_data/train.jsonl", "validation": "train_data/valid.jsonl"})

    def tokenize_function(examples):
        # Truncate each text to at most 256 tokens and pad it to exactly that length.
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)

    # The original columns of the train split are dropped; only the tokenizer's outputs remain.
    tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)

    # Training arguments
    # NOTE(review): lr_scheduler_type="constant" presumably ignores warmup_ratio — confirm, or
    # use a scheduler with warmup if warmup is intended.
    # NOTE(review): max_steps=1000 with 16 accumulated samples per step is many passes over a
    # small dataset (the recorded run loaded 40 training examples) — confirm this is intended.
    training_arguments = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=16,
        optim="adamw_torch",
        save_steps=500,
        logging_steps=100,
        learning_rate=2e-4,
        fp16=True,
        gradient_checkpointing=True,
        eval_steps=500,
        evaluation_strategy="steps",
        save_strategy="steps",
        max_grad_norm=0.3,
        max_steps=1000,
        warmup_ratio=0.03,
        group_by_length=True,
        lr_scheduler_type="constant",
        load_best_model_at_end=True,
    )

    # Initialize the trainer
    # NOTE(review): the datasets are already tokenized and have no "text" column, while
    # dataset_text_field="text" is still passed; model is also already a PEFT model when
    # peft_config is passed again — confirm the trainer handles both as intended.
    trainer = SFTTrainer(
        model=model,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        peft_config=peft_config,
        dataset_text_field="text",
        tokenizer=tokenizer,
        args=training_arguments,
        packing=False,
        max_seq_length=256,
    )

    # Start the training process
    trainer.train()

    # Save the fine-tuned model
    trainer.model.save_pretrained(new_model)

    # Clear memory
    del model, trainer
    torch.cuda.empty_cache()

    print("Fine-tuning completed successfully!")

if __name__ == "__main__":
    main()

Generating train split: 40 examples [00:00, 582.08 examples/s]
Generating validation split: 10 examples [00:00, 440.75 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 477.31 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 643.34 examples/s]

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
max_steps is given, it will override any value given in num_train_epochs
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


KeyboardInterrupt: 

In [5]:
# Each version specifier is quoted: an unquoted ">" is taken by the shell as output
# redirection, so the version bounds would never reach pip.
%pip install "torch>=2.0.0" "transformers>=4.30.0" "peft>=0.3.0" "datasets>=2.10.0" "trl>=0.4.1" "bitsandbytes>=0.39.0" "accelerate>=0.20.0" scipy scikit-learn

In [7]:
# %pip installs into the environment of the running kernel.
%pip install ipywidgets



In [8]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoKrConfig, get_peft_model
from datasets import load_dataset
from trl import SFTTrainer
import os
from huggingface_hub import login

def main():
    """
    Fine-tunes TinyLlama-1.1B-Chat with LoKr adapters on train_data/train.jsonl and
    train_data/valid.jsonl, then saves the adapter, the wrapped base model and the
    tokenizer locally and pushes them to the Hugging Face Hub.

    Authentication uses the HF_TOKEN environment variable when it is set; otherwise
    login() falls back to its interactive prompt.

    Returns:
    - None
    """
    # Initial configuration
    model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    new_model_name = "fine-tuned-tinyllama-1.1b-lokr"
    output_dir = "./results"
    hf_model_name = "gunzzz24/fine-tuned-tinyllama-1.1b-cv-analyser-lokr"  # Change this to your desired Hugging Face model name

    # Hugging Face login
    # Passing a token avoids the widget-based prompt, which needs ipywidgets inside a
    # notebook. With HF_TOKEN unset, token is None and login() prompts as before.
    login(token=os.environ.get("HF_TOKEN"))

    # Tokenizer setup
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # Pad with the EOS token, on the right-hand side of each sequence.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Model setup
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )
    model.config.use_cache = False

    # Enable gradient checkpointing (before the model is wrapped by get_peft_model)
    model.gradient_checkpointing_enable()

    # LoKr configuration
    # LoKrConfig has no lora_dropout argument; its dropout settings are rank_dropout and
    # module_dropout, so the 0.05 dropout is applied as rank_dropout.
    peft_config = LoKrConfig(
        r=8,
        alpha=32,
        rank_dropout=0.05,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, peft_config)

    # Load and tokenize datasets (the dataset-generation cell writes them under train_data/)
    dataset = load_dataset("json", data_files={"train": "train_data/train.jsonl", "validation": "train_data/valid.jsonl"})

    def tokenize_function(examples):
        # Truncate each text to at most 256 tokens and pad it to exactly that length.
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)

    tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names)

    # Training arguments
    # NOTE(review): lr_scheduler_type="constant" presumably ignores warmup_ratio — confirm.
    training_arguments = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=16,
        optim="adamw_torch",
        save_steps=500,
        logging_steps=100,
        learning_rate=2e-4,
        fp16=True,
        gradient_checkpointing=True,
        eval_steps=500,
        evaluation_strategy="steps",
        save_strategy="steps",
        max_grad_norm=0.3,
        max_steps=1000,
        warmup_ratio=0.03,
        group_by_length=True,
        lr_scheduler_type="constant",
        load_best_model_at_end=True,
        push_to_hub=True,
        hub_model_id=hf_model_name,
    )

    # Initialize the trainer
    # NOTE(review): the datasets are already tokenized and have no "text" column, while
    # dataset_text_field="text" is still passed — confirm the trainer accepts this.
    trainer = SFTTrainer(
        model=model,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        peft_config=peft_config,
        dataset_text_field="text",
        tokenizer=tokenizer,
        args=training_arguments,
        packing=False,
        max_seq_length=256,
    )

    # Start the training process
    trainer.train()

    # Save and upload the fine-tuned model
    print("Saving and uploading the fine-tuned model...")

    # Save and upload the PEFT (LoKr) adapters
    peft_model_path = os.path.join(output_dir, new_model_name + "-peft")
    trainer.model.save_pretrained(peft_model_path, push_to_hub=True, repo_id=hf_model_name)

    # Save and upload the base model
    # NOTE(review): base_model here is the PEFT wrapper around the model with the adapter
    # layers injected, not the pristine TinyLlama checkpoint — confirm this upload is wanted.
    base_model_path = os.path.join(output_dir, new_model_name + "-base")
    trainer.model.base_model.save_pretrained(base_model_path, push_to_hub=True, repo_id=hf_model_name + "-base")

    # Save and upload the tokenizer
    tokenizer_path = os.path.join(output_dir, new_model_name + "-tokenizer")
    tokenizer.save_pretrained(tokenizer_path, push_to_hub=True, repo_id=hf_model_name)

    print("Model saved and uploaded successfully!")
    print(f"PEFT adapters: https://huggingface.co/{hf_model_name}")
    print(f"Base model: https://huggingface.co/{hf_model_name}-base")
    print(f"Tokenizer: https://huggingface.co/{hf_model_name}")

    # Clear memory
    del model, trainer
    torch.cuda.empty_cache()

    print("Fine-tuning, saving, and uploading completed successfully!")

if __name__ == "__main__":
    main()

ImportError: The `notebook_login` function can only be used in a notebook (Jupyter or Colab) and you need the `ipywidgets` module: `pip install ipywidgets`.