In [None]:
!pip install torch
!pip install peft
!pip install bitsandbytes
!pip install transformers
!pip install trl
!pip install accelerate
!pip install einops
!pip install datasets
!pip install rich

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer
from rich import print
import json

In [None]:
# Hugging Face Hub id of the base checkpoint; reused below when loading the
# tokenizer and the model.
base_model_name = "microsoft/phi-2"

# Public demo dataset, kept for reference only; the notebook builds `dataset`
# from a local JSON file instead.
# dataset = load_dataset("prsdm/medquad-phi2-1k", split="train")

In [None]:
import json

def read_json_file(file_path):
    """
    Read a JSON file and return the parsed content.

    The file is decoded as UTF-8 explicitly, so non-ASCII text (for example
    typographic apostrophes) is read the same way regardless of the platform's
    default encoding.

    Args:
        file_path (str): The path to the JSON file.

    Returns:
        The parsed JSON value (a list or a dict, depending on the file's
        top-level structure), or None if the file is missing or does not
        contain valid JSON. An error message is printed in both failure cases.
    """
    try:
        # Do not rely on the locale's default encoding for JSON text.
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in {file_path}")
        return None

# Load the training records from the local JSON file.
file_path = '/content/dataset.json'
data = read_json_file(file_path)

if data:
    # Show only the size and a small sample: dumping every record floods the
    # cell output and may expose personal details stored in the dataset.
    print(f"Loaded {len(data)} records from {file_path}")
    print(data[:3] if isinstance(data, list) else data)

In [None]:
# Dataset
from datasets import Dataset

# read_json_file returns None when the file is missing or malformed; stop here
# with a clear message instead of failing inside Dataset.from_list.
if data is None:
    raise ValueError("Dataset was not loaded; check the JSON file path and format.")

# Build a Hugging Face Dataset from the list of records read from the JSON file.
dataset = Dataset.from_list(data)

In [None]:
# Print the Dataset object as a quick sanity check that construction succeeded.
print(dataset)

In [None]:
# Inspect the first record to verify the prompt layout stored in its 'text' field.
dataset[0]

{'text': '### Instruction: What score did he obtain in his Bachelor’s program?\n### Assistant: \n He studied Computer Science during his Bachelor’s at St. John’s College and achieved 7.73 CGPA.\n'}

In [None]:
# Tokenizer for the base checkpoint (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=True)
# Reuse the unknown-token as the padding token.
# NOTE(review): presumably the checkpoint ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.unk_token
# Pad on the right-hand side of each sequence.
tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [None]:
# Quantization configuration; passed as `quantization_config` when the base
# model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights in 4-bit
    bnb_4bit_quant_type="nf4",  # 4-bit quantization type
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for computation
    bnb_4bit_use_double_quant=False,  # double quantization disabled
)

In [None]:
# Load the base model with the 4-bit quantization config defined above.
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,  # checkpoint id, passed positionally
    quantization_config=bnb_config,
    # NOTE(review): this allows code shipped with the checkpoint to run; confirm it is still required.
    trust_remote_code=True,
    device_map={"": 0}  # map the whole model (the "" key) to device 0
)
# Alternative load call kept for reference:
# model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2",quantization_config=bnb_config, torch_dtype="auto", trust_remote_code=True)

# Turn off the cache flag on the model config before training.
# NOTE(review): presumably because gradient checkpointing is enabled in the training arguments — confirm.
model.config.use_cache = False
# NOTE(review): pretraining_tp looks like a setting from other model families; verify it has any effect here.
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# LoRA configuration handed to the SFT trainer as `peft_config`.
peft_config = LoraConfig(
    r= 64,  # LoRA rank
    lora_alpha= 16,  # LoRA alpha (scaling) value
    lora_dropout=0.05, # previously tried: 0.1
    bias="none",
    task_type="CAUSAL_LM",
    # target_modules is left unset, so the library default applies.
    # Earlier candidates, kept for reference:
    #target_modules= ["Wqkv", "out_proj"] #["Wqkv", "fc1", "fc2" ] # ["Wqkv", "out_proj", "fc1", "fc2" ]
)

In [None]:
# Set training arguments (consumed by the SFT trainer through `args=`).
training_arguments = TrainingArguments(
    output_dir = "./results",  # where trainer outputs are written
    num_train_epochs = 1,
    # Both mixed-precision flags are off.
    fp16 = False,
    bf16 = False,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,  # no eval dataset is passed to the trainer in this notebook
    gradient_accumulation_steps = 1,
    gradient_checkpointing = True,
    max_grad_norm = 0.3,  # gradient clipping threshold
    learning_rate = 2e-4,
    weight_decay = 0.001,
    optim = "paged_adamw_32bit",  # optimizer name
    lr_scheduler_type = "cosine",
    max_steps = -1,  # NOTE(review): presumably "no explicit step cap, use num_train_epochs" — confirm
    warmup_ratio = 0.03,
    group_by_length = True,
    save_steps = 0,  # NOTE(review): looks intended to disable intermediate checkpoints — confirm
    logging_steps = 25,  # the training log below reports loss every 25 steps
)

In [None]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    # Hand over the tokenizer configured earlier in the notebook. Without it the
    # trainer loads its own copy from the checkpoint, so the pad-token and
    # padding-side settings are silently ignored and the tokenizer saved later
    # may differ from the one used for training.
    # `processing_class` is the current TRL name for this argument (older
    # releases called it `tokenizer`).
    processing_class=tokenizer,
    args=training_arguments,
)



Converting train dataset to ChatML:   0%|          | 0/1000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Experiment tracking with Weights & Biases.
import wandb

# Start a run in the "ResumeRiser" project. On first use this prompts for an
# API key, as seen in the cell output.
wandb.init(project="ResumeRiser")

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mslowery1992[0m ([33mslowery1992-stanley[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Train model: run the fine-tuning loop with the trainer built earlier.
trainer.train()

Step,Training Loss
25,2.3433
50,1.6528
75,0.7255
100,0.6933
125,0.3425
150,0.4351
175,0.1612
200,0.346
225,0.1037
250,0.221


TrainOutput(global_step=250, training_loss=0.7024418334960938, metrics={'train_runtime': 409.2782, 'train_samples_per_second': 2.443, 'train_steps_per_second': 0.611, 'total_flos': 839468550144000.0, 'train_loss': 0.7024418334960938})

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the fine-tuned model is saved under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Destination directory on the mounted Google Drive for the fine-tuned model files.
# NOTE(review): the "7B" suffix does not obviously match the phi-2 base checkpoint — confirm the intended name.
fine_tuned_model_path='/content/drive/MyDrive/fine_tuned_model/Fine_tuned_microsoft_phi-2-ResumeRiser-7B'
# Directory creation is left disabled here.
# os.makedirs(fine_tuned_model_path, exist_ok=True)

In [None]:
import shutil

# Remove only this model's previous export so stale files do not mix with the
# new save. The earlier shell `rm -rf` wiped the whole parent folder on Drive,
# including anything else stored next to this model.
# ignore_errors=True keeps the cell a no-op when the directory does not exist yet.
shutil.rmtree(fine_tuned_model_path, ignore_errors=True)

0

In [None]:
# Save the fine-tuned model and the tokenizer into the Drive export directory.
# NOTE(review): the first two calls write to the same directory; the second may make the first redundant — confirm.
model.save_pretrained(fine_tuned_model_path)
trainer.save_model(fine_tuned_model_path)  # Save through the trainer as well
tokenizer.save_pretrained(fine_tuned_model_path)  # Explicitly save the tokenizer files (the cell output lists the written paths)

('/content/drive/MyDrive/fine_tuned_model/Fine_tuned_microsoft_phi-2-ResumeRiser-7B/tokenizer_config.json',
 '/content/drive/MyDrive/fine_tuned_model/Fine_tuned_microsoft_phi-2-ResumeRiser-7B/special_tokens_map.json',
 '/content/drive/MyDrive/fine_tuned_model/Fine_tuned_microsoft_phi-2-ResumeRiser-7B/vocab.json',
 '/content/drive/MyDrive/fine_tuned_model/Fine_tuned_microsoft_phi-2-ResumeRiser-7B/merges.txt',
 '/content/drive/MyDrive/fine_tuned_model/Fine_tuned_microsoft_phi-2-ResumeRiser-7B/added_tokens.json',
 '/content/drive/MyDrive/fine_tuned_model/Fine_tuned_microsoft_phi-2-ResumeRiser-7B/tokenizer.json')

# Push to the Hugging Face Hub

In [None]:
!pip install huggingface_hub



In [None]:
from huggingface_hub import notebook_login
# Interactive login widget: the access token is entered at runtime rather than
# being stored in the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Target repository on the Hugging Face Hub, in "<username>/<model-name>" form.
repo_id = "Mangal-404/phi-2-ResumeRiser-7B"  # Replace with your Hugging Face username and desired model name
# Upload the model first, then the tokenizer, to the same repository.
model.push_to_hub(repo_id, private=False)  # Set private=True if you want a private repository
tokenizer.push_to_hub(repo_id, private=False)

adapter_model.safetensors:   0%|          | 0.00/294M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Mangal-404/phi-2-ResumeRiser-7B/commit/55bceedcd4afaa6dfc524cc2af73963fa747a93b', commit_message='Upload tokenizer', commit_description='', oid='55bceedcd4afaa6dfc524cc2af73963fa747a93b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Mangal-404/phi-2-ResumeRiser-7B', endpoint='https://huggingface.co', repo_type='model', repo_id='Mangal-404/phi-2-ResumeRiser-7B'), pr_revision=None, pr_num=None)

In [None]:
# Hub repository id used by the inference cell.
# NOTE(review): presumably repeated so inference can run without re-running the upload cell — confirm.
repo_id = "Mangal-404/phi-2-ResumeRiser-7B"

In [None]:
!pip install -q transformers

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model and tokenizer from the Hub
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# Test inference
input_text = "What tools and libraries does Mangalesh use for data analysis?"
# Wrap the question in the same "### Instruction / ### Assistant" template the
# training records use; a bare question does not match what the model was
# fine-tuned on.
prompt = f"### Instruction: {input_text}\n### Assistant:"
# device_map="auto" decides where the model lives, so follow the model's device
# instead of hard-coding "cuda".
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# max_new_tokens bounds only the generated answer; max_length also counted the
# prompt tokens and could cut the answer short.
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
import gc

# Clear the memory
del model, trainer
# Collect unreachable objects first: tensors kept alive only by reference
# cycles are not freed by `del`, and empty_cache() can only return memory that
# no live tensor still owns.
gc.collect()
torch.cuda.empty_cache()