In [2]:
from datasets import load_dataset

dataset = load_dataset("Krooz/Campus_Recruitment_Text")
dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['prompt', 'label', 'response'],
        num_rows: 7225
    })
    validation: Dataset({
        features: ['prompt', 'label', 'response'],
        num_rows: 1275
    })
    test: Dataset({
        features: ['prompt', 'label', 'response'],
        num_rows: 1500
    })
})

In [3]:
dataset['train'][0]

{'prompt': '[INST] Instruction:\nWrite an objective overview about the following colleage student based only on the provided structured data in the JSON format.\nYou should include details and cover the information mentioned in the placement data. The overview should be 100 - 200 words. \nDon’t make up information. Don\'t give any additional feedback just represent the given information in the overview. \nUse a random human name for the student, Dont start with \'based on the structured data\'\n\nStructured data:\n{\n    "CGPA": 7.7,\n    "Internships": 1,\n    "Projects": 1,\n    "Workshops/Certifications": 0,\n    "AptitudeTestScore": 69,\n    "SoftSkillsRating": 4.0,\n    "ExtracurricularActivities": "No",\n    "PlacementTraining": "No",\n    "SSC_Marks": 55,\n    "HSC_Marks": 69\n}\n\n- SSC_Marks denote marks attained by the student in senion secondary school\n- HSC_Marks denote marks attained by the student in higher seconday school\n- CGPA is the cummulative GPA attained by the s

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig

# BitsAndBytesConfig to quantize the model int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

llm_name = "mistralai/Mistral-7B-Instruct-v0.1"
model = AutoModelForCausalLM.from_pretrained(llm_name, device_map='auto', quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(llm_name)
tokenizer.pad_token = tokenizer.eos_token

2024-02-15 00:17:18.975030: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-15 00:17:19.713645: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Downloading shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 574.88it/s]
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:24<00:00, 12.08s/it]


In [5]:
from peft import LoraConfig

# PEFT Config 
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    r=64,
    bias="none",
    task_type="CAUSAL_LM"
)

In [6]:
from peft import prepare_model_for_kbit_training, get_peft_model

# prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [7]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")

# get frozen vs trainable model param statistics
print_trainable_parameters(model)

trainable params: 170082304 || all params: 3922153472 || trainable%: 4.336452033664837


In [39]:
def classify_report(sample):
    return f"""### Instruction: 
Classify the student into Placed/NotPlaced based on his/her college report details. The report includes marks scored by the student in various courses and extra curricular activities taken by them.

### Report:
{sample['response']}

### Label:
{sample['label']}
"""

In [15]:
print(classify_report(dataset['train'][0]))

### Instruction: 
Classify the student into Placed/NotPlaced based on his/her college report details. The report includes marks scored by the student in various courses and extra curricular activities taken by them.

### Report:
Johnson is a driven and motivated university student with a CGPA of 7.7 in his academic pursuits. He has completed one internship and one project during his time in university. Johnson has not participated in any workshops or certifications, but has demonstrated a strong aptitude with a score on the aptitude test of 69.

In terms of soft skills, Johnson has a solid rating with a score of 4.0. Additionally, Johnson has been actively involved in extracurricular activities, reporting that he takes part in "No" extracurricular organizations.

During placement training, Johnson did not participate. Despite this, he achieved strong grades attained by the student in his senior secondary school and higher secondary school marks of 55 and 69 respectively.

Overall, John

In [None]:
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments

model_args = TrainingArguments(
    output_dir = "/dccstor/kirushikesh/mistral_instruct_classify",
    num_train_epochs = 5,
    per_device_train_batch_size = 6,
    warmup_steps = 0.03,
    logging_steps=10,
    learning_rate=2e-4,
    bf16=True,
    lr_scheduler_type='constant',
    disable_tqdm=True,
    evaluation_strategy="epoch",
    load_best_model_at_end=True, 
    save_strategy="epoch",
    save_total_limit=1,
)

# Supervised Fine-Tuning Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    peft_config=peft_config,
    max_seq_length=2048,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=classify_report,
    args=model_args,
)

# train
trainer.train()

In [2]:
# login to HF hub
from dotenv import load_dotenv
from huggingface_hub import login
import os

load_dotenv()
login(os.getenv('HUGGING_FACE_TOKEN'))

  from .autonotebook import tqdm as notebook_tqdm


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /u/kirushi/.cache/huggingface/token
Login successful


In [5]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import torch

# fine-tuned model id
model_id = "/dccstor/kirushikesh/mistral_instruct_classify/checkpoint-167"

finetuned_model = AutoPeftModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# Load tokenizer 
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:23<00:00, 11.81s/it]


In [6]:
# push model and tokenizer to HF hub under your username
finetuned_model.push_to_hub("Krooz/placement-classification-mistral-7b-instruct-v1")
tokenizer.push_to_hub("Krooz/placement-classification-mistral-7b-instruct-v1")

adapter_model.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████| 680M/680M [00:16<00:00, 42.4MB/s]
tokenizer.model: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 493k/493k [00:00<00:00, 2.63MB/s]


CommitInfo(commit_url='https://huggingface.co/Krooz/placement-classification-mistral-7b-instruct-v1/commit/cfbb3e1aff99a1391fa767923cd01134e1c0cffc', commit_message='Upload tokenizer', commit_description='', oid='cfbb3e1aff99a1391fa767923cd01134e1c0cffc', pr_url=None, pr_revision=None, pr_num=None)

In [5]:
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

# fine-tuned model id
model_id = "Krooz/placement-classification-mistral-7b-instruct-v1"

# load base LLM model, LoRA params and tokenizer
model = AutoPeftModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

adapter_config.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 648/648 [00:00<00:00, 188kB/s]
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:44<00:00, 22.21s/it]
adapter_model.safetensors: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 680M/680M [00:14<00:00, 46.7MB/s]
tokenizer_config.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1.47k/1.47k [00:00<00:00, 643kB/s]
tokenizer.model: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 493k/493k [00:00<00:00, 72.3MB/s]
tokenizer.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 1.80M/1.80M [00:00<00:00, 24.8MB/s]
special_tokens_map.json: 100%|████

In [8]:
from random import randrange

def classify_report(sample):
    return f"""### Instruction: 
Classify the student into Placed/NotPlaced based on his/her college report details. The report includes marks scored by the student in various courses and extra curricular activities taken by them.

### Report:
{sample['response']}

### Label:
"""

# select random sample
sample = dataset['test'][randrange(len(dataset['test']))]

# create prompt for inference
prompt = classify_report(sample)
print(prompt)

### Instruction: 
Classify the student into Placed/NotPlaced based on his/her college report details. The report includes marks scored by the student in various courses and extra curricular activities taken by them.

### Report:
John is a college-level student who has demonstrated academic excellence throughout his schooling journey. He has a cumulative GPA of 6.7 in his university, indicating his strong academic abilities. Additionally, he scored 63 on an aptitude test, showcasing his analytical and problem-solving skills. John has also engaged in one project, demonstrating his creativity and practical skills.

In terms of extracurricular activities, John is actively involved in a range of areas. He has participated in one project, which showcases his ability to work collaboratively and achieve results independently. However, he has zero internships and zero workshops/certifications, which could have been an area for improvement.

In terms of soft skills, John has a rating of 3.8, whi

In [17]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
# tokenize input text
tokenizer.pad_token = tokenizer.eos_token
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.to(device)

# inference, 5 outfit combinations make up around 700-750 tokens
with torch.inference_mode():
    outputs = model.generate(
        input_ids=input_ids, 
        max_new_tokens=20, 
        do_sample=False
    )

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [19]:
# decode token ids to text
outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# outputs is a list of length num_prompts
# parse the completed part
output = outputs[0][len(prompt):]

print(f"Context: \n{sample['response']}\n")
print(f"Ground truth: \n{sample['label']}\n")
print(f"Generated output: \n{output}\n\n\n")

Context: 
John is a college-level student who has demonstrated academic excellence throughout his schooling journey. He has a cumulative GPA of 6.7 in his university, indicating his strong academic abilities. Additionally, he scored 63 on an aptitude test, showcasing his analytical and problem-solving skills. John has also engaged in one project, demonstrating his creativity and practical skills.

In terms of extracurricular activities, John is actively involved in a range of areas. He has participated in one project, which showcases his ability to work collaboratively and achieve results independently. However, he has zero internships and zero workshops/certifications, which could have been an area for improvement.

In terms of soft skills, John has a rating of 3.8, which suggests that he has strong social and communication capabilities. He has no placement training, meaning he would benefit from gaining hands-on experience in a professional environment.

Overall, John has good academ