# Loading the data set

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

path = "/content/drive/MyDrive/resume-txt/100-JD.xlsx"
df = pd.read_excel(path)
print(df)

    DATA SCIENCE                                          Unnamed: 1
0             NaN                                                NaN
1            JD 1  Job description\nQualifications :\n6+ years ex...
2            JD 2  Job description\nYou will build projects with ...
3            JD 3  Job description\n\nAdvanced knowledge of proba...
4            JD 4  Job description\n• Exp in Data Scientist / Ana...
..            ...                                                ...
130          JD 6  Job description\nRole & responsibilities\n\n1....
131          JD 7  Job description\nLead and support Pipe Stress ...
132          JD 8  Job description\nWe are looking for a candidat...
133          JD 9  Job description\nI&C Design and Detailed Engin...
134         JD 10  Job description\nSolar PV Design Engineer For ...

[135 rows x 2 columns]


In [3]:
df.rename(columns= {"Unnamed: 1": "JD"}, inplace = True)

In [4]:
df.head()

Unnamed: 0,DATA SCIENCE,JD
0,,
1,JD 1,Job description\nQualifications :\n6+ years ex...
2,JD 2,Job description\nYou will build projects with ...
3,JD 3,Job description\n\nAdvanced knowledge of proba...
4,JD 4,Job description\n• Exp in Data Scientist / Ana...


In [5]:
df.shape

(135, 2)

In [6]:
df.columns

Index(['DATA SCIENCE ', 'JD'], dtype='object')

In [7]:
df = df.dropna()
df.shape

(97, 2)

In [8]:
df.head()

Unnamed: 0,DATA SCIENCE,JD
1,JD 1,Job description\nQualifications :\n6+ years ex...
2,JD 2,Job description\nYou will build projects with ...
3,JD 3,Job description\n\nAdvanced knowledge of proba...
4,JD 4,Job description\n• Exp in Data Scientist / Ana...
5,JD 5,Job description\nL&D Trainer - Python & Data S...


# Prompt and Model

In [9]:
!pip install clarifai
!pip install langchain

Collecting clarifai
  Downloading clarifai-9.8.1-py3-none-any.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting clarifai-grpc>=9.8.1 (from clarifai)
  Downloading clarifai_grpc-9.8.2-py3-none-any.whl (216 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.9/216.9 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tritonclient==2.34.0 (from clarifai)
  Downloading tritonclient-2.34.0-py3-none-manylinux1_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
Collecting tqdm==4.64.1 (from clarifai)
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rich==13.4.2 (from clarifai)
  Downloading rich-13.4.2-py3-none-any.whl (239 kB)
[2K    

In [10]:
from langchain.llms import Clarifai
from langchain import PromptTemplate, LLMChain

clarifai_llm = Clarifai(
    pat='4d72f91e513247889ee7c9130d28f674', user_id= "meta", app_id="Llama-2", model_id = "llama2-70b-chat"
)

clarifai_llm_2=Clarifai(
    pat='4d72f91e513247889ee7c9130d28f674', user_id= "clarifai", app_id="ml", model_id = "llama2-70b-chat-alternative"
)

clarifai_llm_3=Clarifai(
    pat='4d72f91e513247889ee7c9130d28f674', user_id= "openai", app_id="chat-completion", model_id = "GPT-3_5-turbo"
)

In [11]:
def prompt_llama(jd):

  template_llama = """
          <s>[INST] <<SYS>>
          You are a skilled talent recruiter, your task is to provide a concise structured summary of the given job description in a JSON format.

          Follow the following instructions:
          Step-1: Analyse and parse the following information from the job description, do not just extract the data, rephrase it meaningfully:
              Role, Relevant Experiences required, Experience Duration required, Skillset and Tools required, Projects required,
              Certifications required and Roles Achievements Contributions required
              If value of a key is missing in the resume then value should be null.
              If not a resume then all the key's value should be null
          Step-2: Only return the meaningful parsed data in a sturctured JSON format with key and corresponding value format as follows-
              'Role':string
              'Relevant Experiences required': string,
              'Experience Duration required': string,
              'Skillset and Tools required' : string,
              'Project description': string,
              'Responsibilities required': string,
              'Certifications required': string
              'Education required': string
              If not a job description then all the key's value should be null.
                <</SYS>>
                Paragraph
                Job_description: {jd}
                Only return the structured parsed json format of the resume of candidate.
          [/INST]
          """


  # Get model prediction
  prompt = PromptTemplate(template=template_llama, input_variables=["jd"])
  llm_chain_detail = LLMChain(prompt=prompt, llm=clarifai_llm_2)
  jd_summ =llm_chain_detail.run(jd=jd)

  return jd_summ


# Check for 1 JD

In [12]:
jd = list(df['JD'])[0]
#print(jd)

In [17]:

res = prompt_llama(str(jd))
print(res)

 {
"Role": "Machine Learning Engineer",
"Relevant Experiences required": "6+ years experience working in a Data Science role",
"Experience Duration required": null,
"Skillset and Tools required": "Python, Java, Spark, Hadoop, NoSQL Database, data science libraries, command line Linux environment",
"Projects required": null,
"Responsibilities required": "developing and deploying ML models, measuring model impact, collaborating with cross-functional teams",
"Certifications required": null,
"Education required": "Bachelor's degree in Computer Science, Mathematics, Statistics, or other analytical fields"
}


# Create text columns

In [18]:
def create_text_columns(jd, output):
  text = """[INST] <<SYS>>
          You are a skilled talent recruiter, your task is to provide a concise structured summary of the given job description in a JSON format.

    Follow the following instructions:
    Step-1: Analyse and parse the following information from the job description, do not just extract the data, rephrase it meaningfully:
        Role, Relevant Experiences required, Experience Duration required, Skillset and Tools required, Projects required,
        Certifications required and Roles Achievements Contributions required
        If value of a key is missing in the resume then value should be null.
        If not a resume then all the key's value should be null
    Step-2: Only return the meaningful parsed data in a sturctured JSON format with key and corresponding value format as follows-
        'Role':string
        'Relevant Experiences required': string,
        'Experience Duration required': string,
        'Skillset and Tools required' : string,
        'Project description': string,
        'Responsibilities required': string,
        'Certifications required': string
        'Education required': string
        If not a job description then all the key's value should be null.
          <</SYS>> """ + str(jd) + " [/INST] " + str(output)

  return text

# Run on entire dataset

In [None]:
data = []
for index, row in df.iterrows():
  try:
    print(index)
    jd = str(row['JD'])
    output = str(prompt_llama(jd))
    text = create_text_columns(jd, output)

    temp = dict()
    temp["JD"] = jd
    temp["output"] = output
    temp["text"] = text
    data.append(temp)
  except Exception as e:
    print("The error is: ",e)
    continue


In [21]:
jd_summ_100 = pd.DataFrame(data)

In [23]:
jd_summ_100.shape

(86, 3)

In [1]:
jd_summ_100.head()

NameError: ignored

In [24]:
jd_summ_100 = pd.DataFrame(data)
jd_summ_100.to_excel('jd_summ_100.xlsx')

# Fine-tuning the Llama 2 Model

## Data preparation

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:

import pandas as pd

path = "/content/drive/MyDrive/resume-txt/jd_summ_100.xlsx"

# reading the CSV file
df = pd.read_excel(path)
type(df)

pandas.core.frame.DataFrame

In [14]:
df

Unnamed: 0.1,Unnamed: 0,JD,output,text
0,0,Job description\nQualifications :\n6+ years ex...,"{\n""Role"": ""Machine Learning Engineer"",\n""Rel...",[INST] <<SYS>>\n You are a skilled ta...
1,1,Job description\nYou will build projects with ...,"{\n""Role"": ""Business Intelligence & Analytics...",[INST] <<SYS>>\n You are a skilled ta...
2,2,Job description\n\nAdvanced knowledge of proba...,"{\n""Role"": ""Manager-Delivery"",\n""Relevant Exp...",[INST] <<SYS>>\n You are a skilled ta...
3,3,Job description\n• Exp in Data Scientist / Ana...,"{\n""Role"": ""Data Science & Analytics - Other""...",[INST] <<SYS>>\n You are a skilled ta...
4,4,Job description\nL&D Trainer - Python & Data S...,"{\n""Role"": ""L&D Trainer - Python & Data Scien...",[INST] <<SYS>>\n You are a skilled ta...
...,...,...,...,...
81,81,Job description\nWe are looking for dynamic an...,"{\n""Role"": ""Accounts Executive"",\n""Relevant E...",[INST] <<SYS>>\n You are a skilled ta...
82,82,Job description\nRoles and Responsibilities : ...,"{\n""Role"": ""Finance Executive"",\n""Relevant Ex...",[INST] <<SYS>>\n You are a skilled ta...
83,83,"Job description\nDS,GST INCOME TAX ON SALARY I...","{\n""Role"": ""Accountant / Accounts Executive"",...",[INST] <<SYS>>\n You are a skilled ta...
84,84,Job description\nSkills\n\nBank Reconciliation...,"{\n""Role"": ""Accountant / Accounts Executive"",...",[INST] <<SYS>>\n You are a skilled ta...


In [15]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.1)

In [16]:
print(train_df.shape, test_df.shape)

(77, 4) (9, 4)


In [None]:
!pip install datasets

In [18]:
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
from datasets import Dataset


dataset = ds.dataset(pa.Table.from_pandas(train_df).to_batches())

### convert to Huggingface dataset
train = Dataset(pa.Table.from_pandas(train_df))



In [19]:
train

Dataset({
    features: ['Unnamed: 0', 'JD', 'output', 'text', '__index_level_0__'],
    num_rows: 77
})

In [20]:
dataset = ds.dataset(pa.Table.from_pandas(test_df).to_batches())

### convert to Huggingface dataset
test = Dataset(pa.Table.from_pandas(test_df))


# Packages

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

In [22]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# Hyperparameter

In [23]:
# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"

# The instruction dataset to use
dataset_name = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model name
new_model = "llama-2-7b-jd-summarizer"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

# Continue

In [24]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)


In [25]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [26]:
# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

In [27]:
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

In [28]:
model.config.use_cache = False
model.config.pretraining_tp = 1

In [29]:
# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training


Downloading (…)okenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

# Testing before trraining the model

In [30]:
d1 = pd.DataFrame(test)
d1.head()

Unnamed: 0.1,Unnamed: 0,JD,output,text,__index_level_0__
0,41,"Job description\nProduct management, understan...","{\n""Role"": ""Product Manager"",\n""Relevant Expe...",[INST] <<SYS>>\n You are a skilled ta...,41
1,33,"Job description\nAs a Product Manager, you wil...","{\n""Role"": ""Product Manager"",\n""Relevant Expe...",[INST] <<SYS>>\n You are a skilled ta...,33
2,13,Job description\nLocation : Remote\n\nRole : F...,"{\n""Role"": ""Full Stack Engineer"",\n""Relevant ...",[INST] <<SYS>>\n You are a skilled ta...,13
3,46,Job description\nAssist the sales team in the ...,"{\n""Role"": ""Product Manager"",\n""Relevant Expe...",[INST] <<SYS>>\n You are a skilled ta...,46
4,58,Job description\nRole Overview:\nWe are seekin...,"{\n""Role"": ""Producer"",\n""Relevant Experiences...",[INST] <<SYS>>\n You are a skilled ta...,58


In [31]:
prompt = "A recipe calls for 2 cups of flour. If you want to make half of the recipe, how many cups of flour do you need?"  # change to your desired prompt
gen = pipeline('text-generation', model=model, tokenizer=tokenizer, max_length = 200)
result = gen(prompt)
print(result[0]['generated_text'])

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


A recipe calls for 2 cups of flour. If you want to make half of the recipe, how many cups of flour do you need?

Answer: To make half of the recipe, you will need 1 cup of flour.

Explanation: If a recipe calls for 2 cups of flour and you want to make half of the recipe, you will need 1 cup of flour. This is because half of 2 cups is equal to 1 cup.


In [32]:
# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

In [33]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

In [34]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)



Map:   0%|          | 0/77 [00:00<?, ? examples/s]

In [35]:
# Train model
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=20, training_loss=1.900202178955078, metrics={'train_runtime': 270.1145, 'train_samples_per_second': 0.285, 'train_steps_per_second': 0.074, 'total_flos': 1427848087265280.0, 'train_loss': 1.900202178955078, 'epoch': 1.0})

In [39]:
# Save trained model
#trainer.model.save_pretrained(new_model)

# Prompt for fine-tuned model

In [36]:
def prompt_fine_tuned_model(jd):
  template_llama = f"""
          <s>[INST] <<SYS>>
          You are a skilled talent recruiter, your task is to provide a concise structured summary of the given job description in a JSON format.

          Follow the following instructions:
          Step-1: Analyse and parse the following information from the job description, do not just extract the data, rephrase it meaningfully:
              Role, Relevant Experiences required, Experience Duration required, Skillset and Tools required, Projects required,
              Certifications required and Roles Achievements Contributions required
              If value of a key is missing in the resume then value should be null.
              If not a resume then all the key's value should be null
          Step-2: Only return the meaningful parsed data in a sturctured JSON format with key and corresponding value format as follows-
              'Role':string
              'Relevant Experiences required': string,
              'Experience Duration required': string,
              'Skillset and Tools required' : string,
              'Project description': string,
              'Responsibilities required': string,
              'Certifications required': string
              'Education required': string
              If not a job description then all the key's value should be null.
                <</SYS>>
                Paragraph
                Job_description: {jd}
                Only return the structured parsed json format of the resume of candidate.
          [/INST]
          """
  return template_llama

In [37]:
test_df = pd.DataFrame(test)

In [38]:
jd = list(test_df["JD"])[0]
print(jd)

Job description
Product management, understanding product P&L and business levers
Ability to work with large teams and drive product development
Strong influencing & communication skills both internally and with external partners
support to sales team
Maintains MIS
Role: Product Manager
Industry Type: NBFC
Department: Product Management
Employment Type: Full Time, Permanent
Role Category: Product Management - Technology
Education
UG: Any Graduate


In [None]:
prompt = prompt_fine_tuned_model(jd)
gen = pipeline('text-generation', model=model, tokenizer=tokenizer)
result = gen(prompt)
print(result[0]['generated_text'])

# load the model in huggingface

In [50]:
import locale
print(locale.getpreferredencoding())

ANSI_X3.4-1968


In [51]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [53]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [56]:
model.push_to_hub(model, use_temp_dir=False)
tokenizer.push_to_hub(model, use_temp_dir=False)

In [55]:
trainer.push_to_hub()

In [2]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

NameError: ignored

In [None]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7