In [None]:
!pip install -q -U bitsandbytes==0.42.0
!pip install -q -U peft==0.8.2
!pip install -q -U trl==0.7.10
!pip install -q -U accelerate==0.27.1
!pip install -q -U transformers==4.38.0

In [None]:
!pip install -q -U datasets==2.16.1

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [2]:
# from huggingface_hub import notebook_login
# notebook_login()

In [3]:
model_id = "Mr-Vicky-01/Gemma-2B-Finetuined-pythonCode"


model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
job_title = "ML Engineer"
preferred_qualification = "3+ years of Deep Learning"
hiring_company_name = "Google"
user_name = "Vicky"
past_working_experience= "Associate Analyst at zoho for 4 years"
current_working_experience = "Senior Analyst at TCS for 1 year"
skilleset= "Machine Learning, Deep Learning, AI, SQL, NLP"
qualification = "Bachelor of commerce with computer application"


prompt_template = f"<start_of_turn>user Generate Cover Letter for Role: {job_title}, \
 Preferred Qualifications: {preferred_qualification}, \
 Hiring Company: {hiring_company_name}, User Name: {user_name}, \
 Past Working Experience: {past_working_experience}, Current Working Experience: {current_working_experience}, \
 Skillsets: {skilleset}, Qualifications: {qualification} <end_of_turn>\n<start_of_turn>model"

# prompt_template = """
# <start_of_turn>user based on general question tell me correct answer here are the question
# What is Artificial Intelligence?
# <end_of_turn>\n<start_of_turn>model
# """

prompt = prompt_template
encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

model_inputs = encodeds.to('cuda')

# Increase max_new_tokens if needed
generated_ids = model.generate(**model_inputs, max_new_tokens=250, do_sample=True, pad_token_id=tokenizer.eos_token_id)
decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(decoded)

In [4]:
from datasets import load_dataset

dataset = load_dataset("ShashiVish/cover-letter-dataset")
dataset

DatasetDict({
    train: Dataset({
        features: ['Job Title', 'Preferred Qualifications', 'Hiring Company', 'Applicant Name', 'Past Working Experience', 'Current Working Experience', 'Skillsets', 'Qualifications', 'Cover Letter'],
        num_rows: 813
    })
    test: Dataset({
        features: ['Job Title', 'Preferred Qualifications', 'Hiring Company', 'Applicant Name', 'Past Working Experience', 'Current Working Experience', 'Skillsets', 'Qualifications', 'Cover Letter'],
        num_rows: 349
    })
})

In [5]:
from datasets import concatenate_datasets, DatasetDict

summary_train = concatenate_datasets([dataset['train'],dataset['test']])

raw_datasets = DatasetDict()
raw_datasets["train"] = summary_train

In [6]:
raw_datasets["train"]

Dataset({
    features: ['Job Title', 'Preferred Qualifications', 'Hiring Company', 'Applicant Name', 'Past Working Experience', 'Current Working Experience', 'Skillsets', 'Qualifications', 'Cover Letter'],
    num_rows: 1162
})

In [7]:
def generate_prompt(data_point):
    """Generate input text based on a prompt, task instruction, (context info), and answer.

    :param data_point: dict: Data point
    :return: dict: Data point with the added "prompt" field
    """
    
    prompt_text = f"""<start_of_turn>user Generate Cover Letter for Role: {data_point['Job Title']}, \
                 Preferred Qualifications: {data_point['Preferred Qualifications']}, \
                 Hiring Company: {data_point['Hiring Company']}, User Name: {data_point['Applicant Name']}, \
                 Past Working Experience: {data_point['Past Working Experience']}, Current Working Experience: {data_point['Current Working Experience']}, \
                 Skillsets: {data_point['Skillsets']}, Qualifications: {data_point['Qualifications']} <end_of_turn>\n<start_of_turn>model: {data_point["Cover Letter"]}"""
    data_point["prompt"] = prompt_text
    return data_point

# Add the "prompt" column to the dataset
dataset = raw_datasets["train"].map(generate_prompt)

# Print the updated dataset
dataset

Dataset({
    features: ['Job Title', 'Preferred Qualifications', 'Hiring Company', 'Applicant Name', 'Past Working Experience', 'Current Working Experience', 'Skillsets', 'Qualifications', 'Cover Letter', 'prompt'],
    num_rows: 1162
})

In [8]:
print(dataset["prompt"][10])

<start_of_turn>user Generate Cover Letter for Role:  Data Scientist,                  Preferred Qualifications: 5-10 years of experience in data analysis.
Experience in anti-money laundering/terrorist financing activities.
Knowledge of French and English (bilingual).
Bachelor's degree in computer science, mathematics, data science, informatics, operations research, engineering, or a related field.
Experience with Microsoft Azure and Azure DevOps.,                  Hiring Company: XYZ Corporation, User Name: John Smith,                  Past Working Experience: Data Analyst at ABC Company, Current Working Experience: Data Scientist at DEF Company,                  Skillsets: Python, SQL, SAS, Power BI, TensorFlow, data analysis, reporting design, anti-money laundering, French and English language proficiency, Microsoft Azure., Qualifications: Bachelor's degree in computer science. <end_of_turn>
<start_of_turn>model: Dear Hiring Manager,

I am writing to express my interest in the Data S

In [9]:
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset here
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

In [10]:
dataset

Dataset({
    features: ['Job Title', 'Preferred Qualifications', 'Hiring Company', 'Applicant Name', 'Past Working Experience', 'Current Working Experience', 'Skillsets', 'Qualifications', 'Cover Letter', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 1162
})

In [11]:
# dataset = dataset.train_test_split(test_size=0.1)
# train_data = dataset["train"]
# test_data = dataset["test"]

In [12]:
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [13]:
import bitsandbytes as bnb
def find_all_linear_names(model):
  cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
  lora_module_names = set()
  for name, module in model.named_modules():
    if isinstance(module, cls):
      names = name.split('.')
      lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names: # needed for 16-bit
      lora_module_names.remove('lm_head')
  return list(lora_module_names)

In [14]:
modules = find_all_linear_names(model)
print(modules)

['up_proj', 'gate_proj', 'q_proj', 'v_proj', 'down_proj', 'o_proj', 'k_proj']


In [15]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=128,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

In [16]:
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 156893184 | total: 2663065600 | Percentage: 5.8915%


In [17]:
import transformers
import torch
from trl import SFTTrainer

tokenizer.pad_token = tokenizer.eos_token
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    dataset_text_field="prompt",
    peft_config=lora_config,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,  # Increased batch size
        gradient_accumulation_steps=2,  # Adjusted accumulation steps
        warmup_steps=500,  # Increased warm-up steps
        num_train_epochs=1,  # Increased epochs
        max_steps=500,  # Increased maximum steps
        learning_rate=1e-4,  # Decreased learning rate
        logging_steps=10,  # Adjusted logging frequency
        output_dir="python_model",  # Change the output directory
        optim="adamw_hf",  # Changed optimizer
        save_strategy="epoch",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)


2024-03-21 09:55:01.685676: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-21 09:55:01.685729: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-21 09:55:01.687029: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [18]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mvickyvijay069[0m ([33mvicky12[0m). Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
10,1.8288
20,1.9043
30,1.7477
40,1.7541
50,1.5967
60,1.5073
70,1.3748
80,1.3941
90,1.116
100,1.0341



KeyboardInterrupt



In [19]:
new_model = "gemma-finetuned-python_code"
trainer.model.save_pretrained(new_model)

In [20]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
merged_model= PeftModel.from_pretrained(base_model, new_model)
merged_model= merged_model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained("merged_model_new",safe_serialization=True)
tokenizer.save_pretrained("merged_model_new_tokenizer")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

merged_model = AutoModelForCausalLM.from_pretrained("/kaggle/working/merged_model_new")
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/merged_model_new_tokenizer")

In [31]:
job_title = "ML Engineer"
preferred_qualification = "strong AI realted skills"
hiring_company_name = "Google"
user_name = "Vicky"
past_working_experience= "N/A"
current_working_experience = "Fresher"
skilleset= "Machine Learning, Deep Learning, AI, SQL, NLP"
qualification = "Bachelor of commerce with computer application"


prompt_template = f"<start_of_turn>user Generate Cover Letter for Role: {job_title}, \
 Preferred Qualifications: {preferred_qualification}, \
 Hiring Company: {hiring_company_name}, User Name: {user_name}, \
 Past Working Experience: {past_working_experience}, Current Working Experience: {current_working_experience}, \
 Skillsets: {skilleset}, Qualifications: {qualification} <end_of_turn>\n<start_of_turn>model"

prompt = prompt_template
encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True).input_ids

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
merged_model.to(device)
inputs = encodeds.to(device)


# Increase max_new_tokens if needed
generated_ids = merged_model.generate(inputs, max_new_tokens=250, do_sample=False, pad_token_id=tokenizer.eos_token_id)
ans = ''
for i in tokenizer.decode(generated_ids[0], skip_special_tokens=True).split('<end_of_turn>')[:2]:
    ans += i

# Extract only the model's answer
model_answer = ans.split("model")[1].strip()
print(model_answer)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


I am a recent graduate with a Bachelor of Commerce with Computer Application. I have strong AI related skills and am eager to apply them at Google. I am currently a fresher and have no previous working experience. I am confident that my skills and experience will make me a valuable asset to your team. I am looking forward to the opportunity to contribute to your success.


In [30]:
dataset['Cover Letter'][0]

'With a strong background in customer support and a knack for problem-solving, I am confident in my ability to significantly contribute to Innovation Inc. In my previous roles at XYZ Company and ABC Company, I have consistently provided high-quality support to customers, resolving issues efficiently and maintaining a high level of customer satisfaction. I am eager to bring my strong commitment to quality support to your team.'

In [27]:
dataset["prompt"][0]

'<start_of_turn>user Generate Cover Letter for Role: Senior Support Engineer,                  Preferred Qualifications: 5+ years experience in customer support, knowledge of SQL,                  Hiring Company: Innovation Inc., User Name: Jane Smith,                  Past Working Experience: Customer Support Representative at XYZ Company for 3 years, Current Working Experience: Senior Customer Support Representative at ABC Company for 3 years,                  Skillsets: Customer Support, SQL, Problem-solving, Qualifications: B.A. in Business Administration <end_of_turn>\n<start_of_turn>model: With a strong background in customer support and a knack for problem-solving, I am confident in my ability to significantly contribute to Innovation Inc. In my previous roles at XYZ Company and ABC Company, I have consistently provided high-quality support to customers, resolving issues efficiently and maintaining a high level of customer satisfaction. I am eager to bring my strong commitment t