In [1]:
%%capture
# Install/upgrade the libraries used for fine-tuning into the kernel's
# environment; `%%capture` hides pip's output for this cell.
# NOTE(review): every package is upgraded to its latest release (`-U`) with no
# version pin, so a re-run may resolve different versions than the ones that
# produced the outputs saved in this notebook — consider pinning.
%pip install -U transformers
%pip install -U datasets
%pip install -U accelerate
%pip install -U peft
%pip install -U trl
%pip install -U bitsandbytes
%pip install -U wandb

In [2]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [5]:
from getpass import getpass

from huggingface_hub import interpreter_login

# Interactive Hugging Face Hub login: the token is typed at a prompt, so it is
# never stored in the notebook source.
interpreter_login()

# SECURITY: never commit the W&B API key with the notebook. Read it from the
# WANDB_API_KEY environment variable and fall back to a hidden prompt.
# The key that used to be hardcoded on this line is exposed in the notebook
# history and should be revoked in the W&B settings.
wb_token = os.environ.get("WANDB_API_KEY") or getpass("W&B API key: ")

wandb.login(key=wb_token)

# Start the run that the trainer reports to (TrainingArguments uses
# report_to="wandb").
# NOTE(review): the project name mentions a medical dataset while the data
# loaded below is an interview dataset — confirm the intended project name.
run = wandb.init(
    project='Fine-tune Llama 3 8B on Medical Dataset',
    job_type="training",
    anonymous="allow"
)


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): ··········
Add token as git credential? (Y/n) Y
Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m270203manoj[0m ([33m270203manoj-pes-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
# Identifiers shared by the cells below.
base_model = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # checkpoint that gets fine-tuned
dataset_name = "Manoj2702/InterviewTrain"  # dataset id handed to load_dataset
new_model = "llama-3-8b-judge-interview"  # Trainer output_dir; also the name used when saving/pushing the adapter

In [7]:
# Compute dtype handed to BitsAndBytesConfig (bnb_4bit_compute_dtype) below.
torch_dtype = torch.float16
# Attention backend passed to from_pretrained (attn_implementation) below.
attn_implementation = "eager"

In [8]:
# 4-bit quantization settings: NF4 quantization type, double quantization
# enabled, and `torch_dtype` as the compute dtype.
quantization_kwargs = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch_dtype,
)
bnb_config = BitsAndBytesConfig(**quantization_kwargs)

# Load the base checkpoint with the quantization config above; device
# placement is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    attn_implementation=attn_implementation,
    quantization_config=bnb_config,
)

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [9]:
# Load the tokenizer published with the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)
# Run TRL's chat-format setup on both the model and the tokenizer. The sample
# printed by the dataset cell shows `<|im_start|>` markers, so prompts are
# rendered in that format from here on.
# NOTE(review): the base model is an Instruct checkpoint, which presumably
# already ships its own chat template — confirm that replacing it is intended,
# and that the installed TRL release still allows calling setup_chat_format on
# a tokenizer that already has a template.
model, tokenizer = setup_chat_format(model, tokenizer)

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [10]:
# LoRA adapter settings: rank 16, alpha 32, dropout 0.05, no bias training,
# applied to the seven projection modules listed here.
lora_target_modules = [
    'up_proj', 'down_proj', 'gate_proj',
    'k_proj', 'q_proj', 'v_proj', 'o_proj',
]
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the loaded model with the adapters described by `peft_config`.
# NOTE(review): `prepare_model_for_kbit_training` is imported at the top of the
# notebook but never applied before wrapping — confirm that is intentional.
model = get_peft_model(model, peft_config)

In [11]:
# Load the dataset from the Hub; split="all" requests every available split as
# a single Dataset.
dataset = load_dataset(dataset_name, split="all")
# dataset = dataset.shuffle(seed=65).select(range(1000)) # Only use 1000 samples for quick demo

def format_chat_template(row):
    """Add a ``text`` entry holding the row rendered as a two-turn chat.

    The ``question_cand_answer`` field becomes the user turn and the
    ``justification_judge`` field becomes the assistant turn. The pair is
    rendered to a string (``tokenize=False``) with the chat template of the
    notebook-level ``tokenizer``.

    Args:
        row: Mapping with ``question_cand_answer`` and
            ``justification_judge`` keys.

    Returns:
        The same ``row`` object, updated in place with the ``text`` key.
    """
    user_turn = {"role": "user", "content": row["question_cand_answer"]}
    assistant_turn = {"role": "assistant", "content": row["justification_judge"]}
    conversation = [user_turn, assistant_turn]
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    row["text"] = rendered
    return row

# Render every row to chat text, spreading the work over 4 worker processes.
dataset = dataset.map(format_chat_template, num_proc=4)

# Show one rendered example as the cell output.
dataset["text"][3]

Downloading readme:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/10.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/816 [00:00<?, ? examples/s]

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/816 [00:00<?, ? examples/s]

"<|im_start|>user\nInterviewer: Great! Can you describe your role and responsibilities as the Platform Architect for the IPP project? \n\n\nInterviewee: Yes. So, my role as a platform architect as, as IPP project at Zooy So, as you may know, Zoie is, is one of our largest shareholders, and, you know, we just need a way to connect and integrate with, and the IPP project is basically we're working on the projects where we can connect and integrate with a lot of systems and manufacturers' equipment, supply chain managements, and CRM. So my role here is we I'm working on the overall technical visions and road map of the IPP. So I I was, designing, like, hybrid cloud infrastructures, on AWS. Just to fulfill, fulfill the the the scalability and secure security requirements. I evaluate and chose the appropriate IoT platforms and protocol for the device connectivity and data digestsions. And also, I, was have the responsibility to implement security measures as well as ensure the compliance wi

In [12]:
# Hold out 10% of the rows for evaluation. The split is seeded so that a
# Restart & Run All reproduces the same train/test partition (it was random
# on every run before).
# After this line `dataset` holds the "train" and "test" splits used by the
# trainer cell.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [13]:
# Hyper-parameters for the fine-tuning run.
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=2,
    # `evaluation_strategy` is the deprecated spelling of this option and is
    # rejected by newer transformers releases, which the unpinned `-U` install
    # at the top of the notebook will pull in; `eval_strategy` is the
    # supported name.
    eval_strategy="steps",
    # A float below 1 is a fraction of the total training steps: the saved run
    # evaluated at steps 147/294/441/588 out of 734.
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb"
)



In [14]:
# Supervised fine-tuning trainer over the rendered "text" column.
#
# `peft_config` is intentionally NOT passed here: `model` has already been
# wrapped by get_peft_model in the LoRA cell, so handing the same config to the
# trainer again is redundant, and a trainer that acts on the argument could
# re-apply the adapter to an already-wrapped model.
#
# NOTE(review): the saved output of this cell warns that max_seq_length,
# dataset_text_field and packing are deprecated as SFTTrainer arguments and
# should move to SFTConfig; that migration also touches the TrainingArguments
# cell, so it is left for a follow-up.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/734 [00:00<?, ? examples/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

In [15]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()



Step,Training Loss,Validation Loss
147,2.4542,2.36287
294,2.5714,2.340618
441,2.5262,2.360572
588,1.7711,2.359635


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


TrainOutput(global_step=734, training_loss=2.2276234668996744, metrics={'train_runtime': 2537.6355, 'train_samples_per_second': 0.578, 'train_steps_per_second': 0.289, 'total_flos': 3.389899681057997e+16, 'train_loss': 2.2276234668996744, 'epoch': 2.0})

In [16]:
# Close the W&B run opened by wandb.init.
wandb.finish()
# Set `use_cache` on the model config before the generation cell below.
model.config.use_cache = True

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▁▇▇
eval/runtime,▅█▁▅
eval/samples_per_second,▄▁█▄
eval/steps_per_second,▄▁█▄
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▁▂▂▂▂▂▂▁▂▁▂▂▂▂▂▂▂▂▂▂▂▄▄▃▄▅▅▄▄▄▄▄▅▆▆▅█▄█▅
train/learning_rate,▇███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁▁
train/loss,█▅▅▃▅▅▆▄▄▆▆▄▅▆▄▅▆▆▃▄▃▃▄▁▁▅▂▄▃▃▄▂▃▄▅▃▅▂▅▄

0,1
eval/loss,2.35963
eval/runtime,58.6396
eval/samples_per_second,1.398
eval/steps_per_second,1.398
total_flos,3.389899681057997e+16
train/epoch,2.0
train/global_step,734.0
train/grad_norm,1.44837
train/learning_rate,0.0
train/loss,1.8171


In [20]:
messages = [
    {
        "role": "user",
        "content": "Interviewer: That makes sense. Tailwind CSS and Material UI are great choices for ensuring a consistent and responsive design. Now, considering the backend, you mentioned using Spring Boot for its scalability and security features. What potential challenges do you foresee in integrating the new frontend with the Spring Boot backend, and how would you address them? Interviewee: So the main challenges are we if there is already a back end, like, already a back end products, So, the main challenge will be the security issues. Front end can send any data, and if the front end is not validating, then I need to validate the data at the back end level. I have to check for the integrity and for the security purpose that whether the data is coming is valid or not. Whether the user which who is requesting for the data is valid or not, So the sick yeah. The main concern will be implementing the security and so we can before, giving the access for the data, we can sanitize the data, to solve the security threats. Like, if there is any escalation type thing. So we if, we will go through the scenario sanitation process, then, the eschal injection can be prevented. And, if there is a cross site scripting attack also, then it can be prevented because it arises from the front end level. So that are the challenges and the the scalability challenge also whether the back end is, back end is implemented in such a way that it can handle at the skill level. So if, let's say there is a scenario that, if currently the, in the ecommerce store, there are thousand users and after, 2 to 3 months, the user gets increased to 1,000,000, then, our plan platform needs to be scalable. So the design, consistency need to be there and, the design the project need to be implemented in such a way that the project could be scalable when the user when the number of user gets increased or when the features when we require a new feature? 
So for that, we can, we can go for the layering structure. Like, we can create a controller layer. We can create a service layer. We can create a down layer. We can create a model layer. So following this approach, we can create a consistent design as well as we can provide some abstraction so that we can we can manage our security also, and, we can take a a, for the security, we can take a help of Spring Security. And all the features of Spring, which from which is supported by the SpringBoard."
    }
]

# Render the conversation to a prompt string that ends with the generation
# prompt, so the model continues with the assistant turn.
prompt = tokenizer.apply_chat_template(messages, tokenize=False,
                                       add_generation_prompt=True)

# Place the encoded prompt on the model's own device instead of assuming "cuda".
inputs = tokenizer(prompt, return_tensors='pt', padding=True,
                   truncation=True).to(model.device)

# Bound the number of *generated* tokens. `max_length` also counts the prompt,
# so a long interview transcript would leave little or no room for the answer.
# 512 matches the budget used by the pipeline cell further down.
outputs = model.generate(**inputs, max_new_tokens=512,
                         temperature=0.7,
                         num_return_sequences=1)

# Decode only the tokens produced after the prompt. Splitting the full decoded
# text on the word "assistant" returned the wrong slice whenever the transcript
# or the answer itself contained that word.
prompt_length = inputs["input_ids"].shape[1]
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

print(text)


The candidate demonstrates a solid understanding of backend security and scalability challenges, mentioning key concepts such as data validation, sanitization, and the importance of layering to ensure design consistency and abstraction. They accurately use technical terms like 'escalation,' 'cross-site scripting,' and 'layering structure,' indicating a working knowledge of backend concepts. However, the explanation lacks depth and concrete examples of how these concepts would be implemented in practice, and the candidate's response does not showcase deep expertise or unique insights. The explanation is somewhat disjointed and lacks concrete examples of how these concepts would be implemented in practice. The response does not showcase deep expertise or unique insights, and the candidate's explanation is somewhat disjointed and lacks concrete examples. The candidate's response is somewhat disjointed and lacks concrete examples, but they demonstrate a working knowledge of technical conc

In [21]:
# Persist the trained model held by the trainer under `new_model`, then upload
# it to the Hub under the same name (the upload result is the cell output).
trained_model = trainer.model
trained_model.save_pretrained(new_model)
trained_model.push_to_hub(new_model, use_temp_dir=False)



adapter_model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Manoj2702/llama-3-8b-judge-interview/commit/b12833f65ce40d3921b3f8ee27a439899ab0f568', commit_message='Upload model', commit_description='', oid='b12833f65ce40d3921b3f8ee27a439899ab0f568', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
# Re-declare the identifiers for the reload/merge cells that follow.
base_model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# From here on `new_model` is the local directory passed to
# PeftModel.from_pretrained, not the bare name assigned near the top.
new_model = "/content/llama-3-8b-judge-interview/"

In [24]:
import gc

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch
from trl import setup_chat_format

# The saved run of this cell ended in a CUDA out-of-memory error. It was
# executed in the same session as training, so the training objects were
# presumably still occupying GPU memory. Best effort: drop those references (if
# this kernel still has them) and release cached blocks before loading the
# half-precision base model.
for _stale_name in ("trainer", "model"):
    globals().pop(_stale_name, None)
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Reload tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(base_model)

# `trust_remote_code=True` was removed: it permits executing code shipped in
# the model repository and nothing in this notebook relies on custom model code.
base_model_reload = AutoModelForCausalLM.from_pretrained(
        base_model,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map="auto",
)

# Same chat-format setup as in the training section, so the tokenizer and
# model are prepared the same way as when the adapter was trained.
base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)

# Attach the fine-tuned adapter stored at `new_model` to the reloaded base
# model; the actual merge happens in the next cell (merge_and_unload).
model = PeftModel.from_pretrained(base_model_reload, new_model)





Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1004.00 MiB. GPU 

In [None]:
# Merge the adapter into the base model and rebind `model` to the result.
# Note: re-running this cell without re-running the previous one calls
# merge_and_unload on the already-merged object.
model = model.merge_and_unload()

In [None]:
messages = [{"role": "user", "content": """
Interviewer: Given your expertise in testing, particularly with automation, could you elaborate on the testing framework or tools you used to conduct the load testing for this scenario, and how you interpreted the results to make data-driven decisions on query optimization? Interviewee: So for, uh, load testing, we used JMeter, plus for automation testing, those framework was Selenium. And, uh, the load testing was given, uh, giving us reports, and we used to optimize the hitch according to the according to the traffic which we had. So we first got the data how much traffic we have in minute and an or in an hour. And accordingly, we replicated it through JMeter. And, uh, for testing framework, we used Selenium to automate the platform, to automate it. And after every run, it used to give us reports, how much steps are filled, how many steps are passed. So that through that, we used to check how our platform is working. In that script, we used to run hourly. So at every single hour, we used to get an idea of how our platform is working or is there any product, uh, issue production issue or bug bug being released to any to any, uh, tag deployment or something. Interviewer: Implementing an event-driven architecture for price updates is a sophisticated and effective solution. Could you elaborate on the specific eventing mechanism you used, such as Kafka topics or Google Cloud Pub/Sub, and how you ensured the reliability and scalability of this event-driven price update system, especially in terms of handling high volumes of price change events and ensuring that no updates were missed or applied incorrectly? Interviewee: So, uh, so, of course, uh, let me elaborate. So I deployed, uh, I deployed a PubSub consumer. I created a topic, and I created a subscription inside that topic. So, uh, and I set I had set the batch size to 1,000. 
So every for every 1,000 messages, it's used to, uh, it used my consumer used to consume all the data, and it used it used to do the processing. So, uh, of course, the processing speed was so high that, uh, there was no such lag in the price updated. It used to get in it used to get in, uh, we used to see the update changes in, like, milliseconds. And, uh, it definitely didn't miss any single message because once we have a message in in the pop up, the consumer was up all the time. So it used to, uh, pick the message instantly and do the updated changes. Even if there is some failure in doing the changes, so we used to we had deployed dead letters as well. So if there is a failure in doing the changes and the packet was not successfully processed, so the failure failed packet used to get, uh, you know, inserted inside the dead letters. So for failure, we I had deployed a mechanism as well. For success, we had already, uh, topic, which is handling all the, uh, packages that are coming in there. And the sub consumer was up all the time, so there was no chance of missing any any single packet. Plus, it was, uh, so, uh, it was so optimized. I had reconciled to we use that we used to see with the changes in we need for microseconds even. So this was my approach of making it more faster in real Apple, and we used to see the changes instantly. That was all.
"""}]

# Render the conversation to a prompt string ending with the generation prompt.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Text-generation pipeline built around the in-memory model and tokenizer.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Sampling settings for this generation call.
sampling_kwargs = dict(
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
)
outputs = pipe(prompt, **sampling_kwargs)
generated_text = outputs[0]["generated_text"]
print(generated_text)

In [None]:
# Write the model and its tokenizer side by side into one local directory.
# NOTE(review): this relative name equals the trainer's output_dir from the
# training section, which already holds adapter files and checkpoints —
# confirm that mixing them in one directory is intended.
merged_dir = "llama-3-8b-judge-interview"
model.save_pretrained(merged_dir)
tokenizer.save_pretrained(merged_dir)

In [None]:
# Upload the model and its tokenizer to the same Hub repository.
# NOTE(review): the adapter was pushed under this same repository name
# earlier in the notebook — confirm that both should share one repo.
hub_repo = "llama-3-8b-judge-interview"
model.push_to_hub(hub_repo, use_temp_dir=False)
tokenizer.push_to_hub(hub_repo, use_temp_dir=False)

In [None]:
%cd /content/model
!git clone --depth=1 https://github.com/ggerganov/llama.cpp.git
%cd /content/model/llama.cpp
!sed -i 's|MK_LDFLAGS   += -lcuda|MK_LDFLAGS   += -L/usr/local/nvidia/lib64 -lcuda|' Makefile
!LLAMA_CUDA=1 conda run -n base make -j > /dev/null

In [None]:
!python convert-hf-to-gguf.py /kaggle/input/fine-tuned-adapter-to-full-model/llama-3-8b-judge-interview/ \
    --outfile /content/model/llama-3-8b-judge-interview.gguf \
    --outtype f16

In [25]:
# Archive the whole training output directory recursively; the listing below
# shows that this includes the intermediate checkpoint folders.
!zip -r /content/llamaJudgeInterview.zip /content/llama-3-8b-judge-interview

  adding: content/llama-3-8b-judge-interview/ (stored 0%)
  adding: content/llama-3-8b-judge-interview/checkpoint-500/ (stored 0%)
  adding: content/llama-3-8b-judge-interview/checkpoint-500/trainer_state.json (deflated 83%)
  adding: content/llama-3-8b-judge-interview/checkpoint-500/tokenizer.json (deflated 74%)
  adding: content/llama-3-8b-judge-interview/checkpoint-500/README.md (deflated 66%)
  adding: content/llama-3-8b-judge-interview/checkpoint-500/training_args.bin (deflated 51%)
  adding: content/llama-3-8b-judge-interview/checkpoint-500/tokenizer_config.json (deflated 96%)
  adding: content/llama-3-8b-judge-interview/checkpoint-500/adapter_model.safetensors (deflated 22%)
  adding: content/llama-3-8b-judge-interview/checkpoint-500/adapter_config.json (deflated 53%)
  adding: content/llama-3-8b-judge-interview/checkpoint-500/scheduler.pt (deflated 55%)
  adding: content/llama-3-8b-judge-interview/checkpoint-500/optimizer.pt (deflated 9%)
  adding: content/llama-3-8b-judge-inte

In [27]:
# Colab-only helper module for file transfer between the runtime and the browser.
from google.colab import files
# Hand the archive created by the zip cell to the Colab download helper.
files.download("/content/llamaJudgeInterview.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>