# Install

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q trl xformers wandb datasets einops gradio sentencepiece

In [2]:
from datasets import load_dataset, Dataset
from trl import SFTTrainer
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import torch
import numpy as np
import pandas as pd
import wandb

In [None]:
# HuggingFace login -- training model/data & Pushing model
!pip install huggingface_hub
from huggingface_hub import notebook_login

notebook_login()

# Loading the Model

In [4]:
# Hugging Face Hub repo id of the checkpoint used in this notebook; it is passed
# to both AutoModelForCausalLM.from_pretrained and AutoTokenizer.from_pretrained
# in the next cell.
new_model = "LimYeri/CodeMind-Gemma-7B-QLoRA-4bit"

In [5]:
# Load the causal-LM weights and the matching tokenizer from the same repo id.
# device_map={"": 0} asks for the whole model to be placed on device index 0.
# NOTE(review): no quantization_config / dtype argument is passed here even
# though the repo name mentions "4bit" -- confirm the intended load precision.
model = AutoModelForCausalLM.from_pretrained(
    new_model,
    device_map={"": 0},
)
tokenizer = AutoTokenizer.from_pretrained(new_model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/522 [00:00<?, ?B/s]

# Create a prompt

In [6]:
def get_completion(query: str, model, tokenizer, max_new_tokens: int = 1000, device=None) -> str:
  """Wrap ``query`` in the chat-style prompt template and sample a completion.

  Args:
    query: Instruction text substituted for ``{query}`` in the template.
    model: Object exposing ``generate(**inputs, ...)``; its ``device``
      attribute (when present) decides where the inputs are moved.
    tokenizer: Callable tokenizer that also provides ``decode`` and
      ``eos_token_id``.
    max_new_tokens: Upper bound on newly generated tokens (default 1000, the
      value that used to be hard-coded).
    device: Optional explicit device for the encoded inputs. When ``None`` the
      model's own device is used, falling back to ``"cuda:0"``.

  Returns:
    The decoded first generated sequence with special tokens skipped. As the
    recorded outputs of this notebook show, this text starts with the prompt
    itself, followed by the model's answer.
  """
  if device is None:
    # Follow the model's placement instead of assuming GPU 0, so the helper
    # also works when the model was loaded on another device.
    device = getattr(model, "device", "cuda:0")

  # The template text (leading newline, two-space indentation, trailing blank
  # line) is sent to the model verbatim, so it is kept exactly as written.
  # NOTE(review): confirm this layout matches the format used for fine-tuning.
  prompt_template = """
  <start_of_turn>user
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  {query}
  <end_of_turn>\n\n<start_of_turn>model

  """
  prompt = prompt_template.format(query=query)

  encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
  model_inputs = encodeds.to(device)

  # Sampling is enabled, so repeated calls can return different text.
  # EOS is reused as the padding token id for generation.
  generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=max_new_tokens,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
  )
  return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# QLoRA fine-tuned Gemma model output

In [9]:
# Demo query 1: ask the loaded model how to approach LeetCode "Two Sum".
result = get_completion(
    query="Tell me how to solve the Leetcode Two Sum problem",
    model=model,
    tokenizer=tokenizer,
)
print(result)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



  user
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  Tell me how to solve the Leetcode Two Sum problem
  \nmodel
  
  , here's how to solve the Leetcode Two Sum problem:


**1. Understand the problem:**
   * The problem asks you to find two numbers in an array that add up to a given target.
   * The array may contain duplicates.
   * You need to return the pair of indices of the two numbers.


**2. Choose an appropriate algorithm:**
   * The most common solution is to use a dictionary to store the necessary pairs.
   * Iterate over the array only once and store the target minus each element as the key and the index of that element as the value.
   * If the key is found in the dictionary, you can retrieve the index of the two numbers and return them.


**3. Implement the solution:**
```python
def twoSum(nums, target):
    # Create a dictionary to store the pairs
    pairs = {}
    
    # Iterate over the array
    for i in

In [37]:
# Demo query 2: ask which technique fits a problem referenced by its slug.
# NOTE(review): "search-inlocated-array-ii" looks like a typo for the LeetCode
# slug "search-in-rotated-sorted-array-ii" -- confirm before re-running.
result = get_completion(
    query="What technique should I use to solve the search-inlocated-array-ii problem in leetcode?",
    model=model,
    tokenizer=tokenizer,
)
print(result)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



  user
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  What technique should I use to solve the search-inlocated-array-ii problem in leetcode?
  \nmodel
  
  , the "search-inlocated-array-ii" problem in leetcode asks for a solution to find the target value in a sorted array with duplicates. Here's the technique you can use:

**1. Binary Search:**

- This problem lends itself well to a binary search algorithm. Divide the array into halves, compare the target value with the middle element of the current half.
- If the target value is equal to the middle element, you're done. If it's less than the middle element, recurse on the left half. If it's greater than the middle element, recurse on the right half.
- This technique is efficient as it eliminates half of the remaining elements in each recursive call.

**2. Hashing:**

- For larger arrays, hashing techniques can be more effective than binary search. Create a hash table whe

In [40]:
# Demo query 3: ask for an explanation of the "Roman to Integer" problem.
result = get_completion(
    query="Please explain the problem of 'Roman to Integer'",
    model=model,
    tokenizer=tokenizer,
)
print(result)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



  user
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  Please explain the problem of 'Roman to Integer'
  \nmodel
  
  , here is an explanation of the problem of "Roman to Integer":

**Problem:**

The problem of Roman to Integer is to convert a given Roman numeral (a number written using the symbols I, V, X, L) into its corresponding integer value.

**Input:**

- A Roman numeral as a string.

**Output:**

- The integer value of the Roman numeral.

**Constraints:**

- The input string will be in the format of a valid Roman numeral.
- The input string will not contain any leading or trailing whitespace.
- The input string will not contain any repeating characters.

**Example:**

```
Input: "IV"
Output: 4

Input: "MCMXCIV"
Output: 1994
```

**Solution:**

To solve this problem, we need to create a mapping between the Roman numerals and their corresponding integer values. Then, we need to iterate over the input string, taking i

In [42]:
# Demo query 4: ask for an approach to the "word break" problem.
result = get_completion(
    query="I don't know about word break problem. please give me a approach about this problem.",
    model=model,
    tokenizer=tokenizer,
)
print(result)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



  user
  Below is an instruction that describes a task. Write a response that appropriately completes the request.
  I don't know about word break problem. please give me a approach about this problem.
  \nmodel
  
  , here is a breakdown of the word break problem and an approach to solving it:

  **Word Break Problem:**

  The word break problem is a problem that asks whether a given sequence of words can be broken down into a set of predefined words. For example, given the sequence "abcdb" and the set of predefined words ["ab", "bc", "cd", "db"], the answer would be yes, as the sequence can be broken down into the words "ab", "bc", and "cd".

**Approach:**

  * **Dynamic Programming:**
     - Create a memoization table `memo` to store the solutions for each subproblem.
     - Calculate the solution for each subproblem recursively, storing the result in `memo`.
     - If the result for a subproblem is already stored in `memo`, reuse that result instead of recalculating it.
     - Onc

In [10]:
# Load the LeetCode dataset from the Hugging Face Hub.
dataset_name = "LimYeri/LeetCode_with_Solutions"
dataset = load_dataset(dataset_name)

# Convert the train split with the dataset's own pandas exporter instead of
# handing the Dataset object to pd.DataFrame, which builds the frame by
# iterating over it row by row.
df = dataset["train"].to_pandas()

# Last expression: display the DatasetDict summary (splits, features, row counts).
dataset

Downloading readme:   0%|          | 0.00/955 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/40.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/34903 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'question_content', 'title_slug', 'tag', 'level', 'question_hints', 'content'],
        num_rows: 34903
    })
})

In [11]:
# Preview the first five rows of the train split.
df.head(n=5)

Unnamed: 0,id,question_content,title_slug,tag,level,question_hints,content
0,76,Given two strings `s` and `t` of lengths `m` a...,minimum-window-substring,"Hash Table,String,Sliding Window",Hard,Use two pointers to create a window of letters...,The problem statement involves finding the sma...
1,1844,You are given a **0-indexed** string `s` that ...,maximum-number-of-balls-in-a-box,"Hash Table,Math,Counting",Easy,Note that both lowLimit and highLimit are of s...,### Step 1: Understand the Problem\n- The task...
2,1470,Given the array `nums` consisting of `2n` elem...,tweet-counts-per-frequency,"Hash Table,Binary Search,Design,Sorting,Ordere...",Medium,,### Step 1: Understanding the Problem\nThe pro...
3,489,You are controlling a robot that is located so...,kth-smallest-instructions,"Array,Math,Dynamic Programming,Combinatorics",Hard,"There are nCr(row + column, row) possible inst...",### Problem Description\nWe are tackling probl...
4,62,There is a robot on an `m x n` grid. The robot...,unique-paths,"Math,Dynamic Programming,Combinatorics",Medium,,### Step 1: Understanding the Problem\ncool 62...
