In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/final-concatenation/concatenated_books.txt


# Packages

In [2]:
#Installing required packages
!pip install -q transformers bitsandbytes peft trl accelerate xformers wandb datasets einops

# Libraries

In [3]:
import re 

In [4]:
def remove_non_alphanumeric(text):
    # Use regex to keep alphanumeric characters, spaces, punctuation marks, and the "=" sign
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s.,!?;:()\'"-=]', '', text)
    return cleaned_text

def merge_short_paragraphs(text, word_limit=20):
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    merged_paragraphs = []
    buffer_paragraph = ""

    for paragraph in paragraphs:
        paragraph=remove_non_alphanumeric(paragraph)
        word_count = len(paragraph.split())

        if word_count < word_limit:
            buffer_paragraph += " " + paragraph
        else:
            if buffer_paragraph:
                merged_paragraphs.append(buffer_paragraph.strip() + " " + paragraph)
                buffer_paragraph = ""
            else:
                merged_paragraphs.append(paragraph)

    if buffer_paragraph:
        if merged_paragraphs:
            merged_paragraphs[-1] += " " + buffer_paragraph.strip()
        else:
            merged_paragraphs.append(buffer_paragraph.strip())

    return "\n\n".join(merged_paragraphs)

def process_file(input_file, output_file, word_limit=20):
    with open(input_file, 'r', encoding='utf-8') as f:
        text = f.read()

    processed_text = merge_short_paragraphs(text, word_limit)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(processed_text)

# Example usage
input_file = '/kaggle/input/final-concatenation/concatenated_books.txt'
output_file = 'output.txt'
process_file(input_file, output_file, word_limit=20)


# Ban gya dataset Hurray<3

In [5]:
import json
import re

# Load the text file
with open("/kaggle/working/output.txt", "r") as file:
    text = file.read()

# Split the text into paragraphs based on double newlines
paragraphs = text.split("\n\n")

# Create a dataset format
dataset = []

for paragraph in paragraphs:
    # Remove newlines within the paragraph using regex
    paragraph = re.sub(r'\n+', ' ', paragraph).strip()

    # Split the paragraph into sentences
    sentences = re.split(r'(?<=[.!?]) +', paragraph)
    
    if len(sentences) < 4:
        # For paragraphs with fewer than 2 sentences, use the whole paragraph as context and question
        dataset.append({
            "context": paragraph.strip(),
            "question": paragraph.strip(),
            "answer": paragraph.strip(),
            "text":paragraph.strip()
        })
    else:
        # Determine the split point for question and answer
        split_point = len(sentences) // 4
        
        # Create the "question" and "answer"
        question = " ".join(sentences[:split_point]).strip()
        answer = " ".join(sentences[split_point:]).strip()
        
        # Add to the dataset
        dataset.append({
            "context": paragraph.strip(),  # Optional: Use the entire paragraph as context
            "question": question,
            "answer": answer,
            "text":paragraph.strip()
        })

# Save the dataset as a JSON file
with open("qa_dataset.json", "w") as outfile:
    json.dump(dataset, outfile, indent=2)


# Working on fine tuning

In [6]:
from datasets import load_dataset

data = load_dataset("json", data_files="/kaggle/working/qa_dataset.json", split="train")

print(data[300])
print(len(data))



Generating train split: 0 examples [00:00, ? examples/s]

{'context': '(d) F I G U R E 2.51 Collapsing the circuit. Now, following the results of Section 2.3.1, or equivalently by applying Ohms law directly, we know that V i1 =  R1 + R2 (R3 +R4 ) R2 +R3 +R4 . Thus, at this point, i0 , v0 , and i1 are known. Our intuitive analysis concludes by expanding the circuit in Figure 2.51d progressively. As we expand, we determine the values of as many of the variables as we can in terms of previously computed variables. Following this process, first, the circuit in Figure 2.51c can be viewed as a voltage divider of v0 . In other words, i1 can be multiplied by each of its two resistances to determine v1 and v2 . Thus, R1', 'question': '(d) F I G U R E 2.51 Collapsing the circuit. Now, following the results of Section 2.3.1, or equivalently by applying Ohms law directly, we know that V i1 =  R1 + R2 (R3 +R4 ) R2 +R3 +R4 .', 'answer': 'Thus, at this point, i0 , v0 , and i1 are known. Our intuitive analysis concludes by expanding the circuit in Figure 2.5

In [7]:
chhota_data = data.select(range(1000))
print(chhota_data[2])

{'context': '1. Notice that Newtons laws of physics are themselves based on discretizing matter. Newtons laws describe the dynamics of discrete bodies of matter by treating them as point masses. The spatial distribution of properties within the discrete elements are ignored.', 'question': '1.', 'answer': 'Notice that Newtons laws of physics are themselves based on discretizing matter. Newtons laws describe the dynamics of discrete bodies of matter by treating them as point masses. The spatial distribution of properties within the discrete elements are ignored.', 'text': '1. Notice that Newtons laws of physics are themselves based on discretizing matter. Newtons laws describe the dynamics of discrete bodies of matter by treating them as point masses. The spatial distribution of properties within the discrete elements are ignored.'}


In [8]:
print(chhota_data[99])

{'context': '1.6 Ideal Two-Terminal Elements markings inside it, as in Figure 1.26b. If the voltage source supplies a voltage V, then we also include the V symbol inside the circle (or just outside the circle if there is not enough room to write the symbol inside). In the same manner, we might also represent an information source, such as a microphone or a sensor, as a voltage source providing a time-varying voltage v(t) at its output (Figure 1.26c). We can assume that the voltage v(t) depends solely on the microphone signal and is independent of the amount of current drawn from the terminals. ( Note that V and v(t) in Figure 1.26 are element values and not terminal variables.) We will see two types of voltage sources: independent and dependent. An independent voltage source supplies a voltage independent of the rest of the circuit. Accordingly, independent sources are a means through which inputs can be made to a circuit. Power supplies, signal generators, and microphones are examples

# Thak gya hu

In [9]:
import os
from kaggle_secrets import UserSecretsClient

# Retrieve the token from the environment variable
secret_label = "HF_TOKEN"
hf_token = UserSecretsClient().get_secret(secret_label)
from huggingface_hub import login
login(token = hf_token,add_to_git_credential=True)

Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [10]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("WANDB")


In [11]:
#importing library
from huggingface_hub import notebook_login
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig, TextStreamer
from peft import LoraConfig, get_peft_model, PeftModel
import torch, os, wandb
wandb.login(key = secret_value_0)
new_model = "Huggingface repository link"
base_model = "microsoft/phi-2"

[34m[1mwandb[0m: Currently logged in as: [33mjustharshitjaiswal14[0m ([33mhavelihunters[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [12]:
#loading the model and tokenizer
bitsandbytes= BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    base_model, device_map={"":0},
    quantization_config= bitsandbytes, trust_remote_code= True
)
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
run = wandb.init(project='Fine tuning llama3 unsloth', job_type="training", anonymous="allow")

In [14]:
print(model)


PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiSdpaAttention(
          (q_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (dense): Linear4bit(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear4bit(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear4bit(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_la

In [15]:


config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'dense'
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 10,485,760 || all params: 2,790,169,600 || trainable%: 0.3758


In [16]:
#Tokenzing the dataset
def tok(sample):
    model_inps =  tokenizer(sample["text"], padding=True)
    return model_inps
# data = load_dataset("vicgalle/alpaca-gpt4", split="train")
tokenized_training_data = chhota_data.map(tok, batched=True)
tokenized_training_data

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['context', 'question', 'answer', 'text', 'input_ids', 'attention_mask'],
    num_rows: 1000
})

In [17]:
#Training hyperparamters
training_arguments = TrainingArguments(
        output_dir="output",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        #EvaluationStrategy = "steps",
        save_strategy="epoch",
        logging_steps=30,
        max_steps=-1,
        num_train_epochs=2,
        report_to="wandb"
    )

In [18]:
trainer = Trainer(
    model=model,
    train_dataset=tokenized_training_data["input_ids"],
    args=training_arguments,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

In [None]:
#Training
trainer.train()
trainer.model.save_pretrained(new_model)
model.config.use_cache = True
model.eval()



Step,Training Loss


In [None]:
def phi_stream(prompt):
    runtimeFlag = "cuda:0"
    
    # Tokenize the input with attention mask and padding
    inputs = tokenizer(
        f'''Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{prompt}\n\n### Response:\n''', 
        return_tensors="pt", 
        return_attention_mask=True,
        padding=True
    ).to(runtimeFlag)
    
    # Set the pad_token_id to avoid warnings
    pad_token_id = tokenizer.eos_token_id
    
    streamer = TextStreamer(tokenizer, skip_prompt=True)
    
    # Generate the response
    _ = model.generate(
        **inputs, 
        streamer=streamer, 
        max_new_tokens=100,
        pad_token_id=pad_token_id
    )

# Test the function with a single task
phi_stream("computer")


In [None]:
phi_stream("p-n junction diode")

In [None]:
phi_stream("Avalanche diode ")