#Install packages

In [1]:
#!pip install trl transformers accelerate git+https://github.com/huggingface/peft.git -Uqqq
#!pip install datasets bitsandbytes einops wandb -Uqqq

#Import packages

In [2]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, GenerationConfig
from peft import LoraConfig, get_peft_model, PeftConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer
import warnings
warnings.filterwarnings("ignore")




##hugging face login

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

#Load dataset

In [4]:
dataset = load_dataset("ttbui/html_alpaca")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['instruction', 'response', 'input', 'output'],
        num_rows: 636
    })
})


#Checking data consistency

If HTML tags are closed properly.

If there are any non-HTML elements within the code that should be removed.

The balance between different types of webpages (e.g., landing pages, contact forms) to ensure diversity.





In [5]:
!pip install html5lib





In [6]:

from bs4 import BeautifulSoup

# Assuming we're working with the 'train' split
html_data = dataset['train']

# Function to clean and validate HTML
def clean_html(html):
    try:
        soup = BeautifulSoup(html, 'html5lib')  # Using html5lib for lenient parsing
        cleaned_html = soup.prettify()
    except Exception as e:
        print(f"An error occurred while cleaning HTML: {e}")
        cleaned_html = html  # Keep original if error occurs
    return cleaned_html

# Function to remove script and style tags
def remove_noise(html):
    soup = BeautifulSoup(html, 'html5lib')
    for script_or_style in soup(['script', 'style']):
        script_or_style.decompose()
    return soup.prettify()

# Apply the functions to the 'output' column of the dataset
def preprocess_html(example):
    example['output'] = clean_html(example['output'])  #taking the output column of the dataset ie the html code
    example['output'] = remove_noise(example['output'])
    return example

# Apply preprocessing
my_dataset = html_data.map(preprocess_html)
#print(my_dataset[0])
# If you want to see the changes, you can compare the original and preprocessed data
for i in range(5):  # Adjust the range as needed
    print()
    print(f"Original HTML:\n{html_data['output'][i]}")
    print()
    print(f"Cleaned HTML:\n{my_dataset['output'][i]}")



Original HTML:
<html>
  <head>
    <title>My Portfolio</title>
  </head>
  <body>
    <center>
      <h1>My Portfolio</h1>
    </center>
  </body>
</html>

Cleaned HTML:
<html>
 <head>
  <title>
   My Portfolio
  </title>
 </head>
 <body>
  <center>
   <h1>
    My Portfolio
   </h1>
  </center>
 </body>
</html>


Original HTML:
<html>
<head>
    <title>My Web Page</title>
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css" integrity="sha384-JcKb8q3iqJ61gNV9KGb8thSsNjpSL0n8PARn9HuZOnIxN0hoP+VmmDGMN5t9UJ0Z" crossorigin="anonymous">
    <script src="https://code.jquery.com/jquery-3.5.1.slim.min.js" integrity="sha384-DfXdz2htPH0lsSSs5nCTpuj/zy4C+OGpamoFVy38MVBnE+IbbVYUew+OrCXaRkfj" crossorigin="anonymous"></script>
    <script src="https://cdn.jsdelivr.net/npm/popper.js@1.16.1/dist/umd/popper.min.js" integrity="sha384-9/reFTGAW83EW2RDu2S0VKaIzap3H66lZH81PoYlFhbGU+6BZp6G7niu735Sk7lN" crossorigin="anonymous"></script>
    <script src="h

##split the dataset

In [7]:
from datasets import DatasetDict
train_test_val_split = my_dataset.train_test_split(test_size=0.3, seed=42)
test_val_split = train_test_val_split['test'].train_test_split(test_size=0.5, seed=42)

dataset_dict = DatasetDict({
    'train': train_test_val_split['train'],
    'validation': test_val_split['train'],
    'test': test_val_split['test']
})


####concatenating text fields

In [8]:
def concat_fields(example):
    # Concatenate instruction and input fields if input is not None
    if example['input'] is not None:
        return {'text': example['instruction'] + ' ' + example['input']}
    else:
        return {'text': example['instruction']}

# Apply this function to the dataset
train_dataset = dataset_dict['train']
val_dataset = dataset_dict['validation']
test_dataset = dataset_dict['test']

train_dataset = train_dataset.map(concat_fields)
val_dataset   = val_dataset.map(concat_fields)
test_dataset  = test_dataset.map(concat_fields)

print(train_dataset)
print(test_dataset)
print(val_dataset)

Dataset({
    features: ['instruction', 'response', 'input', 'output', 'text'],
    num_rows: 445
})
Dataset({
    features: ['instruction', 'response', 'input', 'output', 'text'],
    num_rows: 96
})
Dataset({
    features: ['instruction', 'response', 'input', 'output', 'text'],
    num_rows: 95
})


In [9]:
print(val_dataset)

Dataset({
    features: ['instruction', 'response', 'input', 'output', 'text'],
    num_rows: 95
})


#Load the pre-trained sharded falcon-7b model

In [12]:
model_name = "ybelkada/falcon-7b-sharded-bf16" # sharded falcon-7b model

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,            # load model in 4-bit precision
    bnb_4bit_quant_type="nf4",    # pre-trained model should be quantized in 4-bit NF format
    bnb_4bit_use_double_quant=True, # Using double quantization as mentioned in QLoRA paper
    bnb_4bit_compute_dtype=torch.bfloat16, # During computation, pre-trained model should be loaded in BF16 format
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config, # Use bitsandbytes config
    device_map="auto",  # Specifying device_map="auto" so that HF Accelerate will determine which GPU to put each layer of the model on
    trust_remote_code=True, # Set trust_remote_code=True to use falcon-7b model with custom code
)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

This worked. Other techniques that I can explore:

1. dynamic quantization

2. selective and incemental loading

3. model compression

4. environment optimisation

#Dataset preprocessing

##Tokenization and Padding

steps:

1. converting the both the labels and target into tokens

2. Making each token of same length for batch processing

3. Mapping the text into numerical represenations called token id( different from text encoding techniques like BOW or Word2vec as no semantic meaning carried)

4. These token ids are used to lookup embeddings during forward pass

In [13]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) # Set trust_remote_code=True so that the custom tokenizer code is executed
tokenizer.pad_token = tokenizer.eos_token
# Setting pad_token same as eos_token ie any padded areas are treated as end  of the sequence  during training and inference, which can be beneficial for these models to understand when to stop generating text.
#also helps prevent the model from "seeing" the padding during training and generation, as it's treated as the end of the sequence.

#Fine tuning script

###setting up LORA configuration

Steps for setting up LORA configuration which is a technique for PEFT(parameter efficient fine tuning)

1. kbit training: used to reduce precision of model's training, reducing memory footprint. ie reducing the bit size of weights and activations like 8 or 4 bit instead of standard 32 or 16 bit.
smaller bit can be carried quickly--> more of them fit into memory at once--> faster computations


2. Setup LORA configuration: modifies weights matrix to a low rank format. These trainable low rank matrices are inserted in the pre-trained model that now gets trained on the subset of weights with most imapct and orginal pre-trained wieghts remain frozen , enabling fine-tuning with fewer parameters.


3. Setup training arguments

4. Initialise the trainer

Observations in the code:

1. Regularisation through dropout, scaling factor, and rank of matrices

2. Higher value of alpha and rank can imporv model's performance but at the cost of computational resources.

3. We can hyperparamter tune these through gridSearchCV and monitoring training and validation loss.

In [14]:
model = prepare_model_for_kbit_training(model)

lora_alpha = 32 # scaling factor for the weight matrices-->  scaling the influence of the learnt params during training
lora_dropout = 0.05 # dropout probability of the LoRA layers
lora_rank = 32 # dimension of the low-rank matrices--> low dim means lower complexity and fewer params

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_rank,
    bias="none",  # setting to 'none' for only training weight params instead of biases
    task_type="CAUSAL_LM",
    target_modules=[         # Setting names of layers where we want to apply lora to
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ]
)

peft_model = get_peft_model(model, peft_config)

###setting up training_arguments

In [16]:
from transformers import TrainingArguments

output_dir = "basemodel/falcon-7b-sharded-bf16-finetuned-html-code-generation"
per_device_train_batch_size = 2
gradient_accumulation_steps = 2
optim = "paged_adamw_32bit"
save_strategy = "steps"
save_steps = 20
logging_strategy = "steps"
logging_steps = 20
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 320
warmup_ratio = 0.03
lr_scheduler_type = "cosine"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_strategy=save_strategy,
    save_steps=save_steps,
    logging_strategy=logging_strategy,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    bf16=False,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
    push_to_hub=True,
    tf32=False,
    eval_strategy="steps",           # <— use “eval_strategy” instead of “evaluation_strategy”
    eval_steps=20,                   # still valid
    load_best_model_at_end=True      # still valid
)

print(training_arguments)


TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=20,
eval_strategy=IntervalStrategy.STEPS,
eval_use_gather_object=False,


**Observations in training_arguments code**:

1. MONITOR the  warmup -  A larger warmup might be beneficial if you find the model is not learning initially.


2.  LORA already acts as a form of regularization by limiting the number of parameters updated. However, you could consider increasing lora_dropout slightly if overfitting occurs.


3. Monitor the gradient norms during training, and if they're consistently lower than your threshold, consider increasing the max grad norm.

4. By setting save_steps and logging_steps to 20, the model will save checkpoints and log training information after every 20 steps. This should help in managing the frequency of saves and logs, thus optimizing the use of computational resources.

5. Small batch size, paged optimizer, gradient accumulation are important for training large models like Falcon 7B on hardware with limited memory.

6. The learning rate, scheduler, and warmup settings are crucial hyperparameters and might need tuning based on your specific task and dataset.

###Instantiate trainer

#### introducing early stopping

In [17]:
from transformers import EarlyStoppingCallback

In [18]:
import inspect
from trl import SFTTrainer

print(inspect.getfullargspec(SFTTrainer.__init__))


FullArgSpec(args=['self', 'model', 'args', 'data_collator', 'train_dataset', 'eval_dataset', 'processing_class', 'compute_loss_func', 'compute_metrics', 'callbacks', 'optimizers', 'optimizer_cls_and_kwargs', 'preprocess_logits_for_metrics', 'peft_config', 'formatting_func'], varargs=None, varkw=None, defaults=(None, None, None, None, None, None, None, None, (None, None), None, None, None, None), kwonlyargs=[], kwonlydefaults=None, annotations={'model': typing.Union[str, torch.nn.modules.module.Module, transformers.modeling_utils.PreTrainedModel], 'args': typing.Union[trl.trainer.sft_config.SFTConfig, transformers.training_args.TrainingArguments, NoneType], 'data_collator': typing.Optional[transformers.data.data_collator.DataCollator], 'train_dataset': typing.Union[datasets.arrow_dataset.Dataset, datasets.iterable_dataset.IterableDataset, NoneType], 'eval_dataset': typing.Union[datasets.arrow_dataset.Dataset, dict[str, datasets.arrow_dataset.Dataset], NoneType], 'processing_class': typi

In [19]:
from trl import SFTTrainer
from transformers import EarlyStoppingCallback

# … (assume peft_model, peft_config, tokenizer, train_dataset, val_dataset, training_arguments are already defined) …

trainer = SFTTrainer(
    model=peft_model,
    args=training_arguments,          # TrainingArguments instance
    train_dataset=train_dataset,      # must already have a "text" column
    eval_dataset=val_dataset,         # must already have a "text" column
    processing_class=tokenizer,       # tokenizer is used to tokenize/encode examples
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    peft_config=peft_config           # your PEFT configuration
    # data_collator, compute_loss_func, compute_metrics, optimizers, etc. will use defaults
)

# Cast all normalization‐type layers to float32 in‐place:
import torch
for module in trainer.model.modules():
    if module.__class__.__name__.lower().endswith("norm"):
        module.to(torch.float32)


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


#Training

problems to address:

1. Long training time

2. overfitting


3. others(GPU memory limitations)

In [20]:
peft_model.config.use_cache = True
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mkrishshandilya18[0m ([33mkrishshandilya18-pes-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
20,2.0136,1.792436
40,1.6568,1.476718
60,1.4786,1.493415
80,1.605,1.408503
100,1.2733,1.340126
120,1.2419,1.303843
140,1.1761,1.432534
160,1.1081,1.283794
180,1.163,1.302755
200,1.0335,1.278751


TrainOutput(global_step=260, training_loss=1.2755528193253738, metrics={'train_runtime': 1637.8218, 'train_samples_per_second': 0.782, 'train_steps_per_second': 0.195, 'total_flos': 1419674050928640.0, 'train_loss': 1.2755528193253738})

**steps taken for reducing model runtime**:

Credits to the blog: https://betterprogramming.pub/speed-up-llm-inference-83653aa24c47

1. device map for distrbuted training over GPUs

2. mixed precision

3. learning rate scheduling for faster convergence

4. early stopping

5. data preprcoessing- by combining textual fields--> reduced data complexity


###Saving the model

In [21]:
trainer.push_to_hub()



CommitInfo(commit_url='https://huggingface.co/PES1UG22CS292/falcon-7b-sharded-bf16-finetuned-html-code-generation/commit/7334734b5232614eb12d6982f75330c4b7b58173', commit_message='End of training', commit_description='', oid='7334734b5232614eb12d6982f75330c4b7b58173', pr_url=None, repo_url=RepoUrl('https://huggingface.co/PES1UG22CS292/falcon-7b-sharded-bf16-finetuned-html-code-generation', endpoint='https://huggingface.co', repo_type='model', repo_id='PES1UG22CS292/falcon-7b-sharded-bf16-finetuned-html-code-generation'), pr_revision=None, pr_num=None)

#Loading the original and the fine tuned model

In [23]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "ybelkada/falcon-7b-sharded-bf16"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_enable_fp32_cpu_offload=True,  # ← move the offload flag here
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

print("Model successfully loaded with 4-bit quantization + CPU offload.")


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Model successfully loaded with 4-bit quantization + CPU offload.


In [24]:
from peft import PeftModel
PEFT_MODEL = "PES1UG22CS292/falcon-7b-sharded-bf16-finetuned-html-code-generation"

# Attach the PEFT adapter to the already-loaded base model
peft_model = PeftModel.from_pretrained(
    model,        # already loaded Falcon-7B model (quantized)
    PEFT_MODEL,
    device_map="auto"  # respect quantization device placement
)

# Load tokenizer
peft_tokenizer = AutoTokenizer.from_pretrained(PEFT_MODEL, trust_remote_code=True)
peft_tokenizer.pad_token = peft_tokenizer.eos_token

print("✅ Fine-tuned PEFT adapter successfully applied.")

adapter_config.json:   0%|          | 0.00/872 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.72k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

✅ Fine-tuned PEFT adapter successfully applied.



**Why Use BLEU Score**

1. Widely Adopted
2. Easy to Compute

3. N-gram Overlap: BLEU considers the overlap of n-grams (contiguous sequences of n items from a given sample of text) between the generated text and the reference text. It counts how many n-grams in the generated text are present in the reference text.

4. Precision-Oriented: BLEU is precision-oriented, meaning it measures how many words in the generated text appear in the reference text, thus capturing fluency and adequacy to some extent.


**Alternative Metrics**

ROUGE: Often used in summarization tasks, ROUGE measures the overlap of n-grams, word sequences, and word pairs between the generated text and a set of reference texts.

Exact Match: This metric checks if the generated output is exactly the same as the reference, which can be relevant for code generation where exact syntax is important.

Custom Metrics: For HTML generation, you might develop a custom metric that evaluates whether the generated HTML is syntactically correct and renders the desired web elements correctly.