<a href="https://colab.research.google.com/github/Midhilesh4890/LLM-Finetuning/blob/main/Mercor_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Checking which GPU is attached (e.g. a faster GPU such as an A100)

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

## Check the RAM available in the current runtime

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

# Anything from 20 GB upwards is reported as a high-RAM runtime.
is_high_ram = ram_gb >= 20
print('You are using a high-RAM runtime!' if is_high_ram else 'Not using a high-RAM runtime')

In [None]:
import shutil

# Directory whose whole tree is removed.
folder_path = "/content/llama3"

try:
    shutil.rmtree(folder_path)
except OSError as err:
    # Best effort: report the failure (e.g. the directory does not exist) and carry on.
    print("Error: %s - %s." % (err.filename, err.strerror))

In [None]:
import os
import shutil

# Reset the workspace by deleting what is *inside* the folder rather than the
# folder itself: removing "/content/" outright would delete the notebook's working
# directory and the "/content/.env" file that a later cell loads.
folder_path = "/content/"
keep_entries = {".env"}

try:
    entries = os.listdir(folder_path)
except OSError as e:
    print("Error: %s - %s." % (e.filename, e.strerror))
    entries = []

for entry in entries:
    if entry in keep_entries:
        continue
    entry_path = os.path.join(folder_path, entry)
    # Never recurse into a mounted filesystem: rmtree would delete its contents.
    if os.path.ismount(entry_path):
        print("Skipping mount point: %s" % entry_path)
        continue
    try:
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)
    except OSError as e:
        # Best effort, as before: report the failure and continue with the rest.
        print("Error: %s - %s." % (e.filename, e.strerror))

In [None]:
# Install necessary dependencies
# (PyTorch, Hugging Face transformers / datasets / hub client, python-dotenv, numpy, tqdm)
!pip install torch transformers python-dotenv numpy datasets tqdm huggingface-hub

## Load Dataset

## Selecting an MMLU subject (config) so that performance can be benchmarked across different datasets

In [None]:
# Load the MMLU dataset with a random configuration
from datasets import load_dataset
import random
available_configs = ['high_school_european_history', 'business_ethics', 'clinical_knowledge', 'medical_genetics', 'high_school_us_history', 'high_school_physics', 'high_school_world_history', 'virology', 'high_school_microeconomics', 'econometrics', 'college_computer_science', 'high_school_biology', 'abstract_algebra', 'professional_accounting', 'philosophy', 'professional_medicine', 'nutrition', 'global_facts', 'machine_learning', 'security_studies', 'public_relations', 'professional_psychology', 'prehistory', 'anatomy', 'human_sexuality', 'college_medicine', 'high_school_government_and_politics', 'college_chemistry', 'logical_fallacies', 'high_school_geography', 'elementary_mathematics', 'human_aging', 'college_mathematics', 'high_school_psychology', 'formal_logic', 'high_school_statistics', 'international_law', 'high_school_mathematics', 'high_school_computer_science', 'conceptual_physics', 'miscellaneous', 'high_school_chemistry', 'marketing', 'professional_law', 'management', 'college_physics', 'jurisprudence', 'world_religions', 'sociology', 'us_foreign_policy', 'high_school_macroeconomics', 'computer_security', 'moral_scenarios', 'moral_disputes', 'electrical_engineering', 'astronomy', 'college_biology']
random_config = random.choice(available_configs)
print(f"Random config: {random_config}")

# Load the config that was just selected and printed. Previously 'marketing' was
# hard-coded here, so the printed config did not match the loaded data.
# To pin a subject for reproducibility, assign it to `random_config` above.
dataset = load_dataset("lukaemon/mmlu", random_config)

## Cloning the official Repo

In [None]:
# Clone the official Llama-3-8B repository
!git clone https://github.com/meta-llama/llama3.git
# Make the clone the working directory: relative paths used by later cells
# (the log file, the weight download directory) then resolve inside `llama3`.
%cd llama3

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import numpy as np
from tqdm import tqdm
import logging
import matplotlib.pyplot as plt

# Set up logging
# `force=True` removes any handlers already attached to the root logger before
# installing the file handler. Without it `basicConfig` does nothing when handlers
# already exist (commonly the case in notebook runtimes) and no log file is written.
# No `format` is given, so lines use the default layout
# ("INFO:root:<message>").
logging.basicConfig(level=logging.INFO, filename='rmsnorm_logs.txt', filemode='w', force=True)

## Modified RMSNorm class with logging


* **eps (epsilon)**: A small number added to the denominator to prevent division by zero.

*  **scale**: A learnable parameter that scales the normalized input. It's
initialized to a vector of ones with the same dimension as the input features, allowing different scaling for each feature.

* **layer_name**: This is mainly used for debugging purposes, to help identify which layer is being logged.

* **call_count**: Keeps track of how many times the forward method has been called. This is used to restrict the logging to the first 65 calls, which can help in avoiding log clutter during long training or inference sessions.

*  **RMS Calculation**: The root mean square of the input tensor is calculated along the last dimension, which typically corresponds to feature dimensions in batched input.

In [None]:
class RMSNorm(torch.nn.Module):
    """RMS normalisation layer that logs the RMS of its input for the first calls.

    The input is divided by its root mean square over the last dimension and
    multiplied by a learnable per-feature ``scale``.

    Args:
        dim: Size of the last (feature) dimension; length of ``scale``.
        layer_name: Label written to the log so entries can be traced to a layer.
        eps: Small constant added under the square root to avoid division by zero.
        max_logged_calls: Number of initial forward calls whose RMS is logged.
    """

    def __init__(self, dim, layer_name, eps=1e-8, max_logged_calls=65):
        super().__init__()
        # epsilon to avoid division by zero during normalization
        self.eps = eps
        # learnable scaling parameter initialized to ones
        self.scale = torch.nn.Parameter(torch.ones(dim))
        # name of the layer, useful for debugging and tracking
        self.layer_name = layer_name
        # counter to track the number of times the forward pass is called
        self.call_count = 0
        # only the first `max_logged_calls` forward passes are logged
        self.max_logged_calls = max_logged_calls

    def forward(self, x):
        """Return ``x / rms(x) * scale``; log the RMS while under the call limit."""
        self.call_count += 1
        # The RMS is needed on every call. Computing it only inside the logging
        # branch made every call after the limit fail with UnboundLocalError.
        rms = torch.sqrt(torch.mean(x**2, dim=-1, keepdim=True) + self.eps)
        if self.call_count <= self.max_logged_calls:
            # Log a flat Python list so each record stays on a single line
            # ("RMS: [v1, v2, ...]") instead of a multi-line tensor repr.
            rms_values = rms.detach().float().flatten().tolist()
            logging.info(
                'Layer: %s, Call: %d, RMS: %s',
                self.layer_name, self.call_count, rms_values,
            )
        # Normalize the input tensor and scale it
        return x / rms * self.scale


## Replacing original RMSNorm with Custom RMS function

**Iteration through modules**: The function iterates over all modules in the model using model.named_modules(), yielding both the name and module for every component.

**Redundant module check**: It checks if the current module is a PyTorch module, which is unnecessary since named_modules() already ensures this.

**Attribute iteration**: For each module, the function iterates through all attributes using dir(module) to identify if any are instances of torch.nn.LayerNorm.

**Identification of LayerNorm**: The function targets attributes that are torch.nn.LayerNorm instances, essential for the intended replacement with RMSNorm.

**LayerNorm replacement**: When a LayerNorm is found, it's replaced with RMSNorm, initialized with dimensions from the original LayerNorm and named by combining the parent's name with its own.

In [None]:
# Function to replace RMSNorm layers in the model
def replace_rmsnorm(model):
    """Swap the normalisation layers of ``model`` for the logging ``RMSNorm``, in place.

    A child module is replaced when it is a ``torch.nn.LayerNorm`` (as before) or
    when its class name ends with ``RMSNorm`` (e.g. the ``LlamaRMSNorm`` used by
    Hugging Face Llama models, which is not a ``LayerNorm`` subclass and was
    therefore never matched). The replacement keeps the original layer's learned
    weight, epsilon, device and dtype so the model's outputs are preserved.

    Args:
        model: The ``torch.nn.Module`` whose norm layers are replaced.
    """
    import warnings

    replaced = 0
    # Snapshot the module tree first: it is mutated while being walked.
    for name, module in list(model.named_modules()):
        # named_children() yields registered sub-modules only, which avoids
        # evaluating arbitrary attributes/properties via dir() + getattr().
        for attr_name, attr in list(module.named_children()):
            if isinstance(attr, RMSNorm):
                continue  # already replaced on an earlier call
            is_norm = isinstance(attr, torch.nn.LayerNorm) or type(attr).__name__.endswith("RMSNorm")
            if not is_norm:
                continue

            weight = getattr(attr, "weight", None)
            dim = weight.shape[0] if weight is not None else attr.normalized_shape[0]

            # LlamaRMSNorm stores its epsilon as `variance_epsilon`; LayerNorm as `eps`.
            eps = getattr(attr, "variance_epsilon", None)
            if eps is None:
                eps = getattr(attr, "eps", None)
            extra = {} if eps is None else {"eps": eps}

            new_norm = RMSNorm(dim, name + '.' + attr_name, **extra)
            if weight is not None:
                # Keep the trained scale and match the surrounding model's placement.
                new_norm = new_norm.to(device=weight.device, dtype=weight.dtype)
                with torch.no_grad():
                    new_norm.scale.copy_(weight)
            setattr(module, attr_name, new_norm)
            replaced += 1

    if replaced == 0:
        warnings.warn("replace_rmsnorm: no normalisation layers were found to replace")

## Connecting to Hugging Face using an access token

*   this token is saved in .env file in root folder



In [None]:
import os
from dotenv import load_dotenv
from huggingface_hub import HfApi, login, notebook_login

# Read HFTOKEN from the .env file into the process environment.
load_dotenv("/content/.env")
HFTOKEN = os.environ.get("HFTOKEN")

# Use the token from .env when it is present (it was previously loaded but never
# used); fall back to the interactive prompt only when no token is configured.
if HFTOKEN:
    login(token=HFTOKEN)
else:
    notebook_login()

## Download weights from the repo using Huggingface CLI

In [None]:
# Download only the files under `original/` of the model repo into ./Meta-Llama-3-8B
!huggingface-cli download meta-llama/Meta-Llama-3-8B --include "original/*" --local-dir Meta-Llama-3-8B

In [None]:
# !huggingface-cli download meta-llama/Meta-Llama-3-8B --local-dir Meta-Llama-3-8B

NOTE: If `config.json` was not downloaded together with the `original/` checkpoint files, either run the cell below or uncomment and run the command in the cell above.

In [None]:
import torch
import transformers

model_id = "meta-llama/Meta-Llama-3-8B"

# Build a text-generation pipeline for the model on the GPU, with bfloat16 weights.
# NOTE(review): `pipeline` is not referenced by any later cell visible here, while
# the next section loads the same checkpoint again - confirm this cell is needed.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    device="cuda",
    model_kwargs=dict(torch_dtype=torch.bfloat16),
)

## Load the llama3-8b model

In [None]:
# Load the Llama-3-8B model in bfloat16 precision
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

# from_pretrained leaves the weights on the CPU unless told otherwise; move the
# model to the GPU when one is available so evaluation does not run on the CPU.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Replace the original RMSNorm in the model with the modified version
# (the model is modified in place via setattr; nothing needs to be reassigned)
replace_rmsnorm(model)

**Setting Model to Evaluation Mode:** Switches layers that behave differently during training to their inference behaviour: dropout is disabled and batch normalization uses its running statistics.

**Input Preparation:** Concatenates the question and choices into a single string for the model to process.

**Tokenization:** Converts the input string into model-readable tokens, adds necessary formatting, and moves the data to the device (GPU or CPU) where the model is located.

**Prediction Generation:** The model generates an answer based on the input tokens.

**Accuracy Calculation:** The function computes the accuracy by dividing the number of correct predictions by the total number of questions.

In [None]:
from tqdm import tqdm

def evaluate_mmlu(model, tokenizer, dataset, max_new_tokens=8):
    """Return the zero-shot multiple-choice accuracy of ``model`` on ``dataset["test"]``.

    Each example is rendered as the question, the lettered choices and a trailing
    "Answer:" cue. The model continues the prompt greedily and the first
    stand-alone choice letter in the continuation is taken as its prediction.

    Args:
        model: Causal LM exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``.
        dataset: Mapping with a ``"test"`` split of MMLU examples. Examples may
            use ``question`` / ``choices`` / ``answer`` or
            ``input`` / ``A``..``D`` / ``target`` columns.
        max_new_tokens: Number of tokens generated after the prompt.

    Returns:
        Fraction of correctly answered examples, or ``0.0`` for an empty split.
    """
    import re

    def _fields(example):
        # Normalise one example to (question, choices, answer_letter).
        # NOTE(review): the input/A-D/target layout is what some MMLU mirrors are
        # believed to use - confirm against the dataset actually loaded.
        if "question" in example:
            question = example["question"]
            choices = list(example["choices"])
            answer = example["answer"]
        else:
            question = example["input"]
            choices = [example[letter] for letter in "ABCD"]
            answer = example["target"]
        if not isinstance(answer, str):
            answer = "ABCD"[int(answer)]  # integer index -> letter
        return question, choices, answer.strip().upper()

    # Set the model to evaluation mode which turns off layers like dropout
    model.eval()
    correct = 0  # Counter for correct predictions
    total = 0    # Counter for total predictions
    letter_pattern = re.compile(r"\b([A-D])\b")

    # Iterate through each example in the test set of the dataset
    for example in tqdm(dataset["test"]):
        question, choices, answer = _fields(example)
        options = "\n".join(f"{letter}. {choice}" for letter, choice in zip("ABCD", choices))
        input_text = f"{question}\n{options}\nAnswer:"

        # Tokenize the input text and convert it to a tensor suitable for the model
        inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
        prompt_length = inputs["input_ids"].shape[1]

        # max_new_tokens bounds the continuation only; the previous
        # max_length=256 also counted prompt tokens, leaving long questions no room.
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

        # Decode only the newly generated tokens: the full sequence starts with
        # the prompt and could never equal the answer label.
        completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        found = letter_pattern.search(completion)
        prediction = found.group(1) if found else None

        # Check if the predicted answer matches the correct answer
        if prediction == answer:
            correct += 1
        total += 1

    # Calculate the accuracy as the ratio of correct answers to total answers
    if total == 0:
        return 0.0
    return correct / total

# Evaluate the model and print the accuracy
accuracy = evaluate_mmlu(model, tokenizer, dataset)
# The prompt contains only the question and its choices - no in-context
# examples - so the score is zero-shot, not 5-shot as previously labelled.
print(f"MMLU zero-shot performance with logging: {accuracy:.2f}")


In [None]:
# Read the log file and extract RMS(a) values
# Each kept record becomes ['Layer: <name>', 'Call: <n>', 'RMS: <values>'].
rms_logs = []
with open('rmsnorm_logs.txt', 'r') as f:
    for line in f:
        # The logger's default format prefixes each record (e.g. "INFO:root:"), so
        # the marker is searched for anywhere in the line, not only at its start.
        marker = line.find('Layer:')
        if marker == -1:
            continue
        # Split into at most three fields so commas inside the RMS value list do
        # not break the record apart.
        rms_logs.append(line[marker:].strip().split(', ', 2))

## Data for Plotting the values for layers, call_counts and rms values

In [None]:
# Extract data for plotting
# One entry is produced per RMS value, so the three lists stay aligned
# (layer_names[i] and call_counts[i] describe rms_values[i]).
import re

# A number that is not part of an identifier such as "bfloat16".
number_pattern = re.compile(r'(?<![\w.])[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?')

layer_names = []
call_counts = []
rms_values = []
for log in rms_logs:
    if len(log) < 3:
        continue  # malformed record
    layer_name = log[0].split(': ', 1)[1]
    call_count = int(log[1].split(': ', 1)[1])
    # Re-join in case the record was split on every ", " (value lists contain commas).
    rms_text = ', '.join(log[2:]).split(': ', 1)[1]
    # Keep only the bracketed values so trailing metadata of a tensor repr
    # (device=..., dtype=...) is never mistaken for data.
    first, last = rms_text.find('['), rms_text.rfind(']')
    if first != -1 and last > first:
        rms_text = rms_text[first:last + 1]
    for number in number_pattern.findall(rms_text):
        layer_names.append(layer_name)
        call_counts.append(call_count)
        rms_values.append(float(number))

## Histograms

In [None]:
# Histogram of every parsed RMS value, across all layers.
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(rms_values, bins=50, alpha=0.75, color='blue', edgecolor='black')
ax.set_title('Histogram of RMS(a) values for all layers')
ax.set_xlabel('RMS(a) value')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

## Plotting the layers

In [None]:
# Histograms for specific layers
# Pick the first, middle and last distinct layer names found in the log instead
# of placeholder names that never match a logged layer. Assign an explicit list
# to `layers_to_plot` to plot other layers.
distinct_layers = list(dict.fromkeys(layer_names))  # de-duplicate, keep log order
if distinct_layers:
    picks = (0, len(distinct_layers) // 2, len(distinct_layers) - 1)
    layers_to_plot = list(dict.fromkeys(distinct_layers[i] for i in picks))
else:
    layers_to_plot = []

for layer in layers_to_plot:
    layer_rms_values = [rms for rms, name in zip(rms_values, layer_names) if name == layer]
    plt.figure(figsize=(12, 6))
    plt.hist(layer_rms_values, bins=50, alpha=0.75, color='blue', edgecolor='black')
    plt.title(f'Histogram of RMS(a) values for {layer}')
    plt.xlabel('RMS(a) value')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()