# Insight AI Codellama 3 Finetuning 

## Check GPU availability

In [None]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

## Using 4 bit quantized models for fine-tuning
SEQ_LEN = 2048
llama-3-8b 

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = True 

fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
]

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit", 
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    
)

config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.4
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/449 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.




---



## Using LoRa Low Rank Adaption model:

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, 
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2024.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


<a name="Data"></a>
### Data Prep
1. Alpaca dataset from [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned), which is a filtered version of 52K of the original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html).

2. Used eval59.8-coding-dataset

3. mbpp-coding dataset

4. mbpp-prompts dataset

### Define the prompt and handle EOS End of sentence token


In [None]:
import json
import pandas as pd

# Define file paths
file1 = "eval59.8-coding-dataset.jsonl"
file2 = "mbpp-coding.json"
file3 = "mbpp-prompts.json"

# Load eval59.8-coding-dataset.jsonl as a DataFrame
def load_jsonl(file_path):
    with open(file_path, 'r') as f:
        data = [json.loads(line) for line in f]
    return pd.DataFrame(data)

eval_dataset = load_jsonl(file1)

# Load mbpp-coding.json
with open(file2, 'r') as f:
    mbpp_coding_data = json.load(f)

mbpp_coding_df = pd.DataFrame(mbpp_coding_data)

# Load mbpp-prompts.json
with open(file3, 'r') as f:
    mbpp_prompts_data = json.load(f)

mbpp_prompts_df = pd.DataFrame(mbpp_prompts_data)

# Merge datasets

if 'id' in eval_dataset.columns and 'id' in mbpp_coding_df.columns and 'id' in mbpp_prompts_df.columns:
    dataset = eval_dataset.merge(mbpp_coding_df, on='id', how='outer').merge(mbpp_prompts_df, on='id', how='outer')
else:
    dataset = pd.concat([eval_dataset, mbpp_coding_df, mbpp_prompts_df], ignore_index=True)


## Load and format Alpaca dataset

In [None]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token 
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN 
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

Downloading readme:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

<a name="Train"></a>
### Train model
- 60 steps done
- Define batch size and learning rates
- Train for 4 epochs

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, 
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = None,
        num_train_epochs=4
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        ),
        
    )

  self.pid = os.fork()


Map (num_proc=2):   0%|          | 0/51760 [00:00<?, ? examples/s]

# Memory stats

In [None]:

gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
5.668 GB of memory reserved.


#  Actual training

In [None]:

trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.8146
2,2.2932
3,1.6895
4,1.9524
5,1.6457
6,1.6399
7,1.2177
8,1.2469
9,1.0693
10,1.1739


# Final Memory stats

In [None]:

used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

476.2261 seconds used for training.
7.94 minutes used for training.
Peak reserved memory = 8.982 GB.
Peak reserved memory for training = 3.314 GB.
Peak reserved memory % of max memory = 60.903 %.
Peak reserved memory for training % of max memory = 22.471 %.


<a name="Inference"></a>
### Test The Modal (Inference)

In [None]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    alpaca_prompt.format(
        "List the prime numbers contained within the range.", # instruction
        "1-50", #input
        "", #output
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
tokenizer.batch_decode(outputs)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nList the prime numbers contained within the range.\n\n### Input:\n1-50\n\n### Response:\n2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47<|end_of_text|>']

## TextStreamer for continuous inference

In [None]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Convert these binary numbers to decimal.", # instruction
        "1010, 1101, 1111", # input
        "", # output 
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Convert these binary numbers to decimal.

### Input:
1010, 1101, 1111

### Response:
The decimal equivalent of 1010 is 10. The decimal equivalent of 1101 is 13. The decimal equivalent of 1111 is 15.<|end_of_text|>


<a name="Save"></a>
# Saving, loading finetuned models


In [None]:
model.save_pretrained("lora_model") 

In [None]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", 
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model)


inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is a famous tall tower in Paris?", # instruction
        "", # input
        "", # output 
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


["Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat is a famous tall tower in Paris?\n\n### Input:\n\n\n### Response:\nOne of the most famous tall towers in Paris is the Eiffel Tower. It is a wrought iron tower located on the Champ de Mars in Paris, France. It was built in 1889 as the entrance to the 1889 World's Fair, and it was designed by the French engineers Gustave Eiff"]

## Saving

In [None]:
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

In [None]:
if False: model.save_pretrained_gguf("model", tokenizer,)
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")


if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

## Running The Modal

In [2]:
from typing import List, Dict
import json
import pandas as pd
from pydantic import BaseModel
from file import groq

class Column(BaseModel):
    name: str
    data_type: str
    description: str
    sample_values: List[str]
    null_count: int
    unique_count: int

class DataSchema(BaseModel):
    dataset_name: str
    columns: List[Column]
    total_columns: int
    total_rows: int

def analyze_dataset(file_path: str) -> Dict:
    """
    Read and analyze the dataset to gather detailed information about its structure.
    Returns a dictionary containing comprehensive information about the dataset.
    """
    try:
        # Read the CSV file
        df = pd.read_csv(file_path)
        
        # Initialize the analysis dictionary
        analysis = {
            "column_info": []
        }
        
        # Analyze each column
        for column in df.columns:
            # Get the data type
            dtype = str(df[column].dtype)
            
            # Get sample values (first 3 non-null values)
            samples = df[column].dropna().head(1).astype(str).tolist()
            
            # Count null values
            null_count = df[column].isnull().sum()
            
            # Count unique values
            unique_count = df[column].nunique()
            
            # Determine if datetime
            is_datetime = False
            if dtype == 'object':
                try:
                    pd.to_datetime(df[column], errors='raise')
                    is_datetime = True
                except:
                    pass
            
            # Create column info dictionary
            column_info = {
                "name": column,
                "inferred_type": "datetime" if is_datetime else dtype,
                "sample_values": samples,
                "null_count": int(null_count),
                "unique_count": int(unique_count)
            }
            
            analysis["column_info"].append(column_info)
        
        # Add dataset-level information
        analysis["total_rows"] = len(df)
        analysis["total_columns"] = len(df.columns)
        analysis["file_name"] = file_path.split("/")[-1]
        
        return analysis
        
    except Exception as e:
        raise Exception(f"Error analyzing dataset: {str(e)}")

def get_data_schema(file_path: str) -> DataSchema:
    """
    Analyze the dataset and use GROQ to provide detailed information about the data types
    and generate meaningful descriptions.
    """
    
    
    # First, analyze the dataset
    dataset_analysis = analyze_dataset(file_path)
    
    # Create a detailed prompt for GROQ
    column_details = "\n".join([
        f"Column: {col['name']}\n"
        f"Inferred Type: {col['inferred_type']}\n"
        f"Sample Values: {', '.join(col['sample_values'][:1])}\n"
        f"Null Count: {col['null_count']}\n"
        f"Unique Values: {col['unique_count']}\n"
        for col in dataset_analysis["column_info"]
    ])

    chat_completion = groq.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": f"""You are a data analyst expert. Analyze the provided dataset information and generate detailed column descriptions.
                Return the analysis in JSON format using the schema: {json.dumps(DataSchema.model_json_schema(), indent=2)}
                
                When determining column types and descriptions:
                - For ID columns: Mark as categorical with unique identifiers
                - For date/time columns: Ensure they're marked as datetime
                - For numeric columns with decimals: Note if they represent currency or percentages
                - For categorical columns: Mention if they represent categories, segments, or discrete values
                """
            },
            {
                "role": "user",
                "content": f"""Analyze the following dataset:
                File: {dataset_analysis['file_name']}
                Total Rows: {dataset_analysis['total_rows']}
                Total Columns: {dataset_analysis['total_columns']}
                
                Column Details:
                {column_details}
                
                Provide a detailed analysis of each column's type and generate meaningful descriptions
                based on the column name, sample values, and data characteristics.
                """
            }
        ],
        model="llama-3.3-70b-specdec",
        temperature=0.6,
        stream=False,
        response_format={"type": "json_object"}
    )
    
    return DataSchema.model_validate_json(chat_completion.choices[0].message.content)

def print_data_schema(schema: DataSchema):
    """Print the data schema in a detailed, organized format."""
    print(f"\nDataset Analysis: {schema.dataset_name}")
    print(f"Total Columns: {schema.total_columns}")
    print(f"Total Rows: {schema.total_rows}")
    
    print("\nDetailed Column Analysis:")
    print("=" * 100)
    
    for column in schema.columns:
        print(f"\nColumn: {column.name}")
        print(f"Type: {column.data_type}")
        print(f"Description: {column.description}")
        print("-" * 100)

def main():
    try:
        file_path = "zomato.csv"
        print(f"Analyzing dataset: {file_path}")
        
        # Get the schema with detailed analysis
        schema = get_data_schema(file_path)
        
        # Print the results
        print_data_schema(schema)
        
    except Exception as e:
        print(f"Error: {str(e)}")
main()

Analyzing dataset: zomato.csv


  pd.to_datetime(df[column], errors='raise')
  pd.to_datetime(df[column], errors='raise')
  pd.to_datetime(df[column], errors='raise')
  pd.to_datetime(df[column], errors='raise')
  pd.to_datetime(df[column], errors='raise')
  pd.to_datetime(df[column], errors='raise')
  pd.to_datetime(df[column], errors='raise')
  pd.to_datetime(df[column], errors='raise')
  pd.to_datetime(df[column], errors='raise')
  pd.to_datetime(df[column], errors='raise')
  pd.to_datetime(df[column], errors='raise')
  pd.to_datetime(df[column], errors='raise')
  pd.to_datetime(df[column], errors='raise')
  pd.to_datetime(df[column], errors='raise')
  pd.to_datetime(df[column], errors='raise')
  pd.to_datetime(df[column], errors='raise')



Dataset Analysis: zomato
Total Columns: 17
Total Rows: 51717

Detailed Column Analysis:

Column: url
Type: object
Description: Unique identifier for each restaurant in the form of a URL
----------------------------------------------------------------------------------------------------

Column: address
Type: object
Description: Physical address of the restaurant
----------------------------------------------------------------------------------------------------

Column: name
Type: object
Description: Name of the restaurant
----------------------------------------------------------------------------------------------------

Column: online_order
Type: object
Description: Whether online ordering is available or not
----------------------------------------------------------------------------------------------------

Column: book_table
Type: object
Description: Whether table booking is available or not
----------------------------------------------------------------------------------------