In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import json
import re

# Load CSV
file_path = "/kaggle/input/train.csv"
df = pd.read_csv(file_path)

# Drop rows with missing price or catalog_content 
df = df.dropna(subset=['price', 'catalog_content'])

# Shuffle dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split dataset
train_frac, val_frac = 0.8, 0.1
n = len(df)
train_end = int(n * train_frac)
val_end = train_end + int(n * val_frac)

df_train = df.iloc[:train_end]
df_val = df.iloc[train_end:val_end]
df_test = df.iloc[val_end:]

# Function to convert row to JSONL for numeric regression
# Better: string completion with newline and optional label
def row_to_jsonl(row):
    return {
        "prompt": row['catalog_content'] + "\n\n###\n\n",
        "completion": f"{float(row['price']):.2f}\n"   # just number, still string
    }
# Save JSONL files
for split_name, split_df in zip(
    ['train', 'validation', 'test'],
    [df_train, df_val, df_test]
):
    output_file = f"/content/{split_name}_llama2_numeric.jsonl"
    with open(output_file, 'w') as f:
        for _, row in split_df.iterrows():
            json.dump(row_to_jsonl(row), f)
            f.write('\n')
    print(f"{split_name}_llama2_numeric.jsonl saved with {len(split_df)} examples")


In [None]:
import shutil

files = ["train_llama2_numeric.jsonl", "validation_llama2_numeric.jsonl", "test_llama2_numeric.jsonl"]

for f in files:
    shutil.move(f"/content/{f}", f"/kaggle/working/{f}")

!ls /kaggle/working/

In [None]:
## packages for llama 3b 
!pip install -q transformers==4.44.2 peft==0.11.1 accelerate bitsandbytes==0.43.1 datasets==2.21.0

In [None]:
import torch #dont runnnnnnnnnnnnnnnnnn
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model

In [None]:
from huggingface_hub import login


login(token="hf_*****")

In [None]:
!rm -rf ~/.cache/huggingface

In [2]:
# imports
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
BASE_MODEL = "meta-llama/Llama-3.2-3B-Instruct"

# tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

# load model in fp16 (no bitsandbytes)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# LoRA config
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # adjust if these module names don't exist
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 4,587,520 || all params: 3,217,337,344 || trainable%: 0.1426


In [None]:
# 1) Uninstall bitsandbytes and triton (and optionally reinstall peft cleanly)
!pip uninstall -y bitsandbytes triton || true
!pip install -q --upgrade peft transformers accelerate

In [3]:
from datasets import load_dataset

# Load the JSONL file
dataset = load_dataset("json", data_files="train_llama2_numeric.jsonl")["train"]

print("Number of examples:", len(dataset))


Generating train split: 0 examples [00:00, ? examples/s]

Number of examples: 4000


In [4]:
def tokenize(example):
    # concatenate catalog_content + completion as string
    full_text = example["prompt"] + str(example["completion"]) + "\n"
    tokenized = tokenizer(
        full_text,
        truncation=True,
        padding="max_length",
        max_length=256  # adjust based on your average text length
    )
    # For causal LM, labels = input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_dataset = dataset.map(tokenize, batched=False)


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [5]:
from transformers import Trainer, TrainingArguments
from transformers import IntervalStrategy

training_args = TrainingArguments(
    output_dir="./llama3_price_model",
    per_device_train_batch_size=2,  # adjust to GPU memory
    gradient_accumulation_steps=2,  # effective batch size = 4
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=True,                        # mixed precision
    logging_steps=10,
    save_strategy=IntervalStrategy.EPOCH,
    save_total_limit=2,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [6]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009, 'pad_token_id': 128009}.


Step,Training Loss
10,3.8961
20,0.7695
30,0.629
40,0.5485
50,0.5107
60,0.5065
70,0.484
80,0.5035
90,0.4982
100,0.4823


TrainOutput(global_step=3000, training_loss=0.42790805967648826, metrics={'train_runtime': 2501.3941, 'train_samples_per_second': 4.797, 'train_steps_per_second': 1.199, 'total_flos': 5.2039709097984e+16, 'train_loss': 0.42790805967648826, 'epoch': 3.0})

In [None]:
#below EDA is done on differenet devices and style ( just pasting here for reference)

In [None]:
# Step 1: Install dependencies (if needed)
!pip install pandas

# Step 2: Import libraries
import pandas as pd
import re

# Step 3: Load CSV
file_path = "test.csv"  # Replace with your CSV file path
df = pd.read_csv(file_path)

# Step 4: Clean and truncate to 150 words in-place
def clean_and_truncate_words(text, max_words=150):
    if pd.isna(text):
        return ""
    text = str(text)
    # Remove 'Item Name: ' if it's at the start
    text = re.sub(r'^Item Name:\s*', '', text, flags=re.IGNORECASE)
    # Remove 'Bullet Point X:' (case-insensitive)
    text = re.sub(r'Bullet Point \d+:?', '', text, flags=re.IGNORECASE)
    # Replace multiple spaces, newlines, tabs with a single space
    text = re.sub(r'\s+', ' ', text)
    # Strip leading/trailing spaces
    text = text.strip()
    # Truncate to first max_words words
    words = text.split()
    return ' '.join(words[:max_words])

df['catalog_content'] = df['catalog_content'].apply(clean_and_truncate_words)

# Step 5: Save cleaned CSV
output_file = "/content/testCleaned.csv"
df.to_csv(output_file, index=False)

print(f"Catalog content cleaned and truncated to 150 words in-place. Saved to {output_file}")
print(df['catalog_content'].head())


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
# Load your CSV
df = pd.read_csv("train_final.csv")

# Optional: remove rows with missing prices or content
df = df.dropna(subset=["price", "catalog_content_modified"])

# Log-transform price to reduce skew (optional)
df["log_price"] = (df["price"] + 1).apply(lambda x: np.log(x))

# Bin prices into categories for stratification
df["price_bin"] = pd.qcut(df["log_price"], q=20, duplicates="drop")  # 20 bins

# Stratified sampling: pick 10k examples
df_sampled, _ = train_test_split(
    df,
    train_size=5000,
    stratify=df["price_bin"],
    random_state=42,
)
# Drop helper columns
df_sampled = df_sampled.drop(columns=["log_price", "price_bin"])

# Save sampled dataset
df_sampled.to_csv("sampled_5k.csv", index=False)
print("Sampled 5k examples saved!")
