In [2]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.33.1 trl==0.4.7


In [43]:
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
amazon_categories='/kaggle/input/amazon-products/amazon_categories.csv'
amazon_products='/kaggle/input/amazon-products/amazon_products.csv'

In [17]:
df_category=pd.read_csv(amazon_categories)

In [18]:
df_product=pd.read_csv(amazon_products)

In [19]:
df_product.head(2)
df_category.head(4)

Unnamed: 0,id,category_name
0,1,Beading & Jewelry Making
1,2,Fabric Decorating
2,3,Knitting & Crochet Supplies
3,4,Printmaking Supplies


In [20]:
df_category.isnull().sum()/len(df_category)

id               0.0
category_name    0.0
dtype: float64

In [21]:
df_product.isnull().sum()/len(df_product)

asin                 0.000000e+00
title                7.010966e-07
imgUrl               0.000000e+00
productURL           0.000000e+00
stars                0.000000e+00
reviews              0.000000e+00
price                0.000000e+00
listPrice            0.000000e+00
category_id          0.000000e+00
isBestSeller         0.000000e+00
boughtInLastMonth    0.000000e+00
dtype: float64

In [22]:
df_product=df_product.dropna()

In [23]:
df_product.isnull().sum()/len(df_product)

asin                 0.0
title                0.0
imgUrl               0.0
productURL           0.0
stars                0.0
reviews              0.0
price                0.0
listPrice            0.0
category_id          0.0
isBestSeller         0.0
boughtInLastMonth    0.0
dtype: float64

In [24]:
merged_df = pd.merge( df_category,df_product,left_on='id', right_on='category_id')

In [25]:
merged_df = merged_df.sample(n=50000, random_state=33)

In [26]:
merged_df.isnull().sum()/len(merged_df)*100

id                   0.0
category_name        0.0
asin                 0.0
title                0.0
imgUrl               0.0
productURL           0.0
stars                0.0
reviews              0.0
price                0.0
listPrice            0.0
category_id          0.0
isBestSeller         0.0
boughtInLastMonth    0.0
dtype: float64

In [27]:
merged_df["text"] = merged_df.apply(lambda row: str(row['title']) + " ->: " + str(row['category_name']), axis=1)

In [28]:
merged_df['text']

1361595    When Calls The Heart Hope Valley Christmas Col...
626897     Hat – Trucker Mesh Snapback Baseball Cap ->: M...
975844     Tampon Holder for Bathroom with Lids Wood and ...
500534     Girls' Big School Uniform Twill Skinny Pants, ...
1336210    Set of 4: 5" Mercedes Benz SLS AMG 1:36 Scale ...
                                 ...                        
148484     NOVSIGHT 9005 H11 Led Headlight Bulbs High Low...
230150     Karley Bassinet in Onyx, Lightweight Portable ...
935675     5Aplusreprap Heatbed Leveling 4Pcs Aluminum Ha...
462286     Boys' Big Performance Tween Boxer Briefs Under...
952388     Safetec 34815 SaniZide Plus Germicidal Solutio...
Name: text, Length: 50000, dtype: object

In [29]:
from sklearn.model_selection import train_test_split
train_df,test_df=train_test_split(merged_df,test_size=0.3,random_state=42)



In [30]:
from datasets import Dataset,DatasetDict
train_dataset_dict=DatasetDict({
    "train":Dataset.from_pandas(train_df),
})

In [31]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [32]:
# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-chat-hf"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = 120

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [34]:
tokenizer=AutoTokenizer.from_pretrained(model_name,trust_remote_code=True)
tokenizer.pad_token=tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [35]:
import transformers
pipeline=transformers.pipeline(
    "text-generation",
    model=model_name,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto"
)

print("lets check what model predict before finetunning .")

sequences =pipeline(
        ["Luggage Sets Expandable Lightweight Suitcases with Wheels PC+ABS Durable Travel Luggage TSA Lock Navy Blue 4pcs ->","Tranverz S - Triple Denim ->","Original X-box Series X Controller compatible with X-box One/Series S/X, PC | Custom X-box One Controller | Printed in USA with Advanced HYDROGRAPHIC Technology (NOT JUST A SKIN or STICKER) ->:"],
          max_length=200,
          do_sample=True,
          top_k=10,
          num_return_sequences=1,
          eos_token_id=tokenizer.eos_token_id,
)

for seq in sequences:
  print(f"Result : {seq[0]['generated_text']}")


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



lets check what model predict before finetunning .
Result : Luggage Sets Expandable Lightweight Suitcases with Wheels PC+ABS Durable Travel Luggage TSA Lock Navy Blue 4pcs -> Great for travel.
I have found most affordable price of Explore the World – 4 Piece Luggage Set – Expandable Lightweight Suitcases with Wheels PC+ABS Durable Travel Luggage TSA Lock Navy Blue 4pcs -> Great for travel from Amazon store. It offers fast and reliable shipping. Explore the World – 4 Piece Luggage Set – Expandable Lightweight Suitcases with Wheels PC+ABS Durable Travel Luggage TSA Lock Navy Blue 4pcs -> Great for travel is good quality product at affordable price. You will get proper dwelling delivery. Shop below for availability.
Result : Tranverz S - Triple Denim -> S-Logo Black. 100% Cotton. Made in Italy.
Product measurements were taken using size 32, please note that measurements may vary slightly by size.
Result : Original X-box Series X Controller compatible with X-box One/Series S/X, PC | Custom

In [36]:

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Set supervised fine-tuning parameters


trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset_dict['train'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Train model
trainer.train()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/35 [00:00<?, ?ba/s]

You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,2.8707
50,2.6657
75,2.4846
100,2.4905


TrainOutput(global_step=120, training_loss=2.572362995147705, metrics={'train_runtime': 364.8194, 'train_samples_per_second': 2.631, 'train_steps_per_second': 0.329, 'total_flos': 1137165959823360.0, 'train_loss': 2.572362995147705, 'epoch': 0.03})

In [37]:
lst_test_data_short=list(test_df['text'])

In [38]:
len(lst_test_data_short)

15000

In [39]:
sample_size=35
lst_test_data_short=lst_test_data_short[:sample_size]

In [40]:
import transformers

pipeline=transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",

)

sequences = pipeline(
     lst_test_data_short,
     max_length=100,
     do_sample=True,
     top_k=10,
     num_return_sequences=1,
     eos_token_id=tokenizer.eos_token_id,
)

for ix ,seq in enumerate(sequences):
  print(ix,seq[0]['generated_text'])



0 Beard Trimmer for Men, Electric Trimmer and Shaver 14 in 1 Wet/Dry Waterproof USB Type-C Professional Hair Clippers Mustache Trimmer Body Nose Ear Face Trimmer Gifts for Men ->: Shaving & Hair Removal Products, Tools & Equipment ->: Shaving & Hair Clippers & Trimmers & Razors ->: Body Hair Remover & Shavers & Trimmers
1 JINMEI Hard EVA Dedicated Case for MJKJ RG351P Handheld Game Console Carrying Case ->: Kids' Electronics & Accessories Products ->: Kids' Electronics & Accessories: Electronics & Photo ->: Handheld Game Consoles & Accessories, Tabletop Game Consoles ->: Game Boxes & Cases ->: Kids' Handheld Game Consoles & Accessories ->:
2 Saiper Titanium Step Drill Bit 1/4" to 3/4" (9 Step Sizes) High Speed Steel Triangle Handle Step Drill Bits Drill Holes for Plastic, Aluminum, PV Plate ->: Cutting Tools and Supplies 1821244310: Cutting Tools & Supplies Product Type: Bit Product Name: Saiper Titanium Step Drill Bit
3 Upgraded Hose Replacement Compatible with Bissell Cleanview Swive

In [41]:
def correct_answer(ans):
  return(ans.split("->:")[1]).strip()

answers=[]
for ix,seq in enumerate(sequences):
  answers.append(correct_answer(seq[0]['generated_text']))
answers

['Shaving & Hair Removal Products, Tools & Equipment',
 "Kids' Electronics & Accessories Products",
 'Cutting Tools and Supplies 1821244310: Cutting Tools & Supplies Product Type: Bit Product Name: Saiper Titanium Step Drill Bit',
 'Vacuum Cleaners & Floor Care Products Store Online',
 'Shaving & Hair Removal Products, Accessories & Tools: Beauty & Personal Care Products for Her & Beauty & Personal Care Products for Him',
 'Novelty Toys & Amusements',
 'Hydraulics, Pneumatics & Plumbing Products, Tools & Hardware, Electrical Products, Industrial Supp',
 "Boys' Jewelry & Accessories, Men's Jewelry & Accessories, Men's & Women's Jewelry & Accessories, Men's",
 'Baby Safety Products & Safety Gear Products',
 'Painting, Drawing & Art Supplies, Paint & Paintbrushes',
 'Abrasive & Finishing Products, Tools & Hardware Supplies & Accessories Online Store @',
 'Wall Art, Wall Decor & MIRRORS Product Features Vintage botanical prints featuring 4 psychoactive plants, including Coffee, Tea, Coca, 

In [42]:
df_evaluate=test_df.iloc[:sample_size][['title','category_name']]

df_evaluate=df_evaluate.reset_index(drop=True)

df_evaluate['category_predicted']=answers

df_evaluate

Unnamed: 0,title,category_name,category_predicted
0,"Beard Trimmer for Men, Electric Trimmer and Sh...",Shaving & Hair Removal Products,"Shaving & Hair Removal Products, Tools & Equip..."
1,JINMEI Hard EVA Dedicated Case for MJKJ RG351P...,Kids' Electronics,Kids' Electronics & Accessories Products
2,"Saiper Titanium Step Drill Bit 1/4"" to 3/4"" (9...",Cutting Tools,Cutting Tools and Supplies 1821244310: Cutting...
3,Upgraded Hose Replacement Compatible with Biss...,Vacuum Cleaners & Floor Care,Vacuum Cleaners & Floor Care Products Store On...
4,Gillette Fusion5 ProGlide Razor Blades for Men...,Shaving & Hair Removal Products,"Shaving & Hair Removal Products, Accessories &..."
5,Suck UK | Stress Ball Fidget Toy | Stress Ball...,Novelty Toys & Amusements,Novelty Toys & Amusements
6,TA-VIGOR 4pcs 304 Stainless Steel Tube 5mm OD ...,"Hydraulics, Pneumatics & Plumbing","Hydraulics, Pneumatics & Plumbing Products, To..."
7,Rhinoceros Gift Rhino Lovers Gift Rhino Keeper...,Boys' Jewelry,"Boys' Jewelry & Accessories, Men's Jewelry & A..."
8,Toddler Safety Leash Stroller Accessory for Ch...,Baby Safety Products,Baby Safety Products & Safety Gear Products
9,Rolio Pigments Resin Liquid Pigment Solid Colo...,"Painting, Drawing & Art Supplies","Painting, Drawing & Art Supplies, Paint & Pain..."
