In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer

ModuleNotFoundError: No module named 'datasets'

## Load data


In [None]:
# Define the dataset path
dataset_path = "D:/UNI/4 kurs/GENAI/Project/.venv/amazon_dataset/"

# Load categories and products CSV files
data_cat = pd.read_csv(dataset_path + "amazon_categories.csv")
data_prod = pd.read_csv(dataset_path + "amazon_products.csv")

In [3]:
print(type(data_cat))

<class 'pandas.core.frame.DataFrame'>


In [4]:
data_cat.head(3)

Unnamed: 0,id,category_name
0,1,Beading & Jewelry Making
1,2,Fabric Decorating
2,3,Knitting & Crochet Supplies


In [5]:
data_prod.head(5)

Unnamed: 0,asin,title,imgUrl,productURL,stars,reviews,price,listPrice,category_id,isBestSeller,boughtInLastMonth
0,B014TMV5YE,"Sion Softside Expandable Roller Luggage, Black...",https://m.media-amazon.com/images/I/815dLQKYIY...,https://www.amazon.com/dp/B014TMV5YE,4.5,0,139.99,0.0,104,False,2000
1,B07GDLCQXV,Luggage Sets Expandable PC+ABS Durable Suitcas...,https://m.media-amazon.com/images/I/81bQlm7vf6...,https://www.amazon.com/dp/B07GDLCQXV,4.5,0,169.99,209.99,104,False,1000
2,B07XSCCZYG,Platinum Elite Softside Expandable Checked Lug...,https://m.media-amazon.com/images/I/71EA35zvJB...,https://www.amazon.com/dp/B07XSCCZYG,4.6,0,365.49,429.99,104,False,300
3,B08MVFKGJM,Freeform Hardside Expandable with Double Spinn...,https://m.media-amazon.com/images/I/91k6NYLQyI...,https://www.amazon.com/dp/B08MVFKGJM,4.6,0,291.59,354.37,104,False,400
4,B01DJLKZBA,Winfield 2 Hardside Expandable Luggage with Sp...,https://m.media-amazon.com/images/I/61NJoaZcP9...,https://www.amazon.com/dp/B01DJLKZBA,4.5,0,174.99,309.99,104,False,400


In [6]:
data_prod.columns

Index(['asin', 'title', 'imgUrl', 'productURL', 'stars', 'reviews', 'price',
       'listPrice', 'category_id', 'isBestSeller', 'boughtInLastMonth'],
      dtype='object')

In [7]:
len(data_prod)

1426337

Since we have over a million products, which is very large, I will be taking a subset of them to save up computational power. We are fine-tuning a pre-trained model, so we don't need a too large dataset.

### Prep data

In [8]:
# Rename 'id' column for merging
data_cat.rename(columns={'id': 'category_id'}, inplace=True)

# Merge product titles with category names
data = pd.merge(data_prod, data_cat, on='category_id', how='left')

# Keep only relevant columns
data = data[['title', 'category_name']].dropna()
data.rename(columns={'title': 'product'}, inplace=True)

In [9]:
data.head(3)

Unnamed: 0,product,category_name
0,"Sion Softside Expandable Roller Luggage, Black...",Suitcases
1,Luggage Sets Expandable PC+ABS Durable Suitcas...,Suitcases
2,Platinum Elite Softside Expandable Checked Lug...,Suitcases


In [10]:


print("PyTorch Version:", torch.__version__)
print("CUDA Available:", torch.cuda.is_available())
print("Number of GPUs:", torch.cuda.device_count())

if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))
    print("GPU Compute Capability:", torch.cuda.get_device_capability(0))
else:
    print("No GPU detected by PyTorch.")


PyTorch Version: 2.6.0+cu124
CUDA Available: True
Number of GPUs: 1
GPU Name: Quadro M2200
GPU Compute Capability: (5, 2)


In [11]:
torch.backends.cudnn.benchmark = True  # Optimize GPU computations
torch.cuda.set_device(0)  # Set GPU 0 as default
print(f"Using GPU: {torch.cuda.get_device_name(0)}")

Using GPU: Quadro M2200


### Selecting a Subset of 50K Products with Balanced Categories

In [12]:
from sklearn.utils import resample

# Find the number of categories
num_categories = data['category_name'].nunique()
print(f"Total Categories: {num_categories}")

Total Categories: 248


In [13]:
# Target: 200K samples evenly distributed across categories
target_per_category = 100000 // num_categories  # Equal samples per category
print(target_per_category)

403


In [14]:
# Create an empty dataframe
data_balanced = pd.DataFrame(columns=['product', 'category_name'])

# Loop through each category and sample evenly
for category in data['category_name'].unique():
    category_subset = data[data['category_name'] == category]
    category_sample = resample(category_subset, replace=False, n_samples=min(target_per_category, len(category_subset)), random_state=42)
    data_balanced = pd.concat([data_balanced, category_sample])

# Display the new dataset size
print(f"Balanced dataset size: {len(data_balanced):,} products")

Balanced dataset size: 94,930 products


Save the sampled dataset

In [15]:
data_balanced.to_csv("D:/UNI/4 kurs/GENAI/Project/.venv/amazon_dataset/sample_100k_products.csv", index=False)
print("Sampled dataset saved successfully!")


Sampled dataset saved successfully!


## Preprocess data

In [16]:
# Encode category labels as numbers
label_encoder = LabelEncoder()
data_balanced['category_id'] = label_encoder.fit_transform(data_balanced['category_name'])

In [17]:
# Split into train and test sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data_balanced['product'].tolist(), 
    data_balanced['category_id'].tolist(), 
    test_size=0.2, 
    random_state=42
)

In [18]:
# Convert to Hugging Face dataset format
train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels})
val_dataset = Dataset.from_dict({'text': val_texts, 'label': val_labels})

### Tokenize for BERT

In [19]:
# Load BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(["text"])
val_dataset = val_dataset.remove_columns(["text"])

print("Tokenization Complete!")


Map: 100%|██████████| 75944/75944 [00:08<00:00, 9471.02 examples/s]
Map: 100%|██████████| 18986/18986 [00:01<00:00, 9806.23 examples/s]

Tokenization Complete!





## Fine-Tune BERT for Product Categorization

In [20]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load DistilBERT with classification head
num_labels = len(label_encoder.classes_)  # Number of product categories
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=num_labels
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:


# Ensure we use the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f" Using device: {device}")  # Should print "cuda"

#  Move model to GPU
model.to(device)

#  Ensure training batch moves to GPU
def move_batch_to_device(batch):
    return {k: torch.tensor(v).to(device) for k, v in batch.items()}

train_dataset = train_dataset.map(move_batch_to_device)
val_dataset = val_dataset.map(move_batch_to_device)

print(" Model and data moved to GPU successfully!")



 Using device: cuda


Map: 100%|██████████| 75944/75944 [01:21<00:00, 926.62 examples/s] 
Map: 100%|██████████| 18986/18986 [00:20<00:00, 912.81 examples/s] 

 Model and data moved to GPU successfully!





In [22]:
torch.backends.cudnn.benchmark = True  # Optimize GPU computations
torch.cuda.set_device(0)  # Set GPU 0 as default
print(f" Using GPU: {torch.cuda.get_device_name(0)}")


 Using GPU: Quadro M2200


In [24]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3, 
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

print("Model Training Complete!")

Epoch,Training Loss,Validation Loss
1,1.3047,1.340767
2,0.8646,1.152597
3,0.7062,1.115687


Model Training Complete!


In [28]:
import numpy as np
from sklearn.metrics import accuracy_score

In [29]:
# Get model predictions
preds = trainer.predict(val_dataset)
pred_labels = np.argmax(preds.predictions, axis=1)

# Compute accuracy
accuracy = accuracy_score(val_labels, pred_labels)
print(f"✅ Model Accuracy: {accuracy:.4f}")

KeyboardInterrupt: 

In [None]:
import pickle

label_encoder_path = "D:/UNI/4 kurs/GENAI/Project/.venv/bert_product_classifier/label_encoder.pkl"
label_encoder.text.encode("cp1251").decode('cp1251').encode('utf8')
#  Save label encoder with binary mode ('wb')
with open(label_encoder_path, "wb") as f:
    pickle.dump(label_encoder, f)

print(" Label encoder saved successfully!")


: 

In [25]:


# Define the save path
model_save_path = "D:/UNI/4 kurs/GENAI/Project/.venv/bert_product_classifier"

# Save the model
model.save_pretrained(model_save_path)

# Save the tokenizer
tokenizer.save_pretrained(model_save_path)

print(f" Model and tokenizer saved successfully at: {model_save_path}")


 Model and tokenizer saved successfully at: D:/UNI/4 kurs/GENAI/Project/.venv/bert_product_classifier


In [26]:
torch.save(model.state_dict(), "D:/UNI/4 kurs/GENAI/Project/.venv/bert_model.pth")
print("Model saved as .pth file!")


Model saved as .pth file!
