In [8]:
import pandas as pd
import re
from tqdm import tqdm

def clean_catalog(text):
    if pd.isna(text):
        return ""

    # Normalize line breaks and spaces
    text = text.replace('\n', ' ').replace('\r', ' ').strip()
    
    # Split by common bullet or delimiter patterns
    parts = re.split(r'[‚Ä¢;|,‚Ä¢\-‚Äì‚óè‚óÜ‚ñ∂‚Üí\t]+', text)
    parts = [p.strip() for p in parts if len(p.strip()) > 1]

    # Try to make each specification human-readable
    clean_parts = []
    for p in parts:
        # If looks like key:value ‚Üí reformat it
        if ":" in p:
            key, val = p.split(":", 1)
            clean_parts.append(f"{key.strip()} is {val.strip()}")
        elif "=" in p:
            key, val = p.split("=", 1)
            clean_parts.append(f"{key.strip()} is {val.strip()}")
        else:
            clean_parts.append(p)

    # Join into one line, separated by commas
    clean_text = ", ".join(clean_parts)

    # Remove extra spaces
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()

    return clean_text

# === MAIN ===
print("Loading data...")
df = pd.read_csv("dataset/test.csv")

print("Cleaning catalog content...")
tqdm.pandas()
df["catalog_content_clean"] = df["catalog_content"].progress_apply(clean_catalog)

# Keep only required columns
clean_df = df[["sample_id", "catalog_content_clean"]]

# Save cleaned CSV
output_path = "test/train_clean.csv"
clean_df.to_csv(output_path, index=False)

print(f"‚úÖ Saved cleaned file as: {output_path}")
print(clean_df.head(5))


Loading data...
Cleaning catalog content...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 75000/75000 [00:05<00:00, 14912.16it/s]


‚úÖ Saved cleaned file as: test/train_clean.csv
   sample_id                              catalog_content_clean
0     100179  Item Name is Rani 14, Spice Eshamaya's Mango C...
1     245611  Item Name is Natural MILK TEA Flavoring extrac...
2     146263  Item Name is Honey Filled Hard Candy, Bulk Pac...
3      95658  Item Name is Vlasic Snack'mm's Kosher Dill 16 ...
4      36806  Item Name is McCormick Culinary Vanilla Extrac...


In [10]:
import pandas as pd
import re
from tqdm import tqdm

def clean_catalog(text):
    if pd.isna(text):
        return ""

    # Replace newlines and normalize spaces
    text = text.replace("\n", " ").replace("\r", " ").strip()

    # Remove field names like "Bullet Point 1:", "Bullet Point 2:", "Item Name:", "Value:", "Unit:" etc.
    text = re.sub(r'\b[Bb]ullet\s*[Pp]oint\s*\d*\s*:', '', text)
    text = re.sub(r'\b[Ii]tem\s*[Nn]ame\s*:', '', text)
    text = re.sub(r'\b[Vv]alue\s*:', '', text)
    text = re.sub(r'\b[Uu]nit\s*:', '', text)
    text = re.sub(r'\b[Bb]rand\s*:', '', text)
    text = re.sub(r'\b[Dd]escription\s*:', '', text)
    text = re.sub(r'\b[Ss]pecifications\s*:', '', text)

    # Split text on punctuation or bullet-like delimiters
    parts = re.split(r'[‚Ä¢;|,‚Ä¢\-‚Äì‚óè‚óÜ‚ñ∂‚Üí\t]+', text)
    parts = [p.strip() for p in parts if len(p.strip()) > 1]

    # Join into a single readable line
    clean_text = ", ".join(parts)

    # Remove redundant spaces and punctuation
    clean_text = re.sub(r'\s+', ' ', clean_text)
    clean_text = re.sub(r'\s+,', ',', clean_text)
    clean_text = clean_text.strip(' ,')

    return clean_text


# === MAIN ===
print("Loading data...")
df = pd.read_csv("dataset/test.csv")

print("Cleaning catalog_content column...")
tqdm.pandas()
df["catalog_content_clean"] = df["catalog_content"].progress_apply(clean_catalog)

# Keep only the desired columns
clean_df = df[["sample_id", "catalog_content_clean"]]

# Save the cleaned version
output_path = "test/test_clean.csv"
clean_df.to_csv(output_path, index=False)

print(f"‚úÖ Cleaned dataset saved at: {output_path}")
print(clean_df.head(5))


Loading data...
Cleaning catalog_content column...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 75000/75000 [00:14<00:00, 5170.28it/s]


‚úÖ Cleaned dataset saved at: test/test_clean.csv
   sample_id                              catalog_content_clean
0     100179  Rani 14, Spice Eshamaya's Mango Chutney (India...
1     245611  Natural MILK TEA Flavoring extract by HALO PAN...
2     146263  Honey Filled Hard Candy, Bulk Pack 2 Pounds, I...
3      95658  Vlasic Snack'mm's Kosher Dill 16 Oz (Pack of 2...
4      36806  McCormick Culinary Vanilla Extract, 32 fl oz, ...


In [11]:
import pandas as pd
import re
from tqdm import tqdm

def clean_catalog(text):
    if pd.isna(text):
        return ""

    # Replace newlines and normalize spaces
    text = text.replace("\r", " ").replace("\n", " ").strip()

    # Remove field names and common prefixes
    text = re.sub(r'\b[Bb]ullet\s*[Pp]oint\s*\d*\s*:', '', text)
    text = re.sub(r'\b[Ii]tem\s*[Nn]ame\s*:', '', text)
    text = re.sub(r'\b[Vv]alue\s*:', '', text)
    text = re.sub(r'\b[Uu]nit\s*:', '', text)
    text = re.sub(r'\b[Bb]rand\s*:', '', text)
    text = re.sub(r'\b[Dd]escription\s*:', '', text)
    text = re.sub(r'\b[Ss]pecifications\s*:', '', text)

    # Split on patterns that indicate new items or bullet content
    parts = re.split(r'(?:[.?!]\s+|\s{2,}|,?\s*[‚Ä¢\-‚Äì‚óè‚óÜ‚ñ∂‚Üí]\s*|,?\s*Bullet\s*Point\s*\d*\s*)', text)
    parts = [p.strip() for p in parts if len(p.strip()) > 1]

    # Add comma after every bullet/segment
    clean_parts = []
    for p in parts:
        if not p.endswith(","):
            p = p + ","
        clean_parts.append(p)

    # Join them together
    clean_text = " ".join(clean_parts)

    # Clean redundant commas and spaces
    clean_text = re.sub(r'\s+,', ',', clean_text)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    clean_text = clean_text.strip(' ,')

    return clean_text


# === MAIN ===
print("Loading training data...")
df = pd.read_csv("dataset/test.csv")

print("Cleaning catalog_content column...")
tqdm.pandas()
df["catalog_content_clean"] = df["catalog_content"].progress_apply(clean_catalog)

# Keep only desired columns
clean_df = df[["sample_id", "catalog_content_clean"]]

# Save to file
output_path = "test/test_clean2.csv"
clean_df.to_csv(output_path, index=False)

print(f"‚úÖ Cleaned dataset saved as: {output_path}")
print(clean_df.head(3))


Loading training data...
Cleaning catalog_content column...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 75000/75000 [00:20<00:00, 3688.72it/s]


‚úÖ Cleaned dataset saved as: test/test_clean2.csv
   sample_id                              catalog_content_clean
0     100179  Rani 14, Spice Eshamaya's Mango Chutney (India...
1     245611  Natural MILK TEA Flavoring extract by HALO PAN...
2     146263  Honey Filled Hard Candy, Bulk Pack 2 Pounds, I...


In [12]:
import pandas as pd

# Load your cleaned dataset
df = pd.read_csv("test/test_clean2.csv")

def extract_product_name(text):
    if pd.isna(text) or not isinstance(text, str):
        return ""
    
    # Split by comma and take the first segment
    first_part = text.split(",")[0].strip()

    # Remove extra spaces or stray punctuation
    first_part = first_part.strip(' .,:;')
    return first_part

# Apply the extraction
df["product_name"] = df["catalog_content_clean"].apply(extract_product_name)

# Reorder columns (optional)
df = df[["sample_id", "product_name", "catalog_content_clean"]]

# Save to a new CSV
output_path = "test/test_with_product_name.csv"
df.to_csv(output_path, index=False)

print(f"‚úÖ New dataset saved at: {output_path}")
print(df.head(5))


‚úÖ New dataset saved at: test/test_with_product_name.csv
   sample_id                                       product_name  \
0     100179                                            Rani 14   
1     245611  Natural MILK TEA Flavoring extract by HALO PAN...   
2     146263                            Honey Filled Hard Candy   
3      95658    Vlasic Snack'mm's Kosher Dill 16 Oz (Pack of 2)   
4      36806                 McCormick Culinary Vanilla Extract   

                               catalog_content_clean  
0  Rani 14, Spice Eshamaya's Mango Chutney (India...  
1  Natural MILK TEA Flavoring extract by HALO PAN...  
2  Honey Filled Hard Candy, Bulk Pack 2 Pounds, I...  
3  Vlasic Snack'mm's Kosher Dill 16 Oz (Pack of 2...  
4  McCormick Culinary Vanilla Extract, 32 fl oz, ...  


In [13]:
import pandas as pd
import re

# Load the original train.csv
df = pd.read_csv("dataset/test.csv")

# Functions to extract "Value" and "Unit" fields
def extract_value(text):
    if pd.isna(text):
        return None
    match = re.search(r'Value\s*:\s*([0-9]*\.?[0-9]+)', text)
    if match:
        return match.group(1).strip()
    return None

def extract_unit(text):
    if pd.isna(text):
        return None
    match = re.search(r'Unit\s*:\s*([A-Za-z ]+)', text)
    if match:
        return match.group(1).strip()
    return None

# Apply the extraction
df["value"] = df["catalog_content"].apply(extract_value)
df["unit"] = df["catalog_content"].apply(extract_unit)

# Keep only required columns
value_unit_df = df[["value", "unit"]]

# Save to new CSV
output_path = "test/value_unit.csv"
value_unit_df.to_csv(output_path, index=False)

print(f"‚úÖ New file saved at: {output_path}")
print(value_unit_df.head(10))


‚úÖ New file saved at: test/value_unit.csv
   value   unit
0   10.5  Ounce
1    2.0  Fl Oz
2   32.0  Ounce
3    2.0  Count
4   32.0  Fl Oz
5   16.0  Ounce
6   45.0  Ounce
7    1.0  Count
8  180.0     oz
9   18.0  Ounce


In [14]:
import pandas as pd
import re

# Load original train.csv
df = pd.read_csv("dataset/test.csv")

# Functions to extract Value and Unit
def extract_value(text):
    if pd.isna(text):
        return 0
    match = re.search(r'Value\s*:\s*([0-9]*\.?[0-9]+)', text)
    if match:
        return float(match.group(1).strip())
    return 0  # default if not found

def extract_unit(text):
    if pd.isna(text):
        return "None"
    match = re.search(r'Unit\s*:\s*([A-Za-z ]+)', text)
    if match:
        return match.group(1).strip()
    return "None"  # default if not found

# Apply extraction
df["value"] = df["catalog_content"].apply(extract_value)
df["unit"] = df["catalog_content"].apply(extract_unit)

# Keep sample_id + value + unit
value_unit_df = df[["sample_id", "value", "unit"]]

# Save to CSV
output_path = "test/value_unit_with_id.csv"
value_unit_df.to_csv(output_path, index=False)

print(f"‚úÖ New file saved at: {output_path}")
print(value_unit_df.head(10))


‚úÖ New file saved at: test/value_unit_with_id.csv
   sample_id  value   unit
0     100179   10.5  Ounce
1     245611    2.0  Fl Oz
2     146263   32.0  Ounce
3      95658    2.0  Count
4      36806   32.0  Fl Oz
5     148239   16.0  Ounce
6      92659   45.0  Ounce
7       3780    1.0  Count
8     196940  180.0     oz
9      20472   18.0  Ounce


In [15]:
import pandas as pd

# Load the CSV
df = pd.read_csv("test/value_unit_with_id.csv")

# Convert all units to lowercase
df['unit'] = df['unit'].str.lower()

# Save back to CSV (overwrite or new file)
output_path = "test/value_unit_with_id_lower.csv"
df.to_csv(output_path, index=False)

print(f"‚úÖ Units converted to lowercase and saved at: {output_path}")
print(df.head(10))


‚úÖ Units converted to lowercase and saved at: test/value_unit_with_id_lower.csv
   sample_id  value   unit
0     100179   10.5  ounce
1     245611    2.0  fl oz
2     146263   32.0  ounce
3      95658    2.0  count
4      36806   32.0  fl oz
5     148239   16.0  ounce
6      92659   45.0  ounce
7       3780    1.0  count
8     196940  180.0     oz
9      20472   18.0  ounce


In [17]:
import pandas as pd

# Load the CSV
df = pd.read_csv("dataset/value_unit_standardized.csv")

# Total number of products
total_products = len(df)

# Count how many have 'None' as unit
none_count = (df['unit'] == "None").sum()
print(f"Total products: {total_products}")
print(f"Products with missing unit (None): {none_count}")
print(f"Percentage missing unit: {none_count / total_products * 100:.2f}%\n")

# Count of products per unit type
unit_counts = df['unit'].value_counts()
print("Count of products per unit type:")
print(unit_counts)

# Optional: percentage per unit
unit_percent = df['unit'].value_counts(normalize=True) * 100
print("\nPercentage of products per unit type:")
print(unit_percent)


Total products: 75000
Products with missing unit (None): 0
Percentage missing unit: 0.00%

Count of products per unit type:
unit
ounce                   44022
count                   18207
fl oz                   11332
none                      958
pound                     202
gm                         49
ct                         49
lb                         34
each                       26
pack                       21
ml                         15
bottle                      9
liters                      7
can                         6
kg                          6
bag                         6
jar                         4
pounds                      4
per carton                  3
box                         3
k                           3
case                        3
piece                       3
product                     2
sq ft                       2
pouch                       2
per box                     2
gr                          2
tea bags                    2
f

In [16]:
import pandas as pd

# Load CSV
df = pd.read_csv("test/value_unit_with_id_lower.csv")  # Already lowercase

# Mapping of variations ‚Üí standardized units
unit_mapping = {
    "oz": "ounce",
    "ounces": "ounce",
    "fluid ounce": "fl oz",
    "fl": "fl oz",
    "fl oz": "fl oz",
    "fl ounce": "fl oz",
    "fluid ounces": "fl oz",
    "packs": "pack",
    "pack": "pack",
    "mililitro": "ml",
    "milliliter": "ml",
    "millilitre": "ml",
    "ml": "ml",
    "gram": "gm",
    "gramm": "gm",
    "grams": "gm",
    "gm": "gm",
    "none": "none"   # Keep 'none' as is
}

# Function to standardize units
def standardize_unit(unit):
    if pd.isna(unit):
        return "none"
    unit = unit.strip().lower()
    return unit_mapping.get(unit, unit)  # default to original if not in mapping

# Apply mapping
df['unit'] = df['unit'].apply(standardize_unit)

# Save cleaned CSV
output_path = "test/value_unit_standardized.csv"
df.to_csv(output_path, index=False)

print(f"‚úÖ Standardized units saved at: {output_path}")
print(df['unit'].value_counts())


‚úÖ Standardized units saved at: test/value_unit_standardized.csv
unit
ounce                                    43914
count                                    18187
fl oz                                    11406
none                                      1031
pound                                      200
ct                                          39
pack                                        36
gm                                          35
each                                        23
lb                                          18
ml                                          17
bottle                                       9
liters                                       7
bag                                          7
product                                      5
box                                          5
tea bags                                     4
foot                                         4
jar                                          4
ea                                  

In [18]:
import pandas as pd

# Load both CSVs
df_main = pd.read_csv("test/test_with_product_name.csv")  # sample_id, product_name, catalog_content_clean, price
df_value_unit = pd.read_csv("test/value_unit_standardized.csv")  # sample_id, value, unit

# Merge on sample_id (inner join keeps only matching rows)
merged_df = pd.merge(df_main, df_value_unit, on="sample_id", how="left")  # left join keeps all main rows

# Optional: reorder columns
merged_df = merged_df[["sample_id", "product_name", "catalog_content_clean", "value", "unit"]]

# Save merged CSV
output_path = "test/merged_train.csv"
merged_df.to_csv(output_path, index=False)

print(f"‚úÖ Merged dataset saved at: {output_path}")
print(merged_df.head(5))


‚úÖ Merged dataset saved at: test/merged_train.csv
   sample_id                                       product_name  \
0     100179                                            Rani 14   
1     245611  Natural MILK TEA Flavoring extract by HALO PAN...   
2     146263                            Honey Filled Hard Candy   
3      95658    Vlasic Snack'mm's Kosher Dill 16 Oz (Pack of 2)   
4      36806                 McCormick Culinary Vanilla Extract   

                               catalog_content_clean  value   unit  
0  Rani 14, Spice Eshamaya's Mango Chutney (India...   10.5  ounce  
1  Natural MILK TEA Flavoring extract by HALO PAN...    2.0  fl oz  
2  Honey Filled Hard Candy, Bulk Pack 2 Pounds, I...   32.0  ounce  
3  Vlasic Snack'mm's Kosher Dill 16 Oz (Pack of 2...    2.0  count  
4  McCormick Culinary Vanilla Extract, 32 fl oz, ...   32.0  fl oz  


In [19]:
import pandas as pd
import re

# Load your merged CSV
df = pd.read_csv("test/merged_test.csv")

# Function to remove emojis and special symbols
def remove_emojis(text):
    if pd.isna(text):
        return text
    # Remove all characters that are NOT basic printable ASCII
    return re.sub(r'[^\x00-\x7F]+', '', text)

# Apply to catalog_content_clean and product_name
df['catalog_content_clean'] = df['catalog_content_clean'].apply(remove_emojis)
df['product_name'] = df['product_name'].apply(remove_emojis)

# Save cleaned version
output_path = "test/merged_test_cleaned.csv"
df.to_csv(output_path, index=False)

print(f"‚úÖ Cleaned dataset saved to: {output_path}")


‚úÖ Cleaned dataset saved to: test/merged_test_cleaned.csv


In [20]:
import pandas as pd

# Load your merged CSV
df = pd.read_csv("test/merged_test_cleaned.csv")

# Add new column 'image' using sample_id
df['image'] = df['sample_id'].astype(str) + '.jpg'

# Save updated CSV
output_path = "test/merged_test_with_image.csv"
df.to_csv(output_path, index=False)

print(f"‚úÖ Added image column using sample_id. Saved to: {output_path}")
print(df.head(5))


‚úÖ Added image column using sample_id. Saved to: test/merged_test_with_image.csv
   sample_id                                       product_name  \
0     100179                                            Rani 14   
1     245611  Natural MILK TEA Flavoring extract by HALO PAN...   
2     146263                            Honey Filled Hard Candy   
3      95658    Vlasic Snack'mm's Kosher Dill 16 Oz (Pack of 2)   
4      36806                 McCormick Culinary Vanilla Extract   

                               catalog_content_clean  value   unit       image  
0  Rani 14, Spice Eshamaya's Mango Chutney (India...   10.5  ounce  100179.jpg  
1  Natural MILK TEA Flavoring extract by HALO PAN...    2.0  fl oz  245611.jpg  
2  Honey Filled Hard Candy, Bulk Pack 2 Pounds, I...   32.0  ounce  146263.jpg  
3  Vlasic Snack'mm's Kosher Dill 16 Oz (Pack of 2...    2.0  count   95658.jpg  
4  McCormick Culinary Vanilla Extract, 32 fl oz, ...   32.0  fl oz   36806.jpg  


In [21]:
import os
import pandas as pd

# Path to your image folder
IMAGE_FOLDER = "test_images/test_images"

# List all files in the folder
image_files = [f for f in os.listdir(IMAGE_FOLDER) if os.path.isfile(os.path.join(IMAGE_FOLDER, f))]

# Create a DataFrame
df_images = pd.DataFrame(image_files, columns=["image"])

# Save to CSV
output_path = "present_images.csv"
df_images.to_csv(output_path, index=False)

print(f"‚úÖ Saved {len(df_images)} image names to {output_path}")


‚úÖ Saved 74999 image names to present_images.csv


In [1]:
import pandas as pd

# Load the CSV files (adjust paths as needed)
first_csv_path = 'resnet_predictions.csv'  # Replace with your first CSV path
second_csv_path = 'p1.csv'  # Replace with your second CSV path (e.g., train.csv)

# Read CSVs
first_df = pd.read_csv(first_csv_path)
second_df = pd.read_csv(second_csv_path)

# Extract image names
first_images = set(first_df['image'].astype(str))  # Convert to set for efficient comparison
second_images = set(second_df['image'].astype(str))

# Find images in second CSV but not in first
missing_images = second_images - first_images

# Convert to list for output
missing_images_list = sorted(list(missing_images))

# Print results
print(f"Images in second CSV but not in first CSV ({len(missing_images_list)}):")
for img in missing_images_list:
    print(img)

# Optionally, save to a new CSV
output_df = pd.DataFrame({'image': missing_images_list})
output_df.to_csv('missing_images.csv', index=False)
print("Results saved to 'missing_images.csv'")

KeyError: 'image'

In [4]:
import pandas as pd

# Load your CSV file
df = pd.read_csv("predictions/sample_test_out.csv")

# Get data types of each column
print("Column Data Types:\n", df.dtypes)

# Separate integer and string columns
int_columns = df.select_dtypes(include=['int64', 'int32']).columns.tolist()
str_columns = df.select_dtypes(include=['object']).columns.tolist()

print("\nüßÆ Integer Columns:", int_columns)
print("\nüî§ String Columns:", str_columns)


Column Data Types:
 sample_id      int64
price        float64
dtype: object

üßÆ Integer Columns: ['sample_id']

üî§ String Columns: []


In [1]:
import torch
import torch.nn.functional as F
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer
import os
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from PIL import Image, ImageFile
import numpy as np
from pathlib import Path

# --- Hugging Face warnings fixes ---
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
os.environ["HF_HUB_DISABLE_XET_WARNING"] = "1"

# Enable loading of truncated images
ImageFile.LOAD_TRUNCATED_IMAGES = True

# --- Custom Collate Function ---
def custom_collate_fn(batch):
    images, texts, prices, sample_ids, image_paths = zip(*batch)
    return list(images), list(texts), torch.tensor(prices, dtype=torch.float32), list(sample_ids), list(image_paths)

# --- Product Dataset ---
class ProductDataset(Dataset):
    def __init__(self, df, image_folder, has_price=False):
        self.df = df
        self.image_folder = Path(image_folder)
        self.has_price = has_price  # Test dataset does not have price

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_name = str(row['image']).strip()  # remove stray spaces/tabs/newlines
        image_path = self.image_folder / image_name
        sample_id = row['sample_id']

        try:
            img = Image.open(image_path).convert('RGB')
        except Exception as e:
            print(f"‚ùå Error loading image {image_path} (sample_id: {sample_id}): {e}")
            img = Image.new('RGB', (224, 224), (0, 0, 0))  # fallback black image

        text = row.get('catalog_content_clean', "No description available")
        if pd.isna(text) or not isinstance(text, str) or text.strip() == "":
            text = "No description available"

        price = float(row['price']) if self.has_price else 0.0
        return img, text, price, sample_id, str(image_path)

# --- Embedding Extraction ---
def extract_and_save_embeddings(image_model, text_model, image_processor, text_tokenizer, data_loader, save_dir, max_length=128):
    image_model.eval()
    text_model.eval()
    os.makedirs(save_dir, exist_ok=True)
    
    # Detect last processed batch
    existing_batches = [
        int(f.split('_')[1])
        for f in os.listdir(save_dir)
        if f.startswith('batch_') and os.path.isdir(os.path.join(save_dir, f))
    ]
    start_batch = max(existing_batches) + 1 if existing_batches else 0
    print(f"üîÅ Resuming from batch {start_batch}")

    metadata_path = os.path.join(save_dir, 'metadata.csv')
    processed_sample_ids = set()
    if os.path.exists(metadata_path):
        processed_metadata = pd.read_csv(metadata_path)
        processed_sample_ids = set(processed_metadata['sample_id'])
        print(f"Found {len(processed_sample_ids)} processed samples")

    all_image_embeddings = []
    all_text_embeddings = []
    all_sample_ids = []
    all_image_paths = []

    with torch.no_grad():
        for batch_idx, (batch_images, batch_texts, batch_prices, batch_sample_ids, batch_image_paths) in enumerate(
            tqdm(data_loader, desc="Extracting embeddings")
        ):
            if batch_idx < start_batch:
                continue
            if all(sample_id in processed_sample_ids for sample_id in batch_sample_ids):
                print(f"Skipping batch {batch_idx} (already processed)")
                continue

            print(f"Processing batch {batch_idx}")

            # Image embeddings
            try:
                inputs = image_processor(images=batch_images, return_tensors="pt").to(device)
                image_emb = image_model(**inputs).last_hidden_state.cpu().numpy()
            except Exception as e:
                print(f"‚ùå Error processing image embeddings for batch {batch_idx}: {e}")
                continue

            # Text embeddings
            try:
                inputs = text_tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)
                text_emb = text_model(**inputs).last_hidden_state.cpu().numpy()
            except Exception as e:
                print(f"‚ùå Error processing text embeddings for batch {batch_idx}: {e}")
                continue

            # Save batch embeddings
            batch_save_dir = os.path.join(save_dir, f'batch_{batch_idx}')
            os.makedirs(batch_save_dir, exist_ok=True)
            torch.save(torch.tensor(image_emb), os.path.join(batch_save_dir, 'image_embeddings.pt'))
            torch.save(torch.tensor(text_emb), os.path.join(batch_save_dir, 'text_embeddings.pt'))

            all_image_embeddings.append(image_emb)
            all_text_embeddings.append(text_emb)
            all_sample_ids.extend(batch_sample_ids)
            all_image_paths.extend(batch_image_paths)

    # Save consolidated embeddings
    all_image_emb_path = os.path.join(save_dir, 'all_image_embeddings.pt')
    all_text_emb_path = os.path.join(save_dir, 'all_text_embeddings.pt')

    if all_image_embeddings:
        all_image_embeddings = np.concatenate(all_image_embeddings, axis=0)
        all_text_embeddings = np.concatenate(all_text_embeddings, axis=0)

        if os.path.exists(all_image_emb_path) and os.path.exists(all_text_emb_path):
            existing_image_emb = torch.load(all_image_emb_path).numpy()
            existing_text_emb = torch.load(all_text_emb_path).numpy()
            all_image_embeddings = np.concatenate([existing_image_emb, all_image_embeddings], axis=0)
            all_text_embeddings = np.concatenate([existing_text_emb, all_text_embeddings], axis=0)

        torch.save(torch.tensor(all_image_embeddings), all_image_emb_path)
        torch.save(torch.tensor(all_text_embeddings), all_text_emb_path)

        new_metadata = pd.DataFrame({
            'index': range(len(all_sample_ids)),
            'sample_id': all_sample_ids,
            'image_path': all_image_paths
        })

        if os.path.exists(metadata_path):
            existing_metadata = pd.read_csv(metadata_path)
            new_metadata = pd.concat([existing_metadata, new_metadata], ignore_index=True)
        new_metadata.to_csv(metadata_path, index=False)

    print(f"‚úÖ Embeddings saved to {save_dir}")
    if all_image_embeddings:
        print(f"Image embeddings shape: {all_image_embeddings.shape} ([num_samples, 257, 768])")
        print(f"Text embeddings shape: {all_text_embeddings.shape} ([num_samples, {max_length}, 384])")
    else:
        print("No new embeddings generated (all batches skipped or empty)")

# --- Main Script ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load models
image_processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base', use_fast=True)
image_model = AutoModel.from_pretrained('facebook/dinov2-base').to(device)
print("DINOv2 model initialized.")

text_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L12-v2')
text_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L12-v2').to(device)
print("MiniLM model initialized.")

# Paths
EMBEDDINGS_DIR = './test_embeddings/'
os.makedirs(EMBEDDINGS_DIR, exist_ok=True)

IMAGE_FOLDER = 'test_images/test_images'  # Folder with all images
df = pd.read_csv("test/merged_test_with_image.csv")
print(f"Loaded {len(df)} rows")
print("Columns in CSV:", df.columns.tolist())

# Create dataset and dataloader
dataset = ProductDataset(df, IMAGE_FOLDER, has_price=False)
data_loader = DataLoader(dataset, batch_size=100, shuffle=False, num_workers=0, pin_memory=True, collate_fn=custom_collate_fn)
print("DataLoader ready.")

# Extract and save embeddings
extract_and_save_embeddings(
    image_model, text_model, image_processor, text_tokenizer,
    data_loader, EMBEDDINGS_DIR, max_length=128
)


'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /facebook/dinov2-base/resolve/main/preprocessor_config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001FBD0F7E3E0>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 5f693e30-5ff4-4e62-9054-2ace0f1ab1df)')' thrown while requesting HEAD https://huggingface.co/facebook/dinov2-base/resolve/main/preprocessor_config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /facebook/dinov2-base/resolve/main/preprocessor_config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001FBD0ED1D50>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 603267a4-82bd-4ba9-9438-bc0a1ba0c52b)')' thrown while requesting HEAD https://huggingface.co/facebook/dinov2-b

DINOv2 model initialized.
MiniLM model initialized.
Loaded 74999 rows
Columns in CSV: ['sample_id', 'product_name', 'catalog_content_clean', 'value', 'unit', 'image']
DataLoader ready.
üîÅ Resuming from batch 0


Extracting embeddings:   0%|          | 0/750 [00:00<?, ?it/s]

Processing batch 0


Extracting embeddings:   0%|          | 1/750 [00:12<2:30:08, 12.03s/it]

Processing batch 1


Extracting embeddings:   0%|          | 2/750 [00:26<2:50:14, 13.66s/it]

Processing batch 2


Extracting embeddings:   0%|          | 3/750 [00:35<2:23:55, 11.56s/it]

Processing batch 3


Extracting embeddings:   1%|          | 4/750 [00:46<2:20:07, 11.27s/it]

Processing batch 4


Extracting embeddings:   1%|          | 5/750 [00:58<2:21:48, 11.42s/it]

Processing batch 5


Extracting embeddings:   1%|          | 6/750 [01:09<2:19:30, 11.25s/it]

Processing batch 6


Extracting embeddings:   1%|          | 7/750 [01:21<2:23:13, 11.57s/it]

Processing batch 7


Extracting embeddings:   1%|          | 8/750 [01:32<2:19:53, 11.31s/it]

Processing batch 8


Extracting embeddings:   1%|          | 9/750 [01:43<2:17:48, 11.16s/it]

Processing batch 9


Extracting embeddings:   1%|‚ñè         | 10/750 [01:53<2:15:43, 11.00s/it]

Processing batch 10


Extracting embeddings:   1%|‚ñè         | 11/750 [02:05<2:17:50, 11.19s/it]

Processing batch 11


Extracting embeddings:   2%|‚ñè         | 12/750 [02:15<2:13:37, 10.86s/it]

Processing batch 12


Extracting embeddings:   2%|‚ñè         | 13/750 [02:26<2:14:50, 10.98s/it]

Processing batch 13


Extracting embeddings:   2%|‚ñè         | 14/750 [02:35<2:05:35, 10.24s/it]

Processing batch 14


Extracting embeddings:   2%|‚ñè         | 15/750 [02:42<1:55:20,  9.42s/it]

Processing batch 15


Extracting embeddings:   2%|‚ñè         | 16/750 [02:50<1:47:16,  8.77s/it]

Processing batch 16


Extracting embeddings:   2%|‚ñè         | 17/750 [02:57<1:40:29,  8.23s/it]

Processing batch 17


Extracting embeddings:   2%|‚ñè         | 18/750 [03:04<1:37:42,  8.01s/it]

Processing batch 18


Extracting embeddings:   3%|‚ñé         | 19/750 [03:12<1:35:39,  7.85s/it]

Processing batch 19


Extracting embeddings:   3%|‚ñé         | 20/750 [03:19<1:34:20,  7.75s/it]

Processing batch 20


Extracting embeddings:   3%|‚ñé         | 21/750 [03:27<1:34:13,  7.76s/it]

Processing batch 21


Extracting embeddings:   3%|‚ñé         | 22/750 [03:34<1:33:34,  7.71s/it]

Processing batch 22


Extracting embeddings:   3%|‚ñé         | 23/750 [03:42<1:34:25,  7.79s/it]

Processing batch 23


Extracting embeddings:   3%|‚ñé         | 24/750 [03:49<1:31:44,  7.58s/it]

Processing batch 24


Extracting embeddings:   3%|‚ñé         | 25/750 [03:57<1:31:11,  7.55s/it]

Processing batch 25


Extracting embeddings:   3%|‚ñé         | 26/750 [04:04<1:29:52,  7.45s/it]

Processing batch 26


Extracting embeddings:   4%|‚ñé         | 27/750 [04:11<1:28:45,  7.37s/it]

Processing batch 27


Extracting embeddings:   4%|‚ñé         | 28/750 [04:18<1:27:52,  7.30s/it]

Processing batch 28


Extracting embeddings:   4%|‚ñç         | 29/750 [04:26<1:28:20,  7.35s/it]

Processing batch 29


Extracting embeddings:   4%|‚ñç         | 30/750 [04:33<1:28:47,  7.40s/it]

Processing batch 30


Extracting embeddings:   4%|‚ñç         | 31/750 [04:41<1:29:48,  7.49s/it]

Processing batch 31


Extracting embeddings:   4%|‚ñç         | 32/750 [04:49<1:29:57,  7.52s/it]

Processing batch 32


Extracting embeddings:   4%|‚ñç         | 33/750 [04:56<1:29:00,  7.45s/it]

Processing batch 33


Extracting embeddings:   5%|‚ñç         | 34/750 [05:04<1:29:33,  7.51s/it]

Processing batch 34


Extracting embeddings:   5%|‚ñç         | 35/750 [05:12<1:30:40,  7.61s/it]

Processing batch 35


Extracting embeddings:   5%|‚ñç         | 36/750 [05:20<1:33:20,  7.84s/it]

Processing batch 36


Extracting embeddings:   5%|‚ñç         | 37/750 [05:28<1:32:36,  7.79s/it]

Processing batch 37


Extracting embeddings:   5%|‚ñå         | 38/750 [05:36<1:33:40,  7.89s/it]

Processing batch 38


Extracting embeddings:   5%|‚ñå         | 39/750 [05:44<1:33:38,  7.90s/it]

Processing batch 39


Extracting embeddings:   5%|‚ñå         | 40/750 [05:51<1:32:03,  7.78s/it]

Processing batch 40


Extracting embeddings:   5%|‚ñå         | 41/750 [05:58<1:30:03,  7.62s/it]

Processing batch 41


Extracting embeddings:   6%|‚ñå         | 42/750 [06:06<1:28:40,  7.51s/it]

Processing batch 42


Extracting embeddings:   6%|‚ñå         | 43/750 [06:13<1:29:00,  7.55s/it]

Processing batch 43


Extracting embeddings:   6%|‚ñå         | 44/750 [06:21<1:28:01,  7.48s/it]

Processing batch 44


Extracting embeddings:   6%|‚ñå         | 45/750 [06:28<1:28:03,  7.49s/it]

Processing batch 45


Extracting embeddings:   6%|‚ñå         | 46/750 [06:36<1:28:43,  7.56s/it]

Processing batch 46


Extracting embeddings:   6%|‚ñã         | 47/750 [06:44<1:28:59,  7.59s/it]

Processing batch 47


Extracting embeddings:   6%|‚ñã         | 48/750 [06:51<1:28:24,  7.56s/it]

Processing batch 48


Extracting embeddings:   7%|‚ñã         | 49/750 [06:58<1:26:23,  7.39s/it]

Processing batch 49


Extracting embeddings:   7%|‚ñã         | 50/750 [07:05<1:25:40,  7.34s/it]

Processing batch 50


Extracting embeddings:   7%|‚ñã         | 51/750 [07:12<1:24:19,  7.24s/it]

Processing batch 51


Extracting embeddings:   7%|‚ñã         | 52/750 [07:20<1:26:09,  7.41s/it]

Processing batch 52


Extracting embeddings:   7%|‚ñã         | 53/750 [07:27<1:24:45,  7.30s/it]

Processing batch 53


Extracting embeddings:   7%|‚ñã         | 54/750 [07:35<1:25:26,  7.37s/it]

Processing batch 54


Extracting embeddings:   7%|‚ñã         | 55/750 [07:42<1:25:27,  7.38s/it]

Processing batch 55


Extracting embeddings:   7%|‚ñã         | 56/750 [07:50<1:27:46,  7.59s/it]

Processing batch 56


Extracting embeddings:   8%|‚ñä         | 57/750 [07:58<1:27:25,  7.57s/it]

Processing batch 57


Extracting embeddings:   8%|‚ñä         | 58/750 [08:07<1:32:01,  7.98s/it]

Processing batch 58


Extracting embeddings:   8%|‚ñä         | 59/750 [08:17<1:40:58,  8.77s/it]

Processing batch 59


Extracting embeddings:   8%|‚ñä         | 60/750 [08:27<1:44:17,  9.07s/it]

Processing batch 60


Extracting embeddings:   8%|‚ñä         | 61/750 [08:37<1:48:19,  9.43s/it]

Processing batch 61


Extracting embeddings:   8%|‚ñä         | 62/750 [08:46<1:47:27,  9.37s/it]

Processing batch 62


Extracting embeddings:   8%|‚ñä         | 63/750 [08:55<1:46:08,  9.27s/it]

Processing batch 63


Extracting embeddings:   9%|‚ñä         | 64/750 [09:05<1:48:30,  9.49s/it]

Processing batch 64


Extracting embeddings:   9%|‚ñä         | 65/750 [09:15<1:48:08,  9.47s/it]

Processing batch 65


Extracting embeddings:   9%|‚ñâ         | 66/750 [09:22<1:41:17,  8.88s/it]

Processing batch 66


Extracting embeddings:   9%|‚ñâ         | 67/750 [09:31<1:39:28,  8.74s/it]

Processing batch 67


Extracting embeddings:   9%|‚ñâ         | 68/750 [09:39<1:36:04,  8.45s/it]

Processing batch 68


Extracting embeddings:   9%|‚ñâ         | 69/750 [09:46<1:31:54,  8.10s/it]

Processing batch 69


Extracting embeddings:   9%|‚ñâ         | 70/750 [09:54<1:32:01,  8.12s/it]

Processing batch 70


Extracting embeddings:   9%|‚ñâ         | 71/750 [10:03<1:36:11,  8.50s/it]

Processing batch 71


Extracting embeddings:  10%|‚ñâ         | 72/750 [10:12<1:35:14,  8.43s/it]

Processing batch 72


Extracting embeddings:  10%|‚ñâ         | 73/750 [10:20<1:35:17,  8.45s/it]

Processing batch 73


Extracting embeddings:  10%|‚ñâ         | 74/750 [10:28<1:32:43,  8.23s/it]

Processing batch 74


Extracting embeddings:  10%|‚ñà         | 75/750 [10:36<1:31:46,  8.16s/it]

Processing batch 75


Extracting embeddings:  10%|‚ñà         | 76/750 [10:45<1:33:30,  8.32s/it]

Processing batch 76


Extracting embeddings:  10%|‚ñà         | 77/750 [10:53<1:31:54,  8.19s/it]

Processing batch 77


Extracting embeddings:  10%|‚ñà         | 78/750 [11:00<1:29:17,  7.97s/it]

Processing batch 78


Extracting embeddings:  11%|‚ñà         | 79/750 [11:08<1:29:04,  7.96s/it]

Processing batch 79


Extracting embeddings:  11%|‚ñà         | 80/750 [11:15<1:27:26,  7.83s/it]

Processing batch 80


Extracting embeddings:  11%|‚ñà         | 81/750 [11:23<1:27:54,  7.88s/it]

Processing batch 81


Extracting embeddings:  11%|‚ñà         | 82/750 [11:31<1:26:51,  7.80s/it]

Processing batch 82


Extracting embeddings:  11%|‚ñà         | 83/750 [11:39<1:25:44,  7.71s/it]

Processing batch 83


Extracting embeddings:  11%|‚ñà         | 84/750 [11:46<1:24:32,  7.62s/it]

Processing batch 84


Extracting embeddings:  11%|‚ñà‚ñè        | 85/750 [11:54<1:24:56,  7.66s/it]

Processing batch 85


Extracting embeddings:  11%|‚ñà‚ñè        | 86/750 [12:01<1:23:11,  7.52s/it]

Processing batch 86


Extracting embeddings:  12%|‚ñà‚ñè        | 87/750 [12:08<1:22:48,  7.49s/it]

Processing batch 87


Extracting embeddings:  12%|‚ñà‚ñè        | 88/750 [12:16<1:22:53,  7.51s/it]

Processing batch 88


Extracting embeddings:  12%|‚ñà‚ñè        | 89/750 [12:23<1:21:47,  7.42s/it]

Processing batch 89


Extracting embeddings:  12%|‚ñà‚ñè        | 90/750 [12:31<1:22:05,  7.46s/it]

Processing batch 90


Extracting embeddings:  12%|‚ñà‚ñè        | 91/750 [12:39<1:23:28,  7.60s/it]

Processing batch 91


Extracting embeddings:  12%|‚ñà‚ñè        | 92/750 [12:46<1:23:30,  7.61s/it]

Processing batch 92


Extracting embeddings:  12%|‚ñà‚ñè        | 93/750 [12:53<1:21:35,  7.45s/it]

Processing batch 93


Extracting embeddings:  13%|‚ñà‚ñé        | 94/750 [13:01<1:22:01,  7.50s/it]

Processing batch 94


Extracting embeddings:  13%|‚ñà‚ñé        | 95/750 [13:09<1:22:47,  7.58s/it]

Processing batch 95


Extracting embeddings:  13%|‚ñà‚ñé        | 96/750 [13:16<1:22:30,  7.57s/it]

Processing batch 96


Extracting embeddings:  13%|‚ñà‚ñé        | 97/750 [13:23<1:21:14,  7.46s/it]

Processing batch 97


Extracting embeddings:  13%|‚ñà‚ñé        | 98/750 [13:31<1:20:19,  7.39s/it]

Processing batch 98


Extracting embeddings:  13%|‚ñà‚ñé        | 99/750 [13:38<1:20:22,  7.41s/it]

Processing batch 99


Extracting embeddings:  13%|‚ñà‚ñé        | 100/750 [13:45<1:19:24,  7.33s/it]

Processing batch 100


Extracting embeddings:  13%|‚ñà‚ñé        | 101/750 [13:53<1:19:45,  7.37s/it]

Processing batch 101


Extracting embeddings:  14%|‚ñà‚ñé        | 102/750 [14:01<1:20:54,  7.49s/it]

Processing batch 102


Extracting embeddings:  14%|‚ñà‚ñé        | 103/750 [14:08<1:21:00,  7.51s/it]

Processing batch 103


Extracting embeddings:  14%|‚ñà‚ñç        | 104/750 [14:15<1:19:23,  7.37s/it]

Processing batch 104


Extracting embeddings:  14%|‚ñà‚ñç        | 105/750 [14:23<1:20:31,  7.49s/it]

Processing batch 105


Extracting embeddings:  14%|‚ñà‚ñç        | 106/750 [14:31<1:22:02,  7.64s/it]

Processing batch 106


Extracting embeddings:  14%|‚ñà‚ñç        | 107/750 [14:39<1:22:17,  7.68s/it]

Processing batch 107


Extracting embeddings:  14%|‚ñà‚ñç        | 108/750 [14:47<1:22:54,  7.75s/it]

Processing batch 108


Extracting embeddings:  15%|‚ñà‚ñç        | 109/750 [14:54<1:23:00,  7.77s/it]

Processing batch 109


Extracting embeddings:  15%|‚ñà‚ñç        | 110/750 [15:02<1:22:31,  7.74s/it]

Processing batch 110


Extracting embeddings:  15%|‚ñà‚ñç        | 111/750 [15:09<1:20:28,  7.56s/it]

Processing batch 111


Extracting embeddings:  15%|‚ñà‚ñç        | 112/750 [15:17<1:20:05,  7.53s/it]

Processing batch 112


Extracting embeddings:  15%|‚ñà‚ñå        | 113/750 [15:24<1:20:53,  7.62s/it]

Processing batch 113


Extracting embeddings:  15%|‚ñà‚ñå        | 114/750 [15:32<1:19:31,  7.50s/it]

Processing batch 114


Extracting embeddings:  15%|‚ñà‚ñå        | 115/750 [15:39<1:17:39,  7.34s/it]

Processing batch 115


Extracting embeddings:  15%|‚ñà‚ñå        | 116/750 [15:46<1:18:56,  7.47s/it]

Processing batch 116


Extracting embeddings:  16%|‚ñà‚ñå        | 117/750 [15:54<1:17:52,  7.38s/it]

Processing batch 117


Extracting embeddings:  16%|‚ñà‚ñå        | 118/750 [16:01<1:17:32,  7.36s/it]

Processing batch 118


Extracting embeddings:  16%|‚ñà‚ñå        | 119/750 [16:09<1:18:21,  7.45s/it]

Processing batch 119


Extracting embeddings:  16%|‚ñà‚ñå        | 120/750 [16:16<1:18:08,  7.44s/it]

Processing batch 120


Extracting embeddings:  16%|‚ñà‚ñå        | 121/750 [16:24<1:18:48,  7.52s/it]

Processing batch 121


Extracting embeddings:  16%|‚ñà‚ñã        | 122/750 [16:31<1:17:05,  7.37s/it]

Processing batch 122


Extracting embeddings:  16%|‚ñà‚ñã        | 123/750 [16:38<1:16:32,  7.32s/it]

Processing batch 123


Extracting embeddings:  17%|‚ñà‚ñã        | 124/750 [16:46<1:18:55,  7.56s/it]

Processing batch 124


Extracting embeddings:  17%|‚ñà‚ñã        | 125/750 [16:53<1:17:57,  7.48s/it]

Processing batch 125


Extracting embeddings:  17%|‚ñà‚ñã        | 126/750 [17:01<1:18:08,  7.51s/it]

Processing batch 126


Extracting embeddings:  17%|‚ñà‚ñã        | 127/750 [17:08<1:17:12,  7.44s/it]

Processing batch 127


Extracting embeddings:  17%|‚ñà‚ñã        | 128/750 [17:16<1:17:40,  7.49s/it]

Processing batch 128


Extracting embeddings:  17%|‚ñà‚ñã        | 129/750 [17:23<1:18:06,  7.55s/it]

Processing batch 129


Extracting embeddings:  17%|‚ñà‚ñã        | 130/750 [17:32<1:19:37,  7.71s/it]

Processing batch 130


Extracting embeddings:  17%|‚ñà‚ñã        | 131/750 [17:39<1:19:45,  7.73s/it]

Processing batch 131


Extracting embeddings:  18%|‚ñà‚ñä        | 132/750 [17:47<1:20:34,  7.82s/it]

Processing batch 132


Extracting embeddings:  18%|‚ñà‚ñä        | 133/750 [17:55<1:19:20,  7.72s/it]

Processing batch 133


Extracting embeddings:  18%|‚ñà‚ñä        | 134/750 [18:03<1:19:05,  7.70s/it]

Processing batch 134


Extracting embeddings:  18%|‚ñà‚ñä        | 135/750 [18:10<1:19:15,  7.73s/it]

Processing batch 135


Extracting embeddings:  18%|‚ñà‚ñä        | 136/750 [18:18<1:20:01,  7.82s/it]

Processing batch 136


Extracting embeddings:  18%|‚ñà‚ñä        | 137/750 [18:26<1:19:24,  7.77s/it]

Processing batch 137


Extracting embeddings:  18%|‚ñà‚ñä        | 138/750 [18:33<1:18:17,  7.68s/it]

Processing batch 138


Extracting embeddings:  19%|‚ñà‚ñä        | 139/750 [18:41<1:18:27,  7.70s/it]

Processing batch 139


Extracting embeddings:  19%|‚ñà‚ñä        | 140/750 [18:49<1:17:18,  7.60s/it]

Processing batch 140


Extracting embeddings:  19%|‚ñà‚ñâ        | 141/750 [18:56<1:17:37,  7.65s/it]

Processing batch 141


Extracting embeddings:  19%|‚ñà‚ñâ        | 142/750 [19:05<1:20:06,  7.90s/it]

Processing batch 142


Extracting embeddings:  19%|‚ñà‚ñâ        | 143/750 [19:12<1:19:06,  7.82s/it]

Processing batch 143


Extracting embeddings:  19%|‚ñà‚ñâ        | 144/750 [19:20<1:17:16,  7.65s/it]

Processing batch 144


Extracting embeddings:  19%|‚ñà‚ñâ        | 145/750 [19:27<1:17:02,  7.64s/it]

Processing batch 145


Extracting embeddings:  19%|‚ñà‚ñâ        | 146/750 [19:35<1:17:52,  7.74s/it]

Processing batch 146


Extracting embeddings:  20%|‚ñà‚ñâ        | 147/750 [19:42<1:15:26,  7.51s/it]

Processing batch 147


Extracting embeddings:  20%|‚ñà‚ñâ        | 148/750 [19:50<1:14:56,  7.47s/it]

Processing batch 148


Extracting embeddings:  20%|‚ñà‚ñâ        | 149/750 [19:58<1:16:29,  7.64s/it]

Processing batch 149


Extracting embeddings:  20%|‚ñà‚ñà        | 150/750 [20:06<1:18:03,  7.81s/it]

Processing batch 150


Extracting embeddings:  20%|‚ñà‚ñà        | 151/750 [20:14<1:17:46,  7.79s/it]

Processing batch 151


Extracting embeddings:  20%|‚ñà‚ñà        | 152/750 [20:21<1:15:58,  7.62s/it]

Processing batch 152


Extracting embeddings:  20%|‚ñà‚ñà        | 153/750 [20:29<1:16:28,  7.69s/it]

Processing batch 153


: 

In [1]:
import torch
import torch.nn.functional as F
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer
import os
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from PIL import Image, ImageFile
import numpy as np
from pathlib import Path

# --- Hugging Face warnings fixes ---
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
os.environ["HF_HUB_DISABLE_XET_WARNING"] = "1"

# Enable loading of truncated images
ImageFile.LOAD_TRUNCATED_IMAGES = True

# --- Custom Collate Function ---
def custom_collate_fn(batch):
    images, texts, prices, sample_ids, image_paths = zip(*batch)
    return list(images), list(texts), torch.tensor(prices, dtype=torch.float32), list(sample_ids), list(image_paths)

# --- Product Dataset ---
class ProductDataset(Dataset):
    def __init__(self, df, image_folder, has_price=False):
        self.df = df
        self.image_folder = Path(image_folder)
        self.has_price = has_price  # Test dataset does not have price

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_name = str(row['image']).strip()  # remove stray spaces/tabs/newlines
        image_path = self.image_folder / image_name
        sample_id = row['sample_id']

        try:
            img = Image.open(image_path).convert('RGB')
        except Exception as e:
            print(f"‚ùå Error loading image {image_path} (sample_id: {sample_id}): {e}")
            img = Image.new('RGB', (224, 224), (0, 0, 0))  # fallback black image

        text = row.get('catalog_content_clean', "No description available")
        if pd.isna(text) or not isinstance(text, str) or text.strip() == "":
            text = "No description available"

        price = float(row['price']) if self.has_price else 0.0
        return img, text, price, sample_id, str(image_path)

# --- Embedding Extraction ---
def extract_and_save_embeddings(image_model, text_model, image_processor, text_tokenizer, data_loader, save_dir, max_length=128):
    image_model.eval()
    text_model.eval()
    os.makedirs(save_dir, exist_ok=True)
    
    # Detect last processed batch
    existing_batches = [
        int(f.split('_')[1])
        for f in os.listdir(save_dir)
        if f.startswith('batch_') and os.path.isdir(os.path.join(save_dir, f))
    ]
    start_batch = max(existing_batches) + 1 if existing_batches else 0
    print(f"üîÅ Resuming from batch {start_batch}")

    metadata_path = os.path.join(save_dir, 'metadata.csv')
    processed_sample_ids = set()
    if os.path.exists(metadata_path):
        processed_metadata = pd.read_csv(metadata_path)
        processed_sample_ids = set(processed_metadata['sample_id'])
        print(f"Found {len(processed_sample_ids)} processed samples")

    all_image_embeddings = []
    all_text_embeddings = []
    all_sample_ids = []
    all_image_paths = []

    with torch.no_grad():
        for batch_idx, (batch_images, batch_texts, batch_prices, batch_sample_ids, batch_image_paths) in enumerate(
            tqdm(data_loader, desc="Extracting embeddings")
        ):
            if batch_idx < start_batch:
                continue
            if all(sample_id in processed_sample_ids for sample_id in batch_sample_ids):
                print(f"Skipping batch {batch_idx} (already processed)")
                continue

            print(f"Processing batch {batch_idx}")

            # Image embeddings
            try:
                inputs = image_processor(images=batch_images, return_tensors="pt").to(device)
                image_emb = image_model(**inputs).last_hidden_state.cpu().numpy()
            except Exception as e:
                print(f"‚ùå Error processing image embeddings for batch {batch_idx}: {e}")
                continue

            # Text embeddings
            try:
                inputs = text_tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)
                text_emb = text_model(**inputs).last_hidden_state.cpu().numpy()
            except Exception as e:
                print(f"‚ùå Error processing text embeddings for batch {batch_idx}: {e}")
                continue

            # Save batch embeddings
            batch_save_dir = os.path.join(save_dir, f'batch_{batch_idx}')
            os.makedirs(batch_save_dir, exist_ok=True)
            torch.save(torch.tensor(image_emb), os.path.join(batch_save_dir, 'image_embeddings.pt'))
            torch.save(torch.tensor(text_emb), os.path.join(batch_save_dir, 'text_embeddings.pt'))

            all_image_embeddings.append(image_emb)
            all_text_embeddings.append(text_emb)
            all_sample_ids.extend(batch_sample_ids)
            all_image_paths.extend(batch_image_paths)

    # Save consolidated embeddings
    all_image_emb_path = os.path.join(save_dir, 'all_image_embeddings.pt')
    all_text_emb_path = os.path.join(save_dir, 'all_text_embeddings.pt')

    if all_image_embeddings:
        all_image_embeddings = np.concatenate(all_image_embeddings, axis=0)
        all_text_embeddings = np.concatenate(all_text_embeddings, axis=0)

        if os.path.exists(all_image_emb_path) and os.path.exists(all_text_emb_path):
            existing_image_emb = torch.load(all_image_emb_path).numpy()
            existing_text_emb = torch.load(all_text_emb_path).numpy()
            all_image_embeddings = np.concatenate([existing_image_emb, all_image_embeddings], axis=0)
            all_text_embeddings = np.concatenate([existing_text_emb, all_text_embeddings], axis=0)

        torch.save(torch.tensor(all_image_embeddings), all_image_emb_path)
        torch.save(torch.tensor(all_text_embeddings), all_text_emb_path)

        new_metadata = pd.DataFrame({
            'index': range(len(all_sample_ids)),
            'sample_id': all_sample_ids,
            'image_path': all_image_paths
        })

        if os.path.exists(metadata_path):
            existing_metadata = pd.read_csv(metadata_path)
            new_metadata = pd.concat([existing_metadata, new_metadata], ignore_index=True)
        new_metadata.to_csv(metadata_path, index=False)

    print(f"‚úÖ Embeddings saved to {save_dir}")
    if all_image_embeddings:
        print(f"Image embeddings shape: {all_image_embeddings.shape} ([num_samples, 257, 768])")
        print(f"Text embeddings shape: {all_text_embeddings.shape} ([num_samples, {max_length}, 384])")
    else:
        print("No new embeddings generated (all batches skipped or empty)")

# --- Main Script ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load models
image_processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base', use_fast=True)
image_model = AutoModel.from_pretrained('facebook/dinov2-base').to(device)
print("DINOv2 model initialized.")

text_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L12-v2')
text_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L12-v2').to(device)
print("MiniLM model initialized.")

# Paths
EMBEDDINGS_DIR = './test_embeddings/'
os.makedirs(EMBEDDINGS_DIR, exist_ok=True)

IMAGE_FOLDER = 'test_images/test_images'  # Folder with all images
df = pd.read_csv("test/merged_test_with_image.csv")
print(f"Loaded {len(df)} rows")
print("Columns in CSV:", df.columns.tolist())

# Create dataset and dataloader
dataset = ProductDataset(df, IMAGE_FOLDER, has_price=False)
data_loader = DataLoader(dataset, batch_size=100, shuffle=False, num_workers=0, pin_memory=True, collate_fn=custom_collate_fn)
print("DataLoader ready.")

# Extract and save embeddings
extract_and_save_embeddings(
    image_model, text_model, image_processor, text_tokenizer,
    data_loader, EMBEDDINGS_DIR, max_length=128
)


DINOv2 model initialized.
MiniLM model initialized.
Loaded 74999 rows
Columns in CSV: ['sample_id', 'product_name', 'catalog_content_clean', 'value', 'unit', 'image']
DataLoader ready.
üîÅ Resuming from batch 153


Extracting embeddings:  20%|‚ñà‚ñà        | 153/750 [05:09<19:06,  1.92s/it]

Processing batch 153


Extracting embeddings:  21%|‚ñà‚ñà        | 154/750 [05:15<33:10,  3.34s/it]

Processing batch 154


Extracting embeddings:  21%|‚ñà‚ñà        | 155/750 [05:22<43:10,  4.35s/it]

Processing batch 155


Extracting embeddings:  21%|‚ñà‚ñà        | 156/750 [05:29<50:53,  5.14s/it]

Processing batch 156


Extracting embeddings:  21%|‚ñà‚ñà        | 157/750 [05:36<57:59,  5.87s/it]

Processing batch 157


Extracting embeddings:  21%|‚ñà‚ñà        | 158/750 [05:43<1:01:11,  6.20s/it]

Processing batch 158


Extracting embeddings:  21%|‚ñà‚ñà        | 159/750 [05:50<1:02:49,  6.38s/it]

Processing batch 159


Extracting embeddings:  21%|‚ñà‚ñà‚ñè       | 160/750 [05:57<1:04:01,  6.51s/it]

Processing batch 160


Extracting embeddings:  21%|‚ñà‚ñà‚ñè       | 161/750 [06:04<1:05:24,  6.66s/it]

Processing batch 161


Extracting embeddings:  22%|‚ñà‚ñà‚ñè       | 162/750 [06:11<1:05:42,  6.70s/it]

Processing batch 162


Extracting embeddings:  22%|‚ñà‚ñà‚ñè       | 163/750 [06:18<1:05:55,  6.74s/it]

Processing batch 163


Extracting embeddings:  22%|‚ñà‚ñà‚ñè       | 164/750 [06:25<1:05:57,  6.75s/it]

Processing batch 164


Extracting embeddings:  22%|‚ñà‚ñà‚ñè       | 165/750 [06:32<1:07:26,  6.92s/it]

Processing batch 165


Extracting embeddings:  22%|‚ñà‚ñà‚ñè       | 166/750 [06:39<1:08:13,  7.01s/it]

Processing batch 166


Extracting embeddings:  22%|‚ñà‚ñà‚ñè       | 167/750 [06:46<1:08:31,  7.05s/it]

Processing batch 167


Extracting embeddings:  22%|‚ñà‚ñà‚ñè       | 168/750 [06:54<1:09:54,  7.21s/it]

Processing batch 168


Extracting embeddings:  23%|‚ñà‚ñà‚ñé       | 169/750 [07:01<1:09:08,  7.14s/it]

Processing batch 169


Extracting embeddings:  23%|‚ñà‚ñà‚ñé       | 170/750 [07:08<1:08:23,  7.07s/it]

Processing batch 170


Extracting embeddings:  23%|‚ñà‚ñà‚ñé       | 171/750 [07:14<1:07:00,  6.94s/it]

Processing batch 171


Extracting embeddings:  23%|‚ñà‚ñà‚ñé       | 172/750 [07:21<1:07:35,  7.02s/it]

Processing batch 172


Extracting embeddings:  23%|‚ñà‚ñà‚ñé       | 173/750 [07:28<1:06:29,  6.91s/it]

Processing batch 173


Extracting embeddings:  23%|‚ñà‚ñà‚ñé       | 174/750 [07:36<1:08:05,  7.09s/it]

Processing batch 174


Extracting embeddings:  23%|‚ñà‚ñà‚ñé       | 175/750 [07:43<1:09:03,  7.21s/it]

Processing batch 175


Extracting embeddings:  23%|‚ñà‚ñà‚ñé       | 176/750 [07:51<1:10:08,  7.33s/it]

Processing batch 176


Extracting embeddings:  24%|‚ñà‚ñà‚ñé       | 177/750 [07:58<1:08:21,  7.16s/it]

Processing batch 177


Extracting embeddings:  24%|‚ñà‚ñà‚ñé       | 178/750 [08:04<1:07:34,  7.09s/it]

Processing batch 178


Extracting embeddings:  24%|‚ñà‚ñà‚ñç       | 179/750 [08:11<1:06:37,  7.00s/it]

Processing batch 179


Extracting embeddings:  24%|‚ñà‚ñà‚ñç       | 180/750 [08:19<1:08:18,  7.19s/it]

Processing batch 180


Extracting embeddings:  24%|‚ñà‚ñà‚ñç       | 181/750 [08:26<1:09:16,  7.30s/it]

Processing batch 181


Extracting embeddings:  24%|‚ñà‚ñà‚ñç       | 182/750 [08:34<1:08:54,  7.28s/it]

Processing batch 182


Extracting embeddings:  24%|‚ñà‚ñà‚ñç       | 183/750 [08:41<1:09:15,  7.33s/it]

Processing batch 183


Extracting embeddings:  25%|‚ñà‚ñà‚ñç       | 184/750 [08:48<1:09:11,  7.33s/it]

Processing batch 184


Extracting embeddings:  25%|‚ñà‚ñà‚ñç       | 185/750 [08:55<1:07:58,  7.22s/it]

Processing batch 185


Extracting embeddings:  25%|‚ñà‚ñà‚ñç       | 186/750 [09:03<1:08:24,  7.28s/it]

Processing batch 186


Extracting embeddings:  25%|‚ñà‚ñà‚ñç       | 187/750 [09:11<1:10:47,  7.54s/it]

Processing batch 187


Extracting embeddings:  25%|‚ñà‚ñà‚ñå       | 188/750 [09:18<1:10:29,  7.53s/it]

Processing batch 188


Extracting embeddings:  25%|‚ñà‚ñà‚ñå       | 189/750 [09:26<1:09:28,  7.43s/it]

Processing batch 189


Extracting embeddings:  25%|‚ñà‚ñà‚ñå       | 190/750 [09:33<1:09:13,  7.42s/it]

Processing batch 190


Extracting embeddings:  25%|‚ñà‚ñà‚ñå       | 191/750 [09:41<1:09:22,  7.45s/it]

Processing batch 191


Extracting embeddings:  26%|‚ñà‚ñà‚ñå       | 192/750 [09:48<1:09:52,  7.51s/it]

Processing batch 192


Extracting embeddings:  26%|‚ñà‚ñà‚ñå       | 193/750 [09:56<1:09:49,  7.52s/it]

Processing batch 193


Extracting embeddings:  26%|‚ñà‚ñà‚ñå       | 194/750 [10:03<1:08:05,  7.35s/it]

Processing batch 194


Extracting embeddings:  26%|‚ñà‚ñà‚ñå       | 195/750 [10:10<1:06:44,  7.22s/it]

Processing batch 195


Extracting embeddings:  26%|‚ñà‚ñà‚ñå       | 196/750 [10:17<1:05:56,  7.14s/it]

Processing batch 196


Extracting embeddings:  26%|‚ñà‚ñà‚ñã       | 197/750 [10:24<1:06:43,  7.24s/it]

Processing batch 197


Extracting embeddings:  26%|‚ñà‚ñà‚ñã       | 198/750 [10:32<1:07:27,  7.33s/it]

Processing batch 198


Extracting embeddings:  27%|‚ñà‚ñà‚ñã       | 199/750 [10:39<1:07:32,  7.35s/it]

Processing batch 199


Extracting embeddings:  27%|‚ñà‚ñà‚ñã       | 200/750 [10:46<1:05:19,  7.13s/it]

Processing batch 200


Extracting embeddings:  27%|‚ñà‚ñà‚ñã       | 201/750 [10:53<1:05:43,  7.18s/it]

Processing batch 201


Extracting embeddings:  27%|‚ñà‚ñà‚ñã       | 202/750 [11:01<1:06:55,  7.33s/it]

Processing batch 202


Extracting embeddings:  27%|‚ñà‚ñà‚ñã       | 203/750 [11:08<1:06:30,  7.29s/it]

Processing batch 203


Extracting embeddings:  27%|‚ñà‚ñà‚ñã       | 204/750 [11:15<1:06:11,  7.27s/it]

Processing batch 204


Extracting embeddings:  27%|‚ñà‚ñà‚ñã       | 205/750 [11:23<1:06:37,  7.33s/it]

Processing batch 205


Extracting embeddings:  27%|‚ñà‚ñà‚ñã       | 206/750 [11:30<1:07:28,  7.44s/it]

Processing batch 206


Extracting embeddings:  28%|‚ñà‚ñà‚ñä       | 207/750 [11:38<1:08:04,  7.52s/it]

Processing batch 207


Extracting embeddings:  28%|‚ñà‚ñà‚ñä       | 208/750 [11:45<1:07:43,  7.50s/it]

Processing batch 208


Extracting embeddings:  28%|‚ñà‚ñà‚ñä       | 209/750 [11:53<1:06:54,  7.42s/it]

Processing batch 209


Extracting embeddings:  28%|‚ñà‚ñà‚ñä       | 210/750 [12:00<1:07:06,  7.46s/it]

Processing batch 210


Extracting embeddings:  28%|‚ñà‚ñà‚ñä       | 211/750 [12:08<1:07:09,  7.48s/it]

Processing batch 211


Extracting embeddings:  28%|‚ñà‚ñà‚ñä       | 212/750 [12:15<1:07:55,  7.58s/it]

Processing batch 212


Extracting embeddings:  28%|‚ñà‚ñà‚ñä       | 213/750 [12:23<1:08:04,  7.61s/it]

Processing batch 213


Extracting embeddings:  29%|‚ñà‚ñà‚ñä       | 214/750 [12:31<1:07:51,  7.60s/it]

Processing batch 214


Extracting embeddings:  29%|‚ñà‚ñà‚ñä       | 215/750 [12:39<1:09:23,  7.78s/it]

Processing batch 215


Extracting embeddings:  29%|‚ñà‚ñà‚ñâ       | 216/750 [12:47<1:09:38,  7.82s/it]

Processing batch 216


Extracting embeddings:  29%|‚ñà‚ñà‚ñâ       | 217/750 [12:54<1:08:02,  7.66s/it]

Processing batch 217


Extracting embeddings:  29%|‚ñà‚ñà‚ñâ       | 218/750 [13:01<1:06:48,  7.54s/it]

Processing batch 218


Extracting embeddings:  29%|‚ñà‚ñà‚ñâ       | 219/750 [13:09<1:07:17,  7.60s/it]

Processing batch 219


Extracting embeddings:  29%|‚ñà‚ñà‚ñâ       | 220/750 [13:16<1:06:17,  7.51s/it]

Processing batch 220


Extracting embeddings:  29%|‚ñà‚ñà‚ñâ       | 221/750 [13:23<1:04:37,  7.33s/it]

Processing batch 221


Extracting embeddings:  30%|‚ñà‚ñà‚ñâ       | 222/750 [13:31<1:04:56,  7.38s/it]

Processing batch 222


Extracting embeddings:  30%|‚ñà‚ñà‚ñâ       | 223/750 [13:39<1:05:46,  7.49s/it]

Processing batch 223


Extracting embeddings:  30%|‚ñà‚ñà‚ñâ       | 224/750 [13:47<1:08:22,  7.80s/it]

Processing batch 224


Extracting embeddings:  30%|‚ñà‚ñà‚ñà       | 225/750 [13:54<1:06:49,  7.64s/it]

Processing batch 225


Extracting embeddings:  30%|‚ñà‚ñà‚ñà       | 226/750 [14:01<1:05:16,  7.47s/it]

Processing batch 226


Extracting embeddings:  30%|‚ñà‚ñà‚ñà       | 227/750 [14:09<1:05:28,  7.51s/it]

Processing batch 227


Extracting embeddings:  30%|‚ñà‚ñà‚ñà       | 228/750 [14:18<1:09:00,  7.93s/it]

Processing batch 228


Extracting embeddings:  31%|‚ñà‚ñà‚ñà       | 229/750 [14:25<1:07:27,  7.77s/it]

Processing batch 229


Extracting embeddings:  31%|‚ñà‚ñà‚ñà       | 230/750 [14:33<1:07:18,  7.77s/it]

Processing batch 230


Extracting embeddings:  31%|‚ñà‚ñà‚ñà       | 231/750 [14:41<1:06:17,  7.66s/it]

Processing batch 231


Extracting embeddings:  31%|‚ñà‚ñà‚ñà       | 232/750 [14:48<1:05:19,  7.57s/it]

Processing batch 232


Extracting embeddings:  31%|‚ñà‚ñà‚ñà       | 233/750 [14:55<1:04:25,  7.48s/it]

Processing batch 233


Extracting embeddings:  31%|‚ñà‚ñà‚ñà       | 234/750 [15:02<1:03:53,  7.43s/it]

Processing batch 234


Extracting embeddings:  31%|‚ñà‚ñà‚ñà‚ñè      | 235/750 [15:11<1:05:24,  7.62s/it]

Processing batch 235


Extracting embeddings:  31%|‚ñà‚ñà‚ñà‚ñè      | 236/750 [15:18<1:04:46,  7.56s/it]

Processing batch 236


Extracting embeddings:  32%|‚ñà‚ñà‚ñà‚ñè      | 237/750 [15:25<1:04:00,  7.49s/it]

Processing batch 237


Extracting embeddings:  32%|‚ñà‚ñà‚ñà‚ñè      | 238/750 [15:32<1:02:58,  7.38s/it]

Processing batch 238


Extracting embeddings:  32%|‚ñà‚ñà‚ñà‚ñè      | 239/750 [15:40<1:02:41,  7.36s/it]

Processing batch 239


Extracting embeddings:  32%|‚ñà‚ñà‚ñà‚ñè      | 240/750 [15:48<1:04:08,  7.55s/it]

Processing batch 240


Extracting embeddings:  32%|‚ñà‚ñà‚ñà‚ñè      | 241/750 [15:55<1:03:14,  7.46s/it]

Processing batch 241


Extracting embeddings:  32%|‚ñà‚ñà‚ñà‚ñè      | 242/750 [16:02<1:02:40,  7.40s/it]

Processing batch 242


Extracting embeddings:  32%|‚ñà‚ñà‚ñà‚ñè      | 243/750 [16:10<1:03:44,  7.54s/it]

Processing batch 243


Extracting embeddings:  33%|‚ñà‚ñà‚ñà‚ñé      | 244/750 [16:18<1:03:18,  7.51s/it]

Processing batch 244


Extracting embeddings:  33%|‚ñà‚ñà‚ñà‚ñé      | 245/750 [16:25<1:03:09,  7.50s/it]

Processing batch 245


Extracting embeddings:  33%|‚ñà‚ñà‚ñà‚ñé      | 246/750 [16:33<1:03:00,  7.50s/it]

Processing batch 246


Extracting embeddings:  33%|‚ñà‚ñà‚ñà‚ñé      | 247/750 [16:41<1:04:22,  7.68s/it]

Processing batch 247


Extracting embeddings:  33%|‚ñà‚ñà‚ñà‚ñé      | 248/750 [16:48<1:04:48,  7.75s/it]

Processing batch 248


Extracting embeddings:  33%|‚ñà‚ñà‚ñà‚ñé      | 249/750 [16:56<1:04:50,  7.76s/it]

Processing batch 249


Extracting embeddings:  33%|‚ñà‚ñà‚ñà‚ñé      | 250/750 [17:04<1:04:17,  7.71s/it]

Processing batch 250


Extracting embeddings:  33%|‚ñà‚ñà‚ñà‚ñé      | 251/750 [17:11<1:03:02,  7.58s/it]

Processing batch 251


Extracting embeddings:  34%|‚ñà‚ñà‚ñà‚ñé      | 252/750 [17:18<1:02:07,  7.49s/it]

Processing batch 252


Extracting embeddings:  34%|‚ñà‚ñà‚ñà‚ñé      | 253/750 [17:26<1:02:52,  7.59s/it]

Processing batch 253


Extracting embeddings:  34%|‚ñà‚ñà‚ñà‚ñç      | 254/750 [17:34<1:02:11,  7.52s/it]

Processing batch 254


Extracting embeddings:  34%|‚ñà‚ñà‚ñà‚ñç      | 255/750 [17:41<1:02:22,  7.56s/it]

Processing batch 255


Extracting embeddings:  34%|‚ñà‚ñà‚ñà‚ñç      | 256/750 [17:49<1:01:56,  7.52s/it]

Processing batch 256


Extracting embeddings:  34%|‚ñà‚ñà‚ñà‚ñç      | 257/750 [17:56<1:02:11,  7.57s/it]

Processing batch 257


Extracting embeddings:  34%|‚ñà‚ñà‚ñà‚ñç      | 258/750 [18:03<1:00:51,  7.42s/it]

Processing batch 258


Extracting embeddings:  35%|‚ñà‚ñà‚ñà‚ñç      | 259/750 [18:11<1:01:25,  7.51s/it]

Processing batch 259


Extracting embeddings:  35%|‚ñà‚ñà‚ñà‚ñç      | 260/750 [18:19<1:02:28,  7.65s/it]

Processing batch 260


Extracting embeddings:  35%|‚ñà‚ñà‚ñà‚ñç      | 261/750 [18:27<1:02:01,  7.61s/it]

Processing batch 261


Extracting embeddings:  35%|‚ñà‚ñà‚ñà‚ñç      | 262/750 [18:34<1:02:02,  7.63s/it]

Processing batch 262


Extracting embeddings:  35%|‚ñà‚ñà‚ñà‚ñå      | 263/750 [18:42<1:00:56,  7.51s/it]

Processing batch 263


Extracting embeddings:  35%|‚ñà‚ñà‚ñà‚ñå      | 264/750 [18:49<1:00:07,  7.42s/it]

Processing batch 264


Extracting embeddings:  35%|‚ñà‚ñà‚ñà‚ñå      | 265/750 [18:56<1:00:00,  7.42s/it]

Processing batch 265


Extracting embeddings:  35%|‚ñà‚ñà‚ñà‚ñå      | 266/750 [19:04<1:00:36,  7.51s/it]

Processing batch 266


Extracting embeddings:  36%|‚ñà‚ñà‚ñà‚ñå      | 267/750 [19:12<1:00:49,  7.56s/it]

Processing batch 267


Extracting embeddings:  36%|‚ñà‚ñà‚ñà‚ñå      | 268/750 [19:20<1:02:21,  7.76s/it]

Processing batch 268


Extracting embeddings:  36%|‚ñà‚ñà‚ñà‚ñå      | 269/750 [19:27<1:01:29,  7.67s/it]

Processing batch 269


Extracting embeddings:  36%|‚ñà‚ñà‚ñà‚ñå      | 270/750 [19:35<1:00:41,  7.59s/it]

Processing batch 270


Extracting embeddings:  36%|‚ñà‚ñà‚ñà‚ñå      | 271/750 [19:42<59:54,  7.50s/it]  

Processing batch 271


Extracting embeddings:  36%|‚ñà‚ñà‚ñà‚ñã      | 272/750 [19:50<1:00:11,  7.56s/it]

Processing batch 272


Extracting embeddings:  36%|‚ñà‚ñà‚ñà‚ñã      | 273/750 [19:57<59:59,  7.55s/it]  

Processing batch 273


Extracting embeddings:  37%|‚ñà‚ñà‚ñà‚ñã      | 274/750 [20:05<1:00:30,  7.63s/it]

Processing batch 274


Extracting embeddings:  37%|‚ñà‚ñà‚ñà‚ñã      | 275/750 [20:12<59:51,  7.56s/it]  

Processing batch 275


Extracting embeddings:  37%|‚ñà‚ñà‚ñà‚ñã      | 276/750 [20:20<58:40,  7.43s/it]

Processing batch 276


Extracting embeddings:  37%|‚ñà‚ñà‚ñà‚ñã      | 277/750 [20:27<59:08,  7.50s/it]

Processing batch 277


Extracting embeddings:  37%|‚ñà‚ñà‚ñà‚ñã      | 278/750 [20:35<58:33,  7.44s/it]

Processing batch 278


Extracting embeddings:  37%|‚ñà‚ñà‚ñà‚ñã      | 279/750 [20:41<56:53,  7.25s/it]

Processing batch 279


Extracting embeddings:  37%|‚ñà‚ñà‚ñà‚ñã      | 280/750 [20:49<56:42,  7.24s/it]

Processing batch 280


Extracting embeddings:  37%|‚ñà‚ñà‚ñà‚ñã      | 281/750 [20:56<57:17,  7.33s/it]

Processing batch 281


Extracting embeddings:  38%|‚ñà‚ñà‚ñà‚ñä      | 282/750 [21:03<57:11,  7.33s/it]

Processing batch 282


Extracting embeddings:  38%|‚ñà‚ñà‚ñà‚ñä      | 283/750 [21:11<57:35,  7.40s/it]

Processing batch 283


Extracting embeddings:  38%|‚ñà‚ñà‚ñà‚ñä      | 284/750 [21:19<58:37,  7.55s/it]

Processing batch 284


Extracting embeddings:  38%|‚ñà‚ñà‚ñà‚ñä      | 285/750 [21:26<57:57,  7.48s/it]

Processing batch 285


Extracting embeddings:  38%|‚ñà‚ñà‚ñà‚ñä      | 286/750 [21:34<58:39,  7.58s/it]

Processing batch 286


Extracting embeddings:  38%|‚ñà‚ñà‚ñà‚ñä      | 287/750 [21:42<58:55,  7.64s/it]

Processing batch 287


Extracting embeddings:  38%|‚ñà‚ñà‚ñà‚ñä      | 288/750 [21:49<58:24,  7.59s/it]

Processing batch 288


Extracting embeddings:  39%|‚ñà‚ñà‚ñà‚ñä      | 289/750 [21:57<58:27,  7.61s/it]

Processing batch 289


Extracting embeddings:  39%|‚ñà‚ñà‚ñà‚ñä      | 290/750 [22:05<58:20,  7.61s/it]

Processing batch 290


Extracting embeddings:  39%|‚ñà‚ñà‚ñà‚ñâ      | 291/750 [22:11<56:36,  7.40s/it]

Processing batch 291


Extracting embeddings:  39%|‚ñà‚ñà‚ñà‚ñâ      | 292/750 [22:19<56:17,  7.37s/it]

Processing batch 292


Extracting embeddings:  39%|‚ñà‚ñà‚ñà‚ñâ      | 293/750 [22:27<58:00,  7.62s/it]

Processing batch 293


Extracting embeddings:  39%|‚ñà‚ñà‚ñà‚ñâ      | 294/750 [22:34<57:39,  7.59s/it]

Processing batch 294


Extracting embeddings:  39%|‚ñà‚ñà‚ñà‚ñâ      | 295/750 [22:42<57:14,  7.55s/it]

Processing batch 295


Extracting embeddings:  39%|‚ñà‚ñà‚ñà‚ñâ      | 296/750 [22:49<56:23,  7.45s/it]

Processing batch 296


Extracting embeddings:  40%|‚ñà‚ñà‚ñà‚ñâ      | 297/750 [22:56<55:42,  7.38s/it]

Processing batch 297


Extracting embeddings:  40%|‚ñà‚ñà‚ñà‚ñâ      | 298/750 [23:04<55:30,  7.37s/it]

Processing batch 298


Extracting embeddings:  40%|‚ñà‚ñà‚ñà‚ñâ      | 299/750 [23:11<56:01,  7.45s/it]

Processing batch 299


Extracting embeddings:  40%|‚ñà‚ñà‚ñà‚ñà      | 300/750 [23:19<55:33,  7.41s/it]

Processing batch 300


Extracting embeddings:  40%|‚ñà‚ñà‚ñà‚ñà      | 301/750 [23:26<54:56,  7.34s/it]

Processing batch 301


Extracting embeddings:  40%|‚ñà‚ñà‚ñà‚ñà      | 302/750 [23:33<54:24,  7.29s/it]

Processing batch 302


Extracting embeddings:  40%|‚ñà‚ñà‚ñà‚ñà      | 303/750 [23:41<55:48,  7.49s/it]

Processing batch 303


Extracting embeddings:  41%|‚ñà‚ñà‚ñà‚ñà      | 304/750 [23:48<55:42,  7.49s/it]

Processing batch 304


: 

In [2]:
import torch
import torch.nn.functional as F
from transformers import AutoImageProcessor, AutoModel, AutoTokenizer
import os
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from PIL import Image, ImageFile
import numpy as np
from pathlib import Path

# --- Hugging Face warnings fixes ---
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
os.environ["HF_HUB_DISABLE_XET_WARNING"] = "1"

# Enable loading of truncated images
ImageFile.LOAD_TRUNCATED_IMAGES = True

# --- Custom Collate Function ---
def custom_collate_fn(batch):
    images, texts, prices, sample_ids, image_paths = zip(*batch)
    return list(images), list(texts), torch.tensor(prices, dtype=torch.float32), list(sample_ids), list(image_paths)

# --- Product Dataset ---
class ProductDataset(Dataset):
    def __init__(self, df, image_folder, has_price=False):
        self.df = df
        self.image_folder = Path(image_folder)
        self.has_price = has_price  # Test dataset does not have price

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_name = str(row['image']).strip()  # remove stray spaces/tabs/newlines
        image_path = self.image_folder / image_name
        sample_id = row['sample_id']

        try:
            img = Image.open(image_path).convert('RGB')
        except Exception as e:
            print(f"‚ùå Error loading image {image_path} (sample_id: {sample_id}): {e}")
            img = Image.new('RGB', (224, 224), (0, 0, 0))  # fallback black image

        text = row.get('catalog_content_clean', "No description available")
        if pd.isna(text) or not isinstance(text, str) or text.strip() == "":
            text = "No description available"

        price = float(row['price']) if self.has_price else 0.0
        return img, text, price, sample_id, str(image_path)

# --- Embedding Extraction ---
def extract_and_save_embeddings(image_model, text_model, image_processor, text_tokenizer, data_loader, save_dir, max_length=128):
    image_model.eval()
    text_model.eval()
    os.makedirs(save_dir, exist_ok=True)
    
    # Detect last processed batch
    existing_batches = [
        int(f.split('_')[1])
        for f in os.listdir(save_dir)
        if f.startswith('batch_') and os.path.isdir(os.path.join(save_dir, f))
    ]
    start_batch = max(existing_batches) + 1 if existing_batches else 0
    print(f"üîÅ Resuming from batch {start_batch}")

    metadata_path = os.path.join(save_dir, 'metadata.csv')
    processed_sample_ids = set()
    if os.path.exists(metadata_path):
        processed_metadata = pd.read_csv(metadata_path)
        processed_sample_ids = set(processed_metadata['sample_id'])
        print(f"Found {len(processed_sample_ids)} processed samples")

    all_image_embeddings = []
    all_text_embeddings = []
    all_sample_ids = []
    all_image_paths = []

    with torch.no_grad():
        for batch_idx, (batch_images, batch_texts, batch_prices, batch_sample_ids, batch_image_paths) in enumerate(
            tqdm(data_loader, desc="Extracting embeddings")
        ):
            if batch_idx < start_batch:
                continue
            if all(sample_id in processed_sample_ids for sample_id in batch_sample_ids):
                print(f"Skipping batch {batch_idx} (already processed)")
                continue

            print(f"Processing batch {batch_idx}")

            # Image embeddings
            try:
                inputs = image_processor(images=batch_images, return_tensors="pt").to(device)
                image_emb = image_model(**inputs).last_hidden_state.cpu().numpy()
            except Exception as e:
                print(f"‚ùå Error processing image embeddings for batch {batch_idx}: {e}")
                continue

            # Text embeddings
            try:
                inputs = text_tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)
                text_emb = text_model(**inputs).last_hidden_state.cpu().numpy()
            except Exception as e:
                print(f"‚ùå Error processing text embeddings for batch {batch_idx}: {e}")
                continue

            # Save batch embeddings
            batch_save_dir = os.path.join(save_dir, f'batch_{batch_idx}')
            os.makedirs(batch_save_dir, exist_ok=True)
            torch.save(torch.tensor(image_emb), os.path.join(batch_save_dir, 'image_embeddings.pt'))
            torch.save(torch.tensor(text_emb), os.path.join(batch_save_dir, 'text_embeddings.pt'))

            all_image_embeddings.append(image_emb)
            all_text_embeddings.append(text_emb)
            all_sample_ids.extend(batch_sample_ids)
            all_image_paths.extend(batch_image_paths)

    # Save consolidated embeddings
    all_image_emb_path = os.path.join(save_dir, 'all_image_embeddings.pt')
    all_text_emb_path = os.path.join(save_dir, 'all_text_embeddings.pt')

    if all_image_embeddings:
        all_image_embeddings = np.concatenate(all_image_embeddings, axis=0)
        all_text_embeddings = np.concatenate(all_text_embeddings, axis=0)

        if os.path.exists(all_image_emb_path) and os.path.exists(all_text_emb_path):
            existing_image_emb = torch.load(all_image_emb_path).numpy()
            existing_text_emb = torch.load(all_text_emb_path).numpy()
            all_image_embeddings = np.concatenate([existing_image_emb, all_image_embeddings], axis=0)
            all_text_embeddings = np.concatenate([existing_text_emb, all_text_embeddings], axis=0)

        torch.save(torch.tensor(all_image_embeddings), all_image_emb_path)
        torch.save(torch.tensor(all_text_embeddings), all_text_emb_path)

        new_metadata = pd.DataFrame({
            'index': range(len(all_sample_ids)),
            'sample_id': all_sample_ids,
            'image_path': all_image_paths
        })

        if os.path.exists(metadata_path):
            existing_metadata = pd.read_csv(metadata_path)
            new_metadata = pd.concat([existing_metadata, new_metadata], ignore_index=True)
        new_metadata.to_csv(metadata_path, index=False)

    print(f"‚úÖ Embeddings saved to {save_dir}")
    if all_image_embeddings:
        print(f"Image embeddings shape: {all_image_embeddings.shape} ([num_samples, 257, 768])")
        print(f"Text embeddings shape: {all_text_embeddings.shape} ([num_samples, {max_length}, 384])")
    else:
        print("No new embeddings generated (all batches skipped or empty)")

# --- Main Script ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load models
image_processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base', use_fast=True)
image_model = AutoModel.from_pretrained('facebook/dinov2-base').to(device)
print("DINOv2 model initialized.")

text_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L12-v2')
text_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L12-v2').to(device)
print("MiniLM model initialized.")

# Paths
EMBEDDINGS_DIR = './test_embeddings/'
os.makedirs(EMBEDDINGS_DIR, exist_ok=True)

IMAGE_FOLDER = 'test_images/test_images'  # Folder with all images
df = pd.read_csv("test/merged_test_with_image.csv")
print(f"Loaded {len(df)} rows")
print("Columns in CSV:", df.columns.tolist())

# Create dataset and dataloader
dataset = ProductDataset(df, IMAGE_FOLDER, has_price=False)
data_loader = DataLoader(dataset, batch_size=100, shuffle=False, num_workers=0, pin_memory=True, collate_fn=custom_collate_fn)
print("DataLoader ready.")

# Extract and save embeddings
extract_and_save_embeddings(
    image_model, text_model, image_processor, text_tokenizer,
    data_loader, EMBEDDINGS_DIR, max_length=128
)


DINOv2 model initialized.
MiniLM model initialized.
Loaded 74999 rows
Columns in CSV: ['sample_id', 'product_name', 'catalog_content_clean', 'value', 'unit', 'image']
DataLoader ready.
üîÅ Resuming from batch 573


Extracting embeddings:  40%|‚ñà‚ñà‚ñà‚ñà      | 302/750 [21:29<31:52,  4.27s/it]


KeyboardInterrupt: 