In [1]:
# Step 3: Define Custom Dataset and DataLoader
import os
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.model_selection import train_test_split
import torch


IMAGE_FOLDER = 'images/train'
df = pd.read_csv("dataset/merged_train_with_image.csv")
print(f"Loaded {len(df)} rows")
print("Columns in temp_train.csv:", df.columns.tolist())
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print(f"Train: {len(train_df)} rows, Test: {len(test_df)} rows")

def custom_collate_fn(batch):
    """
    Custom collate function to handle PIL Images, strings, and prices.
    Returns images as a list, texts as a list, and prices as a tensor.
    """
    images, texts, prices = zip(*batch)
    return list(images), list(texts), torch.tensor(prices, dtype=torch.float32)

class ProductDataset(Dataset):
    def __init__(self, df, image_folder):
        """
        Initialize dataset with DataFrame and image folder.
        - df: Contains [sample_id, product_name, catalog_content_clean, price, value, unit, image]
        - image_folder: Directory with images named in 'image' column (e.g., 33127.jpg)
        """
        self.df = df
        self.image_folder = image_folder
        self.transform = None  # ViTImageProcessor handles preprocessing

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.image_folder, row['image'])  # Use 'image' column
        try:
            img = Image.open(image_path).convert('RGB')
        except FileNotFoundError:
            print(f"Image not found at {image_path}, using placeholder.")
            img = Image.new('RGB', (224, 224), (0, 0, 0))
        text = row['catalog_content_clean']
        price = float(row['price'])
        return img, text, price

# Create and test DataLoaders
try:
    train_dataset = ProductDataset(train_df, IMAGE_FOLDER)
    test_dataset = ProductDataset(test_df, IMAGE_FOLDER)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=0, pin_memory=True, collate_fn=custom_collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=16, num_workers=0, pin_memory=True, collate_fn=custom_collate_fn)
    print("DataLoaders ready with your dataset structure.")
    # Test one batch
    for batch_images, batch_texts, batch_prices in train_loader:
        print(f"Batch loaded: {len(batch_images)} images, {len(batch_texts)} texts, {batch_prices.shape} prices")
        print(f"Image type: {type(batch_images[0])}, Price type: {type(batch_prices)}")
        break
except Exception as e:
    print(f"Error creating DataLoaders: {e}")

Loaded 74999 rows
Columns in temp_train.csv: ['sample_id', 'product_name', 'catalog_content_clean', 'price', 'value', 'unit', 'image']
Train: 59999 rows, Test: 15000 rows
DataLoaders ready with your dataset structure.
Batch loaded: 16 images, 16 texts, torch.Size([16]) prices
Image type: <class 'PIL.Image.Image'>, Price type: <class 'torch.Tensor'>


In [2]:
!pip install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Step 4: Define Multimodal Model
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import ViTImageProcessor, ViTModel, BertTokenizer, BertModel, DistilBertModel

class MultimodalModel(nn.Module):
    def __init__(self, fine_tune_encoders=True):
        super(MultimodalModel, self).__init__()
        self.image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
        self.image_model = ViTModel.from_pretrained('google/vit-base-patch16-224')
        self.text_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.text_model = BertModel.from_pretrained('bert-base-uncased')
        self.text_proj = nn.Linear(768, 512)
        self.image_proj = nn.Linear(768, 512)
        # Add projection layer to reduce 1024-dim (512+512) to 768-dim for DistilBERT
        self.concat_proj = nn.Linear(1024, 768)
        self.transformer = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.regression = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.ReLU()
        )
        if not fine_tune_encoders:
            for param in self.image_model.parameters():
                param.requires_grad = False
            for param in self.text_model.parameters():
                param.requires_grad = False
        print("Encoders fine-tuning:", fine_tune_encoders)

    def forward(self, images, texts):
        # Image encoding
        inputs = self.image_processor(images=images, return_tensors="pt").to(device)
        image_outputs = self.image_model(**inputs).last_hidden_state.mean(dim=1)
        image_proj = F.normalize(self.image_proj(image_outputs), dim=-1)
        # Text encoding
        inputs = self.text_tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
        text_outputs = self.text_model(**inputs).last_hidden_state.mean(dim=1)
        text_proj = F.normalize(self.text_proj(text_outputs), dim=-1)
        # Concatenate and project to 768-dim
        concat_emb = torch.cat([text_proj, image_proj], dim=-1)  # [batch_size, 1024]
        concat_emb = self.concat_proj(concat_emb).unsqueeze(1)  # [batch_size, 1, 768]
        # Transformer
        transformer_out = self.transformer(inputs_embeds=concat_emb).last_hidden_state.mean(dim=1)
        # Regression
        price = self.regression(transformer_out)
        return price.squeeze(), text_proj, image_proj
device = torch.device('cuda')  # Define device
model = MultimodalModel(fine_tune_encoders=False).to(device)
print("Model initialized.")

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Encoders fine-tuning: False
Model initialized.


In [3]:
import torch
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import time
import os  # Add this import for os.path operations
from tqdm import tqdm

In [11]:
import torch
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import time
import os  # Add this import for os.path operations
from tqdm import tqdm

# --- Define Losses ---
def contrastive_loss(text_proj, image_proj, temperature=0.07):
    """
    Contrastive loss to align image and text projections.
    """
    similarity = text_proj @ image_proj.T / temperature
    labels = torch.arange(text_proj.size(0)).to(device)
    return F.cross_entropy(similarity, labels)

def smape(y_true, y_pred):
    """
    Symmetric Mean Absolute Percentage Error for evaluation.
    """
    y_true = y_true.cpu().numpy() if isinstance(y_true, torch.Tensor) else np.array(y_true)
    y_pred = y_pred.cpu().numpy() if isinstance(y_pred, torch.Tensor) else np.array(y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    diff = np.abs(y_pred - y_true) / np.maximum(denominator, 1e-8)  # avoid /0
    return np.mean(diff) * 100

# --- Optimizer and Loss ---
criterion = torch.nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

start_time = time.time()
num_epochs = 100

for epoch in range(num_epochs):
    # ------------------- TRAINING -------------------
    model.train()
    train_mse, train_cont = 0, 0
    train_preds, train_true = [], []

    print(f"\nüîπ Epoch {epoch + 1}/{num_epochs} - Training")
    for batch_images, batch_texts, batch_prices in tqdm(train_loader, desc="Training Progress", leave=False):
        optimizer.zero_grad()
        pred, text_proj, image_proj = model(batch_images, batch_texts)

        mse = criterion(pred, batch_prices.to(device))
        cont = contrastive_loss(text_proj, image_proj)

        loss = mse + 0.5 * cont
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        train_mse += mse.item()
        train_cont += cont.item()
        train_preds.extend(pred.cpu().detach().numpy())
        train_true.extend(batch_prices.cpu().numpy())

    avg_train_mse = train_mse / len(train_loader)
    avg_train_cont = train_cont / len(train_loader)
    train_smape = smape(np.array(train_true), np.array(train_preds))
    print(f"‚úÖ Train MSE = {avg_train_mse:.4f}, Train Cont = {avg_train_cont:.4f}, Train SMAPE = {train_smape:.2f}%")

    # ------------------- TESTING -------------------
    model.eval()
    test_mse, test_cont = 0, 0
    test_preds, test_true = [], []

    print(f"üß™ Epoch {epoch + 1}/{num_epochs} - Testing")
    with torch.no_grad():
        for batch_images, batch_texts, batch_prices in tqdm(test_loader, desc="Testing Progress", leave=False):
            pred, text_proj, image_proj = model(batch_images, batch_texts)
            mse = criterion(pred, batch_prices.to(device))
            cont = contrastive_loss(text_proj, image_proj)
            test_mse += mse.item()
            test_cont += cont.item()

            if pred.dim() == 0:
                test_preds.append(pred.cpu().numpy().item())
            else:
                test_preds.extend(pred.cpu().numpy())
            test_true.extend(batch_prices.cpu().numpy())

    avg_test_mse = test_mse / len(test_loader)
    avg_test_cont = test_cont / len(test_loader)
    test_smape = smape(np.array(test_true), np.array(test_preds))
    print(f"üìä Test MSE = {avg_test_mse:.4f}, Test Cont = {avg_test_cont:.4f}, Test SMAPE = {test_smape:.2f}%")

    # --- Save Model Per Epoch ---
    model_save_path = os.path.join(DATASET_FOLDER, f'multimodal_model_epoch_{epoch + 1}.pth')
    torch.save(model.state_dict(), model_save_path)
    print(f"üíæ Model saved for epoch {epoch + 1} at {model_save_path}")

# --- Final Message ---
print(f"\nüèÅ Training completed in {time.time() - start_time:.2f} seconds.")


üîπ Epoch 1/100 - Training


                                                                                

‚úÖ Train MSE = 1016.5583, Train Cont = 2.2768, Train SMAPE = 70.43%
üß™ Epoch 1/100 - Testing


                                                                                

üìä Test MSE = 880.1211, Test Cont = 2.2500, Test SMAPE = 67.41%
üíæ Model saved for epoch 1 at train_images/multimodal_model_epoch_1.pth

üîπ Epoch 2/100 - Training


                                                                                

‚úÖ Train MSE = 1012.9527, Train Cont = 2.1931, Train SMAPE = 69.45%
üß™ Epoch 2/100 - Testing


                                                                                

üìä Test MSE = 1056.8264, Test Cont = 2.1715, Test SMAPE = 78.97%
üíæ Model saved for epoch 2 at train_images/multimodal_model_epoch_2.pth

üîπ Epoch 3/100 - Training


                                                                                

‚úÖ Train MSE = 1089.0373, Train Cont = 2.1872, Train SMAPE = 71.28%
üß™ Epoch 3/100 - Testing


                                                                                

üìä Test MSE = 859.7275, Test Cont = 2.2194, Test SMAPE = 68.91%
üíæ Model saved for epoch 3 at train_images/multimodal_model_epoch_3.pth

üîπ Epoch 4/100 - Training


                                                                                

‚úÖ Train MSE = 906.2997, Train Cont = 2.2208, Train SMAPE = 66.79%
üß™ Epoch 4/100 - Testing


                                                                                

üìä Test MSE = 805.9354, Test Cont = 2.1953, Test SMAPE = 62.60%
üíæ Model saved for epoch 4 at train_images/multimodal_model_epoch_4.pth

üîπ Epoch 5/100 - Training


                                                                                

‚úÖ Train MSE = 1087.1153, Train Cont = 2.2016, Train SMAPE = 69.69%
üß™ Epoch 5/100 - Testing


                                                                                

üìä Test MSE = 1210.0062, Test Cont = 2.1981, Test SMAPE = 79.55%
üíæ Model saved for epoch 5 at train_images/multimodal_model_epoch_5.pth

üîπ Epoch 6/100 - Training


                                                                                

‚úÖ Train MSE = 1109.2570, Train Cont = 2.2141, Train SMAPE = 71.78%
üß™ Epoch 6/100 - Testing


                                                                                

üìä Test MSE = 762.2732, Test Cont = 2.1955, Test SMAPE = 63.85%
üíæ Model saved for epoch 6 at train_images/multimodal_model_epoch_6.pth

üîπ Epoch 7/100 - Training


                                                                                

KeyboardInterrupt: 

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MultimodalModel(fine_tune_encoders=False).to(device)
model.load_state_dict(torch.load("multimodal_model_epoch_6.pth", map_location=device))
model.eval()
print("‚úÖ Model loaded successfully for testing!")


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Encoders fine-tuning: False


  model.load_state_dict(torch.load("multimodal_model_epoch_6.pth", map_location=device))


‚úÖ Model loaded successfully for testing!


In [7]:
TEST_CSV = "test/merged_test_with_image.csv"
IMAGE_FOLDER = "test_images/test_images"  # or wherever your images are

test_df = pd.read_csv(TEST_CSV)
print(f"Loaded {len(test_df)} test samples")


Loaded 74999 test samples


In [8]:
class PredictionDataset(Dataset):
    def __init__(self, df, image_folder):
        self.df = df
        self.image_folder = image_folder
        self.image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
        self.text_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.image_folder, row['image'])
        try:
            img = Image.open(image_path).convert('RGB')
        except FileNotFoundError:
            print(f"‚ö†Ô∏è Image not found: {image_path}, using blank placeholder")
            img = Image.new('RGB', (224, 224), (0, 0, 0))

        text = row['catalog_content_clean']
        return img, text, row['sample_id']


In [9]:
test_dataset = PredictionDataset(test_df, IMAGE_FOLDER)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [10]:
predictions = []

with torch.no_grad():
    for images, texts, ids in tqdm(test_loader, desc="Predicting"):
        # Image preprocessing (inside forward call)
        pred, _, _ = model(images, texts)
        preds = pred.cpu().numpy()
        
        for i, sid in enumerate(ids):
            predictions.append({"sample_id": sid, "predicted_price": float(preds[i])})


Predicting:   0%|          | 0/9375 [00:00<?, ?it/s]


TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'PIL.Image.Image'>

In [None]:
pred_df = pd.DataFrame(predictions)
output_path = "/content/train_images/test_predictions.csv"
pred_df.to_csv(output_path, index=False)
print(f"‚úÖ Predictions saved to {output_path}")
print(pred_df.head())


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import ViTImageProcessor, ViTModel, BertTokenizer, BertModel, DistilBertModel
import os
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from PIL import Image

# --- Multimodal Model ---
class MultimodalModel(nn.Module):
    def __init__(self, fine_tune_encoders=True):
        super(MultimodalModel, self).__init__()
        self.image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
        self.image_model = ViTModel.from_pretrained('google/vit-base-patch16-224')
        self.text_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.text_model = BertModel.from_pretrained('bert-base-uncased')
        self.text_proj = nn.Linear(768, 512)
        self.image_proj = nn.Linear(768, 512)
        self.concat_proj = nn.Linear(1024, 768)
        self.transformer = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.regression = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.ReLU()
        )
        if not fine_tune_encoders:
            for param in self.image_model.parameters():
                param.requires_grad = False
            for param in self.text_model.parameters():
                param.requires_grad = False
        print("Encoders fine-tuning:", fine_tune_encoders)

    def forward(self, images, texts):
        inputs = self.image_processor(images=images, return_tensors="pt").to(device)
        image_outputs = self.image_model(**inputs).last_hidden_state.mean(dim=1)
        image_proj = F.normalize(self.image_proj(image_outputs), dim=-1)
        inputs = self.text_tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
        text_outputs = self.text_model(**inputs).last_hidden_state.mean(dim=1)
        text_proj = F.normalize(self.text_proj(text_outputs), dim=-1)
        concat_emb = torch.cat([text_proj, image_proj], dim=-1)
        concat_emb = self.concat_proj(concat_emb).unsqueeze(1)
        transformer_out = self.transformer(inputs_embeds=concat_emb).last_hidden_state.mean(dim=1)
        price = self.regression(transformer_out)
        return price.squeeze(), text_proj, image_proj

# --- Custom Dataset and Collate Function ---
def custom_collate_fn(batch):
    images, texts, prices, sample_ids, image_paths = zip(*batch)
    return list(images), list(texts), torch.tensor(prices, dtype=torch.float32), list(sample_ids), list(image_paths)

class ProductDataset(Dataset):
    def __init__(self, df, image_folder):
        self.df = df
        self.image_folder = image_folder
        self.transform = None

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.image_folder, row['image'])
        try:
            img = Image.open(image_path).convert('RGB')
        except FileNotFoundError:
            print(f"Image not found at {image_path}, using placeholder.")
            img = Image.new('RGB', (224, 224), (0, 0, 0))
        text = row['catalog_content_clean']
        price = float(row['price'])
        sample_id = row['sample_id']
        return img, text, price, sample_id, image_path

# --- Function to Predict and Save Prices ---
def predict_and_save_prices(model, data_loader, save_dir, csv_filename='predictions.csv'):
    model.eval()
    os.makedirs(save_dir, exist_ok=True)
    all_sample_ids = []
    all_predicted_prices = []
    
    with torch.no_grad():
        for batch_images, batch_texts, _, batch_sample_ids, _ in tqdm(data_loader, desc="Predicting prices"):
            try:
                # Predict prices
                pred_price, _, _ = model(batch_images, batch_texts)
                pred_price = pred_price.cpu().numpy()
                
                # Handle scalar or array outputs
                if pred_price.ndim == 0:
                    pred_price = [pred_price.item()]
                else:
                    pred_price = pred_price.tolist()
                
                # Store results
                all_sample_ids.extend(batch_sample_ids)
                all_predicted_prices.extend(pred_price)
            except Exception as e:
                print(f"Error processing batch: {e}")
                continue
    
    # Create DataFrame and save to CSV
    results = pd.DataFrame({
        'sample_id': all_sample_ids,
        'predicted_price': all_predicted_prices
    })
    output_path = os.path.join(save_dir, csv_filename)
    results.to_csv(output_path, index=False)
    print(f"‚úÖ Predictions saved to {output_path}")
    print(f"Total samples predicted: {len(results)}")
    return results

# --- Main Script ---
device = torch.device('cuda')

# Initialize and load model
MODEL_WEIGHTS_PATH = 'multimodal_model_epoch_6.pth'
model = MultimodalModel(fine_tune_encoders=False).to(device)
try:
    # Load with strict=False to handle potential weight mismatches
    state_dict = torch.load(MODEL_WEIGHTS_PATH, map_location=device)
    model.load_state_dict(state_dict, strict=False)
    print(f"Loaded weights from {MODEL_WEIGHTS_PATH} (strict=False)")
except Exception as e:
    print(f"Error loading weights: {e}")
    exit()

# Load data
IMAGE_FOLDER = 'test_images/test_images'
CSV_PATH = 'test/merged_test_with_image.csv'
try:
    df = pd.read_csv(CSV_PATH)
    print(f"Loaded {len(df)} rows")
    print("Columns in CSV:", df.columns.tolist())
except Exception as e:
    print(f"Error loading CSV: {e}")
    exit()

# Create DataLoader
try:
    dataset = ProductDataset(df, IMAGE_FOLDER)
    data_loader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=0, pin_memory=True, collate_fn=custom_collate_fn)
    print("DataLoader ready with dataset structure.")
    for batch_images, batch_texts, batch_prices, batch_sample_ids, batch_image_paths in data_loader:
        print(f"Batch loaded: {len(batch_images)} images, {len(batch_texts)} texts, {batch_prices.shape} prices")
        print(f"Image type: {type(batch_images[0])}, Text type: {type(batch_texts[0])}, Price type: {type(batch_prices)}")
        break
except Exception as e:
    print(f"Error creating DataLoader: {e}")
    exit()

# Predict and save prices
PREDICTIONS_DIR = './predictions/'
predict_and_save_prices(model, data_loader, PREDICTIONS_DIR, csv_filename='predictions.csv')

'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /google/vit-base-patch16-224/resolve/main/preprocessor_config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001CF7BB8B910>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 6e00979a-0b48-43b6-808d-df06768b9f88)')' thrown while requesting HEAD https://huggingface.co/google/vit-base-patch16-224/resolve/main/preprocessor_config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /google/vit-base-patch16-224/resolve/main/preprocessor_config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001CF7BB8BD30>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 11d04167-fd6f-4d62-bcd2-e05ee3701d3a)')' thrown while requesting HEAD https://huggingface

Encoders fine-tuning: False


  state_dict = torch.load(MODEL_WEIGHTS_PATH, map_location=device)


Loaded weights from multimodal_model_epoch_6.pth (strict=False)
Loaded 74999 rows
Columns in CSV: ['sample_id', 'product_name', 'catalog_content_clean', 'value', 'unit', 'image']
DataLoader ready with dataset structure.
Error creating DataLoader: 'price'


Predicting prices:   0%|          | 0/4688 [00:00<?, ?it/s]


KeyError: 'price'

: 

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import ViTImageProcessor, ViTModel, BertTokenizer, BertModel, DistilBertModel
import os
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from PIL import Image

# --- Multimodal Model ---
class MultimodalModel(nn.Module):
    def __init__(self, fine_tune_encoders=True):
        super(MultimodalModel, self).__init__()
        self.image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
        self.image_model = ViTModel.from_pretrained('google/vit-base-patch16-224')
        self.text_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.text_model = BertModel.from_pretrained('bert-base-uncased')
        self.text_proj = nn.Linear(768, 512)
        self.image_proj = nn.Linear(768, 512)
        self.concat_proj = nn.Linear(1024, 768)
        self.transformer = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.regression = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.ReLU()
        )
        if not fine_tune_encoders:
            for param in self.image_model.parameters():
                param.requires_grad = False
            for param in self.text_model.parameters():
                param.requires_grad = False
        print("Encoders fine-tuning:", fine_tune_encoders)

    def forward(self, images, texts):
        inputs = self.image_processor(images=images, return_tensors="pt").to(device)
        image_outputs = self.image_model(**inputs).last_hidden_state.mean(dim=1)
        image_proj = F.normalize(self.image_proj(image_outputs), dim=-1)
        inputs = self.text_tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
        text_outputs = self.text_model(**inputs).last_hidden_state.mean(dim=1)
        text_proj = F.normalize(self.text_proj(text_outputs), dim=-1)
        concat_emb = torch.cat([text_proj, image_proj], dim=-1)
        concat_emb = self.concat_proj(concat_emb).unsqueeze(1)
        transformer_out = self.transformer(inputs_embeds=concat_emb).last_hidden_state.mean(dim=1)
        price = self.regression(transformer_out)
        return price.squeeze(), text_proj, image_proj

# --- Custom Dataset and Collate Function ---
def custom_collate_fn(batch):
    images, texts, sample_ids, image_paths = zip(*batch)
    return list(images), list(texts), list(sample_ids), list(image_paths)

class ProductDataset(Dataset):
    def __init__(self, df, image_folder):
        self.df = df
        self.image_folder = image_folder
        self.transform = None

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = os.path.join(self.image_folder, row['image'])
        try:
            img = Image.open(image_path).convert('RGB')
        except FileNotFoundError:
            print(f"Image not found at {image_path}, using placeholder.")
            img = Image.new('RGB', (224, 224), (0, 0, 0))
        text = row['catalog_content_clean']
        sample_id = row['sample_id']
        return img, text, sample_id, image_path

# --- Function to Predict and Save Prices ---
def predict_and_save_prices(model, data_loader, save_dir, csv_filename='predictions.csv'):
    model.eval()
    os.makedirs(save_dir, exist_ok=True)
    all_sample_ids = []
    all_predicted_prices = []
    
    with torch.no_grad():
        for batch_images, batch_texts, batch_sample_ids, _ in tqdm(data_loader, desc="Predicting prices"):
            try:
                pred_price, _, _ = model(batch_images, batch_texts)
                pred_price = pred_price.cpu().numpy()
                
                if pred_price.ndim == 0:
                    pred_price = [pred_price.item()]
                else:
                    pred_price = pred_price.tolist()
                
                all_sample_ids.extend(batch_sample_ids)
                all_predicted_prices.extend(pred_price)
            except Exception as e:
                print(f"Error processing batch: {e}")
                continue
    
    results = pd.DataFrame({
        'sample_id': all_sample_ids,
        'predicted_price': all_predicted_prices
    })
    output_path = os.path.join(save_dir, csv_filename)
    results.to_csv(output_path, index=False)
    print(f"‚úÖ Predictions saved to {output_path}")
    print(f"Total samples predicted: {len(results)}")
    return results

# --- Main Script ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize and load model
MODEL_WEIGHTS_PATH = 'multimodal_model_epoch_6.pth'
model = MultimodalModel(fine_tune_encoders=False).to(device)
try:
    state_dict = torch.load(MODEL_WEIGHTS_PATH, map_location=device)
    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
    print(f"Loaded weights from {MODEL_WEIGHTS_PATH} (strict=False)")
    if missing_keys:
        print(f"Missing keys: {missing_keys}")
    if unexpected_keys:
        print(f"Unexpected keys: {unexpected_keys}")
except Exception as e:
    print(f"Error loading weights: {e}")
    exit()

# Load data
IMAGE_FOLDER = 'test_images/test_images'
CSV_PATH = 'test/merged_test_with_image.csv'  # Update to your test CSV path
try:
    df = pd.read_csv(CSV_PATH)
    print(f"Loaded {len(df)} rows")
    print("Columns in CSV:", df.columns.tolist())
except Exception as e:
    print(f"Error loading CSV: {e}")
    exit()

# Create DataLoader
try:
    dataset = ProductDataset(df, IMAGE_FOLDER)
    data_loader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=0, pin_memory=True, collate_fn=custom_collate_fn)
    print("DataLoader ready with dataset structure.")
    for batch_images, batch_texts, batch_sample_ids, batch_image_paths in data_loader:
        print(f"Batch loaded: {len(batch_images)} images, {len(batch_texts)} texts")
        print(f"Image type: {type(batch_images[0])}, Text type: {type(batch_texts[0])}, Sample ID type: {type(batch_sample_ids[0])}")
        break
except Exception as e:
    print(f"Error creating DataLoader: {e}")
    exit()

# Predict and save prices
PREDICTIONS_DIR = './predictions/'
predict_and_save_prices(model, data_loader, PREDICTIONS_DIR, csv_filename='predictions.csv')

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Encoders fine-tuning: False


  state_dict = torch.load(MODEL_WEIGHTS_PATH, map_location=device)


Loaded weights from multimodal_model_epoch_6.pth (strict=False)
Loaded 74999 rows
Columns in CSV: ['sample_id', 'product_name', 'catalog_content_clean', 'value', 'unit', 'image']
DataLoader ready with dataset structure.
Batch loaded: 16 images, 16 texts
Image type: <class 'PIL.Image.Image'>, Text type: <class 'str'>, Sample ID type: <class 'numpy.int64'>


Predicting prices: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4688/4688 [1:34:01<00:00,  1.20s/it] 


‚úÖ Predictions saved to ./predictions/predictions.csv
Total samples predicted: 74999


Unnamed: 0,sample_id,predicted_price
0,100179,23.529806
1,245611,16.727566
2,146263,37.904911
3,95658,20.636444
4,36806,28.833317
...,...,...
74994,93616,12.656566
74995,249434,14.198102
74996,162217,9.435763
74997,230487,27.267073
