In [14]:
import pandas as pd



In [15]:
df  = pd.read_csv("FullData_train_with_image_name.xlsx - train_with_image_name.csv")

In [16]:
df.shape

(75000, 13)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   sample_id              75000 non-null  int64  
 1   catalog_content        75000 non-null  object 
 2   brand_name             69729 non-null  object 
 3   descriptions           32533 non-null  object 
 4   benefits               60720 non-null  object 
 5   values                 75000 non-null  object 
 6   units                  74059 non-null  object 
 7   pack_of                19707 non-null  float64
 8   item_name              74994 non-null  object 
 9   pack_of.1              26677 non-null  float64
 10  image_link             75000 non-null  object 
 11  price                  75000 non-null  float64
 12  downloaded_image_name  75000 non-null  object 
dtypes: float64(3), int64(1), object(9)
memory usage: 7.4+ MB


In [None]:
# --- 1. Import Libraries ---
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.model_selection import train_test_split
from PIL import Image
import pandas as pd
import numpy as np
import open_clip
import os
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from torchvision import transforms
import warnings

# Suppress a known, harmless warning. The second argument is a regular
# expression matched against the start of the warning message.
warnings.filterwarnings("ignore", "Passing `palette` without `max_colors`")

# --- 2. Configuration & Setup ---
class Config:
    """Centralized hyperparameters and runtime settings.

    Every setting is a class attribute, so the class itself (not an instance)
    is passed around and read as ``Config.NAME`` / ``config.NAME``.
    """
    # Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    # open_clip architecture name and pretrained-weights tag.
    MODEL_NAME = 'ViT-B-32'
    PRETRAINED_CHECKPOINT = 'laion2b_s34b_b79k'
    BATCH_SIZE = 64
    EPOCHS = 5
    # DataLoader worker processes. predict_from_df reads Config.NUM_WORKERS;
    # 0 matches the value the training DataLoaders pass explicitly.
    NUM_WORKERS = 0

    # Differential learning rates for fine-tuning
    CLIP_LR = 1e-6
    HEAD_LR = 1e-4
    WEIGHT_DECAY = 0.01

    # Early-stopping parameters
    PATIENCE = 3
    MIN_DELTA = 0.001

    # Paths (IMPORTANT: update this path if needed)
    IMAGE_DIR = 'output_folder'

# Report the selected device, then fail fast when the image directory is
# missing (the dataset class would otherwise just skip every unreadable image).
print(f"Using device: {Config.DEVICE}")
if not os.path.exists(Config.IMAGE_DIR):
    raise FileNotFoundError(f"Image directory not found: {Config.IMAGE_DIR}. Please check the path.")

# --- 3. Data Loading and Enhanced Feature Engineering ---

# =========================================================================
# IMPORTANT: the DataFrame `df` must already be loaded before this cell runs.
# Example: df = pd.read_csv('path/to/your/data.csv')
# =========================================================================
# Nothing is loaded or generated here: the guard below only verifies that a
# variable named `df` exists and raises NameError otherwise.
if 'df' not in locals():
     raise NameError("DataFrame 'df' is not defined. Please load your data where indicated.")

# Show the available column names to make schema problems easy to spot.
print(f"Available columns in your DataFrame: {df.columns.tolist()}")


def create_comprehensive_description(row):
    """Combine several text fields of one product into a single description for CLIP.

    Fields are appended in a fixed order (brand, item name, description,
    optional attribute columns, benefits); missing or NaN fields are skipped.

    Args:
        row: A mapping-like record (e.g. a ``pandas.Series`` from
            ``DataFrame.apply(..., axis=1)`` or a ``dict``) supporting
            ``.get``, ``in`` and ``[]``.

    Returns:
        str: The joined text with runs of spaces collapsed to one space and
        surrounding whitespace stripped; ``""`` when no field is present.
    """
    def _present(key):
        # `.get` returns None for an absent key, which pd.notna treats as missing.
        return pd.notna(row.get(key))

    parts = []
    # Brand, item name, and general description give the richest context.
    if _present('brand_name'):
        parts.append(str(row['brand_name']))
    if _present('item_name'):
        parts.append(str(row['item_name']))
    if _present('descriptions'):
        parts.append(f"Description: {row['descriptions']}")

    # Optional attribute columns: used when present, silently ignored otherwise.
    attributes = [
        str(row[col])
        for col in ('flavours_only', 'sizes_only', 'product_attributes')
        if col in row and pd.notna(row[col])
    ]
    if attributes:
        parts.append(f"Features: {', '.join(attributes)}.")

    if _present('benefits'):
        parts.append(f"Benefits: {row['benefits']}")

    text = " ".join(parts)
    # One str.replace pass turns "a   b" into "a  b"; repeat until no double
    # space remains so every run of spaces collapses to a single space.
    while "  " in text:
        text = text.replace("  ", " ")
    return text.strip()

# --- Preprocessing for Numerical and Categorical Features ---
# NOTE: this cell rebinds/mutates the global `df` (column names, 'values',
# 'pack_of', one-hot 'unit_*' columns, 'description').

# Strip leading/trailing whitespace from all column names.
df.columns = df.columns.str.strip()

# Print the cleaned column names to verify.
print("Cleaned column names:", df.columns.tolist())

# Convert 'values' from object to a numeric dtype.
# errors='coerce' turns any non-numeric entry (e.g. text) into NaN.
df['values'] = pd.to_numeric(df['values'], errors='coerce')

# Handle missing values before splitting.
# For 'pack_of', NaN is assumed to mean a single item.
df['pack_of'] = df['pack_of'].fillna(1.0).astype(float)

# --- Re-run-safe 'units' processing ---
# get_dummies removes the 'units' column, so on a second run of this cell the
# column is absent and the else-branch is taken instead of failing.
if 'units' in df.columns:
    print("Processing 'units' column (one-hot encoding)...")
    # Fill NaN with 'unknown' so missing units become their own category.
    df['units'] = df['units'].fillna('unknown').astype(str)
    # Replace 'units' with one-hot float columns named 'unit_<value>'.
    df = pd.get_dummies(df, columns=['units'], prefix='unit', dtype=float)
else:
    print("'units' column already processed. Skipping one-hot encoding.")

# Build the combined text description for every row.
df['description'] = df.apply(create_comprehensive_description, axis=1)

# Name of the numeric value column. NOTE(review): this existence check runs
# after df['values'] has already been accessed above, so a missing column
# would fail earlier than this KeyError.
VALUE_COL = 'values'
if VALUE_COL not in df.columns:
    raise KeyError(f"Column '{VALUE_COL}' not found in DataFrame. Please check the column name.")


# Columns used downstream. The order of numerical_cols matters: the dataset
# class treats its first entry as the value column.
unit_cols = [col for col in df.columns if col.startswith('unit_')]
numerical_cols = [VALUE_COL, 'pack_of'] + unit_cols # value column first, then pack_of, then one-hot units
feature_cols = ['description', 'price', 'downloaded_image_name'] + numerical_cols

df_processed = df[feature_cols].copy()
# Resolve each image file name against the configured image directory.
df_processed['image_path'] = df_processed['downloaded_image_name'].apply(lambda x: os.path.join(Config.IMAGE_DIR, str(x)))

# Drop rows lacking a price, a description, or a numeric value (including
# values that became NaN during the numeric conversion), then drop rows whose
# description is the empty string.
df_processed.dropna(subset=['price', 'description', VALUE_COL], inplace=True)
df_processed = df_processed[df_processed['description'] != '']

print(f"‚úÖ Created features. Total samples: {len(df_processed)}. Numerical/Categorical features: {numerical_cols}")

# 80/20 split with a fixed seed for reproducibility.
train_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=42)
print(f"Data split into {len(train_df)} training and {len(test_df)} testing samples.")


# --- 4. Dataset Class and DataLoaders ---

class ProductPriceDataset(Dataset):
    """PyTorch Dataset yielding image, tokenized text and numerical features per product.

    Args:
        df (pd.DataFrame): Must contain 'image_path', 'description', 'pack_of',
            every column in ``numerical_cols`` and, unless ``is_predict`` is
            True, 'price'. A copy is stored, so the caller's frame is untouched.
        image_transform (callable): Applied to the RGB PIL image.
        tokenizer (callable): Applied to the description string; element 0 of
            its result is used (assumes a batch-of-one return — TODO confirm
            for tokenizers other than open_clip's).
        numerical_cols (list[str]): Feature column names. The first entry is
            treated as the value column; entries starting with 'unit_' are the
            one-hot unit columns.
        stats (dict): ``{column: {'mean': ..., 'std': ...}}`` for the value
            column and 'pack_of', used for z-score normalization.
        is_predict (bool): When True the 'price' column is not read and the
            returned sample has no 'log_price' key. Defaults to False.
    """
    def __init__(self, df, image_transform, tokenizer, numerical_cols, stats, is_predict=False):
        self.df = df.copy()  # work on a copy so adding columns never touches the caller's frame
        self.image_transform = image_transform
        self.tokenizer = tokenizer
        self.numerical_cols = numerical_cols
        self.stats = stats
        self.is_predict = is_predict
        self.value_col_name = numerical_cols[0]  # the value column is assumed to come first
        # Resolved once here instead of on every __getitem__ call.
        self.unit_cols = [c for c in numerical_cols if c.startswith('unit_')]

        # Pre-compute normalized columns once; the epsilon guards against std == 0.
        for col in [self.value_col_name, 'pack_of']:
            mean = self.stats[col]['mean']
            std = self.stats[col]['std']
            self.df[f'{col}_normalized'] = (self.df[col] - mean) / (std + 1e-6)

    def __len__(self):
        """Return the number of rows in the stored frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample at position ``idx``, or None if its image cannot be loaded.

        Returns:
            dict | None: Keys 'image', 'text', 'numerical' and (training mode
            only) 'log_price' = log1p(price). None signals a missing or
            corrupt image and is expected to be filtered out by the collate
            function.
        """
        row = self.df.iloc[idx]
        try:
            # The context manager closes the underlying file handle promptly.
            with Image.open(row['image_path']) as img:
                image = self.image_transform(img.convert('RGB'))
        except OSError:
            # FileNotFoundError is a subclass of OSError, so missing and
            # unreadable files are both covered here.
            return None

        text = self.tokenizer(row['description'])[0]

        # One flat vector: [normalized value, normalized pack_of, one-hot units...]
        norm_value = torch.tensor(row[f'{self.value_col_name}_normalized'], dtype=torch.float32)
        norm_pack_of = torch.tensor(row['pack_of_normalized'], dtype=torch.float32)
        unit_features = torch.tensor(row[self.unit_cols].values.astype(np.float32))
        numerical_feats = torch.cat([norm_value.unsqueeze(0), norm_pack_of.unsqueeze(0), unit_features])

        sample = {'image': image, 'text': text, 'numerical': numerical_feats}
        if not self.is_predict:
            # The training target is the log-transformed price.
            sample['log_price'] = torch.tensor(np.log1p(row['price']), dtype=torch.float32)
        return sample

def collate_fn(batch):
    """Collate a batch after discarding ``None`` samples.

    Returns the default-collated batch, or ``None`` when no sample survives
    the filtering (including an empty input list).
    """
    valid_samples = [sample for sample in batch if sample is not None]
    if not valid_samples:
        return None
    return torch.utils.data.dataloader.default_collate(valid_samples)


# --- 5. Model, Training, and Evaluation ---

class CLIPPricePredictor(nn.Module):
    """Regression model: CLIP image/text embeddings plus numerical features -> one scalar.

    Args:
        clip_model: Object exposing ``text_projection`` (its ``shape[1]`` is
            taken as the embedding width), ``encode_image`` and ``encode_text``.
        numerical_feature_size (int): Length of the numerical feature vector.
        dropout_rate (float): Dropout probability inside the regression head.
    """
    def __init__(self, clip_model, numerical_feature_size, dropout_rate=0.4):
        super().__init__()
        self.clip = clip_model
        embed_width = self.clip.text_projection.shape[1]
        # Head input = image embedding + text embedding + numerical vector.
        head_in_features = 2 * embed_width + numerical_feature_size

        head_layers = [
            nn.Linear(head_in_features, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 1),
        ]
        self.regression_head = nn.Sequential(*head_layers)

    def forward(self, image, text, numerical_feats):
        """Return one prediction per sample (trailing singleton dimension removed)."""
        # The encoders always run without gradient tracking, so only the
        # regression head can receive gradients.
        with torch.no_grad():
            img_emb = self.clip.encode_image(image)
            txt_emb = self.clip.encode_text(text)

        # Scale each embedding to unit L2 norm along the feature axis.
        img_unit = img_emb / img_emb.norm(dim=-1, keepdim=True)
        txt_unit = txt_emb / txt_emb.norm(dim=-1, keepdim=True)

        fused = torch.cat((img_unit, txt_unit, numerical_feats), dim=1)
        prediction = self.regression_head(fused)
        return prediction.squeeze(-1)

class EarlyStopper:
    """Tracks the best validation loss, checkpoints on improvement, signals when to stop.

    A loss strictly below the best seen so far resets the counter and saves
    the model to 'best_model.pth'. A loss above ``best + min_delta`` increments
    the counter. Anything in between changes nothing.
    """
    def __init__(self, patience=5, min_delta=0):
        self.patience, self.min_delta = patience, min_delta
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss, model):
        """Record ``validation_loss``; return True once patience is exhausted."""
        best_so_far = self.min_validation_loss

        if validation_loss < best_so_far:
            # New best: remember it, reset patience, and checkpoint the weights.
            self.min_validation_loss = validation_loss
            self.counter = 0
            print("‚úÖ Validation loss decreased, saving model.")
            torch.save(model.state_dict(), 'best_model.pth')
            return False

        if not validation_loss > (best_so_far + self.min_delta):
            # Within tolerance of the best loss: neither progress nor a strike.
            return False

        self.counter += 1
        if self.counter < self.patience:
            return False
        print("üõë Early stopping triggered!")
        return True

def calculate_smape(preds, targets):
    """Return the Symmetric Mean Absolute Percentage Error as a Python float (in percent).

    Each element contributes |pred - target| / ((|target| + |pred|) / 2 + 1e-8);
    the epsilon keeps the ratio finite when both values are zero.
    """
    abs_error = (preds - targets).abs()
    mean_magnitude = (targets.abs() + preds.abs()) / 2
    per_sample_ratio = abs_error / (mean_magnitude + 1e-8)
    return (per_sample_ratio.mean() * 100).item()

def train_and_evaluate(model, train_loader, val_loader, optimizer, scheduler, loss_fn, config):
    """Run the training/validation loop with early stopping and return the best model.

    Args:
        model: Called as ``model(images, texts, numerical)`` and expected to
            return predicted log-prices.
        train_loader / val_loader: Iterables yielding dicts with keys 'image',
            'text', 'numerical' and 'log_price', or ``None`` for a batch that
            should be skipped.
        optimizer: Optimizer stepped once per training batch.
        scheduler: LR scheduler stepped once per epoch.
        loss_fn: Loss applied to (predicted, true) log-prices.
        config: Object exposing ``EPOCHS``, ``DEVICE``, ``PATIENCE`` and
            ``MIN_DELTA``.

    Returns:
        tuple: ``(history, model)``. ``history`` maps 'train_loss', 'val_loss'
        and 'val_smape' to per-epoch lists. ``model`` has the best checkpoint
        loaded when one was saved during this run.
    """
    history = {'train_loss': [], 'val_loss': [], 'val_smape': []}
    early_stopper = EarlyStopper(patience=config.PATIENCE, min_delta=config.MIN_DELTA)

    def _to_device(batch):
        """Move the four tensors of a collated batch to the configured device."""
        return (
            batch['image'].to(config.DEVICE),
            batch['text'].to(config.DEVICE),
            batch['numerical'].to(config.DEVICE),
            batch['log_price'].to(config.DEVICE),
        )

    def _mean(total, count):
        """Average over the batches actually processed; NaN when there were none."""
        return total / count if count else float('nan')

    for epoch in range(config.EPOCHS):
        # ---- Training ----
        model.train()
        total_train_loss, train_batches = 0.0, 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.EPOCHS} [Training]")

        for batch in progress_bar:
            if batch is None:
                continue  # every sample in this batch failed to load

            images, texts, numerical, log_prices = _to_device(batch)

            optimizer.zero_grad()
            pred_log_prices = model(images, texts, numerical)
            loss = loss_fn(pred_log_prices, log_prices)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            train_batches += 1
            progress_bar.set_postfix({'log_huber_loss': f'{loss.item():.4f}'})

        # Divide by the number of batches that contributed, not len(loader):
        # skipped (None) batches must not dilute the average.
        avg_train_loss = _mean(total_train_loss, train_batches)
        history['train_loss'].append(avg_train_loss)

        # ---- Validation ----
        model.eval()
        total_val_loss, total_val_smape, val_batches = 0.0, 0.0, 0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{config.EPOCHS} [Validation]"):
                if batch is None:
                    continue

                images, texts, numerical, log_prices = _to_device(batch)

                pred_log_prices = model(images, texts, numerical)
                total_val_loss += loss_fn(pred_log_prices, log_prices).item()

                # SMAPE is computed on actual prices, so invert the log1p transform.
                true_prices = torch.expm1(log_prices)
                pred_prices = torch.expm1(pred_log_prices)
                total_val_smape += calculate_smape(pred_prices, true_prices)
                val_batches += 1

        avg_val_loss = _mean(total_val_loss, val_batches)
        avg_val_smape = _mean(total_val_smape, val_batches)
        history['val_loss'].append(avg_val_loss)
        history['val_smape'].append(avg_val_smape)

        print(f"Epoch {epoch+1}/{config.EPOCHS} -> Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val SMAPE: {avg_val_smape:.2f}% | LR: {scheduler.get_last_lr()[0]:.2e}")

        scheduler.step()
        if early_stopper.early_stop(avg_val_loss, model):
            break

    # Restore the checkpoint only if the early stopper recorded an improvement
    # in THIS run; otherwise 'best_model.pth' is either missing or left over
    # from an earlier run and must not be loaded.
    if np.isfinite(early_stopper.min_validation_loss):
        print("Loading best model state from training.")
        model.load_state_dict(torch.load('best_model.pth', map_location=config.DEVICE))
    else:
        print("No checkpoint was saved during this run; returning the model as trained.")
    return history, model


# --- 6. Main Execution Block ---
if __name__ == '__main__':
    # Initialize the CLIP model, its tokenizer, and the base image preprocessing.
    clip_model, _, base_preprocess = open_clip.create_model_and_transforms(
        Config.MODEL_NAME, pretrained=Config.PRETRAINED_CHECKPOINT, device=Config.DEVICE
    )
    tokenizer = open_clip.get_tokenizer(Config.MODEL_NAME)

    # Training-only augmentation, applied before the base CLIP preprocessing.
    train_preprocess = transforms.Compose([
        transforms.RandomResizedCrop(224, scale=(0.8, 1.0), ratio=(0.75, 1.33)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ColorJitter(brightness=0.2, contrast=0.2),
        base_preprocess,
    ])

    # Normalization statistics come from the training split only (no leakage
    # from the hold-out split); the same stats are reused for both datasets.
    stats = {
        VALUE_COL: {'mean': train_df[VALUE_COL].mean(), 'std': train_df[VALUE_COL].std()},
        'pack_of': {'mean': train_df['pack_of'].mean(), 'std': train_df['pack_of'].std()}
    }

    train_dataset = ProductPriceDataset(train_df, train_preprocess, tokenizer, numerical_cols, stats)
    test_dataset = ProductPriceDataset(test_df, base_preprocess, tokenizer, numerical_cols, stats)

    # collate_fn drops samples whose image could not be loaded.
    train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=Config.BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=collate_fn)

    # The regression head's input width depends on the number of numerical features.
    NUMERICAL_FEATURE_SIZE = len(numerical_cols)
    model = CLIPPricePredictor(clip_model, numerical_feature_size=NUMERICAL_FEATURE_SIZE).to(Config.DEVICE)

    # Freeze every CLIP parameter.
    for param in model.clip.parameters():
        param.requires_grad = False

    # Two parameter groups with different learning rates.
    # NOTE(review): the CLIP parameters were frozen just above and the model's
    # forward() runs the encoders under torch.no_grad(), so the CLIP_LR group
    # is not expected to change any weights; effectively only the head trains.
    optimizer = torch.optim.AdamW([
        {'params': model.clip.parameters(), 'lr': Config.CLIP_LR},
        {'params': model.regression_head.parameters(), 'lr': Config.HEAD_LR}
    ], weight_decay=Config.WEIGHT_DECAY)

    # Huber loss on log-prices; chosen for robustness to price outliers.
    loss_fn = nn.HuberLoss()
    scheduler = CosineAnnealingLR(optimizer, T_max=Config.EPOCHS, eta_min=1e-8)

    # NOTE(review): test_loader is passed as the validation loader, so early
    # stopping and checkpoint selection use the same 20% hold-out split that
    # is called "test" above — there is no separate untouched test set here.
    training_history, best_model = train_and_evaluate(model, train_loader, test_loader, optimizer, scheduler, loss_fn, Config)

    # --- 7. Plotting Results ---
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
    # Number of epochs actually run (may be fewer than EPOCHS after early stopping).
    epochs_ran = len(training_history['train_loss'])

    # The x-axis is zero-based: epoch 1 is plotted at x = 0.
    ax1.plot(range(epochs_ran), training_history['train_loss'], label='Training Loss (Huber)', marker='o')
    ax1.plot(range(epochs_ran), training_history['val_loss'], label='Validation Loss (Huber)', marker='o')
    ax1.set_title('Training and Validation Loss', fontsize=14)
    ax1.set_xlabel('Epochs'); ax1.set_ylabel('Loss'); ax1.legend(); ax1.grid(True)

    ax2.plot(range(epochs_ran), training_history['val_smape'], label='Validation SMAPE', color='orange', marker='o')
    ax2.set_title('Validation SMAPE', fontsize=14)
    ax2.set_xlabel('Epochs'); ax2.set_ylabel('SMAPE (%)'); ax2.legend(); ax2.grid(True)

    plt.tight_layout()
    plt.show()

In [9]:
df.shape

(75000, 13)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   sample_id              75000 non-null  int64  
 1   catalog_content        75000 non-null  object 
 2   brand_name             69729 non-null  object 
 3   descriptions           32533 non-null  object 
 4   benefits               60720 non-null  object 
 5   values                 75000 non-null  object 
 6   units                  74059 non-null  object 
 7   pack_of                19707 non-null  float64
 8   item_name              74994 non-null  object 
 9   pack_of.1              26677 non-null  float64
 10  image_link             75000 non-null  object 
 11  price                  75000 non-null  float64
 12  downloaded_image_name  75000 non-null  object 
dtypes: float64(3), int64(1), object(9)
memory usage: 7.4+ MB


In [5]:
df = df.head(4000)

In [18]:
df.shape

(75000, 13)

# FINAL CODE FOR 75K DATA TRAINING 

In [8]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.model_selection import train_test_split
from PIL import Image
import pandas as pd
import numpy as np
import open_clip
import os
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from torchvision import transforms
import warnings
import json # Added for

In [11]:
df.shape

(75000, 3)

In [8]:
df = df.head(400)

In [9]:
df.shape

(400, 13)

# FOR TEST

In [12]:

# =========================================================================================
# --- üöÄ NEW: Additional Functions for Saving and Prediction üöÄ ---
# =========================================================================================

def save_finetuned_model(model, stats, numerical_cols, save_path):
    """Write everything needed for inference into a single file via ``torch.save``.

    The saved dictionary holds the model weights, the normalization
    statistics, the numerical column names and the CLIP model identifiers
    taken from the global ``Config``.

    Args:
        model (nn.Module): The trained model object.
        stats (dict): Mean/std per normalized numerical column.
        numerical_cols (list): Numerical column names used for training.
        save_path (str): Destination path of the model package.
    """
    # Assembled key by key, in the order the loader reads them back.
    model_package = {}
    model_package['model_state_dict'] = model.state_dict()
    model_package['stats'] = stats
    model_package['numerical_cols'] = numerical_cols
    model_package['config'] = {
        'MODEL_NAME': Config.MODEL_NAME,
        'PRETRAINED_CHECKPOINT': Config.PRETRAINED_CHECKPOINT,
    }

    torch.save(model_package, save_path)
    print(f"‚úÖ Model package saved successfully to: {save_path}")

class _IndexedSamples(Dataset):
    """Wrap a dataset so every non-None sample dict also carries its position as 'row_index'."""
    def __init__(self, base_dataset):
        self.base_dataset = base_dataset

    def __len__(self):
        return len(self.base_dataset)

    def __getitem__(self, idx):
        sample = self.base_dataset[idx]
        if sample is None:
            return None
        return {**sample, 'row_index': idx}


def predict_from_df(df_to_predict, model_path, output_csv_path="predictions.csv"):
    """Predict prices for a DataFrame with a saved model package and write them to CSV.

    Args:
        df_to_predict (pd.DataFrame): DataFrame with the same structure as the training data.
        model_path (str): Path to the saved model package (.pth file).
        output_csv_path (str): Path to save the final CSV with predictions.

    Returns:
        str: The path to the generated predictions CSV file.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.

    Notes:
        * Rebinds the module-level ``unit_cols`` as a side effect.
        * Rows dropped during preprocessing, and samples the dataset returns
          ``None`` for, are absent from the output CSV.
    """
    print("--- Starting Prediction Process ---")

    # --- 1. Load Model and Configuration ---
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at: {model_path}")

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # SECURITY: weights_only=False unpickles arbitrary Python objects, which
    # can execute code. Only load model packages from a trusted source.
    model_package = torch.load(model_path, map_location=device, weights_only=False)

    model_config = model_package['config']
    stats = model_package['stats']
    numerical_cols_loaded = model_package['numerical_cols']

    # Rebuild the base CLIP model and preprocessing named in the package.
    clip_model, _, base_preprocess = open_clip.create_model_and_transforms(
        model_config['MODEL_NAME'], pretrained=model_config['PRETRAINED_CHECKPOINT'], device=device
    )
    tokenizer = open_clip.get_tokenizer(model_config['MODEL_NAME'])

    # Recreate the architecture with the saved feature width, then load the weights.
    NUMERICAL_FEATURE_SIZE = len(numerical_cols_loaded)
    prediction_model = CLIPPricePredictor(clip_model, numerical_feature_size=NUMERICAL_FEATURE_SIZE).to(device)
    prediction_model.load_state_dict(model_package['model_state_dict'])
    prediction_model.eval()  # inference mode: BatchNorm uses running stats, Dropout is off

    print("‚úÖ Model and configuration loaded.")

    # --- 2. Preprocess the Input DataFrame ---
    # Publish the unit columns saved with the model as the global `unit_cols`.
    # NOTE(review): presumably preprocess_dataframe reads this global to build
    # matching one-hot columns — it is not defined in this part of the notebook.
    global unit_cols
    unit_cols = [col for col in numerical_cols_loaded if col.startswith('unit_')]

    df_pred_processed = preprocess_dataframe(df_to_predict)

    # Drop rows missing essential data; non-in-place so the frame returned by
    # preprocess_dataframe is never mutated behind the caller's back.
    df_pred_processed = df_pred_processed.dropna(subset=['description', 'values'])
    df_pred_processed = df_pred_processed[df_pred_processed['description'] != '']

    # Position i in this list corresponds to dataset index i.
    original_sample_ids = df_pred_processed['sample_id'].tolist()

    print(f"‚úÖ Input data preprocessed. Predicting on {len(df_pred_processed)} samples.")

    # --- 3. Create Dataset and DataLoader ---
    pred_dataset = ProductPriceDataset(
        df_pred_processed,
        base_preprocess,
        tokenizer,
        numerical_cols_loaded,
        stats,
        is_predict=True # Set flag to not look for 'price' column
    )

    pred_loader = DataLoader(
        _IndexedSamples(pred_dataset),  # carry row positions through collation
        batch_size=Config.BATCH_SIZE * 2, # Use a larger batch size for faster inference
        shuffle=False,
        num_workers=Config.NUM_WORKERS,
        collate_fn=collate_fn
    )

    # --- 4. Run Inference ---
    # collate_fn silently drops None samples, so predictions cannot be matched
    # to ids by position alone; pair each prediction with its id via 'row_index'.
    predicted_ids = []
    all_predictions = []
    with torch.no_grad():
        for batch in tqdm(pred_loader, desc="Generating Predictions"):
            if batch is None: continue

            images = batch['image'].to(device)
            texts = batch['text'].to(device)
            numerical = batch['numerical'].to(device)

            pred_log_prices = prediction_model(images, texts, numerical)

            # Convert log prices back to actual prices
            pred_prices = torch.expm1(pred_log_prices).cpu().numpy()
            all_predictions.extend(pred_prices)
            predicted_ids.extend(original_sample_ids[i] for i in batch['row_index'].tolist())

    print("‚úÖ Inference complete.")

    skipped = len(original_sample_ids) - len(predicted_ids)
    if skipped:
        print(f"Warning: {skipped} sample(s) produced no prediction (the dataset returned None for them) and are missing from the output.")

    # --- 5. Format and Save Output ---
    results_df = pd.DataFrame({
        'sample_id': predicted_ids,
        'price': all_predictions
    })

    # Format the price to two decimal places
    results_df['price'] = results_df['price'].round(2)

    results_df.to_csv(output_csv_path, index=False)
    print(f"üéâ Predictions saved successfully to: {output_csv_path}")

    return output_csv_path

# Example usage of the prediction function (do not run this part during training).
# Requires `df` to be loaded already and the model package file to exist;
# predict_from_df raises FileNotFoundError otherwise.
if __name__ == '__main__':
    # ... (after training completes and model is saved) ...
    
    # 1. DataFrame to predict prices for; here the currently loaded `df` is reused.
    # new_data_df = pd.read_csv('path/to/your/new_data.csv')
    new_data_df = df
    
    # 2. Path to the saved model package
    saved_model_path = 'final_clip_price_model.pth'
    
    # 3. Run prediction; the return value is the path of the written CSV.
    prediction_csv_file = predict_from_df(new_data_df, saved_model_path, output_csv_path="my_new_predictions.csv")
    
    print(f"Find your results in: {prediction_csv_file}")

--- Starting Prediction Process ---


‚úÖ Model and configuration loaded.
‚úÖ Input data preprocessed. Predicting on 395 samples.


Generating Predictions:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ Inference complete.
üéâ Predictions saved successfully to: my_new_predictions.csv
Find your results in: my_new_predictions.csv


In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("test.csv")

In [5]:
df.shape

(75000, 3)

In [None]:

# =========================================================================================
# --- üöÄ NEW: Additional Functions for Saving and Prediction üöÄ ---
# =========================================================================================

def save_finetuned_model(model, stats, numerical_cols, save_path):
    """Write everything needed for inference into a single file via ``torch.save``.

    The saved dictionary holds the model weights, the normalization
    statistics, the numerical column names and the CLIP model identifiers
    taken from the global ``Config``.

    Args:
        model (nn.Module): The trained model object.
        stats (dict): Mean/std per normalized numerical column.
        numerical_cols (list): Numerical column names used for training.
        save_path (str): Destination path of the model package.
    """
    # NOTE(review): this function is defined more than once in the notebook;
    # the most recently executed definition is the one in effect.
    model_package = {}
    model_package['model_state_dict'] = model.state_dict()
    model_package['stats'] = stats
    model_package['numerical_cols'] = numerical_cols
    model_package['config'] = {
        'MODEL_NAME': Config.MODEL_NAME,
        'PRETRAINED_CHECKPOINT': Config.PRETRAINED_CHECKPOINT,
    }

    torch.save(model_package, save_path)
    print(f"‚úÖ Model package saved successfully to: {save_path}")

class _IndexedSamples(Dataset):
    """Wrap a dataset so every non-None sample dict also carries its position as 'row_index'."""
    def __init__(self, base_dataset):
        self.base_dataset = base_dataset

    def __len__(self):
        return len(self.base_dataset)

    def __getitem__(self, idx):
        sample = self.base_dataset[idx]
        if sample is None:
            return None
        return {**sample, 'row_index': idx}


def predict_from_df(df_to_predict, model_path, output_csv_path="predictions.csv"):
    """Predict prices for a DataFrame with a saved model package and write them to CSV.

    Args:
        df_to_predict (pd.DataFrame): DataFrame with the same structure as the training data.
        model_path (str): Path to the saved model package (.pth file).
        output_csv_path (str): Path to save the final CSV with predictions.

    Returns:
        str: The path to the generated predictions CSV file.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.

    Notes:
        * Rebinds the module-level ``unit_cols`` as a side effect.
        * Rows dropped during preprocessing, and samples the dataset returns
          ``None`` for, are absent from the output CSV.
        * NOTE(review): this function is defined more than once in the
          notebook; the most recently executed definition is the one in effect.
    """
    print("--- Starting Prediction Process ---")

    # --- 1. Load Model and Configuration ---
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at: {model_path}")

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # SECURITY: weights_only=False unpickles arbitrary Python objects, which
    # can execute code. Only load model packages from a trusted source.
    model_package = torch.load(model_path, map_location=device, weights_only=False)

    model_config = model_package['config']
    stats = model_package['stats']
    numerical_cols_loaded = model_package['numerical_cols']

    # Rebuild the base CLIP model and preprocessing named in the package.
    clip_model, _, base_preprocess = open_clip.create_model_and_transforms(
        model_config['MODEL_NAME'], pretrained=model_config['PRETRAINED_CHECKPOINT'], device=device
    )
    tokenizer = open_clip.get_tokenizer(model_config['MODEL_NAME'])

    # Recreate the architecture with the saved feature width, then load the weights.
    NUMERICAL_FEATURE_SIZE = len(numerical_cols_loaded)
    prediction_model = CLIPPricePredictor(clip_model, numerical_feature_size=NUMERICAL_FEATURE_SIZE).to(device)
    prediction_model.load_state_dict(model_package['model_state_dict'])
    prediction_model.eval()  # inference mode: BatchNorm uses running stats, Dropout is off

    print("‚úÖ Model and configuration loaded.")

    # --- 2. Preprocess the Input DataFrame ---
    # Publish the unit columns saved with the model as the global `unit_cols`.
    # NOTE(review): presumably preprocess_dataframe reads this global to build
    # matching one-hot columns — it is not defined in this part of the notebook.
    global unit_cols
    unit_cols = [col for col in numerical_cols_loaded if col.startswith('unit_')]

    df_pred_processed = preprocess_dataframe(df_to_predict)

    # Drop rows missing essential data; non-in-place so the frame returned by
    # preprocess_dataframe is never mutated behind the caller's back.
    df_pred_processed = df_pred_processed.dropna(subset=['description', 'values'])
    df_pred_processed = df_pred_processed[df_pred_processed['description'] != '']

    # Position i in this list corresponds to dataset index i.
    original_sample_ids = df_pred_processed['sample_id'].tolist()

    print(f"‚úÖ Input data preprocessed. Predicting on {len(df_pred_processed)} samples.")

    # --- 3. Create Dataset and DataLoader ---
    pred_dataset = ProductPriceDataset(
        df_pred_processed,
        base_preprocess,
        tokenizer,
        numerical_cols_loaded,
        stats,
        is_predict=True # Set flag to not look for 'price' column
    )

    pred_loader = DataLoader(
        _IndexedSamples(pred_dataset),  # carry row positions through collation
        batch_size=Config.BATCH_SIZE * 2, # Use a larger batch size for faster inference
        shuffle=False,
        num_workers=Config.NUM_WORKERS,
        collate_fn=collate_fn
    )

    # --- 4. Run Inference ---
    # collate_fn silently drops None samples, so predictions cannot be matched
    # to ids by position alone; pair each prediction with its id via 'row_index'.
    predicted_ids = []
    all_predictions = []
    with torch.no_grad():
        for batch in tqdm(pred_loader, desc="Generating Predictions"):
            if batch is None: continue

            images = batch['image'].to(device)
            texts = batch['text'].to(device)
            numerical = batch['numerical'].to(device)

            pred_log_prices = prediction_model(images, texts, numerical)

            # Convert log prices back to actual prices
            pred_prices = torch.expm1(pred_log_prices).cpu().numpy()
            all_predictions.extend(pred_prices)
            predicted_ids.extend(original_sample_ids[i] for i in batch['row_index'].tolist())

    print("‚úÖ Inference complete.")

    skipped = len(original_sample_ids) - len(predicted_ids)
    if skipped:
        print(f"Warning: {skipped} sample(s) produced no prediction (the dataset returned None for them) and are missing from the output.")

    # --- 5. Format and Save Output ---
    results_df = pd.DataFrame({
        'sample_id': predicted_ids,
        'price': all_predictions
    })

    # Format the price to two decimal places
    results_df['price'] = results_df['price'].round(2)

    results_df.to_csv(output_csv_path, index=False)
    print(f"üéâ Predictions saved successfully to: {output_csv_path}")

    return output_csv_path

# Example usage of the prediction function (do not run this part during training).
# Requires `df` to be loaded already and the model package file to exist;
# predict_from_df raises FileNotFoundError otherwise.
if __name__ == '__main__':
    # ... (after training completes and model is saved) ...
    
    # 1. DataFrame to predict prices for; here the currently loaded `df` is reused.
    # new_data_df = pd.read_csv('path/to/your/new_data.csv')
    new_data_df = df
    
    # 2. Path to the saved model package
    saved_model_path = 'final_clip_price_model.pth'
    
    # 3. Run prediction; the return value is the path of the written CSV.
    prediction_csv_file = predict_from_df(new_data_df, saved_model_path, output_csv_path="my_new_predictions.csv")
    
    print(f"Find your results in: {prediction_csv_file}")

In [13]:
class _IndexedPredictionDataset(Dataset):
    """Wraps a dataset so every sample also carries its positional row index.

    collate_fn silently drops samples whose image could not be loaded, so the
    batches alone do not say which rows were predicted. Carrying the index
    through the DataLoader lets the caller align predictions with sample ids.
    """

    def __init__(self, base_dataset):
        self.base_dataset = base_dataset

    def __len__(self):
        return len(self.base_dataset)

    def __getitem__(self, idx):
        sample = self.base_dataset[idx]
        if sample is None:
            # Keep the "unreadable image" marker so collate_fn can filter it out.
            return None
        indexed_sample = dict(sample)
        indexed_sample['row_index'] = idx
        return indexed_sample


def predict_from_df(df_to_predict, model_path, output_csv_path="predictions.csv"):
    """
    Predicts prices for a given DataFrame using a saved fine-tuned model.
    
    Every row that survives preprocessing appears in the output CSV. Rows whose
    image could not be loaded get an empty (NaN) price instead of breaking the
    alignment between sample ids and predictions.
    
    Side effect: rebinds the module-level ``unit_cols`` to the unit columns stored
    in the model package, because preprocess_dataframe reads that global.
    
    Args:
        df_to_predict (pd.DataFrame): DataFrame with the same structure as the training data.
        model_path (str): Path to the saved model package (.pth file).
        output_csv_path (str): Path to save the final CSV with predictions.
        
    Returns:
        str: The path to the generated predictions CSV file.
        
    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
    """
    print("--- Starting Prediction Process ---")
    
    # --- 1. Load Model and Configuration ---
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at: {model_path}")
        
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    # weights_only=False is needed because the package holds plain Python/NumPy
    # objects next to the tensors. SECURITY: this unpickles arbitrary objects, so
    # only load model packages from a trusted source.
    model_package = torch.load(model_path, map_location=device, weights_only=False)
    
    model_config = model_package['config']
    stats = model_package['stats']
    numerical_cols_loaded = model_package['numerical_cols']
    
    # Initialize the base CLIP model and preprocessor
    clip_model, _, base_preprocess = open_clip.create_model_and_transforms(
        model_config['MODEL_NAME'], pretrained=model_config['PRETRAINED_CHECKPOINT'], device=device
    )
    tokenizer = open_clip.get_tokenizer(model_config['MODEL_NAME'])
    
    # Initialize our custom model architecture and load the saved weights
    NUMERICAL_FEATURE_SIZE = len(numerical_cols_loaded)
    prediction_model = CLIPPricePredictor(clip_model, numerical_feature_size=NUMERICAL_FEATURE_SIZE).to(device)
    prediction_model.load_state_dict(model_package['model_state_dict'])
    prediction_model.eval() # Set model to evaluation mode
    
    print("‚úÖ Model and configuration loaded.")

    # --- 2. Preprocess the Input DataFrame ---
    # Ensure the global 'unit_cols' from training is used for consistent one-hot encoding
    global unit_cols
    unit_cols = [col for col in numerical_cols_loaded if col.startswith('unit_')]
    
    df_pred_processed = preprocess_dataframe(df_to_predict)
    
    # Filter out any rows that have missing essential data after preprocessing
    df_pred_processed.dropna(subset=['description', 'values'], inplace=True)
    df_pred_processed = df_pred_processed[df_pred_processed['description'] != '']
    
    # Store sample_ids in positional order; predictions are written back by position
    original_sample_ids = df_pred_processed['sample_id'].tolist()
    
    print(f"‚úÖ Input data preprocessed. Predicting on {len(df_pred_processed)} samples.")

    # --- 3. Create Dataset and DataLoader ---
    pred_dataset = _IndexedPredictionDataset(
        ProductPriceDataset(
            df_pred_processed, 
            base_preprocess, 
            tokenizer, 
            numerical_cols_loaded, 
            stats,
            is_predict=True # Set flag to not look for 'price' column
        )
    )
    
    pred_loader = DataLoader(
        pred_dataset, 
        batch_size=Config.BATCH_SIZE * 2, # Use a larger batch size for faster inference
        shuffle=False, 
        num_workers=Config.NUM_WORKERS, 
        collate_fn=collate_fn
    )

    # --- 4. Run Inference ---
    # One slot per preprocessed row; slots stay NaN for rows that never reach the model.
    all_predictions = np.full(len(original_sample_ids), np.nan, dtype=np.float64)
    was_predicted = np.zeros(len(original_sample_ids), dtype=bool)
    with torch.no_grad():
        for batch in tqdm(pred_loader, desc="Generating Predictions"):
            if batch is None: continue
            
            images = batch['image'].to(device)
            texts = batch['text'].to(device)
            numerical = batch['numerical'].to(device)
            
            pred_log_prices = prediction_model(images, texts, numerical)
            
            # Convert log prices back to actual prices
            pred_prices = torch.expm1(pred_log_prices).cpu().numpy().reshape(-1)
            row_indices = batch['row_index'].cpu().numpy()
            all_predictions[row_indices] = pred_prices
            was_predicted[row_indices] = True
    
    print("‚úÖ Inference complete.")

    skipped_count = int((~was_predicted).sum())
    if skipped_count:
        print(f"Warning: {skipped_count} sample(s) had an unreadable image and received no prediction.")

    # --- 5. Format and Save Output ---
    results_df = pd.DataFrame({
        'sample_id': original_sample_ids,
        'price': all_predictions
    })
    
    # Format the price to two decimal places
    results_df['price'] = results_df['price'].round(2)
    
    results_df.to_csv(output_csv_path, index=False)
    print(f"üéâ Predictions saved successfully to: {output_csv_path}")
    
    return output_csv_path

In [15]:
class CLIPPricePredictor(nn.Module):
    """Price regressor built on CLIP image/text embeddings plus numerical features.

    The CLIP encoders are evaluated without gradient tracking; only the small
    regression head on top of the concatenated features receives gradients.
    """

    def __init__(self, clip_model, numerical_feature_size, dropout_rate=0.4):
        super().__init__()
        self.clip = clip_model

        # Width of one CLIP embedding, read from the text projection matrix.
        embed_width = self.clip.text_projection.shape[1]
        # Image embedding + text embedding + numerical feature vector.
        head_input_width = 2 * embed_width + numerical_feature_size

        head_layers = [
            nn.Linear(head_input_width, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 1),
        ]
        self.regression_head = nn.Sequential(*head_layers)

    @staticmethod
    def _unit_length(features):
        """Scale each row of `features` by its L2 norm along the last dimension."""
        return features / features.norm(dim=-1, keepdim=True)

    def forward(self, image, text, numerical_feats):
        """Return one predicted value per sample (trailing size-1 dimension removed)."""
        with torch.no_grad():
            image_emb = self.clip.encode_image(image)
            text_emb = self.clip.encode_text(text)

        fused = torch.cat(
            [self._unit_length(image_emb), self._unit_length(text_emb), numerical_feats],
            dim=1,
        )
        prediction = self.regression_head(fused)
        return prediction.squeeze(-1)

class EarlyStopper:
    """Tracks the best validation loss and signals when training should stop.

    A new best loss resets the patience counter and writes the model weights to
    'best_model_checkpoint.pth'. A loss worse than the best by more than
    `min_delta` uses up one unit of patience.
    """

    def __init__(self, patience=5, min_delta=0):
        self.patience, self.min_delta = patience, min_delta
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss, model):
        """Record `validation_loss`; return True once patience is exhausted."""
        if validation_loss < self.min_validation_loss:
            # New best: remember it, reset patience, and checkpoint the weights.
            self.min_validation_loss = validation_loss
            self.counter = 0
            print("‚úÖ Validation loss decreased, saving model checkpoint.")
            torch.save(model.state_dict(), 'best_model_checkpoint.pth')
            return False

        tolerated_loss = self.min_validation_loss + self.min_delta
        if validation_loss > tolerated_loss:
            self.counter += 1
            out_of_patience = self.counter >= self.patience
            if out_of_patience:
                print("üõë Early stopping triggered!")
            return out_of_patience

        # Not a new best, but still within min_delta of it: counter is unchanged.
        return False

In [None]:
# Usage example: run the saved model over the DataFrame currently bound to `df`.
# NOTE(review): inside a notebook kernel __name__ is '__main__', so this guard does not
# stop the cell from executing.
if __name__ == '__main__':
    # ... (after training completes and model is saved) ...
    
    # 1. Load a new DataFrame you want to predict prices for
    # new_data_df = pd.read_csv('path/to/your/new_data.csv')
    new_data_df = df
    # 2. Define the path to your saved model
    saved_model_path = 'final_clip_price_model.pth'
    
    # 3. Call the prediction function
    # predict_from_df writes the CSV and returns the path it wrote to.
    prediction_csv_file = predict_from_df(new_data_df, saved_model_path, output_csv_path="my_new_predictions.csv")
    
    print(f"Find your results in: {prediction_csv_file}")


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   sample_id              75000 non-null  int64 
 1   catalog_content        75000 non-null  object
 2   product_description    32710 non-null  object
 3   values_only            73991 non-null  object
 4   units_only             73988 non-null  object
 5   product_attributes     39304 non-null  object
 6   pack_of                19651 non-null  object
 7   Item_name              75000 non-null  object
 8   ingredients_only       7372 non-null   object
 9   falvors_only           36372 non-null  object
 10  country_origin         2681 non-null   object
 11  benefits               60724 non-null  object
 12  brand_name             74136 non-null  object
 13  image_link             75000 non-null  object
 14  downloaded_image_name  75000 non-null  object
dtypes: int64(1), object

In [1]:
import pandas as pd

In [2]:
df = pd.read_excel("test_final.xlsx")

In [3]:
df.shape

(75000, 15)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   sample_id              75000 non-null  int64 
 1   catalog_content        75000 non-null  object
 2   product_description    32710 non-null  object
 3   values_only            73991 non-null  object
 4   units_only             73988 non-null  object
 5   product_attributes     39304 non-null  object
 6   pack_of                19651 non-null  object
 7   Item_name              75000 non-null  object
 8   ingredients_only       7372 non-null   object
 9   falvors_only           36372 non-null  object
 10  country_origin         2681 non-null   object
 11  benefits               60724 non-null  object
 12  brand_name             74136 non-null  object
 13  image_link             75000 non-null  object
 14  downloaded_image_name  75000 non-null  object
dtypes: int64(1), object

In [5]:
# Rename specific columns and correct the typo.
# Mutates the global `df` in place; later cells see the new column names.
df.rename(columns={
    'Item_name': 'item_name',
    'falvors_only': 'flavors_only', # Correcting the typo
    'brand_name': 'brand_name' # Maps the name to itself, so this entry changes nothing
}, inplace=True)

# Display the new column names
print(df.columns)

Index(['sample_id', 'catalog_content', 'product_description', 'values_only',
       'units_only', 'product_attributes', 'pack_of', 'item_name',
       'ingredients_only', 'flavors_only', 'country_origin', 'benefits',
       'brand_name', 'image_link', 'downloaded_image_name'],
      dtype='object')


In [6]:
# Rename 'values_only' to 'values', the column name preprocess_dataframe reads.
# Mutates the global `df` in place.
df.rename(columns={
    'values_only': 'values'
}, inplace=True)

In [27]:
# Map the test-file column names onto the names used by the training pipeline.
# NOTE(review): this cell's execution count (27) is out of sequence with its neighbours.
# If the two rename cells above have already run, 'values_only', 'Item_name' and
# 'falvors_only' no longer exist, and only 'product_description' is renamed here
# (presumably because rename skips missing keys by default - confirm).
rename_dict = {
    'product_description': 'descriptions',
    'values_only': 'values',
    # 'units_only': 'units', # Not in the target list, so we'll drop or ignore
    # 'product_attributes': 'attributes', # Not in the target list
    'Item_name': 'item_name', # Correcting case
    # 'ingredients_only': 'ingredients', # Not in the target list
    'falvors_only': 'flavors', # Correcting typo and shortening, though 'flavors' is not in target list
    # 'country_origin': 'country', # Not in the target list
}

# Mutates the global `df` in place.
df.rename(columns=rename_dict, inplace=True)

In [7]:
df.to_csv('output.csv', index=False)

In [None]:
# --- 1. Import Libraries ---
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.model_selection import train_test_split
from PIL import Image
import pandas as pd
import numpy as np
import open_clip
import os
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from torchvision import transforms
import warnings
import json # Added for saving stats

# Suppress annoying but harmless warnings
warnings.filterwarnings("ignore", "Passing `palette` without `max_colors`")

# --- 2. Configuration & Setup ---
class Config:
    """A centralized class for all hyperparameters and settings."""
    # Use the GPU when CUDA is available, otherwise fall back to the CPU.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    # open_clip architecture name and pretrained weight tag.
    MODEL_NAME = 'ViT-B-32'
    PRETRAINED_CHECKPOINT = 'laion2b_s34b_b79k'
    
    # Adjusted for a larger dataset
    BATCH_SIZE = 128 # Can be increased based on GPU VRAM
    # NOTE(review): with EPOCHS = 2 and PATIENCE = 3 the early stopper can never trigger,
    # because it needs PATIENCE non-improving epochs.
    EPOCHS = 2
    NUM_WORKERS = 0 # 0 loads data in the main process; raise it to load with worker processes

    # Differential Learning Rates for fine-tuning
    CLIP_LR = 1e-6
    HEAD_LR = 1e-4
    WEIGHT_DECAY = 0.01

    # Early Stopping parameters
    PATIENCE = 3
    MIN_DELTA = 0.001

    # Paths (⚠️ IMPORTANT: Update this path if needed)
    # NOTE(review): both paths are commented out, but preprocess_dataframe reads
    # Config.IMAGE_DIR, so it raises AttributeError unless IMAGE_DIR is set elsewhere.
    # IMAGE_DIR = 'all_images_training_75k_new'
    # MODEL_SAVE_PATH = 'final_clip_price_model.pth' # Path for the final model

# print(f"Using device: {Config.DEVICE}")
# if not os.path.exists(Config.IMAGE_DIR):
#     raise FileNotFoundError(f"Image directory not found: {Config.IMAGE_DIR}. Please check the path.")

# --- 3. Data Loading and Enhanced Feature Engineering ---

# ⚠️ ===================================================================
# ⚠️ IMPORTANT: Load your actual DataFrame 'df' here.
# ⚠️ Example: df = pd.read_csv('path/to/your/data.csv')
# ⚠️ ===================================================================
# This block is a placeholder. Ensure your 'df' is loaded before this script runs.
# No dummy DataFrame is created here.
# In your environment, you should already have 'df' loaded from your file.
# if 'df' not in locals():
#      raise NameError("DataFrame 'df' is not defined. Please load your data where indicated.")

# Add this line to see all your available column names
# print(f"Available columns in your DataFrame: {df.columns.tolist()}")


def create_comprehensive_description(row):
    """Build one text description for CLIP from a row's available text fields.

    Fields are added in a fixed order (brand, item name, description, optional
    attribute columns, benefits), skipping any that are absent or null. The
    pieces are joined with spaces, doubled spaces are collapsed once, and the
    result is stripped.
    """
    def has_value(column):
        # Absent columns come back as None from .get(), which pd.notna rejects.
        return pd.notna(row.get(column))

    fragments = []
    if has_value('brand_name'):
        fragments.append(str(row['brand_name']))
    if has_value('item_name'):
        fragments.append(str(row['item_name']))
    if has_value('descriptions'):
        fragments.append(f"Description: {row['descriptions']}")

    # Optional attribute columns; any that are absent from the row are skipped.
    extra_features = []
    for col in ('flavours_only', 'sizes_only', 'product_attributes'):
        if col in row and pd.notna(row[col]):
            extra_features.append(str(row[col]))
    if extra_features:
        fragments.append(f"Features: {', '.join(extra_features)}.")

    if has_value('benefits'):
        fragments.append(f"Benefits: {row['benefits']}")

    joined = " ".join(fragments)
    return joined.replace("  ", " ").strip()

def preprocess_dataframe(df_in):
    """Apply all preprocessing steps to a copy of `df_in` and return the copy.

    Steps: strip column-name whitespace, coerce 'values' and 'pack_of' to
    numbers, one-hot encode 'units' (when present), build the combined text
    'description', and derive 'image_path' from 'downloaded_image_name'.

    Relies on module-level state: reads Config.IMAGE_DIR, and, when a global
    `unit_cols` exists, adds any of those one-hot columns that are missing so
    the frame matches the training layout.

    Args:
        df_in (pd.DataFrame): Raw frame with at least 'values', 'pack_of' and
            'downloaded_image_name' columns.

    Returns:
        pd.DataFrame: The processed copy; `df_in` itself is not modified.
    """
    df_processed = df_in.copy()
    
    # Strip leading/trailing whitespace from all column names
    df_processed.columns = df_processed.columns.str.strip()
    
    # Convert 'values' from object to a numeric type.
    df_processed['values'] = pd.to_numeric(df_processed['values'], errors='coerce')

    # 'pack_of' can arrive as an object column, so coerce it the same way as 'values'
    # instead of casting directly (a non-numeric entry would make astype raise).
    # Missing or unparseable entries are treated as a single item.
    df_processed['pack_of'] = (
        pd.to_numeric(df_processed['pack_of'], errors='coerce').fillna(1.0).astype(float)
    )
    
    # One-hot encode 'units' if the column exists
    if 'units' in df_processed.columns:
        df_processed['units'] = df_processed['units'].fillna('unknown').astype(str)
        # Use reindex to ensure prediction DF has same unit columns as training DF
        if 'unit_cols' in globals():
             dummies = pd.get_dummies(df_processed['units'], prefix='unit', dtype=float)
             df_processed = pd.concat([df_processed.drop('units', axis=1), dummies], axis=1)
             # Ensure all columns from training are present, fill missing with 0
             for col in unit_cols:
                 if col not in df_processed.columns:
                     df_processed[col] = 0
        else: # First run (training)
             df_processed = pd.get_dummies(df_processed, columns=['units'], prefix='unit', dtype=float)
            
    # Apply text feature engineering
    df_processed['description'] = df_processed.apply(create_comprehensive_description, axis=1)
    df_processed['image_path'] = df_processed['downloaded_image_name'].apply(lambda x: os.path.join(Config.IMAGE_DIR, str(x)))
    
    return df_processed

# --- Preprocessing ---
# Runs at cell level on the global `df` and defines the globals used further down
# (df_preprocessed, unit_cols, numerical_cols, feature_cols, df_processed, train_df, test_df).
df_preprocessed = preprocess_dataframe(df)

VALUE_COL = 'values'
# NOTE(review): preprocess_dataframe already indexes df['values'], so a missing column
# fails inside that call before this check is reached.
if VALUE_COL not in df_preprocessed.columns:
    raise KeyError(f"Column '{VALUE_COL}' not found in DataFrame. Please check the column name.")

# Define columns to be used
# Defining `unit_cols` here also switches preprocess_dataframe to its "align with
# training columns" branch on any later call.
unit_cols = [col for col in df_preprocessed.columns if col.startswith('unit_')]
# Order matters: ProductPriceDataset takes numerical_cols[0] as the value column.
numerical_cols = [VALUE_COL, 'pack_of'] + unit_cols 
# 'price' is required here, so `df` must be a labelled (training) frame.
feature_cols = ['description', 'price', 'downloaded_image_name', 'image_path', 'sample_id'] + numerical_cols

# Filter for necessary columns and drop rows with essential missing data
df_processed = df_preprocessed[feature_cols].copy()
df_processed.dropna(subset=['price', 'description', VALUE_COL], inplace=True)
df_processed = df_processed[df_processed['description'] != '']

print(f"‚úÖ Created features. Total samples: {len(df_processed)}. Numerical/Categorical features: {numerical_cols}")

# 60/40 split with a fixed seed so the partition is reproducible.
train_df, test_df = train_test_split(df_processed, test_size=0.4, random_state=42)
print(f"Data split into {len(train_df)} training and {len(test_df)} testing samples.")


# --- 4. Dataset Class and DataLoaders ---

class ProductPriceDataset(Dataset):
    """Custom PyTorch Dataset for loading product images, text, and numerical features.

    Args:
        df (pd.DataFrame): Frame with 'image_path', 'description', the columns in
            `numerical_cols`, and (unless `is_predict`) 'price'. It is copied.
        image_transform (callable): Applied to the RGB PIL image.
        tokenizer (callable): Applied to the description; element 0 of its result is used.
        numerical_cols (list): Value column first, then 'pack_of', then 'unit_*' columns.
        stats (dict): {column: {'mean': ..., 'std': ...}} for the value column and 'pack_of'.
        is_predict (bool): When True, samples carry no 'log_price' target.
    """
    def __init__(self, df, image_transform, tokenizer, numerical_cols, stats, is_predict=False):
        self.df = df.copy() 
        self.image_transform = image_transform
        self.tokenizer = tokenizer
        self.numerical_cols = numerical_cols
        self.stats = stats
        self.is_predict = is_predict
        self.value_col_name = numerical_cols[0] 
        # Resolved once here instead of on every __getitem__ call.
        self.unit_col_names = [c for c in numerical_cols if c.startswith('unit_')]

        # Pre-calculate normalized values for efficiency
        for col in [self.value_col_name, 'pack_of']:
            mean = self.stats[col]['mean']
            std = self.stats[col]['std']
            # The small epsilon avoids division by zero for a constant column.
            self.df[f'{col}_normalized'] = (self.df[col] - mean) / (std + 1e-6)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample dict for row `idx`, or None if its image cannot be loaded."""
        row = self.df.iloc[idx]
        try:
            # The context manager closes the file handle even if decoding fails midway.
            with Image.open(row['image_path']) as raw_image:
                image = self.image_transform(raw_image.convert('RGB'))
        except (FileNotFoundError, UnboundLocalError, OSError):
            # Missing/unreadable image: collate_fn drops None samples from the batch.
            return None

        text = self.tokenizer(row['description'])[0]
        
        norm_value = torch.tensor(row[f'{self.value_col_name}_normalized'], dtype=torch.float32)
        norm_pack_of = torch.tensor(row['pack_of_normalized'], dtype=torch.float32)
        unit_features = torch.tensor(row[self.unit_col_names].values.astype(np.float32))
        # Layout: [normalized value, normalized pack_of, one-hot unit columns...]
        numerical_feats = torch.cat([norm_value.unsqueeze(0), norm_pack_of.unsqueeze(0), unit_features])

        if self.is_predict:
            return {'image': image, 'text': text, 'numerical': numerical_feats}
        else:
            # The model is trained on log1p(price); predictions are inverted with expm1.
            log_price = torch.tensor(np.log1p(row['price']), dtype=torch.float32)
            return {'image': image, 'text': text, 'numerical': numerical_feats, 'log_price': log_price}

def collate_fn(batch):
    """Collate a batch after discarding None samples; return None if nothing is left."""
    usable_samples = [sample for sample in batch if sample is not None]
    if not usable_samples:
        return None
    return torch.utils.data.dataloader.default_collate(usable_samples)


# --- 5. Model, Training, and Evaluation ---

class CLIPPricePredictor(nn.Module):
    """Price regressor built on CLIP image/text embeddings plus numerical features.

    The CLIP encoders are evaluated without gradient tracking; only the small
    regression head on top of the concatenated features receives gradients.
    """

    def __init__(self, clip_model, numerical_feature_size, dropout_rate=0.4):
        super().__init__()
        self.clip = clip_model

        # Width of one CLIP embedding, read from the text projection matrix.
        embed_width = self.clip.text_projection.shape[1]
        # Image embedding + text embedding + numerical feature vector.
        head_input_width = 2 * embed_width + numerical_feature_size

        head_layers = [
            nn.Linear(head_input_width, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(512, 1),
        ]
        self.regression_head = nn.Sequential(*head_layers)

    @staticmethod
    def _unit_length(features):
        """Scale each row of `features` by its L2 norm along the last dimension."""
        return features / features.norm(dim=-1, keepdim=True)

    def forward(self, image, text, numerical_feats):
        """Return one predicted value per sample (trailing size-1 dimension removed)."""
        with torch.no_grad():
            image_emb = self.clip.encode_image(image)
            text_emb = self.clip.encode_text(text)

        fused = torch.cat(
            [self._unit_length(image_emb), self._unit_length(text_emb), numerical_feats],
            dim=1,
        )
        prediction = self.regression_head(fused)
        return prediction.squeeze(-1)

class EarlyStopper:
    """Tracks the best validation loss and signals when training should stop.

    A new best loss resets the patience counter and writes the model weights to
    'best_model_checkpoint.pth'. A loss worse than the best by more than
    `min_delta` uses up one unit of patience.
    """

    def __init__(self, patience=5, min_delta=0):
        self.patience, self.min_delta = patience, min_delta
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss, model):
        """Record `validation_loss`; return True once patience is exhausted."""
        if validation_loss < self.min_validation_loss:
            # New best: remember it, reset patience, and checkpoint the weights.
            self.min_validation_loss = validation_loss
            self.counter = 0
            print("‚úÖ Validation loss decreased, saving model checkpoint.")
            torch.save(model.state_dict(), 'best_model_checkpoint.pth')
            return False

        tolerated_loss = self.min_validation_loss + self.min_delta
        if validation_loss > tolerated_loss:
            self.counter += 1
            out_of_patience = self.counter >= self.patience
            if out_of_patience:
                print("üõë Early stopping triggered!")
            return out_of_patience

        # Not a new best, but still within min_delta of it: counter is unchanged.
        return False

def calculate_smape(preds, targets):
    """Return the Symmetric Mean Absolute Percentage Error, in percent, as a float."""
    abs_error = (preds - targets).abs()
    mean_magnitude = (targets.abs() + preds.abs()) / 2
    # The small epsilon keeps the ratio finite when both values are zero.
    per_sample_ratio = abs_error / (mean_magnitude + 1e-8)
    smape_percent = per_sample_ratio.mean() * 100
    return smape_percent.item()

def train_and_evaluate(model, train_loader, val_loader, optimizer, scheduler, loss_fn, config):
    """The main training and validation loop.

    Each epoch trains on `train_loader`, evaluates on `val_loader`, steps the
    scheduler, and consults an EarlyStopper, which checkpoints the weights
    whenever the validation loss reaches a new best.

    Batches that collate to None (every image unreadable) are skipped and are
    not counted when averaging, so the reported losses are per processed batch.

    Args:
        model (nn.Module): Called as model(images, texts, numerical).
        train_loader, val_loader: Iterables of batch dicts with keys 'image',
            'text', 'numerical' and 'log_price' (or None for an empty batch).
        optimizer, scheduler: Optimizer and LR scheduler; the scheduler steps once per epoch.
        loss_fn (callable): Loss on (predicted log price, true log price).
        config: Object providing EPOCHS, DEVICE, PATIENCE and MIN_DELTA.

    Returns:
        tuple: (history, model) where history has per-epoch 'train_loss',
        'val_loss' and 'val_smape' lists, and model holds the best checkpointed
        weights when a checkpoint was written during this run.
    """
    history = {'train_loss': [], 'val_loss': [], 'val_smape': []}
    early_stopper = EarlyStopper(patience=config.PATIENCE, min_delta=config.MIN_DELTA)

    for epoch in range(config.EPOCHS):
        model.train()
        total_train_loss = 0.0
        train_batches = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.EPOCHS} [Training]")

        for batch in progress_bar:
            if batch is None: continue

            images = batch['image'].to(config.DEVICE)
            texts = batch['text'].to(config.DEVICE)
            numerical = batch['numerical'].to(config.DEVICE)
            log_prices = batch['log_price'].to(config.DEVICE)

            optimizer.zero_grad()
            pred_log_prices = model(images, texts, numerical)
            loss = loss_fn(pred_log_prices, log_prices)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            train_batches += 1
            progress_bar.set_postfix({'log_huber_loss': f'{loss.item():.4f}'})

        # Average over batches that actually ran; NaN marks an epoch with none.
        avg_train_loss = total_train_loss / train_batches if train_batches else float('nan')
        history['train_loss'].append(avg_train_loss)

        model.eval()
        total_val_loss, total_val_smape = 0.0, 0.0
        val_batches = 0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch+1}/{config.EPOCHS} [Validation]"):
                if batch is None: continue

                images = batch['image'].to(config.DEVICE)
                texts = batch['text'].to(config.DEVICE)
                numerical = batch['numerical'].to(config.DEVICE)
                log_prices = batch['log_price'].to(config.DEVICE)

                pred_log_prices = model(images, texts, numerical)
                total_val_loss += loss_fn(pred_log_prices, log_prices).item()

                # SMAPE is computed on real prices, so undo the log1p transform first.
                true_prices = torch.expm1(log_prices)
                pred_prices = torch.expm1(pred_log_prices)
                total_val_smape += calculate_smape(pred_prices, true_prices)
                val_batches += 1

        avg_val_loss = total_val_loss / val_batches if val_batches else float('nan')
        avg_val_smape = total_val_smape / val_batches if val_batches else float('nan')
        history['val_loss'].append(avg_val_loss)
        history['val_smape'].append(avg_val_smape)

        print(f"Epoch {epoch+1}/{config.EPOCHS} -> Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val SMAPE: {avg_val_smape:.2f}% | LR: {scheduler.get_last_lr()[0]:.2e}")

        scheduler.step()
        if early_stopper.early_stop(avg_val_loss, model):
            break

    # The stopper only writes a checkpoint after seeing a finite best loss. Without one,
    # loading the file would either fail or restore a stale checkpoint from an earlier run.
    if early_stopper.min_validation_loss < np.inf:
        print("Loading best model state from training checkpoint.")
        model.load_state_dict(torch.load('best_model_checkpoint.pth', map_location=config.DEVICE))
    else:
        print("No checkpoint was saved during this run; returning the model's current weights.")
    return history, model


# --- 6. Main Execution Block ---
# if __name__ == '__main__':
#     # Initialize CLIP model, tokenizer, and image transformations
#     clip_model, _, base_preprocess = open_clip.create_model_and_transforms(
#         Config.MODEL_NAME, pretrained=Config.PRETRAINED_CHECKPOINT, device=Config.DEVICE
#     )
#     tokenizer = open_clip.get_tokenizer(Config.MODEL_NAME)

#     # Add data augmentation for the training set
#     train_preprocess = transforms.Compose([
#         transforms.RandomResizedCrop(224, scale=(0.8, 1.0), ratio=(0.75, 1.33)),
#         transforms.RandomHorizontalFlip(p=0.5),
#         transforms.ColorJitter(brightness=0.2, contrast=0.2),
#         base_preprocess,
#     ])

#     # Calculate normalization stats ONLY from the training data
#     stats = {
#         VALUE_COL: {'mean': train_df[VALUE_COL].mean(), 'std': train_df[VALUE_COL].std()},
#         'pack_of': {'mean': train_df['pack_of'].mean(), 'std': train_df['pack_of'].std()}
#     }

#     train_dataset = ProductPriceDataset(train_df, train_preprocess, tokenizer, numerical_cols, stats)
#     test_dataset = ProductPriceDataset(test_df, base_preprocess, tokenizer, numerical_cols, stats)

#     train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=collate_fn)
#     test_loader = DataLoader(test_dataset, batch_size=Config.BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=collate_fn)

#     # Pass the correct number of numerical features to the model
#     NUMERICAL_FEATURE_SIZE = len(numerical_cols)
#     model = CLIPPricePredictor(clip_model, numerical_feature_size=NUMERICAL_FEATURE_SIZE).to(Config.DEVICE)
    
#     for param in model.clip.parameters():
#         param.requires_grad = False

#     optimizer = torch.optim.AdamW([
#         {'params': model.clip.parameters(), 'lr': Config.CLIP_LR},
#         {'params': model.regression_head.parameters(), 'lr': Config.HEAD_LR}
#     ], weight_decay=Config.WEIGHT_DECAY)

#     loss_fn = nn.HuberLoss()
#     scheduler = CosineAnnealingLR(optimizer, T_max=Config.EPOCHS, eta_min=1e-8)

#     # Run training
#     training_history, best_model = train_and_evaluate(model, train_loader, test_loader, optimizer, scheduler, loss_fn, Config)

#     # --- 7. Save Final Model and Plot Results ---
#     print("\n--- Training Finished ---")
#     save_finetuned_model(best_model, stats, numerical_cols, Config.MODEL_SAVE_PATH)
    
#     fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
#     epochs_ran = len(training_history['train_loss'])

#     ax1.plot(range(epochs_ran), training_history['train_loss'], label='Training Loss (Huber)', marker='o')
#     ax1.plot(range(epochs_ran), training_history['val_loss'], label='Validation Loss (Huber)', marker='o')
#     ax1.set_title('Training and Validation Loss', fontsize=14); ax1.set_xlabel('Epochs'); ax1.set_ylabel('Loss'); ax1.legend(); ax1.grid(True)

#     ax2.plot(range(epochs_ran), training_history['val_smape'], label='Validation SMAPE', color='orange', marker='o')
#     ax2.set_title('Validation SMAPE', fontsize=14); ax2.set_xlabel('Epochs'); ax2.set_ylabel('SMAPE (%)'); ax2.legend(); ax2.grid(True)

#     plt.tight_layout()
#     plt.show()

# =========================================================================================
# --- 🚀 NEW: Additional Functions for Saving and Prediction 🚀 ---
# =========================================================================================

def save_finetuned_model(model, stats, numerical_cols, save_path):
    """
    Bundle everything inference needs into one file and write it with torch.save.
    
    The bundle holds the model weights, the normalization statistics, the ordered
    numerical column names, and the CLIP model name/checkpoint taken from Config.
    
    Args:
        model (nn.Module): The trained model object.
        stats (dict): Dictionary containing mean/std for numerical columns.
        numerical_cols (list): List of numerical column names used for training.
        save_path (str): Path to save the model file.
    """
    weights = model.state_dict()
    clip_settings = {
        'MODEL_NAME': Config.MODEL_NAME,
        'PRETRAINED_CHECKPOINT': Config.PRETRAINED_CHECKPOINT
    }
    bundle = dict(
        model_state_dict=weights,
        stats=stats,
        numerical_cols=numerical_cols,
        config=clip_settings,
    )
    torch.save(bundle, save_path)
    print(f"‚úÖ Model package saved successfully to: {save_path}")

def predict_from_df(df_to_predict, model_path, output_csv_path="predictions.csv"):
    """
    Predicts prices for a given DataFrame using a saved fine-tuned model.

    The function loads the package written by ``save_finetuned_model``, rebuilds
    the CLIP backbone and ``CLIPPricePredictor`` head from it, preprocesses the
    input with ``preprocess_dataframe``, runs batched inference, converts the
    model output back to prices with ``expm1`` and writes a two-column CSV
    (``sample_id``, ``price``).

    Side effects:
        * Reassigns the module-level ``unit_cols`` to the ``unit_*`` columns stored
          in the model package (presumably read by ``preprocess_dataframe`` -
          confirm against its definition).
        * Writes ``output_csv_path``.

    Args:
        df_to_predict (pd.DataFrame): DataFrame with the same structure as the training data.
        model_path (str): Path to the saved model package (.pth file).
        output_csv_path (str): Path to save the final CSV with predictions.

    Returns:
        str: The path to the generated predictions CSV file.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
        ValueError: If the number of predictions produced for a batch (or overall)
            does not match the number of sample ids, i.e. predictions can no
            longer be attributed to rows reliably.
    """
    # Declared up front; the value is assigned in step 2 below.
    global unit_cols

    print("--- Starting Prediction Process ---")

    # --- 1. Load Model and Configuration ---
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at: {model_path}")

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # SECURITY NOTE: weights_only=False unpickles arbitrary Python objects, which is
    # required here because the package holds plain dicts/lists next to the weights.
    # Only load model files that come from a trusted source.
    model_package = torch.load(model_path, map_location=device, weights_only=False)

    model_config = model_package['config']
    stats = model_package['stats']
    numerical_cols_loaded = model_package['numerical_cols']

    # Initialize the base CLIP model and preprocessor
    clip_model, _, base_preprocess = open_clip.create_model_and_transforms(
        model_config['MODEL_NAME'], pretrained=model_config['PRETRAINED_CHECKPOINT'], device=device
    )
    tokenizer = open_clip.get_tokenizer(model_config['MODEL_NAME'])

    # Initialize our custom model architecture and load the saved weights
    NUMERICAL_FEATURE_SIZE = len(numerical_cols_loaded)
    prediction_model = CLIPPricePredictor(clip_model, numerical_feature_size=NUMERICAL_FEATURE_SIZE).to(device)
    prediction_model.load_state_dict(model_package['model_state_dict'])
    prediction_model.eval()  # Set model to evaluation mode

    print("‚úÖ Model and configuration loaded.")

    # --- 2. Preprocess the Input DataFrame ---
    # Restore the training-time unit columns so one-hot encoding stays consistent.
    unit_cols = [col for col in numerical_cols_loaded if col.startswith('unit_')]

    df_pred_processed = preprocess_dataframe(df_to_predict)

    # Filter out rows missing essential data. Done without inplace=True so that a
    # frame shared with the caller is never mutated here, and re-indexed so that
    # positional and label-based indexing agree after the filtering.
    df_pred_processed = df_pred_processed.dropna(subset=['description', 'values'])
    df_pred_processed = df_pred_processed[df_pred_processed['description'] != '']
    df_pred_processed = df_pred_processed.reset_index(drop=True)

    # Store sample_ids in row order; batches are attributed to them positionally.
    original_sample_ids = df_pred_processed['sample_id'].tolist()

    print(f"‚úÖ Input data preprocessed. Predicting on {len(df_pred_processed)} samples.")

    # --- 3. Create Dataset and DataLoader ---
    pred_dataset = ProductPriceDataset(
        df_pred_processed,
        base_preprocess,
        tokenizer,
        numerical_cols_loaded,
        stats,
        is_predict=True  # Set flag to not look for 'price' column
    )

    # Larger batch than training for faster inference.
    inference_batch_size = Config.BATCH_SIZE * 2
    pred_loader = DataLoader(
        pred_dataset,
        batch_size=inference_batch_size,
        shuffle=False,  # sequential order is what makes positional id attribution valid
        num_workers=Config.NUM_WORKERS,
        collate_fn=collate_fn
    )

    # --- 4. Run Inference ---
    kept_sample_ids = []
    all_predictions = []
    skipped_samples = 0
    cursor = 0  # position in original_sample_ids of the first row of the current batch
    with torch.no_grad():
        for batch in tqdm(pred_loader, desc="Generating Predictions"):
            # With shuffle=False the loader walks the dataset in order, so this batch
            # covers the next `inference_batch_size` rows (fewer for the last batch).
            batch_ids = original_sample_ids[cursor:cursor + inference_batch_size]
            cursor += len(batch_ids)

            if batch is None:
                # Whole batch unavailable: leave its ids out instead of letting the
                # id and prediction lists drift apart.
                skipped_samples += len(batch_ids)
                continue

            images = batch['image'].to(device)
            texts = batch['text'].to(device)
            numerical = batch['numerical'].to(device)

            pred_log_prices = prediction_model(images, texts, numerical)

            # Convert log prices back to actual prices. reshape(-1) yields one scalar
            # per sample whether the model returns shape (B,), (B, 1) or a 0-d tensor.
            pred_prices = torch.expm1(pred_log_prices).reshape(-1).cpu().numpy()

            if len(pred_prices) != len(batch_ids):
                raise ValueError(
                    f"Batch produced {len(pred_prices)} predictions for {len(batch_ids)} samples; "
                    "predictions cannot be matched to sample_ids."
                )

            kept_sample_ids.extend(batch_ids)
            all_predictions.extend(pred_prices)

    if cursor != len(original_sample_ids):
        raise ValueError(
            f"DataLoader covered {cursor} samples but {len(original_sample_ids)} were expected; "
            "predictions cannot be matched to sample_ids."
        )

    if skipped_samples:
        print(f"Warning: {skipped_samples} samples were skipped because their batch was None.")

    print("‚úÖ Inference complete.")

    # --- 5. Format and Save Output ---
    # Explicit float64 keeps the column numeric even when there are no predictions.
    results_df = pd.DataFrame({
        'sample_id': kept_sample_ids,
        'price': pd.Series(all_predictions, dtype="float64")
    })

    # Format the price to two decimal places
    results_df['price'] = results_df['price'].round(2)

    results_df.to_csv(output_csv_path, index=False)
    print(f"üéâ Predictions saved successfully to: {output_csv_path}")

    return output_csv_path

# Example of how you would use the prediction function (do not run this part during training)
"""
if __name__ == '__main__':
    # ... (after training completes and model is saved) ...
    
    # 1. Load a new DataFrame you want to predict prices for
    # new_data_df = pd.read_csv('path/to/your/new_data.csv')
    
    # 2. Define the path to your saved model
    # saved_model_path = 'final_clip_price_model.pth'
    
    # 3. Call the prediction function
    # prediction_csv_file = predict_from_df(new_data_df, saved_model_path, output_csv_path="my_new_predictions.csv")
    
    # print(f"Find your results in: {prediction_csv_file}")
"""

In [None]:
if __name__ == '__main__':
    # Prediction driver: run only once training has finished and the model
    # package has been written to disk.

    # Frame to score. Here it is the module-level `df`; for fresh data, load it
    # instead with e.g. pd.read_csv('path/to/your/new_data.csv').
    new_data_df = df

    # Location of the saved model package.
    saved_model_path = 'final_clip_price_model.pth'

    # Run inference and write the predictions CSV.
    prediction_csv_file = predict_from_df(
        df_to_predict=new_data_df,
        model_path=saved_model_path,
        output_csv_path="my_new_predictions.csv",
    )

    print(f"Find your results in: {prediction_csv_file}")
