In [None]:
import pandas as pd


In [None]:
# Read the training CSV into a DataFrame. The path is absolute and points under /kaggle/input,
# so it needs adjusting when the notebook runs anywhere else.
data = pd.read_csv('/kaggle/input/dataset/student_resource/dataset/train.csv')

In [None]:
import numpy as np

# Work on an independent copy so the frame loaded from disk stays untouched
data_no_outliers = data.copy()

# Attach log(1 + price) as an additional column on the copy
data_no_outliers['price_log'] = np.log1p(data_no_outliers['price'])


In [None]:
import pandas as pd
import re
import string

# Function to clean text
def clean_text(text):
    """Normalise a single catalog text value.

    Values that ``pd.isnull`` reports as missing become an empty string.
    Any other value is lower-cased, stripped of ASCII punctuation, and then
    stripped of every remaining character that is neither a word character
    nor whitespace (emojis and other symbols).
    """
    if pd.isnull(text):
        return ""
    # Lower-case, then delete every character listed in string.punctuation
    without_punct = text.lower().translate(str.maketrans("", "", string.punctuation))
    # Whatever is still not \w or \s (e.g. emojis) is removed as well
    return re.sub(r"[^\w\s]", "", without_punct)

# Clean every entry of the text column element by element (overwrites the column)
data_no_outliers['catalog_content'] = data_no_outliers['catalog_content'].map(clean_text)


In [None]:
# IQR rule: keep only rows whose price lies within 1.5 * IQR of the quartiles
Q1 = data_no_outliers['price'].quantile(0.25)
Q3 = data_no_outliers['price'].quantile(0.75)
IQR = Q3 - Q1

# Define limits
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

# Capture the row count BEFORE filtering; measuring it afterwards would make
# the "Before" and "After" lines below print the same number.
rows_before = len(data_no_outliers)

# Filter data (both limits inclusive)
data_no_outliers = data_no_outliers[data_no_outliers['price'].between(lower_limit, upper_limit)]

print("Before:", rows_before)
print("After removing outliers:", len(data_no_outliers))

In [None]:
!pip install transformers torch scikit-learn

In [None]:
!pip install transformers==4.41.2 -q

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set device
# Use CUDA when PyTorch reports it as available, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# ==================== CONFIG ====================
# Settings read by main(), predict_prices() and continue_training() below.
CONFIG = {
    'bert_model': 'distilbert-base-uncased',  # Model id passed to AutoTokenizer / AutoModel (DistilBERT chosen to avoid tokenizer issues)
    'max_length': 256,  # max_length handed to the tokenizer (pad/truncate target)
    'batch_size': 16,  # DataLoader batch size for training and validation
    'epochs': 2,  # Number of passes of the training loop in main()
    'learning_rate': 2e-5,  # lr given to AdamW
    'dropout': 0.3,  # Dropout probability used in the regression head
    'test_size': 0.2,  # Fraction held out by train_test_split for validation
    'random_state': 42,  # Seed for train_test_split
    'use_log_transform': True  # When True, targets are log1p(price) and metrics are computed after expm1
}

# ==================== METRIC FUNCTIONS ====================
def mean_absolute_percentage_error(y_true, y_pred):
    """Calculate MAPE (in percent) over the samples whose true value is non-zero.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predictions, same length as ``y_true``.

    Returns:
        Mean of ``|y_true - y_pred| / |y_true|`` times 100, computed only on
        entries where ``y_true != 0``. Returns ``nan`` when no such entry
        exists.
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    # A zero target would turn the whole mean into inf/nan, so skip those rows
    # (mirrors the masking done in the SMAPE helper).
    nonzero = y_true != 0
    if not nonzero.any():
        return float('nan')
    return np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100

def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    """Calculate SMAPE (in percent) - the competition metric.

    Each sample contributes ``|pred - true| / ((|true| + |pred|) / 2)``;
    samples whose denominator is zero are left out of the mean.
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    scale = (np.abs(actual) + np.abs(forecast)) / 2
    # Keep only rows with a strictly positive denominator
    usable = scale > 0
    abs_error = np.abs(forecast[usable] - actual[usable])
    return np.mean(abs_error / scale[usable]) * 100

# ==================== DATASET ====================
class ProductDataset(Dataset):
    """Map-style dataset that pairs a tokenized text with its price target.

    ``texts`` and ``prices`` are indexed with the same integer ``idx``;
    ``tokenizer`` is called once per item with ``max_length`` padding and
    truncation and must return tensors under ``'input_ids'`` and
    ``'attention_mask'``.
    """

    def __init__(self, texts, prices, tokenizer, max_length):
        self.texts, self.prices = texts, prices
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts[idx])
        target = self.prices[idx]

        # Encode one text, padded/truncated to exactly max_length tokens
        encoded = self.tokenizer(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Flatten the tokenizer tensors to 1-D so default collation stacks them per batch
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['price'] = torch.tensor(target, dtype=torch.float)
        return sample

# ==================== MODEL ====================
class BERTPricePredictor(nn.Module):
    """Pre-trained transformer encoder followed by an MLP regression head.

    Args:
        bert_model_name: Model id handed to ``AutoModel.from_pretrained``.
        dropout: Dropout probability used between the MLP layers.
    """

    def __init__(self, bert_model_name, dropout=0.3):
        super(BERTPricePredictor, self).__init__()

        # Load pre-trained encoder
        self.bert = AutoModel.from_pretrained(bert_model_name)

        # Width of the encoder's hidden states, read from its config
        bert_hidden_size = self.bert.config.hidden_size

        # MLP for price prediction
        self.regressor = nn.Sequential(
            nn.Linear(bert_hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1)
        )

    def forward(self, input_ids, attention_mask):
        """Return one prediction per input row as a 1-D tensor of length batch."""
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Use the first token's hidden state as the sequence representation
        cls_output = outputs.last_hidden_state[:, 0, :]

        price = self.regressor(cls_output)

        # Drop only the trailing size-1 feature axis. A bare squeeze() would also
        # remove the batch axis when the batch holds a single sample, yielding a
        # 0-d tensor that callers cannot iterate over.
        return price.squeeze(-1)

# ==================== TRAINING FUNCTIONS ====================
def train_epoch(model, dataloader, optimizer, scheduler, device, criterion, use_log):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Yields dict batches with ``'input_ids'``, ``'attention_mask'``
            and ``'price'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch.
        device: Device the batch tensors are moved to.
        criterion: Loss applied to ``(outputs, prices)``.
        use_log: When True, predictions and targets are passed through
            ``np.expm1`` before the reported metrics are computed.

    Returns:
        Tuple ``(avg_loss, rmse, mae, r2, mape, smape)``; ``avg_loss`` is the
        mean per-batch loss, the rest are computed over all samples of the epoch.
    """
    model.train()
    total_loss = 0
    predictions = []
    actuals = []

    progress_bar = tqdm(dataloader, desc='Training')

    for batch in progress_bar:
        # Move to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        prices = batch['price'].to(device)

        # Forward pass
        optimizer.zero_grad()
        # Force a 1-D output: a model that squeezes a single-sample batch down to
        # a 0-d tensor would otherwise broadcast in the loss and break extend().
        outputs = model(input_ids, attention_mask).reshape(-1)

        # Calculate loss
        loss = criterion(outputs, prices)

        # Backward pass with gradient-norm clipping at 1.0
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Track metrics
        total_loss += loss.item()
        pred_np = outputs.detach().cpu().numpy()
        actual_np = prices.cpu().numpy()

        # Convert from log to original scale for metrics
        if use_log:
            pred_np = np.expm1(pred_np)
            actual_np = np.expm1(actual_np)

        predictions.extend(pred_np)
        actuals.extend(actual_np)

        progress_bar.set_postfix({'loss': loss.item()})

    avg_loss = total_loss / len(dataloader)
    rmse = np.sqrt(mean_squared_error(actuals, predictions))
    mae = mean_absolute_error(actuals, predictions)
    r2 = r2_score(actuals, predictions)
    mape = mean_absolute_percentage_error(actuals, predictions)
    smape = symmetric_mean_absolute_percentage_error(actuals, predictions)

    return avg_loss, rmse, mae, r2, mape, smape

def evaluate(model, dataloader, device, criterion, use_log):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Yields dict batches with ``'input_ids'``, ``'attention_mask'``
            and ``'price'`` tensors.
        device: Device the batch tensors are moved to.
        criterion: Loss applied to ``(outputs, prices)``.
        use_log: When True, predictions and targets are passed through
            ``np.expm1`` before the reported metrics are computed.

    Returns:
        Tuple ``(avg_loss, rmse, mae, r2, mape, smape, predictions, actuals)``
        where the last two are flat lists covering every evaluated sample.
    """
    model.eval()
    total_loss = 0
    predictions = []
    actuals = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            prices = batch['price'].to(device)

            # Force a 1-D output so a single-sample batch (0-d after a bare
            # squeeze in the model) still matches `prices` and can be extended.
            outputs = model(input_ids, attention_mask).reshape(-1)
            loss = criterion(outputs, prices)

            total_loss += loss.item()

            pred_np = outputs.cpu().numpy()
            actual_np = prices.cpu().numpy()

            # Convert from log to original scale for metrics
            if use_log:
                pred_np = np.expm1(pred_np)
                actual_np = np.expm1(actual_np)

            predictions.extend(pred_np)
            actuals.extend(actual_np)

    avg_loss = total_loss / len(dataloader)
    rmse = np.sqrt(mean_squared_error(actuals, predictions))
    mae = mean_absolute_error(actuals, predictions)
    r2 = r2_score(actuals, predictions)
    mape = mean_absolute_percentage_error(actuals, predictions)
    smape = symmetric_mean_absolute_percentage_error(actuals, predictions)

    return avg_loss, rmse, mae, r2, mape, smape, predictions, actuals

# ==================== MAIN TRAINING ====================
def main(df=None):
    """Train ``BERTPricePredictor`` on a dataframe and checkpoint the best epoch.

    Args:
        df: DataFrame with ``'catalog_content'`` and ``'price'`` columns. When
            omitted, the notebook-level ``data_no_outliers`` is used.

    Returns:
        Tuple ``(model, tokenizer, history)``. ``model`` holds the weights after
        the final epoch (the best epoch by validation SMAPE is written to
        ``best_bert_model_log.pt``); ``history`` maps metric names to per-epoch lists.

    Raises:
        ValueError: If ``df`` is None and ``data_no_outliers`` is not defined.
    """
    # Load your data
    if df is None:
        try:
            df = data_no_outliers.copy()
        except NameError:
            raise ValueError("Please pass your dataframe: main(data_no_outliers)")
    else:
        df = df.copy()

    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print(f"\nPrice statistics:")
    print(df['price'].describe())

    # Prepare data
    X = df['catalog_content'].values

    # Apply log transformation if enabled
    if CONFIG['use_log_transform']:
        print("\n‚úÖ Using LOG TRANSFORMATION for prices")
        y = np.log1p(df['price'].values)  # log(1 + price)
    else:
        print("\n‚ùå NOT using log transformation")
        y = df['price'].values

    # Train-validation split
    X_train, X_val, y_train, y_val = train_test_split(
        X, y,
        test_size=CONFIG['test_size'],
        random_state=CONFIG['random_state']
    )

    print(f"\nTrain size: {len(X_train)}")
    print(f"Validation size: {len(X_val)}")

    # Load tokenizer with workaround for HuggingFace Hub error
    print(f"\nLoading tokenizer: {CONFIG['bert_model']}")

    try:
        # Try normal loading first
        tokenizer = AutoTokenizer.from_pretrained(CONFIG['bert_model'])
        print("‚úÖ Tokenizer loaded successfully")
    except Exception as e:
        print(f"‚ö†Ô∏è Error loading tokenizer: {str(e)[:100]}")
        print("Trying alternative methods...")

        # Fallback 1: Use cached version
        try:
            tokenizer = AutoTokenizer.from_pretrained(CONFIG['bert_model'], local_files_only=True)
            print("‚úÖ Loaded from cache")
        except Exception:
            # Catch Exception rather than everything so KeyboardInterrupt /
            # SystemExit still stop the notebook instead of triggering a download.
            # Fallback 2: Use distilbert (compatible tokenizer)
            print("Switching to distilbert-base-uncased...")
            CONFIG['bert_model'] = 'distilbert-base-uncased'
            tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
            print("‚úÖ Using DistilBERT tokenizer instead")

    # Create datasets
    train_dataset = ProductDataset(X_train, y_train, tokenizer, CONFIG['max_length'])
    val_dataset = ProductDataset(X_val, y_val, tokenizer, CONFIG['max_length'])

    # Create dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=CONFIG['batch_size'],
        shuffle=True,
        num_workers=2
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=CONFIG['batch_size'],
        shuffle=False,
        num_workers=2
    )

    # Initialize model
    print(f"\nInitializing model: {CONFIG['bert_model']}")
    model = BERTPricePredictor(CONFIG['bert_model'], CONFIG['dropout'])
    model.to(device)

    # Loss function
    criterion = nn.MSELoss()

    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['learning_rate'])

    # Scheduler: linear decay over every optimisation step, no warmup
    total_steps = len(train_loader) * CONFIG['epochs']
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    # Training loop
    # Build the banner from CONFIG so it stays truthful when the epoch count or
    # the log-transform flag is changed.
    log_label = 'WITH' if CONFIG['use_log_transform'] else 'WITHOUT'
    print("\n" + "="*70)
    print(f"STARTING TRAINING - {CONFIG['epochs']} EPOCHS ({log_label} LOG TRANSFORM)")
    print("="*70)

    best_val_smape = float('inf')
    best_val_rmse = float('inf')
    history = {
        'train_loss': [], 'train_rmse': [], 'train_mae': [], 'train_r2': [], 'train_mape': [], 'train_smape': [],
        'val_loss': [], 'val_rmse': [], 'val_mae': [], 'val_r2': [], 'val_mape': [], 'val_smape': []
    }

    for epoch in range(CONFIG['epochs']):
        print(f"\n{'='*70}")
        print(f"EPOCH {epoch+1}/{CONFIG['epochs']}")
        print(f"{'='*70}")

        # Train
        train_loss, train_rmse, train_mae, train_r2, train_mape, train_smape = train_epoch(
            model, train_loader, optimizer, scheduler, device, criterion, CONFIG['use_log_transform']
        )

        # Validate
        val_loss, val_rmse, val_mae, val_r2, val_mape, val_smape, val_preds, val_actuals = evaluate(
            model, val_loader, device, criterion, CONFIG['use_log_transform']
        )

        # Save history
        history['train_loss'].append(train_loss)
        history['train_rmse'].append(train_rmse)
        history['train_mae'].append(train_mae)
        history['train_r2'].append(train_r2)
        history['train_mape'].append(train_mape)
        history['train_smape'].append(train_smape)
        history['val_loss'].append(val_loss)
        history['val_rmse'].append(val_rmse)
        history['val_mae'].append(val_mae)
        history['val_r2'].append(val_r2)
        history['val_mape'].append(val_mape)
        history['val_smape'].append(val_smape)

        # Print metrics
        print(f"\nüìä TRAINING RESULTS:")
        print(f"   Loss: {train_loss:.4f} | RMSE: {train_rmse:.4f} | MAE: {train_mae:.4f} | R¬≤: {train_r2:.4f}")
        print(f"   MAPE: {train_mape:.2f}% | SMAPE: {train_smape:.2f}%")

        print(f"\nüìä VALIDATION RESULTS:")
        print(f"   Loss: {val_loss:.4f} | RMSE: {val_rmse:.4f} | MAE: {val_mae:.4f} | R¬≤: {val_r2:.4f}")
        print(f"   MAPE: {val_mape:.2f}% | SMAPE: {val_smape:.2f}% ‚≠ê (COMPETITION METRIC)")

        # Save best model based on SMAPE (competition metric)
        if val_smape < best_val_smape:
            best_val_smape = val_smape
            best_val_rmse = val_rmse
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_smape': val_smape,
                'val_rmse': val_rmse,
                'config': CONFIG
            }, 'best_bert_model_log.pt')
            print(f"\n‚úÖ Best model saved! (Val SMAPE: {val_smape:.2f}%)")

    print("\n" + "="*70)
    print("üéâ TRAINING COMPLETE!")
    print("="*70)
    print(f"üèÜ Best Validation SMAPE: {best_val_smape:.2f}% (Competition Metric)")
    print(f"üìà Best Validation RMSE: {best_val_rmse:.4f}")
    print(f"üíæ Model saved as: best_bert_model_log.pt")
    print("="*70)

    return model, tokenizer, history

# ==================== PREDICTION FUNCTION ====================
def predict_prices(model, texts, tokenizer, device, use_log_transform=True, batch_size=16):
    """Predict prices for new data.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        texts: Integer-indexable sequence of texts.
        tokenizer: Tokenizer handed to ``ProductDataset``.
        device: Device the batch tensors are moved to.
        use_log_transform: When True, model outputs are passed through
            ``np.expm1`` before being returned.
        batch_size: Number of texts per forward pass.

    Returns:
        1-D ``np.ndarray`` with one prediction per input text, in input order.
    """
    model.eval()
    predictions = []

    # ProductDataset requires targets; zeros are placeholders and are never read here
    dummy_prices = np.zeros(len(texts))
    dataset = ProductDataset(texts, dummy_prices, tokenizer, CONFIG['max_length'])
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Predicting'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Force a 1-D output so a trailing single-sample batch (0-d after a
            # bare squeeze in the model) can still be appended to `predictions`.
            outputs = model(input_ids, attention_mask).reshape(-1)
            pred_np = outputs.cpu().numpy()

            # Convert from log to original scale if using log transform
            if use_log_transform:
                pred_np = np.expm1(pred_np)  # exp(x) - 1

            predictions.extend(pred_np)

    return np.array(predictions)

# ==================== RUN TRAINING ====================
if __name__ == "__main__":
    # Train the model on the outlier-filtered frame; the returned names become notebook globals
    model, tokenizer, history = main(data_no_outliers)
    
    # Print final summary: one line of validation SMAPE / RMSE per recorded epoch
    print("\n" + "="*70)
    print("üìä TRAINING SUMMARY")
    print("="*70)
    for epoch in range(len(history['val_smape'])):
        print(f"Epoch {epoch+1}:")
        print(f"  Val SMAPE: {history['val_smape'][epoch]:.2f}% | Val RMSE: {history['val_rmse'][epoch]:.4f}")
    print("="*70)

In [None]:
# ==================== CONTINUE TRAINING FROM CHECKPOINT ====================
def continue_training(checkpoint_path, df, additional_epochs=1):
    """Continue training from a saved checkpoint.

    Args:
        checkpoint_path: Path to a checkpoint dict containing ``'epoch'``,
            ``'model_state_dict'``, ``'optimizer_state_dict'`` and ``'val_smape'``.
        df: DataFrame with ``'catalog_content'`` and ``'price'`` columns; it is
            split with the same ``test_size`` / ``random_state`` from CONFIG.
        additional_epochs: Number of further epochs to run.

    Returns:
        Tuple ``(model, tokenizer)``. ``model`` holds the weights after the last
        additional epoch; the checkpoint file is overwritten only when
        validation SMAPE improves on the stored value.
    """

    print(f"Loading checkpoint from: {checkpoint_path}")
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only runtime.
    # NOTE(security): weights_only=False unpickles arbitrary objects - only load
    # checkpoint files you created yourself.
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)

    # Prepare data (same as before)
    X = df['catalog_content'].values
    if CONFIG['use_log_transform']:
        y = np.log1p(df['price'].values)
    else:
        y = df['price'].values

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=CONFIG['test_size'], random_state=CONFIG['random_state']
    )

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(CONFIG['bert_model'])

    # Create datasets and dataloaders
    train_dataset = ProductDataset(X_train, y_train, tokenizer, CONFIG['max_length'])
    val_dataset = ProductDataset(X_val, y_val, tokenizer, CONFIG['max_length'])

    train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'], shuffle=False, num_workers=2)

    # Recreate model
    model = BERTPricePredictor(CONFIG['bert_model'], CONFIG['dropout'])
    model.to(device)

    # Load saved weights
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"‚úÖ Loaded model from epoch {checkpoint['epoch'] + 1}")
    print(f"   Previous best val SMAPE: {checkpoint['val_smape']:.2f}%")

    # Recreate optimizer and restore its saved state
    optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['learning_rate'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # Create a fresh linear schedule spanning only the additional epochs
    total_steps = len(train_loader) * additional_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )

    criterion = nn.MSELoss()

    # Track best metrics; the stored 'epoch' is 0-based, so +1 is the next 0-based epoch
    best_val_smape = checkpoint['val_smape']
    start_epoch = checkpoint['epoch'] + 1

    print(f"\n{'='*70}")
    print(f"CONTINUING TRAINING FOR {additional_epochs} MORE EPOCH(S)")
    print(f"Starting from epoch {start_epoch + 1}")
    print(f"{'='*70}\n")

    # Training loop
    for epoch in range(additional_epochs):
        # 1-based epoch number for display
        current_epoch = start_epoch + epoch + 1
        print(f"\n{'='*70}")
        print(f"EPOCH {current_epoch} (Continuation)")
        print(f"{'='*70}")

        # Train
        train_loss, train_rmse, train_mae, train_r2, train_mape, train_smape = train_epoch(
            model, train_loader, optimizer, scheduler, device, criterion, CONFIG['use_log_transform']
        )

        # Validate
        val_loss, val_rmse, val_mae, val_r2, val_mape, val_smape, _, _ = evaluate(
            model, val_loader, device, criterion, CONFIG['use_log_transform']
        )

        # Print metrics
        print(f"\nüìä TRAINING RESULTS:")
        print(f"   Loss: {train_loss:.4f} | RMSE: {train_rmse:.4f} | MAE: {train_mae:.4f} | R¬≤: {train_r2:.4f}")
        print(f"   MAPE: {train_mape:.2f}% | SMAPE: {train_smape:.2f}%")

        print(f"\nüìä VALIDATION RESULTS:")
        print(f"   Loss: {val_loss:.4f} | RMSE: {val_rmse:.4f} | MAE: {val_mae:.4f} | R¬≤: {val_r2:.4f}")
        print(f"   MAPE: {val_mape:.2f}% | SMAPE: {val_smape:.2f}% ‚≠ê")

        # Save if better
        if val_smape < best_val_smape:
            best_val_smape = val_smape
            torch.save({
                'epoch': current_epoch - 1,  # stored 0-based, matching main()
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_smape': val_smape,
                'val_rmse': val_rmse,
                'config': CONFIG
            }, 'best_bert_model_log.pt')
            print(f"\n‚úÖ Improved! New best model saved! (Val SMAPE: {val_smape:.2f}%)")
        else:
            print(f"\n‚ö†Ô∏è No improvement (Best SMAPE: {best_val_smape:.2f}%)")

    print(f"\nüéâ Training continuation complete!")
    print(f"üèÜ Best SMAPE: {best_val_smape:.2f}%")

    return model, tokenizer


In [None]:
# Resume from the checkpoint file and train for one more epoch.
# Needs continue_training to be defined and 'best_bert_model_log.pt' to exist on disk.
model, tokenizer = continue_training('best_bert_model_log.pt', data_no_outliers, additional_epochs=1)

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tqdm import tqdm
import pickle
import warnings
warnings.filterwarnings('ignore')

# Set device
# Use CUDA when PyTorch reports it as available, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# ==================== CONFIG ====================
# NOTE: this rebinds the global CONFIG defined in the earlier training cell, so any
# function that reads CONFIG sees these values once this cell has run.
CONFIG = {
    'bert_model': 'distilbert-base-uncased',  # Model id passed to AutoTokenizer / AutoModel
    'max_length': 256,  # max_length handed to the tokenizer
    'batch_size': 16,  # DataLoader batch size
    'epochs': 3,  # One more epoch than the earlier CONFIG
    'learning_rate': 2e-5,
    'dropout': 0.3,
    'test_size': 0.2,  # Validation fraction for train_test_split
    'random_state': 42,  # Seed for train_test_split
    'use_log_transform': True,  # Train on log1p(price); metrics computed after expm1
    'n_clusters': 20,  # NEW: Number of K-Means clusters
    'cluster_embed_dim': 64  # NEW: Cluster embedding dimension (presumably forwarded to BERTPricePredictor_WithClusters - confirm in train_with_clusters)
}

# ==================== METRIC FUNCTIONS ====================
def mean_absolute_percentage_error(y_true, y_pred):
    """Calculate MAPE (in percent) over the samples whose true value is non-zero.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predictions, same length as ``y_true``.

    Returns:
        Mean of ``|y_true - y_pred| / |y_true|`` times 100, computed only on
        entries where ``y_true != 0``. Returns ``nan`` when no such entry
        exists.
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    # A zero target would turn the whole mean into inf/nan, so skip those rows
    # (mirrors the masking done in the SMAPE helper).
    nonzero = y_true != 0
    if not nonzero.any():
        return float('nan')
    return np.mean(np.abs((y_true[nonzero] - y_pred[nonzero]) / y_true[nonzero])) * 100

def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    """Calculate SMAPE (in percent).

    Each sample contributes ``|pred - true| / ((|true| + |pred|) / 2)``;
    samples whose denominator is zero are left out of the mean.
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    scale = (np.abs(actual) + np.abs(forecast)) / 2
    # Keep only rows with a strictly positive denominator
    usable = scale > 0
    abs_error = np.abs(forecast[usable] - actual[usable])
    return np.mean(abs_error / scale[usable]) * 100

# ==================== CLUSTERING FUNCTIONS ====================
def extract_bert_embeddings(texts, model_name, tokenizer, device, batch_size=32, max_length=256):
    """
    Extract first-token ([CLS]) embeddings for clustering.

    Args:
        texts: Sliceable sequence of texts (list, array, ...). Each element is
            converted with ``str`` before tokenisation, as the dataset classes do.
        model_name: Model id handed to ``AutoModel.from_pretrained``.
        tokenizer: Tokenizer callable applied to each batch of texts.
        device: Device the model and batch tensors are moved to.
        batch_size: Number of texts per forward pass.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        ``np.ndarray`` of shape ``(len(texts), hidden_size)``; an empty
        ``(0, hidden_size)`` array when ``texts`` is empty.
    """
    print(f"\nüîç Extracting BERT embeddings for {len(texts)} samples...")

    # Load model for embedding extraction
    bert_model = AutoModel.from_pretrained(model_name)
    bert_model.to(device)
    bert_model.eval()
    # Remembered so an empty input can still return a correctly shaped array
    hidden_size = bert_model.config.hidden_size

    embeddings = []

    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc='Extracting embeddings'):
            # Always hand the tokenizer a plain list of str, whatever container
            # or element type the caller supplied.
            batch_texts = [str(t) for t in texts[i:i+batch_size]]

            # Tokenize (pad to the longest text in the batch)
            encoding = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt'
            )

            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)

            # Get [CLS] embeddings
            outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

            embeddings.append(cls_embeddings)

    # Clean up
    del bert_model
    torch.cuda.empty_cache()

    if embeddings:
        embeddings = np.vstack(embeddings)
    else:
        # np.vstack([]) raises ValueError; return an explicit empty matrix instead
        embeddings = np.empty((0, hidden_size), dtype=np.float32)
    print(f"‚úÖ Extracted embeddings shape: {embeddings.shape}")

    return embeddings

def perform_clustering(embeddings, n_clusters=20):
    """
    Fit K-Means on the embeddings and report how many samples land in each cluster.

    Returns the fitted estimator together with the per-sample cluster labels.
    """
    print(f"\nüéØ Performing K-Means clustering with {n_clusters} clusters...")

    kmeans_kwargs = dict(
        n_clusters=n_clusters,
        random_state=42,
        n_init=10,
        max_iter=300,
        verbose=0,
    )
    kmeans = KMeans(**kmeans_kwargs)
    cluster_labels = kmeans.fit_predict(embeddings)

    print(f"‚úÖ Clustering complete!")
    print(f"\nCluster distribution:")
    total = len(cluster_labels)
    # np.unique returns (labels, counts); walk them pairwise
    for cluster_id, count in zip(*np.unique(cluster_labels, return_counts=True)):
        print(f"   Cluster {cluster_id:2d}: {count:5d} samples ({count/total*100:.1f}%)")

    return kmeans, cluster_labels

def analyze_clusters(df, cluster_labels, n_samples=3):
    """
    Print price statistics and a few sample rows for every cluster present.

    Args:
        df: DataFrame with ``'price'`` and ``'catalog_content'`` columns. It is
            copied, so the caller's frame is not modified.
        cluster_labels: One cluster id per row of ``df``.
        n_samples: Number of example rows printed per cluster.

    Returns:
        None. All output goes to stdout.
    """
    print(f"\nüìä CLUSTER ANALYSIS:")
    print("="*80)

    df_with_clusters = df.copy()
    df_with_clusters['cluster_id'] = cluster_labels

    # Iterate over the labels actually present instead of a global cluster count,
    # so the report is correct for any labelling and never shows empty clusters.
    for i in np.unique(df_with_clusters['cluster_id']):
        cluster_data = df_with_clusters[df_with_clusters['cluster_id'] == i]

        print(f"\nüè∑Ô∏è  CLUSTER {i} ({len(cluster_data)} samples)")
        print(f"   Price range: ${cluster_data['price'].min():.2f} - ${cluster_data['price'].max():.2f}")
        print(f"   Mean price: ${cluster_data['price'].mean():.2f}")
        print(f"   Median price: ${cluster_data['price'].median():.2f}")
        print(f"\n   Sample products:")

        for idx, row in cluster_data.head(n_samples).iterrows():
            # str() guards against non-string cells (e.g. NaN) before slicing
            text_preview = str(row['catalog_content'])[:80] + "..."
            print(f"   ‚Ä¢ ${row['price']:.2f} - {text_preview}")

# ==================== NEW DATASET WITH CLUSTERS ====================
class ProductDataset_WithClusters(Dataset):
    """Map-style dataset yielding tokenized text, a cluster id and a price target.

    ``texts``, ``prices`` and ``cluster_ids`` are indexed with the same integer
    ``idx``; ``tokenizer`` is called once per item with ``max_length`` padding
    and truncation and must return tensors under ``'input_ids'`` and
    ``'attention_mask'``.
    """

    def __init__(self, texts, prices, cluster_ids, tokenizer, max_length):
        self.texts, self.prices, self.cluster_ids = texts, prices, cluster_ids
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts[idx])
        target = self.prices[idx]
        cluster = self.cluster_ids[idx]

        # Encode one text, padded/truncated to exactly max_length tokens
        encoded = self.tokenizer(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # Flatten the tokenizer tensors to 1-D so default collation stacks them per batch
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['cluster_id'] = torch.tensor(cluster, dtype=torch.long)
        sample['price'] = torch.tensor(target, dtype=torch.float)
        return sample

# ==================== NEW MODEL WITH CLUSTERING ====================
class BERTPricePredictor_WithClusters(nn.Module):
    """Transformer encoder plus a learned cluster embedding, feeding an MLP regressor.

    Args:
        bert_model_name: Model id handed to ``AutoModel.from_pretrained``.
        n_clusters: Number of rows in the cluster embedding table; cluster ids
            passed to ``forward`` must lie in ``[0, n_clusters)``.
        cluster_embed_dim: Width of each cluster embedding vector.
        dropout: Dropout probability used between the MLP layers.
    """

    def __init__(self, bert_model_name, n_clusters=20, cluster_embed_dim=64, dropout=0.3):
        super(BERTPricePredictor_WithClusters, self).__init__()

        # Load pre-trained encoder
        self.bert = AutoModel.from_pretrained(bert_model_name)

        # Width of the encoder's hidden states, read from its config
        bert_hidden_size = self.bert.config.hidden_size

        # Cluster embedding layer
        self.cluster_embedding = nn.Embedding(n_clusters, cluster_embed_dim)

        # The regressor sees the text vector and the cluster vector side by side
        total_size = bert_hidden_size + cluster_embed_dim

        # MLP for price prediction
        self.regressor = nn.Sequential(
            nn.Linear(total_size, 512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1)
        )

    def forward(self, input_ids, attention_mask, cluster_ids):
        """Return one prediction per input row as a 1-D tensor of length batch."""
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # First-token hidden state: [batch, hidden]
        cls_output = outputs.last_hidden_state[:, 0, :]

        # Look up cluster embeddings: [batch, cluster_embed_dim]
        cluster_emb = self.cluster_embedding(cluster_ids)

        # Concatenate along the feature axis
        combined = torch.cat([cls_output, cluster_emb], dim=1)

        price = self.regressor(combined)

        # Drop only the trailing size-1 feature axis. A bare squeeze() would also
        # remove the batch axis when the batch holds a single sample, yielding a
        # 0-d tensor that callers cannot iterate over.
        return price.squeeze(-1)

# ==================== TRAINING FUNCTIONS ====================
def train_epoch_with_clusters(model, dataloader, optimizer, scheduler, device, criterion, use_log):
    """Run one optimisation pass over ``dataloader`` for a cluster-aware model.

    Args:
        model: Module called as ``model(input_ids, attention_mask, cluster_ids)``.
        dataloader: Yields dict batches with ``'input_ids'``, ``'attention_mask'``,
            ``'cluster_id'`` and ``'price'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch.
        device: Device the batch tensors are moved to.
        criterion: Loss applied to ``(outputs, prices)``.
        use_log: When True, predictions and targets are passed through
            ``np.expm1`` before the reported metrics are computed.

    Returns:
        Tuple ``(avg_loss, rmse, mae, r2, mape, smape)``; ``avg_loss`` is the
        mean per-batch loss, the rest are computed over all samples of the epoch.
    """
    model.train()
    total_loss = 0
    predictions = []
    actuals = []

    progress_bar = tqdm(dataloader, desc='Training')

    for batch in progress_bar:
        # Move to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        cluster_ids = batch['cluster_id'].to(device)
        prices = batch['price'].to(device)

        # Forward pass
        optimizer.zero_grad()
        # Force a 1-D output: a model that squeezes a single-sample batch down to
        # a 0-d tensor would otherwise broadcast in the loss and break extend().
        outputs = model(input_ids, attention_mask, cluster_ids).reshape(-1)

        # Calculate loss
        loss = criterion(outputs, prices)

        # Backward pass with gradient-norm clipping at 1.0
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Track metrics
        total_loss += loss.item()
        pred_np = outputs.detach().cpu().numpy()
        actual_np = prices.cpu().numpy()

        # Convert from log if needed
        if use_log:
            pred_np = np.expm1(pred_np)
            actual_np = np.expm1(actual_np)

        predictions.extend(pred_np)
        actuals.extend(actual_np)

        progress_bar.set_postfix({'loss': loss.item()})

    avg_loss = total_loss / len(dataloader)
    rmse = np.sqrt(mean_squared_error(actuals, predictions))
    mae = mean_absolute_error(actuals, predictions)
    r2 = r2_score(actuals, predictions)
    mape = mean_absolute_percentage_error(actuals, predictions)
    smape = symmetric_mean_absolute_percentage_error(actuals, predictions)

    return avg_loss, rmse, mae, r2, mape, smape

def evaluate_with_clusters(model, dataloader, device, criterion, use_log):
    """Score a cluster-aware model on ``dataloader`` without updating weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask, cluster_ids)``.
        dataloader: Yields dict batches with ``'input_ids'``, ``'attention_mask'``,
            ``'cluster_id'`` and ``'price'`` tensors.
        device: Device the batch tensors are moved to.
        criterion: Loss applied to ``(outputs, prices)``.
        use_log: When True, predictions and targets are passed through
            ``np.expm1`` before the reported metrics are computed.

    Returns:
        Tuple ``(avg_loss, rmse, mae, r2, mape, smape, predictions, actuals)``
        where the last two are flat lists covering every evaluated sample.
    """
    model.eval()
    total_loss = 0
    predictions = []
    actuals = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            cluster_ids = batch['cluster_id'].to(device)
            prices = batch['price'].to(device)

            # Force a 1-D output so a single-sample batch (0-d after a bare
            # squeeze in the model) still matches `prices` and can be extended.
            outputs = model(input_ids, attention_mask, cluster_ids).reshape(-1)
            loss = criterion(outputs, prices)

            total_loss += loss.item()
            pred_np = outputs.cpu().numpy()
            actual_np = prices.cpu().numpy()

            # Convert from log to original scale for metrics
            if use_log:
                pred_np = np.expm1(pred_np)
                actual_np = np.expm1(actual_np)

            predictions.extend(pred_np)
            actuals.extend(actual_np)

    avg_loss = total_loss / len(dataloader)
    rmse = np.sqrt(mean_squared_error(actuals, predictions))
    mae = mean_absolute_error(actuals, predictions)
    r2 = r2_score(actuals, predictions)
    mape = mean_absolute_percentage_error(actuals, predictions)
    smape = symmetric_mean_absolute_percentage_error(actuals, predictions)

    return avg_loss, rmse, mae, r2, mape, smape, predictions, actuals

# ==================== MAIN: CREATE CLUSTERS & TRAIN ====================
def train_with_clusters(df, old_checkpoint_path=None):
    """
    Complete pipeline: extract embeddings -> cluster -> train model with clusters.

    Args:
        df: DataFrame providing the 'catalog_content' (text) and 'price'
            columns.
        old_checkpoint_path: Optional path to a checkpoint dict containing a
            'model_state_dict' entry. Only its 'bert.'-prefixed tensors are
            copied into the new model; the regressor head is not reused.

    Returns:
        ``(model, tokenizer, kmeans, history)``. ``model`` holds the weights
        after the last epoch; the weights with the lowest validation SMAPE are
        the ones written to 'best_bert_cluster_model.pt'. ``history`` maps
        'train_loss', 'train_smape', 'val_loss' and 'val_smape' to per-epoch
        lists.

    Side effects:
        Writes 'kmeans_model.pkl' and 'best_bert_cluster_model.pt' to the
        current working directory and prints progress.
    """
    print("="*80)
    print("üöÄ TRAINING WITH CLUSTERING INTEGRATION")
    print("="*80)

    # Step 1: Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(CONFIG['bert_model'])

    # Step 2: Extract BERT embeddings for clustering
    bert_embeddings = extract_bert_embeddings(
        texts=df['catalog_content'].tolist(),
        model_name=CONFIG['bert_model'],
        tokenizer=tokenizer,
        device=device,
        batch_size=32
    )

    # Step 3: Perform clustering
    # NOTE(review): the clustering is fit on every row of df, i.e. before the
    # train/validation split below, so validation rows influence the centroids.
    kmeans, cluster_labels = perform_clustering(bert_embeddings, CONFIG['n_clusters'])

    # Step 4: Analyze clusters
    analyze_clusters(df, cluster_labels, n_samples=2)

    # Step 5: Save clustering model
    print(f"\nüíæ Saving clustering model...")
    with open('kmeans_model.pkl', 'wb') as f:
        pickle.dump(kmeans, f)
    print("‚úÖ Saved: kmeans_model.pkl")

    # Step 6: Prepare training data
    X = df['catalog_content'].values
    cluster_ids = cluster_labels

    if CONFIG['use_log_transform']:
        y = np.log1p(df['price'].values)
    else:
        y = df['price'].values

    # Train-validation split (same indices for all)
    X_train, X_val, y_train, y_val, cluster_train, cluster_val = train_test_split(
        X, y, cluster_ids,
        test_size=CONFIG['test_size'],
        random_state=CONFIG['random_state']
    )

    print(f"\nüìä Data split:")
    print(f"   Train size: {len(X_train)}")
    print(f"   Val size: {len(X_val)}")

    # Step 7: Create datasets with clusters
    train_dataset = ProductDataset_WithClusters(X_train, y_train, cluster_train, tokenizer, CONFIG['max_length'])
    val_dataset = ProductDataset_WithClusters(X_val, y_val, cluster_val, tokenizer, CONFIG['max_length'])

    train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'], shuffle=False, num_workers=2)

    # Step 8: Initialize NEW model with clustering
    print(f"\nüèóÔ∏è  Initializing model with clustering support...")
    model = BERTPricePredictor_WithClusters(
        CONFIG['bert_model'],
        n_clusters=CONFIG['n_clusters'],
        cluster_embed_dim=CONFIG['cluster_embed_dim'],
        dropout=CONFIG['dropout']
    )

    # Optional: Load BERT weights from old checkpoint (transfer learning)
    if old_checkpoint_path:
        print(f"\nüîÑ Loading BERT weights from old checkpoint: {old_checkpoint_path}")
        # map_location='cpu' lets a checkpoint saved on a GPU load on a
        # CPU-only runtime; the model is still on the CPU at this point and is
        # moved to `device` afterwards.
        old_checkpoint = torch.load(old_checkpoint_path, map_location='cpu', weights_only=False)

        # Load only BERT weights (not the regressor since architecture changed)
        old_state_dict = old_checkpoint['model_state_dict']
        bert_weights = {k: v for k, v in old_state_dict.items() if k.startswith('bert.')}

        if bert_weights:
            # strict=False: keys absent from bert_weights keep their current values.
            model.load_state_dict(bert_weights, strict=False)
            print(f"‚úÖ Transferred BERT weights from previous model")
        else:
            # Do not claim a transfer that did not happen.
            print("WARNING: no 'bert.' weights found in checkpoint; keeping the pretrained encoder weights")

    model.to(device)

    # Step 9: Training setup
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['learning_rate'])

    total_steps = len(train_loader) * CONFIG['epochs']
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )

    # Step 10: Training loop
    print("\n" + "="*80)
    print(f"üéì TRAINING WITH CLUSTERS - {CONFIG['epochs']} EPOCHS")
    print("="*80)

    best_val_smape = float('inf')
    history = {
        'train_loss': [], 'train_smape': [],
        'val_loss': [], 'val_smape': []
    }

    for epoch in range(CONFIG['epochs']):
        print(f"\n{'='*80}")
        print(f"EPOCH {epoch+1}/{CONFIG['epochs']}")
        print(f"{'='*80}")

        # Train (only loss and SMAPE are tracked; the other metrics are discarded)
        train_loss, _, _, _, _, train_smape = train_epoch_with_clusters(
            model, train_loader, optimizer, scheduler, device, criterion, CONFIG['use_log_transform']
        )

        # Validate
        val_loss, _, _, _, _, val_smape, _, _ = evaluate_with_clusters(
            model, val_loader, device, criterion, CONFIG['use_log_transform']
        )

        # Save history
        history['train_loss'].append(train_loss)
        history['train_smape'].append(train_smape)
        history['val_loss'].append(val_loss)
        history['val_smape'].append(val_smape)

        # Print metrics
        print(f"\nüìä TRAINING: Loss: {train_loss:.4f} | SMAPE: {train_smape:.2f}%")
        print(f"üìä VALIDATION: Loss: {val_loss:.4f} | SMAPE: {val_smape:.2f}% ‚≠ê")

        # Save best model (lowest validation SMAPE seen so far)
        if val_smape < best_val_smape:
            best_val_smape = val_smape
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_smape': val_smape,
                'config': CONFIG
            }, 'best_bert_cluster_model.pt')
            print(f"\n‚úÖ Best model saved! (Val SMAPE: {val_smape:.2f}%)")

    print("\n" + "="*80)
    print("üéâ TRAINING COMPLETE!")
    print("="*80)
    print(f"üèÜ Best Validation SMAPE: {best_val_smape:.2f}%")
    print(f"üíæ Model saved as: best_bert_cluster_model.pt")
    print(f"üíæ Clustering saved as: kmeans_model.pkl")
    print("="*80)

    return model, tokenizer, kmeans, history

# ==================== PREDICTION WITH CLUSTERS ====================
def predict_with_clusters(model, texts, tokenizer, kmeans, device, use_log_transform=True, batch_size=16):
    """
    Predict prices for new data using clustering.

    Args:
        model: Called as ``model(input_ids, attention_mask, cluster_ids)``.
        texts: Sequence of catalog texts to price.
        tokenizer: Tokenizer used for both embedding extraction and the dataset.
        kmeans: Fitted clustering model; its ``predict`` assigns a cluster id
            to each embedding.
        device: Device the batch tensors are moved to.
        use_log_transform: When True, model outputs go through ``np.expm1``.
        batch_size: Batch size of the price-prediction loop.

    Returns:
        ``np.ndarray`` with one predicted price per input text.

    NOTE(review): the dataset is built with five positional arguments (prices
    in second place). A later cell of this notebook redefines
    ``ProductDataset_WithClusters`` with a four-argument constructor, so this
    function only works while the training-time class is the one in scope.
    """
    print(f"\nüîÆ Predicting prices for {len(texts)} samples...")

    # Step 1: Extract embeddings for test data
    bert_embeddings = extract_bert_embeddings(
        texts=texts,
        model_name=CONFIG['bert_model'],
        tokenizer=tokenizer,
        device=device,
        batch_size=32
    )

    # Step 2: Predict clusters
    print(f"üéØ Assigning clusters...")
    cluster_ids = kmeans.predict(bert_embeddings)
    print(f"‚úÖ Clusters assigned")

    # Step 3: Predict prices
    model.eval()
    predictions = []

    # The dataset expects a price per row; zeros act as placeholders here.
    dummy_prices = np.zeros(len(texts))
    dataset = ProductDataset_WithClusters(texts, dummy_prices, cluster_ids, tokenizer, CONFIG['max_length'])
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Predicting prices'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            cluster_batch = batch['cluster_id'].to(device)

            outputs = model(input_ids, attention_mask, cluster_batch)
            # A trailing batch of size 1 can come back 0-d when the model ends
            # with a bare .squeeze(); promote to 1-D so extend() can iterate it.
            pred_np = np.atleast_1d(outputs.cpu().numpy())

            if use_log_transform:
                pred_np = np.expm1(pred_np)

            predictions.extend(pred_np)

    return np.array(predictions)



In [None]:
# ==================== RUN TRAINING ====================
if __name__ == "__main__":
    # Fit the cluster-aware model on the outlier-filtered frame. The second
    # argument is old_checkpoint_path: BERT weights are transferred from it.
    model, tokenizer, kmeans, history = train_with_clusters(
        data_no_outliers,
        'best_bert_model_log.pt',  # Your existing checkpoint
    )

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import pickle
import warnings
warnings.filterwarnings('ignore')

# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# ==================== CONFIG (MUST MATCH TRAINING) ====================
# Inference-time settings. This rebinds the global CONFIG, so training-only
# keys read elsewhere in the notebook ('epochs', 'learning_rate', 'test_size',
# 'random_state') are not present after this cell runs.
CONFIG = dict(
    bert_model='distilbert-base-uncased',  # checkpoint name for tokenizer and encoder
    max_length=256,                        # token length passed to the tokenizer
    batch_size=16,                         # prediction batch size
    use_log_transform=True,                # model outputs are mapped back with expm1
    n_clusters=20,                         # rows of the cluster embedding table
    cluster_embed_dim=64,                  # width of each cluster embedding
    dropout=0.3,                           # dropout probability in the regressor head
)

# ==================== MODEL DEFINITIONS ====================

# OLD MODEL (Without Clustering)
class BERTPricePredictor_Old(nn.Module):
    """
    Text-only price regressor: a pretrained encoder followed by an MLP head.

    Args:
        bert_model_name: Name or path handed to ``AutoModel.from_pretrained``.
        dropout: Dropout probability used after each hidden layer of the head.
    """

    def __init__(self, bert_model_name, dropout=0.3):
        super().__init__()

        self.bert = AutoModel.from_pretrained(bert_model_name)
        bert_hidden_size = self.bert.config.hidden_size

        # hidden_size -> 512 -> 256 -> 128 -> 1
        self.regressor = nn.Sequential(
            nn.Linear(bert_hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1)
        )

    def forward(self, input_ids, attention_mask):
        """
        Return one value per input row, with shape ``(batch,)``.

        The regressor is fed the encoder's hidden state at sequence position 0.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        price = self.regressor(cls_output)
        # Squeeze only the trailing size-1 feature axis. A bare .squeeze() would
        # also drop the batch axis for a batch of one and return a 0-d tensor,
        # which breaks callers that iterate or extend over the result.
        return price.squeeze(-1)

# NEW MODEL (With Clustering)
class BERTPricePredictor_WithClusters(nn.Module):
    """
    Price regressor that combines an encoder output with a learned cluster
    embedding before the MLP head.

    Args:
        bert_model_name: Name or path handed to ``AutoModel.from_pretrained``.
        n_clusters: Number of rows in the cluster embedding table.
        cluster_embed_dim: Width of each cluster embedding vector.
        dropout: Dropout probability used after each hidden layer of the head.
    """

    def __init__(self, bert_model_name, n_clusters=20, cluster_embed_dim=64, dropout=0.3):
        super().__init__()

        self.bert = AutoModel.from_pretrained(bert_model_name)
        bert_hidden_size = self.bert.config.hidden_size

        self.cluster_embedding = nn.Embedding(n_clusters, cluster_embed_dim)

        # The head sees the text vector and the cluster vector concatenated.
        total_size = bert_hidden_size + cluster_embed_dim

        self.regressor = nn.Sequential(
            nn.Linear(total_size, 512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1)
        )

    def forward(self, input_ids, attention_mask, cluster_ids):
        """
        Return one value per input row, with shape ``(batch,)``.

        ``cluster_ids`` indexes the embedding table, so it must hold integer
        ids below ``n_clusters``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        cluster_emb = self.cluster_embedding(cluster_ids)
        combined = torch.cat([cls_output, cluster_emb], dim=1)
        price = self.regressor(combined)
        # Squeeze only the trailing size-1 feature axis. A bare .squeeze() would
        # also drop the batch axis for a batch of one and return a 0-d tensor,
        # which breaks callers that iterate or extend over the result.
        return price.squeeze(-1)

# ==================== DATASET CLASSES ====================

class ProductDataset_Old(Dataset):
    """Text-only dataset: each item is the tokenized form of one entry of ``texts``."""

    def __init__(self, texts, tokenizer, max_length):
        self.texts, self.tokenizer, self.max_length = texts, tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize entry ``idx`` (stringified first) and return 1-D tensors."""
        tokenized = self.tokenizer(
            str(self.texts[idx]),
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer returns a leading batch axis of 1; flatten it away.
        return {
            field: tokenized[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }

class ProductDataset_WithClusters(Dataset):
    """Inference dataset pairing each tokenized text with its cluster id (no price)."""

    def __init__(self, texts, cluster_ids, tokenizer, max_length):
        self.texts, self.cluster_ids = texts, cluster_ids
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return flattened token tensors plus the cluster id as a long tensor."""
        raw_text = str(self.texts[idx])
        assigned_cluster = self.cluster_ids[idx]

        tokenized = self.tokenizer(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer returns a leading batch axis of 1; flatten it away.
        sample = {
            field: tokenized[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['cluster_id'] = torch.tensor(assigned_cluster, dtype=torch.long)
        return sample

# ==================== HELPER FUNCTIONS ====================

def extract_bert_embeddings(texts, model_name, tokenizer, device, batch_size=32):
    """
    Extract BERT embeddings for clustering.

    A fresh copy of the pretrained ``model_name`` encoder is loaded, run over
    ``texts`` in batches without gradients, and released afterwards.

    Args:
        texts: Sequence of catalog texts. Entries are stringified before
            tokenization, matching what the Dataset classes in this notebook do.
        model_name: Name or path handed to ``AutoModel.from_pretrained``.
        tokenizer: Tokenizer called on each batch of strings.
        device: Device the encoder and batch tensors are moved to.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        ``np.ndarray`` with one row per text, taken from the encoder's hidden
        state at sequence position 0. Empty input yields an array with zero
        rows and ``hidden_size`` columns.
    """
    print(f"üîç Extracting BERT embeddings for {len(texts)} samples...")

    bert_model = AutoModel.from_pretrained(model_name)
    bert_model.to(device)
    bert_model.eval()
    hidden_size = bert_model.config.hidden_size

    embeddings = []

    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc='Extracting embeddings'):
            # Stringify every entry: values such as NaN read from a CSV are not
            # str and would otherwise make the tokenizer raise. Building a list
            # also lets `texts` be an array rather than a list.
            batch_texts = [str(text) for text in texts[i:i+batch_size]]

            encoding = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=CONFIG['max_length'],
                return_tensors='pt'
            )

            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)

            outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

            embeddings.append(cls_embeddings)

    # Release the temporary encoder before the caller loads its own model.
    del bert_model
    torch.cuda.empty_cache()

    if not embeddings:
        # np.vstack raises on an empty list; return a well-formed empty matrix.
        return np.empty((0, hidden_size), dtype=np.float32)

    return np.vstack(embeddings)

# ==================== PREDICTION FUNCTIONS ====================

def predict_with_old_model(model, texts, tokenizer, device, batch_size=16):
    """
    Predict prices using old model (no clustering).

    Args:
        model: Called as ``model(input_ids, attention_mask)``.
        texts: Sequence of catalog texts to price.
        tokenizer: Tokenizer handed to ``ProductDataset_Old``.
        device: Device the batch tensors are moved to.
        batch_size: Batch size of the prediction loop.

    Returns:
        ``np.ndarray`` with one predicted price per input text. Outputs are
        passed through ``np.expm1`` when ``CONFIG['use_log_transform']`` is set.
    """
    print("\nüîÆ Predicting with OLD MODEL (no clustering)...")

    model.eval()
    predictions = []

    dataset = ProductDataset_Old(texts, tokenizer, CONFIG['max_length'])
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Predicting'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask)
            # A trailing batch of size 1 can come back 0-d when the model ends
            # with a bare .squeeze(); promote to 1-D so extend() can iterate it.
            pred_np = np.atleast_1d(outputs.cpu().numpy())

            # Convert from log if needed
            if CONFIG['use_log_transform']:
                pred_np = np.expm1(pred_np)

            predictions.extend(pred_np)

    return np.array(predictions)

def predict_with_new_model(model, texts, tokenizer, kmeans, device, batch_size=16):
    """
    Predict prices using new model (with clustering).

    Args:
        model: Called as ``model(input_ids, attention_mask, cluster_ids)``.
        texts: Sequence of catalog texts to price.
        tokenizer: Tokenizer used for embedding extraction and the dataset.
        kmeans: Fitted clustering model; its ``predict`` assigns a cluster id
            to each embedding.
        device: Device the batch tensors are moved to.
        batch_size: Batch size of the price-prediction loop.

    Returns:
        ``np.ndarray`` with one predicted price per input text. Outputs are
        passed through ``np.expm1`` when ``CONFIG['use_log_transform']`` is set.
    """
    print("\nüîÆ Predicting with NEW MODEL (with clustering)...")

    # Step 1: Extract embeddings
    embeddings = extract_bert_embeddings(
        texts=texts,
        model_name=CONFIG['bert_model'],
        tokenizer=tokenizer,
        device=device,
        batch_size=32
    )

    # Step 2: Predict clusters
    print("üéØ Assigning clusters...")
    cluster_ids = kmeans.predict(embeddings)
    print(f"‚úÖ Clusters assigned")

    print(f"\nCluster distribution in test set:")
    unique, counts = np.unique(cluster_ids, return_counts=True)
    for cluster_id, count in zip(unique, counts):
        print(f"   Cluster {cluster_id:2d}: {count:5d} samples ({count/len(cluster_ids)*100:.1f}%)")

    # Step 3: Predict prices
    model.eval()
    predictions = []

    dataset = ProductDataset_WithClusters(texts, cluster_ids, tokenizer, CONFIG['max_length'])
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Predicting prices'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            cluster_batch = batch['cluster_id'].to(device)

            outputs = model(input_ids, attention_mask, cluster_batch)
            # A trailing batch of size 1 can come back 0-d when the model ends
            # with a bare .squeeze(); promote to 1-D so extend() can iterate it.
            pred_np = np.atleast_1d(outputs.cpu().numpy())

            # Convert from log if needed
            if CONFIG['use_log_transform']:
                pred_np = np.expm1(pred_np)

            predictions.extend(pred_np)

    return np.array(predictions)

# ==================== MAIN FUNCTION ====================

def _clean_catalog_text(text):
    """Normalize one catalog entry the same way the training text was cleaned."""
    import re
    import string

    if pd.isnull(text):
        return ""
    text = str(text).lower()
    # Strip ASCII punctuation first ...
    text = text.translate(str.maketrans("", "", string.punctuation))
    # ... then whatever non-word, non-space characters remain (e.g. emojis).
    return re.sub(r"[^\w\s]", "", text)


def create_test_predictions(
    test_csv_path='/kaggle/input/dataset/student_resource/dataset/test.csv',
    old_model_path='best_bert_model_log.pt',
    new_model_path='best_bert_cluster_model.pt',
    kmeans_path='kmeans_model.pkl',
    use_clustering=True  # Set to False to use old model
):
    """
    Create predictions for Kaggle test set

    Args:
        test_csv_path: Path to test.csv
        old_model_path: Path to old model checkpoint
        new_model_path: Path to new model with clustering
        kmeans_path: Path to KMeans model
        use_clustering: If True, use clustering model; if False, use old model

    Returns:
        The submission DataFrame ('sample_id', 'price'), also written to
        'submission_clustering.csv' or 'submission_no_clustering.csv'.
        Returns None when a model/clustering file is missing or the number of
        predictions does not match the number of test rows.

    Raises:
        ValueError: If the test CSV lacks 'sample_id' or 'catalog_content'.
    """

    print("="*80)
    print("üöÄ CREATING KAGGLE TEST SET PREDICTIONS")
    print("="*80)

    # Load test data
    print(f"\nüìÇ Loading test data from: {test_csv_path}")
    test_df = pd.read_csv(test_csv_path)
    print(f"‚úÖ Loaded {len(test_df)} test samples")
    print(f"   Columns: {test_df.columns.tolist()}")

    # Verify required columns
    if 'sample_id' not in test_df.columns or 'catalog_content' not in test_df.columns:
        raise ValueError("Test CSV must have 'sample_id' and 'catalog_content' columns")

    # Load tokenizer
    print(f"\nüîß Loading tokenizer: {CONFIG['bert_model']}")
    tokenizer = AutoTokenizer.from_pretrained(CONFIG['bert_model'])
    print("‚úÖ Tokenizer loaded")

    # Extract texts. The training frame had 'catalog_content' lower-cased and
    # stripped of punctuation before the model saw it, so the test text gets
    # the identical normalization to avoid a train/inference mismatch.
    texts = [_clean_catalog_text(text) for text in test_df['catalog_content'].tolist()]

    # ==================== PREDICTION ====================

    if use_clustering:
        print("\n" + "="*80)
        print("üìä USING NEW MODEL WITH CLUSTERING")
        print("="*80)

        # Load clustering model
        print(f"\nüì¶ Loading K-Means from: {kmeans_path}")
        try:
            # SECURITY: pickle.load executes arbitrary code from the file; only
            # load a kmeans file produced by this notebook or a trusted source.
            with open(kmeans_path, 'rb') as f:
                kmeans = pickle.load(f)
            print(f"‚úÖ Loaded clustering model ({CONFIG['n_clusters']} clusters)")
        except FileNotFoundError:
            print(f"‚ùå ERROR: Clustering model not found at {kmeans_path}")
            print("   Please train the clustering model first!")
            return None

        # Load new model
        print(f"\nüèóÔ∏è  Loading new model from: {new_model_path}")
        try:
            new_model = BERTPricePredictor_WithClusters(
                CONFIG['bert_model'],
                n_clusters=CONFIG['n_clusters'],
                cluster_embed_dim=CONFIG['cluster_embed_dim'],
                dropout=CONFIG['dropout']
            )
            # map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
            checkpoint = torch.load(new_model_path, map_location=device, weights_only=False)
            new_model.load_state_dict(checkpoint['model_state_dict'])
            new_model.to(device)
            print(f"‚úÖ Model loaded successfully")
            # The checkpoint stores the validation SMAPE under 'val_smape'.
            print(f"   Validation SMAPE: {checkpoint.get('val_smape', 'N/A')}")
        except FileNotFoundError:
            print(f"‚ùå ERROR: Model not found at {new_model_path}")
            print("   Please train the clustering model first!")
            return None

        # Predict
        predictions = predict_with_new_model(
            model=new_model,
            texts=texts,
            tokenizer=tokenizer,
            kmeans=kmeans,
            device=device,
            batch_size=CONFIG['batch_size']
        )

        submission_filename = 'submission_clustering.csv'

    else:
        print("\n" + "="*80)
        print("üìä USING OLD MODEL (NO CLUSTERING)")
        print("="*80)

        # Load old model
        print(f"\nüèóÔ∏è  Loading old model from: {old_model_path}")
        try:
            old_model = BERTPricePredictor_Old(CONFIG['bert_model'], CONFIG['dropout'])
            # map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
            checkpoint = torch.load(old_model_path, map_location=device, weights_only=False)
            old_model.load_state_dict(checkpoint['model_state_dict'])
            old_model.to(device)
            print(f"‚úÖ Model loaded successfully")
            # The checkpoint stores the validation SMAPE under 'val_smape'.
            print(f"   Validation SMAPE: {checkpoint.get('val_smape', 'N/A')}")
        except FileNotFoundError:
            print(f"‚ùå ERROR: Model not found at {old_model_path}")
            return None

        # Predict
        predictions = predict_with_old_model(
            model=old_model,
            texts=texts,
            tokenizer=tokenizer,
            device=device,
            batch_size=CONFIG['batch_size']
        )

        submission_filename = 'submission_no_clustering.csv'

    # ==================== POST-PROCESSING ====================

    print("\nüìä Prediction statistics:")
    print(f"   Min price:    ${predictions.min():.2f}")
    print(f"   Max price:    ${predictions.max():.2f}")
    print(f"   Mean price:   ${predictions.mean():.2f}")
    print(f"   Median price: ${np.median(predictions):.2f}")
    print(f"   Std dev:      ${predictions.std():.2f}")

    # Ensure non-negative prices (competition requirement)
    negative_count = (predictions < 0).sum()
    if negative_count > 0:
        print(f"\n‚ö†Ô∏è  Found {negative_count} negative predictions, clipping to 0")
        predictions = np.maximum(predictions, 0)

    # Check for NaN or inf
    invalid_mask = ~np.isfinite(predictions)
    invalid_count = invalid_mask.sum()
    if invalid_count > 0:
        print(f"‚ö†Ô∏è  Found {invalid_count} invalid predictions (NaN/inf), replacing with median")
        median_price = np.nanmedian(predictions[~invalid_mask])
        predictions[invalid_mask] = median_price

    # ==================== CREATE SUBMISSION ====================

    print("\nüìù Creating submission file...")

    # Validate the row count BEFORE building the frame: pandas raises on a
    # length mismatch, so a check placed after construction could never fire.
    if len(predictions) != len(test_df):
        print("‚ùå ERROR: Submission has wrong number of rows!")
        return None

    submission = pd.DataFrame({
        'sample_id': test_df['sample_id'],
        'price': predictions
    })

    # Verify submission format
    print(f"\n‚úÖ Submission shape: {submission.shape}")
    print(f"   Expected: ({len(test_df)}, 2)")

    # Save submission
    submission.to_csv(submission_filename, index=False)

    print("\n" + "="*80)
    print("üéâ SUBMISSION FILE CREATED!")
    print("="*80)
    print(f"üìÅ File: {submission_filename}")
    print(f"üìä Rows: {len(submission)}")
    print(f"üíµ Price range: ${predictions.min():.2f} - ${predictions.max():.2f}")
    print("\nüöÄ Ready to submit to Kaggle!")
    print("="*80)

    # Show sample predictions
    print("\nüìã Sample predictions:")
    print(submission.head(10))

    return submission

# ==================== USAGE ====================

if __name__ == "__main__":
    # USAGE INSTRUCTIONS:
    #
    # 1. To use CLUSTERING model (recommended if it improved your SMAPE):
    #    submission = create_test_predictions(
    #        use_clustering=True
    #    )
    #
    # 2. To use OLD model (no clustering):
    #    submission = create_test_predictions(
    #        use_clustering=False
    #    )
    #
    # 3. Custom paths:
    #    submission = create_test_predictions(
    #        test_csv_path='/kaggle/input/dataset/student_resource/dataset/test.csv',
    #        old_model_path='best_bert_model_log.pt',
    #        new_model_path='best_bert_cluster_model.pt',
    #        kmeans_path='kmeans_model.pkl',
    #        use_clustering=True
    #    )

    # ==================== OPTION 1: USE CLUSTERING MODEL ====================
    print("\nüéØ OPTION 1: Creating predictions with CLUSTERING model")
    print("="*80)

    submission_clustering = create_test_predictions(
        test_csv_path='/kaggle/input/dataset/student_resource/dataset/test.csv',
        old_model_path='best_bert_model_log.pt',
        new_model_path='best_bert_cluster_model.pt',
        kmeans_path='kmeans_model.pkl',
        use_clustering=True  # Use clustering model
    )

    # ==================== OPTION 2: USE OLD MODEL (BACKUP) ====================
    # Uncomment below to also create predictions with old model for comparison

    # print("\n\nüéØ OPTION 2: Creating predictions with OLD model (no clustering)")
    # print("="*80)
    #
    # submission_old = create_test_predictions(
    #     test_csv_path='/kaggle/input/dataset/student_resource/dataset/test.csv',
    #     old_model_path='best_bert_model_log.pt',
    #     new_model_path='best_bert_cluster_model.pt',
    #     kmeans_path='kmeans_model.pkl',
    #     use_clustering=False  # Use old model
    # )

    # create_test_predictions returns None when a model or clustering file is
    # missing, so only announce success when a submission was actually produced.
    if submission_clustering is not None:
        print("\n‚úÖ ALL DONE!")
        print("üì§ Upload submission_clustering.csv to Kaggle!")
    else:
        print("\nNo submission file was created; see the error messages above.")

In [None]:
# This will create: submission_clustering.csv
# (same test CSV and use_clustering=True as the call in the cell above).
submission = create_test_predictions(
    use_clustering=True,  # Set to False for old model
    test_csv_path='/kaggle/input/dataset/student_resource/dataset/test.csv',
)