# Amazon ML Challenge 2025 - Price Prediction

## Objective
Predict product prices using catalog content with BERT-based deep learning models.

## Approach
1. Data preprocessing and cleaning
2. Outlier removal using IQR method
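3. BERT-based regression model (DistilBERT encoder with an MLP regression head)
4. K-Means clustering for enhanced predictions (planned: `num_clusters` and `cluster_dim` are configured below, but no clustering step is implemented in this notebook)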
5. Model evaluation using SMAPE metric

## 1. Import Libraries and Load Data

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import re
import string
import warnings
# Silence library warnings for the remainder of the notebook session.
warnings.filterwarnings('ignore')

# Read the competition training split into memory.
df_train = pd.read_csv(
    '/kaggle/input/dataset/student_resource/dataset/train.csv'
)

# Structural overview: dimensions, column names, then a preview of the rows.
print(f"Training data shape: {df_train.shape}")
print(f"\nColumns: {df_train.columns.tolist()}")
print(f"\nFirst few rows:")
df_train.head()

## 2. Data Preprocessing

In [None]:
def preprocess_text(text_input):
    """
    Normalize a raw catalog string.

    Missing values become an empty string. Any other value is converted
    to ``str``, lowercased, stripped of every ASCII punctuation character,
    and finally stripped of any remaining character that is neither a
    word character nor whitespace (symbols, emojis, ...).

    Args:
        text_input: Raw text value; may be null/NaN.

    Returns:
        The cleaned text string ("" for missing input).
    """
    if pd.isnull(text_input):
        return ""

    lowered = str(text_input).lower()

    # Pass 1: drop ASCII punctuation. This also removes underscores, which
    # the regex below would otherwise keep because "_" matches \w.
    punctuation_table = str.maketrans("", "", string.punctuation)
    without_punctuation = lowered.translate(punctuation_table)

    # Pass 2: drop whatever is left that is not a word char or whitespace.
    return re.sub(r"[^\w\s]", "", without_punctuation)

# Work on a copy so the raw training frame stays untouched.
df_processed = df_train.copy()

# Replace the catalog text with its normalized form.
print("Preprocessing catalog content...")
cleaned_catalog = df_processed['catalog_content'].apply(preprocess_text)
df_processed['catalog_content'] = cleaned_catalog

# Store log(1 + price) alongside the raw price.
log_prices = np.log1p(df_processed['price'])
df_processed['price_log'] = log_prices

print("\nPreprocessing complete!")
print(f"Sample processed text: {df_processed['catalog_content'].iloc[0][:100]}...")

## 2.1. Text Length Analysis

Analyze the word count distribution in catalog content for both train and test datasets.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def calculate_word_count(text):
    """
    Count whitespace-separated tokens in a text value.

    Args:
        text: Input text; null/NaN and the empty string count as zero
            words. Other values are converted with ``str`` before
            splitting.

    Returns:
        Integer count of words.
    """
    is_blank = pd.isnull(text) or text == ""
    return 0 if is_blank else len(str(text).split())

# Calculate word counts for training data
print("Calculating word counts for training data...")
df_processed['word_count'] = df_processed['catalog_content'].apply(calculate_word_count)

# Alias the column once instead of re-indexing the frame on every line.
train_word_counts = df_processed['word_count']

# Display statistics
print("\n" + "="*80)
print("TRAINING DATA - WORD COUNT STATISTICS")
print("="*80)
print(f"\nTotal samples: {len(df_processed)}")
print(f"\nWord Count Statistics:")
print(train_word_counts.describe())

# NOTE: the emoji below were previously mojibake (UTF-8 read as Mac Roman).
print(f"\n📊 Additional Metrics:")
print(f"   Minimum words: {train_word_counts.min()}")
print(f"   Maximum words: {train_word_counts.max()}")
print(f"   Mean words: {train_word_counts.mean():.2f}")
print(f"   Median words: {train_word_counts.median():.0f}")
print(f"   Std deviation: {train_word_counts.std():.2f}")

# Percentiles
print(f"\n📈 Percentiles:")
for percentile in [25, 50, 75, 90, 95, 99]:
    value = train_word_counts.quantile(percentile/100)
    print(f"   {percentile}th percentile: {value:.0f} words")

# Check for empty or very short texts
print(f"\n⚠️  Quality Checks:")
print(f"   Empty texts (0 words): {(train_word_counts == 0).sum()}")
print(f"   Very short texts (< 5 words): {(train_word_counts < 5).sum()}")
print(f"   Short texts (< 10 words): {(train_word_counts < 10).sum()}")
print(f"   Long texts (> 500 words): {(train_word_counts > 500).sum()}")

print("\n" + "="*80)

In [None]:
# Visualize word count distribution (2x2 grid: histogram, box plot,
# cumulative distribution, word count vs price).
word_counts = df_processed['word_count']
wc_mean = word_counts.mean()
wc_median = word_counts.median()

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Histogram with mean / median reference lines
axes[0, 0].hist(word_counts, bins=50, color='skyblue', edgecolor='black', alpha=0.7)
axes[0, 0].axvline(wc_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {wc_mean:.1f}')
axes[0, 0].axvline(wc_median, color='green', linestyle='--', linewidth=2, label=f'Median: {wc_median:.1f}')
axes[0, 0].set_xlabel('Word Count', fontsize=12)
axes[0, 0].set_ylabel('Frequency', fontsize=12)
axes[0, 0].set_title('Training Data: Word Count Distribution', fontsize=14, fontweight='bold')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. Box plot
axes[0, 1].boxplot(word_counts, vert=True, patch_artist=True,
                   boxprops=dict(facecolor='lightblue', alpha=0.7),
                   medianprops=dict(color='red', linewidth=2))
axes[0, 1].set_ylabel('Word Count', fontsize=12)
axes[0, 1].set_title('Training Data: Word Count Box Plot', fontsize=14, fontweight='bold')
axes[0, 1].grid(True, alpha=0.3, axis='y')

# 3. Cumulative distribution (empirical CDF expressed in percent)
sorted_counts = np.sort(word_counts)
cumulative = np.arange(1, len(sorted_counts) + 1) / len(sorted_counts) * 100
axes[1, 0].plot(sorted_counts, cumulative, linewidth=2, color='purple')
axes[1, 0].set_xlabel('Word Count', fontsize=12)
axes[1, 0].set_ylabel('Cumulative Percentage (%)', fontsize=12)
axes[1, 0].set_title('Training Data: Cumulative Distribution', fontsize=14, fontweight='bold')
axes[1, 0].grid(True, alpha=0.3)
axes[1, 0].axhline(50, color='red', linestyle='--', alpha=0.5, label='50%')
axes[1, 0].axhline(95, color='orange', linestyle='--', alpha=0.5, label='95%')
axes[1, 0].legend()

# 4. Word count vs Price scatter; cap at 10,000 points (fixed seed) so the
#    plot stays readable and reproducible on large frames.
if len(df_processed) > 10000:
    sample_df = df_processed.sample(n=10000, random_state=42)
else:
    sample_df = df_processed

axes[1, 1].scatter(sample_df['word_count'], sample_df['price'], alpha=0.3, s=10, color='coral')
axes[1, 1].set_xlabel('Word Count', fontsize=12)
axes[1, 1].set_ylabel('Price', fontsize=12)
axes[1, 1].set_title('Training Data: Word Count vs Price', fontsize=14, fontweight='bold')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('train_word_count_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# NOTE: the check mark was previously mojibake (UTF-8 read as Mac Roman).
print("\n✅ Visualization saved as 'train_word_count_analysis.png'")

In [None]:
# Load and analyze TEST data
print("\n" + "="*80)
print("LOADING AND ANALYZING TEST DATA")
print("="*80)

# Only the file read is guarded; the analysis runs in the `else` branch so
# unrelated errors are not mistaken for a missing file.
# NOTE: emoji in the messages were previously mojibake (UTF-8 read as
# Mac Roman) and have been restored.
try:
    # Load test dataset
    df_test_raw = pd.read_csv('/kaggle/input/dataset/student_resource/dataset/test.csv')
except FileNotFoundError:
    print("\n⚠️  Test data file not found. Skipping test data analysis.")
    print("   This is normal if you're only working with training data.")
    # Sentinel checked by the train-vs-test comparison cell.
    df_test_processed = None
else:
    print(f"\n✅ Test data loaded successfully!")
    print(f"Test data shape: {df_test_raw.shape}")
    print(f"Columns: {df_test_raw.columns.tolist()}")

    # Preprocess test data (cleaned text goes into a separate column)
    print("\nPreprocessing test data...")
    df_test_processed = df_test_raw.copy()
    df_test_processed['catalog_content_clean'] = df_test_processed['catalog_content'].apply(preprocess_text)

    # Calculate word counts for test data
    print("Calculating word counts for test data...")
    df_test_processed['word_count'] = df_test_processed['catalog_content_clean'].apply(calculate_word_count)
    test_word_counts = df_test_processed['word_count']

    # Display statistics
    print("\n" + "="*80)
    print("TEST DATA - WORD COUNT STATISTICS")
    print("="*80)
    print(f"\nTotal samples: {len(df_test_processed)}")
    print(f"\nWord Count Statistics:")
    print(test_word_counts.describe())

    print(f"\n📊 Additional Metrics:")
    print(f"   Minimum words: {test_word_counts.min()}")
    print(f"   Maximum words: {test_word_counts.max()}")
    print(f"   Mean words: {test_word_counts.mean():.2f}")
    print(f"   Median words: {test_word_counts.median():.0f}")
    print(f"   Std deviation: {test_word_counts.std():.2f}")

    # Percentiles
    print(f"\n📈 Percentiles:")
    for percentile in [25, 50, 75, 90, 95, 99]:
        value = test_word_counts.quantile(percentile/100)
        print(f"   {percentile}th percentile: {value:.0f} words")

    # Check for empty or very short texts
    print(f"\n⚠️  Quality Checks:")
    print(f"   Empty texts (0 words): {(test_word_counts == 0).sum()}")
    print(f"   Very short texts (< 5 words): {(test_word_counts < 5).sum()}")
    print(f"   Short texts (< 10 words): {(test_word_counts < 10).sum()}")
    print(f"   Long texts (> 500 words): {(test_word_counts > 500).sum()}")

    print("\n" + "="*80)

In [None]:
# Compare train and test distributions. Skipped when the test-analysis cell
# set df_test_processed to None.
# NOTE: emoji in the messages were previously mojibake (UTF-8 read as
# Mac Roman) and have been restored.
if df_test_processed is not None:
    print("\n" + "="*80)
    print("TRAIN vs TEST COMPARISON")
    print("="*80)

    # Alias both series once instead of re-indexing the frames repeatedly.
    train_wc = df_processed['word_count']
    test_wc = df_test_processed['word_count']

    # Create comparison visualization
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # 1. Overlapping histograms with dashed lines at each mean
    axes[0, 0].hist(train_wc, bins=50, alpha=0.6, label='Train', color='blue', edgecolor='black')
    axes[0, 0].hist(test_wc, bins=50, alpha=0.6, label='Test', color='orange', edgecolor='black')
    axes[0, 0].axvline(train_wc.mean(), color='blue', linestyle='--', linewidth=2, alpha=0.8)
    axes[0, 0].axvline(test_wc.mean(), color='orange', linestyle='--', linewidth=2, alpha=0.8)
    axes[0, 0].set_xlabel('Word Count', fontsize=12)
    axes[0, 0].set_ylabel('Frequency', fontsize=12)
    axes[0, 0].set_title('Train vs Test: Word Count Distribution', fontsize=14, fontweight='bold')
    axes[0, 0].legend(fontsize=11)
    axes[0, 0].grid(True, alpha=0.3)

    # 2. Box plot comparison
    box_data = [train_wc, test_wc]
    bp = axes[0, 1].boxplot(box_data, labels=['Train', 'Test'], patch_artist=True,
                            boxprops=dict(facecolor='lightblue', alpha=0.7),
                            medianprops=dict(color='red', linewidth=2))
    bp['boxes'][0].set_facecolor('lightblue')
    bp['boxes'][1].set_facecolor('lightcoral')
    axes[0, 1].set_ylabel('Word Count', fontsize=12)
    axes[0, 1].set_title('Train vs Test: Box Plot Comparison', fontsize=14, fontweight='bold')
    axes[0, 1].grid(True, alpha=0.3, axis='y')

    # 3. Cumulative distribution comparison (empirical CDFs in percent)
    train_sorted = np.sort(train_wc)
    train_cumulative = np.arange(1, len(train_sorted) + 1) / len(train_sorted) * 100
    test_sorted = np.sort(test_wc)
    test_cumulative = np.arange(1, len(test_sorted) + 1) / len(test_sorted) * 100

    axes[1, 0].plot(train_sorted, train_cumulative, linewidth=2, color='blue', label='Train', alpha=0.8)
    axes[1, 0].plot(test_sorted, test_cumulative, linewidth=2, color='orange', label='Test', alpha=0.8)
    axes[1, 0].set_xlabel('Word Count', fontsize=12)
    axes[1, 0].set_ylabel('Cumulative Percentage (%)', fontsize=12)
    axes[1, 0].set_title('Train vs Test: Cumulative Distribution', fontsize=14, fontweight='bold')
    axes[1, 0].legend(fontsize=11)
    axes[1, 0].grid(True, alpha=0.3)

    # 4. Statistical comparison table (same seven statistics per split)
    axes[1, 1].axis('off')
    comparison_stats = pd.DataFrame(
        {
            split_name: [
                series.min(),
                series.max(),
                series.mean(),
                series.median(),
                series.std(),
                series.quantile(0.25),
                series.quantile(0.75),
            ]
            for split_name, series in (('Train', train_wc), ('Test', test_wc))
        },
        index=['Min', 'Max', 'Mean', 'Median', 'Std Dev', 'Q1 (25%)', 'Q3 (75%)'],
    ).round(2)

    table = axes[1, 1].table(cellText=comparison_stats.values,
                             rowLabels=comparison_stats.index,
                             colLabels=comparison_stats.columns,
                             cellLoc='center',
                             loc='center',
                             bbox=[0, 0, 1, 1])
    table.auto_set_font_size(False)
    table.set_fontsize(11)
    table.scale(1, 2)

    # Style header (row 0 holds the column labels)
    for col_idx in range(len(comparison_stats.columns)):
        table[(0, col_idx)].set_facecolor('#4CAF50')
        table[(0, col_idx)].set_text_props(weight='bold', color='white')

    # Style row labels (column -1 holds the row labels)
    for row_idx in range(len(comparison_stats.index)):
        table[(row_idx+1, -1)].set_facecolor('#E3F2FD')
        table[(row_idx+1, -1)].set_text_props(weight='bold')

    axes[1, 1].set_title('Statistical Comparison', fontsize=14, fontweight='bold', pad=20)

    plt.tight_layout()
    plt.savefig('train_vs_test_word_count_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\n✅ Comparison visualization saved as 'train_vs_test_word_count_comparison.png'")

    # Print comparison summary
    print("\n📊 KEY FINDINGS:")
    print(f"   Train mean: {train_wc.mean():.2f} words")
    print(f"   Test mean: {test_wc.mean():.2f} words")
    print(f"   Difference: {abs(train_wc.mean() - test_wc.mean()):.2f} words")
    print(f"   ")
    print(f"   Train median: {train_wc.median():.0f} words")
    print(f"   Test median: {test_wc.median():.0f} words")
    print(f"   Difference: {abs(train_wc.median() - test_wc.median()):.0f} words")

    # Check distribution similarity with a two-sample Kolmogorov-Smirnov test
    from scipy import stats
    ks_stat, ks_pvalue = stats.ks_2samp(train_wc, test_wc)
    print(f"\n📈 Kolmogorov-Smirnov Test:")
    print(f"   KS Statistic: {ks_stat:.4f}")
    print(f"   P-value: {ks_pvalue:.4f}")
    if ks_pvalue > 0.05:
        print(f"   ✅ Distributions are similar (p > 0.05)")
    else:
        print(f"   ⚠️  Distributions are significantly different (p < 0.05)")

    print("\n" + "="*80)
else:
    print("\n⚠️  Skipping train vs test comparison (test data not available)")

## 3. Outlier Detection and Removal

Using Interquartile Range (IQR) method to remove extreme price outliers.

In [None]:
def remove_outliers_iqr(dataframe, column_name, multiplier=1.5):
    """
    Drop rows whose value lies outside the IQR fences of one column.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``;
    values equal to a fence are kept. Rows with NaN in the column fail
    both comparisons and are therefore dropped.

    Args:
        dataframe: Input pandas DataFrame
        column_name: Column to check for outliers
        multiplier: IQR multiplier (default: 1.5)

    Returns:
        Tuple ``(filtered_dataframe, lower_bound, upper_bound)``. The
        filtered frame keeps the original index labels.
    """
    values = dataframe[column_name]

    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    fence_width = multiplier * (third_quartile - first_quartile)

    lower_bound = first_quartile - fence_width
    upper_bound = third_quartile + fence_width

    # Inclusive on both ends, matching ">= lower" and "<= upper".
    within_fences = values.between(lower_bound, upper_bound)

    return dataframe[within_fences], lower_bound, upper_bound

# Filter the processed frame on price using the default 1.5 * IQR fences.
original_size = len(df_processed)
df_clean, lower, upper = remove_outliers_iqr(df_processed, 'price')

# Report how many rows were dropped and which price range survived.
print(f"Original dataset size: {original_size}")
print(f"After outlier removal: {len(df_clean)}")
print(f"Removed: {original_size - len(df_clean)} samples ({(original_size - len(df_clean))/original_size*100:.2f}%)")
print(f"\nPrice boundaries: ${lower:.2f} - ${upper:.2f}")

print(f"\nPrice statistics after cleaning:")
print(df_clean['price'].describe())

## 4. Install Required Deep Learning Libraries

In [None]:
!pip install -q transformers==4.41.2 torch scikit-learn tqdm

## 5. Model Configuration and Utilities

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tqdm import tqdm

# Device configuration: use the GPU when PyTorch can see one, else the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {DEVICE}")

# Model hyperparameters
# (A stray "ma" token before 'num_epochs' made this dict a syntax error;
# it has been removed.)
HYPERPARAMS = {
    'model_name': 'distilbert-base-uncased',  # checkpoint for tokenizer and encoder
    'max_seq_length': 256,                    # tokenizer pad/truncate length
    'batch_size': 16,
    'num_epochs': 2,
    'learning_rate': 2e-5,
    'dropout_rate': 0.3,                      # dropout in the regression head
    'validation_split': 0.2,                  # fraction held out for validation
    'random_seed': 42,
    'apply_log_transform': True,              # train on log1p(price)
    'num_clusters': 20,                       # NOTE(review): not referenced in the visible code
    'cluster_dim': 64                         # NOTE(review): not referenced in the visible code
}

print("\nConfiguration loaded successfully!")
for key, value in HYPERPARAMS.items():
    print(f"  {key}: {value}")

## 6. Evaluation Metrics

In [None]:
def calculate_mape(y_actual, y_predicted):
    """
    Mean Absolute Percentage Error, in percent.

    Samples whose actual value is exactly zero are excluded, since their
    relative error is undefined.
    """
    actual = np.array(y_actual)
    predicted = np.array(y_predicted)

    nonzero = actual != 0
    relative_errors = np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])

    return np.mean(relative_errors) * 100

def calculate_smape(y_actual, y_predicted):
    """
    Symmetric Mean Absolute Percentage Error, in percent (competition metric).

    Each sample contributes ``|pred - actual| / ((|actual| + |pred|) / 2)``.
    Samples where both values are zero are skipped; if no sample remains,
    0.0 is returned.
    """
    actual = np.array(y_actual)
    predicted = np.array(y_predicted)

    half_sum = (np.abs(actual) + np.abs(predicted)) / 2.0

    # A zero denominator only occurs when actual and predicted are both 0.
    usable = half_sum > 0
    if not np.any(usable):
        return 0.0

    ratios = np.abs(predicted[usable] - actual[usable]) / half_sum[usable]
    return np.mean(ratios) * 100

def compute_all_metrics(y_true, y_pred):
    """
    Evaluate predictions with every metric used in this notebook.

    Returns:
        Dict with keys 'RMSE', 'MAE', 'R2', 'MAPE' and 'SMAPE'.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mape = calculate_mape(y_true, y_pred)
    smape = calculate_smape(y_true, y_pred)

    return {
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'MAPE': mape,
        'SMAPE': smape,
    }

print("Evaluation metrics defined successfully!")

## 7. Custom Dataset Class

In [None]:
class PriceDataset(Dataset):
    """
    Map-style PyTorch dataset pairing a text with a price target.

    Each item is tokenized on access, padded/truncated to a fixed length.
    """

    def __init__(self, text_data, price_data, tokenizer_model, max_len):
        # Positionally indexable containers of equal length.
        self.texts = text_data
        self.prices = price_data
        # Callable tokenizer and the fixed sequence length it pads to.
        self.tokenizer = tokenizer_model
        self.max_length = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        # str() guards against non-string entries in the text container.
        encoding = self.tokenizer(
            str(self.texts[index]),
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        target = torch.tensor(self.prices[index], dtype=torch.float)

        # The tokenizer output carries a leading batch axis; flatten it away.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'price': target,
        }
        return item

print("Dataset class defined successfully!")

## 8. BERT-Based Price Prediction Model

In [None]:
class TransformerPriceRegressor(nn.Module):
    """
    Transformer encoder with an MLP regression head for price prediction.

    The embedding at sequence position 0 of the encoder's last hidden
    state is fed through a stack of Linear/ReLU/Dropout layers that ends
    in a single output unit.
    """

    def __init__(self, transformer_model, dropout_prob=0.3):
        """
        Args:
            transformer_model: Name or path passed to
                ``AutoModel.from_pretrained``.
            dropout_prob: Dropout probability used after each hidden
                layer of the regression head.
        """
        super().__init__()

        # Load pre-trained transformer
        self.transformer = AutoModel.from_pretrained(transformer_model)

        # Get hidden dimension
        hidden_dim = self.transformer.config.hidden_size

        # Regression head with multiple layers (matching reference architecture)
        self.price_regressor = nn.Sequential(
            nn.Linear(hidden_dim, 512),
            nn.ReLU(),
            nn.Dropout(dropout_prob),

            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(dropout_prob),

            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(dropout_prob),

            nn.Linear(128, 1)
        )

    def forward(self, input_ids, attention_mask):
        """
        Predict one value per input sequence.

        Returns:
            1-D tensor with one entry per batch element, including when
            the batch holds a single sample.
        """
        # Get transformer outputs
        transformer_out = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Extract CLS token embedding (sequence position 0)
        cls_embedding = transformer_out.last_hidden_state[:, 0, :]

        # Predict price
        price_pred = self.price_regressor(cls_embedding)

        # Squeeze only the trailing unit axis. A bare squeeze() would also
        # drop the batch axis for a batch of one, yielding a 0-d tensor
        # that cannot be iterated/extended and broadcasts against targets.
        return price_pred.squeeze(-1)

print("Model architecture defined successfully!")

## 9. Training Functions

In [None]:
def train_single_epoch(model, data_loader, optimizer, scheduler, device, loss_fn, use_log):
    """
    Train model for one epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        data_loader: Iterable of batches with 'input_ids',
            'attention_mask' and 'price' tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch.
        device: Device the batch tensors are moved to.
        loss_fn: Loss applied to (predictions, targets).
        use_log: If True, predictions and targets are passed through
            ``expm1`` before metrics are computed.

    Returns:
        Tuple ``(average_batch_loss, metrics_dict)``.
    """
    model.train()
    epoch_loss = 0.0
    all_predictions = []
    all_targets = []

    progress = tqdm(data_loader, desc='Training')

    for batch_data in progress:
        # Transfer to device
        ids = batch_data['input_ids'].to(device)
        mask = batch_data['attention_mask'].to(device)
        targets = batch_data['price'].to(device)

        # Forward pass
        optimizer.zero_grad()
        # Flatten to 1-D so a single-sample batch (which the model may
        # return as a 0-d tensor) still lines up with `targets` and can
        # be extended into the prediction list below.
        predictions = model(ids, mask).reshape(-1)

        # Calculate loss
        loss = loss_fn(predictions, targets)

        # Backward pass with gradient-norm clipping
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        # Track metrics
        epoch_loss += loss.item()
        pred_np = predictions.detach().cpu().numpy()
        target_np = targets.cpu().numpy()

        # Inverse log transform if applied
        if use_log:
            pred_np = np.expm1(pred_np)
            target_np = np.expm1(target_np)

        all_predictions.extend(pred_np)
        all_targets.extend(target_np)

        progress.set_postfix({'loss': f'{loss.item():.4f}'})

    # Calculate metrics
    avg_loss = epoch_loss / len(data_loader)
    metrics = compute_all_metrics(all_targets, all_predictions)

    return avg_loss, metrics

def validate_model(model, data_loader, device, loss_fn, use_log):
    """
    Validate model performance.

    Runs the model in eval mode without gradient tracking.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        data_loader: Iterable of batches with 'input_ids',
            'attention_mask' and 'price' tensors.
        device: Device the batch tensors are moved to.
        loss_fn: Loss applied to (predictions, targets).
        use_log: If True, predictions and targets are passed through
            ``expm1`` before metrics are computed.

    Returns:
        Tuple ``(average_batch_loss, metrics_dict, predictions, targets)``
        where the last two are flat lists of values.
    """
    model.eval()
    epoch_loss = 0.0
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for batch_data in tqdm(data_loader, desc='Validating'):
            ids = batch_data['input_ids'].to(device)
            mask = batch_data['attention_mask'].to(device)
            targets = batch_data['price'].to(device)

            # Flatten to 1-D so a single-sample batch (which the model
            # may return as a 0-d tensor) can still be extended below.
            predictions = model(ids, mask).reshape(-1)
            loss = loss_fn(predictions, targets)

            epoch_loss += loss.item()
            pred_np = predictions.cpu().numpy()
            target_np = targets.cpu().numpy()

            # Inverse log transform if applied
            if use_log:
                pred_np = np.expm1(pred_np)
                target_np = np.expm1(target_np)

            all_predictions.extend(pred_np)
            all_targets.extend(target_np)

    avg_loss = epoch_loss / len(data_loader)
    metrics = compute_all_metrics(all_targets, all_predictions)

    return avg_loss, metrics, all_predictions, all_targets

print("Training functions defined successfully!")

## 10. Main Training Pipeline

In [None]:
def train_price_model(dataframe):
    """
    Complete training pipeline for price prediction model.

    Splits the data, builds datasets/loaders, fine-tunes a
    ``TransformerPriceRegressor`` for ``HYPERPARAMS['num_epochs']`` epochs
    and checkpoints the epoch with the lowest validation SMAPE to
    'best_price_model.pt'.

    Args:
        dataframe: DataFrame with 'catalog_content' (text) and 'price'
            columns.

    Returns:
        Tuple ``(model, tokenizer, training_history)``. The returned model
        carries the weights of the best-SMAPE epoch (previously the
        last-epoch weights were returned even when an earlier epoch was
        reported as best). ``training_history`` is a list with one dict
        per epoch.
    """
    print("="*80)
    print("STARTING MODEL TRAINING")
    print("="*80)

    # Prepare data
    X_text = dataframe['catalog_content'].values

    # Apply log transformation to target
    if HYPERPARAMS['apply_log_transform']:
        print("\nApplying log transformation to prices...")
        y_prices = np.log1p(dataframe['price'].values)
    else:
        y_prices = dataframe['price'].values

    # Train-validation split
    X_train, X_val, y_train, y_val = train_test_split(
        X_text, y_prices,
        test_size=HYPERPARAMS['validation_split'],
        random_state=HYPERPARAMS['random_seed']
    )

    print(f"\nTraining samples: {len(X_train)}")
    print(f"Validation samples: {len(X_val)}")

    # Load tokenizer
    print(f"\nLoading tokenizer: {HYPERPARAMS['model_name']}")
    tokenizer = AutoTokenizer.from_pretrained(HYPERPARAMS['model_name'])

    # Create datasets
    train_dataset = PriceDataset(X_train, y_train, tokenizer, HYPERPARAMS['max_seq_length'])
    val_dataset = PriceDataset(X_val, y_val, tokenizer, HYPERPARAMS['max_seq_length'])

    # Create data loaders (only the training loader shuffles)
    train_loader = DataLoader(
        train_dataset,
        batch_size=HYPERPARAMS['batch_size'],
        shuffle=True,
        num_workers=2
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=HYPERPARAMS['batch_size'],
        shuffle=False,
        num_workers=2
    )

    # Initialize model
    print(f"\nInitializing model...")
    model = TransformerPriceRegressor(
        HYPERPARAMS['model_name'],
        HYPERPARAMS['dropout_rate']
    )
    model.to(DEVICE)

    # Setup training
    loss_function = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=HYPERPARAMS['learning_rate'])

    # The scheduler is stepped per batch, so its horizon is batches * epochs.
    total_steps = len(train_loader) * HYPERPARAMS['num_epochs']
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    # Training loop
    print("\n" + "="*80)
    print(f"TRAINING FOR {HYPERPARAMS['num_epochs']} EPOCHS")
    print("="*80)

    best_smape = float('inf')
    best_state = None  # CPU snapshot of the best-epoch weights
    training_history = []

    # NOTE: emoji and "R²" in the messages below were previously mojibake
    # (UTF-8 read as Mac Roman) and have been restored.
    for epoch in range(HYPERPARAMS['num_epochs']):
        print(f"\n{'='*80}")
        print(f"EPOCH {epoch + 1}/{HYPERPARAMS['num_epochs']}")
        print(f"{'='*80}")

        # Train
        train_loss, train_metrics = train_single_epoch(
            model, train_loader, optimizer, scheduler,
            DEVICE, loss_function, HYPERPARAMS['apply_log_transform']
        )

        # Validate
        val_loss, val_metrics, _, _ = validate_model(
            model, val_loader, DEVICE, loss_function,
            HYPERPARAMS['apply_log_transform']
        )

        # Print results
        print(f"\n📊 TRAINING METRICS:")
        print(f"   Loss: {train_loss:.4f} | RMSE: {train_metrics['RMSE']:.4f} | "
              f"MAE: {train_metrics['MAE']:.4f} | R²: {train_metrics['R2']:.4f}")
        print(f"   MAPE: {train_metrics['MAPE']:.2f}% | SMAPE: {train_metrics['SMAPE']:.2f}%")

        print(f"\n📊 VALIDATION METRICS:")
        print(f"   Loss: {val_loss:.4f} | RMSE: {val_metrics['RMSE']:.4f} | "
              f"MAE: {val_metrics['MAE']:.4f} | R²: {val_metrics['R2']:.4f}")
        print(f"   MAPE: {val_metrics['MAPE']:.2f}% | SMAPE: {val_metrics['SMAPE']:.2f}% ⭐")

        # Save best model
        if val_metrics['SMAPE'] < best_smape:
            best_smape = val_metrics['SMAPE']
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_smape': val_metrics['SMAPE'],
                'val_rmse': val_metrics['RMSE'],
                'hyperparameters': HYPERPARAMS
            }, 'best_price_model.pt')
            # Keep a detached CPU copy so the best weights can be restored
            # after the loop without re-reading the checkpoint file.
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
            print(f"\n✅ New best model saved! (SMAPE: {val_metrics['SMAPE']:.2f}%)")

        training_history.append({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'train_smape': train_metrics['SMAPE'],
            'val_loss': val_loss,
            'val_smape': val_metrics['SMAPE']
        })

    # Hand back the best-epoch weights rather than whatever the last epoch
    # produced. best_state stays None if no epoch improved on +inf (e.g.
    # SMAPE was NaN every time); the final weights are kept in that case.
    if best_state is not None:
        model.load_state_dict(best_state)

    print("\n" + "="*80)
    print("🎉 TRAINING COMPLETED!")
    print("="*80)
    print(f"🏆 Best Validation SMAPE: {best_smape:.2f}%")
    print(f"💾 Model saved as: best_price_model.pt")
    print("="*80)

    return model, tokenizer, training_history

print("Training pipeline ready!")

## 11. Execute Training

In [None]:
# Train the model on the outlier-filtered frame
trained_model, model_tokenizer, history = train_price_model(df_clean)

# Display training summary (one line per epoch).
# NOTE: the chart emoji was previously mojibake (UTF-8 read as Mac Roman).
print("\n" + "="*80)
print("📊 TRAINING SUMMARY")
print("="*80)
for record in history:
    print(f"Epoch {record['epoch']}: "
          f"Train SMAPE={record['train_smape']:.2f}%, "
          f"Val SMAPE={record['val_smape']:.2f}%")
print("="*80)

## 12. Inference Function for Test Data

In [None]:
def generate_predictions(model, text_samples, tokenizer, device, use_log=True, batch_size=16):
    """
    Generate price predictions for new text samples.

    Args:
        model: Trained PyTorch model
        text_samples: List of text strings
        tokenizer: Tokenizer instance
        device: Device (CPU/GPU)
        use_log: Whether log transform was used; if True, outputs are
            passed through ``expm1``
        batch_size: Batch size for inference

    Returns:
        numpy array of predicted prices, in input order
    """
    model.eval()
    predictions = []

    # PriceDataset requires targets; zeros act as unused placeholders.
    dummy_prices = np.zeros(len(text_samples))
    dataset = PriceDataset(text_samples, dummy_prices, tokenizer, HYPERPARAMS['max_seq_length'])
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Predicting'):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)

            # Flatten to 1-D: a final batch holding a single sample may
            # come back as a 0-d tensor, which list.extend cannot iterate.
            preds = model(ids, mask).reshape(-1)
            pred_array = preds.cpu().numpy()

            # Inverse log transform
            if use_log:
                pred_array = np.expm1(pred_array)

            predictions.extend(pred_array)

    return np.array(predictions)

print("Inference function ready!")

## 13. Generate Test Predictions

In [None]:
# Load test data
print("Loading test data...")
df_test = pd.read_csv('/kaggle/input/dataset/student_resource/dataset/test.csv')
print(f"Test samples: {len(df_test)}")

# Preprocess test data with the same cleaning applied to the training text
print("\nPreprocessing test data...")
df_test['catalog_content'] = df_test['catalog_content'].apply(preprocess_text)

# Generate predictions
test_predictions = generate_predictions(
    trained_model,
    df_test['catalog_content'].tolist(),
    model_tokenizer,
    DEVICE,
    use_log=HYPERPARAMS['apply_log_transform']
)

# Display statistics (raw model output, before any clipping below).
# NOTE: emoji in this cell were previously mojibake (UTF-8 read as
# Mac Roman) and have been restored.
print("\n📊 Prediction Statistics:")
print(f"   Min: ${test_predictions.min():.2f}")
print(f"   Max: ${test_predictions.max():.2f}")
print(f"   Mean: ${test_predictions.mean():.2f}")
print(f"   Median: ${np.median(test_predictions):.2f}")

# Ensure non-negative prices (expm1 of a negative output is negative)
test_predictions = np.maximum(test_predictions, 0)

# Handle invalid values: replace NaN/inf with the median of the finite ones
invalid_mask = ~np.isfinite(test_predictions)
if np.any(invalid_mask):
    print("\n⚠️ Fixing invalid predictions...")
    median_price = np.nanmedian(test_predictions[~invalid_mask])
    test_predictions[invalid_mask] = median_price

print("\n✅ Predictions generated successfully!")

## 14. Create Submission File

In [None]:
# Create submission dataframe: one predicted price per test sample_id
submission_df = pd.DataFrame({
    'sample_id': df_test['sample_id'],
    'price': test_predictions
})

# Save to CSV (without the DataFrame index column)
submission_filename = 'submission_price_prediction.csv'
submission_df.to_csv(submission_filename, index=False)

# NOTE: emoji in the messages below were previously mojibake (UTF-8 read
# as Mac Roman) and have been restored.
print("="*80)
print("🎉 SUBMISSION FILE CREATED!")
print("="*80)
print(f"📁 Filename: {submission_filename}")
print(f"📊 Total rows: {len(submission_df)}")
print(f"💵 Price range: ${test_predictions.min():.2f} - ${test_predictions.max():.2f}")
print("\n🚀 Ready to submit to Kaggle!")
print("="*80)

# Display sample
print("\n📋 Sample predictions:")
print(submission_df.head(10))

## 15. Model Performance Visualization (Optional)

In [None]:
import matplotlib.pyplot as plt

# Extract per-epoch metrics from the training history
epochs = [h['epoch'] for h in history]
train_smape = [h['train_smape'] for h in history]
val_smape = [h['val_smape'] for h in history]

# Plot SMAPE over epochs
plt.figure(figsize=(10, 6))
plt.plot(epochs, train_smape, marker='o', label='Train SMAPE', linewidth=2)
plt.plot(epochs, val_smape, marker='s', label='Validation SMAPE', linewidth=2)
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('SMAPE (%)', fontsize=12)
plt.title('Model Performance: SMAPE over Epochs', fontsize=14, fontweight='bold')
plt.legend(fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('training_performance.png', dpi=300)
plt.show()

# NOTE: the check mark was previously mojibake (UTF-8 read as Mac Roman).
print("\n✅ Performance plot saved as 'training_performance.png'")