# Assignment 3 - Part 1: Character-Level Siamese Models

This notebook implements the end-to-end pipeline for the Home Depot Search Relevance assignment. It corresponds to the 5 steps outlined in the assignment pipeline.

## Step 1: Data Preprocessing & Tokenization
Handles data loading, cleaning, and character-level tokenization.

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import joblib
import os

# --- Constants based on your statistics ---
# Fixed sequence lengths used when padding/truncating character sequences.
# The coverage figures in the trailing comments are the author's dataset
# statistics and are not recomputed anywhere in this notebook.
MAX_LEN_SEARCH = 64    # Covers Max (60)
MAX_LEN_DESC = 2048    # Covers >95% (1842)
# Seed passed to train_test_split so the train/validation split is repeatable.
RANDOM_SEED = 42

# --- Paths ---
# Directory the input CSVs are read from and the prepared datasets are written to.
DATA_DIR = 'data'
# Directory for report artefacts (e.g. the tokenization examples file).
OUTPUT_DIR = 'outputs'

def load_and_merge(data_dir=None):
    """Load the train/test CSVs and attach product descriptions to each.

    Args:
        data_dir: Directory containing ``train.csv``, ``test.csv`` and
            ``product_descriptions.csv``. Defaults to the module-level
            ``DATA_DIR`` when omitted.

    Returns:
        A ``(train, test)`` tuple of DataFrames. Each frame is left-joined
        with the descriptions on ``product_uid``, has missing values in
        ``search_term`` / ``product_description`` / ``product_title``
        replaced by empty strings, and gains a ``full_desc`` column
        (title + " " + description).

    Raises:
        Whatever ``pandas.read_csv`` raises when a file is missing or
        unreadable; errors are no longer masked by a retry.
    """
    data_dir = DATA_DIR if data_dir is None else data_dir

    print("Loading CSV files...")

    def read(name):
        # 'latin-1' and 'ISO-8859-1' are aliases of the same codec, so the
        # former "fallback" retry could never succeed where this call fails.
        return pd.read_csv(os.path.join(data_dir, name), encoding='ISO-8859-1')

    train = read('train.csv')
    test = read('test.csv')
    desc = read('product_descriptions.csv')

    # Merge descriptions
    print("Merging descriptions...")
    train = pd.merge(train, desc, on='product_uid', how='left')
    test = pd.merge(test, desc, on='product_uid', how='left')

    text_columns = ('search_term', 'product_description', 'product_title')
    for frame in (train, test):
        # Fill NaNs so string concatenation below never yields NaN.
        for column in text_columns:
            frame[column] = frame[column].fillna("")
        # Combined description (Title + Description) for better context.
        frame['full_desc'] = frame['product_title'] + " " + frame['product_description']

    return train, test

def build_char_dict(texts):
    """Map every distinct character found in *texts* to a positive integer id.

    Ids are assigned in sorted character order starting at 1; id 0 is left
    unused so callers can reserve it for padding.
    """
    print("Building character dictionary...")
    alphabet = set()
    for entry in texts:
        alphabet |= set(entry)

    # enumerate(..., start=1) keeps 0 free for the padding id.
    return {symbol: idx for idx, symbol in enumerate(sorted(alphabet), start=1)}

def text_to_sequence(text, char_to_int, max_len):
    """Encode *text* as a list of exactly *max_len* integer ids.

    Characters missing from *char_to_int* become 0. Text longer than
    *max_len* is cut off at the end; shorter text is right-padded with 0.
    """
    clipped = text[:max_len]
    ids = [char_to_int.get(symbol, 0) for symbol in clipped]
    # Post-padding with zeros; a non-positive deficit adds nothing.
    ids.extend([0] * (max_len - len(ids)))
    return ids

def prepare_dl_data(train_df, test_df):
    """Encode search terms and descriptions as character-id arrays and save them.

    Builds one character vocabulary over the train and test text, converts
    ``search_term`` and ``full_desc`` to fixed-length id sequences, writes a
    few tokenization examples to ``OUTPUT_DIR`` for the report, splits the
    training rows 80/20 into train/validation and stores everything in
    ``DATA_DIR/dl_data.npz``.

    Args:
        train_df: Training frame with ``search_term``, ``full_desc`` and
            ``relevance`` columns.
        test_df: Test frame with ``search_term`` and ``full_desc`` columns.

    Returns:
        None. Results are written to disk.
    """
    print("\n--- Preparing Data for Deep Learning (Character Sequences) ---")

    # 1. Build Vocabulary from all text
    all_text = pd.concat([
        train_df['search_term'],
        train_df['full_desc'],
        test_df['search_term'],
        test_df['full_desc']
    ])
    char_to_int = build_char_dict(all_text)
    print(f"Vocabulary Size: {len(char_to_int)} characters")

    # Ids run from 0 (padding) to len(char_to_int). A hard-coded int8 would
    # overflow as soon as the vocabulary exceeds 127 characters, so pick the
    # narrowest signed integer type that can hold the largest id.
    max_id = len(char_to_int)
    seq_dtype = next(
        candidate
        for candidate in (np.int8, np.int16, np.int32, np.int64)
        if max_id <= np.iinfo(candidate).max
    )

    # 2. Convert to Sequences
    print("Converting text to sequences (this may take a moment)...")

    def process_column(series, max_len):
        return np.array(
            [text_to_sequence(t, char_to_int, max_len) for t in series],
            dtype=seq_dtype,
        )

    X_train_search = process_column(train_df['search_term'], MAX_LEN_SEARCH)
    X_train_desc = process_column(train_df['full_desc'], MAX_LEN_DESC)

    # --- Save Tokenization Examples for Report ---
    # The output directory may not exist on a fresh checkout.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    token_file = os.path.join(OUTPUT_DIR, 'tokenization_examples.txt')
    print(f"Saving tokenization examples to '{token_file}'...")
    with open(token_file, 'w', encoding='utf-8') as f:
        f.write("--- Tokenization Examples ---\n")
        f.write(f"Vocabulary Size: {len(char_to_int)}\n")
        f.write("Char to Int Map (First 20): " + str(list(char_to_int.items())[:20]) + "...\n\n")

        # Never index past the end when fewer than five rows are available.
        for i in range(min(5, len(train_df))):
            orig = train_df['search_term'].iloc[i]
            seq = X_train_search[i]
            f.write(f"Example {i+1}:\n")
            f.write(f"  Original: '{orig}'\n")
            f.write(f"  Sequence: {seq.tolist()[:20]} ... (truncated)\n\n")
    # ---------------------------------------------

    X_test_search = process_column(test_df['search_term'], MAX_LEN_SEARCH)
    X_test_desc = process_column(test_df['full_desc'], MAX_LEN_DESC)

    y = train_df['relevance'].values

    # 3. Split Train into Train/Validation
    print("Splitting Training data into Train/Validation (80/20)...")
    # Split row indices (not the arrays) so each search/description pair and
    # its label stay aligned.
    indices = np.arange(len(y))
    train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=RANDOM_SEED)

    dl_data = {
        'X_train_search': X_train_search[train_idx],
        'X_train_desc': X_train_desc[train_idx],
        'y_train': y[train_idx],

        'X_val_search': X_train_search[val_idx],
        'X_val_desc': X_train_desc[val_idx],
        'y_val': y[val_idx],

        'X_test_search': X_test_search,
        'X_test_desc': X_test_desc,

        # Stored as a pickled object array; readers need allow_pickle=True.
        'char_to_int': char_to_int,
        'max_len_search': MAX_LEN_SEARCH,
        'max_len_desc': MAX_LEN_DESC
    }

    np.savez(os.path.join(DATA_DIR, 'dl_data.npz'), **dl_data)
    print("Deep Learning data saved to 'dl_data.npz'")

def prepare_benchmark_data(train_df, test_df):
    """Build character n-gram count features for the baseline and pickle them.

    The search term and full description of each row are joined with a
    tab-separated marker, vectorized with a character-level
    ``CountVectorizer`` (2- to 4-grams) fitted on the training text only,
    split 80/20 with ``RANDOM_SEED``, and dumped together with the fitted
    vectorizer to ``DATA_DIR/benchmark_data.pkl``.
    """
    print("\n--- Preparing Data for Benchmark (CountVectorizer n-gram 2,4) ---")

    # One string per row: "<search term> \t <title + description>".
    separator = " \t "
    corpus_train = train_df['search_term'] + separator + train_df['full_desc']
    corpus_test = test_df['search_term'] + separator + test_df['full_desc']

    print("Fitting CountVectorizer (char, ngram 2-4)...")
    vectorizer_options = {
        'analyzer': 'char',
        'ngram_range': (2, 4),
        'min_df': 5,            # drop very rare n-grams to save memory
        'dtype': np.uint16,
        'max_features': 20000,  # cap the feature count to avoid OOM
    }
    vectorizer = CountVectorizer(**vectorizer_options)

    # Fit on the training text only; the test text is merely transformed.
    counts_train = vectorizer.fit_transform(corpus_train)
    counts_test = vectorizer.transform(corpus_test)

    targets = train_df['relevance'].values

    fit_rows, holdout_rows = train_test_split(
        np.arange(len(targets)), test_size=0.2, random_state=RANDOM_SEED
    )

    payload = {
        'X_train': counts_train[fit_rows],
        'y_train': targets[fit_rows],
        'X_val': counts_train[holdout_rows],
        'y_val': targets[holdout_rows],
        'X_test': counts_test,
        'vectorizer': vectorizer,
    }

    joblib.dump(payload, os.path.join(DATA_DIR, 'benchmark_data.pkl'))
    print("Benchmark data saved to 'benchmark_data.pkl'")

# --- Main Execution ---
# Runs at cell execution time: load and merge the CSVs once, then derive both
# datasets (character sequences and n-gram counts) from the same frames.
train_df, test_df = load_and_merge()

prepare_dl_data(train_df, test_df)
prepare_benchmark_data(train_df, test_df)

print("\nPreprocessing Complete!")

## Step 2: Naive Benchmark Model
Trains a Ridge Regression model on character n-grams (2-4).

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error
import joblib
import time
import os

# --- Constants ---
# CSV that accumulates one row of train/validation metrics per trained model.
RESULTS_FILE = 'results_log.csv'
# Where the fitted Ridge model is pickled.
MODEL_FILE = 'outputs/benchmark_model.pkl'
# Pickled n-gram features loaded by run_benchmark().
DATA_FILE = 'data/benchmark_data.pkl'
# Clipped test-set predictions saved as a NumPy array.
SUBMISSION_FILE = 'outputs/submission_benchmark.npy'

def evaluate(y_true, y_pred, name="Set"):
    """Print and return ``(rmse, mae)`` for predictions clipped to [1.0, 3.0].

    Args:
        y_true: Ground-truth targets.
        y_pred: Raw model predictions; clipped before scoring.
        name: Label used as a prefix in the printed lines.
    """
    bounded = np.clip(y_pred, 1.0, 3.0)
    mae = mean_absolute_error(y_true, bounded)
    rmse = np.sqrt(mean_squared_error(y_true, bounded))
    print(f"  {name} RMSE: {rmse:.4f}")
    print(f"  {name} MAE:  {mae:.4f}")
    return rmse, mae

def log_results(model_name, runtime, train_rmse, val_rmse, train_mae, val_mae):
    """Append one row of train/validation metrics to ``RESULTS_FILE``.

    The file is created with a header row when it does not exist yet. The
    test columns are filled with a placeholder string by this function.
    """
    is_new_file = not os.path.exists(RESULTS_FILE)

    test_placeholder = "N/A (See evaluate.py)"
    row = {
        'Model type': model_name,
        'runtime': f"{runtime:.2f} sec",
        'Train RMSE': train_rmse,
        'Val-RMSE': val_rmse,
        'Test-RMSE': test_placeholder,
        'Train MAE': train_mae,
        'Val-MAE': val_mae,
        'Test-MAE': test_placeholder,
    }

    # Write fresh (with header) on first use, append without header afterwards.
    pd.DataFrame([row]).to_csv(
        RESULTS_FILE,
        mode='w' if is_new_file else 'a',
        header=is_new_file,
        index=False,
    )

    print(f"\nTraining Results saved to {RESULTS_FILE}")

def run_benchmark():
    """Train, evaluate and persist the Ridge baseline on char n-gram counts.

    Loads the pickled features from ``DATA_FILE``, fits ``Ridge(alpha=1.0)``,
    saves the model to ``MODEL_FILE``, prints train/validation RMSE and MAE,
    saves clipped test predictions to ``SUBMISSION_FILE`` and appends the
    metrics to the results log. Returns None; if ``DATA_FILE`` is missing an
    error is printed and nothing else happens.
    """
    print("--- Training Benchmark Model (Ridge Regression) ---")

    # 1. Load Data
    if not os.path.exists(DATA_FILE):
        print(f"Error: {DATA_FILE} not found. Run step1_preprocess.py first.")
        return

    # Create the output directories up front so a missing folder cannot
    # discard a freshly trained model at save time.
    for target in (MODEL_FILE, SUBMISSION_FILE):
        os.makedirs(os.path.dirname(target) or '.', exist_ok=True)

    print("Loading data...")
    data = joblib.load(DATA_FILE)
    X_train = data['X_train']
    y_train = data['y_train']
    X_val = data['X_val']
    y_val = data['y_val']
    X_test = data['X_test']

    # 2. Train Model
    print(f"Training Ridge Regression ({X_train.shape[0]} samples)...")
    start_time = time.time()

    model = Ridge(alpha=1.0)
    model.fit(X_train, y_train)

    # Only the fit itself is timed; loading and evaluation are excluded.
    train_time = time.time() - start_time
    print(f"Training completed in {train_time:.2f} seconds.")

    # Save Model
    joblib.dump(model, MODEL_FILE)
    print(f"Model saved to {MODEL_FILE}")

    # 3. Predict & Evaluate (Train/Val only)
    print("\nEvaluating on Train/Val...")

    # Train
    y_pred_train = model.predict(X_train)
    train_rmse, train_mae = evaluate(y_train, y_pred_train, "Train")

    # Val
    y_pred_val = model.predict(X_val)
    val_rmse, val_mae = evaluate(y_val, y_pred_val, "Val")

    # Test (Prediction Only): no labels are available in this step.
    print("Generating Test predictions...")
    y_pred_test = model.predict(X_test)
    y_pred_test = np.clip(y_pred_test, 1.0, 3.0)

    # Save predictions
    np.save(SUBMISSION_FILE, y_pred_test)
    print(f"Test predictions saved to '{SUBMISSION_FILE}'")

    # 4. Log Results
    log_results(
        "Naive Benchmark (Ridge Char 2-4gram)",
        train_time,
        round(train_rmse, 4),
        round(val_rmse, 4),
        round(train_mae, 4),
        round(val_mae, 4)
    )

# --- Main Execution ---
# Runs the benchmark immediately when this cell is executed.
run_benchmark()

## Step 3: Siamese Char-CNN Training
Trains the Siamese Network with Character-level CNN encoders.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import os

# --- Configuration ---
# Input archive read by load_data().
DATA_FILE = 'data/dl_data.npz'
# Shared CSV log that log_results() appends to.
RESULTS_FILE = 'results_log.csv'
# Checkpoint path; main() reloads the model weights from here.
MODEL_SAVE_PATH = 'outputs/siamese_char_cnn.pt'
# NOTE(review): SUBMISSION_FILE and PLOT_FILE are not referenced in the visible
# code; presumably used by the elided parts of main() - confirm.
SUBMISSION_FILE = 'outputs/submission_siamese_char.npy'
PLOT_FILE = 'outputs/training_history_char_siamese.png'
# Batch size used for the evaluation DataLoaders in main().
BATCH_SIZE = 64 
# NOTE(review): the hyperparameters below are not referenced in the visible
# code; presumably consumed by the elided model construction/training loop.
EPOCHS = 15
EMBEDDING_DIM = 64
HIDDEN_DIM = 256
LEARNING_RATE = 0.001

class CharDataset(Dataset):
    """Dataset of (search ids, description ids[, label]) samples.

    Inputs are NumPy arrays; id arrays are stored as int64 tensors and
    labels, when given, as float32 tensors. Without labels each item is a
    2-tuple, otherwise a 3-tuple ending with the label.
    """

    @staticmethod
    def _to_tensor(array, dtype):
        # Cast on the NumPy side first, then copy into a tensor.
        return torch.tensor(array.astype(dtype))

    def __init__(self, search, desc, labels=None):
        self.search = self._to_tensor(search, np.int64)
        self.desc = self._to_tensor(desc, np.int64)
        self.labels = None if labels is None else self._to_tensor(labels, np.float32)

    def __len__(self):
        return len(self.search)

    def __getitem__(self, idx):
        pair = (self.search[idx], self.desc[idx])
        if self.labels is None:
            return pair
        return pair + (self.labels[idx],)

class CharCNNEncoder(nn.Module):
    """Character-level CNN that turns an id sequence into a fixed-size vector.

    Pipeline: embedding (id 0 is the padding index) -> three Conv1d/ReLU
    stages with max-pooling -> global max over the sequence axis -> linear
    projection to ``hidden_dim``.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)

        # Layers are listed in forward order; their positions in the
        # Sequential determine the parameter names (convs.0, convs.3, convs.6).
        stages = [
            nn.Conv1d(emb_dim, 128, kernel_size=7, padding=3),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(128, 256, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),
        ]
        self.convs = nn.Sequential(*stages)
        self.fc = nn.Linear(256, hidden_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        # Conv1d wants channels before the sequence axis.
        channels_first = embedded.transpose(1, 2)
        # The adaptive pool leaves a trailing axis of size 1; drop it.
        pooled = self.convs(channels_first).squeeze(-1)
        return self.fc(pooled)

class SiameseCNN(nn.Module):
    """Siamese regressor: one shared encoder for both inputs plus an MLP head.

    Both sequences go through the same ``CharCNNEncoder``. The head sees the
    two encodings, their absolute difference and their element-wise product,
    and outputs one score per sample. The final bias starts at
    ``target_mean``.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim, target_mean=2.38):
        super().__init__()
        self.encoder = CharCNNEncoder(vocab_size, emb_dim, hidden_dim)

        head_layers = [
            nn.Linear(hidden_dim * 4, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 1),
        ]
        self.fc = nn.Sequential(*head_layers)
        # Start the output bias at target_mean (no gradient is recorded).
        nn.init.constant_(head_layers[-1].bias, target_mean)

    def forward(self, s, d):
        search_vec = self.encoder(s)
        desc_vec = self.encoder(d)

        interaction = [
            search_vec,
            desc_vec,
            torch.abs(search_vec - desc_vec),
            search_vec * desc_vec,
        ]
        return self.fc(torch.cat(interaction, dim=1)).squeeze(-1)

def load_data():
    """Read the prepared arrays from ``DATA_FILE``.

    Returns:
        A 9-tuple: train search/desc/labels, validation search/desc/labels,
        test search/desc, and ``vocab_size`` (number of mapped characters
        plus one).

    Raises:
        FileNotFoundError: if ``DATA_FILE`` does not exist.
    """
    print("Loading data...")
    if not os.path.exists(DATA_FILE):
        raise FileNotFoundError(f"{DATA_FILE} not found. Please run step1_preprocess.py first.")

    array_keys = (
        'X_train_search', 'X_train_desc', 'y_train',
        'X_val_search', 'X_val_desc', 'y_val',
        'X_test_search', 'X_test_desc',
    )
    # allow_pickle is required because 'char_to_int' is an object entry
    # unwrapped with .item(); only load archives you produced yourself.
    with np.load(DATA_FILE, allow_pickle=True) as archive:
        mapping = archive['char_to_int'].item()
        vocab_size = len(mapping) + 1
        arrays = tuple(archive[key] for key in array_keys)
    return arrays + (vocab_size,)

def log_results(runtime, train_rmse, val_rmse, train_mae, val_mae):
    """Append the Siamese CNN's train/validation metrics to ``RESULTS_FILE``.

    Metrics are written as 4-decimal strings; test columns get a placeholder.
    A header row is emitted only when the file does not exist yet.
    """
    test_placeholder = "N/A (See evaluate.py)"
    entry = {
        'Model type': 'Character level CNN (Siamese)',
        'runtime': f"{runtime:.2f} sec",
        'Train RMSE': f"{train_rmse:.4f}",
        'Val-RMSE': f"{val_rmse:.4f}",
        'Test-RMSE': test_placeholder,
        'Train MAE': f"{train_mae:.4f}",
        'Val-MAE': f"{val_mae:.4f}",
        'Test-MAE': test_placeholder,
    }
    frame = pd.DataFrame([entry])
    needs_header = not os.path.exists(RESULTS_FILE)
    frame.to_csv(RESULTS_FILE, mode='a', header=needs_header, index=False)
    print(f"\nTraining Results saved to {RESULTS_FILE}")

def main():
    """Final evaluation of the trained Siamese model (excerpt).

    NOTE(review): this function is an elided excerpt. The names ``model``,
    ``device``, ``X_s_tr``/``X_d_tr``/``y_tr``, ``X_s_val``/``X_d_val``/
    ``y_val``, ``X_s_te``/``X_d_te`` and ``total_time`` are never assigned in
    the visible code; they are presumably defined by the omitted setup and
    training sections. As written the function raises NameError.
    """
    # ... (existing device and loading code) ...
    # ...
    # After training loop:
    # Load best model for final Train/Val/Test evaluation
    print("\nLoading best model for final evaluation...")
    model.load_state_dict(torch.load(MODEL_SAVE_PATH))
    
    # Evaluation mode (affects layers such as Dropout).
    model.eval()
    def get_metrics(loader):
        # Collect predictions and targets over the whole loader, then score
        # predictions clipped to the [1.0, 3.0] range.
        preds, targets = [], []
        with torch.no_grad():
            for s, d, y in loader:
                s, d = s.to(device), d.to(device)
                out = model(s, d)
                preds.extend(out.cpu().numpy())
                targets.extend(y.numpy())
        preds = np.clip(preds, 1.0, 3.0)
        rmse = np.sqrt(np.mean((np.array(targets) - np.array(preds))**2))
        mae = np.mean(np.abs(np.array(targets) - np.array(preds)))
        return rmse, mae

    # Loaders use the default shuffle=False, so sample order is preserved.
    tr_rmse, tr_mae = get_metrics(DataLoader(CharDataset(X_s_tr, X_d_tr, y_tr), batch_size=BATCH_SIZE))
    val_rmse, val_mae = get_metrics(DataLoader(CharDataset(X_s_val, X_d_val, y_val), batch_size=BATCH_SIZE))

    # Test Prediction
    # No labels exist for the test set, so the dataset yields 2-tuples here.
    test_loader = DataLoader(CharDataset(X_s_te, X_d_te, labels=None), batch_size=BATCH_SIZE)
    # ... (rest of the test pred logic) ...
    
    # Log Results
    log_results(total_time, tr_rmse, val_rmse, tr_mae, val_mae)


# --- Main Execution ---
# Runs main() immediately when this cell is executed.
main()

## Step 4: Feature Extraction & Machine Learning
Extracts embeddings from the trained CNN and trains XGBoost/Ridge models.

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os
import time

# --- Configuration ---
# NOTE(review): DATA_FILE, MODEL_PATH and the three hyperparameters below are
# not referenced in the visible code; presumably used by the elided loading
# and feature-extraction part of main() - confirm.
DATA_FILE = 'data/dl_data.npz'
MODEL_PATH = 'outputs/siamese_char_cnn.pt'
# Shared CSV log that log_results() appends to.
RESULTS_FILE = 'results_log.csv'
# Destinations for the XGBoost and Ridge test-set predictions.
SUBMISSION_XGB = 'outputs/submission_char_fe_xgb.npy'
SUBMISSION_RIDGE = 'outputs/submission_char_fe_ridge.npy'
BATCH_SIZE = 128
EMBEDDING_DIM = 64
HIDDEN_DIM = 256

# --- Re-define Model Classes (Must match training) ---
class CharCNNEncoder(nn.Module):
    """Character-level CNN encoder, redefined here for feature extraction.

    Pipeline: embedding (id 0 is the padding index) -> three Conv1d/ReLU
    stages with max-pooling -> global max over the sequence axis -> linear
    projection to ``hidden_dim``. Attribute names and layer positions fix
    the parameter names a saved checkpoint must match.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        conv_stack = [
            nn.Conv1d(emb_dim, 128, kernel_size=7, padding=3),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(128, 256, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),
        ]
        self.convs = nn.Sequential(*conv_stack)
        self.fc = nn.Linear(256, hidden_dim)

    def forward(self, x):
        # Conv1d wants channels before the sequence axis.
        channels_first = self.embedding(x).transpose(1, 2)
        # The adaptive pool leaves a trailing axis of size 1; drop it.
        pooled = self.convs(channels_first).squeeze(-1)
        return self.fc(pooled)

class SiameseCNN(nn.Module):
    """Encoder-only Siamese wrapper used for feature extraction.

    Holds just the shared ``encoder``; ``forward`` returns the two encodings
    instead of a score.
    NOTE(review): this class has no regression head, so a checkpoint that
    also contains head weights would need non-strict loading - the loading
    code is not visible here; confirm.
    """

    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.encoder = CharCNNEncoder(vocab_size, emb_dim, hidden_dim)

    def forward(self, s, d):
        search_vec = self.encoder(s)
        desc_vec = self.encoder(d)
        return search_vec, desc_vec

class CharDataset(Dataset):
    """Unlabeled dataset yielding (search ids, description ids) int64 pairs."""

    def __init__(self, search, desc):
        as_ids = lambda array: torch.tensor(array.astype(np.int64))
        self.search = as_ids(search)
        self.desc = as_ids(desc)

    def __len__(self):
        return len(self.search)

    def __getitem__(self, idx):
        query = self.search[idx]
        document = self.desc[idx]
        return query, document

def extract_features(model, loader, device):
    """Run *model* over *loader* and build one feature row per sample.

    For each batch the model returns two encodings ``h1`` and ``h2``. The
    row is the horizontal concatenation of ``h1``, ``h2``, ``|h1 - h2|``,
    ``h1 * h2``, their cosine similarity and their Euclidean distance.

    Returns:
        A 2-D NumPy array with all batches stacked vertically.
    """

    def pair_features(h1, h2):
        diff = np.abs(h1 - h2)
        prod = h1 * h2
        norm_product = (
            np.linalg.norm(h1, axis=1, keepdims=True) * np.linalg.norm(h2, axis=1, keepdims=True)
        )
        # The small epsilon guards against division by zero for all-zero vectors.
        cosine = np.sum(prod, axis=1, keepdims=True) / (norm_product + 1e-8)
        euclid = np.linalg.norm(h1 - h2, axis=1, keepdims=True)
        return np.hstack([h1, h2, diff, prod, cosine, euclid])

    model.eval()
    batches = []
    with torch.no_grad():
        for s, d in loader:
            h1, h2 = model(s.to(device), d.to(device))
            batches.append(pair_features(h1.cpu().numpy(), h2.cpu().numpy()))

    return np.vstack(batches)

def log_results(model_name, runtime, train_rmse, val_rmse, train_mae, val_mae):
    """Append one model's train/validation metrics to ``RESULTS_FILE``.

    Metrics are written as 4-decimal strings; test columns get a placeholder.
    A header row is emitted only when the file does not exist yet.
    """
    print(f"Logging {model_name}...")
    test_placeholder = "N/A (See evaluate.py)"
    entry = {
        'Model type': model_name,
        'runtime': f"{runtime:.2f} sec",
        'Train RMSE': f"{train_rmse:.4f}",
        'Val-RMSE': f"{val_rmse:.4f}",
        'Test-RMSE': test_placeholder,
        'Train MAE': f"{train_mae:.4f}",
        'Val-MAE': f"{val_mae:.4f}",
        'Test-MAE': test_placeholder,
    }
    frame = pd.DataFrame([entry])
    needs_header = not os.path.exists(RESULTS_FILE)
    frame.to_csv(RESULTS_FILE, mode='a', header=needs_header, index=False)

def main():
    """Train XGBoost and Ridge on the extracted CNN features (excerpt).

    NOTE(review): this function is an elided excerpt. ``X_train_feats``,
    ``X_val_feats``, ``X_test_feats``, ``y_train`` and ``y_val`` are never
    assigned in the visible code; they are presumably produced by the omitted
    loading and feature-extraction section. As written the function raises
    NameError.
    """
    # ... (existing loading and feature extraction code) ...

    # 4. Train Model 1: XGBoost on Char Features...
    print("\nTraining Model 1: XGBoost on Char Features...")
    start = time.time()
    xgb_model = xgb.XGBRegressor(
        n_estimators=100, 
        max_depth=6, 
        learning_rate=0.1, 
        n_jobs=-1,
        random_state=42
    )
    xgb_model.fit(X_train_feats, y_train)
    # Runtime covers fitting only.
    rt = time.time() - start
    
    # Eval XGB
    # Train/val predictions are clipped to [1.0, 3.0] before scoring.
    tr_pred = np.clip(xgb_model.predict(X_train_feats), 1.0, 3.0)
    val_pred = np.clip(xgb_model.predict(X_val_feats), 1.0, 3.0)
    tr_rmse = np.sqrt(mean_squared_error(y_train, tr_pred))
    val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    tr_mae = mean_absolute_error(y_train, tr_pred)
    val_mae = mean_absolute_error(y_val, val_pred)
    
    print(f"XGB Results - Train RMSE: {tr_rmse:.4f}, Val RMSE: {val_rmse:.4f}")
    log_results("FE (Char) + XGBoost", rt, tr_rmse, val_rmse, tr_mae, val_mae)
    
    # Save XGB Preds
    # NOTE(review): test predictions are saved unclipped here, unlike the
    # train/val predictions above.
    te_pred = xgb_model.predict(X_test_feats)
    np.save(SUBMISSION_XGB, te_pred)
    
    # 5. Train Model 2: Ridge on Char Features...
    print("\nTraining Model 2: Ridge on Char Features...")
    start = time.time()
    ridge_model = Ridge(alpha=1.0)
    ridge_model.fit(X_train_feats, y_train)
    rt = time.time() - start
    
    # Eval Ridge
    tr_pred = np.clip(ridge_model.predict(X_train_feats), 1.0, 3.0)
    val_pred = np.clip(ridge_model.predict(X_val_feats), 1.0, 3.0)
    tr_rmse = np.sqrt(mean_squared_error(y_train, tr_pred))
    val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    tr_mae = mean_absolute_error(y_train, tr_pred)
    val_mae = mean_absolute_error(y_val, val_pred)
    
    print(f"Ridge Results - Train RMSE: {tr_rmse:.4f}, Val RMSE: {val_rmse:.4f}")
    log_results("FE (Char) + Ridge", rt, tr_rmse, val_rmse, tr_mae, val_mae)

    
    # Save Ridge Preds
    # NOTE(review): saved unclipped, same as the XGBoost test predictions.
    te_pred = ridge_model.predict(X_test_feats)
    np.save(SUBMISSION_RIDGE, te_pred)
    
    print("\nDone with Char Feature Extraction!")

# --- Main Execution ---
# Runs main() immediately when this cell is executed.
main()

## Step 5: Final Evaluation
Evaluates all models against the solution file and generates the report.

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error

# --- Configuration ---
# CSV with the ground-truth test relevance read by load_solution().
SOLUTION_FILE = 'data/solution.csv'
# Results log whose Test-RMSE / Test-MAE columns update_log() fills in.
RESULTS_FILE = 'results_log.csv'
# Display name -> saved prediction array (.npy) for every model to evaluate.
PRED_FILES = {
    'Benchmark': 'outputs/submission_benchmark.npy',
    'Siamese Char CNN': 'outputs/submission_siamese_char.npy',
    'FE (Char) + XGBoost': 'outputs/submission_char_fe_xgb.npy',
    'FE (Char) + Ridge': 'outputs/submission_char_fe_ridge.npy'
}

def load_solution(solution_file=None):
    """Load ground-truth relevance and the row positions that should be scored.

    Rows are excluded when ``relevance`` equals -1 or, if a ``Usage`` column
    is present, when ``Usage`` is ``'Ignored'``.

    Args:
        solution_file: Path to the solution CSV. Defaults to the
            module-level ``SOLUTION_FILE`` when omitted.

    Returns:
        ``(y_true, valid_indices)``: the kept relevance values and the
        0-based row numbers they came from, usable to slice a full-length
        prediction array. Returns ``(None, None)`` when the file is missing
        or has no ``relevance`` column.
    """
    path = SOLUTION_FILE if solution_file is None else solution_file
    if not os.path.exists(path):
        print(f"Error: {path} not found.")
        return None, None

    df = pd.read_csv(path)

    # Without labels there is nothing to score. Previously a file with a
    # 'Usage' column but no 'relevance' column raised KeyError here.
    if 'relevance' not in df.columns:
        return None, None

    mask = df['relevance'] != -1
    if 'Usage' in df.columns:
        # Keep Public and Private rows, drop the ones marked Ignored.
        mask = mask & (df['Usage'] != 'Ignored')

    # read_csv yields a default 0..n-1 index, so labels equal row positions.
    valid_indices = df.index[mask].to_numpy()
    y_true = df.loc[mask, 'relevance'].values
    return y_true, valid_indices

def evaluate_predictions(name, pred_file, y_true, valid_indices):
    """Score one saved prediction array against the ground truth.

    Args:
        name: Display name of the model, used in printed messages.
        pred_file: Path to a ``.npy`` file with one prediction per row of the
            full solution file.
        y_true: Ground-truth values for the rows in *valid_indices*.
        valid_indices: 0-based row numbers selecting the scored predictions.

    Returns:
        ``(rmse, mae)`` computed on predictions clipped to [1.0, 3.0], or
        ``(None, None)`` when the file is missing, too short, there are no
        rows to score, or evaluation fails.
    """
    if not os.path.exists(pred_file):
        print(f"[{name}] Prediction file {pred_file} not found.")
        return None, None

    if len(valid_indices) == 0:
        print(f"[{name}] Warning: no valid rows to evaluate.")
        return None, None

    try:
        y_pred_full = np.load(pred_file)

        # The array must contain index max_index, i.e. have at least
        # max_index + 1 entries (the previous '<' check was off by one).
        max_index = np.max(valid_indices)
        if len(y_pred_full) <= max_index:
            print(f"[{name}] Warning: Prediction length {len(y_pred_full)} does not cover Max Index {max_index}.")
            return None, None

        y_pred = y_pred_full[valid_indices]

        # Clip just in case the saved predictions were not clipped.
        y_pred = np.clip(y_pred, 1.0, 3.0)

        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mae = mean_absolute_error(y_true, y_pred)

        print(f"--- {name} ---")
        print(f"  RMSE: {rmse:.4f}")
        print(f"  MAE:  {mae:.4f}")

        return rmse, mae
    except Exception as e:
        # Best-effort per model: report and let the caller continue with
        # the remaining prediction files.
        print(f"Error evaluating {name}: {e}")
        return None, None

def update_log(name, rmse, mae):
    """Write a model's test RMSE/MAE into its row of ``RESULTS_FILE``.

    Row selection, in order of preference:
      1. the first row whose 'Model type' contains the mapped name and whose
         'Test-RMSE' is still the placeholder (or missing);
      2. the first row whose 'Model type' equals the mapped name exactly;
      3. otherwise a new row is appended with "-" for the other metrics.

    Does nothing (apart from a message) when the log file does not exist.
    """
    if not os.path.exists(RESULTS_FILE):
        print("Results log not found, cannot update.")
        return

    df = pd.read_csv(RESULTS_FILE)

    # Display names used by the evaluator -> 'Model type' text in the CSV.
    display_to_csv = {
        'Benchmark': 'Naive Benchmark',
        'Siamese Char CNN': 'Character level CNN',
        'FE (Char) + XGBoost': 'FE (Char) + XGBoost',
        'FE (Char) + Ridge': 'FE (Char) + Ridge'
    }
    csv_name = display_to_csv.get(name, name)

    def awaiting_test_metrics(row):
        if csv_name not in str(df.loc[row, 'Model type']):
            return False
        current = df.loc[row, 'Test-RMSE']
        return current == "N/A (See evaluate.py)" or pd.isna(current)

    def exact_name(row):
        return csv_name == str(df.loc[row, 'Model type'])

    def store(row):
        df.loc[row, 'Test-RMSE'] = round(rmse, 4)
        df.loc[row, 'Test-MAE'] = round(mae, 4)

    i = next((row for row in range(len(df)) if awaiting_test_metrics(row)), None)
    if i is not None:
        store(i)
        print(f"Updated existing entry for {df.loc[i, 'Model type']}")
    else:
        i = next((row for row in range(len(df)) if exact_name(row)), None)
        if i is not None:
            store(i)
            print(f"Overwrote existing entry for {df.loc[i, 'Model type']}")
        else:
            print(f"Could not find entry for {name} in {RESULTS_FILE}. Appending new row.")
            new_row = {
                'Model type': csv_name,
                'runtime': "-",
                'Train RMSE': "-",
                'Val-RMSE': "-",
                'Test-RMSE': round(rmse, 4),
                'Train MAE': "-",
                'Val-MAE': "-",
                'Test-MAE': round(mae, 4)
            }
            df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

    df.to_csv(RESULTS_FILE, index=False)

def main():
    """Evaluate every prediction file in ``PRED_FILES`` and update the log.

    Stops early with a message when the ground truth cannot be loaded.
    Models whose evaluation yields no score are skipped.
    """
    print("Loading Ground Truth from solution.csv...")
    y_true, valid_indices = load_solution()

    if y_true is None:
        print("Could not load valid labels.")
        return

    print(f"Found {len(y_true)} valid test samples (filtered 'Ignored/-1').")

    for name, pred_file in PRED_FILES.items():
        scores = evaluate_predictions(name, pred_file, y_true, valid_indices)
        if scores[0] is None:
            continue
        update_log(name, *scores)

    print(f"\nEvaluation Complete. Updated {RESULTS_FILE}.")

# --- Main Execution ---
# Runs the evaluation immediately when this cell is executed.
main()