In [3]:
import numpy as np
import pandas as pd
import pickle
import os
import time
import torch
import faiss
from sklearn.preprocessing import StandardScaler

def gpu_neighbor_selection(train_data, train_ids, test_ids, k=5, 
                          output_path=None, use_embedding=True, batch_size=1024,
                          similarity_threshold=None, min_neighbors=1,
                          random_state=None):
    """
    GPU-accelerated neighbor selection using FAISS and PyTorch,
    with filtering to keep only sufficiently similar neighbors.
    
    Each listing is reduced to its most recent row, described by property,
    amenity (``has_*``) and location features, standardised with statistics
    fitted on the train listings, optionally passed through a small embedding
    network trained on the train listings, and finally matched against the
    train listings with a FAISS L2 index living on GPU 0.
    
    Parameters:
    -----------
    train_data : pandas.DataFrame
        The complete dataset containing both training and testing listings.
        Must contain 'listing_id' and 'date' columns.
    train_ids : list
        List of listing IDs to use as potential neighbors (training set)
    test_ids : list
        List of listing IDs that need neighbors (test set)
    k : int
        Maximum number of neighbors to keep for each listing
    output_path : str, optional
        Path to save the neighbor dictionary (pickle). A CSV with the same
        stem is written next to it when at least one neighbor was found.
    use_embedding : bool, default=True
        Whether to use neural embeddings for listings. When False, the
        standardised features are weighted and L2-normalised instead.
    batch_size : int, default=1024
        Batch size for GPU processing
    similarity_threshold : float, optional
        Neighbors are kept only when the distance reported by FAISS is
        strictly below this value (None=auto-determine as 2.5x the median
        first-nearest-neighbor distance). NOTE(review): FAISS L2 indexes
        report squared Euclidean distances - confirm the scale you expect.
    min_neighbors : int, default=1
        Minimum number of neighbors to keep per listing, even when they do
        not pass the threshold
    random_state : int, optional
        Seed for the PyTorch random generators. None (the default) leaves the
        generators untouched, so embedding training is not reproducible.
        
    Returns:
    --------
    dict
        Dictionary mapping test listing IDs to lists of their most similar train listing IDs
        
    Raises:
    -------
    RuntimeError
        If CUDA is not available.
    ValueError
        If no train or test listing is found, or no feature column is available.
    """
    # Check if CUDA is available and ensure we use GPU
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available. This function requires a GPU.")
    
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    start_time = time.time()
    
    # Optional seeding so the embedding training can be reproduced
    if random_state is not None:
        torch.manual_seed(random_state)
        torch.cuda.manual_seed_all(random_state)
    
    # Filter data to create train and test datasets from the same source
    train_df = train_data[train_data['listing_id'].isin(train_ids)].copy()
    test_df = train_data[train_data['listing_id'].isin(test_ids)].copy()
    
    # Get one representative record per listing (most recent)
    train_df = train_df.sort_values('date', ascending=False).drop_duplicates('listing_id')
    test_df = test_df.sort_values('date', ascending=False).drop_duplicates('listing_id')
    
    print(f"Processing {len(train_df)} train and {len(test_df)} test listings")
    
    # Fail early with a clear message instead of deep inside the scaler / FAISS
    if train_df.empty or test_df.empty:
        raise ValueError(
            "Need at least one train and one test listing; "
            f"found {len(train_df)} train and {len(test_df)} test listings"
        )
    
    # Define feature groups
    property_features = ['accommodates', 'bedrooms', 'bathrooms', 'amenity_count', 
                         'luxury_score', 'essential_score', 'bedroom_ratio']
    amenity_features = [col for col in train_data.columns if col.startswith('has_')]
    location_features = ['latitude', 'longitude', 'neighbourhood_cleansed_encoded']
    
    # Get available features
    available_property = [f for f in property_features if f in train_df.columns]
    available_amenity = [f for f in amenity_features if f in train_df.columns]
    available_location = [f for f in location_features if f in train_df.columns]
    
    all_features = available_property + available_amenity + available_location
    if not all_features:
        raise ValueError("No features available for similarity calculation")
    
    # Fill missing values. Property features and coordinates use the TRAIN median
    # for both frames (consistent with the scaler, which is also fitted on train);
    # everything else uses 0. A column that is entirely missing in train has a NaN
    # median, so fall back to 0 rather than letting NaN reach the embeddings.
    median_filled = set(available_property) | {'latitude', 'longitude'}
    for col in all_features:
        if not (train_df[col].isnull().any() or test_df[col].isnull().any()):
            continue
        fill_value = 0
        if col in median_filled:
            fill_value = train_df[col].median()
            if pd.isna(fill_value):
                fill_value = 0
        train_df[col] = train_df[col].fillna(fill_value)
        test_df[col] = test_df[col].fillna(fill_value)
    
    # Normalize features (statistics come from the train listings only)
    scaler = StandardScaler()
    train_features = scaler.fit_transform(train_df[all_features].values)
    test_features = scaler.transform(test_df[all_features].values)
    
    # Convert to PyTorch tensors
    train_tensor = torch.FloatTensor(train_features).cuda()
    test_tensor = torch.FloatTensor(test_features).cuda()
    
    # Option to use neural embeddings to improve similarity calculation
    if use_embedding:
        class EmbeddingNetwork(torch.nn.Module):
            def __init__(self, input_dim, embedding_dim=32):
                super(EmbeddingNetwork, self).__init__()
                self.fc1 = torch.nn.Linear(input_dim, 64)
                self.bn1 = torch.nn.BatchNorm1d(64)
                self.fc2 = torch.nn.Linear(64, embedding_dim)
                
            def forward(self, x):
                x = torch.relu(self.bn1(self.fc1(x)))
                x = self.fc2(x)
                # Normalize embeddings to unit length
                x = torch.nn.functional.normalize(x, p=2, dim=1)
                return x
        
        # Train a simple embedding network
        print("Training embedding network for improved similarity...")
        embedding_dim = 32
        model = EmbeddingNetwork(len(all_features), embedding_dim).cuda()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        
        # Unsupervised loss computed from pairwise distances within one batch
        def batch_contrastive_loss(embeddings):
            n = embeddings.size(0)
            
            # If batch size is too small, use a simpler loss
            if n < 4:
                # Simple regularization loss
                return torch.mean(torch.norm(embeddings, dim=1))
            
            # Matrix of pairwise distances within the batch only
            dists = torch.cdist(embeddings, embeddings)
            
            # Drop self comparisons (the diagonal) and reshape to (n, n - 1)
            off_diagonal = ~torch.eye(n, dtype=torch.bool, device=embeddings.device)
            non_diag_dists = torch.masked_select(dists, off_diagonal).reshape(n, n - 1)
            
            # Closest and furthest other point for each embedding
            closest = non_diag_dists.min(dim=1)[0]
            furthest = non_diag_dists.max(dim=1)[0]
            
            # Minimize closest distance, maximize (down-weighted) furthest distance
            return closest.mean() - furthest.mean() * 0.1
        
        # Train embedding model with batching for memory efficiency
        model.train()
        num_train = train_tensor.shape[0]
        num_epochs = 20
        
        # Adjust batch size if needed
        actual_batch_size = min(batch_size, num_train)
        print(f"Using batch size: {actual_batch_size}")
        
        for epoch in range(num_epochs):
            total_loss = 0
            num_batches = 0
            
            # Shuffle data each epoch
            shuffled = train_tensor[torch.randperm(num_train)]
            
            # Process in batches
            for start in range(0, num_train, actual_batch_size):
                batch = shuffled[start:start + actual_batch_size]
                
                # Skip tiny batches (BatchNorm needs more than one sample in training mode)
                if batch.shape[0] < 2:
                    continue
                
                optimizer.zero_grad()
                loss = batch_contrastive_loss(model(batch))
                loss.backward()
                optimizer.step()
                
                total_loss += loss.item()
                num_batches += 1
            
            avg_loss = total_loss / max(1, num_batches)
            if (epoch + 1) % 5 == 0:
                print(f"Epoch {epoch+1}/{num_epochs}, Avg Loss: {avg_loss:.6f}")
        
        def embed_in_batches(features_tensor):
            # Run the trained model batch by batch and collect the results on the CPU
            chunks = [
                model(features_tensor[start:start + actual_batch_size]).cpu().numpy()
                for start in range(0, features_tensor.shape[0], actual_batch_size)
            ]
            return np.vstack(chunks)
        
        # Generate embeddings for all listings
        model.eval()
        with torch.no_grad():
            train_embeddings = embed_in_batches(train_tensor)
            test_embeddings = embed_in_batches(test_tensor)
    else:
        # Without embedding model, just use weighted features
        feature_weights = np.ones(len(all_features), dtype=np.float32)
        for i, feature in enumerate(all_features):
            if feature in available_location:
                # Higher weight for location features
                feature_weights[i] = 2.0
            elif feature in ['accommodates', 'bedrooms', 'bathrooms']:
                # Medium weight for key property features
                feature_weights[i] = 1.5
        
        # Normalize weights
        feature_weights = feature_weights / feature_weights.sum()
        weights_tensor = torch.FloatTensor(feature_weights).cuda()
        
        train_weighted = train_tensor * weights_tensor
        test_weighted = test_tensor * weights_tensor
        
        # Normalize vectors
        train_embeddings = torch.nn.functional.normalize(train_weighted, p=2, dim=1).cpu().numpy()
        test_embeddings = torch.nn.functional.normalize(test_weighted, p=2, dim=1).cpu().numpy()
    
    # Use FAISS for efficient GPU-accelerated nearest neighbor search
    print("Building FAISS index for fast similarity search...")
    # FAISS expects C-contiguous float32 matrices
    train_embeddings = np.ascontiguousarray(train_embeddings, dtype=np.float32)
    test_embeddings = np.ascontiguousarray(test_embeddings, dtype=np.float32)
    num_train_vectors, dimension = train_embeddings.shape
    
    # Choose appropriate index type based on data size
    nlist = None  # only set for the IVF (approximate) index
    if num_train_vectors < 10000:
        # For smaller datasets, exact search is fine
        index = faiss.IndexFlatL2(dimension)
    else:
        # For larger datasets, use approximate search
        nlist = min(4096, int(num_train_vectors / 30))  # Rule of thumb
        quantizer = faiss.IndexFlatL2(dimension)
        index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)
        # Need to train the index
        index.train(train_embeddings)
    
    # Use GPU for the index - no fallback.
    # `res` stays referenced until the function returns, i.e. for the whole
    # lifetime of the GPU index.
    res = faiss.StandardGpuResources()
    index = faiss.index_cpu_to_gpu(res, 0, index)
    print("Using GPU FAISS index")
    
    # Add vectors to the index
    index.add(train_embeddings)
    
    # If using IVF index, set the number of cells to probe
    if nlist is not None and hasattr(index, 'nprobe'):
        index.nprobe = min(50, nlist)  # Trade-off between speed and accuracy
    
    # Retrieve more neighbors than needed to allow for filtering (3x, at least k+5),
    # but never more than there are train vectors: FAISS pads missing results
    # with label -1.
    search_k = min(max(k * 3, k + 5), num_train_vectors)
    
    # Perform the search
    print(f"Searching for {search_k} candidate neighbors per test listing...")
    distances, indices = index.search(test_embeddings, search_k)
    
    # Auto-determine similarity threshold if not provided
    if similarity_threshold is None:
        # Median of the first-nearest-neighbor distance (ignoring padded results)
        # multiplied by a factor
        first_nn_distances = distances[indices[:, 0] >= 0, 0]
        median_first_nn = np.median(first_nn_distances) if first_nn_distances.size else 0.0
        similarity_threshold = median_first_nn * 2.5  # Allow distances up to 2.5x the median first-nn distance
        print(f"Auto-determined similarity threshold: {similarity_threshold:.4f}")
    
    # Create neighbor dictionary with similarity filtering
    neighbor_dict = {}
    total_filtered_out = 0
    listings_with_few_neighbors = 0
    
    # Positional lookup: FAISS labels are row positions in train_embeddings,
    # which follow the row order of train_df
    train_listing_ids = train_df['listing_id'].to_numpy()
    
    for i, test_id in enumerate(test_df['listing_id'].values):
        # Keep only real results (drop -1 padding / out-of-range labels)
        found = (indices[i] >= 0) & (indices[i] < num_train_vectors)
        listing_indices = indices[i][found]
        listing_distances = distances[i][found]
        
        # Filter by similarity threshold
        valid_mask = listing_distances < similarity_threshold
        valid_indices = listing_indices[valid_mask]
        valid_distances = listing_distances[valid_mask]
        
        # Ensure we have at least min_neighbors
        if len(valid_indices) < min_neighbors and len(listing_indices) > 0:
            # If we don't have enough neighbors after filtering, take the closest ones
            # regardless of threshold (up to min_neighbors)
            listings_with_few_neighbors += 1
            valid_indices = listing_indices[:min_neighbors]
            valid_distances = listing_distances[:min_neighbors]
        
        # Sort by distance (most similar first) and limit to k neighbors
        order = np.argsort(valid_distances, kind='stable')[:k]
        neighbors = [train_listing_ids[idx] for idx in valid_indices[order]]
        filtered_distances = [float(dist) for dist in valid_distances[order]]
        
        # Count real candidates that were not kept
        total_filtered_out += len(listing_indices) - len(neighbors)
        
        # Store in dictionary with distance information
        neighbor_dict[test_id] = {
            'neighbors': neighbors,
            'distances': filtered_distances
        }
    
    print(f"Found neighbors for {len(neighbor_dict)} test listings")
    print(f"Filtered out {total_filtered_out} neighbors in total due to similarity threshold")
    print(f"{listings_with_few_neighbors} listings had fewer than requested neighbors after filtering")
    
    # Get average number of neighbors per listing after filtering
    avg_neighbors = np.mean([len(info['neighbors']) for info in neighbor_dict.values()])
    print(f"Average number of neighbors per listing after filtering: {avg_neighbors:.2f}")
    
    # Save results if path provided
    if output_path:
        with open(output_path, 'wb') as f:
            pickle.dump(neighbor_dict, f)
        print(f"Saved neighbor dictionary to {output_path}")
        
        # Save a readable version with distances
        neighbor_rows = [
            {
                'test_listing_id': test_id,
                'neighbor_listing_id': neighbor_id,
                'rank': rank,
                'distance': distance
            }
            for test_id, info in neighbor_dict.items()
            for rank, (neighbor_id, distance) in enumerate(
                zip(info['neighbors'], info['distances']), start=1)
        ]
        neighbor_df = pd.DataFrame(neighbor_rows)
        
        # Only save if we have neighbors
        if not neighbor_df.empty:
            csv_path = f"{os.path.splitext(output_path)[0]}.csv"
            neighbor_df.to_csv(csv_path, index=False)
            print(f"Saved human-readable neighbor list to {csv_path}")
    
    # Generate analysis of neighbor quality
    if 'latitude' in train_df.columns and 'longitude' in train_df.columns:
        # Mapping from listing ID to coordinates. Built column-wise on purpose:
        # iterrows() would upcast integer IDs to float64 next to the float
        # coordinates and corrupt IDs larger than 2**53.
        coords = pd.concat([train_df, test_df])
        locations = dict(zip(
            coords['listing_id'].tolist(),
            zip(coords['latitude'].tolist(), coords['longitude'].tolist())
        ))
        
        # Calculate average geographic distance to neighbors
        geo_distances = []
        for test_id, info in neighbor_dict.items():
            if test_id in locations:
                test_loc = locations[test_id]
                for neighbor_id in info['neighbors']:
                    if neighbor_id in locations:
                        neighbor_loc = locations[neighbor_id]
                        # Simple Euclidean distance in degrees (not haversine, but sufficient for analysis)
                        dist = np.sqrt((test_loc[0] - neighbor_loc[0])**2 + 
                                       (test_loc[1] - neighbor_loc[1])**2)
                        geo_distances.append(dist)
        
        if geo_distances:
            avg_dist = np.mean(geo_distances)
            median_dist = np.median(geo_distances)
            print(f"Average geographic distance to neighbors: {avg_dist:.6f}")
            print(f"Median geographic distance to neighbors: {median_dist:.6f}")
    
    end_time = time.time()
    print(f"GPU-accelerated neighbor selection completed in {end_time - start_time:.2f} seconds")
    
    # Return a simplified version of the neighbor dictionary for easier use
    simplified_neighbor_dict = {test_id: info['neighbors'] for test_id, info in neighbor_dict.items()}
    return simplified_neighbor_dict


def _haversine_km(origin, destination, radius_km=6371):
    """Great-circle distance in km between two (latitude, longitude) pairs given in degrees."""
    lat1, lon1 = np.radians(origin)
    lat2, lon2 = np.radians(destination)
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1], which
    # would make sqrt/arcsin return NaN; clip before taking the inverse.
    c = 2 * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))
    return c * radius_km


def evaluate_neighbor_quality(train_data, train_ids, test_ids, neighbor_dict):
    """
    Evaluate the quality of the selected neighbors.
    
    Four groups of metrics are computed, each only when the columns it needs
    are present in ``train_data``:
    
    1. geographic proximity (haversine distance, needs 'latitude'/'longitude')
    2. amenity similarity (Jaccard-style ratio over the ``has_*`` columns)
    3. price prediction error (mean neighbor price on the same date vs. the
       actual price; needs 'date' and 'original_price' or 'price')
    4. neighbor count statistics
    
    Parameters:
    -----------
    train_data : pandas.DataFrame
        The complete dataset containing both training and testing listings
    train_ids : list
        List of listing IDs used as potential neighbors (training set)
    test_ids : list
        List of listing IDs that need neighbors (test set)
    neighbor_dict : dict
        Dictionary mapping test listing IDs to neighbor listing IDs. Values may
        be plain lists of IDs or dicts with a 'neighbors' key.
        
    Returns:
    --------
    dict
        Dictionary with evaluation metrics
    """
    # Filter data to create train and test datasets
    train_df = train_data[train_data['listing_id'].isin(train_ids)].copy()
    test_df = train_data[train_data['listing_id'].isin(test_ids)].copy()
    
    results = {}
    
    # Convert neighbor_dict to standard format if it's the detailed version
    neighbor_dict = {
        test_id: value['neighbors'] if isinstance(value, dict) and 'neighbors' in value else value
        for test_id, value in neighbor_dict.items()
    }
    
    # 1. Evaluate geographic proximity
    if 'latitude' in train_data.columns and 'longitude' in train_data.columns:
        # Location lookup, built column-wise: iterrows() would upcast integer
        # listing IDs to float64 alongside the float coordinates, so IDs above
        # 2**53 would be rounded and never match the neighbor IDs.
        coords = pd.concat([train_df, test_df])
        locations = dict(zip(
            coords['listing_id'].tolist(),
            zip(coords['latitude'].tolist(), coords['longitude'].tolist())
        ))
        
        geo_distances = []
        for test_id, neighbors in neighbor_dict.items():
            if test_id not in locations:
                continue
            test_loc = locations[test_id]
            for neighbor_id in neighbors:
                if neighbor_id in locations:
                    geo_distances.append(_haversine_km(test_loc, locations[neighbor_id]))
        
        results['avg_geo_distance_km'] = np.mean(geo_distances) if geo_distances else float('nan')
        results['median_geo_distance_km'] = np.median(geo_distances) if geo_distances else float('nan')
    
    # 2. Evaluate amenity similarity
    amenity_cols = [col for col in train_data.columns if col.startswith('has_')]
    if amenity_cols:
        def amenity_lookup(df):
            # One amenity vector per listing (first row of each listing)
            first_rows = df.drop_duplicates('listing_id')
            return dict(zip(first_rows['listing_id'].tolist(),
                            first_rows[amenity_cols].to_numpy()))
        
        train_amenities = amenity_lookup(train_df)
        test_amenities = amenity_lookup(test_df)
        
        jaccard_sims = []
        for test_id, neighbors in neighbor_dict.items():
            if test_id not in test_amenities:
                continue
            test_amn = test_amenities[test_id]
            for neighbor_id in neighbors:
                if neighbor_id not in train_amenities:
                    continue
                neighbor_amn = train_amenities[neighbor_id]
                
                # Jaccard similarity (pairs with an empty union are skipped)
                intersection = np.sum(np.minimum(test_amn, neighbor_amn))
                union = np.sum(np.maximum(test_amn, neighbor_amn))
                if union > 0:
                    jaccard_sims.append(intersection / union)
        
        results['avg_amenity_similarity'] = np.mean(jaccard_sims) if jaccard_sims else float('nan')
    
    # 3. Evaluate price prediction quality (skipped when the columns are missing)
    price_col = 'original_price' if 'original_price' in train_df.columns else 'price'
    price_errors = []
    
    if 'date' in train_data.columns and price_col in train_data.columns:
        # Lookup: listing ID -> {calendar date -> price}; later rows overwrite earlier ones
        train_prices = {}
        train_days = pd.to_datetime(train_df['date']).dt.date
        for lid, day, price in zip(train_df['listing_id'].tolist(),
                                   train_days.tolist(),
                                   train_df[price_col].tolist()):
            train_prices.setdefault(lid, {})[day] = price
        
        # Evaluate how well neighbors predict test prices on the same date
        test_days = pd.to_datetime(test_df['date']).dt.date
        for test_id, day, actual_price in zip(test_df['listing_id'].tolist(),
                                              test_days.tolist(),
                                              test_df[price_col].tolist()):
            if test_id not in neighbor_dict:
                continue
            neighbor_prices = [
                train_prices[neighbor_id][day]
                for neighbor_id in neighbor_dict[test_id]
                if neighbor_id in train_prices and day in train_prices[neighbor_id]
            ]
            if neighbor_prices:
                predicted_price = np.mean(neighbor_prices)
                price_errors.append(abs(predicted_price - actual_price))
    
    if price_errors:
        results['avg_price_error'] = np.mean(price_errors)
        results['median_price_error'] = np.median(price_errors)
        results['rmse'] = np.sqrt(np.mean(np.square(price_errors)))
    
    # 4. Add neighbor count statistics
    neighbor_counts = [len(neighbors) for neighbors in neighbor_dict.values()]
    if neighbor_counts:
        results['avg_neighbor_count'] = np.mean(neighbor_counts)
        results['min_neighbor_count'] = np.min(neighbor_counts)
        results['max_neighbor_count'] = np.max(neighbor_counts)
        results['listings_with_neighbors_pct'] = 100 * len([c for c in neighbor_counts if c > 0]) / len(neighbor_counts)
    
    return results


def _read_listing_ids(ids_path):
    """Read one integer listing ID per line from a text file, ignoring blank lines."""
    with open(ids_path, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        return [int(line) for line in stripped_lines if line]


def run_neighbor_selection(train_path, train_ids_path, test_ids_path, output_dir="./output", 
                           k=5, use_embedding=True, similarity_threshold=None, min_neighbors=1):
    """
    Run the neighbor selection process and save the results.
    Uses GPU-only implementation without CPU fallback.
    Only keeps sufficiently similar neighbors.
    
    Loads the listing table and the two ID files, derives 'amenity_count' and
    'bedroom_ratio' when they are missing, runs ``gpu_neighbor_selection``,
    evaluates the result with ``evaluate_neighbor_quality`` and writes
    ``neighbor_dict.pkl`` and ``neighbor_quality.txt`` into ``output_dir``.
    
    Parameters:
    -----------
    train_path : str
        Path to the training data CSV
    train_ids_path : str
        Path to file with training listing IDs (one integer per line;
        blank lines are ignored)
    test_ids_path : str
        Path to file with test listing IDs (same format)
    output_dir : str
        Directory to save output files (created if it does not exist)
    k : int
        Number of neighbors to find for each test listing
    use_embedding : bool
        Whether to use neural embeddings for similarity
    similarity_threshold : float, optional
        Maximum distance threshold for keeping neighbors (None=auto-determine)
    min_neighbors : int, default=1
        Minimum number of neighbors to keep per listing
        
    Returns:
    --------
    dict
        Whatever ``gpu_neighbor_selection`` returns (test listing ID -> list
        of neighbor listing IDs)
        
    Raises:
    -------
    RuntimeError
        If CUDA is not available.
    """
    # Verify CUDA is available
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available. This implementation requires a GPU.")
    
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Load data
    print(f"Loading data from {train_path}")
    train_data = pd.read_csv(train_path)
    
    # Load listing IDs
    print(f"Loading train IDs from {train_ids_path}")
    train_ids = _read_listing_ids(train_ids_path)
    
    print(f"Loading test IDs from {test_ids_path}")
    test_ids = _read_listing_ids(test_ids_path)
    
    print(f"Loaded {len(train_ids)} train IDs and {len(test_ids)} test IDs")
    
    # Convert date column to datetime if needed
    if 'date' in train_data.columns and not pd.api.types.is_datetime64_any_dtype(train_data['date']):
        train_data['date'] = pd.to_datetime(train_data['date'])
    
    # Create calculated features if not already present
    if 'amenity_count' not in train_data.columns:
        amenity_cols = [col for col in train_data.columns if col.startswith('has_')]
        if amenity_cols:
            train_data['amenity_count'] = train_data[amenity_cols].sum(axis=1)
    
    if 'bedroom_ratio' not in train_data.columns and 'bedrooms' in train_data.columns and 'accommodates' in train_data.columns:
        # clip(lower=1) avoids dividing by zero for listings with accommodates == 0
        train_data['bedroom_ratio'] = train_data['bedrooms'] / train_data['accommodates'].clip(lower=1)
    
    # Output path for the neighbor dictionary
    output_path = os.path.join(output_dir, "neighbor_dict.pkl")
    
    # Run GPU-accelerated neighbor selection with similarity filtering
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    neighbor_type = "neural embeddings" if use_embedding else "weighted features"
    threshold_type = "auto-determined" if similarity_threshold is None else f"user-specified ({similarity_threshold:.4f})"
    
    print(f"Finding neighbors using {neighbor_type} with {threshold_type} similarity threshold")
    print(f"Will keep a minimum of {min_neighbors} neighbors per listing")
    
    neighbors = gpu_neighbor_selection(
        train_data, train_ids, test_ids, k=k,
        output_path=output_path, use_embedding=use_embedding,
        similarity_threshold=similarity_threshold, min_neighbors=min_neighbors
    )
    
    # Evaluate neighbor quality
    print("\nEvaluating neighbor quality...")
    quality_metrics = evaluate_neighbor_quality(train_data, train_ids, test_ids, neighbors)
    
    # Print evaluation results
    print("\nNeighbor Quality Evaluation:")
    for metric, value in quality_metrics.items():
        if isinstance(value, float):
            print(f"  {metric}: {value:.4f}")
        else:
            print(f"  {metric}: {value}")
    
    # Save quality metrics
    with open(os.path.join(output_dir, "neighbor_quality.txt"), 'w') as f:
        f.write("Neighbor Quality Evaluation:\n")
        f.write(f"Number of neighbors requested per listing: {k}\n")
        f.write("Using similarity filtering: Yes\n")
        f.write(f"Similarity threshold: {threshold_type}\n")
        f.write(f"Minimum neighbors per listing: {min_neighbors}\n")
        f.write(f"Total test listings: {len(test_ids)}\n")
        f.write(f"Total train listings: {len(train_ids)}\n\n")
        for metric, value in quality_metrics.items():
            if isinstance(value, float):
                f.write(f"{metric}: {value:.6f}\n")
            else:
                f.write(f"{metric}: {value}\n")
    
    print(f"\nNeighbor selection complete. Results saved to {output_dir}")
    return neighbors

In [4]:
# Inputs for the "top_price_changers_subset" run.
# NOTE(review): these are machine-specific absolute paths; adjust before re-running elsewhere.
subset_train_csv = r"C:\Users\mvk\Documents\DATA_school\thesis\Subset\top_price_changers_subset\train_up3.csv"
subset_train_ids_txt = r"C:\Users\mvk\Documents\DATA_school\thesis\Subset\top_price_changers_subset\train_ids.txt"
subset_test_ids_txt = r"C:\Users\mvk\Documents\DATA_school\thesis\Subset\top_price_changers_subset\test_ids.txt"

# Keep at most 5 (and at least 1) neighbors per test listing, using the
# embedding network and an automatically determined distance threshold.
neighbors = run_neighbor_selection(
    subset_train_csv,
    subset_train_ids_txt,
    subset_test_ids_txt,
    output_dir="./neighbor_data",
    k=5,
    use_embedding=True,
    similarity_threshold=None,
    min_neighbors=1,
)

Loading data from C:\Users\mvk\Documents\DATA_school\thesis\Subset\top_price_changers_subset\train_up3.csv
Loading train IDs from C:\Users\mvk\Documents\DATA_school\thesis\Subset\top_price_changers_subset\train_ids.txt
Loading test IDs from C:\Users\mvk\Documents\DATA_school\thesis\Subset\top_price_changers_subset\test_ids.txt
Loaded 6291 train IDs and 1573 test IDs
Using GPU: NVIDIA GeForce RTX 4060 Laptop GPU
Finding neighbors using neural embeddings with auto-determined similarity threshold
Will keep a minimum of 1 neighbors per listing
Using GPU: NVIDIA GeForce RTX 4060 Laptop GPU
Processing 6291 train and 1573 test listings
Training embedding network for improved similarity...
Using batch size: 1024
Epoch 5/20, Avg Loss: 0.021838
Epoch 10/20, Avg Loss: -0.085147
Epoch 15/20, Avg Loss: -0.132964
Epoch 20/20, Avg Loss: -0.157095
Building FAISS index for fast similarity search...
Using GPU FAISS index
Searching for 15 candidate neighbors per test listing...
Auto-determined similarity