This notebook implements the sentence-embedding model used to build the item-item similarity matrices.

In [1]:
import json
import numpy as np
from scipy.sparse import csr_matrix, save_npz
from sentence_transformers import SentenceTransformer

# --- Configuration ---
# Line-delimited JSON metadata; main() reads 'gmap_id' and 'description' from each record.
META_FILE = 'Data/meta-District_of_Columbia.json'
# JSON object whose integer-like string keys map to gmap_ids; main() sorts the
# keys numerically to fix the row/column order of the matrix.
MAP_FILE = '20_core_1215/restaurant_item_map_rest_20.json'
# Destination of the similarity matrix, written via scipy.sparse.save_npz.
OUTPUT_FILE = 'Z_matrixes/item_sim_cosine.npz'

# Jitter added to the diagonal so the matrix is Positive Definite for Cholesky.
# Since we are using raw cosine (-1 to 1), we need a robust epsilon.
# 1e-5 is a commonly used value for numerical stability in Cholesky.
# NOTE(review): the matrix is stored as float32 in main(); confirm 1e-5 is large
# enough for the downstream factorization at that precision.
CHOLESKY_EPSILON = 1e-5

def _normalize_description(raw):
    """Coerce a raw 'description' field into a plain string ("" when absent)."""
    if raw is None:
        return ""
    if isinstance(raw, list):
        # str() every part so a stray non-string element cannot raise TypeError;
        # None parts are dropped rather than rendered as the text "None".
        return " ".join(str(part) for part in raw if part is not None)
    return str(raw)


def _load_ordered_ids(map_path):
    """Return the gmap_ids of the JSON map at *map_path*, ordered by integer key."""
    with open(map_path, 'r', encoding='utf-8') as f:
        item_map = json.load(f)
    # Sort by integer index so that position N in the list is the N-th smallest
    # key of the map.
    return [item_map[key] for key in sorted(item_map, key=int)]


def _load_descriptions(meta_path, target_ids):
    """Map gmap_id -> non-blank description for the records listed in *target_ids*.

    *meta_path* is read as line-delimited JSON; blank and malformed lines are
    skipped.
    """
    descriptions = {}
    with open(meta_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue

            gmap_id = record.get('gmap_id')
            if gmap_id not in target_ids:
                continue

            text = _normalize_description(record.get('description'))
            if text.strip():  # Only store if meaningful
                descriptions[gmap_id] = text
    return descriptions


def main():
    """Build the item-item cosine-similarity matrix and save it to OUTPUT_FILE.

    Steps:
      1. Load MAP_FILE to obtain the gmap_ids in matrix row order.
      2. Collect the description of every mapped item from META_FILE.
      3. Encode the available descriptions with the "all-MiniLM-L6-v2"
         SentenceTransformer and compute their raw cosine similarities.
      4. Scatter that block into a float32 matrix covering all items; items
         without a description keep 0.0 similarity to every other item.
      5. Set the diagonal to 1.0 + CHOLESKY_EPSILON.
      6. Save the result as a CSR matrix with scipy.sparse.save_npz, creating
         the output directory if it does not exist.

    Returns:
        None. If MAP_FILE or META_FILE is missing, an error is printed and the
        function returns early without writing anything.
    """
    # Local import: only needed to create the output directory below.
    import os

    # 1. Load the map to ensure matrix alignment
    print(f"Loading map from {MAP_FILE}...")
    try:
        ordered_gmap_ids = _load_ordered_ids(MAP_FILE)
    except FileNotFoundError:
        print(f"Error: Map file not found at {MAP_FILE}")
        return

    num_items = len(ordered_gmap_ids)

    # 2. Extract descriptions
    print(f"Extracting descriptions from {META_FILE}...")
    try:
        desc_map = _load_descriptions(META_FILE, set(ordered_gmap_ids))
    except FileNotFoundError:
        print(f"Error: Metadata file not found at {META_FILE}")
        return

    # 3. Prepare Valid Sentences
    print("Filtering valid descriptions...")
    valid_sentences = []
    valid_indices = []  # The actual row indices (0 to N-1) that have data
    for row, gmap_id in enumerate(ordered_gmap_ids):
        text = desc_map.get(gmap_id, "")
        if text:
            valid_sentences.append(text)
            valid_indices.append(row)

    num_valid = len(valid_indices)
    print("-" * 40)
    print(f"Total Items: {num_items}")
    print(f"Items with Description: {num_valid}")
    print(f"Missing Descriptions: {num_items - num_valid}")
    print("-" * 40)

    # 4. Initialize Full Matrix (Dense)
    # In raw cosine similarity:
    #   0.0  = Orthogonal (Unrelated / Neutral) -> avoids "punishing" items
    #          that merely lack a description (which -1.0 would do)
    #   1.0  = Identical
    #  -1.0  = Opposite
    print("Initializing full matrix with Neutral (0.0) values...")
    final_sim_matrix = np.zeros((num_items, num_items), dtype=np.float32)

    # 5. Compute and Fill Valid Block
    if num_valid > 0:
        print("Loading SentenceTransformer model...")
        model = SentenceTransformer("all-MiniLM-L6-v2")

        print("Encoding and computing Cosine Similarity...")
        embeddings = model.encode(valid_sentences)

        # Raw cosine similarity (-1 to 1), deliberately NOT rescaled to [0, 1]:
        # Cholesky accepts negative entries as long as the matrix is
        # Positive Definite.
        valid_sims = model.similarity(embeddings, embeddings)
        valid_sim_block = valid_sims.cpu().numpy()

        # Place the valid block into the correct positions in the large matrix
        print("Mapping valid block to full matrix...")
        final_sim_matrix[np.ix_(valid_indices, valid_indices)] = valid_sim_block
    else:
        print("Warning: No valid descriptions found. Matrix is Identity.")

    # 6. Ensure Positive Definiteness (The Cholesky Requirement)
    print("Applying Cholesky Regularization...")

    # Self-similarity (1.0) plus the epsilon jitter, written in a single pass.
    # This stores the same diagonal as "fill with 1.0, then add eps * I" but
    # does not allocate a dense N x N float64 identity matrix.
    # Items without a description are 0 everywhere except the diagonal, so they
    # form a scaled identity block; the described block is a cosine Gram matrix
    # (PSD in exact arithmetic) and the jitter lifts its eigenvalues.
    np.fill_diagonal(final_sim_matrix, 1.0 + CHOLESKY_EPSILON)

    # 7. Save
    print(f"Saving to {OUTPUT_FILE}...")
    # Make sure the destination directory exists so the finished matrix is not
    # lost to a FileNotFoundError at the very last step.
    os.makedirs(os.path.dirname(OUTPUT_FILE) or ".", exist_ok=True)
    sim_matrix_sparse = csr_matrix(final_sim_matrix)
    save_npz(OUTPUT_FILE, sim_matrix_sparse)

    print("Success. Matrix generated:")
    print(" - Missing items treated as Orthogonal (0.0 correlation)")
    print(" - Raw Cosine Similarity used (Range -1 to 1)")
    print(" - Jitter added for Cholesky stability.")

# Script entry point: run the pipeline only when this code is executed directly
# (e.g. as a script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Loading map from 20_core_1215/restaurant_item_map_rest_20.json...
Extracting descriptions from Data/meta-District_of_Columbia.json...
Filtering valid descriptions...
----------------------------------------
Total Items: 3484
Items with Description: 2435
Missing Descriptions: 1049
----------------------------------------
Initializing full matrix with Neutral (0.0) values...
Loading SentenceTransformer model...
Encoding and computing Cosine Similarity...
Mapping valid block to full matrix...
Applying Cholesky Regularization...
Saving to Z_matrixes/item_sim_cosine.npz...
Success. Matrix generated:
 - Missing items treated as Orthogonal (0.0 correlation)
 - Raw Cosine Similarity used (Range -1 to 1)
 - Jitter added for Cholesky stability.


Variable 'final_sim_matrix' saved to: Output_Data/Core10_item_sim_cosine_DENSE.npy
