This creates an item-item similarity matrix for ALL businesses, not only restaurants. Categories are one-hot encoded.

In [7]:
import json
import numpy as np
from scipy.sparse import csr_matrix, identity, save_npz
import os
import sys

# --- Configuration ---
# The item map is read from, and the similarity matrix written to, the same folder.
INPUT_DIR = OUTPUT_DIR = 'Output_Data'

# Business metadata; the script reads it as one JSON record per line.
META_FILE_PATH = 'Data/meta-District_of_Columbia.json'

# "ALL" item map produced by restaurant_user_item_matrix_all.py
ITEM_MAP_FILE = os.path.join(INPUT_DIR, 'restaurant_item_map_all.json')
# Destination of the item-item matrix Z
Z_MATRIX_FILE = os.path.join(OUTPUT_DIR, 'item_item_matrix_all.npz')

# --- Load Item Mapping ---
print(f"Loading ALL item mapping from {ITEM_MAP_FILE}...")
try:
    with open(ITEM_MAP_FILE, 'r') as f:
        # On disk the map reads {"0": "gmap_id_A", "1": "gmap_id_B"}
        loaded_map = json.load(f)
except FileNotFoundError:
    print(f"Error: {ITEM_MAP_FILE} not found. Please run the 'All' user-item matrix generator first.")
    sys.exit(1)
else:
    # Matrix construction needs the reverse direction: gmap_id -> integer row index
    item_id_to_index = {
        gmap_id: int(position) for position, gmap_id in loaded_map.items()
    }
    num_items = len(item_id_to_index)
    print(f"Loaded {num_items} items (Restaurants + Others) from mapping.")

# --- Load Metadata & Extract Categories ---
print(f"Processing metadata from {META_FILE_PATH}...")

# Inputs for the feature matrix F that is built afterwards:
#   item_categories_map: gmap_id -> cleaned, de-duplicated category labels
#   all_categories:      every distinct label seen on an item present in the map
item_categories_map = {}
all_categories = set()
skipped_lines = 0  # lines that were not a JSON object

try:
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(META_FILE_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                skipped_lines += 1
                continue
            if not isinstance(record, dict):
                # Valid JSON, but not a record we can query with .get()
                skipped_lines += 1
                continue

            gmap_id = record.get('gmap_id')
            categories = record.get('category')

            # Only items that are in our map and actually carry categories matter
            if gmap_id not in item_id_to_index or not categories:
                continue

            # A bare string would otherwise be iterated character by character
            if isinstance(categories, str):
                categories = [categories]

            # Strip first, then drop labels that end up empty; dict.fromkeys
            # removes repeats while keeping the original order.
            stripped = (str(c).strip() for c in categories if c)
            clean_cats = list(dict.fromkeys(label for label in stripped if label))
            if not clean_cats:
                continue

            # A later record with the same gmap_id replaces the earlier one
            item_categories_map[gmap_id] = clean_cats
            all_categories.update(clean_cats)
except FileNotFoundError:
    print(f"Error: Metadata file not found at {META_FILE_PATH}")
    sys.exit(1)

if skipped_lines:
    print(f"Warning: skipped {skipped_lines} malformed metadata line(s).")

print(f"Found {len(all_categories)} unique categories across {len(item_categories_map)} matched items.")

# --- Build One-Hot Feature Matrix (F) ---
print("Building One-Hot Encoded Feature Matrix F...")

# Columns: one per category, in sorted order so the layout is reproducible
sorted_categories = sorted(all_categories)
category_to_index = {cat: i for i, cat in enumerate(sorted_categories)}
num_features = len(sorted_categories)

# Coordinate lists for sparse matrix construction
rows = []
cols = []

# Walk the full item universe (from the map) so rows stay aligned with it.
# Items without metadata or categories contribute nothing (row of zeros).
for gmap_id, row_idx in item_id_to_index.items():
    # csr_matrix SUMS repeated (row, col) pairs, so a category listed twice
    # for one item would become 2.0 instead of 1.0. dict.fromkeys removes the
    # repeats (order-preserving) and keeps F strictly binary.
    for cat in dict.fromkeys(item_categories_map.get(gmap_id, ())):
        col_idx = category_to_index.get(cat)
        if col_idx is not None:
            rows.append(row_idx)
            cols.append(col_idx)

data = [1] * len(rows)  # Binary feature: one entry per (item, category) pair

# Sparse matrix F (Items x Features)
F = csr_matrix((data, (rows, cols)), shape=(num_items, num_features), dtype=np.float32)

print(f"Feature Matrix F shape: {F.shape}")
print(f"F Non-zeros: {F.nnz}")

# --- Compute Similarity Matrix Z ---
# Formula: Z = (1/m) * (F * F.T), where m is the largest entry of F * F.T
# Then enforce diagonal = 1

print("Computing Z = F * F.T (Common Neighbors)...")
# Result is N x N (Items x Items); for a binary F, entry (i, j) is the number
# of categories items i and j have in common.
FFT = F.dot(F.T)

# Find max element 'm'
print("Normalizing...")
# max() raises on a zero-size matrix (empty item map), so guard on nnz first
m = FFT.max() if FFT.nnz else 0

if m == 0:
    print("Warning: Max element is 0 (no categories found?). Returning Identity matrix.")
    # Same dtype as the product so the saved matrix does not change dtype
    # depending on which branch ran.
    Z = identity(num_items, dtype=FFT.dtype, format='csr')
else:
    # Scale by 1/m so every entry lies in [0, 1]
    Z = FFT.multiply(1.0 / m)

# Enforce diagonal elements to be 1
# Z <- Z + I - diag(diag(Z))
print("Enforcing diagonal = 1...")
# Overwrites the stored diagonal entries and adds the missing ones
# (items whose feature row is all zeros have no diagonal entry yet).
Z.setdiag(1.0)

# Ensure it's CSR for saving
Z = Z.tocsr()

# --- Save Result ---
print(f"Saving Item-Item Z-matrix (All) to {Z_MATRIX_FILE}...")
print(f"Z Matrix Shape: {Z.shape}")
print(f"Z Matrix Non-zeros: {Z.nnz}")

try:
    # Create the target folder if needed so the write cannot fail on a
    # missing directory.
    target_dir = os.path.dirname(Z_MATRIX_FILE)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    save_npz(Z_MATRIX_FILE, Z)
except OSError as e:
    print(f"Error saving matrix: {e}")
    # Same convention as the load steps: a failed run must not exit with 0
    sys.exit(1)
else:
    print("Success.")

Loading ALL item mapping from Output_Data\restaurant_item_map_all.json...
Loaded 11003 items (Restaurants + Others) from mapping.
Processing metadata from Data/meta-District_of_Columbia.json...
Found 1799 unique categories across 10980 matched items.
Building One-Hot Encoded Feature Matrix F...
Feature Matrix F shape: (11003, 1799)
F Non-zeros: 25299
Computing Z = F * F.T (Common Neighbors)...
Normalizing...
Enforcing diagonal = 1...
Saving Item-Item Z-matrix (All) to Output_Data\item_item_matrix_all.npz...
Z Matrix Shape: (11003, 11003)
Z Matrix Non-zeros: 3672149
Success.
