This is a different version of category item-item sim. This uses jaccard similarity to prevent rewarding restaurants with loads of categories.

In [5]:
import json
import os
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import csr_matrix, save_npz  # <--- Added imports

# --- Configuration ---
ID_MAP_PATH = '20_core_1215/restaurant_item_map_rest_20.json'
META_DATA_PATH = 'Data/meta-District_of_Columbia.json'
OUTPUT_DIR = 'Z_matrixes'

# Matrix file
OUTPUT_FILE = os.path.join(OUTPUT_DIR, 'item_item_jaccard.npz')
# ID mapping file (Required because save_npz can't store IDs inside)
OUTPUT_ID_FILE = os.path.join(OUTPUT_DIR, 'item_item_jaccard_ids.json')

def load_json_lines(path):
    with open(path, 'r') as f:
        for line in f:
            yield json.loads(line)

def main():
    print("Loading Index Map...")
    with open(ID_MAP_PATH, 'r') as f:
        raw_map = json.load(f)

    # --- Robust Map Detection ---
    first_key = list(raw_map.keys())[0]
    first_val = list(raw_map.values())[0]

    # Check if map is {Index: ID} or {ID: Index}
    if str(first_key).isdigit() and not str(first_val).isdigit():
        print("Detected map format: {Index: ID}. Inverting...")
        sorted_items = sorted(raw_map.items(), key=lambda x: int(x[0]))
        sorted_gmap_ids = [v for k, v in sorted_items]
    else:
        print("Detected map format: {ID: Index}.")
        sorted_items = sorted(raw_map.items(), key=lambda x: x[1])
        sorted_gmap_ids = [k for k, v in sorted_items]

    sorted_gmap_ids = [str(uid) for uid in sorted_gmap_ids]
    target_ids_set = set(sorted_gmap_ids)
    n_items = len(sorted_gmap_ids)
    print(f"Total items: {n_items}")

    print("Parsing Metadata...")
    category_data = {}
    matched_count = 0

    for entry in load_json_lines(META_DATA_PATH):
        g_id = str(entry.get('gmap_id'))
        if g_id in target_ids_set:
            matched_count += 1
            raw_cats = entry.get('category')
            if raw_cats:
                filtered_cats = [c for c in raw_cats if c.lower() != 'restaurant']
                category_data[g_id] = filtered_cats
            else:
                category_data[g_id] = []

    if matched_count == 0:
        print("CRITICAL ERROR: No matching IDs found in metadata.")
        return

    # Align data
    aligned_categories = []
    for g_id in sorted_gmap_ids:
        aligned_categories.append(category_data.get(g_id, []))

    print("Vectorizing categories...")
    mlb = MultiLabelBinarizer(sparse_output=True)
    X = mlb.fit_transform(aligned_categories)
    X = X.astype(np.float32)

    print("Computing Jaccard Similarity Matrix...")
    intersection = X.dot(X.T).toarray()
    row_sums = np.array(X.sum(axis=1)).flatten()
    union = row_sums[:, None] + row_sums[None, :] - intersection

    with np.errstate(divide='ignore', invalid='ignore'):
        jaccard_matrix = intersection / union
        jaccard_matrix[np.isnan(jaccard_matrix)] = 0.0

    print(f"Matrix shape: {jaccard_matrix.shape}")

    # --- Output Section ---
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    print("Converting to Sparse CSR Matrix...")
    # Convert dense numpy array to sparse matrix
    Z_sparse = csr_matrix(jaccard_matrix)

    print(f"Saving Matrix to {OUTPUT_FILE}...")
    # This is the exact function you wanted
    save_npz(OUTPUT_FILE, Z_sparse)

    print(f"Saving ID List to {OUTPUT_ID_FILE}...")
    # We save IDs separately because save_npz cannot store them
    with open(OUTPUT_ID_FILE, 'w') as f:
        json.dump(sorted_gmap_ids, f)

    print("Done.")

if __name__ == "__main__":
    main()

Loading Index Map...
Detected map format: {Index: ID}. Inverting...
Total items: 3484
Parsing Metadata...
Vectorizing categories...
Computing Jaccard Similarity Matrix...
Matrix shape: (3484, 3484)
Converting to Sparse CSR Matrix...
Saving Matrix to Z_matrixes\item_item_jaccard.npz...
Saving ID List to Z_matrixes\item_item_jaccard_ids.json...
Done.


In [4]:
#this just checks that all the cells are filled

import numpy as np

# Load the file
data = np.load('Z_matrixes/item_item_jaccard.npz')
matrix = data['data']

# Count non-zeros
non_zeros = np.count_nonzero(matrix)
total_elements = matrix.size
sparsity = 1.0 - (non_zeros / total_elements)

print(f"Total elements: {total_elements}")
print(f"Non-zero elements: {non_zeros}")
print(f"Sparsity: {sparsity:.4%}")

Total elements: 1330755
Non-zero elements: 1330755
Sparsity: 0.0000%
