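This notebook builds the item-item (restaurant-restaurant) similarity matrix from geographic location: pairwise haversine distances between restaurants are normalized by the largest distance, and similarity is defined as 1 minus that normalized distance.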

In [3]:
import pandas as pd
import numpy as np
import json
import os
from scipy.sparse import csr_matrix, save_npz
from sklearn.metrics.pairwise import haversine_distances

# --- Configuration ---
# Inputs: the index -> restaurant-id map lives under INPUT_DIR; the metadata
# file is read as JSON-lines keyed by 'gmap_id'.
INPUT_DIR = '20_core_1215'
MAP_FILE = os.path.join(INPUT_DIR, 'restaurant_item_map_rest_20.json')
META_FILE = 'Data/meta-District_of_Columbia.json'

# Outputs: the similarity matrix is written as a SciPy sparse .npz file.
# The output directory is created up front so the final save cannot fail on a
# missing folder; exist_ok=True keeps re-running this cell harmless.
OUT_DIR = 'Z_matrixes'
os.makedirs(OUT_DIR, exist_ok=True)
OUT_FILE = os.path.join(OUT_DIR, 'restaurant_geo_similarity.npz')

# 1. Load & Sort Map (Ensure index 0, 1, 2... matches matrix rows)
# The map file is a JSON object whose keys are stringified integers
# ('0', '1', ...) and whose values are the ids looked up in the metadata.
with open(MAP_FILE, 'r') as map_fh:
    item_map = json.loads(map_fh.read())

# JSON keys are strings, so a plain sort would be lexicographic ('10' < '2').
# Convert to int first, sort numerically, then look each key up again.
numeric_keys = sorted(map(int, item_map))
ordered_ids = [item_map[str(idx)] for idx in numeric_keys]

# 2. Load Meta & Align
# Read JSON -> Set Index -> Remove Duplicates -> Reindex to match Map order
meta_df = pd.read_json(META_FILE, lines=True).set_index('gmap_id')
# Keep the first record per gmap_id; reindex() below requires a unique index.
meta_df = meta_df[~meta_df.index.duplicated(keep='first')]

# Lay the coordinates out in MAP order: row i of coords_by_item belongs to
# ordered_ids[i]. Ids missing from the metadata become all-NaN rows.
# (Selecting with meta_df.index.intersection(ordered_ids) instead would return
# rows in metadata-file order, which does not match valid_indices and would
# scatter similarities onto the wrong restaurants in the full matrix.)
coords_by_item = meta_df[['latitude', 'longitude']].reindex(ordered_ids)

# An item is usable only when both latitude and longitude are present, so that
# no NaN reaches the distance computation.
has_coords = coords_by_item.notna().all(axis=1).to_numpy()

# valid_indices: positions in ordered_ids (= rows/cols of the final matrix).
# valid_df: coordinates for exactly those positions, in the same order.
valid_indices = np.flatnonzero(has_coords).tolist()
valid_df = coords_by_item[has_coords]

# Invariant the reconstruction step relies on: k-th coordinate row <-> k-th index.
assert valid_df.index.tolist() == [ordered_ids[i] for i in valid_indices]

# 3. Compute Distances & Outliers
# haversine_distances expects [latitude, longitude] in radians and returns
# great-circle angles in radians; scaling by 6371 (Earth radius in km)
# converts them to kilometres.
rads = np.radians(valid_df)
dist_matrix = 6371 * haversine_distances(rads, rads)

# Outlier report: restaurants whose mean distance to all rows (including the
# zero distance to themselves) exceeds 20 km. They are only collected into a
# list here; nothing is removed from dist_matrix.
avg_dists = np.mean(dist_matrix, axis=1)
is_outlier = avg_dists > 20
outliers = valid_df.index[is_outlier].tolist()

# 4. Compute Similarity (1 - Normalized Distance)
# Distances are scaled by the largest pairwise distance so similarity lies in
# [0, 1]: 1 for co-located restaurants, 0 for the farthest pair.
max_dist = np.max(dist_matrix)
if max_dist > 0:
    sim_dense = 1 - (dist_matrix / max_dist)
else:
    # Every pairwise distance is zero: avoid dividing by zero and treat all
    # restaurants as fully similar.
    sim_dense = np.ones_like(dist_matrix)

# 5. Reconstruct Full NxN Matrix
# One row/column per entry of ordered_ids. Entries involving an item that is
# not in valid_indices stay at 0.
n_items = len(ordered_ids)
full_matrix = np.zeros((n_items, n_items), dtype=np.float32)

# np.ix_ builds an open mesh (column vector x row vector) so the assignment
# targets the valid-by-valid sub-matrix rather than paired (i, i) elements.
row_sel, col_sel = np.ix_(valid_indices, valid_indices)
full_matrix[row_sel, col_sel] = sim_dense

# Self-similarity is forced to 1 for every item, including those that were
# not part of the valid sub-matrix.
np.fill_diagonal(full_matrix, 1.0)

# 6. Save & Output
# Convert the dense float32 matrix to CSR and write it in SciPy's .npz format.
sparse_sim = csr_matrix(full_matrix)
save_npz(OUT_FILE, sparse_sim)

# Summary: output shape/path, the outlier list, and distance statistics taken
# over every entry of dist_matrix (the zero diagonal is included).
report_lines = [
    f"Saved {full_matrix.shape} matrix to {OUT_FILE}",
    f"Outlier Restaurants (>20km avg): {outliers}",
    f"Stats (km) -> Mean: {dist_matrix.mean():.2f} | Median: {np.median(dist_matrix):.2f} | Max: {dist_matrix.max():.2f}",
]
print("\n".join(report_lines))

Saved (3484, 3484) matrix to Z_matrixes\restaurant_geo_similarity.npz
Outlier Restaurants (>20km avg): []
Stats (km) -> Mean: 3.84 | Median: 3.41 | Max: 18.22
