User-Item matrix with all users. Does not work because it you can't make a test set that matches shape because some users just have 1 review.

This is an old implementation

In [6]:
import json
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, save_npz
import os
import sys

# --- Configuration ---
FILE_PATH = 'Data/review-District_of_Columbia.json'
RESTAURANT_IDS_FILE = 'Output_Data/restaurant_gmap_ids.json'
OUTPUT_DIR = 'Output_Data'

# Output files
TRAIN_OUTPUT_FILE = os.path.join(OUTPUT_DIR, 'R_train_rest.npz')
TEST_OUTPUT_FILE = os.path.join(OUTPUT_DIR, 'R_test_rest.npz')
USER_MAP_FILE = os.path.join(OUTPUT_DIR, 'restaurant_user_map_rest.json')
ITEM_MAP_FILE = os.path.join(OUTPUT_DIR, 'restaurant_item_map_rest.json')

# Ensure the output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 1. Load Valid Restaurant IDs
print(f"Loading restaurant IDs from {RESTAURANT_IDS_FILE}...")
try:
    with open(RESTAURANT_IDS_FILE, 'r') as f:
        valid_restaurant_ids = set(json.load(f))
    print(f"Loaded {len(valid_restaurant_ids)} valid restaurant IDs.")
except FileNotFoundError:
    print(f"Error: Could not find {RESTAURANT_IDS_FILE}. Please run the filter script first.")
    sys.exit(1)

print(f"Loading data from {FILE_PATH}...")

# We use a dictionary of lists to build the dataframe columns directly.
extracted_data = {
    'user_id': [],
    'gmap_id': [],
    'rating': [],
    'time': [] # Needed for splitting
}

# 2. Load Review Data
try:
    with open(FILE_PATH, 'r') as f:
        for i, line in enumerate(f):
            if not line.strip():
                continue

            try:
                record = json.loads(line)
                gmap_id = record.get('gmap_id')

                # FILTER: Only keep if it's a known restaurant
                if gmap_id in valid_restaurant_ids:
                    extracted_data['user_id'].append(record.get('user_id'))
                    extracted_data['gmap_id'].append(gmap_id)
                    extracted_data['rating'].append(record.get('rating'))
                    extracted_data['time'].append(record.get('time'))
            except json.JSONDecodeError:
                continue

            if i % 100000 == 0 and i > 0:
                print(f"Processed {i} rows...", end='\r')

    print(f"Finished loading {len(extracted_data['rating'])} restaurant records.")

except FileNotFoundError:
    print(f"Error: Review file not found at {FILE_PATH}")
    sys.exit(1)

# 3. Create DataFrame and Clean
df = pd.DataFrame(extracted_data)
del extracted_data

# Drop rows with missing values (None/NaN)
initial_len = len(df)
df.dropna(subset=['user_id', 'gmap_id', 'rating', 'time'], inplace=True)
dropped_count = initial_len - len(df)
if dropped_count > 0:
    print(f"Dropped {dropped_count} rows containing missing data.")

print("Mapping categories...")

# 4. Map String IDs to Integer Indices

# User Mapping
# We map only users who actually reviewed a restaurant
# 'category' type stores unique values in .cat.categories
user_cat = df['user_id'].astype('category')
user_indices = user_cat.cat.codes
user_mapper = dict(enumerate(user_cat.cat.categories))

# Item Mapping
# The columns of the matrix must represent the UNIVERSE of restaurants
# defined in restaurant_gmap_ids.json to ensure consistent shapes.
sorted_restaurant_ids = sorted(list(valid_restaurant_ids))
item_cat = pd.Categorical(df['gmap_id'], categories=sorted_restaurant_ids)
item_indices = item_cat.codes
item_mapper = dict(enumerate(item_cat.categories))

# Add integer indices to DataFrame
df['user_idx'] = user_indices
df['item_idx'] = item_indices

print(f"Total Reviews (DataFrame rows): {len(df)}")
print(f"Unique Users (Matrix rows): {len(user_mapper)}")
print(f"Unique Restaurants (Matrix cols): {len(item_mapper)}")
print(f"Average reviews per user: {len(df) / len(user_mapper):.2f}")

print("Splitting data (Leave-One-Last)...")

# 5. Split Data: Drop most recent review for Test set
# Sort by User and Time (Descending) so the latest review is first
df.sort_values(by=['user_idx', 'time'], ascending=[True, False], inplace=True)

# The first record for each user is their latest review -> TEST set
is_latest_mask = ~df.duplicated(subset=['user_idx'], keep='first')

df_test = df[is_latest_mask]
df_train = df[~is_latest_mask]

print(f"Train interactions: {len(df_train)}")
print(f"Test interactions:  {len(df_test)}")

print("Building sparse matrices...")

# 6. Create the Sparse Matrices
# The shape is (Number of Users, Number of Items)
matrix_shape = (len(user_mapper), len(item_mapper))

R_train = csr_matrix(
    (df_train['rating'], (df_train['user_idx'], df_train['item_idx'])),
    shape=matrix_shape
)

R_test = csr_matrix(
    (df_test['rating'], (df_test['user_idx'], df_test['item_idx'])),
    shape=matrix_shape
)

# 7. Output Results
print("\n--- Matrix Properties ---")
print(f"Shape: {matrix_shape} (Users, Restaurants)")
print(f"Train Matrix Non-zeros: {R_train.nnz}")
print(f"Test Matrix Non-zeros: {R_test.nnz}")
# Calculate sparsity
total_elements = matrix_shape[0] * matrix_shape[1]
sparsity = 1.0 - ((R_train.nnz + R_test.nnz) / total_elements)
print(f"Matrix Sparsity: {sparsity:.6%}")

# 8. Save Sparse Matrices and Mappers
print("\nSaving matrices and mappers...")
try:
    save_npz(TRAIN_OUTPUT_FILE, R_train)
    save_npz(TEST_OUTPUT_FILE, R_test)

    # Save the mappers
    with open(USER_MAP_FILE, 'w') as f:
        json.dump({str(k): v for k, v in user_mapper.items()}, f)

    with open(ITEM_MAP_FILE, 'w') as f:
        json.dump({str(k): v for k, v in item_mapper.items()}, f)

    print(f"Successfully saved matrices to {OUTPUT_DIR}")
    print(f" - {TRAIN_OUTPUT_FILE}")
    print(f" - {TEST_OUTPUT_FILE}")

except Exception as e:
    print(f"An error occurred while saving files: {e}")

Loading restaurant IDs from Output_Data/restaurant_gmap_ids.json...
Loaded 3557 valid restaurant IDs.
Loading data from Data/review-District_of_Columbia.json...
Finished loading 989164 restaurant records.
Dropped 590 rows containing missing data.
Mapping categories...
Total Reviews (DataFrame rows): 988574
Unique Users (Matrix rows): 431957
Unique Restaurants (Matrix cols): 3557
Average reviews per user: 2.29
Splitting data (Leave-One-Last)...
Train interactions: 556617
Test interactions:  431957
Building sparse matrices...

--- Matrix Properties ---
Shape: (431957, 3557) (Users, Restaurants)
Train Matrix Non-zeros: 550745
Test Matrix Non-zeros: 431957
Matrix Sparsity: 99.936042%

Saving matrices and mappers...
Successfully saved matrices to Output_Data
 - Output_Data\R_train_rest.npz
 - Output_Data\R_test_rest.npz
