Creates a user-item matrix for every interaction, not filtered for restaurants only. This is not very useful.

This is an old implementation

In [9]:
import json
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, save_npz
import os
import sys

# --- Configuration ---
FILE_PATH = 'Data/review-District_of_Columbia.json'
RESTAURANT_IDS_FILE = 'Output_Data/restaurant_gmap_ids.json'
OUTPUT_DIR = 'Output_Data'

# Output files
TRAIN_OUTPUT_FILE = os.path.join(OUTPUT_DIR, 'R_train_all.npz')
TEST_OUTPUT_FILE = os.path.join(OUTPUT_DIR, 'R_test_all.npz')
USER_MAP_FILE = os.path.join(OUTPUT_DIR, 'restaurant_user_map_all.json')
ITEM_MAP_FILE = os.path.join(OUTPUT_DIR, 'restaurant_item_map_all.json')

# Ensure the output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 1. Load Valid Restaurant IDs
print(f"Loading restaurant IDs from {RESTAURANT_IDS_FILE}...")
try:
    with open(RESTAURANT_IDS_FILE, 'r') as f:
        valid_restaurant_ids = set(json.load(f))
    print(f"Loaded {len(valid_restaurant_ids)} valid restaurant IDs.")
except FileNotFoundError:
    print(f"Error: Could not find {RESTAURANT_IDS_FILE}. Please run the filter script first.")
    sys.exit(1)

print(f"Loading ALL data from {FILE_PATH}...")

# We use a dictionary of lists to build the dataframe columns directly.
extracted_data = {
    'user_id': [],
    'gmap_id': [],
    'rating': [],
    'time': [] # Needed for splitting
}

# 2. Load Review Data
# NOTE: We load ALL reviews now, not just restaurants, to enrich the training data.
try:
    with open(FILE_PATH, 'r') as f:
        for i, line in enumerate(f):
            if not line.strip():
                continue

            try:
                record = json.loads(line)

                # We store everything, no filtering by gmap_id here
                extracted_data['user_id'].append(record.get('user_id'))
                extracted_data['gmap_id'].append(record.get('gmap_id'))
                extracted_data['rating'].append(record.get('rating'))
                extracted_data['time'].append(record.get('time'))

            except json.JSONDecodeError:
                continue

            if i % 100000 == 0 and i > 0:
                print(f"Processed {i} rows...", end='\r')

    print(f"Finished loading {len(extracted_data['rating'])} total records.")

except FileNotFoundError:
    print(f"Error: Review file not found at {FILE_PATH}")
    sys.exit(1)

# 3. Create DataFrame and Clean
df = pd.DataFrame(extracted_data)
del extracted_data

# Drop rows with missing values
initial_len = len(df)
df.dropna(subset=['user_id', 'gmap_id', 'rating', 'time'], inplace=True)
dropped_count = initial_len - len(df)
if dropped_count > 0:
    print(f"Dropped {dropped_count} rows containing missing data.")

print("Mapping categories...")

# 4. Map String IDs to Integer Indices

# User Mapping
user_cat = df['user_id'].astype('category')
user_indices = user_cat.cat.codes
user_mapper = dict(enumerate(user_cat.cat.categories))

# Item Mapping
# The Universe of Items must include:
# 1. All valid restaurants (so our restaurant-specific matrices align)
# 2. All other items found in the review dataset (so we can map the non-restaurant reviews)
unique_review_items = set(df['gmap_id'].unique())
universe_of_items = sorted(list(valid_restaurant_ids.union(unique_review_items)))

item_cat = pd.Categorical(df['gmap_id'], categories=universe_of_items)
item_indices = item_cat.codes
item_mapper = dict(enumerate(item_cat.categories))

# Add integer indices to DataFrame
df['user_idx'] = user_indices
df['item_idx'] = item_indices

print(f"Total Reviews: {len(df)}")
print(f"Unique Users: {len(user_mapper)}")
print(f"Unique Items (Restaurants + Others): {len(item_mapper)}")

print("Splitting data (Leave-One-Last-Restaurant)...")

# 5. Split Data
# Logic:
# - Test Set: The MOST RECENT review for each user *that is a restaurant*.
# - Train Set: All other reviews (historical restaurants + all non-restaurants).

# Helper column to identify restaurants
df['is_restaurant'] = df['gmap_id'].isin(valid_restaurant_ids)

# Sort by User and Time (Descending)
df.sort_values(by=['user_idx', 'time'], ascending=[True, False], inplace=True)

# Find the index of the latest restaurant review for each user
# We filter for restaurants, group by user, and take the top 1.
latest_restaurant_indices = df[df['is_restaurant']].groupby('user_idx').head(1).index

# Create Splits
df_test = df.loc[latest_restaurant_indices]
df_train = df.drop(latest_restaurant_indices)

print(f"Train interactions: {len(df_train)}")
print(f"Test interactions:  {len(df_test)}")

print("Building sparse matrices...")

# 6. Create the Sparse Matrices
matrix_shape = (len(user_mapper), len(item_mapper))

R_train = csr_matrix(
    (df_train['rating'], (df_train['user_idx'], df_train['item_idx'])),
    shape=matrix_shape
)

R_test = csr_matrix(
    (df_test['rating'], (df_test['user_idx'], df_test['item_idx'])),
    shape=matrix_shape
)

# 7. Output Results
print("\n--- Matrix Properties ---")
print(f"Shape: {matrix_shape} (Users, All Items)")
print(f"Train Matrix Non-zeros: {R_train.nnz}")
print(f"Test Matrix Non-zeros: {R_test.nnz}")

# 8. Save Sparse Matrices and Mappers
print("\nSaving matrices and mappers...")
try:
    save_npz(TRAIN_OUTPUT_FILE, R_train)
    save_npz(TEST_OUTPUT_FILE, R_test)

    # Save the mappers
    with open(USER_MAP_FILE, 'w') as f:
        json.dump({str(k): v for k, v in user_mapper.items()}, f)

    with open(ITEM_MAP_FILE, 'w') as f:
        json.dump({str(k): v for k, v in item_mapper.items()}, f)

    print(f"Successfully saved matrices to {OUTPUT_DIR}")
    print(f" - {TRAIN_OUTPUT_FILE}")
    print(f" - {TEST_OUTPUT_FILE}")

except Exception as e:
    print(f"An error occurred while saving files: {e}")

Loading restaurant IDs from Output_Data/restaurant_gmap_ids.json...
Loaded 3557 valid restaurant IDs.
Loading ALL data from Data/review-District_of_Columbia.json...
Finished loading 1894317 total records.
Dropped 26220 rows containing missing data.
Mapping categories...
Total Reviews: 1868097
Unique Users: 753560
Unique Items (Restaurants + Others): 11003
Splitting data (Leave-One-Last-Restaurant)...
Train interactions: 1436140
Test interactions:  431957
Building sparse matrices...

--- Matrix Properties ---
Shape: (753560, 11003) (Users, All Items)
Train Matrix Non-zeros: 1420443
Test Matrix Non-zeros: 431957

Saving matrices and mappers...
Successfully saved matrices to Output_Data
 - Output_Data\R_train_all.npz
 - Output_Data\R_test_all.npz
