This is the final implementation of the user-item matrix script. It creates 4 outputs, R_train_rest (missing last 2 interactions), R_valid_rest (the 2nd to last interaction) and R_test_rest, the last interaction. R_retrain_rest is the R matrix that contains the 2nd to last interaction but not the last (validation set added back in)

Min users must be 3 or more for non-trivial results (otherwise there are users with zero training data in R_train_rest)


In [2]:
import json
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix, save_npz
import os
import sys

# --- Configuration ---
# Adjustable parameter for N-core filter
MIN_REVIEWS_PER_USER = 20

# Paths
FILE_PATH = 'Data/review-District_of_Columbia.json' # Raw dataset
# Input file remains in Output_Data (assuming previous step saved it there)
RESTAURANT_IDS_FILE = 'Output_Data/restaurant_gmap_ids_1215.json'
# NEW: Save outputs to n-core folder
OUTPUT_DIR = f'{MIN_REVIEWS_PER_USER}_core_1215'

# Output Filenames (Dynamic based on N)
TRAIN_FILE = os.path.join(OUTPUT_DIR, f'R_train_rest_{MIN_REVIEWS_PER_USER}.npz')
VALID_FILE = os.path.join(OUTPUT_DIR, f'R_valid_rest_{MIN_REVIEWS_PER_USER}.npz')
TEST_FILE  = os.path.join(OUTPUT_DIR, f'R_test_rest_{MIN_REVIEWS_PER_USER}.npz')
RETRAIN_FILE = os.path.join(OUTPUT_DIR, f'R_retrain_rest_{MIN_REVIEWS_PER_USER}.npz') # Train + Valid

USER_MAP_FILE = os.path.join(OUTPUT_DIR, f'restaurant_user_map_rest_{MIN_REVIEWS_PER_USER}.json')
ITEM_MAP_FILE = os.path.join(OUTPUT_DIR, f'restaurant_item_map_rest_{MIN_REVIEWS_PER_USER}.json')

os.makedirs(OUTPUT_DIR, exist_ok=True)

def main():
    print(f"--- Starting N-Core Matrix Generation (N={MIN_REVIEWS_PER_USER}) ---")

    # 1. Load Valid Restaurant IDs
    print(f"Loading restaurant IDs from {RESTAURANT_IDS_FILE}...")
    try:
        with open(RESTAURANT_IDS_FILE, 'r') as f:
            valid_restaurant_ids = set(json.load(f))
        print(f"Loaded {len(valid_restaurant_ids)} valid restaurant IDs.")
    except FileNotFoundError:
        print(f"Error: Could not find {RESTAURANT_IDS_FILE}. Please run the filter script first.")
        sys.exit(1)

    print(f"Loading data from {FILE_PATH}...")

    extracted_data = {
        'user_id': [],
        'gmap_id': [],
        'rating': [],
        'time': []
    }

    # 2. Load Review Data & Filter Non-Restaurants
    try:
        with open(FILE_PATH, 'r') as f:
            # Handle both list-of-dicts and line-delimited JSON
            try:
                first_char = f.read(1)
                f.seek(0)
                if first_char == '[':
                    data = json.load(f)
                    iterator = data
                else:
                    iterator = f
            except Exception:
                f.seek(0)
                iterator = f

            for i, record in enumerate(iterator):
                if isinstance(record, str):
                    try:
                        record = json.loads(record)
                    except json.JSONDecodeError:
                        continue

                gmap_id = record.get('gmap_id')

                # FIRST FILTER: Only keep restaurants
                if gmap_id in valid_restaurant_ids:
                    extracted_data['user_id'].append(record.get('user_id'))
                    extracted_data['gmap_id'].append(gmap_id)
                    extracted_data['rating'].append(record.get('rating'))
                    extracted_data['time'].append(record.get('time'))

                if i % 100000 == 0 and i > 0:
                    print(f"Processed {i} raw rows...", end='\r')

    except FileNotFoundError:
        print(f"Error: Review file not found at {FILE_PATH}")
        sys.exit(1)

    # 3. Create DataFrame
    df = pd.DataFrame(extracted_data)
    del extracted_data
    print(f"\nInitial Restaurant Reviews: {len(df)}")

    # Clean missing data
    df.dropna(subset=['user_id', 'gmap_id', 'rating', 'time'], inplace=True)

    # 4. Apply N-Core Filter (Users must have >= N restaurant reviews)
    print(f"Applying {MIN_REVIEWS_PER_USER}-core filter...")

    # Count reviews per user
    user_counts = df['user_id'].value_counts()

    # Identify valid users
    valid_users = user_counts[user_counts >= MIN_REVIEWS_PER_USER].index

    # Filter DataFrame
    initial_users = len(user_counts)
    df = df[df['user_id'].isin(valid_users)]

    final_users = df['user_id'].nunique()
    print(f"Users reduced from {initial_users} to {final_users}")
    print(f"Total Reviews remaining: {len(df)}")

    if len(df) == 0:
        print("Error: Dataset empty after N-core filter. Lower the N value.")
        sys.exit(1)

    # 5. Map IDs to Integers
    print("Mapping IDs...")

    # User Mapping
    # Users are mapped based on who remains in the N-core set
    user_cat = df['user_id'].astype('category')
    user_indices = user_cat.cat.codes
    user_mapper = dict(enumerate(user_cat.cat.categories))

    # Item Mapping (CRITICAL UPDATE)
    # We must include ALL valid restaurants in the mapping, even those with 0 reviews in this subset.
    # We sort them to ensure a deterministic order.
    all_restaurants_list = sorted(list(valid_restaurant_ids))

    # Force the column to be categorical with the full set of categories
    df['gmap_id'] = pd.Categorical(df['gmap_id'], categories=all_restaurants_list)

    # Get codes (this works because we set the categories explicitly above)
    item_indices = df['gmap_id'].cat.codes

    # Create the item mapper for the full set
    item_mapper = {i: v for i, v in enumerate(all_restaurants_list)}

    df['user_idx'] = user_indices
    df['item_idx'] = item_indices

    num_users = len(user_mapper)
    num_items = len(item_mapper) # This now equals len(valid_restaurant_ids)

    print(f"Matrix Dimensions: {num_users} Users x {num_items} Items")
    print("(Note: num_items matches the total number of restaurants, preserving empty columns)")

    # 6. Split Data (Train / Valid / Test)
    print("Splitting data (Test=Last, Valid=2nd Last)...")

    # Sort by User then Time (Newest first)
    df.sort_values(by=['user_idx', 'time'], ascending=[True, False], inplace=True)

    # Calculate rank: 0 = Newest, 1 = 2nd Newest, etc.
    df['rank'] = df.groupby('user_idx').cumcount()

    df_test = df[df['rank'] == 0]
    df_valid = df[df['rank'] == 1]
    df_train = df[df['rank'] >= 2]

    print(f"Train size: {len(df_train)}")
    print(f"Valid size: {len(df_valid)}")
    print(f"Test size:  {len(df_test)}")

    # 7. Build Sparse Matrices
    print("Constructing Sparse Matrices...")
    shape = (num_users, num_items)

    R_train = csr_matrix((df_train['rating'], (df_train['user_idx'], df_train['item_idx'])), shape=shape)
    R_valid = csr_matrix((df_valid['rating'], (df_valid['user_idx'], df_valid['item_idx'])), shape=shape)
    R_test  = csr_matrix((df_test['rating'],  (df_test['user_idx'],  df_test['item_idx'])),  shape=shape)

    # Create Retrain Matrix (Train + Valid)
    print("Creating Retrain Matrix (Train + Valid)...")
    R_retrain = R_train + R_valid

    # 8. Save
    print(f"Saving outputs to '{OUTPUT_DIR}'...")
    save_npz(TRAIN_FILE, R_train)
    save_npz(VALID_FILE, R_valid)
    save_npz(TEST_FILE, R_test)
    save_npz(RETRAIN_FILE, R_retrain)

    with open(USER_MAP_FILE, 'w') as f:
        json.dump({str(k): str(v) for k, v in user_mapper.items()}, f)
    with open(ITEM_MAP_FILE, 'w') as f:
        json.dump({str(k): str(v) for k, v in item_mapper.items()}, f)

    print("Done! Files generated:")
    print(f"1. {TRAIN_FILE}")
    print(f"2. {VALID_FILE}")
    print(f"3. {TEST_FILE}")
    print(f"4. {RETRAIN_FILE}")

if __name__ == "__main__":
    main()

--- Starting N-Core Matrix Generation (N=20) ---
Loading restaurant IDs from Output_Data/restaurant_gmap_ids_1215.json...
Loaded 3484 valid restaurant IDs.
Loading data from Data/review-District_of_Columbia.json...
Processed 1800000 raw rows...
Initial Restaurant Reviews: 962174
Applying 20-core filter...
Users reduced from 423234 to 4060
Total Reviews remaining: 147160
Mapping IDs...
Matrix Dimensions: 4060 Users x 3484 Items
(Note: num_items matches the total number of restaurants, preserving empty columns)
Splitting data (Test=Last, Valid=2nd Last)...
Train size: 139040
Valid size: 4060
Test size:  4060
Constructing Sparse Matrices...
Creating Retrain Matrix (Train + Valid)...
Saving outputs to '20_core_1215'...
Done! Files generated:
1. 20_core_1215\R_train_rest_20.npz
2. 20_core_1215\R_valid_rest_20.npz
3. 20_core_1215\R_test_rest_20.npz
4. 20_core_1215\R_retrain_rest_20.npz
