In [15]:
# All imports
import pandas as pd
import numpy as np
import ast
import json
from collections import defaultdict
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.ensemble import RandomForestRegressor
from sentence_transformers import SentenceTransformer

# Load data
dataVer10 = "review-Vermont-10.json.gz"
dataVer = "review-Vermont.json.gz"
dataMeta = "meta-Vermont.json.gz"

df_meta = pd.read_json(dataMeta, lines=True, compression="gzip")
df_ver = pd.read_json(dataVer, lines=True, compression="gzip")

In [16]:
# Merge and preprocess data
df = df_ver.merge(df_meta, on="gmap_id", how="left")

print("Before removing missing text:", len(df))
df = df.dropna(subset=["text"])
print("After removing missing text:", len(df))

print("Before removing duplicates:", len(df))
df = df.drop_duplicates(subset=['user_id', 'gmap_id'])
print("After removing duplicates:", len(df))

# Remove unnecessary columns
useless = ["name_x", "name_y", "time", "pics", "resp", "address", "relative_results", "state", "url", "latitude", "longitude"]
maybe = ["description", "num_of_reviews"]
df = df.drop(columns=useless + maybe)

# Remove missing ratings
print("Before removing missing ratings:", len(df))
df = df.dropna(subset=['rating'])
print("After removing missing ratings:", len(df))

# Save preprocessed data
df.to_csv("merged.csv", index=False)
print("Preprocessed data saved to merged.csv")

Before removing missing text: 853549
After removing missing text: 508108
Before removing duplicates: 508108
After removing duplicates: 488212
Before removing missing ratings: 488212
After removing missing ratings: 488052
Preprocessed data saved to merged.csv


In [17]:
# Display sample of preprocessed data
df.head()

Unnamed: 0,user_id,rating,text,gmap_id,category,avg_rating,price,hours,MISC
0,1.044905e+20,5.0,The Royal Group recently performed standard te...,0x89e02445cb9db457:0x37f42bff4edf7a43,"[Security system supplier, Fire protection equ...",4.9,,"[[Thursday, 8AM–5PM], [Friday, 8AM–5PM], [Satu...",{'Accessibility': ['Wheelchair accessible entr...
2,1.120627e+20,5.0,I can't say enough great things about The Roya...,0x89e02445cb9db457:0x37f42bff4edf7a43,"[Security system supplier, Fire protection equ...",4.9,,"[[Thursday, 8AM–5PM], [Friday, 8AM–5PM], [Satu...",{'Accessibility': ['Wheelchair accessible entr...
4,1.100483e+20,5.0,The Royal Group has done work for us over many...,0x89e02445cb9db457:0x37f42bff4edf7a43,"[Security system supplier, Fire protection equ...",4.9,,"[[Thursday, 8AM–5PM], [Friday, 8AM–5PM], [Satu...",{'Accessibility': ['Wheelchair accessible entr...
6,1.061744e+20,5.0,The Royal Group was fantastic to work with. I ...,0x89e02445cb9db457:0x37f42bff4edf7a43,"[Security system supplier, Fire protection equ...",4.9,,"[[Thursday, 8AM–5PM], [Friday, 8AM–5PM], [Satu...",{'Accessibility': ['Wheelchair accessible entr...
8,1.062387e+20,5.0,"Have used in different houses, installing mult...",0x89e02445cb9db457:0x37f42bff4edf7a43,"[Security system supplier, Fire protection equ...",4.9,,"[[Thursday, 8AM–5PM], [Friday, 8AM–5PM], [Satu...",{'Accessibility': ['Wheelchair accessible entr...


# III. Modeling

This section describes the models we implement for rating prediction. We start with simple baselines and progressively build more sophisticated models that incorporate text features and metadata.


## A. Baseline Models

We implement two simple baseline models to establish a performance floor:

1. **Global Mean**: Predicts the average rating across all training examples
2. **Item Mean (gmap_id mean)**: Predicts the average rating for each business (gmap_id)


In [18]:
# Load data for baseline models
df = pd.read_csv("merged.csv", usecols=['user_id', 'rating', 'gmap_id'])

# Clean and prepare data
df = df.dropna(subset=['user_id', 'rating', 'gmap_id'])
df['rating'] = df['rating'].astype(float)

# Train/Test split
train, test = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['rating']
)

print(f"Train size: {len(train)}, Test size: {len(test)}")

# Baseline 1: Global Mean
# Simply predict the average rating across all training examples
global_mean = train['rating'].mean()
print(f"Global Mean: {global_mean:.4f}")

# Make predictions
preds_global = [global_mean] * len(test)

# Evaluate
mse_global = mean_squared_error(test['rating'], preds_global)
# Convert predictions to integers 1-5 for accuracy calculation
preds_global_int = [min(5, max(1, int(round(p)))) for p in preds_global]
acc_global = accuracy_score(test['rating'], preds_global_int)

print(f"Global Mean - MSE: {mse_global:.4f}, Accuracy: {acc_global:.4f}")

# Baseline 2: Item Mean (gmap_id mean)
# Predict the average rating for each business
item_avg = train.groupby('gmap_id')['rating'].mean()
global_mean = train['rating'].mean()

# Make predictions
preds_item = []
for gmap_id in test['gmap_id']:
    if gmap_id in item_avg:
        # Use item average if we've seen this business before
        preds_item.append(item_avg[gmap_id])
    else:
        # Fall back to global mean for unseen businesses
        preds_item.append(global_mean)

# Evaluate
mse_item = mean_squared_error(test['rating'], preds_item)
preds_item_int = [min(5, max(1, int(round(p)))) for p in preds_item]
acc_item = accuracy_score(test['rating'], preds_item_int)

print(f"Item Mean - MSE: {mse_item:.4f}, Accuracy: {acc_item:.4f}")
print(f"Number of unique items in train: {len(item_avg)}")
print(f"Number of unseen items in test: {sum(1 for gid in test['gmap_id'] if gid not in item_avg)}")


Train size: 390441, Test size: 97611
Global Mean: 4.3265
Global Mean - MSE: 1.3808, Accuracy: 0.1689
Item Mean - MSE: 1.2090, Accuracy: 0.4514
Number of unique items in train: 10783
Number of unseen items in test: 234


In [19]:
# ========== Analysis: Why not User Mean? ==========
user_review_counts = train.groupby('user_id').size()
single_review_users = (user_review_counts == 1).sum()

print(f"Total unique users in train: {len(user_review_counts)}")
print(f"Users with only 1 review: {single_review_users} ({100*single_review_users/len(user_review_counts):.2f}%)")
print(f"Users with 2+ reviews: {len(user_review_counts) - single_review_users} ({100*(len(user_review_counts)-single_review_users)/len(user_review_counts):.2f}%)")


Total unique users in train: 174161
Users with only 1 review: 121350 (69.68%)
Users with 2+ reviews: 52811 (30.32%)


## B. Text-based Linear Model (Main Model #1)

**Intuition**: The sentiment and expressions in text are most directly connected to ratings. Words, phrases, and sentiment expressions contained in review text provide key information for predicting the ratings assigned by users.

We use:
- **TF-IDF vectorization** to convert text into numerical features
- **Ridge Regression** for regularization and to handle the high-dimensional feature space


In [20]:
# Load and prepare data for text-based model
df = pd.read_csv("merged.csv", usecols=['text', 'rating'])
df = df.dropna(subset=['text', 'rating'])
df['text'] = df['text'].astype(str)
df['rating'] = df['rating'].astype(float)

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['rating'], test_size=0.2, random_state=42, stratify=df['rating']
)

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

# TF-IDF Vectorization
tfidf = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"TF-IDF feature shape: {X_train_tfidf.shape}")
print(f"Vocabulary size: {len(tfidf.vocabulary_)}")

# Ridge Regression
model_text = Ridge(alpha=1.0)
model_text.fit(X_train_tfidf, y_train)

y_pred_text = model_text.predict(X_test_tfidf)
y_pred_text_clamped = np.clip(np.round(y_pred_text), 1, 5)

mse_text = mean_squared_error(y_test, y_pred_text)
acc_text = accuracy_score(y_test, y_pred_text_clamped)

print(f"Text-based Model (Ridge) - MSE: {mse_text:.4f}, Accuracy: {acc_text:.4f}")


Train size: 390440, Test size: 97610
TF-IDF feature shape: (390440, 50000)
Vocabulary size: 50000
Text-based Model (Ridge) - MSE: 0.4724, Accuracy: 0.6481


## C. Text + Metadata Model (Main Model #2)

This model combines text features with metadata to capture both semantic content and contextual information about the business.

**Features**:
- **TF-IDF(text)**: Text vectorization
- **avg_rating**: Average rating of the business (computed from training data only)
- **category**: Business categories (BOW/one-hot encoding)
- **hours**: Operating hours features (e.g., count of open hours)
- **MISC**: Other feasible metadata features

We use either **Ridge Regression** or **RandomForestRegressor** for this model.


In [None]:
# Load data with metadata
df = pd.read_csv("merged.csv", usecols=['text', 'rating', 'gmap_id', 'category', 'avg_rating', 'hours', 'MISC'])

# Clean and prepare data
df = df.dropna(subset=['text', 'rating', 'gmap_id'])
df['text'] = df['text'].astype(str)
df['rating'] = df['rating'].astype(float)

# Train/Test split
df_train, df_test = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['rating']
)

print(f"Train size: {len(df_train)}, Test size: {len(df_test)}")

# Feature 1: TF-IDF (text)
tfidf = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95
)

X_train_tfidf = tfidf.fit_transform(df_train['text'])
X_test_tfidf = tfidf.transform(df_test['text'])
print(f"TF-IDF shape: {X_train_tfidf.shape}")

# Feature 2: avg_rating (recomputed from training data only to avoid data leakage)
train_avg_rating = df_train.groupby('gmap_id')['rating'].mean()
global_avg = df_train['rating'].mean()

df_train['avg_rating_feat'] = df_train['gmap_id'].map(train_avg_rating)
df_train['avg_rating_feat'] = df_train['avg_rating_feat'].fillna(global_avg)

df_test['avg_rating_feat'] = df_test['gmap_id'].map(train_avg_rating)
df_test['avg_rating_feat'] = df_test['avg_rating_feat'].fillna(global_avg)
print(f"avg_rating feature - Train mean: {df_train['avg_rating_feat'].mean():.4f}, Test mean: {df_test['avg_rating_feat'].mean():.4f}")

# Feature 3: category (one-hot encoding)
# Parse category strings into lists
category_list_train = []
for cat_str in df_train['category']:
    if pd.isna(cat_str):
        category_list_train.append([])
    elif isinstance(cat_str, str):
        if cat_str.startswith('['):
            category_list_train.append(ast.literal_eval(cat_str))
        else:
            category_list_train.append([cat_str])
    else:
        category_list_train.append(cat_str if isinstance(cat_str, list) else [])

category_list_test = []
for cat_str in df_test['category']:
    if pd.isna(cat_str):
        category_list_test.append([])
    elif isinstance(cat_str, str):
        if cat_str.startswith('['):
            category_list_test.append(ast.literal_eval(cat_str))
        else:
            category_list_test.append([cat_str])
    else:
        category_list_test.append(cat_str if isinstance(cat_str, list) else [])

# One-hot encode categories
mlb = MultiLabelBinarizer()
X_train_category = mlb.fit_transform(category_list_train)
X_test_category = mlb.transform(category_list_test)
print(f"Category features shape: {X_train_category.shape}, Unique categories: {len(mlb.classes_)}")

# Feature 4: hours (count of open hours)
hours_count_train = []
for hours_str in df_train['hours']:
    if pd.isna(hours_str):
        hours_count_train.append(0)
    else:
        if isinstance(hours_str, str):
            hours_list = ast.literal_eval(hours_str)
        else:
            hours_list = hours_str
        
        if not isinstance(hours_list, list):
            hours_count_train.append(0)
        else:
            open_count = 0
            for day_info in hours_list:
                if isinstance(day_info, list) and len(day_info) >= 2:
                    if day_info[1] != 'Closed':
                        open_count += 1
            hours_count_train.append(open_count)

hours_count_test = []
for hours_str in df_test['hours']:
    if pd.isna(hours_str):
        hours_count_test.append(0)
    else:
        if isinstance(hours_str, str):
            hours_list = ast.literal_eval(hours_str)
        else:
            hours_list = hours_str
        
        if not isinstance(hours_list, list):
            hours_count_test.append(0)
        else:
            open_count = 0
            for day_info in hours_list:
                if isinstance(day_info, list) and len(day_info) >= 2:
                    if day_info[1] != 'Closed':
                        open_count += 1
            hours_count_test.append(open_count)

df_train['hours_count'] = hours_count_train
df_test['hours_count'] = hours_count_test
print(f"Hours count - Train mean: {df_train['hours_count'].mean():.2f}, Test mean: {df_test['hours_count'].mean():.2f}")

# Feature 5: MISC (extract accessibility, amenities, and key count)
has_accessibility_train = []
has_amenities_train = []
misc_keys_count_train = []

for misc_str in df_train['MISC']:
    if pd.isna(misc_str):
        has_accessibility_train.append(0)
        has_amenities_train.append(0)
        misc_keys_count_train.append(0)
    else:
        if isinstance(misc_str, str):
            if misc_str.startswith('{'):
                misc_dict = ast.literal_eval(misc_str)
            else:
                misc_dict = {}
        else:
            misc_dict = misc_str if isinstance(misc_str, dict) else {}
        
        has_accessibility_train.append(1 if 'Accessibility' in misc_dict else 0)
        has_amenities_train.append(1 if 'Amenities' in misc_dict or 'Amenity' in misc_dict else 0)
        misc_keys_count_train.append(len(misc_dict) if isinstance(misc_dict, dict) else 0)

has_accessibility_test = []
has_amenities_test = []
misc_keys_count_test = []

for misc_str in df_test['MISC']:
    if pd.isna(misc_str):
        has_accessibility_test.append(0)
        has_amenities_test.append(0)
        misc_keys_count_test.append(0)
    else:
        if isinstance(misc_str, str):
            if misc_str.startswith('{'):
                misc_dict = ast.literal_eval(misc_str)
            else:
                misc_dict = {}
        else:
            misc_dict = misc_str if isinstance(misc_str, dict) else {}
        
        has_accessibility_test.append(1 if 'Accessibility' in misc_dict else 0)
        has_amenities_test.append(1 if 'Amenities' in misc_dict or 'Amenity' in misc_dict else 0)
        misc_keys_count_test.append(len(misc_dict) if isinstance(misc_dict, dict) else 0)

df_train['has_accessibility'] = has_accessibility_train
df_train['has_amenities'] = has_amenities_train
df_train['misc_keys_count'] = misc_keys_count_train

df_test['has_accessibility'] = has_accessibility_test
df_test['has_amenities'] = has_amenities_test
df_test['misc_keys_count'] = misc_keys_count_test
print("MISC features extracted")

# Combine all features
num_features_train = df_train[['avg_rating_feat', 'hours_count', 'has_accessibility', 'has_amenities', 'misc_keys_count']].values
num_features_test = df_test[['avg_rating_feat', 'hours_count', 'has_accessibility', 'has_amenities', 'misc_keys_count']].values

X_train_combined = hstack([X_train_tfidf, X_train_category, num_features_train])
X_test_combined = hstack([X_test_tfidf, X_test_category, num_features_test])
print(f"Combined feature shape - Train: {X_train_combined.shape}, Test: {X_test_combined.shape}")

# Model: Ridge Regression
model_combined_ridge = Ridge(alpha=1.0)
model_combined_ridge.fit(X_train_combined, df_train['rating'])

y_pred_ridge = model_combined_ridge.predict(X_test_combined)
y_pred_ridge_int = np.clip(np.round(y_pred_ridge), 1, 5)

mse_ridge = mean_squared_error(df_test['rating'], y_pred_ridge)
acc_ridge = accuracy_score(df_test['rating'], y_pred_ridge_int)

print(f"Text + Metadata Model (Ridge) - MSE: {mse_ridge:.4f}, Accuracy: {acc_ridge:.4f}")

# Model: Random Forest Regressor
model_combined_rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    min_samples_split=10,
    random_state=42,
    n_jobs=-1
)

# Convert sparse matrix to CSR format for indexing, then to dense for Random Forest
X_train_combined_csr = X_train_combined.tocsr()

if X_train_combined_csr.shape[0] > 100000:
    # Use a sample if dataset is too large
    sample_idx = np.random.choice(X_train_combined_csr.shape[0], 100000, replace=False)
    X_train_sample = X_train_combined_csr[sample_idx].toarray()
    y_train_sample = df_train['rating'].iloc[sample_idx].values
    model_combined_rf.fit(X_train_sample, y_train_sample)
    y_pred_rf = model_combined_rf.predict(X_test_combined.toarray())
else:
    model_combined_rf.fit(X_train_combined_csr.toarray(), df_train['rating'])
    y_pred_rf = model_combined_rf.predict(X_test_combined.toarray())

y_pred_rf_int = np.clip(np.round(y_pred_rf), 1, 5)

mse_rf = mean_squared_error(df_test['rating'], y_pred_rf)
acc_rf = accuracy_score(df_test['rating'], y_pred_rf_int)

print(f"Text + Metadata Model (Random Forest) - MSE: {mse_rf:.4f}, Accuracy: {acc_rf:.4f}")


Train size: 390440, Test size: 97610
TF-IDF shape: (390440, 50000)
avg_rating feature - Train mean: 4.3265, Test mean: 4.3283




Category features shape: (390440, 1971), Unique categories: 1971
Hours count - Train mean: 5.26, Test mean: 5.25
MISC features extracted
Combined feature shape - Train: (390440, 51976), Test: (97610, 51976)
Text + Metadata Model (Ridge) - MSE: 0.4667, Accuracy: 0.6493


## D. BERT Embedding Model

**Explanation**: BERT (Bidirectional Encoder Representations from Transformers) provides richer semantic representation of text compared to TF-IDF. It captures contextual word meanings and can better understand the sentiment and nuances in review text.

We use **SBERT (Sentence-BERT)** to generate sentence embeddings, which are then fed into a Linear Regression model.


In [None]:
# Load SBERT model
print("Loading SBERT model...")
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# Use a subset for efficiency (BERT is computationally expensive)
sample_size = min(50000, len(df_train))
sample_idx = np.random.choice(len(df_train), sample_size, replace=False)

print(f"Generating embeddings for {sample_size} training samples...")
X_train_bert = sbert_model.encode(
    df_train['text'].iloc[sample_idx].tolist(),
    show_progress_bar=True,
    batch_size=32
)
y_train_bert = df_train['rating'].iloc[sample_idx].values

print(f"Generating embeddings for test set...")
X_test_bert = sbert_model.encode(
    df_test['text'].tolist(),
    show_progress_bar=True,
    batch_size=32
)

# Linear Regression on BERT embeddings
model_bert = LinearRegression()
model_bert.fit(X_train_bert, y_train_bert)

y_pred_bert = model_bert.predict(X_test_bert)
y_pred_bert_clamped = np.clip(np.round(y_pred_bert), 1, 5)

mse_bert = mean_squared_error(df_test['rating'], y_pred_bert)
acc_bert = accuracy_score(df_test['rating'], y_pred_bert_clamped)

print(f"BERT Embedding Model - MSE: {mse_bert:.4f}, Accuracy: {acc_bert:.4f}")
bert_available = True


Loading SBERT model...
Generating embeddings for 50000 training samples...


Batches: 100%|██████████| 1563/1563 [01:59<00:00, 13.05it/s]


Generating embeddings for test set...


Batches: 100%|██████████| 3051/3051 [03:14<00:00, 15.66it/s]


BERT Embedding Model - MSE: 0.6408, Accuracy: 0.5741


## E. Model Comparison

Below is a comprehensive comparison of all models we implemented:


In [None]:
# Create comparison table
results = {
    'Model': [
        'Global Mean (Baseline)',
        'Item Mean (Baseline)',
        'Text-based (TF-IDF + Ridge)',
        'Text + Metadata (Ridge)',
        'Text + Metadata (Random Forest)',
    ],
    'MSE': [
        mse_global,
        mse_item,
        mse_text,
        mse_ridge,
        mse_rf,
    ],
    'Accuracy': [
        acc_global,
        acc_item,
        acc_text,
        acc_ridge,
        acc_rf,
    ]
}

# Add BERT if available
if bert_available:
    results['Model'].append('BERT Embedding (SBERT + Linear Regression)')
    results['MSE'].append(mse_bert)
    results['Accuracy'].append(acc_bert)

comparison_df = pd.DataFrame(results)
comparison_df = comparison_df.sort_values('MSE')

print("=" * 80)
print("MODEL COMPARISON")
print("=" * 80)
print(comparison_df.to_string(index=False))
print("=" * 80)

comparison_df
