# Part 5: Sentiment-Based Business Recommender System

In this notebook, we integrate our trained BERT-LSTM sentiment analysis model with a content-based recommender system. The system analyzes user reviews using our sentiment model and recommends similar businesses based on sentiment patterns, business features, and user preferences.

## Installing Dependencies

In [1]:
!pip install -q scikit-learn pandas numpy torch transformers matplotlib seaborn

## Importing Dependencies

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import json
import warnings
warnings.filterwarnings('ignore')

## Loading Preprocessed Data and Business Information

In [3]:
# Load preprocessed review data
reviews_df = pd.read_parquet("yelp_preprocessed.parquet")
print(f"Reviews dataset shape: {reviews_df.shape}")
reviews_df.head()

Reviews dataset shape: (1498897, 10)


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,sentiment
0,-O5ng1XLox6uEr4uIZ8u5A,3zBJUlWtPNoZ0uN83ODbyg,3g6XqkBikTTbZmTukbeGnw,4,1,0,0,great services the people really want to help ...,2005-02-16 03:29:39,positive
1,WC9q5vhQlQkLK05kEs-vYQ,XCsZ3hWa_6oP1WkWvK7pmg,Aes-0Q_guDeYewMapFs_vg,2,0,0,0,food is decent but is a complete tourist trap ...,2005-03-01 16:59:37,negative
2,Q0GJ06L78nkVyNfHrd9iUg,XCsZ3hWa_6oP1WkWvK7pmg,CziOtnFSklimJnBgksDDwA,3,0,0,0,this place gets a [NUM]...great location great...,2005-03-01 17:25:13,neutral
3,-wsNpVhc3D-wDmBXRwnGTw,3MYdpmHeNwC6FquRWi3YOg,ZPwFVWoiqFOTbnhfSuK-ZQ,2,1,2,0,un-safeway. meet homeless people and drug addi...,2005-03-02 04:53:42,negative
4,FfzcGEJ1pYUx8jy0BDAW9Q,3MYdpmHeNwC6FquRWi3YOg,ajfmcCilbPMKb_VxswIuQQ,3,0,0,0,a hippie coffee store but with free wifi also ...,2005-03-04 02:16:43,neutral


In [4]:
# Load business data
business_data = []
with open('yelp_academic_dataset_business.json', 'r') as f:
    for line in f:
        business_data.append(json.loads(line))

business_df = pd.DataFrame(business_data)
print(f"Business dataset shape: {business_df.shape}")
business_df.head()

Business dataset shape: (150346, 14)


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


## BERT-LSTM Model Definition and Loading

In [5]:
class RoBERTa_LSTM(nn.Module):
    def __init__(self, roberta_model='roberta-base', lstm_hidden=256, num_classes=3):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(roberta_model)
        self.lstm = nn.LSTM(input_size=self.roberta.config.hidden_size,
                            hidden_size=lstm_hidden,
                            batch_first=True,
                            bidirectional=True)
        self.norm = nn.LayerNorm(lstm_hidden * 2)
        self.drop = nn.Dropout(0.4)
        self.fc = nn.Linear(lstm_hidden * 2, num_classes)

    def forward(self, input_ids, attention_mask):
        roberta_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        lstm_out, _ = self.lstm(roberta_out)
        pooled = torch.mean(lstm_out, dim=1)
        normed = self.norm(pooled)
        return self.fc(self.drop(normed))

In [6]:
# Load the trained sentiment model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sentiment_model = RoBERTa_LSTM()
sentiment_model.load_state_dict(torch.load("best_model.pt", map_location=device))
sentiment_model = sentiment_model.to(device)
sentiment_model.eval()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
print("Sentiment model loaded successfully")

2025-08-23 10:17:04.622676: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755944224.641438    3856 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755944224.647215    3856 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-23 10:17:04.671970: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized

Sentiment model loaded successfully


## Sentiment Analysis Function

In [7]:
def predict_sentiment(text, model, tokenizer, device, max_length=512):
    """
    Predict sentiment for a given text using the trained BERT-LSTM model
    Returns: sentiment_label (0: negative, 1: neutral, 2: positive), confidence_score
    """
    model.eval()
    
    # Tokenize the text
    encoding = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        probabilities = torch.softmax(outputs, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        confidence = probabilities[0][predicted_class].item()
    
    return predicted_class, confidence

## Data Preprocessing for Recommender System

In [8]:
# Merge reviews with business data
merged_df = reviews_df.merge(business_df[['business_id', 'name', 'city', 'state', 'categories', 'stars']], 
                            on='business_id', how='left', suffixes=('_review', '_business'))

# Filter out businesses with missing information
merged_df = merged_df.dropna(subset=['categories', 'name'])
print(f"Merged dataset shape: {merged_df.shape}")
merged_df.head()

Merged dataset shape: (1498730, 15)


Unnamed: 0,review_id,user_id,business_id,stars_review,useful,funny,cool,text,date,sentiment,name,city,state,categories,stars_business
0,-O5ng1XLox6uEr4uIZ8u5A,3zBJUlWtPNoZ0uN83ODbyg,3g6XqkBikTTbZmTukbeGnw,4,1,0,0,great services the people really want to help ...,2005-02-16 03:29:39,positive,George's Cycles & Fitness,Boise,ID,"Sporting Goods, Shopping, Bikes",3.0
1,WC9q5vhQlQkLK05kEs-vYQ,XCsZ3hWa_6oP1WkWvK7pmg,Aes-0Q_guDeYewMapFs_vg,2,0,0,0,food is decent but is a complete tourist trap ...,2005-03-01 16:59:37,negative,Longboard's Grill,Santa Barbara,CA,"Nightlife, Seafood, American (Traditional), Ba...",3.0
2,Q0GJ06L78nkVyNfHrd9iUg,XCsZ3hWa_6oP1WkWvK7pmg,CziOtnFSklimJnBgksDDwA,3,0,0,0,this place gets a [NUM]...great location great...,2005-03-01 17:25:13,neutral,Pascucci,Santa Barbara,CA,"Gelato, Food, Restaurants, Italian, Vegetarian...",3.5
3,-wsNpVhc3D-wDmBXRwnGTw,3MYdpmHeNwC6FquRWi3YOg,ZPwFVWoiqFOTbnhfSuK-ZQ,2,1,2,0,un-safeway. meet homeless people and drug addi...,2005-03-02 04:53:42,negative,Safeway,Tucson,AZ,"Flowers & Gifts, Food, Shopping, Grocery, Flor...",2.0
4,FfzcGEJ1pYUx8jy0BDAW9Q,3MYdpmHeNwC6FquRWi3YOg,ajfmcCilbPMKb_VxswIuQQ,3,0,0,0,a hippie coffee store but with free wifi also ...,2005-03-04 02:16:43,neutral,Bentley's House of Coffee & Tea,Tucson,AZ,"Cafes, Breakfast & Brunch, Restaurants, Sandwi...",4.0


## Business Feature Engineering

In [9]:
# Create business features for recommendation
def process_categories(categories_str):
    if pd.isna(categories_str):
        return []
    return [cat.strip() for cat in categories_str.split(',')]

# Process business categories
business_df['categories_list'] = business_df['categories'].apply(process_categories)

# Create category features using TF-IDF
category_texts = business_df['categories'].fillna('').tolist()
category_vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
category_features = category_vectorizer.fit_transform(category_texts)

print(f"Category features shape: {category_features.shape}")
print(f"Sample categories: {business_df['categories'].dropna().head().tolist()}")

Category features shape: (150346, 100)
Sample categories: ['Doctors, Traditional Chinese Medicine, Naturopathic/Holistic, Acupuncture, Health & Medical, Nutritionists', 'Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services', 'Department Stores, Shopping, Fashion, Home & Garden, Electronics, Furniture Stores', 'Restaurants, Food, Bubble Tea, Coffee & Tea, Bakeries', 'Brewpubs, Breweries, Food']


## Sentiment-Based Business Profiling

In [None]:
from tqdm.notebook import tqdm

# Calculate sentiment distribution for each business
def calculate_business_sentiment_profile(business_id, reviews_df):
    business_reviews = reviews_df[reviews_df['business_id'] == business_id]
    
    if len(business_reviews) == 0:
        return {'negative_ratio': 0, 'neutral_ratio': 0, 'positive_ratio': 0, 'total_reviews': 0}
    
    sentiment_counts = business_reviews['sentiment'].value_counts()
    total = len(business_reviews)
    
    return {
        'negative_ratio': sentiment_counts.get('negative', 0) / total,
        'neutral_ratio': sentiment_counts.get('neutral', 0) / total,
        'positive_ratio': sentiment_counts.get('positive', 0) / total,
        'total_reviews': total
    }

# Create sentiment profiles for all businesses
business_ids = business_df['business_id'].unique()
business_sentiment_profiles = {}

print(f"Calculating sentiment profiles for {len(business_ids)} businesses...")
for business_id in tqdm(business_ids, desc="Processing businesses"):
    business_sentiment_profiles[business_id] = calculate_business_sentiment_profile(business_id, merged_df)

# Convert to DataFrame
sentiment_profile_df = pd.DataFrame.from_dict(business_sentiment_profiles, orient='index')
sentiment_profile_df.reset_index(inplace=True)
sentiment_profile_df.rename(columns={'index': 'business_id'}, inplace=True)
print(f"Sentiment profiles shape: {sentiment_profile_df.shape}")
sentiment_profile_df.head()

Calculating sentiment profiles for 150346 businesses...


Processing businesses:   0%|          | 0/150346 [00:00<?, ?it/s]

## Content-Based Recommender System Class

In [None]:
class SentimentBasedRecommender:
    def __init__(self, business_df, sentiment_profile_df, category_features, sentiment_model, tokenizer, device):
        self.business_df = business_df
        self.sentiment_profile_df = sentiment_profile_df
        self.category_features = category_features
        self.sentiment_model = sentiment_model
        self.tokenizer = tokenizer
        self.device = device
        
        # Merge business data with sentiment profiles
        self.business_features_df = business_df.merge(sentiment_profile_df, on='business_id', how='left')
        self.business_features_df = self.business_features_df.fillna(0)
        
        # Prepare feature matrix
        self._prepare_feature_matrix()
    
    def _prepare_feature_matrix(self):
        # Combine numerical features
        numerical_features = ['stars', 'negative_ratio', 'neutral_ratio', 'positive_ratio', 'total_reviews']
        numerical_data = self.business_features_df[numerical_features].fillna(0)
        
        # Standardize numerical features
        scaler = StandardScaler()
        numerical_scaled = scaler.fit_transform(numerical_data)
        
        # Combine with category features
        self.feature_matrix = np.hstack([numerical_scaled, self.category_features.toarray()])
        
        print(f"Feature matrix shape: {self.feature_matrix.shape}")
    
    def predict_user_sentiment(self, user_review):
        """Predict sentiment for user's review"""
        sentiment_label, confidence = predict_sentiment(
            user_review, self.sentiment_model, self.tokenizer, self.device
        )
        
        sentiment_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
        return sentiment_map[sentiment_label], confidence
    
    def find_similar_businesses(self, business_id, top_k=10):
        """Find businesses similar to the given business_id"""
        try:
            business_idx = self.business_features_df[self.business_features_df['business_id'] == business_id].index[0]
        except IndexError:
            return []
        
        # Calculate cosine similarity
        similarities = cosine_similarity([self.feature_matrix[business_idx]], self.feature_matrix)[0]
        
        # Get top similar businesses (excluding the business itself)
        similar_indices = np.argsort(similarities)[::-1][1:top_k+1]
        
        recommendations = []
        for idx in similar_indices:
            business_info = self.business_features_df.iloc[idx]
            recommendations.append({
                'business_id': business_info['business_id'],
                'name': business_info['name'],
                'city': business_info['city'],
                'state': business_info['state'],
                'categories': business_info['categories'],
                'stars': business_info['stars'],
                'similarity_score': similarities[idx],
                'positive_ratio': business_info['positive_ratio'],
                'total_reviews': business_info['total_reviews']
            })
        
        return recommendations

## Initialize Recommender System

In [None]:
# Initialize the recommender system
recommender = SentimentBasedRecommender(
    business_df=business_df,
    sentiment_profile_df=sentiment_profile_df,
    category_features=category_features,
    sentiment_model=sentiment_model,
    tokenizer=tokenizer,
    device=device
)

print("Recommender system initialized successfully!")