In [1]:
import pandas as pd
import numpy as np
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from collections import Counter
from sklearn.model_selection import train_test_split
import re
import string

In [19]:
# Yelp review dump; each line of the file is parsed as one JSON object.
review_path = "Yelp JSON/yelp_dataset/yelp_academic_dataset_review.json"

# Number of parsed records that are turned into one DataFrame.
chunk_size = 100000  # Adjust based on available memory

# One DataFrame per completed chunk; they are concatenated once at the end.
dfs = []
chunk = []

with open(review_path, "r", encoding="utf-8") as f:
    for line in f:
        chunk.append(json.loads(line))
        if len(chunk) < chunk_size:
            continue
        # Chunk is full: convert the list of dicts to a DataFrame and start over.
        dfs.append(pd.DataFrame(chunk))
        chunk = []

# Flush the last, partially filled chunk (if any).
if chunk:
    dfs.append(pd.DataFrame(chunk))

# Single frame with a fresh 0..n-1 index.
reviews = pd.concat(dfs, ignore_index=True)

In [None]:
# Preview the first rows of the loaded reviews frame.
reviews.head()

In [21]:
# Yelp business dump; each line of the file is parsed as one JSON object.
businesses_path = "Yelp JSON/yelp_dataset/yelp_academic_dataset_business.json"

# Number of parsed records that are turned into one DataFrame.
chunk_size = 100000  # Adjust based on available memory

# One DataFrame per completed chunk; they are concatenated once at the end.
dfs = []
chunk = []

with open(businesses_path, "r", encoding="utf-8") as f:
    for line in f:
        chunk.append(json.loads(line))
        if len(chunk) < chunk_size:
            continue
        # Chunk is full: convert the list of dicts to a DataFrame and start over.
        dfs.append(pd.DataFrame(chunk))
        chunk = []

# Flush the last, partially filled chunk (if any).
if chunk:
    dfs.append(pd.DataFrame(chunk))

# Single frame with a fresh 0..n-1 index.
businesses = pd.concat(dfs, ignore_index=True)

In [22]:
# Preview the first rows of the loaded businesses frame.
businesses.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [None]:

# Restrict the study to one city: keep its businesses, then the reviews written for them.
TARGET_CITY = "Sparks"

business_filtered = businesses[businesses["city"] == TARGET_CITY]
business_ids = business_filtered["business_id"]

# .copy() makes `filtered_reviews` an independent frame, so later column edits
# act on it directly instead of on a slice of `reviews` (avoids SettingWithCopyWarning).
filtered_reviews = reviews[reviews["business_id"].isin(business_ids)].copy()

In [None]:

# Drop the vote-count and date columns, which the rest of the notebook does not use.
# Reassigning (rather than inplace=True) avoids mutating a slice of `reviews`, and
# errors="ignore" lets the cell be re-run after the columns are already gone.
filtered_reviews = filtered_reviews.drop(
    columns=["useful", "funny", "cool", "date"], errors="ignore"
)


In [None]:
# Persist the city-filtered reviews (without the index column) to a CSV file.
filtered_reviews.to_csv("filtered_reviews.csv", index=False)

In [2]:
# Reload the filtered reviews from the CSV written above.
# NOTE(review): presumably lets the notebook resume here without re-reading the raw JSON — confirm.
filtered_reviews= pd.read_csv("filtered_reviews.csv")

Hold out a sample of ratings to evaluate against later

In [67]:
def create_holdout_dataset(df, test_users, test_size=0.2, min_ratings=5, random_state=None):
    """
    Split reviews into a training set and a per-user holdout set.

    For every eligible user, a random fraction of that user's reviews is moved
    from the training data into the holdout set. A user is eligible when they
    appear in ``test_users`` and have at least ``min_ratings`` reviews in ``df``.

    Args:
        df (pd.DataFrame): Reviews with at least a 'user_id' column. Rows are
            removed from the training set by index label, so the index should
            be unique.
        test_users (iterable or None): User IDs allowed in the holdout set.
            ``None`` means every user in ``df``.
        test_size (float): Fraction of each eligible user's reviews to hold out
            (the per-user count is rounded to the nearest integer).
        min_ratings (int): Minimum number of reviews required per user.
        random_state (int, np.random.RandomState or None): Seed for the
            sampling; pass an int for a reproducible split.

    Returns:
        pd.DataFrame: Training dataset (``df`` without the held-out rows)
        pd.DataFrame: Holdout dataset (the removed rows; empty, with the same
            columns as ``df``, when no user is eligible)
    """
    # Count reviews per user once instead of filtering the frame group by group.
    review_counts = df['user_id'].value_counts()
    eligible_users = review_counts.index[review_counts >= min_ratings]
    if test_users is not None:
        eligible_users = eligible_users[eligible_users.isin(list(test_users))]

    candidates = df[df['user_id'].isin(eligible_users)]
    if candidates.empty:
        # Nothing to hold out: return an untouched copy and an empty frame.
        return df.copy(), df.iloc[0:0].copy()

    # One grouped sample replaces the per-user scan of the whole frame.
    holdout = candidates.groupby('user_id', sort=False).sample(
        frac=test_size, random_state=random_state
    )

    # Drop all held-out rows in a single pass.
    train = df.drop(holdout.index)
    return train, holdout

# Split the filtered reviews: every user is a candidate, 20% of each eligible
# user's reviews are held out, and users need at least 5 reviews to be eligible.
train, holdout = create_holdout_dataset(
    filtered_reviews,
    test_users=filtered_reviews['user_id'].unique(),
    test_size=0.2,
    min_ratings=5,
)

# Report the size of both splits.
for label, frame in (("Train shape:", train), ("Holdout shape:", holdout)):
    print(label, frame.shape)

Train shape: (67129, 5)
Holdout shape: (5904, 5)


In [68]:
# Ensure required NLTK resources are available, downloading only what is missing.
# (The download calls used to sit inside a string literal, so they never ran and
# a fresh environment failed on the first stopwords/tokenizer call.)
for resource_path, package in [
    ("tokenizers/punkt", "punkt"),
    ("tokenizers/punkt_tab", "punkt_tab"),
    ("corpora/stopwords", "stopwords"),
    ("corpora/wordnet", "wordnet"),
]:
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(package, quiet=True)

# Load NLTK stopwords
stop_words = set(stopwords.words('english'))

# Initialize stemmer
stemmer = PorterStemmer()

# Clean one review: lowercase, strip punctuation, tokenize, drop stopwords, stem.
def preprocess_text(text):
    """Return `text` normalised to a space-separated string of stemmed tokens.

    Steps, in order: lowercase, delete every character in string.punctuation,
    tokenize with word_tokenize, drop tokens found in `stop_words`, and stem
    the rest with `stemmer`. No lemmatization is applied.

    NOTE(review): punctuation is removed before the stop-word check, so
    contractions lose their apostrophe ("don't" -> "dont") — tokens such as
    'dont' and 'didnt' show up among the most common words printed below.
    """
    no_punct = text.lower().translate(str.maketrans('', '', string.punctuation))
    stems = [
        stemmer.stem(token)
        for token in word_tokenize(no_punct)
        if token not in stop_words
    ]
    return ' '.join(stems)

# Normalise every review. The 'text' column is overwritten with the result,
# so re-running this cell would preprocess already-stemmed text again.
train['text'] = train['text'].astype(str).apply(preprocess_text)

# Flatten all reviews into one list of tokens and count occurrences.
word_list = [word for review in train['text'] for word in review.split()]
word_counts = Counter(word_list)

# The 100 most frequent tokens, as (token, count) pairs.
most_common_words = word_counts.most_common(100)

# Display the most common words
print(most_common_words)

[('place', 31592), ('time', 31001), ('food', 30299), ('get', 27851), ('good', 27796), ('servic', 26550), ('great', 26345), ('go', 26210), ('order', 24529), ('like', 22617), ('one', 21944), ('back', 21161), ('would', 20417), ('us', 15333), ('tri', 15246), ('come', 14812), ('even', 14231), ('work', 14173), ('got', 14144), ('call', 14077), ('realli', 13368), ('love', 12962), ('look', 12938), ('also', 12479), ('make', 12469), ('alway', 12415), ('want', 12366), ('dont', 12312), ('wait', 12198), ('day', 12144), ('need', 12106), ('ask', 12015), ('price', 11709), ('never', 11509), ('custom', 11497), ('well', 11275), ('staff', 11200), ('nice', 11110), ('best', 10991), ('friendli', 10927), ('didnt', 10847), ('im', 10538), ('went', 10471), ('came', 10467), ('said', 10467), ('could', 10050), ('take', 9955), ('recommend', 9760), ('first', 9725), ('know', 9552), ('busi', 9478), ('peopl', 9349), ('ive', 9340), ('help', 9232), ('say', 9182), ('new', 9114), ('use', 9019), ('told', 8990), ('year', 8760)

In [7]:
# Show just the tokens (without their counts), most frequent first.
print([token for token, _count in most_common_words])

['place', 'time', 'food', 'get', 'good', 'servic', 'go', 'great', 'order', 'like', 'one', 'back', 'would', 'us', 'tri', 'come', 'even', 'got', 'call', 'work', 'realli', 'look', 'love', 'want', 'dont', 'make', 'alway', 'also', 'wait', 'ask', 'need', 'day', 'price', 'never', 'custom', 'well', 'staff', 'nice', 'didnt', 'best', 'friendli', 'said', 'im', 'went', 'came', 'could', 'take', 'first', 'recommend', 'know', 'busi', 'peopl', 'ive', 'say', 'help', 'told', 'new', 'use', 'year', 'restaur', 'minut', 'give', 'clean', 'made', 'pizza', 'experi', 'thing', 'way', 'eat', 'much', 'littl', 'two', 'amaz', 'right', 'definit', 'took', 'locat', 'chicken', 'see', 'still', 'store', 'hour', 'ever', '2', 'room', 'check', 'sure', 'better', 'reno', 'area', 'fri', 'manag', 'everyth', 'star', 'home', 'review', 'tabl', 'lot', 'anoth', 'everi']


In [69]:
# Tokens to strip from the preprocessed reviews; passed to remove_irrelevant_words below.
# NOTE(review): appears to be a hand-picked subset of the most common tokens printed above — confirm.
irrelevant_words =['place', 'time', 'good', 'get', 'go', 'great', 'order', 'like', 'one', 'back', 'would', 'tri', 'us', 'come', 'got', 'even', 'work', 'call', 'realli', 'love', 'look', 'alway', 'also', 'make', 'want', 'dont', 'wait', 'need', 'day', 'ask', 'price', 'never', 'well', 'custom', 'nice',  'best', 'didnt', 'im', 'went', 'came', 'said', 'take', 'could', 'first', 'know', 'recommend', 'busi', 'ive', 'peopl', 'say', 'help', 'new', 'use', 'told', 'year',  'give', 'minut', 'made', 'littl', 'thing', 'much', 'way', 'two', 'amaz', 'right', 'definit', 'locat', 'took', 'see', 'still', 'hour', 'fri', '2', 'check', 'ever', 'sure', 'area', 'better', 'lot', 'everyth', 'star', 'reno', 'home', 'tabl', 'review', 'everi', 'manag'] 

In [70]:
def remove_irrelevant_words(text, irrelevant_words):
    """
    Strip a set of unwanted words from a string.

    Matching is case-insensitive and limited to whole words (regex word
    boundaries). After removal, runs of whitespace are collapsed to a single
    space and leading/trailing whitespace is trimmed.

    Args:
        text (str): Input text
        irrelevant_words (list): Words to remove; each is escaped, so regex
            metacharacters are matched literally

    Returns:
        str: `text` without the listed words
    """
    # Build one alternation of the escaped words, anchored on word boundaries.
    alternation = '|'.join(re.escape(word) for word in irrelevant_words)
    word_matcher = re.compile(r'\b(?:{})\b'.format(alternation), flags=re.IGNORECASE)

    without_words = word_matcher.sub('', text)

    # Removing words leaves gaps; normalise the spacing.
    return re.sub(r'\s+', ' ', without_words).strip()

# Store each preprocessed review with the irrelevant tokens stripped in a new column.
train['cleaned_text'] = train['text'].apply(remove_irrelevant_words, args=(irrelevant_words,))

TOPIC AWARE RECOMMENDATION

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity

In [149]:
def vectorize_text(train):
    """Fit a TF-IDF vectorizer on train["cleaned_text"].

    The vocabulary is capped at 5000 features.

    Returns:
        (TfidfVectorizer, matrix): the fitted vectorizer and the matrix
        produced by fit_transform for the cleaned reviews.
    """
    tfidf = TfidfVectorizer(max_features=5000)
    doc_term_matrix = tfidf.fit_transform(train["cleaned_text"])
    return tfidf, doc_term_matrix

def train_lda(X, num_topics):
    """Fit an LDA topic model on X and return it with the per-row topic weights.

    Args:
        X: Document-term matrix to fit on.
        num_topics (int): Number of LDA components.

    Returns:
        (LatentDirichletAllocation, array): the fitted model and the output of
        fit_transform(X).
    """
    lda_model = LatentDirichletAllocation(
        n_components=num_topics,
        random_state=42,  # fixed seed
        doc_topic_prior=0.01,
        topic_word_prior=0.01,
    )
    return lda_model, lda_model.fit_transform(X)

def create_item_profiles(train, item_topics):
    """Average the review-level topic vectors into one profile per business.

    Side effect: adds (or overwrites) a "topic_distribution" column on `train`,
    holding row i of `item_topics` for review i. get_user_profile depends on
    that column existing in `train`: the name clash in its merge is what
    produces the "topic_distribution_y" column it reads.

    Args:
        train (pd.DataFrame): Reviews with a "business_id" column; must have as
            many rows as `item_topics`.
        item_topics: Sequence of per-review topic vectors.

    Returns:
        pd.DataFrame: Columns "business_id" and "topic_distribution" (the
        element-wise mean vector of that business's reviews).
    """
    train["topic_distribution"] = list(item_topics)

    def _mean_vector(vectors):
        # Stack the group's vectors row-wise and average down the rows.
        return np.mean(np.vstack(vectors), axis=0)

    per_business = train.groupby("business_id")["topic_distribution"]
    return per_business.apply(_mean_vector).reset_index()

def get_user_profile(user_id, train, item_profiles, num_topics, like_threshold=1, dislike_threshold=2):
    """Build a user's "liked" and "disliked" topic profiles from their reviews.

    Each profile is the mean of the item topic vectors of the businesses the
    user reviewed with a matching star rating.

    Args:
        user_id: User whose reviews in `train` are used.
        train (pd.DataFrame): Reviews with "user_id", "business_id", "stars".
        item_profiles (pd.DataFrame): "business_id" and "topic_distribution"
            columns, one topic vector per business.
        num_topics (int): Length of the zero vector returned for an empty group.
        like_threshold: Reviews with stars >= this value count as liked.
            The default of 1 keeps the original behaviour, in which every
            review counts as liked (assuming a 1-5 star scale).
        dislike_threshold: Reviews with stars <= this value count as disliked.

    Returns:
        (np.ndarray, np.ndarray): liked profile, disliked profile.
    """
    # Rename before merging so the result does not depend on pandas' _x/_y
    # suffixes, which only appear when `train` also has a "topic_distribution" column.
    profiles = item_profiles[["business_id", "topic_distribution"]].rename(
        columns={"topic_distribution": "item_topic_profile"}
    )
    user_reviews = train.loc[train["user_id"] == user_id, ["business_id", "stars"]]

    # Inner merge: reviews of businesses without a profile are skipped instead of
    # yielding NaN entries that cannot be stacked.
    user_data = user_reviews.merge(profiles, on="business_id", how="inner")

    def _mean_profile(rows):
        if rows.empty:
            return np.zeros(num_topics)
        return np.mean(np.vstack(rows["item_topic_profile"].tolist()), axis=0)

    liked_profile = _mean_profile(user_data[user_data["stars"] >= like_threshold])
    disliked_profile = _mean_profile(user_data[user_data["stars"] <= dislike_threshold])
    return liked_profile, disliked_profile

def recommend_items_by_topic(user_id, train, item_profiles, num_topics, dislike_weight=0.0):
    """Score every business the user has not reviewed by topic similarity.

    The score is the cosine similarity between the business's topic vector and
    the user's liked profile, minus `dislike_weight` times the similarity to
    the user's disliked profile.

    Args:
        user_id: User to score businesses for.
        train (pd.DataFrame): Reviews with "user_id" and "business_id".
        item_profiles (pd.DataFrame): "business_id" and "topic_distribution".
        num_topics (int): Forwarded to get_user_profile.
        dislike_weight (float): Penalty weight for similarity to the disliked
            profile. The default 0.0 matches the original behaviour, where
            that term was multiplied by 0.

    Returns:
        dict: business_id -> score. With non-negative topic vectors and
        dislike_weight == 0 the score lies in [0, 1].
    """
    liked_profile, disliked_profile = get_user_profile(user_id, train, item_profiles, num_topics)
    rated_businesses = train.loc[train['user_id'] == user_id, 'business_id'].unique()
    unrated_items = item_profiles[~item_profiles['business_id'].isin(rated_businesses)]
    if unrated_items.empty:
        return {}

    # One batched similarity call instead of two sklearn calls per business.
    item_matrix = np.vstack(unrated_items["topic_distribution"].tolist())
    scores = cosine_similarity(item_matrix, [liked_profile])[:, 0]
    if dislike_weight:
        scores = scores - dislike_weight * cosine_similarity(item_matrix, [disliked_profile])[:, 0]

    return dict(zip(unrated_items['business_id'], scores.tolist()))

Collaborative filtering (SVD and item-based KNN)

In [10]:

from surprise import Dataset, Reader, SVD,KNNBasic,accuracy
from surprise.model_selection import train_test_split


In [11]:
def train_svd_model(train,factors=5, random_state=None):
    """Train a Surprise SVD model on all ratings in `train`.

    Args:
        train (pd.DataFrame): Must contain "user_id", "business_id", "stars".
        factors (int): Number of latent factors.
        random_state (int or None): Seed for SVD's random initialisation; pass
            an int for reproducible models. None keeps the unseeded behaviour.

    Returns:
        SVD: The fitted model.
    """
    # Ratings are declared to lie on a 1-5 star scale.
    reader = Reader(rating_scale=(1, 5))

    # Load data into Surprise's Dataset format
    data = Dataset.load_from_df(
        train[["user_id", "business_id", "stars"]],
        reader
    )
    # No train/test split here: the model is fitted on every rating supplied.
    trainset = data.build_full_trainset()

    # Initialize and train SVD
    model = SVD(
        n_factors=factors,
        n_epochs=20,
        lr_all=0.005,
        reg_all=0.02,
        random_state=random_state,
    )
    model.fit(trainset)

    return model
def train_cbf_model(train):
    """Train an item-based KNNBasic collaborative-filtering model.

    Despite the function name, this is not an SVD or content-based model:
    it fits Surprise's KNNBasic with user_based=False on every rating in
    `train` ("user_id", "business_id", "stars" columns, 1-5 scale).

    Returns:
        KNNBasic: The fitted model.
    """
    ratings = train[["user_id", "business_id", "stars"]]
    dataset = Dataset.load_from_df(ratings, Reader(rating_scale=(1, 5)))

    # Fit on the full set of ratings; no train/test split is made here.
    knn = KNNBasic(sim_options={'user_based': False})
    knn.fit(dataset.build_full_trainset())
    return knn


In [12]:
def recommend_for_user_with_model(model, user_id, business_ids):
    """Predict the user's rating for each given business.

    Args:
        model: Object whose predict(user_id, business_id) returns a result
            with an `est` attribute.
        user_id: User to predict for.
        business_ids (iterable): Businesses to score.

    Returns:
        dict: business_id -> estimated rating (`est`).
    """
    return {biz_id: model.predict(user_id, biz_id).est for biz_id in business_ids}

In [13]:
def get_unrated_businesses(train_data, user_id):
    """List the businesses in `train_data` that `user_id` has not reviewed.

    Args:
        train_data (pd.DataFrame): Reviews with "user_id" and "business_id".
        user_id: User whose reviewed businesses are excluded.

    Returns:
        list: Unique business IDs in order of first appearance in
        `train_data`. The order is deterministic, unlike a set difference,
        whose order depends on hashing.
    """
    rated = set(train_data.loc[train_data["user_id"] == user_id, "business_id"])
    all_businesses = train_data["business_id"].unique()
    return [biz_id for biz_id in all_businesses if biz_id not in rated]

In [78]:
def hybrid_recommendation(user_id, train,num_topics,item_profiles,model1, model2, w1=1, w2=1, w3=1):
    """Blend topic-based and two model-based scores into one score per business.

    score = (w1 * 5 * topic_score + w2 * model1_score + w3 * model2_score) / (w1 + w2 + w3)

    A business missing from one component contributes 0 for that component.

    Args:
        user_id: User to score businesses for.
        train (pd.DataFrame): Training reviews.
        num_topics (int): Forwarded to the topic recommender.
        item_profiles (pd.DataFrame): Per-business topic vectors.
        model1, model2: Fitted models passed to recommend_for_user_with_model.
        w1, w2, w3: Weights of the topic, model1 and model2 scores.

    Returns:
        dict: business_id -> blended score.

    Raises:
        ZeroDivisionError: If the weights sum to zero and at least one
            business is scored.
    """
    # Multiplier applied to the topic score before blending.
    # NOTE(review): presumably lifts a [0, 1] similarity onto the 5-star scale — confirm.
    topic_scale = 5

    topic_recommendations = recommend_items_by_topic(user_id, train, item_profiles, num_topics)
    unrated_businesses = get_unrated_businesses(train, user_id)

    # A zero-weight model contributes exactly 0 to every score, so its
    # per-business predictions are skipped entirely.
    svd_recommendations = (
        recommend_for_user_with_model(model1, user_id, unrated_businesses) if w2 != 0 else {}
    )
    ibcf_recommendations = (
        recommend_for_user_with_model(model2, user_id, unrated_businesses) if w3 != 0 else {}
    )

    # Same candidate set as scoring with all components: the model predictions
    # are keyed by `unrated_businesses`.
    candidates = set(topic_recommendations) | set(unrated_businesses)

    hybrid_scores = {}
    for biz_id in candidates:
        topic_score = topic_recommendations.get(biz_id, 0)
        svd_score = svd_recommendations.get(biz_id, 0)
        ibcf_score = ibcf_recommendations.get(biz_id, 0)
        hybrid_scores[biz_id] = (
            w1 * topic_scale * topic_score + w2 * svd_score + w3 * ibcf_score
        ) / (w1 + w2 + w3)

    return hybrid_scores




In [126]:
# Fit the TF-IDF vectorizer on the cleaned training reviews; X is the resulting matrix.
vectorizer, X = vectorize_text(train)

In [156]:
def compute_perplexity_for_topics(X, topic_range, vectorizer=None):
    """Fit one LDA model per topic count and record its perplexity on X.

    Perplexity is computed on the same matrix the model was fitted on.

    Args:
        X: Document-term matrix.
        topic_range (iterable of int): Topic counts to try.
        vectorizer: Unused; accepted but never read.

    Returns:
        dict: number of topics -> perplexity.
    """
    perplexities = {}
    for num_topics in topic_range:
        # Fresh model per topic count, with a fixed seed.
        lda = LatentDirichletAllocation(n_components=num_topics, random_state=42).fit(X)

        perplexity = lda.perplexity(X)
        perplexities[num_topics] = perplexity
        print(f"Number of Topics: {num_topics}, Perplexity: {perplexity}")

    return perplexities


topic_range = range(50, 100, 10)  # Candidate topic counts: 50, 60, 70, 80, 90
perplexities = compute_perplexity_for_topics(X, topic_range)

Number of Topics: 50, Perplexity: 14027.33701329771
Number of Topics: 60, Perplexity: 17525.938810917374
Number of Topics: 70, Perplexity: 23028.35403650486
Number of Topics: 80, Perplexity: 28612.638244991733
Number of Topics: 90, Perplexity: 36083.66361096018


In [150]:
# Number of topics for LDA. Note that 5 lies outside the 50-90 range swept by
# the perplexity cell above.
num_topics = 5 # Number of topics for LDA

# Fit LDA on the TF-IDF matrix, then average the review-level topic vectors per
# business (this also adds a "topic_distribution" column to `train`).
lda_model, item_topics = train_lda(X, num_topics)
item_profiles = create_item_profiles(train, item_topics)


In [153]:
# Print the highest-weighted words of every topic.
def display_topics(lda, vectorizer, top_words=10):
    """Print one line per topic listing its `top_words` highest-weighted words.

    Args:
        lda: Fitted model exposing `components_` (one weight row per topic).
        vectorizer: Fitted vectorizer exposing get_feature_names_out().
        top_words (int): Number of words to show per topic.
    """
    vocabulary = np.array(vectorizer.get_feature_names_out())
    for topic_number, weights in enumerate(lda.components_, start=1):
        ascending = weights.argsort()
        # Last `top_words` indices are the largest weights; reverse for descending order.
        strongest = ascending[-top_words:][::-1]
        label = ", ".join(vocabulary[strongest])
        print(f"\n🔹 Topic {topic_number}: " + label)
# Show the 10 highest-weighted words of each fitted topic.
display_topics(lda_model, vectorizer, top_words=10)


🔹 Topic 1: food, servic, chicken, delici, restaur, breakfast, friendli, fresh, flavor, eat

🔹 Topic 2: car, servic, job, compani, done, room, nail, clean, guy, thank

🔹 Topic 3: store, staff, friendli, servic, clean, shop, dr, thank, food, experi

🔹 Topic 4: food, servic, sushi, rude, drink, employe, eat, seat, bad, walk

🔹 Topic 5: pizza, food, beer, servic, bar, sandwich, salad, friendli, drink, staff


In [72]:
# SVD model with 5 latent factors, fitted on all training ratings.
model1= train_svd_model(train,factors=5)

# Item-based KNNBasic model (see train_cbf_model), fitted on the same ratings.
model2= train_cbf_model(train)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [29]:
# Number of distinct users in the holdout set (shape of the unique-ID array).
holdout['user_id'].unique().shape

(1801,)

In [79]:
def evaluate_hybrid_model(train, holdout, item_profiles, num_topics,model1, model2,w1=1, w2=1, w3=1, default_rating=3):
    """Mean absolute error of the hybrid recommender on held-out ratings.

    For each user in `holdout`, hybrid scores are computed from `train` and
    compared with the user's held-out star ratings.

    Args:
        train (pd.DataFrame): Training reviews.
        holdout (pd.DataFrame): Held-out reviews with "user_id",
            "business_id" and "stars".
        item_profiles (pd.DataFrame): Per-business topic vectors.
        num_topics (int): Forwarded to hybrid_recommendation.
        model1, model2: Fitted models forwarded to hybrid_recommendation.
        w1, w2, w3: Component weights forwarded to hybrid_recommendation.
        default_rating: Prediction used when the recommender returns no score
            for a held-out business. Defaults to 3, the value that was
            previously hard-coded.

    Returns:
        float: Mean absolute error over all held-out ratings; NaN when
        `holdout` has no rows.
    """
    abs_errors = []
    # Group once instead of scanning the whole holdout frame for every user;
    # sort=False keeps users in order of first appearance.
    for user_id, user_holdout in holdout.groupby("user_id", sort=False):
        recommendations = hybrid_recommendation(
            user_id, train, num_topics, item_profiles, model1, model2, w1, w2, w3
        )
        actual = user_holdout["stars"].to_numpy(dtype=float)
        predicted = (
            pd.Series(recommendations, dtype=float)
            .reindex(user_holdout["business_id"])
            .fillna(default_rating)
            .to_numpy()
        )
        abs_errors.append(np.abs(actual - predicted))

    if not abs_errors:
        # Nothing to evaluate; return NaN explicitly rather than via a warning.
        return float("nan")

    return np.mean(np.concatenate(abs_errors))

In [152]:
# MAE of the topic component alone (w2 = w3 = 0), evaluated on the first 100 holdout rows.
evaluate_hybrid_model(train, holdout.iloc[:100], item_profiles,num_topics, model1, model2,w1=1, w2=0, w3=0)

1.5667392267744003