In [None]:
# Fahim Khan - fkhan99@gatech.edu
# DVA Spring 2025 - Final Project
# --- Imports ---------------------------------------------------------------
from nltk.corpus import stopwords
from nltk.sentiment import SentimentAnalyzer, SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Star import kept so any name other cells may rely on stays available.
from nltk.sentiment.util import *
# Explicit import: the `nltk.download(...)` calls below previously worked only
# if the star import above happened to leak a module-level `nltk` name.
import nltk
from datetime import datetime
import pandas as pd
import numpy as np
import string

# --- Display configuration -------------------------------------------------
pd.set_option('display.width', 200)

# --- NLTK resources used by this notebook ----------------------------------
# vader_lexicon -> SentimentIntensityAnalyzer, punkt_tab -> word_tokenize,
# stopwords -> stopwords.words('english'), wordnet -> WordNetLemmatizer.
nltk.download('vader_lexicon')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/fahimkhan/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/fahimkhan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fahimkhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/fahimkhan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Review Dataset Columns
0	record
1	review_id
2	user_id
3	business_id
4	stars
5	useful
6	funny
7	cool
8	text
9	date
10	name
11	address
12	city
13	state
14	postal_code
15	business_stars
16	review_count
17	is_open
18	attributes
19	categories

In [329]:
# VADER scorer; instantiated here and used by the sentiment cell further down.
sia = SentimentIntensityAnalyzer()

# Positional indices of the CSV columns to load (includes 'city' and
# 'categories', which are needed for the filters below).
cols = [1,2,3,4,8,9,10,12,19]
allowed_cities = ['Philadelphia', 'New Orleans', 'Tampa', 'Tucson', 'Nashville', 'Indianapolis']

reviews = pd.read_csv('fused_reviews_202503152331.csv', usecols=cols)

# Keep only restaurant reviews located in one of the allowed cities.
is_restaurant = reviews['categories'].str.contains('Restaurants', na=False)
in_allowed_city = reviews['city'].isin(allowed_cities)
reviews = reviews.loc[is_restaurant & in_allowed_city].drop(columns=['categories'])

print(reviews.columns.tolist())

['review_id', 'user_id', 'business_id', 'stars', 'text', 'date', 'name', 'city']


In [330]:
# Add recency score: exponential decay on review age, measured in days
# relative to the newest review in the data set.
review_dates = pd.to_datetime(reviews['date'])
today = pd.Timestamp(datetime.today().date())
days_old = (today - review_dates).dt.days
days_old = days_old - days_old.min()  # newest review gets age 0 -> score 1.0
lambda_val = 0.001  # per-day decay rate; ln(2)/0.001 ~= 693 days, roughly a two-year half-life
reviews['recency_score'] = np.exp(-lambda_val * days_old)
print(reviews['recency_score'].min(),"min / max", reviews['recency_score'].max())
# The raw date is no longer needed once the score has been derived.
reviews = reviews.drop(columns=['date'])

print(reviews.columns.tolist())

0.002068358461654562 min / max 1.0
['review_id', 'user_id', 'business_id', 'stars', 'text', 'name', 'city', 'recency_score']


In [331]:
# Attach each reviewer's credibility to their reviews.
# NOTE(review): this is a left join, so reviews whose user_id is absent from
# user_postprocess.csv end up with NaN credibility -- confirm that is intended.
user_cred = pd.read_csv('user_postprocess.csv')
reviews = reviews.merge(user_cred, how='left', on='user_id')

# Min-max normalise credibility; the small epsilon guards against a zero range.
credibility = reviews['credibility']
cred_min, cred_max = credibility.min(), credibility.max()
reviews['credibility_norm'] = (credibility - cred_min) / (cred_max - cred_min + 1e-9)

# Drop the CSV's index column and the join key, neither is used downstream.
reviews = reviews.drop(columns=['Unnamed: 0', 'user_id'])

print(reviews.columns.tolist())

['review_id', 'business_id', 'stars', 'text', 'name', 'city', 'recency_score', 'credibility', 'credibility_norm']


In [332]:
# Sentiment scoring: clean each review, score it with VADER, then min-max
# normalise the compound score.
#
# NOTE(review): VADER is documented as using negations, intensifiers and
# punctuation as cues, and NLTK's English stop-word list appears to contain
# words such as "not" and "very". Stripping them before scoring may therefore
# change polarity -- confirm whether scoring the raw text would be preferable.
# The preprocessing steps are kept here so the pipeline's intent is unchanged.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
punctuation_chars = set(string.punctuation)


def _is_punctuation(token):
    """Return True when every character of ``token`` is ASCII punctuation.

    A plain ``token in string.punctuation`` is a substring test against one
    long string, so multi-character tokens such as '...' or "''" slip through.
    """
    return all(ch in punctuation_chars for ch in token)


def _clean_review(text):
    """Lower-case, tokenize, drop punctuation and stop words, lemmatize, re-join."""
    tokens = word_tokenize(text.lower())
    kept = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if not _is_punctuation(token) and token not in stop_words
    ]
    return ' '.join(kept)


# Missing review text is treated as an empty string instead of crashing the tokenizer.
cleaned_text = reviews['text'].fillna('').astype(str).apply(_clean_review)

# Polarity scores
reviews['sentiment'] = cleaned_text.apply(lambda text: sia.polarity_scores(text)['compound'])

# Normalize. Descriptive names are used so the builtins `min` / `max` are not
# shadowed for every later cell in the notebook.
sentiment_min = reviews['sentiment'].min()
sentiment_max = reviews['sentiment'].max()
sentiment_range = sentiment_max - sentiment_min
if sentiment_range > 0:
    reviews['normalized_sentiment'] = (reviews['sentiment'] - sentiment_min) / sentiment_range
else:
    # All scores identical (or no rows): avoid 0/0 and fall back to 0.0.
    reviews['normalized_sentiment'] = 0.0

print(reviews.columns.tolist())

['review_id', 'business_id', 'stars', 'text', 'name', 'city', 'recency_score', 'credibility', 'credibility_norm', 'sentiment', 'normalized_sentiment']


In [333]:
# Compute final weighted score: normalised sentiment scaled by a factor made of
# a fixed 0.5 base plus 0.25 each for recency and reviewer credibility.
weighting = 0.5 + 0.25 * reviews['recency_score'] + 0.25 * reviews['credibility_norm']
reviews['final_score'] = reviews['normalized_sentiment'] * weighting

# Average the per-review metrics for each restaurant, then rank by final score.
group_keys = ['business_id', 'name', 'city']
metric_columns = ['final_score', 'stars', 'normalized_sentiment', 'credibility_norm', 'recency_score']
rest_level_sentiment = reviews.groupby(group_keys, as_index=False)[metric_columns].mean()
sorted_grouped = rest_level_sentiment.sort_values(by='final_score', ascending=False)

print(sorted_grouped.columns.tolist())

['business_id', 'name', 'city', 'final_score', 'stars', 'normalized_sentiment', 'credibility_norm', 'recency_score']


In [334]:
# Export the ranked restaurant-level table to CSV, leaving out the DataFrame index.
output_path = 'restaurant_level_final_score.csv'
sorted_grouped.to_csv(output_path, index=False)