# NMF Model
## In this notebook, we generate topics that can be used to measure the similarity between businesses.

## Import necessary libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize

## Read the cleaned reviews dataset

In [3]:
# Load the cleaned review texts and discard the "Unnamed: 0" column
# (presumably an index column saved along with the CSV - confirm).
file = "Cleaned_Text_Dataset.csv"
cleaned_reviews_df = pd.read_csv(file).drop(columns=["Unnamed: 0"])
cleaned_reviews_df.head(2)

Unnamed: 0,review_id,business_id,text,target,words
0,fxWnU4OqONBNoQhEcyazSg,krTHKI0YOpASr4gz2CVWFw,"This location used to be good, several years a...",0.0,this location use to be good several year ago ...
1,FhtER9SGsEYkEhRcs09rsQ,krTHKI0YOpASr4gz2CVWFw,I love Cosi but this Cosi is going down hill f...,0.0,love cosi but this cosi be go down hill fast a...


In [4]:
# Split the reviews by sentiment label (target == 1 -> positive,
# target == 0 -> negative) and renumber each subset from 0 so that row
# positions and index labels coincide.
is_positive = cleaned_reviews_df['target'] == 1
is_negative = cleaned_reviews_df['target'] == 0
pos_reviews_df = cleaned_reviews_df[is_positive].reset_index(drop=True)
neg_reviews_df = cleaned_reviews_df[is_negative].reset_index(drop=True)

In [5]:
# Cleaned text ("words" column) of each sentiment class; these Series keep
# the row labels of cleaned_reviews_df (no index reset here).
pos_reviews = cleaned_reviews_df.loc[cleaned_reviews_df['target'] == 1.0, 'words']
neg_reviews = cleaned_reviews_df.loc[cleaned_reviews_df['target'] == 0.0, 'words']

In [6]:
# Display the positive-review texts; the index shown is the original row label from cleaned_reviews_df
pos_reviews

4565    go here with friend on the positive side very ...
4566    love everything about kanella from the delicio...
4567    daaaaaang that be great philly cheesesteak fry...
4568    can never eat whole burrito could not stop eat...
4569    price ncame here for restaurant week and think...
                              ...                        
9125    definitely huge fan of the pok spot ve order f...
9126    this place be super cool so many choice in one...
9127    hookah the hookah be good too bit to prepare b...
9128    the food be absolutely amazing the service be ...
9129    the resurgence of old city start with the litt...
Name: words, Length: 4565, dtype: object

## Stopwords

In [7]:
# Build the stop-word list from NLTK's English list, scikit-learn's built-in
# list and a few corpus-specific filler words that are not useful as topics.
# Every extra entry is lower-case: the vectorizers below lowercase the text
# and only tokenize runs of [a-z], so a capitalised entry could never match.
extra_stop_words = ['super', 'duper', 've', 'like', 'got',
                    'cleveland', 'just', 'don', 'really',
                    'said', 'told', 'ok', 'came', 'went',
                    'did', 'didn', 'good']

# Deduplicate through a set, but hand the vectorizers a sorted *list*:
# recent scikit-learn releases only accept 'english', a list or None for
# `stop_words`, and sorting keeps the value deterministic between runs.
my_stop_words = sorted(set(stopwords.words('english'))
                       | set(ENGLISH_STOP_WORDS)
                       | set(extra_stop_words))

## TF-IDF


- It looks like the top words for both positive and negative Yelp reviews mention topics related to service, place, time and order.

In [8]:
# Two identically configured TF-IDF vectorizers, one per sentiment class,
# so that each learns its own vocabulary and IDF weights.
tfidf_settings = dict(
    stop_words=my_stop_words,
    min_df=10,                      # term must occur in at least 10 reviews
    max_df=0.5,                     # ...and in at most half of the reviews
    ngram_range=(1, 2),             # unigrams and bigrams
    token_pattern='[a-z][a-z]+',    # tokens are runs of 2+ lower-case letters
)

tfidf_pos = TfidfVectorizer(**tfidf_settings)
tfidf_neg = TfidfVectorizer(**tfidf_settings)

In [9]:
# Fit each vectorizer on its own corpus and keep the resulting
# TF-IDF document-term matrices (positive first, then negative).
pos_vectors, neg_vectors = (
    tfidf_pos.fit_transform(pos_reviews),
    tfidf_neg.fit_transform(neg_reviews),
)



## Apply NMF to generate topics

In [10]:
num_topics = 6 # declare the number of topics
num_top_words = 6 # declare the number of words in each topic

# Pin the random state so that Restart & Run All reproduces the same topics
# in the same order. The topics are labelled by index further down, so a
# different ordering would silently attach the wrong label to a topic.
# NOTE(review): after changing SEED (or the data), re-check those labels
# against the printed topic words.
SEED = 42

# Factorise each TF-IDF matrix V ~ W @ H:
#   W_* has one row of topic weights per review,
#   H_* has one row of term weights per topic.
nmf_pos = NMF(n_components=num_topics, random_state=SEED)
W_pos = nmf_pos.fit_transform(pos_vectors)
H_pos = nmf_pos.components_

nmf_neg = NMF(n_components=num_topics, random_state=SEED)
W_neg = nmf_neg.fit_transform(neg_vectors)
H_neg = nmf_neg.components_

In [11]:
# pos_vectors / neg_vectors already hold the TF-IDF matrices that the NMF
# models were fitted on, so nothing is re-fitted here. Just confirm that the
# factorisations line up with them: one row of W per review and one column
# of H per vocabulary term.
assert W_pos.shape == (pos_vectors.shape[0], num_topics)
assert H_pos.shape == (num_topics, pos_vectors.shape[1])
assert W_neg.shape == (neg_vectors.shape[0], num_topics)
assert H_neg.shape == (num_topics, neg_vectors.shape[1])

### Function : show_topics_result
#### parameters : model, feature names, number of topics, number of words per topic
#### output : prints every topic together with the weight of each of its top words

In [12]:
def show_topics_result(model, feature_names, num_topics, no_top_words):
    """Print the highest-weighted terms of the first ``num_topics`` topics.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Each row of ``components_`` holds the per-term weights of one topic.
    feature_names : sequence of str
        Term for every column of ``components_`` (indexed by column number).
    num_topics : int
        Only topics whose 0-based position is below this value are printed.
    no_top_words : int
        How many of the heaviest terms to list for each topic.

    Output
    ------
    One line per topic, ``Topic N:`` (1-based, padded to 11 characters)
    followed by comma-separated ``weight*term`` pairs, heaviest first.
    Nothing is returned.
    """
    for position, weights in enumerate(model.components_):
        if position >= num_topics:
            # positions only grow, so no later topic can qualify either
            break
        label = "Topic %d:" % (position + 1)
        print("{:11}".format(label), end='')
        # argsort is ascending; reverse it and keep the leading entries
        heaviest = weights.argsort()[::-1][:no_top_words]
        pairs = []
        for col in heaviest:
            pairs.append('{:04.3f}*'.format(weights[col]) + feature_names[col])
        print(", ".join(pairs))

### Generate top 6 topics for positive and negative reviews

In [13]:
# Print the top terms of every topic, first for the positive model and then
# for the negative one, each under its own heading and a 39-character rule.
topic_reports = [
    ('Top topics + words for Positive reviews', nmf_pos, tfidf_pos),
    ('\nTop topics + words for Negative reviews', nmf_neg, tfidf_neg),
]
for heading, fitted_nmf, vectorizer in topic_reports:
    print(heading)
    print('-'*39)
    show_topics_result(fitted_nmf, vectorizer.get_feature_names_out(), num_topics, num_top_words)

Top topics + words for Positive reviews
---------------------------------------
Topic 1:   0.794*order, 0.636*come, 0.505*delicious, 0.484*chicken, 0.478*try, 0.469*dish
Topic 2:   0.933*sandwich, 0.877*cheesesteak, 0.768*cheese, 0.754*pork, 0.696*steak, 0.634*roast
Topic 3:   2.292*pizza, 0.264*crust, 0.228*slice, 0.197*pizza place, 0.170*pizza pizza, 0.160*pie
Topic 4:   1.238*place, 0.862*market, 0.721*love, 0.515*terminal, 0.393*love place, 0.393*food
Topic 5:   1.554*great, 1.178*food, 0.872*service, 0.532*recommend, 0.491*amazing, 0.464*staff
Topic 6:   0.750*hour, 0.743*happy hour, 0.721*happy, 0.689*beer, 0.574*bar, 0.514*drink

Top topics + words for Negative reviews
---------------------------------------
Topic 1:   1.160*table, 0.772*wait, 0.729*come, 0.692*minute, 0.624*ask, 0.600*reservation
Topic 2:   0.840*chicken, 0.597*taste, 0.536*salad, 0.436*sauce, 0.393*dish, 0.372*rice
Topic 3:   2.147*pizza, 0.300*slice, 0.207*pizza place, 0.205*order pizza, 0.194*crust, 0.142*ch

### Summarize all topics

Topics found in positive reviews (used for similarity):
- Topics(0) : Order , Delicious , Dish
- Topics(1) : Sandwich , Cheese , Steak , Pork
- Topics(2) : Pizza , Food
- Topics(3) : Place , Market
- Topics(4) : Service , Staff
- Topics(5) : Happy hour , Beer

Topics found in negative reviews (used for similarity):
- Topics(0) : Table , Wait
- Topics(1) : Dish , Salad , Taste
- Topics(2) : Pizza , Food
- Topics(3) : Bar , Drink
- Topics(4) : Order , Time , Delivery
- Topics(5) : Cheesesteak , Sandwich

With these topics, we can take any review in the dataset and measure how strongly it relates to each topic.

In [14]:
# Hand-written labels for the topics, keyed by 0-based topic index
# (key 0 labels "Topic 1" in the printed report, and so on).
pos_topics = dict(enumerate([
    'order/delicious/dish',
    'sandwich/cheese/steak/pork',
    'pizza/food',
    'place/market',
    'service/staff',
    'happy_hour/beer',
]))

neg_topics = dict(enumerate([
    'table/wait',
    'dish/salad/taste',
    'pizza/food',
    'bar/drink',
    'order/time/delivery',
    'cheesesteak/sandwich',
]))

---

## Testing positive review with the generated topics

In [15]:
# Use the positive review at row 99 as a fixed example
sample_pos_review = pos_reviews_df['text'][99]
print('Sample Positive Review :')
print('-'*25)
print(sample_pos_review)
print('-'*25)
# Map 0-based topic index -> weight for this review (row 99 of W_pos),
# then list the topics from the strongest to the weakest weight.
topic_dict = dict(enumerate(W_pos[99]))
for k, weight in sorted(topic_dict.items(), key=lambda pair: pair[1], reverse=True):
    print('Topic {}: {:.4f}'.format(k+1, weight))

Sample Positive Review :
-------------------------
What an excellent addition to the Fairmount/Francisville neighborhood collection of bars/restaurants! The fries are great, the salmon entree is seriously one of the best salmon dishes I've ever had, and the boards are a delicious and fun way to sample the menu. They definitely have to work out some kinks in the service, as we were told that a wait for our party of three would be a half hour but it ended up being an hour. And once we were seated we barely saw our waiter who apologized for the slow pace of service multiple times. But it was a Saturday night and their 5th night of business so they definitely get a pass. Oh and my beer-lover husband had great things to say about the in-house brews.
-------------------------
Topic 6: 0.0967
Topic 1: 0.0354
Topic 5: 0.0348
Topic 2: 0.0000
Topic 3: 0.0000
Topic 4: 0.0000


## Testing negative review with the generated topics

In [16]:
# Use the negative review at row 99 as a fixed example
sample_neg_review = neg_reviews_df['text'][99]
print('Sample Negative Review :')
print('-'*25)
print(sample_neg_review)
print('-'*25)
# Map 0-based topic index -> weight for this review (row 99 of W_neg),
# then list the topics from the strongest to the weakest weight.
topic_dict = dict(enumerate(W_neg[99]))
for k, weight in sorted(topic_dict.items(), key=lambda pair: pair[1], reverse=True):
    print('Topic {}: {:.4f}'.format(k+1, weight))

Sample Negative Review :
-------------------------
This place has a very good chicken cheesesteak wrap, but if you are delivering be cautious of the roof delivery guy. I've had two encounters with the same guy and he is rude. The first time he was impatient, when I work at a hospital and was trying to meet him in the lobby. The second time he shoved the bag in my had since I told him to come inside instead of me going outside for the food . That is not how you treat customers! I called and spoke with the manager and he said "well he is very busy, that's no excuse I'll talk to him". If that was no excuse then it shouldn't have been mentioned. I'd recommend only dining in , if you must.
-------------------------
Topic 5: 0.0673
Topic 6: 0.0287
Topic 4: 0.0268
Topic 1: 0.0080
Topic 3: 0.0038
Topic 2: 0.0000


---

## Calculate the average topic weights of each business in the previously chosen restaurant dataset.

### sample weight for a review

In [17]:
# Raw NMF topic weights of one review (row 1 of W_pos) and their sum
sample_weights = W_pos[1]
print('Topic weights : ', sample_weights)
print('Total topic weights: ', sample_weights.sum())

Topic weights :  [0.05704334 0.01336461 0.         0.         0.02129886 0.        ]
Total topic weights:  0.09170681318146529


### Normalize the sum of weights

In [18]:
# L1-normalise one review's weights (row 0 of W_pos) so that they sum to 1;
# normalize() works on 2-D input, hence the reshape to a single-row matrix.
normalized_sample = normalize(W_pos[0].reshape(1,-1), norm='l1')
print('Normalized Topic weights : ', normalized_sample)
print('Total normalized topic weights: ', normalized_sample.sum())

Normalized Topic weights :  [[0.41204515 0.33382474 0.         0.         0.03133379 0.22279632]]
Total normalized topic weights:  1.0


In [19]:
# Add the L1-normalised topic weights to each review.
# Each frame must be paired with the weights of its OWN model: W_pos rows
# belong to the positive reviews and W_neg rows to the negative reviews.
pos_topic_weights = pd.DataFrame(normalize(W_pos, norm='l1'))
neg_topic_weights = pd.DataFrame(normalize(W_neg, norm='l1'))

# concat(axis=1) aligns on the index, so a row-count mismatch would pad with
# NaN instead of failing; check the counts explicitly first.
assert len(pos_topic_weights) == len(pos_reviews_df), "W_pos rows do not match pos_reviews_df"
assert len(neg_topic_weights) == len(neg_reviews_df), "W_neg rows do not match neg_reviews_df"

pos_reviews_df = pd.concat([pos_reviews_df, pos_topic_weights], axis=1)
neg_reviews_df = pd.concat([neg_reviews_df, neg_topic_weights], axis=1)

In [20]:
# Give the integer-named weight columns (0..5) readable names topic_1..topic_6
topic_column_names = {position: "topic_{}".format(position + 1) for position in range(6)}
pos_reviews_df = pos_reviews_df.rename(columns=topic_column_names)
neg_reviews_df = neg_reviews_df.rename(columns=topic_column_names)

In [21]:
# Drop the per-review columns from the positive frame; the remaining columns
# are business_id plus the topic weights.
pos_reviews_df = pos_reviews_df.drop(columns=['review_id', 'text', 'target', 'words'])

In [22]:
# Peek at the first five rows to confirm that only business_id and the six
# topic_* weight columns remain in the positive frame
pos_reviews_df.head(5)

Unnamed: 0,business_id,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6
0,4R2KR_-FybS7oegGrXjHVg,0.412045,0.333825,0.0,0.0,0.031334,0.222796
1,mUIBtlWNPD7sz3rGGWQ1RA,0.622019,0.145732,0.0,0.0,0.232249,0.0
2,-mIlmp5l4hKlp1tvHRdvTg,0.0,0.701854,0.0,0.0,0.290394,0.007753
3,gvD09Ev1aOmphtlq07zYEA,0.489863,0.005282,0.066383,0.438472,0.0,0.0
4,Ou-_OQUNvBcaAoRU0XPtaQ,0.51219,0.095558,0.001229,0.0,0.391023,0.0


## group the reviews by businesses and calculate the average topic weights by businesses

In [23]:
# Collapse the per-review rows into one row per business by averaging each
# topic weight over that business's reviews.
topic_columns = ['topic_1', 'topic_2', 'topic_3', 'topic_4', 'topic_5', 'topic_6']
mean_of_each_topic = {column: 'mean' for column in topic_columns}

pos_reviews_df = pos_reviews_df.groupby('business_id').agg(mean_of_each_topic).reset_index()
neg_reviews_df = neg_reviews_df.groupby('business_id').agg(mean_of_each_topic).reset_index()

In [24]:
# Load the restaurant metadata (one row per business) and show its first row
file = "Filtered_Restaurant_Dataset.csv"
restaurant_df = pd.read_csv(filepath_or_buffer=file)
restaurant_df.iloc[:1]

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,general_category
0,hSbwd-VP4THYYvSKQQr6Ow,George's Famous Roast Pork and Beef,1007 S 9th St,Philadelphia,PA,19147.0,39.937345,-75.158118,4.0,27,0,"{'RestaurantsReservations': 'False', 'Restaura...","Restaurants, Delis","{'Monday': '7:30-15:0', 'Tuesday': '7:30-15:0'...",Restaurants


### add business name, city & categories information back to the review df

In [25]:
# Attach name / city / stars / categories to the per-business topic weights.
# The merge uses the default inner join on business_id.
business_info = restaurant_df[['business_id', 'name', 'city', 'stars', 'categories']]

pos_reviews_df = pos_reviews_df.merge(business_info, on='business_id')
neg_reviews_df = neg_reviews_df.merge(business_info, on='business_id')


In [26]:
# Show one merged row: topic weights followed by the business details
pos_reviews_df.head(1)

Unnamed: 0,business_id,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,name,city,stars,categories
0,-2-ih3mE8KPyeKVIzpBfPQ,0.169587,0.0,0.0,0.014051,0.144997,0.671364,SkyGarten,Philadelphia,3.5,"American (Traditional), Bars, Restaurants, Nig..."


In [27]:
# Put the descriptive business columns first and the six topic weights last
# (the example further down reads the weights as the last six columns).
column_order = ['business_id', 'name', 'categories', 'city', 'stars'] + \
               ['topic_%d' % number for number in range(1, 7)]
pos_reviews_df = pos_reviews_df[column_order]
neg_reviews_df = neg_reviews_df[column_order]
pos_reviews_df.head(1)

Unnamed: 0,business_id,name,categories,city,stars,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6
0,-2-ih3mE8KPyeKVIzpBfPQ,SkyGarten,"American (Traditional), Bars, Restaurants, Nig...",Philadelphia,3.5,0.169587,0.0,0.0,0.014051,0.144997,0.671364


### With the new pos_reviews_df, we can check how closely any restaurant in this dataframe matches each topic.

Let's take the restaurant 'SkyGarten' as an example.

In [33]:
# Look up one business and print its average weight for every positive topic.
example_business_id = '-2-ih3mE8KPyeKVIzpBfPQ'
# The six topic weights are the last six columns of pos_reviews_df
topics_weights = pos_reviews_df[pos_reviews_df.business_id == example_business_id].iloc[:,-6:].values
example_name = restaurant_df[restaurant_df.business_id == example_business_id]['name'].values[0]
print(example_name + ' - Positive Reviews topics')
print('-'*50)
for n, label in pos_topics.items():
    print('Topic {}: {:28} -> {:.4f}'.format(n+1, label, topics_weights[0][n]))

SkyGarten - Positive Reviews topics
--------------------------------------------------
Topic 1: order/delicious/dish         -> 0.1696
Topic 2: sandwich/cheese/steak/pork   -> 0.0000
Topic 3: pizza/food                   -> 0.0000
Topic 4: place/market                 -> 0.0141
Topic 5: service/staff                -> 0.1450
Topic 6: happy_hour/beer              -> 0.6714


---

### Export to CSV

In [94]:
# Write the per-business topic tables to disk. to_csv keeps its defaults,
# so the row index is written as an unnamed first column.
exports = (
    (pos_reviews_df, 'Positive_Restaurant_Topics.csv'),
    (neg_reviews_df, 'Negative_Restaurant_Topics.csv'),
)
for frame, csv_name in exports:
    frame.to_csv(csv_name)