In [2]:
import pickle
import pandas as pd
import numpy as np
import os

In [3]:
def predict_topics(location, model, vectorizer, source):
    df = pd.read_csv(location)
    if(source == 'Google'):
        # Retrieve only reviews column
        df = df['text']
        print(location.split('/')[-1])
    elif(source == 'Yelp'):
        df = df['review']
        print(location.split('/')[-1])

    # Change all empty strings to nan
    df.replace('', np.nan, inplace=True)
    # Remove all empty rows
    df.dropna(inplace=True)

    # Transform the reviews using vectorizer
    text = vectorizer.transform(df)
    # Transform text using model
    scores = model.transform(text)
    # Retrieve topics from model
    topics = pd.DataFrame(model.components_)
    # Change rows to Topic and number
    topics.index = ['Topic ' + str(i) for i in range(model.n_components)]

    # Get the text from words
    words = np.array(vectorizer.get_feature_names())
    topic_words = []
    # Go through values in each cluster
    for i in model.components_:
        # Inversing values and return indices of sorted 
        locs = (-i).argsort()
        # Add the elements from the array along the mentioned axis and indices
        topic_words.append(words.take(locs))

    # Convert list to df
    words_df = pd.DataFrame(topic_words)
    # Add Topic to each column
    words_df.index = ['Topic '+ str(i) for i in range(words_df.shape[0])]

    # Find the highest value and return the column's value
    topic = words_df.iloc[1:11, np.argmax(scores)].values.tolist()
    print(topic)
    return topic

In [4]:
# Location to loop through Yelp reviews
# Go to folder
def process_yelp(model, vectorizer):
    directory = "../../dataset/yelp/processed_reviews/"
    source = 'Yelp'
    for filename in os.listdir(directory):
        # Find all files that end with CSV
        if filename.endswith(".csv"):
            name = filename.replace("-hoboken_processed.csv", "")
            topics =  predict_topics(os.path.join(directory, filename), model, vectorizer, source)
            if name in restaurant_with_topics:
                restaurant_with_topics[name] = restaurant_with_topics[name] + topics
            else:
                restaurant_with_topics[name] = topics

In [5]:
# Location to loop through Google reviews
# Go to folder
def process_google(model, vectorizer):
    directory = "../../dataset/google/processed_reviews/"
    source = 'Google'
    for filename in os.listdir(directory):
        # Find all files that end with CSV
        if filename.endswith(".csv"):
            name = filename.replace("_processed.csv", "")
            topics =  predict_topics(os.path.join(directory, filename), model, vectorizer, source)
            if name in restaurant_with_topics:
                restaurant_with_topics[name] = restaurant_with_topics[name] + topics
            else:
                restaurant_with_topics[name] = topics

In [6]:
# Load Model and vectorizer
model = pickle.load(open('../../model/lda_model.sav', 'rb'))
vectorizer = pickle.load(open('../../model/lda_vectorizer.sav', 'rb'))
restaurant_with_topics = {}

In [7]:
# Used to create CSV to store topics
def write_into_csv(filename, header):
    # Process Google and Yelp Reviews
    process_google(model, vectorizer)
    process_yelp(model, vectorizer)
    rest_list = []
    for restaurant, topics in restaurant_with_topics.items():
        rest_list.append([restaurant, ",".join(topics)])

    with open(filename + ".csv", 'w', encoding='UTF8') as f:
        # Create header
        f.write(header)
        f.write('\n')
        # Add topics
        for name in rest_list:
            f.write("|".join(name))
            f.write('\n')

write_into_csv("../../dataset/restaurant_with_topics_lda", "name,topics")

7-stars-pizzeria_processed.csv
['visiting', 'tot', 'limited', 'wine', 'hungry', 'matter', 'barely', 'rather', 'easily']
80-river-bar-kitchen_processed.csv
['enjoy', 'since', 'protein', 'seated', 'ca', 'said', 'perfect', 'even', 'next']
8th-street-tavern_processed.csv
['extremely', 'overcooked', 'husband', 'try', 'yes', 'love', 'full', 'sausage', 'disappointed']
acai-ya-later_processed.csv
['fall', 'undercooked', 'syrup', 'shocked', 'absolutely', 'code', 'least', 'honey', 'smaller']
acme-markets_processed.csv
['immediately', '4th', 'tabouleh', 'drinking', 'body', 'dancing', 'bonito', 'man', 'frank']
alessio-s-cafe-gelato-pizza_processed.csv
['ok', 'might', 'black', 'kept', 'joint', 'ask', 'world', 'snack', 'salt']
ali-baba-restaurant_processed.csv
['twice', 'note', 'expecting', 'case', 'word', 'indoors', 'combination', 'art', 'churrasco']
amanda-bananas_processed.csv
['crowd', 'portion', 'come', 'min', 'old', 'experience', 'flavor', 'croissant', 'server']
amanda-s_processed.csv
['please

['area', 'shrimp', 'time', 'two', 'little', 'two', 'garlic', 'make', 'friend']
honeygrow_processed.csv
['okay', 'exactly', 'mr', 'happen', 'course', 'saying', 'noodle', 'convenient', 'brussel']
house-of-que_processed.csv
['never', 'wanted', 'street', 'thought', 'overpriced', 'expect', 'average', 'solid', 'room']
hudson-river-waterfront-walkway_processed.csv
['chilled', 'yeah', 'foam', 'lacking', 'honor', 'fear', 'regardless', 'likely', 'indoor']
hudson-table-hoboken_processed.csv
['old', 'actually', 'ginger', 'wo', 'dessert', 'small', 'cash', 'table', 'sat']
il-tavolo-di-palmisano_processed.csv
['list', 'japanese', 'pineapple', 'understand', 'enough', 'take', 'biryani', 'large', 'prosciutto']
illuzion_processed.csv
['way', 'going', 'filling', 'already', 'eating', 'dirty', 'whole', 'warm', 'ravioli']
imposto-s-pizza_processed.csv
['repeat', 'bark', 'scrumptious', 'overcharge', 'comment', 'power', 'bombay', 'hibiscus', 'mild']
insomnia-cookies_processed.csv
['irish', 'week', 'pick', 'dis

['wednesday', 'anything', 'added', 'dinner', 'cold', 'eating', 'stick', 'stopped', 'think']
sirenetta-seafood-raw-bar_processed.csv
['much', 'worth', 'new', 'better', 'next', 'large', 'cutlet', 'stop', 'clam']
sorellina_processed.csv
['outstanding', 'plain', 'grill', 'talk', 'save', 'cheap', 'chat', 'veggie', 'linguine']
south-lions-dim-sum-tea_processed.csv
['price', 'probably', 'first', 'since', 'nothing', 'decided', 'saturday', 'side', 'stuffed']
south-street-fish-ramen-co_processed.csv
['remember', 'real', 'fu', 'boyfriend', 'expected', 'everywhere', 'remember', 'plain', 'recommended']
souzafit-restaurant-hoboken_processed.csv
['highly', 'deal', 'honey', 'open', 'bought', 'crowd', 'might', 'baked', 'creamy']
sri-thai_processed.csv
['dont', 'awhile', 'ago', 'middle', 'covid', 'dim', 'bet', 'upon', 'coconut']
stingray-lounge_processed.csv
['nicely', 'afternoon', 'guy', 'machine', 'sandy', 'outdoors', 'ago', 'welcome', 'owner']
subway_processed.csv
['must', 'dino', 'canai', 'lady', 'k

['wife', 'past', 'heat', 'worked', 'expect', 'jersey', 'type', 'relaxing', 'dined']
bwè-kafe-hoboken_processed.csv
['party', 'rib', 'little', 'still', 'two', 'want', 'taste', 'always', 'amazing']
cafe-michelina-hoboken_processed.csv
['town', 'regular', 'container', 'bring', 'came', 'liked', 'far', 'muffin', 'bite']
cafe-vista-hoboken-2_processed.csv
['ca', 'first', 'fruit', 'business', 'around', 'sure', 'sure', 'mimosa', 'right']
carlos-bakery-hoboken-9_processed.csv
['got', 'rare', 'thing', 'money', 'need', 'overall', 'though', 'recommend', 'mussel']
carpe-diem-pub-and-restaurant-hoboken-2_processed.csv
['booked', 'vinegar', 'replaced', 'happening', 'cooked', 'poorly', 'familiar', 'moving', 'care']
chango-kitchen-hoboken_processed.csv
['tonight', 'certainly', 'sadly', 'disrespectful', 'completely', 'eaten', 'fat', 'heart', 'available']
chicken-factory-hoboken-2_processed.csv
['take', 'disappointing', 'real', 'whole', 'couple', 'trying', 'soggy', 'to', 'friday']
choc-o-pain-hoboken-9_p

['friendly', 'chicken', 'soup', 'took', 'thing', 'well', 'get', 'shop', 'well']
losurdos-italian-bakery-and-deli-hoboken_processed.csv
['went', 'onion', 'great', 'experience', 'much', 'even', 'best', 'definitely', 'shrimp']
louise-and-jerrys-hoboken_processed.csv
['believe', 'container', 'smooth', 'security', 'service', 'section', 'competition', 'kid', 'fell']
luca-brasis-deli-hoboken-151_processed.csv
['bin14', 'connoisseur', 'overwhelmed', 'indication', 'cracked', 'reviewed', 'class', 'bc', 'semolina']
m-and-p-biancamano-hoboken_processed.csv
['whatever', 'sadly', 'hainanese', 'stop', 'grew', 'cost', 'whatever', 'paper', 'arancini']
madd-hatter-hoboken-3_processed.csv
['mediocre', 'lean', 'remember', 'grabbed', 'arugula', 'buddy', 'hoping', 'balance', 'extensive']
malibu-diner-hoboken_processed.csv
['delicious', 'avocado', 'salad', '15', 'right', 'see', 'much', 'bacon', 'main']
mamouns-falafel-hoboken-hoboken-2_processed.csv
['cent', 'flavored', 'count', 'remind', '5th', 'frites', 'e

['older', 'drop', 'picture', 'professional', 'saving', 'write', 'melted', 'woman', 'law']
the-ale-house-hoboken-2_processed.csv
['awesome', 'cooked', 'go', 'server', 'got', 'room', 'la', 'staff', 'nice']
the-brass-rail-hoboken-hoboken_processed.csv
['bon', 'redeeming', 'culinary', 'vomited', 'est', 'dealt', 'fiona', 'lambrusco', 'definition']
the-brick-fire-baked-pizza-hoboken-3_processed.csv
['que', 'making', 'melted', 'fair', 'broke', 'im', 'festival', 'sidewalk', 'fully']
the-cuban-restaurant-and-bar-hoboken-2_processed.csv
['wednesday', 'anything', 'added', 'dinner', 'cold', 'eating', 'stick', 'stopped', 'think']
the-hive-hoboken_processed.csv
['thursday', 'corn', 'authentic', 'pick', 'bread', 'cool', 'tender', 'salmon', 'share']
the-little-grocery-hoboken-3_processed.csv
['fare', 'added', 'reason', 'dropped', 'basically', 'cover', 'soooo', 'press', 'getting']
the-madison-bar-and-grill-hoboken_processed.csv
['180', 'sunset', 'headphone', 'tater', 'puppy', 'sane', 'guarantee', '95',