**Importing Libraries**

In [4]:
import numpy as np # math
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

**Connecting to Yelp Dataset**

In [5]:
from google.colab import drive

drive.mount('/gdrive')
%cd /gdrive

Mounted at /gdrive
/gdrive


In [6]:
%cd ./MyDrive/

/gdrive/MyDrive


In [7]:
!ls

'11th and 12th Grade'
'[CDO] V1 Resume.docx'
'Colab Notebooks'
"Copy of NEUR 250 Questions for Unit 3 test, F'21.gdoc"
'[Draft] V1 Resume.docx'
 Extramural
'Fall 2019'
'Fall 2020'
'Fall 2021'
'Friday Tour Guide Email Template.gdoc'
'[Incomplete] ARTS 107.02 Final Project.mp4'
 Resume_Apple.pdf
'Resume_Vikas Gudhe.pdf'
'Sayl Important Stats'
'Sayl Pitch Deck.pdf'
'Spring 2020'
'Spring 2021'
'Wednesday Tour Guide Email Template.gdoc'


In [8]:
%cd ./'Fall 2021'/'IPHS 200.01'/'Final Project'

/gdrive/MyDrive/Fall 2021/IPHS 200.01/Final Project


In [9]:
yelp_ds = 'yelp_academic_dataset_review.json'

**Cleaning up/Preprocessing Dataset**

In [12]:
import json
import string

reviewtext = []

with open(yelp_ds) as reviews:
        for index,review in enumerate(reviews):
            tupl = (json.loads(review)['text'], json.loads(review)['user_id'], json.loads(review)['business_id'])
            reviewtext.append(tupl)
            if index > 500:
                break

# clean the text for tokenizing
def cleaner(text):
    #text = text.replace(".", " fullstop ")
    #text = text.replace(",", " comma ")
    text = "".join(v for v in text if v not in string.punctuation).lower()
    text = text.encode("utf8").decode("ascii",'ignore')
    text = text.replace("\n", " ")
    return text

cleaned_text = [cleaner(text[0]) for text in reviewtext] #cleaned text of singular reviews
cleaned_text_rest_dict = dict() #restaurant_id : all_reviews_grouped pairing
for text in reviewtext:
  if text[2] in cleaned_text_rest_dict:
    cleaned_text_rest_dict[text[2]] += (text[0] + "<REVIEW_END>")
  else:
    cleaned_text_rest_dict[text[2]] = text[0]

# cleaning up combined text of reviews in associated restaurant id
for key in cleaned_text_rest_dict:
  cleaned_text_rest_dict[key] = cleaner(cleaned_text_rest_dict[key])

cleaned_text_rest_ref = [(busId, cleaned_text_rest_dict[busId]) for busId in cleaned_text_rest_dict] #cleaned text of combined reviews grouped by business_id 
cleaned_text_rest = [tupl[1] for tupl in cleaned_text_rest_ref] #(stripping away keys) - each string is combined all reviews for restaurant (same order as clean_text_rest_ref with business_id:combined reviews as list)
cleaned_text_ids = [(cleaner(text[0]), text[1], text[2]) for text in reviewtext] #list of tupls - text[0] = cleaned review text, text[1] = user_id, text[2] = business_id assocaited with specific review

In [13]:
print(cleaned_text_rest_ref)

[('buF9druCkbuXLX526sGELQ', 'apparently prides osteria had a rough summer as evidenced by the almost empty dining room at 630 on a friday night however new blood in the kitchen seems to have revitalized the food from other customers recent visits waitstaff was warm but unobtrusive by 8 pm or so when we left the bar was full and the dining room was much more lively than it had been perhaps beverly residents prefer a later seating   after reading the mixed reviews of late i was a little tentative over our choice but luckily there was nothing to worry about in the food department we started with the fried dough burrata and prosciutto which were all lovely then although they dont offer half portions of pasta we each ordered the entree size and split them we chose the tagliatelle bolognese and a four cheese filled pasta in a creamy sauce with bacon asparagus and grana frita both were very good we split a secondi which was the special berkshire pork secreto which was described as a pork skir

**Review Sentiment Analysis**

In [None]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l[K     |██▋                             | 10 kB 22.7 MB/s eta 0:00:01[K     |█████▏                          | 20 kB 28.4 MB/s eta 0:00:01[K     |███████▉                        | 30 kB 14.3 MB/s eta 0:00:01[K     |██████████▍                     | 40 kB 11.1 MB/s eta 0:00:01[K     |█████████████                   | 51 kB 5.7 MB/s eta 0:00:01[K     |███████████████▋                | 61 kB 6.3 MB/s eta 0:00:01[K     |██████████████████▏             | 71 kB 6.0 MB/s eta 0:00:01[K     |████████████████████▉           | 81 kB 6.5 MB/s eta 0:00:01[K     |███████████████████████▍        | 92 kB 5.0 MB/s eta 0:00:01[K     |██████████████████████████      | 102 kB 5.5 MB/s eta 0:00:01[K     |████████████████████████████▋   | 112 kB 5.5 MB/s eta 0:00:01[K     |███████████████████████████████▏| 122 kB 5.5 MB/s eta 0:00:01[K     |████████████████████████████████| 125 kB 5.5 M

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

vader_sa = SentimentIntensityAnalyzer()