In [1]:
import json
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as wait

In [12]:
import os

# Given a URL, scrape the page for a restaurant review

## Read in reviews.csv for URLs to pull

In [6]:
# Load the review index; the CSV column named 'Unnamed: 0' becomes the row index.
df = pd.read_csv('reviews.csv', index_col='Unnamed: 0' )

In [7]:
# Preview the first rows; the review_link_* columns hold the URLs to download.
df.head()

Unnamed: 0,name,rating,review_date,reviewer,neighborhood,cuisine,review_link_1,review_link_2
0,Hanon,2 star,"May 21, 2019",Pete Wells,Williamsburg,Japanese,https://www.nytimes.com/2019/05/21/dining/hano...,https://www.nytimes.com/2019/05/21/dining/hano...
1,Del Posto,3 star,"May 14, 2019",Pete Wells,Chelsea,Italian,https://www.nytimes.com/2019/05/14/dining/del-...,https://www.nytimes.com/2019/05/14/dining/del-...
2,The Freakin Rican,1 star,"May 7, 2019",Pete Wells,Astoria,"Caribbean, Latin American",https://www.nytimes.com/2019/05/07/dining/the-...,https://www.nytimes.com/2019/05/07/dining/the-...
3,Wayan,2 star,"April 23, 2019",Pete Wells,NoLIta,Indonesian,https://www.nytimes.com/2019/04/23/dining/waya...,https://www.nytimes.com/2019/04/23/dining/waya...
4,Niche,1 star,"April 16, 2019",Pete Wells,Lower East Side,Japanese,https://www.nytimes.com/2019/04/16/dining/nich...,https://www.nytimes.com/2019/04/16/dining/nich...


## Pull HTML from URLs in Reviews Dataframe and save

### The code in this section was borrowed and adapted from Joseph Thurman. http://www.josephthurman.com/blog_index.html

In [9]:
# NYT website has custom error page if it can't find the URL - this finds such pages so they can be re-downloaded
def find_server_error(bs):
    """
    Tell whether a parsed page is the NYT custom "500 - Server Error" page.

    bs is a parsed page exposing find_all (the notebook passes a BeautifulSoup object).
    Returns True when at least one <meta> tag has content="500 - Server Error",
    otherwise False.
    """
    error_markers = bs.find_all('meta', {'content': '500 - Server Error'})
    return bool(error_markers)

In [53]:
def get_reviews(to_fetch, n=10, timeout=30):
    """
    Download review pages from the NYT website and save them as HTML files.

    to_fetch is an iterable of URLs to retrieve (a list or a pandas Series both work).
    n is the maximum number of passes over the URLs that have not been saved yet.
    timeout is the number of seconds to wait for each HTTP response.

    Each page that downloads cleanly is written to ./reviews/review<k>.html, where k
    counts the saved pages from 0 in the order they were saved. A URL is queued for
    another pass when the request fails or when the response is the NYT custom
    "500 - Server Error" page. URLs still failing after n passes are printed.

    The URLs of the saved pages are written, in file order, as a JSON list to
    ./reviews/url_list.txt, so entry k of that list is the source of review<k>.html.

    Returns None.
    """
    os.makedirs('reviews', exist_ok=True)

    final_url_list = []
    pending = list(to_fetch)
    attempts = 0

    # "attempts < n" (not "<=") so that n really is the maximum number of passes.
    while pending and attempts < n:
        refetch_list = []
        for review_url in pending:
            try:
                review = requests.get(review_url, timeout=timeout)
            except requests.RequestException:
                # A network failure should not abort the whole batch; retry next pass.
                refetch_list.append(review_url)
                continue

            parsed_review = BeautifulSoup(review.content, 'html.parser')
            if find_server_error(parsed_review):
                refetch_list.append(review_url)
                continue

            # The file index is the number of pages saved so far, which keeps the
            # file names aligned with the positions in final_url_list.
            file_name = './reviews/review' + str(len(final_url_list)) + '.html'
            # Explicit encoding: the pages contain non-ASCII text and the platform
            # default encoding is not guaranteed to be able to represent it.
            with open(file_name, 'w', encoding='utf-8') as newfile:
                newfile.write(str(parsed_review))
            final_url_list.append(review_url)

        pending = refetch_list
        attempts += 1

    if pending:
        print("Could not successfully access the following reviews:")
        for url in pending:
            print(url)

    with open('./reviews/url_list.txt', 'w') as url_output:
        json.dump(final_url_list, url_output)


In [54]:
# Download every page listed in the review_link_1 column (one HTTP request per URL);
# the pages and url_list.txt are written under ./reviews/.
get_reviews(df['review_link_1'])

## The parser below works for recent reviews. Still need to find out when the CSS classes changed

In [3]:
# def get_article_text_new(soup):
#     rest_name = soup.find('div', class_="css-83hgbf").find('dt').get_text()
#     rev_date = soup.find('time', class_="css-rs1psd e16638kd0").get_text()
#     article = []
#     for p in soup.find_all('p', class_='css-18icg9x evys1bk0'):
#         article.append(p.get_text())
#     ' '.join(article)
#     return {'name': rest_name, 'review_date': rev_date, 'review_text': article}
     

In [124]:
def get_article_modern(soup):
    """
    Parse one saved NYT restaurant review page into a flat dictionary.

    This parser targets the page layout that has an
    <aside class="review-details restaurant-details"> summary box and
    <p class="story-body-text story-content"> body paragraphs.

    soup is the parsed page (the notebook passes a BeautifulSoup object).

    Returns a dict with the keys name, review_date, reviewer, rating, neighborhood,
    critic_pick, atmosphere, sound, recommendations, menu, drinks, price, hours,
    reservations, review, article_id and keywords. critic_pick is a bool and review
    is the body paragraphs joined with single spaces. Of the remaining fields,
    reviewer, review_date, keywords, rating and every summary-box detail other than
    the name and neighborhood are float('nan') when the page does not provide them.

    Raises AttributeError when the summary box, the restaurant name, the neighborhood
    or the article identifier is missing; callers use that to detect pages written
    in a different layout.
    """
    missing = float('nan')

    def optional(lookup):
        # Evaluate a chain of find()/get_text() calls. When any find() in the chain
        # returns None the next attribute access raises AttributeError, which here
        # simply means the page does not carry that field.
        try:
            return lookup()
        except AttributeError:
            return missing

    # Page-level metadata lives in <meta name="..." content="..."> tags. Start from
    # NaN so that a page without one of them yields NaN instead of an unbound name.
    # As in a plain scan, the last matching tag wins.
    meta = {'author': missing, 'pdate': missing, 'news_keywords': missing}
    for tag in soup.find_all('meta'):
        meta_name = tag.get('name', None)
        if meta_name in ('author', 'pdate', 'news_keywords'):
            meta[meta_name] = tag.get('content', None).strip()

    # End-of-article summary box; everything below except the review text and the
    # article id is read from it.
    details = soup.find('aside', class_='review-details restaurant-details')

    name = details.find('h4').get_text()

    # Rating: star rating when present, otherwise the word rating, otherwise NaN.
    rating = missing
    for rating_class in ('critic-star-rating', 'critic-word-rating'):
        rating_tag = details.find('li', class_=rating_class)
        if rating_tag is not None:
            rating = rating_tag.get_text()
            break

    hood = details.find('p', itemprop='addressLocality').get_text()

    cpick = details.find('li', class_='critics-pick') is not None

    def labelled(label, itemprop):
        # Find the <span> whose text is the label, then the value <span> that sits
        # under the same parent element.
        return details.find('span', string=label).parent.find('span', itemprop=itemprop)

    atmosphere = optional(lambda: labelled('Atmosphere', 'review').get_text())
    sound = optional(lambda: labelled('Sound', 'review').get_text())
    recs = optional(lambda: labelled('Recommended Dishes', 'menu').get_text())
    menu_link = optional(lambda: labelled('Menu', 'menu').find('a').get('href'))
    drinks = optional(lambda: labelled('Drinks and Wine', 'menu').get_text())
    price = optional(lambda: details.find('span', itemprop='priceRange').get_text())
    hours = optional(lambda: details.find('time').get('datetime'))
    resis = optional(lambda: details.find('span', itemprop='acceptsReservations').get_text())

    # Review text: all body paragraphs joined with single spaces.
    article = ' '.join(
        p.get_text() for p in soup.find_all('p', class_='story-body-text story-content')
    )

    article_id = soup.find('meta', itemprop='identifier').get('content')

    return {'name': name,
            'review_date': meta['pdate'],
            'reviewer': meta['author'],
            'rating': rating,
            'neighborhood': hood,
            'critic_pick': cpick,
            'atmosphere': atmosphere,
            'sound': sound,
            'recommendations': recs,
            'menu': menu_link,
            'drinks': drinks,
            'price': price,
            'hours': hours,
            'reservations': resis,
            'review': article,
            'article_id': article_id,
            'keywords': meta['news_keywords']}

## Try to parse the HTML, and make a list of the files the parser doesn't work for

In [25]:
# reviews = []
# try_again = []

# for i in range(0,539):
#     with open('./reviews/review'+str(i)+'.html','r') as newfile:
#         soup = BeautifulSoup(newfile, 'html.parser')
    
#     try: 
#         review = get_article_text(soup)
#         reviews.append(review)
#     except:
#         file = './reviews/review'+str(i)+'.html'
#         try_again.append(file)

In [27]:
# NOTE(review): the loop that fills try_again is commented out in the cell above, so on a
# fresh kernel this name is undefined here; the output shown is from an earlier session.
try_again

['./reviews/review139.html',
 './reviews/review254.html',
 './reviews/review255.html',
 './reviews/review256.html',
 './reviews/review257.html',
 './reviews/review258.html',
 './reviews/review259.html',
 './reviews/review260.html',
 './reviews/review261.html',
 './reviews/review262.html',
 './reviews/review263.html',
 './reviews/review264.html',
 './reviews/review265.html',
 './reviews/review266.html',
 './reviews/review267.html',
 './reviews/review268.html',
 './reviews/review269.html',
 './reviews/review270.html',
 './reviews/review271.html',
 './reviews/review272.html',
 './reviews/review273.html',
 './reviews/review274.html',
 './reviews/review275.html',
 './reviews/review276.html',
 './reviews/review277.html',
 './reviews/review278.html',
 './reviews/review279.html',
 './reviews/review280.html',
 './reviews/review281.html',
 './reviews/review282.html',
 './reviews/review283.html',
 './reviews/review284.html',
 './reviews/review285.html',
 './reviews/review286.html',
 './reviews/re

In [28]:
# Count of files that failed to parse (presumably from the commented-out loop above - TODO confirm).
len(try_again)

286

In [130]:
reviews = []
try_again = []

# Parse the saved pages with get_article_modern; pages it cannot handle are collected
# in try_again for a later attempt with another parser.
# NOTE(review): 254 appears to be where the earlier try_again output starts listing
# consecutive failures and 539 is presumably the number of saved pages - confirm.
for i in range(254, 539):
    # Read as UTF-8 explicitly rather than relying on the platform default encoding.
    with open('./reviews/review' + str(i) + '.html', 'r', encoding='utf-8') as newfile:
        soup = BeautifulSoup(newfile, 'html.parser')

    try:
        review = get_article_modern(soup)
    except Exception:
        # "except Exception" (not a bare except) so the loop can still be interrupted
        # from the keyboard while any parsing failure is recorded.
        file = './reviews/review' + str(i) + '.html'
        try_again.append(file)
    else:
        reviews.append(review)

In [131]:
# Files that get_article_modern could not parse in the loop above.
try_again

['./reviews/review376.html',
 './reviews/review394.html',
 './reviews/review426.html',
 './reviews/review427.html',
 './reviews/review428.html',
 './reviews/review429.html',
 './reviews/review430.html',
 './reviews/review431.html',
 './reviews/review432.html',
 './reviews/review433.html',
 './reviews/review434.html',
 './reviews/review435.html',
 './reviews/review436.html',
 './reviews/review437.html',
 './reviews/review438.html',
 './reviews/review439.html',
 './reviews/review440.html',
 './reviews/review441.html',
 './reviews/review442.html',
 './reviews/review443.html',
 './reviews/review444.html',
 './reviews/review445.html',
 './reviews/review446.html',
 './reviews/review447.html',
 './reviews/review448.html',
 './reviews/review449.html',
 './reviews/review450.html',
 './reviews/review451.html',
 './reviews/review452.html',
 './reviews/review453.html',
 './reviews/review454.html',
 './reviews/review455.html',
 './reviews/review456.html',
 './reviews/review457.html',
 './reviews/re

In [132]:
# Spot-check the first few parsed records.
reviews[:5]

[{'name': 'Royal Seafood Restaurant',
  'review_date': '20121211',
  'reviewer': 'Pete Wells',
  'rating': '★',
  'neighborhood': 'Chinatown',
  'critic_pick': True,
  'atmosphere': 'A vast dining hall cheered up by shiny golden dragons, sparkly chandeliers and pink tablecloths.',
  'sound': 'Moderate.',
  'recommendations': 'Crispy fried diced pork ribs; braised crispy squab; //West Lake beef soup; salt and pepper squid; crispy fried chicken; deep-fried crispy flounder; house special baby lamb; oxtail with curry casserole; snow pea leaves in broth; olive with minced pork and bean curd; e-fu noodles with mushrooms.',
  'menu': 'http://www.singlepage.com/dunhuang-seafood-restaurant/menu',
  'drinks': 'Beer only.',
  'price': '$$ (moderate)',
  'hours': 'Daily, 8 a.m. to 10:30 p.m.',
  'reservations': 'Accepted only for parties of six or more. ',
  'review': 'EVERYBODY was having lobster. Before placing an order, Chinatown veterans look around the dining room to see what others are eatin

### Testing

In [114]:
# Load one saved page (review258, which is listed in the first try_again output) for manual inspection.
with open('./reviews/review258.html','r') as newfile:
    soup_old = BeautifulSoup(newfile, 'html.parser')

In [116]:
# Collect the text of every body paragraph of the test page, in document order.
article = [
    paragraph.get_text()
    for paragraph in soup_old.find_all('p', class_='story-body-text story-content')
]