In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
import time

## Create DataFrame of Target Restaurants Reviewed by NY Times

In [2]:
# Landing page that lists the NYT restaurant reviews to be scraped
review_url = "https://www.nytimes.com/reviews/dining"

### Scrape HTML from NYT Dining Page

In [3]:
def create_nyt_soup_object(url, wait_seconds=100, click_pause=3):
    """Open ``url`` in Chrome, expand the review list, and parse the final page.

    The page's 'Show More' button is clicked repeatedly so that additional
    reviews are loaded before the HTML is handed to BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to load.
    wait_seconds : int or float, default 100
        Maximum time to wait for the 'Show More' button to be present after
        the page is requested.
    click_pause : int or float, default 3
        Seconds to sleep after each click before clicking again.

    Returns
    -------
    BeautifulSoup
        The page source parsed with the ``'html.parser'`` backend.

    Raises
    ------
    TimeoutException
        If no 'Show More' button can be located within ``wait_seconds``.
    """
    show_more_xpath = "//button[.='Show More']"

    # Selenium Driver
    dr = webdriver.Chrome()

    # try/finally guarantees the browser is shut down even if loading,
    # waiting, clicking, or parsing raises.
    try:
        dr.get(url)

        # Building a WebDriverWait does not pause by itself; .until() is what
        # polls, so the lookup no longer races the page load.
        # find_element(By.XPATH, ...) replaces the legacy find_element_by_xpath helper.
        button = WebDriverWait(dr, wait_seconds).until(
            lambda driver: driver.find_element(By.XPATH, show_more_xpath)
        )

        while True:
            try:
                # Click button, then pause before the next click
                button.click()
                time.sleep(click_pause)

            # The loop's only exit: the held button reference goes stale.
            # Per the original author's note this happens once the end of
            # the reviews is reached.
            except StaleElementReferenceException:
                break

        soup = BeautifulSoup(dr.page_source, 'html.parser')

    finally:
        # quit() ends the whole browser session, so a separate close() is not needed
        dr.quit()

    return soup
    
# Launches a Chrome session via Selenium and returns the parsed reviews page
nyt_soup_object = create_nyt_soup_object(url=review_url)

### Scrape HTML for Restaurant, Review, and URLs and Create a List of Dictionaries

In [4]:
def nyt_page_scrape(soup_object, verbose=True):
    """Build one record per rated review found in the parsed dining page.

    Each ``<span itemprop="reviewRating">`` element anchors a review; all other
    fields are looked up relative to the ``<div>`` that encloses that span.

    Parameters
    ----------
    soup_object
        Parsed page offering the BeautifulSoup-style ``find_all`` / ``find`` /
        ``find_parent`` / ``get_text`` interface.
    verbose : bool, default True
        When true, print one progress line per review (the original
        behaviour). Pass ``False`` to keep long scrapes from flooding the
        cell output.

    Returns
    -------
    list of dict
        One dict per review with the keys ``name``, ``rating``, ``reviewer``,
        ``review_date``, ``neighborhood``, ``cuisine``, ``review_link_1`` and
        ``review_link_2``. ``neighborhood`` is the string ``'NaN'`` when the
        lookup for it raises ``AttributeError``.

    Notes
    -----
    Only the neighborhood lookup is guarded; an error while reading any other
    field propagates to the caller.
    """
    records = []

    for rating_tag in soup_object.find_all('span', itemprop='reviewRating'):

        # Resolve the shared ancestors once instead of re-walking the tree
        # for every field.
        card = rating_tag.find_parent('div')
        article = card.find_parent('article')

        r_name = card.find('h2').get_text()
        r_rating = rating_tag.find('span').get_text()
        r_link_1 = card.find_parent('div').find('a', href=True)['href']
        r_link_2 = article.find('footer').find('a', href=True)['href']
        r_cuisine = card.find('li', itemprop='servesCuisine').get_text()

        try:
            r_neighborhood = card.find('li', itemprop='addressLocality').get_text()
        except AttributeError:
            # String placeholder kept as-is so existing consumers of the data
            # see the same value as before.
            r_neighborhood = 'NaN'

        r_reviewer = card.find('p', itemprop='author').find('span').get_text()
        r_review_date = article.find('time').get_text()

        if verbose:
            print(r_name, ' --> ', r_rating, ', scraped')

        records.append({'name': r_name,
                        'rating': r_rating,
                        'reviewer': r_reviewer,
                        'review_date': r_review_date,
                        'neighborhood': r_neighborhood,
                        'cuisine': r_cuisine,
                        'review_link_1': r_link_1,
                        'review_link_2': r_link_2})

    return records

# Turn the parsed page into a list with one dict per review
reviews = nyt_page_scrape(soup_object=nyt_soup_object)

Hanon  -->  2 star , scraped
Del Posto  -->  3 star , scraped
The Freakin Rican  -->  1 star , scraped
Wayan  -->  2 star , scraped
Niche  -->  1 star , scraped
Haenyeo  -->  2 star , scraped
Standard Grill  -->  2 star , scraped
Violet  -->  1 star , scraped
Odo  -->  3 star , scraped
Cka Ka Qellu  -->  2 star , scraped
Madame Vo BBQ  -->  1 star , scraped
Oxalis  -->  1 star , scraped
Bistro Pierre Lapin  -->  1 star , scraped
Cherry Point  -->  2 star , scraped
Benno  -->  3 star , scraped
Bang Bar  -->  1 star , scraped
Hwaban  -->  2 star , scraped
Bluebird London  -->  0.5 star , scraped
The Four Seasons Restaurant  -->  1 star , scraped
Saint Julivert Fisherie  -->  1 star , scraped
Adda Indian Canteen  -->  2 star , scraped
Misi  -->  3 star , scraped
Mama’s Too  -->  1 star , scraped
Henry at Life Hotel by JJ  -->  1 star , scraped
Hunan Slurp  -->  2 star , scraped
Atomix  -->  3 star , scraped
Manhatta  -->  1 star , scraped
Village Cafe  -->  2 star , scraped
Kopitiam  --> 

Candle 79  -->  1 star , scraped
Socarrat Paella Bar  -->  1 star , scraped
Delicatessen  -->  1 star , scraped
James  -->  1 star , scraped
Michael's  -->  0.75 star , scraped
Perbacco  -->  2 star , scraped
Scarpetta  -->  3 star , scraped
Szechuan Gourmet  -->  2 star , scraped
Gottino  -->  1 star , scraped
Terroir  -->  1 star , scraped
Artisanal  -->  2 star , scraped
Commerce  -->  1 star , scraped
La Sirène  -->  1 star , scraped
Bar Boulud  -->  2 star , scraped
Second Avenue Deli  -->  1 star , scraped
Ilili  -->  1 star , scraped
Blue Ribbon Sushi Bar and Grill  -->  2 star , scraped
Barbuto  -->  1 star , scraped
Harry Cipriani  -->  0.25 star , scraped
Moim  -->  1 star , scraped
Gemma  -->  1 star , scraped
Peter Luger Steak House  -->  2 star , scraped
Soto  -->  2 star , scraped
Rayuela  -->  1 star , scraped
Cafe Boulud  -->  3 star , scraped
15 East  -->  2 star , scraped
Ushiwakamaru  -->  1 star , scraped
Landmarc  -->  1 star , scraped
Katz's Delicatessen  -->  1 s

In [5]:
# Preview only the first few records; displaying the whole list dumps every
# scraped review into the cell output.
reviews[:3]

[{'name': 'Hanon',
  'rating': '2 star',
  'reviewer': 'Pete Wells',
  'review_date': 'May 21, 2019',
  'neighborhood': 'Williamsburg',
  'cuisine': 'Japanese',
  'review_link_1': 'https://www.nytimes.com/2019/05/21/dining/hanon-review.html',
  'review_link_2': 'https://www.nytimes.com/2019/05/21/dining/hanon-review.html?rref=collection%2Fcollection%2Frestaurant-guide'},
 {'name': 'Del Posto',
  'rating': '3 star',
  'reviewer': 'Pete Wells',
  'review_date': 'May 14, 2019',
  'neighborhood': 'Chelsea',
  'cuisine': 'Italian',
  'review_link_1': 'https://www.nytimes.com/2019/05/14/dining/del-posto-review-pete-wells.html',
  'review_link_2': 'https://www.nytimes.com/2019/05/14/dining/del-posto-review-pete-wells.html?rref=collection%2Fcollection%2Frestaurant-guide'},
 {'name': 'The Freakin Rican',
  'rating': '1 star',
  'reviewer': 'Pete Wells',
  'review_date': 'May 7, 2019',
  'neighborhood': 'Astoria',
  'cuisine': 'Caribbean, Latin American',
  'review_link_1': 'https://www.nytimes.

In [6]:
# Number of review records returned by the scrape
len(reviews)

539

In [7]:
# One row per scraped review, one column per record key
df = pd.DataFrame(data=reviews)

In [8]:
# Column sequence used for the reviews table
col_order = [
    'name',
    'rating',
    'review_date',
    'reviewer',
    'neighborhood',
    'cuisine',
    'review_link_1',
    'review_link_2',
]

In [9]:
# Keep the columns named in col_order, in that sequence; rebinds df to the result
df = df[col_order]

In [10]:
# Show a short preview rather than the entire reviews table
df.head(10)

Unnamed: 0,name,rating,review_date,reviewer,neighborhood,cuisine,review_link_1,review_link_2
0,Hanon,2 star,"May 21, 2019",Pete Wells,Williamsburg,Japanese,https://www.nytimes.com/2019/05/21/dining/hano...,https://www.nytimes.com/2019/05/21/dining/hano...
1,Del Posto,3 star,"May 14, 2019",Pete Wells,Chelsea,Italian,https://www.nytimes.com/2019/05/14/dining/del-...,https://www.nytimes.com/2019/05/14/dining/del-...
2,The Freakin Rican,1 star,"May 7, 2019",Pete Wells,Astoria,"Caribbean, Latin American",https://www.nytimes.com/2019/05/07/dining/the-...,https://www.nytimes.com/2019/05/07/dining/the-...
3,Wayan,2 star,"April 23, 2019",Pete Wells,NoLIta,Indonesian,https://www.nytimes.com/2019/04/23/dining/waya...,https://www.nytimes.com/2019/04/23/dining/waya...
4,Niche,1 star,"April 16, 2019",Pete Wells,Lower East Side,Japanese,https://www.nytimes.com/2019/04/16/dining/nich...,https://www.nytimes.com/2019/04/16/dining/nich...
5,Haenyeo,2 star,"April 9, 2019",Pete Wells,Park Slope,Korean,https://www.nytimes.com/2019/04/09/dining/haen...,https://www.nytimes.com/2019/04/09/dining/haen...
6,Standard Grill,2 star,"April 2, 2019",Pete Wells,West Village,American,https://www.nytimes.com/2019/04/02/dining/stan...,https://www.nytimes.com/2019/04/02/dining/stan...
7,Violet,1 star,"March 26, 2019",Pete Wells,East Village,"Italian, Pizza",https://www.nytimes.com/2019/03/26/dining/viol...,https://www.nytimes.com/2019/03/26/dining/viol...
8,Odo,3 star,"March 19, 2019",Pete Wells,Flatiron district,Japanese,https://www.nytimes.com/2019/03/19/dining/odo-...,https://www.nytimes.com/2019/03/19/dining/odo-...
9,Cka Ka Qellu,2 star,"March 12, 2019",Pete Wells,Belmont,Eastern European,https://www.nytimes.com/2019/03/12/dining/cka-...,https://www.nytimes.com/2019/03/12/dining/cka-...


In [11]:
# Save the reviews table as UTF-8 CSV to 'reviews.csv' (a relative path).
# NOTE(review): index=False is not passed, so the row index is presumably
# written as an extra unnamed first column — confirm downstream readers expect it.
df.to_csv('reviews.csv', encoding='utf-8')

In [12]:
# Count how many times each reviewer gave each rating value
df.groupby('reviewer')['rating'].value_counts()

reviewer            rating   
Amanda Hesser       2 star         3
                    0.75 star      1
                    1 star         1
Bryan Miller        1 star         5
                    2 star         4
Eric Asimov         1 star         6
                    2 star         4
                    3 star         1
Frank Bruni         1 star        59
                    2 star        36
                    3 star         9
                    0.75 star      7
                    0.25 star      2
Frank J. Prial      2 star         1
Julia Moskin        2 star         1
Ligaya Mishan       1 star         1
                    2 star         1
Marian Burros       2 star         3
                    0.5 star       2
                    1 star         1
Mimi Sheraton       1 star         1
Pete Wells          2 star       131
                    1 star        93
                    3 star        39
                    0.75 star      9
                    0.5 star       5
        

In [23]:
# Distinct restaurant names among the rows whose reviewer is Pete Wells
len(df['name'][df['reviewer'] == 'Pete Wells'].unique())

282