In [1]:
import json
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as wait

In [12]:
import os

# Given a URL, scrape the page for a restaurant review

## Pull HTML from webpage and create BeautifulSoup object

In [2]:
def cook_soup(url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before requests gives up and raises.
        Without a timeout, requests.get can block indefinitely on a stalled
        connection.

    Returns
    -------
    BeautifulSoup
        The response body parsed with 'html.parser'. The HTTP status code is
        not checked here, so an error page is parsed like any other page.
    """
    webpage = requests.get(url, timeout=timeout)
    return BeautifulSoup(webpage.content, 'html.parser')

## The parser below works for recent reviews. TODO: find out when the CSS class names changed

In [3]:
def get_article_text(soup):
    """Extract the restaurant name, review date and review text from a page.

    The lookups depend on the CSS class names hard-coded below. If the page
    does not contain them, BeautifulSoup's find() is expected to return None
    and the chained call will raise AttributeError.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed review page.

    Returns
    -------
    dict
        'name'        -- text of the <dt> inside the "css-83hgbf" div
        'review_date' -- text of the <time> element
        'review_text' -- all matching <p> texts joined with single spaces
    """
    rest_name = soup.find('div', class_="css-83hgbf").find('dt').get_text()
    rev_date = soup.find('time', class_="css-rs1psd e16638kd0").get_text()
    paragraphs = [
        p.get_text()
        for p in soup.find_all('p', class_='css-18icg9x evys1bk0')
    ]
    # The joined string used to be computed and then thrown away, so callers
    # received the raw list of paragraphs; keep the result this time.
    review_text = ' '.join(paragraphs)
    return {'name': rest_name, 'review_date': rev_date, 'review_text': review_text}
     

In [6]:
# Load the review metadata table; the CSV's 'Unnamed: 0' column is used as the index.
df = pd.read_csv('reviews.csv', index_col='Unnamed: 0' )

In [7]:
df.head()

Unnamed: 0,name,rating,review_date,reviewer,neighborhood,cuisine,review_link_1,review_link_2
0,Hanon,2 star,"May 21, 2019",Pete Wells,Williamsburg,Japanese,https://www.nytimes.com/2019/05/21/dining/hano...,https://www.nytimes.com/2019/05/21/dining/hano...
1,Del Posto,3 star,"May 14, 2019",Pete Wells,Chelsea,Italian,https://www.nytimes.com/2019/05/14/dining/del-...,https://www.nytimes.com/2019/05/14/dining/del-...
2,The Freakin Rican,1 star,"May 7, 2019",Pete Wells,Astoria,"Caribbean, Latin American",https://www.nytimes.com/2019/05/07/dining/the-...,https://www.nytimes.com/2019/05/07/dining/the-...
3,Wayan,2 star,"April 23, 2019",Pete Wells,NoLIta,Indonesian,https://www.nytimes.com/2019/04/23/dining/waya...,https://www.nytimes.com/2019/04/23/dining/waya...
4,Niche,1 star,"April 16, 2019",Pete Wells,Lower East Side,Japanese,https://www.nytimes.com/2019/04/16/dining/nich...,https://www.nytimes.com/2019/04/16/dining/nich...


In [9]:
# NYT website has custom error page if it can't find the URL - this finds such pages so they can be re-downloaded
def find_server_error(bs):
    """Tell whether a parsed page carries the '500 - Server Error' marker.

    Parameters
    ----------
    bs : BeautifulSoup
        Parsed page to inspect.

    Returns
    -------
    bool
        True when at least one <meta> tag has the content
        '500 - Server Error', False otherwise.
    """
    error_tags = bs.find_all('meta', {'content': '500 - Server Error'})
    if error_tags:
        return True
    return False

In [15]:
def get_reviews(to_fetch, n=10, timeout=30):
    """Download review pages and save each one as an HTML file.

    Every URL is fetched and parsed. Pages that fail to download, or that
    find_server_error() identifies as the server-error page, are queued for
    another pass. Successful pages are written to ./reviews/review<k>.html,
    where k is the position of the URL in the saved URL list. Once all passes
    are done, the URLs that were saved are dumped as JSON to
    ./reviews/url_list.txt, and any URL that never succeeded is printed.

    Parameters
    ----------
    to_fetch : iterable of str
        URLs to retrieve (a list or a pandas Series both work). The caller's
        object is not modified.
    n : int, optional
        Maximum number of passes over the URLs that are still failing.
    timeout : float, optional
        Seconds to wait on each request before treating it as failed.

    Returns
    -------
    None
    """
    final_url_list = []
    os.makedirs('reviews', exist_ok=True)

    # Work on a private list so truth-testing and re-assignment are safe
    # regardless of the container type that was passed in.
    pending = list(to_fetch)
    attempts = 0
    # Strictly fewer than n, so that n really is the maximum number of passes.
    while pending and attempts < n:
        still_failing = []
        for review_url in pending:
            try:
                review = requests.get(review_url, timeout=timeout)
            except requests.RequestException:
                # A network failure on one URL should not abort the whole run;
                # queue it for the next pass like a server-error page.
                still_failing.append(review_url)
                continue

            parsed_review = BeautifulSoup(review.content, 'html.parser')
            if find_server_error(parsed_review):
                still_failing.append(review_url)
                continue

            # The file number is the index this URL will have in final_url_list.
            page_path = './reviews/review' + str(len(final_url_list)) + '.html'
            with open(page_path, 'w') as newfile:
                newfile.write(parsed_review.prettify())
            final_url_list.append(review_url)

        pending = still_failing
        attempts += 1

    if pending:
        print("Could not successfully access the following reviews:")
        for url in pending:
            print(url)

    with open('./reviews/url_list.txt', 'w') as url_output:
        json.dump(final_url_list, url_output)


In [16]:
# Download every page listed in the 'review_link_1' column; get_reviews saves them under ./reviews/.
get_reviews(df['review_link_1'])

In [21]:
# Reopen one of the saved pages and parse it so its markup can be inspected.
with open('./reviews/review5.html','r') as newfile:
    soup = BeautifulSoup(newfile, 'html.parser')

In [22]:
soup

<!DOCTYPE html>

<html class="story" itemid="https://www.nytimes.com/2019/04/09/dining/haenyeo-restaurant-review-jenny-kwak.html" itemscope="" itemtype="http://schema.org/NewsArticle" lang="en" xmlns:og="http://opengraphprotocol.org/schema/">
<head>
<title data-rh="true">
   With Haenyeo, a Trailblazing Korean Chef Takes On Seafood - The New York Times
  </title>
<meta content="en-US" data-rh="true" itemprop="inLanguage"/>
<meta content="2019-04-09T15:28:47.000Z" data-rh="true" itemprop="datePublished" property="article:published"/>
<meta content="2019-04-15T14:56:24.154Z" data-rh="true" itemprop="dateModified" property="article:modified"/>
<meta content="en" data-rh="true" http-equiv="Content-Language"/>
<meta content="noarchive" data-rh="true" name="robots"/>
<meta content="100000006445811" data-rh="true" itemprop="identifier" name="articleid"/>
<meta content="nyt://article/9702619b-b339-558c-8df0-ff056493ef57" data-rh="true" itemprop="identifier" name="nyt_uri"/>
<meta content="pubp

In [23]:
soup.find_all('meta')

[<meta content="en-US" data-rh="true" itemprop="inLanguage"/>,
 <meta content="2019-04-09T15:28:47.000Z" data-rh="true" itemprop="datePublished" property="article:published"/>,
 <meta content="2019-04-15T14:56:24.154Z" data-rh="true" itemprop="dateModified" property="article:modified"/>,
 <meta content="en" data-rh="true" http-equiv="Content-Language"/>,
 <meta content="noarchive" data-rh="true" name="robots"/>,
 <meta content="100000006445811" data-rh="true" itemprop="identifier" name="articleid"/>,
 <meta content="nyt://article/9702619b-b339-558c-8df0-ff056493ef57" data-rh="true" itemprop="identifier" name="nyt_uri"/>,
 <meta content="pubp://event/4888310a3beb4da19b4cc2a5a1d71dce" data-rh="true" itemprop="identifier" name="pubp_event_id"/>,
 <meta content="If New Yorkers think of Korean cuisine as comfort food, it’s partly thanks to Jenny Kwak." data-rh="true" itemprop="description" name="description"/>,
 <meta content="https://static01.nyt.com/images/2019/04/10/dining/09rest2/09rest