In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
import time

In [138]:
# Sample review page; passed to parse_urls() in the "TEST REVIEWS" cell.
test_url = 'https://www.nytimes.com/2015/11/11/dining/jams-jonathan-waxman-review.html'

In [142]:
"""
TEST REVIEWS
"""

def parse_urls(test_url, timeout=30):
    """Download one review page and parse it with ``parse_recent_reviews``.

    Parameters
    ----------
    test_url : str
        Address of the review article to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    dict
        Whatever ``parse_recent_reviews`` returns for the downloaded page.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure, timeout, or a 4xx/5xx HTTP response.
    """
    # GET HTML -- fail fast rather than hang or parse an error page
    response = requests.get(test_url, timeout=timeout)
    response.raise_for_status()

    # CREATE BeautifulSoup Object
    soup = BeautifulSoup(response.content, 'html.parser')

    return parse_recent_reviews(soup)
    
# Smoke test: fetch and parse the sample review.
# NOTE: parse_recent_reviews is defined in a later cell, so on a fresh kernel
# that cell must be run before this one.
parse_urls(test_url)

'\nTEST REVIEWS\n'

{'name': 'Jams',
 'review_url': 'https://www.nytimes.com/2015/11/11/dining/jams-jonathan-waxman-review.html',
 'review_date': '20151110',
 'reviewer': 'Pete Wells',
 'rating': 'Satisfactory',
 'neighborhood': 'Midtown',
 'critic_pick': False,
 'atmosphere': 'A comfortable and casual space stripped down to bare bricks, with large windows on the street. Servers are well meaning but confused at times.',
 'sound': 'Moderate.',
 'recommendations': 'Kale salad; Jams chicken. Appetizers and pasta, $6 to $25; main courses, $21 to $37.',
 'menu': 'http://www.singlepage.com/jams---nyc',
 'drinks': 'Cocktails and a well chosen if somewhat generic wine list.',
 'price': '$$$$ (very expensive)',
 'hours': 'Daily for breakfast, lunch or brunch, and dinner.',
 'reservations': 'Accepted',
 'keywords': ',Restaurant,Jams,Midtown Area Manhattan,Jonathan Waxman',
 'article_id': '100000004023657',

In [141]:
def parse_recent_reviews(soup):
    """Collect the fields of one restaurant review page into a flat dict.

    The review body is read from ``<p class="css-18icg9x evys1bk0">`` tags,
    the restaurant summary from the ``bottom-of-article`` /
    ``review-details restaurant-details`` ``<div>``, and the byline, date,
    keywords and article id from ``<meta name=...>`` tags.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a single review page.

    Returns
    -------
    dict
        Keys, in order: ``name``, ``review_url``, ``review_date``,
        ``reviewer``, ``rating``, ``neighborhood``, ``critic_pick``,
        ``atmosphere``, ``sound``, ``recommendations``, ``menu``, ``drinks``,
        ``price``, ``hours``, ``reservations``, ``keywords``, ``article_id``,
        ``review``. Optional values that are absent from the page are
        ``float('nan')``; ``critic_pick`` is always a bool.

    Raises
    ------
    AttributeError
        If the summary block, the restaurant name, the rating or the
        neighborhood cannot be located in the page.
    """

    def optional(lookup):
        """Return ``lookup()``, or NaN when an element on the path is absent."""
        try:
            return lookup()
        except (AttributeError, TypeError, KeyError):
            # find() returned None somewhere along the chain (or the
            # attribute was missing): treat the field as not available.
            return float('nan')

    # Extract review text
    article = ' '.join(
        p.get_text() for p in soup.find_all('p', class_='css-18icg9x evys1bk0')
    )

    # Summary block holding Restaurant Name, Stars + Neighborhood
    boa = soup.find('div', {'class': ['bottom-of-article', 'review-details restaurant-details']})
    if boa is None:
        # Same exception type the failed attribute access used to produce,
        # but with a message that says what is actually wrong.
        raise AttributeError('review summary block not found in page')

    # Restaurant Name (required)
    name = boa.find('h4').get_text()

    # Restaurant Stars (required): primary markup first, then the fallback
    rating_tag = boa.find('span', {'class': ['css-z4hz5', 'css-1y5uc8z']})
    if rating_tag is not None:
        rating = rating_tag.get_text()
    else:
        rating = boa.find('div', {'class': 'css-1y5uc8z'}).find('span').get_text()

    # Restaurant Neighborhood (required)
    hood = boa.find('dd', class_='neighborhood').get_text()

    # Critic's Pick?
    cpick = boa.find('span', {'class': 'css-14dcre2'}) is not None

    # Optional summary fields -- NaN when the page does not carry them
    atmosphere = optional(lambda: boa.find('div', class_='atmosphere').find('dd').get_text())
    sound = optional(lambda: boa.find('div', class_='noiseLevel').find('dd').get_text())
    recs = optional(lambda: boa.find('div', class_='recommendedDishes').find('dd').get_text())
    menu_link = optional(lambda: boa.find('div', class_='menuLink').find('a', href=True)['href'])
    drinks = optional(lambda: boa.find('div', class_='alcoholInfo').find('dd').get_text())
    price = optional(lambda: boa.find('dd', class_='price').get_text())
    hours = optional(lambda: boa.find('dd', class_='hours').get_text())
    resis = optional(lambda: boa.find('dd', class_='reservations').get_text())

    # URL
    url = optional(lambda: soup.find('meta', {'property': 'og:url'}).get('content'))

    # Extract Meta Tags - Reviewer, Date, Keywords, Article_ID.
    # Collected into a dict first so a missing tag yields NaN instead of an
    # unbound local when the result is assembled. The last matching tag wins.
    meta_names = ('byl', 'pdate', 'news_keywords', 'articleid')
    meta = {}
    for tag in soup.find_all('meta'):
        tag_name = tag.get('name', None)
        content = tag.get('content', None)
        if tag_name in meta_names and content is not None:
            meta[tag_name] = content

    def meta_value(key):
        """Stripped content of the named meta tag, or NaN if it was not seen."""
        return meta[key].strip() if key in meta else float('nan')

    if 'byl' in meta:
        # The byline reads "By <name>"; keep only the name.
        reviewer = meta['byl'].replace('By ', '').strip()
    else:
        reviewer = float('nan')

    return {
        'name': name,
        'review_url': url,
        'review_date': meta_value('pdate'),
        'reviewer': reviewer,
        'rating': rating,
        'neighborhood': hood,
        'critic_pick': cpick,
        'atmosphere': atmosphere,
        'sound': sound,
        'recommendations': recs,
        'menu': menu_link,
        'drinks': drinks,
        'price': price,
        'hours': hours,
        'reservations': resis,
        'keywords': meta_value('news_keywords'),
        'article_id': meta_value('articleid'),
        'review': article,
    }

In [136]:
def parse_broken_modern_reviews(soup):
    """Collect review fields from a page whose summary sits in plain ``<p>`` tags.

    The first ``<p>`` of the page is used as the summary container; the
    restaurant name is read from a ``<strong>`` inside a nested ``<p>``.
    The remaining lookups use the same selectors as ``parse_recent_reviews``.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a single review page.

    Returns
    -------
    dict
        Keys, in order: ``name``, ``review_url``, ``review_date``,
        ``reviewer``, ``rating``, ``neighborhood``, ``critic_pick``,
        ``atmosphere``, ``sound``, ``recommendations``, ``menu``, ``drinks``,
        ``price``, ``hours``, ``reservations``, ``keywords``, ``article_id``,
        ``review``. Optional values that are absent from the page are
        ``float('nan')``; ``critic_pick`` is always a bool.

    Raises
    ------
    AttributeError
        If the page has no ``<p>`` at all, or the restaurant name or rating
        cannot be located.
    """

    def optional(lookup):
        """Return ``lookup()``, or NaN when an element on the path is absent."""
        try:
            return lookup()
        except (AttributeError, TypeError, KeyError):
            return float('nan')

    # Extract review text
    article = ' '.join(
        p.get_text() for p in soup.find_all('p', class_='css-18icg9x evys1bk0')
    )

    # Summary container: the first paragraph on the page
    boa = soup.find('p')
    if boa is None:
        raise AttributeError('no <p> element found in page')

    # Restaurant Name (required)
    name = boa.find('p').find('strong').get_text()

    # Restaurant Stars (required)
    rating = boa.find('span', {'class': ['css-z4hz5', 'css-1y5uc8z']}).get_text()

    # Restaurant Neighborhood -- NaN when absent, so the result dict can
    # always be built
    hood = optional(lambda: boa.find('dd', class_='neighborhood').get_text())

    # Critic's Pick?
    cpick = boa.find('span', {'class': 'css-14dcre2'}) is not None

    # Restaurant Atmosphere
    # NOTE(review): `text_` is not Beautiful Soup's text filter (`string=` /
    # `text=`); unrecognised keywords are treated as attribute filters, so
    # this lookup presumably never matches and the field is always NaN. Even
    # with the right keyword it would return the label, not the description.
    # Left as-is until the target markup is confirmed.
    atmosphere = optional(lambda: boa.find('p').find('strong', text_='ATMOSPHERE').get_text())

    # Remaining optional summary fields -- NaN when the page lacks them
    sound = optional(lambda: boa.find('div', class_='noiseLevel').find('dd').get_text())
    recs = optional(lambda: boa.find('div', class_='recommendedDishes').find('dd').get_text())
    menu_link = optional(lambda: boa.find('div', class_='menuLink').find('a', href=True)['href'])
    drinks = optional(lambda: boa.find('div', class_='alcoholInfo').find('dd').get_text())
    price = optional(lambda: boa.find('dd', class_='price').get_text())
    hours = optional(lambda: boa.find('dd', class_='hours').get_text())
    resis = optional(lambda: boa.find('dd', class_='reservations').get_text())

    # URL
    url = optional(lambda: soup.find('meta', {'property': 'og:url'}).get('content'))

    # Extract Meta Tags - Reviewer, Date, Keywords, Article_ID.
    # Collected into a dict first so a missing tag yields NaN instead of an
    # unbound local when the result is assembled. The last matching tag wins.
    meta_names = ('byl', 'pdate', 'news_keywords', 'articleid')
    meta = {}
    for tag in soup.find_all('meta'):
        tag_name = tag.get('name', None)
        content = tag.get('content', None)
        if tag_name in meta_names and content is not None:
            meta[tag_name] = content

    def meta_value(key):
        """Stripped content of the named meta tag, or NaN if it was not seen."""
        return meta[key].strip() if key in meta else float('nan')

    if 'byl' in meta:
        # The byline reads "By <name>"; keep only the name.
        reviewer = meta['byl'].replace('By ', '').strip()
    else:
        reviewer = float('nan')

    return {
        'name': name,
        'review_url': url,
        'review_date': meta_value('pdate'),
        'reviewer': reviewer,
        'rating': rating,
        'neighborhood': hood,
        'critic_pick': cpick,
        'atmosphere': atmosphere,
        'sound': sound,
        'recommendations': recs,
        'menu': menu_link,
        'drinks': drinks,
        'price': price,
        'hours': hours,
        'reservations': resis,
        'keywords': meta_value('news_keywords'),
        'article_id': meta_value('articleid'),
        'review': article,
    }

In [139]:
def parse_modern_reviews(soup):
    """Collect review fields from a page that uses an ``<aside>`` summary block.

    The restaurant summary is read from
    ``<aside class="review-details restaurant-details">``, the review body
    from ``<p class="story-body-text story-content">`` tags, and the author,
    date and keywords from ``<meta name=...>`` tags.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of a single review page.

    Returns
    -------
    dict
        Keys, in order: ``name``, ``review_url``, ``review_date``,
        ``reviewer``, ``rating``, ``neighborhood``, ``critic_pick``,
        ``atmosphere``, ``sound``, ``recommendations``, ``menu``, ``drinks``,
        ``price``, ``hours``, ``reservations``, ``article_id``, ``keywords``,
        ``review``. Optional values that are absent from the page are
        ``float('nan')``; ``critic_pick`` is always a bool.

    Raises
    ------
    AttributeError
        If the summary ``<aside>``, the restaurant name or the neighborhood
        cannot be located.
    """

    def optional(lookup):
        """Return ``lookup()``, or NaN when an element on the path is absent."""
        try:
            return lookup()
        except (AttributeError, TypeError, KeyError):
            return float('nan')

    # Meta tags: Reviewer, Review Date, Keywords.
    # Collected into a dict first so a missing tag yields NaN instead of an
    # unbound local when the result is assembled. The last matching tag wins.
    meta_names = ('author', 'pdate', 'news_keywords')
    meta = {}
    for tag in soup.find_all('meta'):
        tag_name = tag.get('name', None)
        content = tag.get('content', None)
        if tag_name in meta_names and content is not None:
            meta[tag_name] = content

    def meta_value(key):
        """Stripped content of the named meta tag, or NaN if it was not seen."""
        return meta[key].strip() if key in meta else float('nan')

    #End of Article summary information:
    EOA = soup.find('aside', class_='review-details restaurant-details')
    if EOA is None:
        # Same exception type the failed attribute access used to produce,
        # but with a message that says what is actually wrong.
        raise AttributeError('review summary <aside> not found in page')

    #restaurant name (required)
    name = EOA.find('h4').get_text()

    #Rating: star rating if present, otherwise the word rating, otherwise NaN
    rating_tag = EOA.find('li', class_='critic-star-rating')
    if rating_tag is None:
        rating_tag = EOA.find('li', class_='critic-word-rating')
    rating = rating_tag.get_text() if rating_tag is not None else float('nan')

    #Neighborhood (required)
    hood = EOA.find('p', itemprop='addressLocality').get_text()

    #Critic pick T/F
    cpick = EOA.find('li', class_='critics-pick') is not None

    # Labelled summary entries: locate the label <span>, then read the value
    # <span> that shares its parent.
    atmosphere = optional(lambda: EOA.find('span', text='Atmosphere').parent.find('span', itemprop='review').get_text())
    sound = optional(lambda: EOA.find('span', text='Sound').parent.find('span', itemprop='review').get_text())
    recs = optional(lambda: EOA.find('span', text='Recommended Dishes').parent.find('span', itemprop='menu').get_text())
    menu_link = optional(lambda: EOA.find('span', text='Menu').parent.find('span', itemprop='menu').find('a').get('href'))
    drinks = optional(lambda: EOA.find('span', text='Drinks and Wine').parent.find('span', itemprop='menu').get_text())

    #Price, Hours, Reservations
    price = optional(lambda: EOA.find('span', itemprop='priceRange').get_text())
    hours = optional(lambda: EOA.find('time').get('datetime'))
    resis = optional(lambda: EOA.find('span', itemprop='acceptsReservations').get_text())

    #Review Text
    review = []
    for p in soup.find_all('p', class_='story-body-text story-content'):
        # NOTE(review): collection stops at the first paragraph whose
        # data-para-count is '8'. Presumably this marks where the body ends
        # on the page this was developed against -- confirm it holds for
        # other reviews.
        if p.get('data-para-count') == '8':
            break
        review.append(p.get_text())
    article = ' '.join(review)

    #Article ID and Review URL -- NaN when the meta tag is missing
    article_id = optional(lambda: soup.find('meta', itemprop='identifier').get('content'))
    url = optional(lambda: soup.find('meta', {'property': 'og:url'}).get('content'))

    return {
        'name': name,
        'review_url': url,
        'review_date': meta_value('pdate'),
        'reviewer': meta_value('author'),
        'rating': rating,
        'neighborhood': hood,
        'critic_pick': cpick,
        'atmosphere': atmosphere,
        'sound': sound,
        'recommendations': recs,
        'menu': menu_link,
        'drinks': drinks,
        'price': price,
        'hours': hours,
        'reservations': resis,
        'article_id': article_id,
        'keywords': meta_value('news_keywords'),
        'review': article,
    }