In [230]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as snas
import requests
from bs4 import BeautifulSoup
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait as wait
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
import time

In [271]:
# Sample review page (a 2010 URL) passed to parse_urls() as a smoke test.
test_url = 'https://www.nytimes.com/2010/02/24/dining/reviews/24rest.html'

In [274]:
"""
TEST REVIEWS
"""

def parse_urls(test_url, timeout=30):
    """Download one review page and parse it with ``parse_modern_reviews``.

    Parameters
    ----------
    test_url : str
        URL of the review page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server (passed straight to ``requests.get``).
        Without a timeout a stalled connection would block the kernel forever.

    Returns
    -------
    dict
        Whatever ``parse_modern_reviews`` returns for the fetched page.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so an error page is
        never parsed as if it were a review.
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    # GET HTML
    response = requests.get(test_url, timeout=timeout)
    response.raise_for_status()

    # CREATE BeautifulSoup Object
    soup = BeautifulSoup(response.content, 'html.parser')

    return parse_modern_reviews(soup)
    
# NOTE(review): parse_urls calls parse_modern_reviews, which is defined in a cell
# further down this notebook; on a fresh kernel that cell has to run first.
parse_urls(test_url)

'\nTEST REVIEWS\n'

{'name': 'Tanoreen',
 'title': 'Tanoreen Restaurant in Bay Ridge, Brooklyn',
 'review_url': 'https://www.nytimes.com/2010/02/24/dining/reviews/24rest.html',
 'review_date': '20100223',
 'reviewer': 'Sam Sifton',
 'rating': '★',
 'neighborhood': 'Bay Ridge',
 'critic_pick': True,
 'atmosphere': nan,
 'sound': nan,
 'recommendations': nan,
 'menu': 'http://www.singlepage.com/tanoreen/menu',
 'drinks': nan,
 'price': '$$ (moderate)',
 'hours': 'Tuesday to Friday,  noon to 10:30 p.m.;    Saturday, 10:30 a.m. to 10:30 p.m., .; Sunday, 10:30 a.m. to 10 p.m. ',
 'reservations': 'Accepted',
 'article_id': '1247467148081',
 'keywords': 'Tanoreen',
 'review': 'SATURDAY night in the neighborhood that once gave America fever, a Brooklyn that looks to itself for employment and has little time for the glittering island to the north.  A bridge rises out of it, a silver ramp, and you can see ocean beneath it, darkness running all the way to Portugal, and from there down the Mediterranean to the Middle

In [251]:
def _archived_meta_content(soup, *find_args, **find_kwargs):
    """Return the ``content`` of the first matching <meta> tag, or NaN if no tag matches."""
    tag = soup.find('meta', *find_args, **find_kwargs)
    return float('nan') if tag is None else tag.get('content')


def _archived_labelled_field(pattern, html, flags=0):
    """Return capture group 1 of the first ``pattern`` match in ``html``, or NaN."""
    found = re.search(pattern, html, flags)
    return found[1] if found else float('nan')


def get_article_archived(soup):
    """Extract review fields from an archived-layout review page.

    Metadata comes from <meta> tags; name, rating and neighborhood come from
    regular expressions over the page markup (with <strong>-based fallbacks);
    the labelled details (atmosphere, sound level, ...) come from
    ``LABEL: value`` patterns in the markup.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed review page.

    Returns
    -------
    dict
        Keys: name, review_url, review_date, reviewer, rating, neighborhood,
        critic_pick, atmosphere, sound, recommendations, menu, drinks, price,
        hours, reservations, review, article_id, keywords.  Any field that
        cannot be found is ``float('nan')``; critic_pick, menu and drinks are
        always NaN for this layout.
    """
    nan = float('nan')

    # Metadata
    reviewer = _archived_meta_content(soup, {'name': 'author'})
    rev_date = _archived_meta_content(soup, {'name': 'pdate'})
    keywords = _archived_meta_content(soup, {'name': 'news_keywords'})
    article_id = _archived_meta_content(soup, itemprop='identifier')
    url = _archived_meta_content(soup, {'property': 'og:url'})

    # Serialise the document once; every regex below runs against this string.
    html = str(soup)

    # Restaurant name, rating & neighborhood.
    # Raw strings keep the regex escapes (\*, \[, \() out of Python's own
    # string-escape handling.
    starred_re = re.compile(r"([a-zA-Z0-9_ ]+)(?:[ \n]+\*+ \[Rating: )([a-zA-Z0-9_ ]+)] [a-zA-Z0-9_ \('\)]*[, ]{0,2}([a-zA-Z0-9_ ,]*);")
    worded_re = re.compile(r"([a-zA-Z0-9_ .']+) ([A-Z]{4,12}) [a-zA-Z0-9_ \('\)]+, ([a-zA-Z0-9_ ,]+);")

    headline = starred_re.search(html) or worded_re.search(html)
    if headline:
        name = headline[1].strip()
        rating = headline[2]
        hood = headline[3]
    else:
        strong = soup.find('strong')
        stars = re.search(r'\*{1,5}', html)
        address = re.search(r"\d+ [\w ()'.]+, ([\w (),]+);", html)
        if strong is not None and stars and address:
            name = strong.get_text()
            rating = stars[0]
            hood = address[1]
        elif strong is not None:
            name = strong.get_text()
            # Rating and neighborhood are read from the siblings that follow
            # the <strong>'s parent; a missing sibling yields NaN instead of
            # aborting the whole parse.
            try:
                rating_tag = strong.find_parent().find_next_sibling()
                rating = rating_tag.get_text()
            except AttributeError:
                rating_tag = None
                rating = nan
            try:
                hood = rating_tag.find_next_sibling().get_text()
            except AttributeError:
                hood = nan
        else:
            name = rating = hood = nan

    # Another Name Function (Maks)
    # The missing-name marker is a float NaN, so test for NaN explicitly
    # (NaN is the only float that is not equal to itself).
    if isinstance(name, float) and name != name:
        for par in soup.find_all('p', class_='story-body-text story-content'):
            # A paragraph consisting of a single alphabetic word is taken as
            # the name; if several qualify, the last one wins.
            single_word = re.fullmatch('[A-Za-z]+', str(par.get_text().strip()))
            if single_word:
                name = single_word[0]

    # Labelled details.  Labels are matched as literal words; the first four
    # accept any letter case, HOURS and RESERVATIONS must be upper case.
    atmosphere = _archived_labelled_field(r"atmosphere[: -]+([\w \-,;()]+).", html, re.IGNORECASE)
    sound = _archived_labelled_field(r"sound level[: -]+([\w \-,;()]+).", html, re.IGNORECASE)
    recs = _archived_labelled_field(r"recommended dishes[: -]+([\w \-,;()]+).", html, re.IGNORECASE)
    price = _archived_labelled_field(r"price range[: -]+([\w \-,;()$']+).", html, re.IGNORECASE)
    hours = _archived_labelled_field(r"HOURS[: -]+([\w \-,;()$':]+).", html)
    resis = _archived_labelled_field(r"RESERVATIONS[: -]+([\w \-,;()$':]+).", html)

    # Not extracted for this layout.
    cpick = nan
    menu_link = nan
    drinks = nan

    # Review Text
    article = ' '.join(
        p.get_text() for p in soup.find_all('p', class_='story-body-text story-content')
    )

    rev_dict = {'name': name,
                'review_url': url,
                'review_date': rev_date,
                'reviewer': reviewer,
                'rating': rating,
                'neighborhood': hood,
                'critic_pick': cpick,
                'atmosphere': atmosphere,
                'sound': sound,
                'recommendations': recs,
                'menu': menu_link,
                'drinks': drinks,
                'price': price,
                'hours': hours,
                'reservations': resis,
                'review': article,
                'article_id': article_id,
                'keywords': keywords}

    return rev_dict

In [262]:
def _recent_optional(extract):
    """Run ``extract()`` and return NaN when an expected tag or attribute is missing."""
    try:
        return extract()
    except (AttributeError, TypeError):
        return float('nan')


def parse_recent_reviews(soup):
    """Extract review fields from a recent-layout review page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed review page.

    Returns
    -------
    dict
        Keys: name, title, review_url, review_date, reviewer, rating,
        neighborhood, critic_pick, atmosphere, sound, recommendations, menu,
        drinks, price, hours, reservations, keywords, article_id, review.
        Optional details and metadata that are absent are ``float('nan')``.

    Raises
    ------
    AttributeError
        If the <title>, the details <div>, its <h4> name, its rating or its
        neighborhood cannot be found.
    """
    nan = float('nan')

    bad_str = ' - The New York Times'
    title = soup.find('title').get_text().replace(bad_str, '')

    # Extract review text
    article = ' '.join(
        p.get_text() for p in soup.find_all('p', class_='css-18icg9x evys1bk0')
    )

    # Extract Restaurant Name, Stars + Neighborhood
    boa = soup.find('div', {'class': ['bottom-of-article', 'review-details restaurant-details']})

    # Restaurant Name
    name = boa.find('h4').get_text()

    # Restaurant Stars: try the <span> form first, then a <span> inside the <div>.
    try:
        rating = boa.find('span', {'class': ['css-z4hz5', 'css-1y5uc8z']}).get_text()
    except AttributeError:
        rating = boa.find('div', {'class': 'css-1y5uc8z'}).find('span').get_text()

    # Restaurant Neighborhood
    hood = boa.find('dd', class_='neighborhood').get_text()

    # Critic's Pick?
    cpick = boa.find('span', {'class': 'css-14dcre2'}) is not None

    # Optional details: each one falls back to NaN when its tag is missing.
    atmosphere = _recent_optional(lambda: boa.find('div', class_='atmosphere').find('dd').get_text())
    sound = _recent_optional(lambda: boa.find('div', class_='noiseLevel').find('dd').get_text())
    recs = _recent_optional(lambda: boa.find('div', class_='recommendedDishes').find('dd').get_text())
    menu_link = _recent_optional(lambda: boa.find('div', class_='menuLink').find('a', href=True)['href'])
    drinks = _recent_optional(lambda: boa.find('div', class_='alcoholInfo').find('dd').get_text())
    price = _recent_optional(lambda: boa.find('dd', class_='price').get_text())
    hours = _recent_optional(lambda: boa.find('dd', class_='hours').get_text())
    resis = _recent_optional(lambda: boa.find('dd', class_='reservations').get_text())

    # URL
    url = _recent_optional(lambda: soup.find('meta', {'property': 'og:url'}).get('content'))

    # Extract Meta Tags - Reviewer, Date, Keywords, Article_ID
    # Start from NaN so a page lacking one of these tags still yields a full
    # record instead of failing on an unassigned variable.
    reviewer = rev_date = keywords = article_id = nan
    for tag in soup.find_all('meta'):
        content = tag.get('content', None)
        if content is None:
            continue
        meta_name = tag.get('name', None)
        if meta_name == 'byl':
            reviewer = content.replace('By ', '').strip()
        elif meta_name == 'pdate':
            rev_date = content.strip()
        elif meta_name == 'news_keywords':
            keywords = content.strip()
        elif meta_name == 'articleid':
            article_id = content.strip()

    rev_dict = {'name': name,
                'title': title,
                'review_url': url,
                'review_date': rev_date,
                'reviewer': reviewer,
                'rating': rating,
                'neighborhood': hood,
                'critic_pick': cpick,
                'atmosphere': atmosphere,
                'sound': sound,
                'recommendations': recs,
                'menu': menu_link,
                'drinks': drinks,
                'price': price,
                'hours': hours,
                'reservations': resis,
                'keywords': keywords,
                'article_id': article_id,
                'review': article}

    return rev_dict

In [136]:
def _broken_optional(extract):
    """Run ``extract()`` and return NaN when an expected tag or attribute is missing."""
    try:
        return extract()
    except (AttributeError, TypeError):
        return float('nan')


def parse_broken_modern_reviews(soup):
    """Extract review fields from a page whose details are read from its first <p>.

    NOTE(review): most detail selectors below (``div``/``dd`` lookups inside a
    <p>) look copied from the recent-layout parser; confirm which of them can
    actually match on the pages this function targets.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed review page.

    Returns
    -------
    dict
        Keys: name, review_url, review_date, reviewer, rating, neighborhood,
        critic_pick, atmosphere, sound, recommendations, menu, drinks, price,
        hours, reservations, keywords, article_id, review.  Optional details
        and metadata that are absent are ``float('nan')``.

    Raises
    ------
    AttributeError
        If the first <p>, the nested <p>/<strong> name or the rating <span>
        cannot be found.
    """
    nan = float('nan')

    # Extract review text
    article = ' '.join(
        p.get_text() for p in soup.find_all('p', class_='css-18icg9x evys1bk0')
    )

    # Extract Restaurant Name, Stars + Neighborhood
    boa = soup.find('p')

    # Restaurant Name
    name = boa.find('p').find('strong').get_text()

    # Restaurant Stars
    rating = boa.find('span', {'class': ['css-z4hz5', 'css-1y5uc8z']}).get_text()

    # Restaurant Neighborhood (NaN when missing, so the result dict can
    # always be built)
    hood = _broken_optional(lambda: boa.find('dd', class_='neighborhood').get_text())

    # Critic's Pick?
    cpick = boa.find('span', {'class': 'css-14dcre2'}) is not None

    # Restaurant Atmosphere
    # NOTE(review): `text_` is not a BeautifulSoup search argument, so it is
    # presumably treated as a filter on an HTML attribute named "text_";
    # `string='ATMOSPHERE'` may have been intended, but that would return the
    # label rather than its value. Left unchanged pending confirmation.
    atmosphere = _broken_optional(
        lambda: boa.find('p').find('strong', text_='ATMOSPHERE').get_text()
    )

    # Remaining optional details: NaN when the tag is missing.
    sound = _broken_optional(lambda: boa.find('div', class_='noiseLevel').find('dd').get_text())
    recs = _broken_optional(lambda: boa.find('div', class_='recommendedDishes').find('dd').get_text())
    menu_link = _broken_optional(lambda: boa.find('div', class_='menuLink').find('a', href=True)['href'])
    drinks = _broken_optional(lambda: boa.find('div', class_='alcoholInfo').find('dd').get_text())
    price = _broken_optional(lambda: boa.find('dd', class_='price').get_text())
    hours = _broken_optional(lambda: boa.find('dd', class_='hours').get_text())
    resis = _broken_optional(lambda: boa.find('dd', class_='reservations').get_text())

    # URL
    url = _broken_optional(lambda: soup.find('meta', {'property': 'og:url'}).get('content'))

    # Extract Meta Tags - Reviewer, Date, Keywords, Article_ID
    # Start from NaN so a missing tag cannot leave a variable unassigned.
    reviewer = rev_date = keywords = article_id = nan
    for tag in soup.find_all('meta'):
        content = tag.get('content', None)
        if content is None:
            continue
        meta_name = tag.get('name', None)
        if meta_name == 'byl':
            reviewer = content.replace('By ', '').strip()
        elif meta_name == 'pdate':
            rev_date = content.strip()
        elif meta_name == 'news_keywords':
            keywords = content.strip()
        elif meta_name == 'articleid':
            article_id = content.strip()

    rev_dict = {'name': name,
                'review_url': url,
                'review_date': rev_date,
                'reviewer': reviewer,
                'rating': rating,
                'neighborhood': hood,
                'critic_pick': cpick,
                'atmosphere': atmosphere,
                'sound': sound,
                'recommendations': recs,
                'menu': menu_link,
                'drinks': drinks,
                'price': price,
                'hours': hours,
                'reservations': resis,
                'keywords': keywords,
                'article_id': article_id,
                'review': article}

    return rev_dict

In [267]:
def _modern_optional(extract):
    """Run ``extract()`` and return NaN when an expected tag or attribute is missing."""
    try:
        return extract()
    except (AttributeError, TypeError):
        return float('nan')


def parse_modern_reviews(soup):
    """Extract review fields from a page that carries a restaurant-details <aside>.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed review page.

    Returns
    -------
    dict
        Keys: name, title, review_url, review_date, reviewer, rating,
        neighborhood, critic_pick, atmosphere, sound, recommendations, menu,
        drinks, price, hours, reservations, article_id, keywords, review.
        Optional details and metadata that are absent are ``float('nan')``.

    Raises
    ------
    AttributeError
        If the <title>, the details <aside>, its <h4> name or its
        neighborhood <p> cannot be found.
    """
    nan = float('nan')

    bad_str = ' - The New York Times'
    title = soup.find('title').get_text().replace(bad_str, '')

    # Reviewer, review date and keywords come from <meta> tags.  Start from
    # NaN so a page lacking one of them still yields a full record.
    reviewer = rev_date = keywords = nan
    for tag in soup.find_all('meta'):
        content = tag.get('content', None)
        if content is None:
            continue
        meta_name = tag.get('name', None)
        if meta_name == 'author':
            reviewer = content.strip()
        elif meta_name == 'pdate':
            rev_date = content.strip()
        elif meta_name == 'news_keywords':
            keywords = content.strip()

    # End of Article summary information:
    details = soup.find('aside', class_='review-details restaurant-details')

    # Restaurant name
    name = details.find('h4').get_text()

    # Rating: star rating first, word rating as the fallback.
    rating_tag = details.find('li', class_='critic-star-rating')
    if rating_tag is None:
        rating_tag = details.find('li', class_='critic-word-rating')
    rating = nan if rating_tag is None else rating_tag.get_text()

    # Neighborhood
    hood = details.find('p', itemprop='addressLocality').get_text()

    # Critic pick T/F
    cpick = details.find('li', class_='critics-pick') is not None

    # Optional details: located through their label <span>, NaN when missing.
    atmosphere = _modern_optional(
        lambda: details.find('span', text='Atmosphere').parent.find('span', itemprop='review').get_text())
    sound = _modern_optional(
        lambda: details.find('span', text='Sound').parent.find('span', itemprop='review').get_text())
    recs = _modern_optional(
        lambda: details.find('span', text='Recommended Dishes').parent.find('span', itemprop='menu').get_text())
    menu_link = _modern_optional(
        lambda: details.find('span', text='Menu').parent.find('span', itemprop='menu').find('a').get('href'))
    drinks = _modern_optional(
        lambda: details.find('span', text='Drinks and Wine').parent.find('span', itemprop='menu').get_text())
    price = _modern_optional(lambda: details.find('span', itemprop='priceRange').get_text())
    hours = _modern_optional(lambda: details.find('time').get('datetime'))
    resis = _modern_optional(lambda: details.find('span', itemprop='acceptsReservations').get_text())

    # Review Text
    review = []
    for p in soup.find_all('p', class_='story-body-text story-content'):
        # NOTE(review): collection stops at the first paragraph whose
        # data-para-count is '8'; this constant looks tuned to a single page.
        # Confirm it marks the end of the review body on other pages too.
        if p.get('data-para-count') == '8':
            break
        review.append(p.get_text())
    article = ' '.join(review)

    # Article ID and review URL: NaN when the <meta> tag is missing.
    article_id = _modern_optional(lambda: soup.find('meta', itemprop='identifier').get('content'))
    url = _modern_optional(lambda: soup.find('meta', {'property': 'og:url'}).get('content'))

    rev_dict = {'name': name,
                'title': title,
                'review_url': url,
                'review_date': rev_date,
                'reviewer': reviewer,
                'rating': rating,
                'neighborhood': hood,
                'critic_pick': cpick,
                'atmosphere': atmosphere,
                'sound': sound,
                'recommendations': recs,
                'menu': menu_link,
                'drinks': drinks,
                'price': price,
                'hours': hours,
                'reservations': resis,
                'article_id': article_id,
                'keywords': keywords,
                'review': article}

    return rev_dict

In [211]:
# Load reviews.csv, using its 'Unnamed: 0' column as the index
# (presumably an index written by an earlier DataFrame.to_csv — confirm).
df_og = pd.read_csv('reviews.csv', index_col='Unnamed: 0')

In [210]:
# Load all_reviews.csv, using its 'Unnamed: 0' column as the index
# (presumably an index written by an earlier DataFrame.to_csv — confirm).
df_all_reviews = pd.read_csv('all_reviews.csv',index_col='Unnamed: 0')

In [212]:
df_og.head()

Unnamed: 0,name,rating,review_date,reviewer,neighborhood,cuisine,review_link_1,review_link_2
0,Hanon,2 star,"May 21, 2019",Pete Wells,Williamsburg,Japanese,https://www.nytimes.com/2019/05/21/dining/hano...,https://www.nytimes.com/2019/05/21/dining/hano...
1,Del Posto,3 star,"May 14, 2019",Pete Wells,Chelsea,Italian,https://www.nytimes.com/2019/05/14/dining/del-...,https://www.nytimes.com/2019/05/14/dining/del-...
2,The Freakin Rican,1 star,"May 7, 2019",Pete Wells,Astoria,"Caribbean, Latin American",https://www.nytimes.com/2019/05/07/dining/the-...,https://www.nytimes.com/2019/05/07/dining/the-...
3,Wayan,2 star,"April 23, 2019",Pete Wells,NoLIta,Indonesian,https://www.nytimes.com/2019/04/23/dining/waya...,https://www.nytimes.com/2019/04/23/dining/waya...
4,Niche,1 star,"April 16, 2019",Pete Wells,Lower East Side,Japanese,https://www.nytimes.com/2019/04/16/dining/nich...,https://www.nytimes.com/2019/04/16/dining/nich...


In [216]:
# Rename review_link_1 to review_url so df_og shares a key column with df_reviews
# for the merge further down.
df_og = df_og.rename(columns={'review_link_1': 'review_url'})

In [217]:
df_og.head()

Unnamed: 0,name,rating,review_date,reviewer,neighborhood,cuisine,review_url,review_link_2
0,Hanon,2 star,"May 21, 2019",Pete Wells,Williamsburg,Japanese,https://www.nytimes.com/2019/05/21/dining/hano...,https://www.nytimes.com/2019/05/21/dining/hano...
1,Del Posto,3 star,"May 14, 2019",Pete Wells,Chelsea,Italian,https://www.nytimes.com/2019/05/14/dining/del-...,https://www.nytimes.com/2019/05/14/dining/del-...
2,The Freakin Rican,1 star,"May 7, 2019",Pete Wells,Astoria,"Caribbean, Latin American",https://www.nytimes.com/2019/05/07/dining/the-...,https://www.nytimes.com/2019/05/07/dining/the-...
3,Wayan,2 star,"April 23, 2019",Pete Wells,NoLIta,Indonesian,https://www.nytimes.com/2019/04/23/dining/waya...,https://www.nytimes.com/2019/04/23/dining/waya...
4,Niche,1 star,"April 16, 2019",Pete Wells,Lower East Side,Japanese,https://www.nytimes.com/2019/04/16/dining/nich...,https://www.nytimes.com/2019/04/16/dining/nich...


In [246]:
# Keep the join key (review_url) plus the columns to attach to df_og.
# NOTE(review): this cell's execution count (246) is later than the merge cell's
# (221), so the merge may have run against an earlier definition of df_reviews.
df_reviews = df_all_reviews[['review_url', 'review', 'price']]

In [248]:
df_reviews.head(100)

Unnamed: 0,review_url,review,price
0,https://www.nytimes.com/2006/02/08/dining/revi...,AT many restaurants I've received tutorials on...,"For lunch, three-course prix fixe, $60; limite..."
1,https://www.nytimes.com/2010/04/07/dining/revi...,JEAN-CLAUDE IZZO wrote a terrific series of Fr...,$$$ (expensive)
2,https://www.nytimes.com/2005/09/28/dining/revi...,PERHAPS no other recently opened restaurant ra...,"Cold dishes, $11 to $34"
3,https://www.nytimes.com/2008/07/02/dining/revi...,ITALIAN cooking is about a lot more than pasta...,$$$ (expensive)
4,https://www.nytimes.com/2010/07/14/dining/revi...,"IT is a strange feeling, sitting in Má Pêche o...",$$$ (expensive)
5,https://www.nytimes.com/1999/09/22/dining/rest...,POP is a funny name for a restaurant. It's a h...,"Dinner, appetizers, $9 to $15 ($85 for one oun..."
6,https://www.nytimes.com/2011/09/07/dining/revi...,"LIKE drawing a circle or walking a high wire, ...",$$$$ (very expensive)
7,https://www.nytimes.com/2006/06/28/dining/revi...,IT'S easy to see the bad in things and harder ...,$$$ (expensive)
8,https://www.nytimes.com/2012/03/14/dining/revi...,SOME chefs storm into the city like an invadin...,$$$ (expensive)
9,https://www.nytimes.com/2004/03/31/dining/rest...,"RESTAURANTS are like small cruise ships, with ...","Dinner, appetizers, $8 to $15; pizza, pasta an..."


In [221]:
# Left-join on review_url: every df_og row is kept, and rows without a matching
# URL in df_reviews get NaN in the attached columns.
# NOTE(review): df_reviews selects a 'price' column, yet the df_final displayed
# below has no 'price' and shape (539, 9); a fresh top-to-bottom run would
# presumably produce different columns — re-run and confirm.
df_final = df_og.merge(df_reviews, on='review_url', how='left')

In [225]:
df_final

Unnamed: 0,name,rating,review_date,reviewer,neighborhood,cuisine,review_url,review_link_2,review
0,Hanon,2 star,"May 21, 2019",Pete Wells,Williamsburg,Japanese,https://www.nytimes.com/2019/05/21/dining/hano...,https://www.nytimes.com/2019/05/21/dining/hano...,"Hanon, a new udon shop in Williamsburg, Brookl..."
1,Del Posto,3 star,"May 14, 2019",Pete Wells,Chelsea,Italian,https://www.nytimes.com/2019/05/14/dining/del-...,https://www.nytimes.com/2019/05/14/dining/del-...,"More than a year ago, I was on the verge of re..."
2,The Freakin Rican,1 star,"May 7, 2019",Pete Wells,Astoria,"Caribbean, Latin American",https://www.nytimes.com/2019/05/07/dining/the-...,https://www.nytimes.com/2019/05/07/dining/the-...,It is true that the bacalaitos at the Freakin ...
3,Wayan,2 star,"April 23, 2019",Pete Wells,NoLIta,Indonesian,https://www.nytimes.com/2019/04/23/dining/waya...,https://www.nytimes.com/2019/04/23/dining/waya...,If you ate at Spice Market before it closed tw...
4,Niche,1 star,"April 16, 2019",Pete Wells,Lower East Side,Japanese,https://www.nytimes.com/2019/04/16/dining/nich...,https://www.nytimes.com/2019/04/16/dining/nich...,"In 2012 and 2013, when people were lining up a..."
5,Haenyeo,2 star,"April 9, 2019",Pete Wells,Park Slope,Korean,https://www.nytimes.com/2019/04/09/dining/haen...,https://www.nytimes.com/2019/04/09/dining/haen...,Looking around Haenyeo’s dining room one night...
6,Standard Grill,2 star,"April 2, 2019",Pete Wells,West Village,American,https://www.nytimes.com/2019/04/02/dining/stan...,https://www.nytimes.com/2019/04/02/dining/stan...,I’m starting to think the American cult of che...
7,Violet,1 star,"March 26, 2019",Pete Wells,East Village,"Italian, Pizza",https://www.nytimes.com/2019/03/26/dining/viol...,https://www.nytimes.com/2019/03/26/dining/viol...,When I left my home state of Rhode Island at 1...
8,Odo,3 star,"March 19, 2019",Pete Wells,Flatiron district,Japanese,https://www.nytimes.com/2019/03/19/dining/odo-...,https://www.nytimes.com/2019/03/19/dining/odo-...,Before I tell you about the seasonally attuned...
9,Cka Ka Qellu,2 star,"March 12, 2019",Pete Wells,Belmont,Eastern European,https://www.nytimes.com/2019/03/12/dining/cka-...,https://www.nytimes.com/2019/03/12/dining/cka-...,The best way to enter the Albanian restaurant ...


In [226]:
# Persist the merged table; with the default settings the index is written as
# an unnamed first column.
df_final.to_csv('reviews_final.csv')

In [227]:
df_final.isnull().sum()

name               0
rating             0
review_date        0
reviewer           0
neighborhood       3
cuisine            0
review_url         0
review_link_2      0
review           139
dtype: int64

In [275]:
df_final.shape

(539, 9)