### Solutions for Web Scraping Homework

In [1]:
# imports
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [2]:
# connect to homework
i = 1
url = f'https://www.yelp.com/search?find_desc=Restaurants&find_loc=London%2C+United+Kingdom&ns=1&start={i}'
req = requests.get(url).text
scraper = BeautifulSoup(req)
restaurants = scraper.find_all('div', {'class': 'container__09f24__21w3G'})

### Scraping Titles, Categories

In [7]:
titles = [restaurant.find_all('a', {'class': 'css-166la90'}) for restaurant in restaurants]
titles = [title[0].text for title in titles]

In [7]:
# grab the restaurant categories
categories = [restaurant.find_all('a', {'class': 'css-1joxor6'}) for restaurant in restaurants]

In [29]:
category_one = []
category_two = []
category_three = []
for category in categories:
    for i in range(3):
        if i == 0:
            try:
                category_one.append(category[i].text)
            except:
                category_one.append(None)
        if i == 1:
            try:
                category_two.append(category[i].text)
            except:
                category_two.append(None)
        if i == 2:
            try:
                category_three.append(category[i].text)
            except:
                category_three.append(None)

### Scraping Phone Numbers, Neighborhoods, Reviews

In [None]:
restaurant_

In [23]:
# paragraphs
restaurant_paragraphs = [restaurant.find_all('p') for restaurant in restaurants]

In [60]:
# get the phone numbers
phone_numbers = [paragraph_group[1].text if paragraph_group[1].text.replace(' ', '').isdigit() else None for paragraph_group in restaurant_paragraphs]

In [63]:
# without a comprehension
phone_numbers = []
for paragraph_group in restaurant_paragraphs:
    if paragraph_group[1].text.replace(' ', '').isdigit():
        phone_numbers.append(paragraph_group[1].text)
    else:
        phone_numbers.append(None)

In [26]:
# finding out neighborhoods
neighborhoods = []
for paragraph_group in restaurant_paragraphs:
    if paragraph_group[-2].text != 'Delivery':
        neighborhoods.append(paragraph_group[-2].text)
    else:
        neighborhoods.append(paragraph_group[-3].text)

In [27]:
# reviews -- sometimes they are missing, hence the check
reviews = []
for idx, paragraph_group in enumerate(restaurant_paragraphs):
    if idx == len(restaurant_paragraphs) - 1:
        try:
            reviews.append(paragraph_group[1][3].text.replace('\xa0more', ''))
        except:
            reviews.append(paragraph_group[-1].text.replace('\xa0more', ''))
    else: 
        if '\xa0more' in paragraph_group[-1].text:
                reviews.append(paragraph_group[-1].text.replace('\xa0more', ''))
        else:
            reviews.append(None)

### Scraping Price Ranges, Addresses

In [30]:
restaurant_spans = [restaurant.find_all('span') for restaurant in restaurants]

In [31]:
# number of reviews
num_reviews = [int(span_group[4].text) if span_group[4].text.isdigit() else None for span_group in restaurant_spans]

In [32]:
# price ranges -- sometimes they have blanks:
price_ranges = []
for span_group in restaurant_spans:
    if '\xA3' in span_group[5].text:
        price_ranges.append(span_group[5].text)
    elif '\xA3' in span_group[6].text:
        price_ranges.append(span_group[6].text)
    else:
        price_ranges.append(None)

In [33]:
# for addresses

# helper function to check if a given piece of text is an address or not
def is_address(text):
    text = text.split()
    try:
        # accommodates for values like 34-66, 4/6, etc
        if text[0].replace('-', '').replace('/', '').isdigit() and text[-1].isalpha():
            return True
        else:
            return False
    except:
        return False

addresses = []
for span_group in restaurant_spans:
    has_address = False
    for group in span_group:
        if is_address(group.text):
            has_address = True
            address = group.text
    if has_address:
        addresses.append(address)
    else:
        addresses.append(None)

### Scraping Average Rating

In [35]:
# get all the divs
restaurant_divs = [restaurant.find_all('div', {'role': 'img'}) for restaurant in restaurants]

In [36]:
restaurant_ratings = [float(str(restaurant[0]).split('"')[1].split()[0]) for restaurant in restaurant_divs]

### Neighborhoods

In [38]:
# neighborhoods
neighborhoods = [paragraph_group[-2].text for paragraph_group in restaurant_paragraphs[:-1]]
neighborhoods.append(restaurant_paragraphs[-1][-2].text)

In [65]:
df_dict = {
    'Name': titles,
    'PhoneNumber': phone_numbers,
    'Address': addresses,
    'Neighborhood': neighborhoods,
    'PriceRange': price_ranges,
    'AvgRating': restaurant_ratings,
    'NumRatings': num_reviews,
    'CategoryOne': category_one,
    'CategoryTwo': category_two,
    'CategoryThree': category_three,
    'Review': reviews
}

df = pd.DataFrame(df_dict)
df

Unnamed: 0,Name,PhoneNumber,Address,Neighborhood,PriceRange,AvgRating,NumRatings,CategoryOne,CategoryTwo,CategoryThree,Review
0,Dishoom,020 7420 9320,12 Upper Saint Martin's Lane,Covent Garden,££,4.5,1841,Indian,,,“Hard to find a way to add any higher praise t...
1,The Breakfast Club,020 7434 2571,33 D'Arblay Street,Soho,££,4.0,494,Coffee & Tea,Breakfast & Brunch,American (Traditional),“By far one of my most favorite breakfast plac...
2,Flat Iron,,17 Beak Street,Soho,££,4.5,380,Steakhouses,,,“Went to London for vacation and stopped by th...
3,Ffiona’s Restaurant,020 7937 4152,51 Kensington Church Street,Outdoor seating,££,4.5,267,British,,,“Ffiona's is easily my favorite restaurant in ...
4,Dishoom,020 7420 9322,22 Kingly Street,Soho,££,4.5,547,Indian,,,“I visited Dishoom during my recent London tri...
5,Restaurant Gordon Ramsay,020 7352 4441,68 Royal Hospital Road,Chelsea,££££,4.5,204,French,British,,“Compared to Michelin 3-star restaurants in Ca...
6,The Fat Bear,020 7236 2498,61 Carter Lane,Blackfriars,££,4.5,122,American (Traditional),Soul Food,Cajun/Creole,"“WOW, this place is delicious!\n\nOur family s..."
7,Mother Mash,020 7494 9644,26 Ganton Street,Soho,££,4.0,470,British,,,“Soho is full of culture and amazing places to...
8,Sketch,020 7659 4500,9 Conduit Street,Mayfair,£££,4.0,826,French,Modern European,Cocktail Bars,"“Having received it's 3rd star in 2020, sketch..."
9,The Golden Chippy,020 8692 4333,62 Greenwich High Road,Deptford,££,5.0,106,Fish & Chips,,,“The hype is REAL! The Golden Chippy is truly ...


### Final Answer, That Loops Through The Entire Website

In [34]:
i = 0
num_round = 1
total_titles = []
total_phone_numbers = []
total_addresses = []
total_neighborhoods = []
total_price_ranges = []
total_restaurant_ratings = []
total_num_reviews = []
total_category_one = []
total_category_two = []
total_category_three = []
total_reviews = []
looping = True

# helper function to check if a given piece of text is an address or not
def is_address(text):
    text = text.split()
    try:
        # accommodates for values like 34-66, 4/6, etc
        if text[0].replace('-', '').replace('/', '').isdigit() and text[-1].isalpha():
            return True
        else:
            return False
    except:
        return False

while looping:
    url = f'https://www.yelp.com/search?find_desc=Restaurants&find_loc=London%2C+United+Kingdom&ns=1&start={i}'
    req = requests.get(url)
    scraper = BeautifulSoup(req.text)
    restaurants = scraper.find_all('div', {'class': 'container__09f24__21w3G'})
    if len(restaurants) == 0:
        print("No more restaurants found.  Breaking the loop.")
        looping = False
        break
    
    #### Getting All Links, Code for Titles and Categories
    titles = [restaurant.find_all('a', {'class': 'css-166la90'}) for restaurant in restaurants]
    titles = [title[0].text for title in titles]
    total_titles.extend(titles)
    
    # restaurant categories
    categories = [restaurant.find_all('a', {'class': 'css-1joxor6'}) for restaurant in restaurants]
    category_one = []
    category_two = []
    category_three = []
    for category in categories:
        for j in range(3):
            if j == 0:
                try:
                    category_one.append(category[i].text)
                except:
                    category_one.append(None)
            if j == 1:
                try:
                    category_two.append(category[i].text)
                except:
                    category_two.append(None)
            if j == 2:
                try:
                    category_three.append(category[i].text)
                except:
                    category_three.append(None)
    total_category_one.extend(category_one)
    total_category_two.extend(category_two)
    total_category_three.extend(category_three)
                    
    # getting all the paragraphs within each restaurant
    restaurant_paragraphs = [restaurant.find_all('p') for restaurant in restaurants]
    
    # get the phone numbers
    phone_numbers = [paragraph_group[1].text if paragraph_group[1].text.replace(' ', '').isdigit() else None for paragraph_group in restaurant_paragraphs]
    total_phone_numbers.extend(phone_numbers)
    
    # get the neighborhoods
    neighborhoods = []
    for paragraph_group in restaurant_paragraphs:
        if paragraph_group[-2].text != 'Delivery':
            neighborhoods.append(paragraph_group[-2].text)
        else:
            neighborhoods.append(paragraph_group[-3].text)
    total_neighborhoods.extend(neighborhoods)
    
    # get the reviews
    reviews = []
    for idx, paragraph_group in enumerate(restaurant_paragraphs):
        if idx == len(restaurant_paragraphs) - 1:
            try:
                reviews.append(paragraph_group[1][3].text.replace('\xa0more', ''))
            except:
                reviews.append(paragraph_group[-1].text.replace('\xa0more', ''))
        else: 
            if '\xa0more' in paragraph_group[-1].text:
                    reviews.append(paragraph_group[-1].text.replace('\xa0more', ''))
            else:
                reviews.append(None)
    total_reviews.extend(reviews)
    
    # get all the spans in a given page
    restaurant_spans = [restaurant.find_all('span') for restaurant in restaurants]
    
    # get the number of reviews
    num_reviews = [int(span_group[4].text) if span_group[4].text.isdigit() else None for span_group in restaurant_spans]
    total_num_reviews.extend(num_reviews)
    
    # get the price ranges
    price_ranges = []
    for span_group in restaurant_spans:
        if '\xA3' in span_group[5].text:
            price_ranges.append(span_group[5].text)
        elif '\xA3' in span_group[6].text:
            price_ranges.append(span_group[6].text)
        else:
            price_ranges.append(None)
    total_price_ranges.extend(price_ranges)
            
    # get the addresses
    addresses = []
    for span_group in restaurant_spans:
        has_address = False
        for group in span_group:
            if is_address(group.text):
                has_address = True
                address = group.text
        if has_address:
            addresses.append(address)
        else:
            addresses.append(None)
    total_addresses.extend(addresses)
            
    # get all the rating divs
    restaurant_divs = [restaurant.find_all('div', {'role': 'img'}) for restaurant in restaurants]
    
    # and get the restaurant ratings
    restaurant_ratings = [float(str(restaurant[0]).split('"')[1].split()[0]) for restaurant in restaurant_divs]
    total_restaurant_ratings.extend(restaurant_ratings)
    
    print(f'Finished Round {num_round}, value of i: {i}')
    i += 10
    num_round += 1
    
df_dict = {
    'Name': total_titles,
    'PhoneNumber': total_phone_numbers,
    'Address': total_addresses,
    'Neighborhood': total_neighborhoods,
    'PriceRange': total_price_ranges,
    'AvgRating': total_restaurant_ratings,
    'NumRatings': total_num_reviews,
    'CategoryOne': total_category_one,
    'CategoryTwo': total_category_two,
    'CategoryThree': total_category_three,
    'Review': total_reviews
}

df = pd.DataFrame(df_dict)
df

Finished Round 1, value of i: 0
Finished Round 2, value of i: 10
Finished Round 3, value of i: 20
Finished Round 4, value of i: 30
Finished Round 5, value of i: 40
Finished Round 6, value of i: 50
Finished Round 7, value of i: 60
Finished Round 8, value of i: 70
Finished Round 9, value of i: 80
Finished Round 10, value of i: 90
Finished Round 11, value of i: 100
Finished Round 12, value of i: 110
Finished Round 13, value of i: 120
Finished Round 14, value of i: 130
Finished Round 15, value of i: 140
Finished Round 16, value of i: 150
Finished Round 17, value of i: 160
Finished Round 18, value of i: 170
Finished Round 19, value of i: 180
Finished Round 20, value of i: 190
Finished Round 21, value of i: 200
Finished Round 22, value of i: 210
Finished Round 23, value of i: 220
Finished Round 24, value of i: 230
No more restaurants found.  Breaking the loop.


Unnamed: 0,Name,PhoneNumber,Address,Neighborhood,PriceRange,AvgRating,NumRatings,CategoryOne,CategoryTwo,CategoryThree,Review
0,The Mayfair Chippy,020 7741 2233,14 North Audley Street,Mayfair,££,4.5,282.0,Fish & Chips,Fish & Chips,Fish & Chips,“One of the best fish ever with the most tasty...
1,Dishoom,020 7420 9320,12 Upper Saint Martin's Lane,Covent Garden,££,4.5,1840.0,Indian,Indian,Indian,“Hard to find a way to add any higher praise t...
2,Flat Iron,,17 Beak Street,Soho,££,4.5,380.0,Steakhouses,Steakhouses,Steakhouses,“Went to London for vacation and stopped by th...
3,Ffiona’s Restaurant,020 7937 4152,51 Kensington Church Street,Outdoor seating,££,4.5,267.0,British,British,British,“Ffiona's is easily my favorite restaurant in ...
4,Restaurant Gordon Ramsay,020 7352 4441,68 Royal Hospital Road,Chelsea,££££,4.5,204.0,French,French,French,“Compared to Michelin 3-star restaurants in Ca...
...,...,...,...,...,...,...,...,...,...,...,...
235,Ali Baba,020 7723 5805,32 Ivor Place,Marylebone,£,4.5,12.0,,,,“I'm staying in an AirBnB around the corner fr...
236,Bill’s,020 7486 7701,119-121 Baker Street,Marylebone,££,4.0,49.0,,,,"“Decent food, nice restaurant and kind staff. ..."
237,Maggie Jones’s,020 7937 6462,6 Old Court Place,Kensington,£££,4.5,119.0,,,,"“I saw this place on Instagram and was like ""w..."
238,The Malaysian Pancake Company,07786 835909,,Southwark,,4.5,3.0,,,,“After many Brick Lane trips and sampling my w...
