# Data Scraping

In [2]:
import concurrent.futures
import requests
from bs4 import BeautifulSoup
import json
import random
import pandas as pd
import time

start_time = time.time()

def SelectHeader():
    user_agent_list = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS x86_64 14541.0.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) nBrowser/1.0.0.0 Chrome/121.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/121.0.6167.18 Safari/537.36'
    ]
    user_agent = random.choice(user_agent_list)
    headers = {'User-Agent': user_agent}
    return headers

def ScrapeLinks(restaurants_url):
    Links = []
    with requests.Session() as session:
        response = session.get(restaurants_url)
        soup = BeautifulSoup(response.text, 'html.parser')
        states = soup.find_all('div', class_='row group')
        for state in states:
            cities = state.find_all('a')
            Links.extend(["https://www.yellowpages.com" + city.get('href') for city in cities])
    return Links[:1]
    
def check_amenity(amenities, amenity_icon, data_key):
    if amenity_icon in amenities:
        return 'YES'
    return 'NO'

def ScrapeData(first_url):
    Data = {
        'ID': [],
        'Name': [],
        'City': [],
        'State': [],
        'Rating (YP)': [],
        'Count (YP)': [],
        'Rating (TA)': [],
        'Count (TA)': [],
        'Amenity_Family': [],
        'Amenity_Alcohol': [],
        'Amenity_Outdoor': [],
        'Amenity_Wifi': [],
        'Amenity_Group': [],
        'Amenity_Reservation': [],
        'Amenity_Wheelchair': [],
        'Years': [],
        'Address': [],
        'Website': []
    }
    current_page_url = first_url

    with requests.Session() as session:
        while True:
            try:
                response = session.get(current_page_url, headers=SelectHeader())
                response.raise_for_status()  # Check for HTTP errors
            except requests.exceptions.HTTPError as err:
                print(f"HTTP error occurred: {err}")
                continue
            except requests.exceptions.RequestException as req_err:
                print(f"Request error occurred: {req_err}")
                continue
            except Exception as e:
                print(f"An unexpected error occurred: {e}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            restaurants = soup.find_all('div', class_='result')

            for restaurant in restaurants:
                try:
                    Data['ID'].append(restaurant.get('id').split('-')[1])
                except:
                    Data['ID'].append('N/A')
                try:
                    Data['Name'].append(restaurant.find('a', class_='business-name').text)
                except:
                    Data['Name'].append('N/A')
                try: 
                    if restaurant.find('p',class_='adr'):
                        Data['City'].append(" ".join(restaurant.find('p',class_='adr').text.split('the')[1].split()[:-1]))
                    else:
                        Data['City'].append(restaurant.find('div', class_='locality').text.split(',')[0])
                except: 
                    Data['City'].append('N/A')
                try: 
                    Data['State'].append(restaurant.find('div', class_='locality').text.split(',')[1].split()[0])
                except: 
                    Data['State'].append('N/A')
                try:
                    Data['Rating (YP)'].append(" ".join(restaurant.find('div',class_='result-rating').get('class')[1:]))       
                except:
                    Data['Rating (YP)'].append('N/A')
                try: 
                    Data['Count (YP)'].append(restaurant.find('span',class_='count').text.replace('(','').replace(')',''))
                except:
                    Data['Count (YP)'].append('N/A')
                try:
                    Data['Rating (TA)'].append(json.loads(restaurant.find('div',class_='ratings').get('data-tripadvisor'))['rating'])
                except: 
                    Data['Rating (TA)'].append('N/A')
                try: 
                    Data['Count (TA)'].append(json.loads(restaurant.find('div',class_='ratings').get('data-tripadvisor'))['count'])
                except: 
                    Data['Count (TA)'].append('N/A')
                try: 
                    amenities_span = restaurant.find('span', 'amenities-icons')
                    if amenities_span:
                        amenities = amenities_span.find_all('use')
                        Amenities = [amenity.get('xlink:href') for amenity in amenities]
                    else:
                        Amenities = ['N/A']
                except:
                    Amenities = ['N/A']
                amenity_mapping = {
                    '#icon-amenity-family': 'Amenity_Family',
                    '#icon-amenity-alcohol': 'Amenity_Alcohol',
                    '#icon-amenity-outdoor': 'Amenity_Outdoor',
                    '#icon-amenity-wifi': 'Amenity_Wifi',
                    '#icon-amenity-group': 'Amenity_Group',
                    '#icon-amenity-reservation': 'Amenity_Reservation',
                    '#icon-amenity-wheelchair': 'Amenity_Wheelchair'
                }
                for icon, key in amenity_mapping.items():
                    Data[key].append(check_amenity(Amenities, icon, key))
                try:
                    Data['Years'].append(restaurant.find('strong').text.split()[0])
                except:
                    Data['Years'].append('N/A')
                try:
                    Data['Address'].append(restaurant.find('div', class_='street-address').text)
                except:
                    Data['Address'].append('N/A')
                try:
                    Data['Website'].append(restaurant.find('a', class_='track-visit-website').get('href'))
                except:
                    Data['Website'].append('N/A')
            try:
                current_page_url = "https://www.yellowpages.com" + soup.find('a', class_='next ajax-page').get('href')
            except:
                print(current_page_url)
                break

    return Data

def process_link(link):
    data = ScrapeData(link)
    return pd.DataFrame(data)

def main():
    restaurants_url = 'https://www.yellowpages.com/categories/restaurants'
    links = ScrapeLinks(restaurants_url)
    
    with concurrent.futures.ThreadPoolExecutor() as executor:
        dfs = list(executor.map(process_link, links))

    result_df = pd.concat(dfs, ignore_index=True)
    result_df.to_csv('restaurant_data_0-1.csv', index=False)

    total_time = time.time() - start_time
    print(f'Tiempo total: {total_time / 60} minutos - {total_time / 100} segundos / ciudad')

if __name__ == "__main__":
    main()

HTTP error occurred: 404 Client Error: Not Found for url: https://www.yellowpages.com/birmingham-al/restaurants?page=15
https://www.yellowpages.com/birmingham-al/restaurants?page=79
Tiempo total: 1.90766841173172 minutos - 1.144601047039032 segundos / ciudad


In [None]:
import pandas as pd
import os

carpeta = r'D:\Programs\Python\Projects\restaurants'

dfs = []

for archivo in os.listdir(carpeta):
    if archivo.endswith('.csv'):
        ruta_completa = os.path.join(carpeta, archivo)
        
        df = pd.read_csv(ruta_completa)
        
        dfs.append(df)

df_final = pd.concat(dfs, ignore_index=True)

df_final.to_csv('restaurant_final.csv', index=False)

# Data Cleaning

In [None]:
import pandas as pd

res_df = pd.read_csv(r'D:\Programs\Python\Projects\Data Scraping\YellowPages_US_Restaurants\restaurant_final.csv',
                    na_values = "NO")

In [None]:
res_df

### Remove Duplicates

All columns

In [None]:
res_df = res_df.drop_duplicates()

res_df.shape

Name - City - Address

In [None]:
res_df = res_df.drop_duplicates(["Name","City","Address"])

res_df.shape

### Remove NaN values

Name - ID - City - State

In [None]:
res_df.dropna(subset=['Name','ID','City','State'], inplace = True)

### Convert Rating (YP) str to int

In [None]:
old_values = ['zero','one','one half','two','two half','three','three half','four','four half','five']
new_values = [0,1,1.5,2,2.5,3,3.5,4,4.5,5]

res_df.replace(old_values,new_values,inplace=True)

### States and Cities Cleaning

In [None]:
# Count the number of states
res_df['State'].unique().shape

In [None]:
# See the number of restaurants per state

res_df['State'].value_counts()

In [None]:
s_99201 = res_df[res_df['State'] == '99201']
s_99201

In [None]:
res_df['State'].replace('99201','WA', inplace = True)

In [None]:
s_99201 = res_df[res_df['State'] == '99201']
s_99201

In [None]:
s_33445 = res_df[res_df['State'] == '33445']
s_33445

In [None]:
res_df['State'].replace('33445','FL', inplace = True)

In [None]:
s_33445 = res_df[res_df['State'] == '33445']
s_33445

In [None]:
s_Dc = res_df[res_df['State'] == 'Dc']
s_Dc

In [None]:
res_df['State'].replace('Dc','DC', inplace = True)

In [None]:
s_Dc = res_df[res_df['State'] == 'Dc']
s_Dc

In [None]:
c_wa = res_df[res_df['City'] == 'Washington Dc']
c_wa

In [None]:
res_df['City'].replace('Washington Dc','Washington', inplace = True)

In [None]:
c_wa = res_df[res_df['City'] == 'Washington Dc']
c_wa

# Data Analysis

In [None]:
res_df.describe()

Top 10 Restaurants

In [None]:
top_res = pd.DataFrame(res_df['Name'].value_counts())
top_res.head(10)

Top 10 States

In [None]:
top_states = pd.DataFrame(res_df['State'].value_counts())
top_states.head(10)

Top 10 Cities

In [None]:
top_cities = pd.DataFrame(res_df['City'].value_counts())
top_cities.head(10)

In [None]:
fam_per = pd.DataFrame({"Count":res_df['Amenity_Family'].value_counts(),
                        "%":100*res_df['Amenity_Family'].value_counts()/res_df.shape[0]})

fam_per.sort_values("Count")

In [None]:
alc_per = pd.DataFrame({"Count":res_df['Amenity_Alcohol'].value_counts(),
                        "%":100*res_df['Amenity_Alcohol'].value_counts()/res_df.shape[0]})

alc_per.sort_values("Count")

In [None]:
out_per = pd.DataFrame({"Count":res_df['Amenity_Outdoor'].value_counts(),
                        "%":100*res_df['Amenity_Outdoor'].value_counts()/res_df.shape[0]})

out_per.sort_values("Count")

In [None]:
wifi_per = pd.DataFrame({"Count":res_df['Amenity_Wifi'].value_counts(),
                        "%":100*res_df['Amenity_Wifi'].value_counts()/res_df.shape[0]})

wifi_per.sort_values("Count")

In [None]:
grp_per = pd.DataFrame({"Count":res_df['Amenity_Group'].value_counts(),
                        "%":100*res_df['Amenity_Group'].value_counts()/res_df.shape[0]})

grp_per.sort_values("Count")

In [None]:
res_per = pd.DataFrame({"Count":res_df['Amenity_Reservation'].value_counts(),
                        "%":100*res_df['Amenity_Reservation'].value_counts()/res_df.shape[0]})

res_per.sort_values("Count")

In [None]:
whe_per = pd.DataFrame({"Count":res_df['Amenity_Wheelchair'].value_counts(),
                        "%":100*res_df['Amenity_Wheelchair'].value_counts()/res_df.shape[0]})

whe_per.sort_values("Count")

In [None]:
res_df.keys()

¿Cuál es el promedio de calificación (Rating) en Yelp (YP) y TripAdvisor (TA)?

In [None]:
res_df['Rating (YP)'].describe()

In [None]:
res_df['Rating (TA)'].describe()

¿Cuál es la cantidad promedio de reseñas (Count) en Yelp y TripAdvisor?

In [None]:
res_df['Count (YP)'].describe()

In [None]:
res_df['Count (YP)'].median()

In [None]:
res_df['Count (TA)'].describe()

In [None]:
res_df['Count (TA)'].median()

¿Cuántos restaurantes hay en cada estado?

In [None]:
pd.DataFrame(res_df['State'].value_counts())

¿En qué estado se encuentran los restaurantes mejor calificados?

In [None]:
state = res_df.groupby('State')['Rating (YP)'].mean()

top_yp_rating = pd.DataFrame({'State':state.index,
                             'Rating (YP) Mean':state.values})

top_yp_rating.sort_values(['Rating (YP) Mean'], ascending=False).head(10)

In [None]:
state = res_df.groupby('State')['Rating (TA)'].mean()

top_yp_rating = pd.DataFrame({'State':state.index,
                             'Rating (TA) Mean':state.values})

top_yp_rating.sort_values(['Rating (TA) Mean'], ascending=False).head(10)