In [148]:
from bs4 import BeautifulSoup
import pandas as pd
from urllib.request import urlopen, Request
from user_agent import generate_user_agent

# Page that lists all ranked restaurants.
url = "https://www.chicagomag.com/chicago-magazine/august-2024/chicagos-50-best-restaurants-ranked/"

# The request carries a generated User-Agent header instead of urllib's default one.
req = Request(url=url, headers={'User-Agent':generate_user_agent()})
# Open the response as a context manager so the connection is closed as soon as
# the document has been parsed; the timeout keeps a stalled server from hanging
# the kernel indefinitely.
with urlopen(req, timeout=30) as page:
    soup = BeautifulSoup(page, "html.parser")

print(soup.prettify())

<!DOCTYPE html>
<html lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible">
   <link href="https://gmpg.org/xfn/11" rel="profile"/>
   <script src="https://cmp.osano.com/16A1AnRt2Fn8i1unj/f15ebf08-7008-40fe-9af3-db96dc3e8266/osano.js">
   </script>
   <title>
    Chicago’s 50 Best Restaurants, Ranked – Chicago Magazine
   </title>
   <meta content="max-image-preview:large" name="robots"/>
   <style>
    img:is([sizes="auto" i], [sizes^="auto," i]) { contain-intrinsic-size: 3000px 1500px }
   </style>
   <!-- Google Tag Manager for WordPress by gtm4wp.com -->
   <script data-cfasync="false" data-pagespeed-no-defer="">
    var gtm4wp_datalayer_name = "dataLayer";
	var dataLayer = dataLayer || [];
   </script>
   <!-- End Google Tag Manager for WordPress by gtm4wp.com -->
   <meta content="width=device-width, initial-scale=1" name="viewport"/>
   <link href="//cdnjs.cloudflare.com" rel="dns-prefetch">
    <link href="https://www.chicagomag

In [149]:
# Inspect the markup of one list entry: the second <a> whose class attribute is
# "br50-link price-3".
soup.find_all('a', class_="br50-link price-3")[1].prettify()

'<a class="br50-link price-3" href="/chicago-magazine/august-2024/50-best-restaurants/avec/">\n <h2>\n  <span>\n   4\n  </span>\n  Avec\n </h2>\n <h3>\n  West Loop | River North\n  <br/>\n  <span>\n   $$$\n   <span class="light">\n    $$\n   </span>\n  </span>\n </h3>\n</a>\n'

In [150]:
from urllib.parse import urljoin
import re

rank_list = []
name_list = []
link_list = []

# Entries are looked up one price tier at a time (classes price-1 .. price-5),
# so the lists come out grouped by tier rather than ordered by rank.
for tier in range(1, 6):
    elements = soup.find_all('a', f'br50-link price-{tier}')
    for element in elements:
        heading = element.find("h2")

        # The rank lives in its own <span> at the start of the <h2>. Reading it
        # from there (instead of collecting every digit in the heading) keeps
        # digits that belong to a restaurant name out of the rank.
        rank_text = heading.find("span").get_text()
        rank = int(rank_text.strip())

        # The name is the heading text with the rank removed once from the
        # front. No character whitelist is applied, so digits and accented
        # letters in a name survive; runs of whitespace collapse to one space.
        name = " ".join(heading.get_text().replace(rank_text, "", 1).split())
        link = element['href']

        rank_list.append(rank)
        name_list.append(name)
        # hrefs on the list page are site-relative; make them absolute.
        link_list.append(urljoin("https://www.chicagomag.com", link))

len(rank_list), len(name_list), len(link_list)

(50, 50, 50)

In [151]:
# Print the first five scraped entries in list order as "rank. name link".
for pos in range(5):
    print(f"{rank_list[pos]}. {name_list[pos]} {link_list[pos]}")

23. La Chaparrita Grocery https://www.chicagomag.com/chicago-magazine/august-2024/50-best-restaurants/la-chaparrita-grocery/
26. Birrieria Zaragoza https://www.chicagomag.com/chicago-magazine/august-2024/50-best-restaurants/birrieria-zaragoza/
27. Lem’s Bar-B-Q https://www.chicagomag.com/chicago-magazine/august-2024/50-best-restaurants/lems-bar-b-q/
31. JT’s Genuine Sandwich Shop https://www.chicagomag.com/chicago-magazine/august-2024/50-best-restaurants/jts-genuine-sandwich-shop/
33. Mitsuwa Marketplace Food Court https://www.chicagomag.com/chicago-magazine/august-2024/50-best-restaurants/mitsuwa-marketplace-food-court/


In [152]:
# Combine the three scraped lists into one table, indexed by rank and ordered
# by that index.
rest_data = {'Rank':rank_list, 'Name':name_list, "URL":link_list}
rest_df = pd.DataFrame(rest_data).set_index("Rank").sort_index()
rest_df.head(20)

Unnamed: 0_level_0,Name,URL
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Monteverde,https://www.chicagomag.com/chicago-magazine/au...
2,Oriole,https://www.chicagomag.com/chicago-magazine/au...
3,Galit,https://www.chicagomag.com/chicago-magazine/au...
4,Avec,https://www.chicagomag.com/chicago-magazine/au...
5,Kasama,https://www.chicagomag.com/chicago-magazine/au...
6,Virtue,https://www.chicagomag.com/chicago-magazine/au...
7,HaiSous Vietnamese Kitchen,https://www.chicagomag.com/chicago-magazine/au...
8,Shaw’s Crab House,https://www.chicagomag.com/chicago-magazine/au...
9,Dear Margaret,https://www.chicagomag.com/chicago-magazine/au...
10,Boka,https://www.chicagomag.com/chicago-magazine/au...


In [93]:
# Write the current rest_df to a comma-separated, UTF-8 encoded file.
rest_df.to_csv(
    "../data/chicago_restaurant.csv",
    sep=",",
    encoding="utf-8",
)

In [153]:
from tqdm import tqdm

price_list = []
address_list = []
site_list = []

# Visit every restaurant's detail page and read price, address and website
# from its "br50-footer" block. One value per row is appended to each list,
# in the row order of rest_df.
for idx, row in tqdm(rest_df.iterrows(), total=len(rest_df.index)):
    req = Request(url=row['URL'], headers={'User-Agent':generate_user_agent()})
    # Close each response once parsed, and bound the wait with a timeout so a
    # single stalled page cannot hang the whole loop.
    with urlopen(req, timeout=30) as page:
        # A dedicated name keeps the list-page `soup` intact, so the cells that
        # parse the ranking page still work when re-run after this one.
        detail_soup = BeautifulSoup(page, "html.parser")

    info = detail_soup.find("div", "br50-footer")
    text = info.get_text()

    dollars_text = info.find(class_="dollars").get_text()
    light_tag = info.find(class_="light")
    if light_tag is None:
        price = dollars_text
    else:
        # Shorten the price by the length of the "light" part.
        # NOTE(review): the slice drops characters from the front, which is
        # only equivalent to removing the light part when every character is
        # the same symbol - confirm against the page markup.
        price = dollars_text[len(light_tag.get_text()):]

    match = re.search(r"Address:\s*(.*?)\s*Website:", text)
    # Record a missing address explicitly; otherwise the value from the
    # previous restaurant would be reused (or the name would be undefined on
    # the first row).
    address = match.group(1) if match else None
    website = info.find("a")["href"]

    price_list.append(price)
    address_list.append(address)
    site_list.append(website)

len(price_list), len(address_list), len(site_list)

100%|██████████| 50/50 [00:52<00:00,  1.04s/it]


(50, 50, 50)

In [157]:
# Add the values collected from the detail pages as new columns.
for column, values in (('Price', price_list), ('Address', address_list), ('Website', site_list)):
    rest_df[column] = values

# Keep only these four columns, in this order; any other column is dropped.
rest_df = rest_df[['Name', 'Price', 'Address', 'Website']]

# Remove pandas' row, column, width and cell-width display limits so the whole
# table is rendered.
for option in ('display.max_rows', 'display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(option, None)

display(rest_df)

Unnamed: 0_level_0,Name,Price,Address,Website
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Monteverde,$$$,1020 W. Madison St.,https://monteverdechicago.com
2,Oriole,$$$$$,661 W. Walnut St.,https://www.oriolechicago.com
3,Galit,$$$$,2429 N. Lincoln Ave.,https://www.galitrestaurant.com
4,Avec,$$$,615 W. Randolph St. | 141 W. Erie St.,https://www.avecrestaurant.com
5,Kasama,$$$$$,1001 N. Winchester Ave.,https://www.kasamachicago.com
6,Virtue,$$$,1462 E. 53rd St.,https://www.virtuerestaurant.com
7,HaiSous Vietnamese Kitchen,$$$,1800 S. Carpenter St.,https://www.haisous.com
8,Shaw’s Crab House,$$$,21 E. Hubbard St.,https://www.shawscrabhouse.com/chicago-illinois
9,Dear Margaret,$$$,2965 N. Lincoln Ave,https://www.dearmargaretchi.com
10,Boka,$$$$$,1729 N. Halsted St.,https://www.bokachicago.com


In [156]:
# Write the current rest_df to a second comma-separated, UTF-8 encoded file.
rest_df.to_csv(
    "../data/chicago_restaurant2.csv",
    sep=",",
    encoding="utf-8",
)