# 시카고 샌드위치 맛집 분석

## 1. 시카고 샌드위치 맛집 사이트에 접근하기

In [17]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [18]:
url_base = "http://www.chicagomag.com"
url_sub = "/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/"
url = url_base + url_sub

html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')

In [96]:
rest_list = soup.find_all(class_='sammy')
rest_list[0]

<div class="sammy" style="position: relative;">
<div class="sammyRank">1</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br>
Old Oak Tap<br>
<em>Read more</em> </br></br></a></div>
</div>

In [20]:
len(rest_list)

50

## 2. 접근한 웹 페이지에서 원하는 데이터 추출하고 정리하기

In [21]:
tmp_one = rest_list[0]

In [22]:
tmp_one.find(class_='sammyRank')

<div class="sammyRank">1</div>

In [23]:
tmp_one.find(class_='sammyRank').get_text()

'1'

In [24]:
tmp_one.find(class_='sammyListing').get_text()

'BLT\r\nOld Oak Tap\nRead more '

In [25]:
tmp_one.find('a')['href']

'/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

In [26]:
tmp_str = tmp_one.find(class_='sammyListing').get_text()
tmp_str.split('\n')

['BLT\r', 'Old Oak Tap', 'Read more ']

In [27]:
tmp_res = tmp_str.split('\n')
menu = tmp_res[0].replace('\r','')
menu

'BLT'

In [28]:
cafe = tmp_res[1]
cafe

'Old Oak Tap'

In [29]:
# Regular Expression
import re

In [30]:
re.split(('\n|\r\n'),tmp_str)

['BLT', 'Old Oak Tap', 'Read more ']

In [31]:
print(re.split(('\n|\r\n'),tmp_str)[0])
print(re.split(('\n|\r\n'),tmp_str)[1])

BLT
Old Oak Tap


In [32]:
from urllib.parse import urljoin

## 데이터 프레임 만들기 전에 항목 데이터 구하기

In [33]:
rank = []
main_menu = []
cafe_name = []
url_add = []

for rest in rest_list:
    rank.append(int(rest.find(class_='sammyRank').get_text()))
    tmp_str = rest.find(class_='sammyListing').get_text()
    main_menu.append(re.split(('\n|\r\n'),tmp_str)[0])
    cafe_name.append(re.split(('\n|\r\n'),tmp_str)[1])
    url_add.append(urljoin(url_base, rest.find('a')['href']))

In [34]:
main_menu[:5]

['BLT', 'Fried Bologna', 'Woodland Mushroom', 'Roast Beef', 'PB&L']

In [35]:
import pandas as pd
df = pd.DataFrame({'rank':rank,'cafe':cafe_name,'menu':main_menu,'URL':url_add})
df

Unnamed: 0,rank,cafe,menu,URL
0,1,Old Oak Tap,BLT,http://www.chicagomag.com/Chicago-Magazine/Nov...
1,2,Au Cheval,Fried Bologna,http://www.chicagomag.com/Chicago-Magazine/Nov...
2,3,Xoco,Woodland Mushroom,http://www.chicagomag.com/Chicago-Magazine/Nov...
3,4,Al’s Deli,Roast Beef,http://www.chicagomag.com/Chicago-Magazine/Nov...
4,5,Publican Quality Meats,PB&L,http://www.chicagomag.com/Chicago-Magazine/Nov...
5,6,Hendrickx Belgian Bread Crafter,Belgian Chicken Curry Salad,http://www.chicagomag.com/Chicago-Magazine/Nov...
6,7,Acadia,Lobster Roll,http://www.chicagomag.com/Chicago-Magazine/Nov...
7,8,Birchwood Kitchen,Smoked Salmon Salad,http://www.chicagomag.com/Chicago-Magazine/Nov...
8,9,Cemitas Puebla,Atomica Cemitas,http://www.chicagomag.com/Chicago-Magazine/Nov...
9,10,Nana,Grilled Laughing Bird Shrimp and Fried Po’ Boy,http://www.chicagomag.com/Chicago-Magazine/Nov...


In [36]:
df.to_csv('data/chicago1.csv',sep=',',encoding="utf-8")

## 3. 다수의 웹 페이지에 자동으로 접근해서 원하는 정보 가져오기

In [37]:
df['URL'][0]

'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

In [38]:
html = urlopen(df['URL'][0])
soup_tmp = BeautifulSoup(html, 'html.parser')

In [39]:
soup_tmp.find(class_='addy')

<p class="addy">
<em>$10. 2109 W. Chicago Ave., 773-772-0406, <a href="http://www.theoldoaktap.com/">theoldoaktap.com</a></em></p>

In [40]:
tmp_str = soup_tmp.find(class_='addy').get_text()
tmp_str

'\n$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'

In [41]:
tmp_str.split()

['$10.', '2109', 'W.', 'Chicago', 'Ave.,', '773-772-0406,', 'theoldoaktap.com']

In [42]:
tmp_price = tmp_str.split()[0][:-1]
tmp_price

'$10'

In [43]:
tmp_address = ' '.join(tmp_str.split()[1:-2])[:-1]
tmp_address

'2109 W. Chicago Ave.'

In [44]:
tmp_tel = tmp_str.split()[-2][:-1]
tmp_tel

'773-772-0406'

### - 상태 진행바 적용

In [77]:
from tqdm.notebook import tqdm
import time

In [80]:
price = []
addr = []
for n in tqdm(df.index):
    html = urlopen(df['URL'][n])
    soup_tmp = BeautifulSoup(html,'html.parser')
    tmp_str = soup_tmp.find(class_='addy').get_text()
    
    price.append(tmp_str.split()[0][:-1])
    addr.append(' '.join(tmp_str.split()[1:-2])[:-1])

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [47]:
price[:10]

NameError: name 'price' is not defined

In [36]:
addr[:10]

['2109 W. Chicago Ave.',
 '800 W. Randolph St.',
 '445 N. Clark St.',
 '914 Noyes St., Evanston',
 '825 W. Fulton Mkt.',
 '100 E. Walto',
 '1639 S. Wabash Ave.',
 '2211 W. North Ave.',
 '3619 W. North Ave.',
 '3267 S. Halsted St.']

In [85]:
df2 = df.iloc[:,:-1]
df2

Unnamed: 0,rank,cafe,menu
0,1,Old Oak Tap,BLT
1,2,Au Cheval,Fried Bologna
2,3,Xoco,Woodland Mushroom
3,4,Al’s Deli,Roast Beef
4,5,Publican Quality Meats,PB&L
5,6,Hendrickx Belgian Bread Crafter,Belgian Chicken Curry Salad
6,7,Acadia,Lobster Roll
7,8,Birchwood Kitchen,Smoked Salmon Salad
8,9,Cemitas Puebla,Atomica Cemitas
9,10,Nana,Grilled Laughing Bird Shrimp and Fried Po’ Boy


In [86]:
df2['price'] = price

In [87]:
df2['addr'] = addr

In [88]:
df2

Unnamed: 0,rank,cafe,menu,price,addr
0,1,Old Oak Tap,BLT,$10,2109 W. Chicago Ave.
1,2,Au Cheval,Fried Bologna,$9,800 W. Randolph St.
2,3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St.
3,4,Al’s Deli,Roast Beef,$9.40,"914 Noyes St., Evanston"
4,5,Publican Quality Meats,PB&L,$10,825 W. Fulton Mkt.
5,6,Hendrickx Belgian Bread Crafter,Belgian Chicken Curry Salad,$7.25,100 E. Walto
6,7,Acadia,Lobster Roll,$16,1639 S. Wabash Ave.
7,8,Birchwood Kitchen,Smoked Salmon Salad,$10,2211 W. North Ave.
8,9,Cemitas Puebla,Atomica Cemitas,$9,3619 W. North Ave.
9,10,Nana,Grilled Laughing Bird Shrimp and Fried Po’ Boy,$17,3267 S. Halsted St.


In [89]:
df2.set_index('rank', inplace=True)
df2

Unnamed: 0_level_0,cafe,menu,price,addr
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Old Oak Tap,BLT,$10,2109 W. Chicago Ave.
2,Au Cheval,Fried Bologna,$9,800 W. Randolph St.
3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St.
4,Al’s Deli,Roast Beef,$9.40,"914 Noyes St., Evanston"
5,Publican Quality Meats,PB&L,$10,825 W. Fulton Mkt.
6,Hendrickx Belgian Bread Crafter,Belgian Chicken Curry Salad,$7.25,100 E. Walto
7,Acadia,Lobster Roll,$16,1639 S. Wabash Ave.
8,Birchwood Kitchen,Smoked Salmon Salad,$10,2211 W. North Ave.
9,Cemitas Puebla,Atomica Cemitas,$9,3619 W. North Ave.
10,Nana,Grilled Laughing Bird Shrimp and Fried Po’ Boy,$17,3267 S. Halsted St.


In [90]:
df2.to_csv('data/chicago2.csv',sep=',',encoding="utf-8")

## 구글지도 API

In [49]:
df3 = pd.read_csv('chicago2.csv')

In [56]:
import numpy as np
import folium
import googlemaps

In [57]:
key_fd = open('googlemapskey.txt', mode='r')
gmaps_key = key_fd.read(100)
key_fd.close()

In [58]:
gmaps = googlemaps.Client(key=gmaps_key)

In [81]:
lat = []
lng = []

for n in tqdm(df.index):
    if df3['addr'][n] != 'Multipl':
        target_name = df3['addr'][n]+', '+'Chicago'
        gmaps_output = gmaps.geocode(target_name)
        location_output = gmaps_output[0].get('geometry')
        lat.append(location_output['location']['lat'])
        lng.append(location_output['location']['lng'])
    else:
        lat.append(np.nan)
        lng.append(np.nan)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [67]:
df3['lat'] = lat
df3['lng'] = lng
df3.head()

Unnamed: 0,rank,cafe,menu,price,addr,lat,lng
0,1,Old Oak Tap,BLT,$10,2109 W. Chicago Ave.,41.895605,-87.679961
1,2,Au Cheval,Fried Bologna,$9,800 W. Randolph St.,41.884658,-87.647667
2,3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St.,41.890523,-87.630783
3,4,Al’s Deli,Roast Beef,$9.40,"914 Noyes St., Evanston",42.058322,-87.683748
4,5,Publican Quality Meats,PB&L,$10,825 W. Fulton Mkt.,41.8866,-87.648451


In [71]:
mapping = folium.Map(location=[df3['lat'].mean(), df3['lng'].mean()],zoom_start=11)
folium.Marker([df3['lat'].mean(), df3['lng'].mean()],popup='center').add_to(mapping)
mapping

In [70]:
mapping = folium.Map(location=[df3['lat'].mean(), df3['lng'].mean()],zoom_start=11)

for n in df3.index:
    if df3['addr'][n] != 'Multipl':
        folium.Marker([df3['lat'][n], df3['lng'][n]],popup=df3['cafe'][n]).add_to(mapping)
mapping

In [83]:
df3.to_csv('data/chicago3.csv',sep=',',encoding="utf-8")