In [30]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from auth import *

def KakaoMapAPI(regions):
    # Create an empty DataFrame to store all the restaurant information
    all_df = pd.DataFrame()
    
    # Iterate over each region and search for restaurants using the Kakao Map API
    for region in regions:
        # Set the API endpoint URL
        url = "https://dapi.kakao.com/v2/local/search/keyword.json"
        
        # Set the headers for the API request
        headers = {"Authorization": f"KakaoAK {REST_API_KEY}"}
        
        # Set the search parameters, including the radius of the search (20km), the category of the search (food), 
        # and the search query (the region name followed by "restaurant" in Korean)
        params = {
            "radius" : "20000",
            "category_group_code" : "FD6",
            "query": f"{region} 맛집"
        }
        
        # Iterate over each page of search results (up to 45 pages) and append the restaurant information to the DataFrame
        for i in range(1,46):
            # Set the page parameter for the search request
            params['page'] = str(i)
            
            # Send the search request and get the response as JSON
            response = requests.get(url, headers=headers, params=params)
            tmp_json = response.json()['documents']
            
            # Convert the JSON to a DataFrame and append it to the main DataFrame
            df = pd.DataFrame(tmp_json)
            all_df = pd.concat([all_df, df])
    
    # Rename the DataFrame columns to be more descriptive
    all_df = all_df.rename(columns={
        'x': 'diner_lat',
        'y' : 'diner_lon',
        'road_address_name': 'diner_address',
        'place_name' : 'diner_name',
        'category_name' : 'diner_category',
        'phone' : 'diner_phone',
        'id' : 'diner_id'
    })
    
    # Return the final DataFrame with all the restaurant information
    return all_df


In [31]:
regions = ['삼성', '양재']
urls_df = KakaoMapAPI(regions)

In [34]:
urls_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1350 entries, 0 to 14
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   address_name         1350 non-null   object
 1   category_group_code  1350 non-null   object
 2   category_group_name  1350 non-null   object
 3   diner_category       1350 non-null   object
 4   distance             1350 non-null   object
 5   diner_id             1350 non-null   object
 6   diner_phone          1350 non-null   object
 7   diner_name           1350 non-null   object
 8   place_url            1350 non-null   object
 9   diner_address        1350 non-null   object
 10  diner_lat            1350 non-null   object
 11  diner_lon            1350 non-null   object
dtypes: object(12)
memory usage: 137.1+ KB


In [6]:
urls_df.head()

Unnamed: 0,address_name,category_group_code,category_group_name,diner_category,distance,diner_id,diner_phone,diner_name,place_url,diner_address,diner_lat,diner_lon
0,서울 종로구 종로3가 167,FD6,음식점,"음식점 > 한식 > 육류,고기 > 닭요리",,8758064,02-2263-6658,계림 종로본점,http://place.map.kakao.com/8758064,서울 종로구 돈화문로4길 39,126.994503220999,37.5701578866786
1,서울 종로구 종로3가 107-2,FD6,음식점,음식점 > 한식 > 설렁탕,,8074478,02-2271-3820,종로설렁탕,http://place.map.kakao.com/8074478,서울 종로구 종로 110,126.98975583008657,37.5699161034148
2,서울 종로구 종로3가 158,FD6,음식점,"음식점 > 한식 > 육류,고기 > 곱창,막창",,16415346,02-2274-6683,최가네황소곱창,http://place.map.kakao.com/16415346,서울 종로구 돈화문로4길 27,126.99386480069762,37.57012181541133
3,서울 종로구 종로3가 123-3,FD6,음식점,"음식점 > 한식 > 육류,고기",,24049545,02-2269-8877,화통본가,http://place.map.kakao.com/24049545,서울 종로구 수표로20길 33,126.991283949953,37.5700531774283
4,서울 종로구 종로3가 16,FD6,음식점,음식점 > 한식,,16008086,02-742-8525,시골전집,http://place.map.kakao.com/16008086,서울 종로구 종로 123,126.99117633742372,37.57070458907816


## 중복 파악

In [35]:
urls_df.drop_duplicates(subset=['place_url'], keep='last',inplace=True)

# 리뷰 긁기

In [39]:
import os
import re
from time import sleep
import requests

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    ElementNotInteractableException,
    NoSuchElementException,
    StaleElementReferenceException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

options = webdriver.ChromeOptions()
options.add_argument("lang=ko_KR")
options.add_argument("headless")
options.add_argument("window-size=1920x1080")
options.add_argument("disable-gpu")

# chromedriver_path = "/home/elinha/Testproject/mediapipe/chromedriver"
# 크롬 드라이버를 사용합니다 (맥은 첫 줄, 윈도우는 두번째 줄 실행)
driver = webdriver.Chrome(executable_path=ChromeDriverManager().install(), options=options)

  driver = webdriver.Chrome(executable_path=ChromeDriverManager().install(), options=options)


In [None]:
diner_cols = [
        'diner_id',
        'diner_name',           # 가게이름 
        'diner_category',       # 가게 카테고리
        'diner_menu',           # 가게 메뉴
        'diner_review_cnt',     # 가게의 평점 개수 
        'diner_review_avg',     # 가게의 평점 평균
        'diner_review_tags',    # 리뷰 태그
        'diner_address',        # 가게 주소    
        'diner_lon',            # 가게 위도
        'diner_lat',            # 가게 경도
        'diner_url',            # 가게 URL
        'diner_open_time'       # 가게 오픈시간
       ]

review_cols = [
        'diner_id',
        'reviewer_review',
        'reviewer_avg',         # 리뷰어의 평점 평균
        'reviewer_review_cnt',  # 리뷰어의 리뷰 개수
        'reviewer_review_score',# 리뷰어가 남긴 평점
        'reviewer_review_date', # 리뷰를 남긴 날짜
        'diner_idx'             # 가게 인덱스
        ]

In [33]:
urls_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1350 entries, 0 to 14
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   address_name         1350 non-null   object
 1   category_group_code  1350 non-null   object
 2   category_group_name  1350 non-null   object
 3   diner_category       1350 non-null   object
 4   distance             1350 non-null   object
 5   diner_id             1350 non-null   object
 6   diner_phone          1350 non-null   object
 7   diner_name           1350 non-null   object
 8   place_url            1350 non-null   object
 9   diner_address        1350 non-null   object
 10  diner_lat            1350 non-null   object
 11  diner_lon            1350 non-null   object
dtypes: object(12)
memory usage: 137.1+ KB


In [40]:
columns = [
    'dinner_id',
    'diner_name',           # 가게이름 
    'diner_category',       # 가게 카테고리
    'diner_menu',           # 가게 메뉴
    'diner_review_cnt',     # 가게의 평점 개수 
    'diner_review_avg',     # 가게의 평점 평균
    'diner_review_tags',    # 리뷰 태그
    'diner_address',        # 가게 주소 
    'diner_lon',
    'diner_lat',
    'diner_url',
    'diner_open_time',       # 가게 오픈시간
    "reviewer_review",
    'reviewer_avg',         # 리뷰어의 평점 평균
    'reviewer_review_cnt',  # 리뷰어의 리뷰 개수
    'reviewer_review_score',# 리뷰어가 남긴 평점
    'reviewer_review_date', # 리뷰를 남긴 날짜
    "reviewer_id"
]
# 사업장명, 주소, 음식종류1,음식종류2(메뉴),리뷰수,별점,리뷰
df = pd.DataFrame(columns=columns)

for i in range(len(urls_df)):
    dinner_id = urls_df.iloc[i,5]
    page_url = urls_df.iloc[i,8]
    cat1 = urls_df.iloc[i,3]
    address = urls_df.iloc[i,9]
    name  = urls_df.iloc[i,7]
    diner_phone = urls_df.iloc[i,6]
    diner_lat = urls_df.iloc[i,10]
    diner_lon = urls_df.iloc[i,11]
    print(f"{dinner_id}: {page_url}")
    # 상세보기 페이지에 접속합니다
    driver.get(page_url)
    wait = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'kakaoWrap')))
    sleep(3)


    if driver.find_elements(
        By.XPATH, '//div[@class="box_grade_off"]'
    ):  # 후기를 제공하지 않는 맛집 넘기기
        continue
    
    # 평균 별점
    score_min = driver.find_element(
        By.XPATH, '//*[@id="mArticle"]/div[1]/div[1]/div[2]/div/div/a[1]/span[1]'
    ).text

    # 리뷰수
    review_num = driver.find_element(By.XPATH, '//span[@class="color_g"]').text[1:-2]
    
    #영업시간
    open_times = driver.find_elements(By.XPATH, '//ul[@class="list_operation"]')
    
    if len(open_times):
        open_time = open_times[0].get_attribute('innerText').split('\n')[0]
    else:
        open_time = '제공하지 않음'

    # # 블로그리뷰수
    # blog_review_num = driver.find_element(
    #     By.XPATH, '//*[@id="mArticle"]/div[1]/div[1]/div[2]/div/div/a[2]/span'
    # ).text


    # 메뉴
    cat2 = []

    menus = driver.find_elements(By.CLASS_NAME, "info_menu")
    for menu in menus:
        cat2.append(menu.text)

    # 식당 장점
    likePoints = driver.find_elements(By.XPATH, '//*[@class="txt_likepoint"]')
    likePointCnts = driver.find_elements(By.XPATH, '//*[@class="num_likepoint"]')
    likePoint = ""
    for p, c in zip(likePoints, likePointCnts):
        likePoint += p.text + "@" + c.text + "@"

    if driver.find_elements(By.XPATH, '//*[@id="mArticle"]/div[7]/div[3]/a/span[1]'):
        # 리뷰 더보기 최대로
        while not bool(
            driver.find_elements(By.XPATH, '//a[@class="link_more link_unfold"]')
        ):
            tmp_clk = driver.find_elements(By.XPATH, '//*[@class="txt_more"]')
            wait = WebDriverWait(driver, 1)
            element = wait.until(
                EC.element_to_be_clickable((By.CLASS_NAME, "link_more"))
            )
            try:
                if tmp_clk[0].text == "후기 더보기":
                    tmp_clk[0].click()
            except Exception as e:
                print("클릭 예외가 발생되었습니다.")
                pass

    try:
        html = driver.page_source
        soup = BeautifulSoup(html, "html.parser")
        contents_div = soup.find(name="div", attrs={"class": "evaluation_review"})

        # 별점을 가져옵니다.
        rateNcnt = contents_div.find_all(name="span", attrs={"class": "txt_desc"})
        rateCnts = rateNcnt[::2]
        rates = rateNcnt[1::2]

        # 개인이 해당 식당에 남긴 별점
        rateAts = driver.find_elements(
            By.XPATH, '//div[@class="grade_star size_s"]/span/span'
        )

        # 리뷰를 가져옵니다.
        reviews = contents_div.find_all(name="p", attrs={"class": "txt_comment"})

        # 리뷰를 쓴 날짜를 가져옵니다.
        reviews_dates = contents_div.find_all(
            name="span", attrs={"class": "time_write"}
        )

        # 리뷰 아이디 가져오기
        reviews_ids = contents_div.find_all(name="a", attrs={"class": "link_user"})
        print("rateAts", len(rateAts), "reviews", len(reviews))
        # 데이터프레임으로 정리합니다.
        for rate, rateCnt, rateAt, review, reviews_date, reviews_id in zip(
            rates, rateCnts, rateAts, reviews, reviews_dates, reviews_ids
        ):
            rateAt = int(rateAt.get_attribute("style")[7:-2]) / 20
            row = [
                dinner_id,
                name,
                cat1,
                cat2,
                review_num,
                score_min,
                likePoint,
                address,
                diner_lon,
                diner_lat,
                page_url,
                open_time,
                review.find(name="span").text,
                rate.text,
                rateCnt.text,
                rateAt,
                reviews_date.text,
                reviews_id.text,
            ]
            series = pd.DataFrame([row], columns=columns)
            df = pd.concat([df, series])
    except Exception as e:
        print("예외가 발생되었습니다.", e)

1770731230: http://place.map.kakao.com/1770731230
클릭 예외가 발생되었습니다.
rateAts 585 reviews 585
25744174: http://place.map.kakao.com/25744174
rateAts 3 reviews 3
27261403: http://place.map.kakao.com/27261403
rateAts 3 reviews 3
27512695: http://place.map.kakao.com/27512695
클릭 예외가 발생되었습니다.
rateAts 277 reviews 275
21306153: http://place.map.kakao.com/21306153
클릭 예외가 발생되었습니다.
rateAts 250 reviews 250
8664636: http://place.map.kakao.com/8664636
rateAts 3 reviews 3
1198484281: http://place.map.kakao.com/1198484281
클릭 예외가 발생되었습니다.
rateAts 194 reviews 182
26794399: http://place.map.kakao.com/26794399
클릭 예외가 발생되었습니다.
rateAts 339 reviews 336
8024422: http://place.map.kakao.com/8024422
rateAts 180 reviews 172
12890589: http://place.map.kakao.com/12890589
rateAts 348 reviews 343
24041168: http://place.map.kakao.com/24041168
클릭 예외가 발생되었습니다.
rateAts 125 reviews 117
8969599: http://place.map.kakao.com/8969599
클릭 예외가 발생되었습니다.
rateAts 134 reviews 131
1986314222: http://place.map.kakao.com/1986314222
클릭 예외가 발