In [1]:
import os
import time
import datetime
import numpy as np
import pandas as pd
import requests
import json

from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from urllib.parse import quote
from urllib.request import urlretrieve
from collections import Counter

In [2]:
# Load secrets.json
with open('../src/secrets.json') as f:

    json_data = json.load(f)

#####################################################
BUCKET_NAME = json_data['BUCKET_NAME']
REGION_NAME = json_data['BUCKET_REGION']
ACCESS_KEY = json_data['AWS_ACCESS_KEY']
SECRET_KEY = json_data['AWS_SECRET_KEY']
#####################################################
DB_USER_NAME = json_data['MYSQL_USERNAME']
DB_USER_PASSWORD= json_data['MYSQL_PASSWORD']
DB_HOST = json_data['MYSQL_HOST']
DB_TABLE_NAME = json_data['MYSQL_DB_NAME']
#####################################################

### Main URL Crawling

In [3]:
# exhibition_url = []

def get_exfibition_url():
    
    global exhibition_url
    
    exhibition_url = []
    
    # url 설정
    url = f'http://ticket.interpark.com/TPGoodsList.asp?Ca=Eve&SubCa=Eve_O&tid4=Eve_O&smid1=gnb&smid2=008&smid3=1'

    # requests로 데이터 요청하기
    resp = requests.get(url)

    # 설정한 url 로 데이터 get 함수 요청
    driver.get(url)

    # 접근한 페이지의 소스코드를 텍스트로 저장 (Str)
    page_text = driver.page_source

    # page_text를 html list로 변환 
    html = BeautifulSoup(page_text, 'html.parser')

    # body > table > tbody > tr:nth-child(2) > td:nth-child(3) > div > div > div.con > div > table > tbody > tr:nth-child(1) > td.RKtxt > span > a

    # blog_url
    url=[]

    for i in html.select('td.RKtxt > span > a') :
        urls = i.attrs['href']
        url.append(urls)
        
    print(f"전시회 {len(url)}개 의 주소가 있습니다.")

    for i in url :
        exhibition_url.append(f'http://ticket.interpark.com/{i}')

In [4]:
def get_crawling_exhibition_info(url):
        
    resp = requests.get(url)
    
    # print(f'resp : {resp}')
    
    driver.get (url)
    
    time.sleep(1)
    
    try:
        driver.find_element_by_xpath('//*[@id="popup-prdGuide"]/div/div[3]/div/a').click()
    except:
        pass
    
    # 접근한 페이지의 소스코드를 텍스트로 저장 (Str)
    page_text = driver.page_source

    # page_text를 html list로 변환 
    html = BeautifulSoup(page_text, 'html.parser')
    
    # get title
    # title
    try:
        title = html.select_one('div.summaryTop > h2').text
    except:
        pass
    
    # get place name
    # place
    try:
        place = html.select_one('li:nth-child(1) > div > a').text
        place = place.split('(')[0]
    except:
        pass
    
    # get date (startAt, endAt)
    try:
        date = html.select_one('li:nth-child(2) > div > p').text
        startAt = date.split(' ~')[0]
        endAt = date.split(' ~')[1]
    except:
        pass
    
    # get ageLimit
    try:
        ageLimit = html.select_one('li:nth-child(4) > div > p').text
    except:
        ageLimit = html.select_one('li:nth-child(3) > div > p').text
        
        
    
    # get price
    try:
        price = html.select_one('li:nth-child(2) > span.price').text
    except:
        price = '가격 정보 없음'
        
    # get main_img_url
    try:
        main_img = html.select_one('div.posterBoxTop > img').attrs['src']
        main_img_url = 'http:' + main_img
    except:
        pass
    
    # get statGender
    try:    
        statMale = html.select_one('div.statGender > div.statGenderType.is-male > div.statGenderValue').text
        statFemale = html.select_one('div.statGender > div.statGenderType.is-female > div.statGenderValue').text
    except:
        statMale = '통계 정보 없음'
        statFemale = '통계 정보 없음'
        
    # get statAge
    try:
        statAge_Percent_10 = html.select_one('div:nth-child(1) > div.statAgePercent').text
        statAge_Percent_20 = html.select_one('div:nth-child(2) > div.statAgePercent').text
        statAge_Percent_30 = html.select_one('div:nth-child(3) > div.statAgePercent').text
        statAge_Percent_40 = html.select_one('div:nth-child(4) > div.statAgePercent').text
        statAge_Percent_50 = html.select_one('div:nth-child(5) > div.statAgePercent').text
    except:
        statAge_Percent_10 = '통계 정보 없음'
        statAge_Percent_20 = '통계 정보 없음'
        statAge_Percent_30 = '통계 정보 없음'
        statAge_Percent_40 = '통계 정보 없음'
        statAge_Percent_50 = '통계 정보 없음'
        
    # get exhibition_datail_info
    try:
        global detail_info
        detail_info = []
        block_1 = html.select_one('div:nth-child(1) > div.contentDetail > p').text
        block_2 = html.select_one('div:nth-child(1) > div > ul > div').text
        detail_info.append(block_1)
        detail_info.append(block_2)
        # remove empty elements
        while("" in detail_info):
            detail_info.remove("")
        # remove new lines in string
        detail_info = [item.replace('\r', ' ').replace('\n', ' ') for item in detail_info]

        detail_info = ' '.join(detail_info)

    except:
        detail_info = '공연시간 정보 text 없음'
    
    # get notice img
    try:
        notice_images = []
        notice_image_list = html.select('div:nth-child(2) > div.contentDetail > img')
        
        for i in notice_image_list:
            i = 'http:' + i.get('src')
            notice_images.append(i)
        
        notice_image = ', '.join(notice_images)
    except:
        notice_images = '공지사항 image 없음'
        
    # get notice text
    try:
        notice_texts = html.select_one('div:nth-child(2) > div.contentDetail').text
    except:
        notice_texts = '공지사항 text 없음'

    # get detail_info(img)
    try:
        images = []
        detail_info_image_urls = html.select('div.content.description > div.contentDetail > img')

        for i in detail_info_image_urls:
            i = 'http:' + i.get('src') #type: ignore
            images.append(i)

        detail_info_image = ', '.join(images)
    except:
        detail_info_image = '상세정보 image 없음'
    
    ### append ###
    global title_list
    global place_list
    global startAt_list
    global endAt_list
    global ageLimit_list
    global price_list
    global main_img_url_list
    global statMale_list
    global statFemale_list
    global statAge_Percent_10_list
    global statAge_Percent_20_list
    global statAge_Percent_30_list
    global statAge_Percent_40_list
    global statAge_Percent_50_list
    global detail_info_list
    global notice_img_list
    global notice_txt_list
    global detail_info_img_list
    
    title_list = []
    place_list = []
    startAt_list = []
    endAt_list = []
    ageLimit_list = []
    price_list = []
    main_img_url_list = []
    statMale_list = []
    statFemale_list = []
    statAge_Percent_10_list = []
    statAge_Percent_20_list = []
    statAge_Percent_30_list = []
    statAge_Percent_40_list = []
    statAge_Percent_50_list = []
    detail_info_list = []
    notice_img_list = []
    notice_txt_list = []
    detail_info_img_list = []
    
    # title
    title_list.append(title)
    # place
    place_list.append(place)
    # date
    startAt_list.append(startAt)
    endAt_list.append(endAt)
    # ageLimit
    ageLimit_list.append(ageLimit)
    # price
    price_list.append(price)
    # main_img_url
    main_img_url_list.append(main_img_url)
    # statGender
    statMale_list.append(statMale)
    statFemale_list.append(statFemale)    
    # statAge
    statAge_Percent_10_list.append(statAge_Percent_10)
    statAge_Percent_20_list.append(statAge_Percent_20)
    statAge_Percent_30_list.append(statAge_Percent_30)
    statAge_Percent_40_list.append(statAge_Percent_40)
    statAge_Percent_50_list.append(statAge_Percent_50)
    # detail_info
    detail_info_list.append(detail_info)
    # notice
    notice_img_list.append(notice_image)
    notice_txt_list.append(notice_texts)
    # detail_info_img
    detail_info_img_list.append(detail_info_image)    

In [5]:
def get_DataFrame():
    
    global data
    
    data = {
        'title' : title_list,
        'place' : place_list,
        'startAt' : startAt_list,
        'endAt' : endAt_list,
        'price' : price_list,
        'ageLimit' : ageLimit_list,
        'main_img_url' : main_img_url_list,
        'statMale' : statMale_list,
        'statFemale' : statFemale_list,
        'statAge_10' : statAge_Percent_10_list,
        'statAge_20' : statAge_Percent_20_list,
        'statAge_30' : statAge_Percent_30_list,
        'statAge_40' : statAge_Percent_40_list,
        'statAge_50' : statAge_Percent_50_list,
        'detail_info': detail_info_list,
        'notice_img_url': notice_img_list,
        'notice_txt': notice_txt_list,
        'detail_info_img_url': detail_info_img_list,
        }

    global df
    
    df = pd.DataFrame(data)
    
    return df

In [19]:
def save_img() :
    # 현재 시간 기준 폴더 생성
    # NOW = datetime.datetime.now().strftime('%Y-%m-%d/%H:%M:%S')
    NOW = datetime.datetime.now().strftime('%Y-%m-%d') + '/img'
    # RECORD_IMG_PATH = os.path.join(f'../../data/img', NOW)
    RECORD_IMG_PATH = os.path.join('../../data', NOW)
    print(f'{RECORD_IMG_PATH} 에 저장됩니다.')
    os.makedirs(RECORD_IMG_PATH, exist_ok=True)
    
    for idx, url in enumerate(tqdm(main_img_url_list)) :
        urlretrieve(url, f'{RECORD_IMG_PATH}/{idx}.jpg')
        time.sleep(1)

In [26]:
def save_csv() :
    # 현재 시간 기준 폴더 생성
    # NOW = datetime.datetime.now().strftime('%Y-%m-%d/%H:%M:%S')
    NOW = datetime.datetime.now().strftime('%Y-%m-%d') + '/csv'
    # RECORD_IMG_PATH = os.path.join(f'../../data/img', NOW)
    RECORD_CSV_PATH = os.path.join('../../data', NOW)
    os.makedirs(RECORD_CSV_PATH, exist_ok=True)
    
    df.to_csv(f'{RECORD_CSV_PATH}/exhibition.csv', index=False, encoding='utf-8-sig')

### Run Crwaling

In [27]:
driver = webdriver.Chrome('../chromedriver')

get_exfibition_url()

try:
    for idx, url in enumerate(tqdm(exhibition_url)) :
        # print(f'{idx} 번째 URL Img : {url}')
        get_crawling_exhibition_info(url)
        
    driver.close()
    
    get_DataFrame()
    
    save_img()
    save_csv()
    
    print('Done Processing')
except Exception as e:
    print(e)
    print('Error Accured')
    driver.close()

전시회 106개 의 주소가 있습니다.


100%|██████████| 106/106 [03:23<00:00,  1.92s/it]


../../data/2023-01-23/img 에 저장됩니다.


100%|██████████| 1/1 [00:01<00:00,  1.09s/it]

Done Processing



