# 알라딘 크롤링
- 로그인 -> 주문 상세정보 -> 책 제목, 책 url
- 책 url -> 중고 모두보기 url
- 중고 모두보기 url에서 파는 곳, 책 상태, 가격 추출

In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import pyperclip
from tqdm import tqdm
import numpy as np

## 주문 상세정보
- 주문 번호와 url 수가 20개 정도로 많지 않고 간단하기 때문에 Listly를 활용함

## 주문상세url -> 책이름, 책페이지 추출

In [None]:
# Load the order-history spreadsheet into a DataFrame.
order_df = pd.read_excel(io='알라딘 주문이력.xlsx')

In [None]:
# Order-detail page URLs, one per row of the order history.
url_list = order_df['주문상세url'].tolist()

In [None]:
# Start Selenium and open the Aladin login page.
from getpass import getpass

driver = webdriver.Chrome()
driver.get('https://www.aladin.co.kr/login/wlogin.aspx')

# Ask for the credentials. The password is read with getpass so that it is
# not echoed into the notebook output, which is saved with the notebook.
id_value = input("id를 입력하세요: ")
pw_value = getpass("pw를 입력하세요: ")


def _paste_into(element_id, text):
    """Click the input with the given id and paste `text` into it via the clipboard."""
    box = driver.find_element(By.ID, value=element_id)
    box.click()
    pyperclip.copy(text)
    # NOTE(review): CONTROL+v is the paste shortcut on Windows/Linux;
    # macOS would need Keys.COMMAND - confirm the target platform.
    box.send_keys(Keys.CONTROL, 'v')
    time.sleep(0.5)


# Fill in the login form and submit it.
try:
    _paste_into("Email", id_value)
    _paste_into("Password", pw_value)
finally:
    # Do not leave the password on the system clipboard.
    pyperclip.copy('')
driver.find_element(By.XPATH, '//*[@id="LoginForm"]/div[2]/a/div').click()
time.sleep(0.5)

# Visit every order-detail page and collect (book name, book url) pairs.
# Rows are gathered in one list and turned into a DataFrame once, instead of
# concatenating a new DataFrame per page.
book_rows = []
for order_url in tqdm(url_list):
    driver.get(order_url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for row in soup.select('#tblOrdersItem>tr'):
        links = row.select('.td_item>div>a')
        if not links:
            # Table row without a product link: nothing to record.
            continue
        href = links[0].get('href')
        if not href:
            continue
        book_rows.append([links[0].text.strip(), href])
    time.sleep(1)  # pause between page loads

# Passing the column names here keeps this working when no rows were found.
book_df = pd.DataFrame(book_rows, columns=['book_name', 'book_url'])
# Drop the '봉투' line item (presumably packaging rather than a book).
book_df = book_df[book_df['book_name'] != '봉투']
book_df.reset_index(inplace=True, drop=True)

## 책페이지 -> 책 재고 페이지

In [None]:
# For every book page, find the link to its used-copy listing page and store
# it in the 'book_storage_url' column. The URLs are collected in a list and
# assigned in one step so the column is created with string values directly
# (filling a float NaN column with strings is an incompatible-dtype
# assignment). Books whose page has no such link get None.
storage_urls = []
for book_url in tqdm(book_df['book_url']):
    driver.get(book_url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    links = soup.select('div.Ere_textR>ul>li>a')
    if links:
        storage_urls.append('https://www.aladin.co.kr' + links[0]['href'])
    else:
        storage_urls.append(None)
    time.sleep(1)  # pause between page loads
book_df['book_storage_url'] = storage_urls

In [None]:
# book_df.to_csv('book_df.csv', index=False)
# book_df=pd.read_csv('book_df.csv')

## 책 재고 페이지 -> 지점, 가격, 상태 등 정보 추출

In [None]:
# Listing pages are requested as '&page=1' .. '&page=MAX_PAGES' for each book.
MAX_PAGES = 2
STORAGE_COLUMNS = ['book_name', 'book_quality', 'book_price',
                   'book_deliverfee', 'store_name', 'store_type']


def _parse_listing_page(soup):
    """Return one row per listing found on a parsed listing page.

    Each row is [book_name, book_quality, book_price, book_deliverfee,
    store_name, store_type], all as scraped text. Raises IndexError when one
    of the parallel element lists is shorter than the list of book names.
    """
    # Run each selector once; the i-th entries are treated as one listing.
    names = soup.select('span.Ere_str.Ere_fs16')
    qualities = soup.select('span.Ere_sub_top')
    prices = soup.select('li.Ere_sub_pink>span')
    price_lists = soup.select('div.price>ul')
    seller_lists = soup.select('div.seller>ul')

    rows = []
    for i, name in enumerate(names):
        book_quality = qualities[i].text
        book_price = prices[i].text
        # Last whitespace-separated token of the last <li> in the price list.
        book_deliverfee = price_lists[i].select('li')[-1].text.split(' ')[-1]

        seller_ul = seller_lists[i]
        seller_items = seller_ul.select('li')
        # len() of a tag is the number of its direct child nodes; the two
        # known layouts put the store name and store type in opposite order.
        child_count = len(seller_ul)
        if child_count == 3:
            store_name = seller_items[0].text.strip()
            store_type = seller_items[1].text.strip()
        elif child_count == 2:
            store_name = seller_items[1].text.strip()
            store_type = seller_items[0].text.strip()
        else:
            # Unknown layout: record no seller instead of reusing the values
            # left over from the previous listing.
            store_name = None
            store_type = None

        rows.append([name.text, book_quality, book_price, book_deliverfee,
                     store_name, store_type])
    return rows


# Crawl the listing pages of every distinct book and gather all rows.
book_storage_url_list = list(book_df['book_storage_url'].dropna().unique())
storage_rows = []
for storage_url in tqdm(book_storage_url_list):
    for page in range(1, MAX_PAGES + 1):
        page_url = storage_url + '&page=' + str(page)
        driver.get(page_url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        try:
            storage_rows.extend(_parse_listing_page(soup))
        except IndexError as exc:
            # Best effort: skip a page whose markup does not line up, but
            # say so instead of silently swallowing every exception.
            print(f'skipped {page_url}: {exc!r}')

# Passing the column names here keeps this working when no rows were found.
book_storage_df = pd.DataFrame(storage_rows, columns=STORAGE_COLUMNS)

# Convert 'book_price' from scraped text to int: keep the part before '원',
# remove every thousands separator, then cast. Removing all commas (rather
# than joining only the first two comma-separated groups) keeps prices with
# more than one comma intact, and working on the whole column avoids
# assigning into filtered copies. Rows keep their scraped order.
price_text = book_storage_df['book_price'].astype(str).str.split('원').str[0]
book_storage_df['book_price'] = (
    price_text.str.replace(',', '', regex=False).str.strip().astype(int)
)
book_storage_df.reset_index(inplace=True, drop=True)



In [None]:
# book_storage_df.to_csv('book_sales_df.csv', index=False)
# book_storage_df=pd.read_csv('book_sales_df.csv')

# 활용

In [None]:
# Number of listings priced at 5000 won or less per store; show the 20
# stores with the most such listings.
tmp = (
    book_storage_df.loc[book_storage_df['book_price'].le(5000)]
    .groupby('store_name')[['book_name']]
    .count()
    .reset_index()
)
tmp.sort_values(by='book_name', ascending=False).head(20)

In [None]:
# The 30 cheapest listings sold by the store named '중고매장신림점'.
(
    book_storage_df.loc[book_storage_df['store_name'].eq('중고매장신림점')]
    .sort_values(by='book_price')
    .head(30)
)