In [13]:
import time
import re
import json
import math
import logging

import boto3
from boto3.dynamodb.conditions import Key, Attr

import requests
from bs4 import BeautifulSoup

In [91]:
base_url = 'https://search.shopping.naver.com/search/category.nhn'
headers = {
    "accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "accept-encoding":"gzip, deflate, br",
    "accept-language":"ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "user-agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
params = {
    'brand':0,
    'cat_id':0,
    "productSet":"model",
    "pagingIndex":1,
    'pagingSize':80,
    "viewType":'list',
    'sort':'date',
    'frm':'NVSHBRD'
}
method = "get"
logger = logging.getLogger("NvmidCrawler")

In [92]:
def requests_text(params):
    try:
        res = requests.request(method,
                               base_url,
                               params=params,
                               headers=headers,
                               timeout=5)
        res_text = res.text
        res_url = res.url
    except requests.ConnectionError as e:
        logger.error(e)
        res_text = None
        res_url = None
    except requests.Timeout as e:
        logger.error(e)
        res_text = None
        res_url = None
    except requests.RequestException as e:
        logger.error(e)
        res_text = None
        res_url = None
    finally:
        return res_text, res_url    

In [93]:
def count_pages(res_text,res_url):
    # 해당 Brand & Cat에 해당하는 검색 결과의 페이지 수를 가져오는 메소드
    bsObj = BeautifulSoup(res_text,"html.parser")
    try:
        count_expr = bsObj.find("ul",{'class':'snb_list'})\
                        .find("a",{'class':'_productSet_total'})\
                        .find(text=re.compile("[\d,]+"))
    except AttributeError as e:
        logger.error("count_expr is missing ...url : {}".format(res_url))
        count_expr = "0"
    total_count = int(re.sub("\D","",count_expr))
    paging_count = math.ceil(total_count / 80)
    return paging_count

In [94]:
def parse_item_context(res_text,res_url,brand_id,brand_title):
    # item 창 파싱하기
    bsObj = BeautifulSoup(res_text,"html.parser")
    row = {"brand_id":brand_id,"brand":brand_title}
    rows = []
    for li in bsObj.find_all("li",{"class":"_itemSection"}):
        try:
            row["nv_mid"] = li.attrs['data-nv-mid']
        except AttributeError as e:
            logger.error("nvmid is missing... url : {}".format(res_url))
            continue
        try:
            info = li.find('div',{'class':"info"})
            if info is None or not info.text:
                raise AttributeError
        except AttributeError as e:
            logger.error("info is missing... url : {}".format(res_url))
            continue
        try:
            row["url"] = li.a.attrs['href']
        except AttributeError as e:
            row['url'] = ""
            logger.warning("url is missing... url : {}".format(res_url))
        try:
            row["img_url"] = li.img.attrs['data-original']
        except AttributeError as e:
            row['img_url'] = ""
            logger.warning("image url is missing... url : {}".format(res_url))
        try:
            row["item_title"] = info.find('a',{'class':'tit'}).attrs['title']
        except AttributeError as e:
            row["item_title"] = ""
            logger.warning("item title is missing... url : {}".format(res_url))
        try:
            span_price = info.find('span',{"class":"price"})
            if span_price.text.strip() == "판매중단":
                row['price'] = "-1"
            else:
                price_expr = span_price.em.text.strip()
                row["price"] = re.sub("\D","",price_expr)
                if int(row['price']) > 1e8:
                    raise AttributeError
        except AttributeError as e:
            row["price"] = "-1"
            logger.warning("item price is missing... url : {}".format(res_url))
        try:
            cat_expr = info.find('span',{'class':'depth'}).text.strip()
            idx = 1
            for expr in re.sub("\n *","",cat_expr).split(">"):
                row['{}th_category'.format(idx)] = expr.strip()
                idx += 1
        except AttributeError as e:
            for idx in range(1,5):
                row['{}th_category'.format(idx)] = ""
            logger.warning("category is missing... url : {}".format(res_url))

        try:
            row["item_spec"] = info.find('span',{'class':'detail'}).text.strip()
        except AttributeError as e:
            row["item_spec"] = ""
            logger.warning("item spec is missing... url : {}".format(res_url))
        try:
            date_expr =info.find('span',{'class':'date'}).text.strip()
            row["reg_date"] = re.search("\d+.\d+.",date_expr).group(0)
        except AttributeError as e:
            row["reg_date"] = datetime.strftime(datetime.now(),"%Y.%m.")
            logger.warning("reg date is missing... url : {}".format(res_url))
        rows.append(row.copy())
    return rows

In [95]:
# Logging 설정
logger = logging.getLogger(__name__) # "__name__"를 하면, root log는 제외시키고, 이 모듈 내에서 발생한 로그만 포함시킴. 없으면 requests 모듈에서 생긴 로그, boto3에서 생긴 로그 등이 포함되어 버림 그래서 지저분해짐
logger.setLevel(logging.INFO)
formatter = logging.Formatter('{"method" : "crawling_nvmid", "time" : "%(asctime)s", "level" : "%(levelname)s", "message" : "%(message)s"}')
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)

In [96]:
# categorybrands에서 message 가져오기
QueueName = "categorybrands"
sqs = boto3.resource('sqs')
queue = sqs.get_queue_by_name(QueueName=QueueName)

messages = queue.receive_messages(AttributeNames=["All"],
                                MessageAttributeNames=['Page'],
                                MaxNumberOfMessages=1,
                                WaitTimeSeconds=3)

msg = messages[0]
data = json.loads(msg.body)
msg.delete()

{'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
   'content-length': '215',
   'content-type': 'text/xml',
   'date': 'Fri, 09 Mar 2018 05:28:00 GMT',
   'server': 'Server',
   'x-amzn-requestid': '5ab8b712-0b5c-5e47-8453-e40e16fcf318'},
  'HTTPStatusCode': 200,
  'RequestId': '5ab8b712-0b5c-5e47-8453-e40e16fcf318',
  'RetryAttempts': 0}}

In [97]:
items = []
params['brand'] = data["brand_id"]
params['cat_id'] = data["category_id"]
params['pagingIndex'] = 1

res_text, res_url = requests_text(params)
item = parse_item_context(res_text,res_url,data['brand_id'],data['brand_title'])
items.append(item)
paging_count = count_pages(res_text,res_url)
logger.info("start to search brand[{}]|category[{}] ... nums of paging : {}"\
                 .format(data['brand_id'],data['category_id'],paging_count))
start = time.time()
for idx in range(2,paging_count):
    params['pagingIndex'] = idx

    res_text, res_url = requests_text(params)
    item = parse_item_context(res_text,res_url,data['brand_id'],data['brand_title'])
    items.append(item)
    if res_text is None:
        continue
    end = time.time()
    if (end-start)-delay > 0:
        time.sleep(end-start-delay)
    start = time.time()
logger.info("end to run")

{"method" : "crawling_nvmid", "time" : "2018-03-09 05:28:41,300", "level" : "INFO", "message" : "start to search brand[205696]|category[50001238] ... nums of paging : 8"}
{"method" : "crawling_nvmid", "time" : "2018-03-09 05:28:41,300", "level" : "INFO", "message" : "start to search brand[205696]|category[50001238] ... nums of paging : 8"}
{"method" : "crawling_nvmid", "time" : "2018-03-09 05:28:43,544", "level" : "INFO", "message" : "end to run"}
{"method" : "crawling_nvmid", "time" : "2018-03-09 05:28:43,544", "level" : "INFO", "message" : "end to run"}


In [None]:
items[0]

### 1. dynamoDB 생성하고, 데이터를 추가하기

In [105]:
dynamodb = boto3.resource("dynamodb",region_name="ap-northeast-2")

In [107]:
table = dynamodb.create_table(
    TableName="nvmids",
    KeySchema=[
        {
            "AttributeName" : "nvmid",
            "KeyType" : "HASH"
        },
        {
            "AttributeName" : "category_id",
            "KeyType" : "RANGE"
        },
    ],
    AttributeDefinitions=[
        {
            "AttributeName":"nvmid",
            "AttributeType":"N"
        },
        {
            "AttributeName":"category_id",
            "AttributeType":"N"
        },
    ],
    ProvisionedThroughput={
        "ReadCapacityUnits" : 20,
        "WriteCapacityUnits" : 20,
    }
)

ResourceInUseException: An error occurred (ResourceInUseException) when calling the CreateTable operation: Table already exists: nvmids

In [115]:
table = dynamodb.Table("nvmids")