### Import Packages

In [25]:
import calendar
import functools
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Timer Decorator

In [2]:
def timer(fn):
    """Decorator that prints how long each call to ``fn`` takes, in milliseconds.

    The wrapped function's return value is passed through unchanged. Nothing
    is printed when ``fn`` raises, because the exception propagates before
    the report line is reached.

    Args:
        fn: Callable to time.

    Returns:
        A wrapper with the same call signature and metadata as ``fn``.
    """
    @functools.wraps(fn)  # keep fn.__name__ / __doc__ on the wrapper
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the difference cannot go negative
        # if the system clock is adjusted mid-call.
        start_time = time.perf_counter()
        result = fn(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - start_time) * 1000
        # The label has always said "ms"; the value is now really milliseconds.
        print("{fn_name} : {time} ms".format(fn_name = fn.__name__, time = elapsed_ms))
        return result
    return wrapper

### Category Dictionary

In [3]:
# Maps a news section id (as a string) to the componentId used when
# building the mainNews.nhn request in last_page().
category_dict = {
    "100": 950203,  # politics
    "101": 949986,  # economy
    "102": 949987,  # society
    "103": 949988,  # life / culture
    "104": 949990,  # world
    "105": 949984,  # IT / science
}

### Crawling Last Page

In [4]:
# @timer
def last_page(category, date, timeout=10):
    """Return the last page number of one category's news list for one day.

    Requests page 100 of the listing and reads ``pagerInfo.page`` from the
    JSON reply. Presumably the server clamps an out-of-range page to the
    final one, which is what makes this the "last page" -- confirm against
    the live API.

    Args:
        category: Section id such as ``100`` (int or str); it is looked up
            in ``category_dict`` after ``str()`` conversion.
        date: Day to query, formatted like ``"2016-06-10"``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang a long crawl.

    Returns:
        int: The page number reported by the server.

    Raises:
        KeyError: If ``category`` is not in ``category_dict`` or the reply
            lacks the expected keys.
        requests.RequestException: On timeout, connection failure or an
            HTTP error status.
    """
    component_id = category_dict[str(category)]
    url = (
        "http://news.naver.com/main/mainNews.nhn?componentId=" + str(component_id)
        + "&date=" + date + " 00:00:00&page=100"
    )
    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of a JSON decoding error on an error page.
    response.raise_for_status()
    # Normalise to int: one caller wraps the result in int(), another adds it directly.
    return int(response.json()["pagerInfo"]["page"])
    
# last_page(100, "2016-06-10")

### Crawling Content, Comment, LikeIt 

In [5]:
# using json
# @timer
def get_likeit(aid, oid, timeout=10):
    """Fetch the like count of one article from the like API.

    The endpoint answers with JSONP, so the count is cut out of the raw text:
    everything between the first ``likeItCount":`` marker and the next comma.

    Args:
        aid: Article id, as it appears in the article URL's ``aid`` parameter.
        oid: Publisher id, as it appears in the article URL's ``oid`` parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        str: The like count exactly as it appears in the response text
        (not converted to int).

    Raises:
        IndexError: If the response does not contain a ``likeItCount`` field.
        requests.RequestException: On timeout, connection failure or an
            HTTP error status.
    """
    url = "http://news.like.naver.com/likeIt/likeItContent.jsonp?_callback=window.__jindo2_callback._7105&serviceId=NEWS&displayId=NEWS&contentsId=ne_" + str(oid) + "_" + str(aid) + "&lang=ko&viewType=recommend"
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    pieces = response.text.split('likeItCount":')
    if len(pieces) < 2:
        # Same exception type the bare [1] index used to raise, now with context.
        raise IndexError(
            "likeItCount not found in like API response (oid={}, aid={})".format(oid, aid)
        )
    return pieces[1].split(",")[0]
    
# using bs4
# @timer
def get_content(path, timeout=10):
    """Scrape comment count, like count and body text from one article page.

    Args:
        path: Article URL containing ``oid=`` and ``aid=`` query parameters.
        timeout: Seconds to wait for the article page before giving up.

    Returns:
        tuple: ``(comment, likeit, content)``. On a normal article page
        ``comment`` and ``likeit`` are strings taken from the page / like API
        and ``content`` is the body text with newlines, carriage returns and
        tabs removed. When the page has no comment counter or no article
        body, the placeholder ``(0, 0, "-")`` is returned instead (note the
        counts are ints in that case).

    Raises:
        IndexError: If ``path`` lacks ``aid=`` or ``oid=``.
        requests.RequestException: On timeout or connection failure.
    """
    response = requests.get(path, timeout=timeout)
    dom = BeautifulSoup(response.content, "html.parser")

    # Look each node up once; either one missing means this is not a regular
    # article layout, so fall back to the placeholder row.
    comment_tag = dom.select_one("#articleTitleCommentCount .lo_txt")
    body_tag = dom.select_one("#articleBodyContents")
    if comment_tag is None or body_tag is None:
        return 0, 0, "-"

    comment = comment_tag.text
    content = body_tag.text.replace("\n","").replace("\r","").replace("\t","")
    # Cut both ids at the next "&" so trailing query parameters never leak in.
    aid = path.split("aid=")[1].split("&")[0]
    oid = path.split("oid=")[1].split("&")[0]
    likeit = get_likeit(aid, oid)

    return comment, likeit, content

# url = "http://news.naver.com/main/read.nhn?mode=LSD&mid=shm&sid1=100&oid=003&aid=0007327243"
# content_data = get_content(url)
# content_data[0], content_data[1], len(content_data[2])

### Crawling 1 category, 1 day, 1 page

In [15]:
# @timer
def one_page_df(category, date, page, timeout=10):
    """Crawl one listing page of one category/day into a DataFrame.

    The original author's note puts the run time at about 5-6 seconds per
    page, since every listed article triggers further requests through
    ``get_content``.

    Args:
        category: Section id such as ``100`` (int or numeric str).
        date: Day to crawl, formatted like ``"2016-01-01"``.
        page: Page number within that day.
        timeout: Seconds to wait for the listing page before giving up.

    Returns:
        pandas.DataFrame: One row per article with the columns ``newsid``,
        ``newspaper``, ``title``, ``link``, ``comment``, ``likeit``,
        ``content``, ``date`` and ``category`` (section id minus 100, as a
        string). Empty, but with the same columns, when no article is found.
    """
    columns = ["newsid", "newspaper", "title", "link", "comment", "likeit", "content", "date", "category"]

    # NOTE(review): everything after "#" is a URL fragment, and fragments are
    # not sent to the server, so date/page may never reach it. Left as-is
    # because the correct server-side endpoint cannot be determined here.
    url = "http://news.naver.com/main/main.nhn?mode=LSD&mid=shm&sid1=" + str(category) + "#&date=" + date + " 00:00:00&page=" + str(page)
    response = requests.get(url, timeout=timeout)
    dom = BeautifulSoup(response.content, "html.parser")

    # Collect plain dicts and build the frame once; growing a DataFrame row
    # by row re-allocates on every insert.
    records = []
    for article in dom.select("#section_body li"):
        anchor = article.select_one("a")
        link = anchor.get("href") if anchor is not None else None
        if not link:
            # List items without a link are not articles; skip them.
            continue

        comment, likeit, content = get_content(link)
        records.append({
            "newsid": link.split("aid=")[1],
            "newspaper": article.select_one(".writing").text,
            "title": article.select_one("strong").text,
            "link": link,
            "comment": comment,
            "likeit": likeit,
            "content": content,
            "date": date,
            # int() lets string ids such as "100" work, as they do in last_page().
            "category": str(int(category) - 100),
        })

    return pd.DataFrame(records, columns=columns)

# df = one_page_df(100, "2016-01-01", 1)
# len(df)

### 1 category, 1 day, all pages

In [14]:
# @timer
def one_day_df(category, date):
    """Crawl every listing page of one category for one day.

    The original author's note estimates roughly 60 seconds per 10 pages.

    Args:
        category: Section id passed through to ``last_page`` and ``one_page_df``.
        date: Day to crawl, formatted like ``"2016-01-01"``.

    Returns:
        pandas.DataFrame: All pages of that day concatenated, with a fresh
        0-based index.
    """
    page_total = int(last_page(category, date))

    # Pages are numbered from 1 up to and including the last page.
    page_frames = [
        one_page_df(category, date, page_no)
        for page_no in range(1, page_total + 1)
    ]

    combined = pd.concat(page_frames)
    return combined.reset_index(drop=True)

# day_df = one_day_df(105, "2016-01-01")
# len(day_df)

### 1 category, 1 month, all pages

In [17]:
def check_zero(num):
    """Format ``num`` as a decimal string zero-padded to at least two digits.

    Used to build the month and day parts of ``YYYY-MM-DD`` date strings.
    The value is converted with ``int()`` first, so input that is already
    padded (``"05"``) is not padded a second time.

    Args:
        num: Integer or numeric string, e.g. ``5``, ``"5"`` or ``"05"``.

    Returns:
        str: ``"05"`` for 5, ``"12"`` for 12.

    Raises:
        ValueError: If ``num`` cannot be converted by ``int()``.
    """
    return "{:02d}".format(int(num))

@timer
def total_page(category, year, month):
    """Count the listing pages of one category over a month and estimate crawl time.

    Calls ``last_page`` once per day of the month; the original author's
    note puts the run time of this function at about 6 seconds.

    Args:
        category: Section id passed through to ``last_page``.
        year: Four-digit year as an int.
        month: Month number (1-12) as an int.

    Returns:
        dict: ``"total_page"`` is the summed page count and
        ``"excute_time(min)"`` is the estimated crawl time in minutes.
    """
    # Rough per-page crawl cost in seconds (one_page_df notes about 5-6 s).
    seconds_per_page = 6

    last_day = calendar.monthrange(year, month)[1]
    page_count = 0
    for day in range(1, last_day + 1):
        date = str(year) + "-" + check_zero(month) + "-" + check_zero(day)
        # int() mirrors one_day_df, so a string page number cannot break the sum.
        page_count += int(last_page(category, date))

    estimated_minutes = seconds_per_page * page_count / 60

    return { "total_page":page_count, "excute_time(min)":estimated_minutes }
   
@timer
def one_month_df(category, year, month):
    """Crawl every day of one month for one category.

    Args:
        category: Section id passed through to ``one_day_df``.
        year: Four-digit year as an int.
        month: Month number (1-12) as an int.

    Returns:
        pandas.DataFrame: All days of the month concatenated, with a fresh
        0-based index.
    """
    day_count = calendar.monthrange(year, month)[1]
    # The "YYYY-MM-" part is the same for every day of the month.
    month_prefix = str(year) + "-" + check_zero(month) + "-"

    daily_frames = [
        one_day_df(category, month_prefix + check_zero(day_no))
        for day_no in range(1, day_count + 1)
    ]

    combined = pd.concat(daily_frames)
    return combined.reset_index(drop=True)

# total_page(105, 2016, 1)
# month_df = one_month_df(105, 2016, 1)
# len(month_df)

### 1 category, many months, all pages

In [30]:
def one_year_df(category, year, start_month, end_month):
    """Crawl a run of consecutive months of one year for one category.

    Args:
        category: Section id passed through to ``one_month_df``.
        year: Four-digit year as an int.
        start_month: First month to crawl (inclusive).
        end_month: Last month to crawl (inclusive).

    Returns:
        pandas.DataFrame: All requested months concatenated, with a fresh
        0-based index.
    """
    monthly_frames = [
        one_month_df(category, year, month_no)
        for month_no in range(start_month, end_month + 1)
    ]

    combined = pd.concat(monthly_frames)
    return combined.reset_index(drop=True)

In [None]:
# Crawl January-June 2016 for every category and write one CSV per category.
output_dir = "./news_data"
# Create the folder up front: to_csv does not create missing directories,
# and failing there would throw away hours of crawling.
os.makedirs(output_dir, exist_ok=True)

for category in range(100, 106):  # section ids 100..105, see category_dict
    year_df = one_year_df(category, 2016, 1, 6)
    year_df.to_csv("{}/{}_2016.csv".format(output_dir, category), index=False, encoding="utf-8")