### Import Packages

In [7]:
import requests
import pandas as pd
import time
import calendar

from bs4 import BeautifulSoup

### Timer Decorator

In [8]:
def timer(fn):
    """Decorator that prints how long each call to ``fn`` takes, in milliseconds.

    The wrapped function's return value is passed through unchanged, and
    ``functools.wraps`` keeps the wrapped function's ``__name__`` and docstring.

    Args:
        fn: the callable to time.

    Returns:
        A wrapper with the same call signature as ``fn``.
    """
    import functools  # local import so this cell does not depend on an edit to the import cell

    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        # perf_counter is intended for measuring intervals; time.time() can jump
        start_time = time.perf_counter()
        result = fn(*args, **kwargs)
        # the printed unit is "ms", so convert the elapsed seconds to milliseconds
        elapsed_ms = (time.perf_counter() - start_time) * 1000
        print("{fn_name} : {time} ms".format(fn_name=fn.__name__, time=elapsed_ms))
        return result
    return wrapper

### Category Dictionary

In [9]:
# Maps a news section id (the value used as `sid1` in listing URLs, as a string)
# to the `componentId` that the mainNews.nhn endpoint expects for that section.
category_dict = {
    "100":950203, # politics
    "101":949986, # economy
    "102":949987, # society
    "103":949988, # life / culture
    "104":949990, # world
    "105":949984, # IT / science
}

### Crawling Last Page

In [10]:
# @timer
def last_page(category, date):
    """Return the last listing-page number for one category on one day.

    Asks the ``mainNews.nhn`` endpoint for page 100 and reads the page number
    reported under ``pagerInfo`` in the JSON response.
    NOTE(review): this presumably relies on the server clamping an out-of-range
    page to the real last page — confirm against the live endpoint.

    Args:
        category: key of ``category_dict``, as int or str (e.g. 100 or "100").
        date: day to query, formatted "YYYY-MM-DD".

    Returns:
        int: the ``pagerInfo["page"]`` value of the response.

    Raises:
        KeyError: if ``category`` is not a key of ``category_dict``.
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within the timeout.
    """
    component_id = category_dict[str(category)]
    response = requests.get(
        "http://news.naver.com/main/mainNews.nhn",
        # let requests encode the query instead of concatenating a raw space into the URL
        params={
            "componentId": component_id,
            "date": date + " 00:00:00",
            "page": 100,
        },
        timeout=10,  # never let a stalled connection hang the whole crawl
    )
    # fail here with a clear HTTP error rather than later with a JSON decode error
    response.raise_for_status()
    # normalise to int: callers both cast this value and add it numerically
    return int(response.json()["pagerInfo"]["page"])
    
# last_page(100, "2016-06-10")

### Crawling Content, Comment, LikeIt 

In [11]:
# using json
# @timer
def get_likeit(aid, oid):
    """Return the like count of one article as a string of digits.

    Calls the ``likeItContent.jsonp`` endpoint for content id
    ``ne_<oid>_<aid>`` and extracts the number that follows ``likeItCount":``
    in the JSONP response text.

    Args:
        aid: article id taken from the article URL.
        oid: publisher (office) id taken from the article URL.

    Returns:
        str: the digits of the like count (kept as a string, as before).

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not answer within the timeout.
        ValueError: if the response contains no ``likeItCount`` field.
    """
    import re  # local import so this cell does not depend on an edit to the import cell

    response = requests.get(
        "http://news.like.naver.com/likeIt/likeItContent.jsonp",
        params={
            "_callback": "window.__jindo2_callback._7105",
            "serviceId": "NEWS",
            "displayId": "NEWS",
            "contentsId": "ne_" + str(oid) + "_" + str(aid),
            "lang": "ko",
            "viewType": "recommend",
        },
        timeout=10,  # never let a stalled connection hang the whole crawl
    )
    response.raise_for_status()

    # Match only the digits after the key, so the result is clean even when the
    # count is followed by "}" or whitespace instead of a comma.
    match = re.search(r'likeItCount"\s*:\s*(\d+)', response.text)
    if match is None:
        raise ValueError(
            "likeItCount not found in like response for oid={} aid={}".format(oid, aid)
        )
    return match.group(1)
    
# using bs4
# @timer
def get_content(path):
    """Fetch one article page and return ``(comment, likeit, content)``.

    Args:
        path: full article URL; its query string must carry ``aid`` and ``oid``.

    Returns:
        tuple: ``(comment, likeit, content)`` where
            comment is the text of ``#articleTitleCommentCount .lo_txt``,
            likeit is the value returned by ``get_likeit(aid, oid)``,
            content is the text of ``#articleBodyContents`` with newlines,
            carriage returns and tabs removed.
        When the page lacks the comment-count element or the article body,
        the placeholder ``(0, 0, "-")`` is returned instead.

    Raises:
        KeyError: if ``path`` has no ``aid`` or ``oid`` query parameter.
        requests.Timeout: if the server does not answer within the timeout.
    """
    from urllib.parse import parse_qs, urlparse  # local import keeps this cell self-contained

    response = requests.get(path, timeout=10)
    dom = BeautifulSoup(response.content, "html.parser")

    # query each selector once and reuse the node
    comment_node = dom.select_one("#articleTitleCommentCount .lo_txt")
    body_node = dom.select_one("#articleBodyContents")
    if comment_node is None or body_node is None:
        # not a regular article page: same placeholder the crawler already used
        return 0, 0, "-"

    comment = comment_node.text
    content = body_node.text.replace("\n", "").replace("\r", "").replace("\t", "")

    # Parse the query string properly: splitting on "aid=" kept any parameters
    # that followed it, whereas oid was already being trimmed at "&".
    query = parse_qs(urlparse(path).query)
    aid = query["aid"][0]
    oid = query["oid"][0]
    likeit = get_likeit(aid, oid)

    return comment, likeit, content

# url = "http://news.naver.com/main/read.nhn?mode=LSD&mid=shm&sid1=100&oid=003&aid=0007327243"
# content_data = get_content(url)
# content_data[0], content_data[1], len(content_data[2])

### Crawling 1 category, 1 day, 1 page

In [12]:
# @timer
def one_page_df(category, date, page):
    """Crawl one listing page and return its articles as a DataFrame.

    Takes about 5-6 seconds per page (author's estimate), because every listed
    article is fetched individually through ``get_content``.

    Args:
        category: section id such as 105 (int, or a numeric string).
        date: day formatted "YYYY-MM-DD"; stored in the ``date`` column.
        page: listing page number.

    Returns:
        pandas.DataFrame with columns newsid, newspaper, title, link, comment,
        likeit, content, date, category (category is stored as the section id
        minus 100, as a string). One row per listed article that has a link.
    """
    # NOTE(review): everything after "#" is a URL fragment, which HTTP clients do
    # not send to the server, so `date` and `page` most likely have no effect on
    # the response. Left unchanged because the correct listing endpoint and its
    # response format are not visible here — confirm and fix.
    url = "http://news.naver.com/main/main.nhn?mode=LSD&mid=shm&sid1=" + str(category) + "#&date=" + date + " 00:00:00&page=" + str(page)
    response = requests.get(url, timeout=10)  # never let a stalled connection hang the crawl
    dom = BeautifulSoup(response.content, "html.parser")

    columns = ["newsid", "newspaper", "title", "link", "comment", "likeit", "content", "date", "category"]
    # int() lets string categories work here as they do in the other helpers
    category_code = str(int(category) - 100)

    # Collect plain dicts and build the frame once; appending to a DataFrame
    # row by row re-allocates it on every insert.
    records = []
    for article in dom.select("#section_body li"):
        anchor = article.select_one("a")
        if anchor is None or not anchor.get("href"):
            # list item without an article link: nothing to fetch
            continue

        link = anchor.get("href")
        comment, likeit, content = get_content(link)

        records.append({
            "newsid": link.split("aid=")[1],
            "newspaper": article.select_one(".writing").text,
            "title": article.select_one("strong").text,
            "link": link,
            "comment": comment,
            "likeit": likeit,
            "content": content,
            "date": date,
            "category": category_code,
        })

    # explicit columns keep the schema and column order even when no rows were found
    return pd.DataFrame(records, columns=columns)

# df = one_page_df(105, "2016-01-01", 1)
# len(df)

### 1 category, 1 day, all pages

In [16]:
# @timer
def one_day_df(category, date):
    """Gather every listing page of one category for one day into a single frame.

    Runs for roughly 60 seconds per 10 pages (author's estimate). The page
    count comes from ``last_page``; each page is crawled with ``one_page_df``
    and the results are stacked with a fresh 0..n-1 index.
    """
    page_count = int(last_page(category, date))

    frames = [
        one_page_df(category, date, page_no)
        for page_no in range(1, page_count + 1)
    ]

    combined = pd.concat(frames)
    return combined.reset_index(drop=True)

# day_df = one_day_df(100, "2016-01-01")
# len(day_df)

250

In [20]:
# Crawl one day of section 104 ("world" in category_dict) and save it as
# ./news_data/104.csv, then show the number of rows collected.
category = 104
day_df = one_day_df(category, "2016-01-01")
# NOTE: to_csv does not create missing folders; ./news_data must already exist.
day_df.to_csv("./news_data/" + str(category) + ".csv", index=False, encoding="utf-8")
len(day_df)

200

In [21]:
# Display the frame crawled in the previous cell.
day_df

Unnamed: 0,newsid,newspaper,title,link,comment,likeit,content,date,category
0,0008524070,연합뉴스,"IS, 방글라데시 추가테러 위협…방글라인 조직원 등장",http://news.naver.com/main/read.nhn?mode=LSD&m...,0,0,방글라데시 추가 테러를 위협하는 IS 조직원 (테헤란=연합뉴스) 강훈상 특파원...,2016-01-01,4
1,0008524066,연합뉴스,"스위스軍 ""빈대와의 전쟁""…외부 행사도 취소",http://news.naver.com/main/read.nhn?mode=LSD&m...,0,0,미국 디트로이트에서는 빈대를 잡으려던 여성이 실수로 화재를 내는 사고도 있었다 [A...,2016-01-01,4
2,0008524034,연합뉴스,"美FBI 국장 ""힐러리 극히 부주의""…기소 면해도 후유증 예고(종…",http://news.naver.com/main/read.nhn?mode=LSD&m...,1,0,"불기소 권고했으나 국가안보 손상 조목조목 지적NYT ""판단력·능력에 대한 기소""…폴...",2016-01-01,4
3,0008523984,연합뉴스,"""아빠가 내 대마 다 태웠어요""…경찰에 신고한 얼빠진 호주 남성",http://news.naver.com/main/read.nhn?mode=LSD&m...,6,3,대마 [AFP=연합뉴스 자료사진] (시드니AFP=연합뉴스) 한 호주 남성이 자...,2016-01-01,4
4,0008523970,연합뉴스,브렉쇼크 재점화…英부동산·伊은행 불안에 금융시장 '요동'(종합),http://news.naver.com/main/read.nhn?mode=LSD&m...,1,3,런던 시내[EPA=연합뉴스 자료사진]英 부동산펀드 '펀드런'에 줄줄이 환매중단…伊 ...,2016-01-01,4
5,0008523968,연합뉴스,프랑스인 목숨만 중요한가…무관심에 우는 비유럽 테러 희생자,http://news.naver.com/main/read.nhn?mode=LSD&m...,11,1,바그다드 내 희생자 추모인파[AP=연합뉴스]파리·브뤼셀에 울던 지구촌 터키·방글라·...,2016-01-01,4
6,0002152649,뉴스1,"모디 인도 총리, 아프리카 4개국 순방…中 견제 염두",http://news.naver.com/main/read.nhn?mode=LSD&m...,1,0,나렌드라 모디 인도 총리.© AFP=뉴스1(서울=뉴스1) 최종일 기자 = 나렌드라 ...,2016-01-01,4
7,0008523875,연합뉴스,"IS ""시리아서 미군 드론 격추"" vs 미군 ""추락 후 파괴""",http://news.naver.com/main/read.nhn?mode=LSD&m...,7,1,"IS ""시리아에서 미군 드론 격추"" (이스탄불=연합뉴스) 하채림 특파원 = IS...",2016-01-01,4
8,0008523858,연합뉴스,"""대러 제재 동참국 작년 중반까지 수출 손실 70조 원 넘어""",http://news.naver.com/main/read.nhn?mode=LSD&m...,2,0,佛연구소 자료…서방 제재에 러시아도 맞제재 지속 (모스크바=연합뉴스) 유철...,2016-01-01,4
9,0008523854,연합뉴스,"美 무급인턴제는 "" '특권의 곱셈' 통해 불평등 심화 결과 낳아""",http://news.naver.com/main/read.nhn?mode=LSD&m...,0,1,"포드재단 회장 ""방학때 돈 벌어야 하는 저소득층 학생엔 경험·인맥 기회 박탈"" ...",2016-01-01,4


In [None]:
# Keep rows 0-24 and rows 50 onward of day_df, i.e. drop rows 25-49.
# NOTE(review): the reason for dropping that slice is not stated here —
# presumably those rows were duplicates; confirm before relying on it.
df = pd.concat([day_df[:25],day_df[50:]])

### 1 category, 1 month, all pages

In [None]:
def check_zero(num):
    """Return ``num`` as text, prefixed with "0" when its integer value is below 10."""
    needs_padding = int(num) < 10
    text = str(num)
    if needs_padding:
        return "0" + text
    return text

@timer
def total_page(category, year, month):
    """Estimate the crawl size of one category over one calendar month.

    Calls ``last_page`` once for every day of the month and sums the results.

    Args:
        category: section id accepted by ``last_page`` (e.g. 105).
        year: four-digit year as int.
        month: month number 1-12 as int.

    Returns:
        dict with two keys:
            "total_page": sum of the last-page numbers of every day,
            "excute_time(min)": estimated crawl time in minutes, assuming
            6 seconds per page.
    """
    seconds_per_page = 6  # rough cost of crawling one listing page

    last_day = calendar.monthrange(year, month)[1]

    # separate local name: the original accumulator shadowed the function name
    page_total = 0
    for day in range(1, last_day + 1):
        date = str(year) + "-" + check_zero(month) + "-" + check_zero(day)
        # coerce like one_day_df does, so a page number delivered as text still sums
        page_total += int(last_page(category, date))

    excute_time = seconds_per_page * page_total / 60

    return { "total_page":page_total, "excute_time(min)":excute_time }
   
@timer
def one_month_df(category, year, month, startday=1):
    """Crawl one category for every day of a month, saving one CSV per day.

    Each day's frame is written to
    ``./news_data/<category>_<year>_<month>_<DD>.csv`` as soon as it has been
    crawled, and the date, row count and file path are printed as progress.

    Args:
        category: section id accepted by ``one_day_df`` (e.g. 105).
        year: four-digit year as int.
        month: month number 1-12 as int.
        startday: first day of the month to crawl (default 1), e.g. to resume
            a run that stopped part-way.

    Returns:
        pandas.DataFrame: all crawled days stacked, with a fresh 0..n-1 index.
    """
    import os  # local import so this cell does not depend on an edit to the import cell

    output_dir = "./news_data"
    # Create the folder up front: without it the first to_csv fails only after a
    # whole day has already been crawled.
    os.makedirs(output_dir, exist_ok=True)

    last_day = calendar.monthrange(year,month)[1]

    df_list = []

    for day in range(startday, last_day + 1):
        date = str(year) + "-" + check_zero(month) + "-" + check_zero(day)
        df = one_day_df(category, date)
        df_list.append(df)
        print(date, len(df))
        # NOTE: the month is not zero-padded in the file name (the day is); kept
        # as-is so names match files written by earlier runs.
        save_path = output_dir + "/" + str(category) + "_" + str(year) + "_" + str(month) + "_" + check_zero(day) + ".csv"
        print(save_path)
        df.to_csv(save_path, index=False, encoding="utf-8")

    return pd.concat(df_list).reset_index(drop=True)

# total_page(105, 2016, 1)
# Crawl all of January 2016 for section 105 ("IT / science" in category_dict)
# and show how many rows were collected.
month_df = one_month_df(105, 2016, 1)
len(month_df)

In [None]:
# Display the frame crawled in the previous cell.
month_df

In [None]:
# Crawl one full month for one section and save the combined result as
# ./news_data/<category>_<year>_<month>.csv.
# `save_path` is read again by the following cell.
category = 105
year = 2016
month = 5

month_df = one_month_df(category, year, month)
save_path = "./news_data/" + str(category) + "_" + str(year) + "_" + str(month) +".csv"
month_df.to_csv(save_path, index=False, encoding="utf-8")

In [None]:
# Read the saved month back to check the file.
# newsid values are zero-padded ids such as "0008524070"; without an explicit
# dtype read_csv parses them as integers and drops the leading zeros.
df = pd.read_csv(save_path, dtype={"newsid": str})
df.tail()

In [None]:
# Sanity check: print the row count and path of each monthly CSV for
# sections 100-105 and months 1-2 of 2016. The files must already exist.
year = 2016
for category in range(100,106):
    for month in range(1,3):
        save_path = "./news_data/" + str(category) + "_" + str(year) + "_" + str(month) +".csv"
        print(len(pd.read_csv(save_path)), save_path)

### 1 category, many months, all pages

In [None]:
def one_year_df(category, year, start_month, end_month):
    """Crawl a run of consecutive months of one year for one category.

    ``one_month_df`` is called for every month from ``start_month`` through
    ``end_month`` (both inclusive); the monthly frames are stacked and given a
    fresh 0..n-1 index.
    """
    monthly_frames = [
        one_month_df(category, year, month_no)
        for month_no in range(start_month, end_month + 1)
    ]

    merged = pd.concat(monthly_frames)
    return merged.reset_index(drop=True)

In [None]:
# year_df = one_year_df(100, 2016, 1, 6)
# year_df.to_csv("./news_data/100_2016.csv", index=False, encoding="utf-8")

# year_df = one_year_df(101, 2016, 1, 6)
# year_df.to_csv("./news_data/101_2016.csv", index=False, encoding="utf-8")

# year_df = one_year_df(102, 2016, 1, 6)
# year_df.to_csv("./news_data/102_2016.csv", index=False, encoding="utf-8")

# year_df = one_year_df(103, 2016, 1, 6)
# year_df.to_csv("./news_data/103_2016.csv", index=False, encoding="utf-8")

# year_df = one_year_df(104, 2016, 1, 6)
# year_df.to_csv("./news_data/104_2016.csv", index=False, encoding="utf-8")

# year_df = one_year_df(105, 2016, 1, 6)
# year_df.to_csv("./news_data/105_2016.csv", index=False, encoding="utf-8")