### Document Classification

1. crawling (NAVER news)
2. save dataframe
3. classification
4. result report
5. model save

##### 1. Crawling

In [1]:
import requests
import pandas as pd
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver

In [2]:
sleep_time = 0.5

In [3]:
# Naver news section codes accepted as `category` (the `sid1` query value):
#   100 : politics
#   101 : economy
#   102 : society
#   103 : life / culture
#   104 : world
#   105 : IT / science

def make_url(category, date, page):
    """Build the Naver news section-list URL for one category, date and page.

    Args:
        category: Section code (see the table above); converted with ``str``.
        date: Date string such as ``"2016-04-14"``; inserted verbatim.
        page: Page number within that day's listing; converted with ``str``.

    Returns:
        The URL string. The date and page live in the ``#`` fragment and the
        date is followed by a literal `` 00:00:00`` (with a space).
    """
    pieces = [
        "http://news.naver.com/main/main.nhn?mode=LSD&mid=shm&sid1=",
        str(category),
        "#&date=",
        date,
        " 00:00:00&page=",
        str(page),
    ]
    return "".join(pieces)

# function test code
# make_url(100, "2016-04-14", 1)

In [4]:
def last_page(url):
    """Count the listing pages for a section URL by walking its pagination bar.

    Opens ``url`` in a headless PhantomJS browser, counts the elements that
    match ``#paging ._paging`` and, while more pagination groups appear,
    clicks forward and keeps adding to the count.

    Args:
        url: Section-list URL, e.g. the output of ``make_url(category, date, 1)``.

    Returns:
        int: The accumulated page count (0 when no pagination element matches).
    """
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)

        # Give the page's JavaScript time to render the pagination bar.
        time.sleep(sleep_time)
        pages = driver.find_elements_by_css_selector("#paging ._paging")
        total_page = len(pages)

        if len(pages) > 9:
            # NOTE(review): index 9 is presumably the "next" link of the first
            # pagination group - confirm against the page markup.
            pages[9].click()
            time.sleep(sleep_time)
            pages = driver.find_elements_by_css_selector("#paging ._paging")
            total_page += len(pages) - 1

            while len(pages) > 10:
                # NOTE(review): later groups presumably carry an extra leading
                # "previous" link, which would shift "next" to index 10 - confirm.
                pages[10].click()
                time.sleep(sleep_time)
                pages = driver.find_elements_by_css_selector("#paging ._paging")

                if len(pages) > 10:
                    total_page += len(pages) - 1
                else:
                    total_page += len(pages)

        return total_page
    finally:
        # quit() ends the whole WebDriver session and its PhantomJS process,
        # even when a lookup or click above raises; close() only closed the
        # window and was skipped entirely on errors.
        driver.quit()

# function test code
# url = "http://news.naver.com/main/main.nhn?mode=LSD&mid=shm&sid1=100#&date=2016-04-14 00:00:00&page=1"
# last_page(url)

In [5]:
def get_content(url):
    """Fetch one article page and extract its body text and reaction counts.

    Args:
        url: Article URL (a ``read.nhn`` link taken from a section listing).

    Returns:
        tuple: ``(comment, likeit, content)`` where
            * ``comment`` is the text of the first
              ``#articleTitleCommentCount .lo_txt`` element, or the int ``0``
              when no such element exists;
            * ``likeit`` is the text of the first ``.u_likeit_module .u_cnt``
              element, or the int ``0`` when no such element exists;
            * ``content`` is the text of ``#articleBodyContents``.

    Raises:
        Whatever Selenium raises when ``#articleBodyContents`` is missing or
        the page cannot be loaded; the browser is shut down in either case.
    """
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        # Give the page's JavaScript time to fill in the counters.
        time.sleep(sleep_time)
        content = driver.find_element_by_css_selector("#articleBodyContents").text

        comment = 0
        comment_element = driver.find_elements_by_css_selector("#articleTitleCommentCount .lo_txt")
        if comment_element:
            comment = comment_element[0].text

        likeit = 0
        likeit_element = driver.find_elements_by_css_selector(".u_likeit_module .u_cnt")
        if likeit_element:
            likeit = likeit_element[0].text

        return comment, likeit, content
    finally:
        # This function runs once per article, so a leaked PhantomJS process
        # per call adds up fast; quit() terminates it on success and on error.
        driver.quit()
    
# function test code
# url = "http://news.naver.com/main/read.nhn?mode=LSD&mid=shm&sid1=100&oid=032&aid=0002691860"
# comment, likeit, content = get_content(url)    
# comment, likeit, content

In [6]:
def article_url_list(url):
    """Scrape one section-list page and every article it links to.

    The listing is read in two phases: first the link, title and newspaper
    name of every ``<li>`` are copied out of the list page and that browser is
    shut down; only then is each article fetched with ``get_content``. Holding
    live list-page elements while other pages load risks
    ``StaleElementReferenceException`` if the list page re-renders meanwhile.

    Args:
        url: Section-list URL as produced by ``make_url``. Its ``date=`` and
            ``sid1=`` parts are parsed back out to fill the ``date`` and
            ``category`` columns.

    Returns:
        pandas.DataFrame: One row per listed article with columns ``newsid``,
        ``newspaper``, ``title``, ``link``, ``comment``, ``likeit``,
        ``content``, ``date`` and ``category``.
    """
    columns = ["newsid", "newspaper", "title", "link", "comment", "likeit", "content"]

    # Phase 1: snapshot plain strings from the list page, then release it.
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        time.sleep(sleep_time)
        articles = driver.find_elements_by_css_selector("#section_body.section_body ul li")

        listing = []
        for article in articles:
            anchor = article.find_element_by_css_selector("a")
            listing.append((
                anchor.get_attribute("href"),
                anchor.get_attribute("title"),
                article.find_element_by_css_selector("span").text,
            ))
    finally:
        driver.quit()

    # Phase 2: visit each article; no list-page element is touched any more.
    records = []
    for link, title, newspaper in listing:
        comment, likeit, content = get_content(link)
        records.append({
            "newsid": link.split("aid=")[1],
            "newspaper": newspaper,
            "title": title,
            "link": link,
            "comment": comment,
            "likeit": likeit,
            "content": content,
        })

    # Build the frame once instead of growing it row by row.
    article_df = pd.DataFrame(records, columns=columns)

    article_df["date"] = url.split("date=")[1].split(" ")[0]
    article_df["category"] = url.split("sid1=")[1].split("#")[0]

    return article_df
    
# function test code
# url = "http://news.naver.com/main/main.nhn?mode=LSD&mid=shm&sid1=100#&date=2016-04-14 00:00:00&page=1"
# article_df1 = article_url_list(url)

# url = "http://news.naver.com/main/main.nhn?mode=LSD&mid=shm&sid1=100#&date=2016-04-14 00:00:00&page=2"
# article_df2 = article_url_list(url)

# result = pd.concat([article_df1,article_df2]).reset_index(drop=True)
# result

In [7]:
def date_category_df(category, date):
    """Collect every article of one category on one date.

    Determines the number of listing pages with ``last_page`` and scrapes
    each page with ``article_url_list``, printing ``<last page> <page>`` as
    progress after every page.

    Args:
        category: Naver section code passed through to ``make_url``.
        date: Date string such as ``"2016-04-14"``.

    Returns:
        pandas.DataFrame: All scraped rows with columns ``newsid``,
        ``newspaper``, ``title``, ``link``, ``comment``, ``likeit``,
        ``content``, ``date`` and ``category``. Row indices are not reset, so
        they repeat per page. Empty (but with these columns) when
        ``last_page`` reports 0 pages.
    """
    columns = ["newsid", "newspaper", "title", "link", "comment", "likeit", "content", "date", "category"]

    url = make_url(category, date, 1)
    last_page_number = last_page(url)

    # Seed with an empty frame so the column order is fixed and pd.concat
    # still has something to work with when there are zero pages.
    frames = [pd.DataFrame(columns=columns)]

    for page in range(1, last_page_number + 1):
        link = make_url(category, date, page)
        frames.append(article_url_list(link))
        print(last_page_number, page)

    # One concat at the end avoids re-copying all earlier rows on every page.
    return pd.concat(frames)

# result = date_category_df(100, "2016-04-14")
# result = result.reset_index(drop=True)
# len(result)

In [8]:
def main(category, start_date, days):
    """Crawl one category for days 1..``days`` of a given month.

    For each day the date string is printed, the day is scraped with
    ``date_category_df``, and ``<running total rows> <rows this day>`` is
    printed.

    Args:
        category: Naver section code passed through to ``date_category_df``.
        start_date: Year-month prefix such as ``"2016-01"``.
        days: Number of days to crawl, starting from day 1. The caller is
            responsible for not exceeding the length of the month.

    Returns:
        pandas.DataFrame: All rows of all days with columns ``newsid``,
        ``newspaper``, ``title``, ``link``, ``comment``, ``likeit``,
        ``content``, ``date`` and ``category``. Row indices are not reset.
    """
    columns = ["newsid", "newspaper", "title", "link", "comment", "likeit", "content", "date", "category"]

    # Seed with an empty frame so days=0 still returns the expected columns.
    frames = [pd.DataFrame(columns=columns)]
    total_rows = 0

    for day in range(1, days + 1):
        # Zero-pad the day without rebinding the loop variable to a string.
        date = "{}-{:02d}".format(start_date, day)
        print(date)
        tmp_df = date_category_df(category, date)
        frames.append(tmp_df)
        total_rows += len(tmp_df)
        print(total_rows, len(tmp_df))

    # One concat at the end avoids re-copying all earlier rows every day.
    return pd.concat(frames)
    
# Crawl the politics section (100) for January 2016 and persist it as CSV.
result = main(100, "2016-01", 31)
result = result.reset_index(drop=True)
# `result_df` only exists inside main(); the frame to save is `result`.
result.to_csv("./100_2016-01.csv", index=False)

2016-01-01
9 1
9 2
9 3
9 4
9 5
9 6
9 7
9 8
9 9
225 225
2016-01-02
8 1
8 2
8 3
8 4
8 5
8 6
8 7
8 8
425 200
2016-01-03
9 1
9 2
9 3
9 4
9 5
9 6
9 7
9 8
9 9
650 225
2016-01-04
18 1
18 2
18 3
18 4
18 5
18 6
18 7
18 8
18 9
18 10
18 11
18 12
18 13
18 14
18 15
18 16
18 17
18 18
1100 450
2016-01-05
23 1
23 2
23 3
23 4
23 5
23 6
23 7
23 8
23 9
23 10
23 11
23 12
23 13
23 14
23 15
23 16
23 17
23 18
23 19
23 20
23 21
23 22
23 23
1673 573
2016-01-06
31 1
31 2
31 3
31 4
31 5
31 6
31 7
31 8
31 9
31 10
31 11
31 12
31 13
31 14
31 15
31 16
31 17
31 18
31 19
31 20
31 21
31 22
31 23
31 24
31 25
31 26
31 27
31 28
31 29
31 30
31 31
2446 773
2016-01-07
29 1
29 2
29 3
29 4
29 5
29 6
29 7
29 8
29 9
29 10
29 11
29 12
29 13
29 14
29 15
29 16
29 17
29 18
29 19
29 20
29 21


StaleElementReferenceException: Message: {"errorMessage":"Element is no longer attached to the DOM","request":{"headers":{"Accept":"application/json","Accept-Encoding":"identity","Connection":"close","Content-Type":"application/json;charset=UTF-8","Host":"127.0.0.1:61023","User-Agent":"Python-urllib/3.5"},"httpVersion":"1.1","method":"GET","url":"/text","urlParsed":{"anchor":"","query":"","file":"text","directory":"/","path":"/text","relative":"/text","port":"","host":"","password":"","user":"","userInfo":"","authority":"","protocol":"","source":"/text","queryKey":{},"chunks":["text"]},"urlOriginal":"/session/8d6762a0-3fcc-11e6-a851-73deaf05b803/element/:wdc:1467405847660/text"}}
Screenshot: available via screen


In [None]:
# result_df = pd.DataFrame(columns=["newsid", "newspaper", "title", "link", "comment", "likeit", "content", "date", "category"])
# result_df.to_csv("./test.csv", index=False)