In [1]:
import re
import requests
from datetime import datetime
from bs4 import BeautifulSoup, Tag
from time import gmtime, strftime


def get_soup(url, headers=None):
    """
    Arguments
    ---------
    url : str
        Web page url
    headers : dict
        Headers for requests. If None, use Mozilla/5.0 as default user-agent

    Returns
    -------
    soup : bs4.BeautifulSoup
        Soup format web page
    """

    if headers is None:
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    r = requests.get(url, headers=headers)
    html = r.text
    page = BeautifulSoup(html, 'lxml')
    return page

doublespace_pattern = re.compile('\s+')
lineseparator_pattern = re.compile('\n+')

def normalize_text(text):
    text = text.replace('\t', ' ')
    text = text.replace('\r', ' ')
    text = lineseparator_pattern.sub('\n', text)
    text = doublespace_pattern.sub(' ', text)
    return text.strip()

def get_latest_allnews(section, begin_date, end_date, max_num=10, sleep=1.0):
    """
    Arguments
    ---------
    begin_page : int
        Default is 1
    end_page : int
        Default is 3
    verbose : Boolean
        If True, print current status

    Returns
    -------
    links_all : list of str
        List of urls
    """
    section = str(section)
    for pagenum in range(1, max_num):   
        url = "https://www.defense.gov/Explore/{}/Listing/StartDate/{}/EndDate/{}/?Page={}".format(section, begin_date, end_date, max_num)
        soup = get_soup(url)
        urls = [a['article-url'] for a in soup.find_all('story-card')]
        
        for each_url in urls:
            news_json = parse_page(each_url)
            print(news_json)

In [33]:
import re
from dateutil.parser import parse
def now():
    """
    Returns
    -------
    Current time : str
        eg: 2018-11-22 13:35:23
    """
    return strftime("%Y-%m-%d %H:%M:%S", gmtime())
def parse_page(url):
    """
    Argument
    --------
    url : str
        Web page url

    Returns
    -------
    json_object : dict
        JSON format web page contents
        It consists with
            title : article title
            time : article written time
            content : text with line separator \\n
            url : web page url
            scrap_time : scrapped time
    """

    soup = get_soup(url)
    
    title = soup.find('h1', class_= 'maintitle').text
    title = re.sub('\n', '' , title, 100)
    title = re.sub('\r', '' , title, 100)
    phrases = soup.find('div', class_= 'content content-wrap').find_all('p')
    content = '\n'.join([p.text.strip() for p in phrases])
    content = re.sub('\n', '' , content, 10000)
    content = re.sub('\t', '' , content, 10000)
    content = re.sub('\r', '' , content, 10000)
    content = re.sub('\xa0', '' , content, 1000)
    time = soup.find('span', class_ = 'date')
    date =  parse(time.text[:30]).strftime("%Y-%m-%d")
    
    json_object = {
        'title' : title,
        'date' : date,
        'content' : content,
        'url' : url,
        'source' : "US Department of Defense",
        'scrap_time': now()}

    return json_object

In [36]:
url = 'https://www.defense.gov/News/News-Stories/Article/Article/2997723/defense-intelligence-agency-report-details-space-based-threats-from-competitors/'

In [37]:
parse_page(url)


            April 12, 2022



{'title': '            Defense Intelligence Agency Report Details Space-Based Threats From Competitors        ',
 'date': '2022-04-12',
 'content': 'The Defense Intelligence Agency today released the new unclassified report, "Challenges to Security in Space 2022," which is a follow-up to its similarly titled report in 2019.The new report examines the space and counterspace programs which could pose significant challenges to U.S. or partner interests by China, Russia, North Korea and Iran, said John F. Huth, the DIA defense intelligence officer for space and counterspace, during a briefing today at the Pentagon."This new edition of \'Challenges to Security in Space\' provides an updated, unclassified overview of current threats to U.S. space-based capabilities, particularly from China and Russia, but also to a lesser extent, those emerging from North Korea and Iran," he said. "This edition examines the expansion of space operations and details Earth-focused space services, as well as gr

In [11]:
url = "https://www.defense.gov/News/News-Stories/StartDate/2021-09-19/EndDate/2021-10-02/?Page=3"

In [None]:
https://www.defense.gov/News/News-Stories/StartDate/2021-09-19/EndDate/2021-10-02/?Page=2

In [45]:
soup.find_all('a')

[<a href="#skip-target" id="skip-link">Skip to main content (Press Enter).</a>,
 <a href="https://twitter.com/DeptofDefense" rel="noopener" target="_blank" title="Twitter"><span class="social-icon fa fa-twitter social-link-4"></span></a>,
 <a href="https://www.facebook.com/DeptofDefense" rel="noopener" target="_blank" title="Facebook"><span class="social-icon fa fa-facebook social-link-1"></span></a>,
 <a href="https://www.instagram.com/deptofdefense/" rel="noopener" target="_blank" title="Instagram"><span class="social-icon fa fa-instagram social-link-3"></span></a>,
 <a href="https://www.youtube.com/deptofdefense" rel="noopener" target="_blank" title="YouTube"><span class="social-icon fa fa-youtube social-link-2"></span></a>,
 <a class="logo" href="http://www.defense.gov/" target="">
 <img alt="Logo for U.S. Department of Defense" src="/Portals/1/Images/DOD-Icon-Header.png?ver=5sAfFl2--9znca0j3SrX_g%3d%3d" title="U.S. Department of Defense"/>
 <span>U.S. Department<span>of</span>Defe

In [43]:
soup.find('div', class_='alist-inner alist-more-here')

<div class="alist-inner alist-more-here">
<feature-template :remove-grid="true" :show-all="true" template-mode="story">
<listing-with-preview :has-terms="true" :show-image="true" article-alt="Deputy Defense Secretary Kathleen H. Hicks stands outside with hand over heart." article-id="2790466" article-image-url="https://media.defense.gov/2021/Sep/27/2002862648/825/780/0/210730-D-BM568-1176Y.JPG" article-summary="Blue Star Families is a national, nonprofit network of military families from all ranks and services, including the National Guard and Reserves, dedicated to supporting, connecting and empowering military families." article-title="Hicks Greets Blue Star Families, Veterans, Civilians in Welcome Week" article-url="http://www.defense.gov/News/News-Stories/Article/Article/2790466/hicks-greets-blue-star-families-veterans-civilians-in-welcome-week/" article-url-or-link="http://www.defense.gov/News/News-Stories/Article/Article/2790466/hicks-greets-blue-star-families-veterans-civilians-

In [3]:
soup = get_soup('https://pann.nate.com/talk/c20048?page=6')

In [4]:
sub_url = soup.find_all('listing-with-preview')
urls = [a['article-url'] for a in sub_url]

In [14]:
get_latest_allnews('News', "20210901", "20210801", max_num=10, sleep=1.0)

In [11]:
urls

[]

In [10]:
urls = ["https://pann.nate.com" + a['href'] for a in sub_url]

In [8]:
url = 'https://pann.nate.com/talk/c20048?page=6'

In [9]:
sub_urls = get_soup(url).find_all('td', class_ = 'subject')

In [13]:
def save(json_obj, directory):
    date = json_obj.get('date', '')
    title = json_obj.get('title', '')
    filepath = '{}/{}_{}.json'.format(directory, date, re.sub('[ㄱ-ㅎ]+',"", title[10:50]))
    with open(filepath, 'w', encoding='utf-8') as fp:
        json.dump(json_obj, fp, indent=2, ensure_ascii=False, sort_keys=True, default=str)


In [19]:
urls

['https://pann.nate.com/talk/359731556?page=6',
 'https://pann.nate.com/talk/359634932?page=6',
 'https://pann.nate.com/talk/c20048/channel/222270',
 'https://pann.nate.com/talk/359491583?page=6',
 'https://pann.nate.com/talk/359490114?page=6',
 'https://pann.nate.com/talk/359487998?page=6',
 'https://pann.nate.com/talk/c20048/channel/222145',
 'https://pann.nate.com/talk/359366273?page=6',
 'https://pann.nate.com/talk/c20048/channel/198577',
 'https://pann.nate.com/talk/359327084?page=6',
 'https://pann.nate.com/talk/c20048/channel/85574',
 'https://pann.nate.com/talk/c20048/channel/4806',
 'https://pann.nate.com/talk/c20048/channel/85574',
 'https://pann.nate.com/talk/c20048/channel/4806',
 'https://pann.nate.com/talk/c20048/channel/198577',
 'https://pann.nate.com/talk/c20048/channel/85574',
 'https://pann.nate.com/talk/c20048/channel/210948',
 'https://pann.nate.com/talk/358821354?page=6',
 'https://pann.nate.com/talk/c20048/channel/198577',
 'https://pann.nate.com/talk/c20048/chan

In [17]:
urls = ["https://pann.nate.com" + a.find('a')['href'] for a in sub_urls]

In [24]:
def parse_page(url):
    """
    Argument
    --------
    url : str
        Web page url

    Returns
    -------
    json_object : dict
        JSON format web page contents
        It consists with
            title : article title
            time : article written time
            content : text with line separator \\n
            url : web page url
            scrap_time : scrapped time
    """

    soup = get_soup(url)
    
    title = soup.find('h4').text
    title = re.sub('\n', '' , title, 100)
    title = re.sub('\r', '' , title, 100)
    
    content = soup.find('div', id = 'contentArea').text
    content = re.sub('xa', '' , content, 10000)
    content = re.sub('\t', '' , content, 10000)
    content = re.sub('\r', '' , content, 10000)
    content = re.sub('\xa0ab', '' , content, 1000)
    content = re.sub('\ufeff', '' , content, 1000)

    date =  parse(soup.find('span', class_='date').text).strftime("%Y-%m-%d")

    json_object = {
        'title' : title,
        'date' : date,
        'content' : content,
        'url' : url,
        'source' : "nate"}

    return json_object

In [25]:
from dateutil.parser import parse

In [26]:
parse_page(urls[0])

{'title': '저는 아빠랍니다~^^',
 'date': '2021-05-14',
 'content': '\n저는 이런 자식 없습니다~ \n아빠보고 엄마라고 부르는 자식이 세상 어디에 있을까요?~^^\n\n모두 스팸 조심하세요~\n ',
 'url': 'https://pann.nate.com/talk/359731556?page=6',
 'source': 'nate'}

In [27]:
soup.find('div', id = 'contentArea')

In [28]:
soup.find('div', id = 'contentArea').text

AttributeError: 'NoneType' object has no attribute 'text'

In [111]:
content = soup.find('div', id = 'contentArea').text
content = re.sub('\n', '' , content, 10000)
content = re.sub('\t', '' , content, 10000)
content = re.sub('\r', '' , content, 10000)
content = re.sub('\xa0ab', '' , content, 1000)
content = re.sub('\ufeff', '`' , content, 1000)

In [49]:

def get_latest_allnews(section, max_num=10, sleep=1.0):
    """
    Arguments
    ---------
    begin_page : int
        Default is 1
    end_page : int
        Default is 3
    verbose : Boolean
        If True, print current status

    Returns
    -------
    links_all : list of str
        List of urls
    """
    
    pattern = re.compile('https://pann.nate.com/talk/c20048/channel/')

    def is_matched(url):
        if pattern.match(url):
            return False
        return True
    
    
    section = str(section)
    for pagenum in range(1, max_num):   

        url = "https://pann.nate.com/talk/{}?page={}".format(section, pagenum)
        soup = get_soup(url)
        sub_url = get_soup(url).find_all('td', class_ = 'subject')
        
        urls = ["https://pann.nate.com" + a.find('a')['href'] for a in sub_url]
        urls = [url for url in urls if is_matched(url)]
        
        for each_url in urls:
            print(each_url)
            news_json = parse_page(each_url)
            print(news_json)
            
            # yield
            return news_json
            time.sleep(sleep)


In [53]:

def get_latest_allnews(section, max_num=10, sleep=1.0):
    """
    Arguments
    ---------
    begin_page : int
        Default is 1
    end_page : int
        Default is 3
    verbose : Boolean
        If True, print current status

    Returns
    -------
    links_all : list of str
        List of urls
    """
    
    pattern = re.compile('https://pann.nate.com/talk/c20048/channel/')

    def is_matched(url):
        if pattern.match(url):
            return False
        return True
    
    
    section = str(section)
    for pagenum in range(1, max_num):   

        url = "https://pann.nate.com/talk/{}?page={}".format(section, pagenum)
        soup = get_soup(url)
        sub_urls = get_soup(url).find_all('td', class_ = 'subject')
        
        urls = ["https://pann.nate.com" + a.find('a')['href'] for a in sub_urls]
        urls = [url for url in urls if is_matched(url)]
        
        for each_url in urls:
            print(each_url)
            try:
                news_json = parse_page(each_url)
                print(news_json)
            except:
                print("error2")
                pass
            
            # yield
            return news_json
            time.sleep(sleep)


In [55]:
temp.get('date', '')

'2021-11-22'

In [56]:
temp = get_latest_allnews("c20048", max_num=10, sleep=1.0)

https://pann.nate.com/talk/363740147?page=1
{'title': '이번주별자리운세 보고 가세요 ! 재미로보는 별자리 운세~!', 'date': '2021-11-22', 'content': '\n\xa0\xa0\xa0\xa0\xa0재미로 보는 이번주 별자리 운세에요~!여러분 월요팅하세요!!!출저 : 품격선생 감우품격선생 - 타로,운세,사주,궁합,신점,역학,운세상담,전화상담,꿈해몽,토정비결, (dignitymento.com)\xa0\n ', 'url': 'https://pann.nate.com/talk/363740147?page=1', 'source': 'nate'}


In [119]:
soup.find('div', id = 'contentArea').text

'\n\ufeff오늘 친구 어머니께서 갑자기 폐에 부종이 있고 염증이 생겨 병원에 입원을 하시게 되었는데요의사가 급한 상황이라고, 당장 내일 수술을 해야한다고 했다네요.. ㅠㅠ근데 코로나덕분에 전체적으로 피가 턱없이 부족한 상황이라환자가 직접 피를 구해와야 하는 상황이라고 합니다 ㅠ혹시나\xa0ab형이신분들, 시간 되시면김포우리병원 김애자\xa0님 앞으로\xa0지정헌혈\xa0부탁드려요(+)집에 잠자고 있는 헌혈증도 보내주시면 도움이 될거예요대표전화 031-999-1000\xa0\xa010099 경기도 김포시 감암로 11 김포우리병원 김애자님\ufeff\n '

In [128]:
parse_page(b[0])

{'title': 'ab형이신분들 죄송하지만 지정헌혈좀 부탁드려요 ㅠㅠ',
 'date': '2021-05-10',
 'content': '\n오늘 친구 어머니께서 갑자기 폐에 부종이 있고 염증이 생겨 병원에 입원을 하시게 되었는데요의사가 급한 상황이라고, 당장 내일 수술을 해야한다고 했다네요.. ㅠㅠ근데 코로나덕분에 전체적으로 피가 턱없이 부족한 상황이라환자가 직접 피를 구해와야 하는 상황이라고 합니다 ㅠ혹시나형이신분들, 시간 되시면김포우리병원 김애자\xa0님 앞으로\xa0지정헌혈\xa0부탁드려요(+)집에 잠자고 있는 헌혈증도 보내주시면 도움이 될거예요대표전화 031-999-1000\xa0\xa010099 경기도 김포시 감암로 11 김포우리병원 김애자님\n ',
 'url': 'https://pann.nate.com/talk/359634932?page=6',
 'source': 'nate',
 'scrap_time': '2021-11-22 03:37:10'}