In [1]:
import re
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup, Tag
from time import gmtime, strftime
from bs4 import BeautifulSoup, NavigableString
def now():
    """
    Returns
    -------
    Current time : str
        eg: 2018-11-22 13:35:23
    """
    return strftime("%Y-%m-%d-%W %H:%M:%S", gmtime())

def get_soup(url, headers=None):
    """
    Arguments
    ---------
    url : str
        Web page url
    headers : dict
        Headers for requests. If None, use Mozilla/5.0 as default user-agent

    Returns
    -------
    soup : bs4.BeautifulSoup
        Soup format web page
    """

    if headers is None:
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    r = requests.get(url, headers=headers)
    html = r.text
    page = BeautifulSoup(html, 'lxml')
    return page

doublespace_pattern = re.compile('\s+')
lineseparator_pattern = re.compile('\n+')

def normalize_text(text):
    text = text.replace('\t', ' ')
    text = text.replace('\r', ' ')
    text = lineseparator_pattern.sub('\n', text)
    text = doublespace_pattern.sub(' ', text)
    return text.strip()


In [2]:
def to_string(instance):
    final_string = ''
    if isinstance(instance, NavigableString):
        return instance
    for contents in instance.contents:
        final_string += to_string(contents)
    return final_string

In [3]:
import re
from dateutil.parser import parse
from bs4 import BeautifulSoup, NavigableString

def to_string(instance):
    final_string = ''
    if isinstance(instance, NavigableString):
        return instance
    for contents in instance.contents:
        final_string += to_string(contents)
    return final_string

def parse_page(url):
    """
    Argument
    --------
    url : str
        Web page url

    Returns
    -------
    json_object : dict
        JSON format web page contents
        It consists of
            date : publication date of article
            title : article title
            subtitle : none
            content : text 
            category : World
            source : Guardian
            url : web page url
            scrap_time : scrapped time
    """

    #try:
    soup = get_soup(url)
    try:
        main_content = soup.find('div', class_='article__announce-text').text
        content_list = soup.find_all('div', class_='article__text')
    except:
        return 'error'
    
    content = ''
    for paragraph in content_list:
        content = content + to_string(paragraph) + ' '
    content = main_content + content.replace(u'\xa0', u' ')
    
   
    title = soup.find('h1', class_ = 'article__title')
    title = title.text.replace('\n','')
    
    try:
        time = parse(url.split('/')[3]).strftime("%Y-%m-%d")
    except:
        time = parse(soup.find('div', class_='article__info-date').text[:20]).strftime("%Y-%m-%d")

    category_list = soup.find_all('li', class_ = 'tag')
    category_all = ''
    for category in category_list:
        category_all = category_all + category.find('a').text + ', '
    

    json_object = {
        'date': time,
        'title': title,
        'subtitle': '',
        'content': content,
        'category': category.text,
        'source': 'Sputnik',
        'url': url
    }
    return json_object
    """
    except Exception as e:
        return e
    """

In [4]:
sleep = 0.01
def yield_latest_article(begin_date, end_date, sleep, section, verbose=True):
    """
    Arguments
    ---------
    begin & end : datetime.datetime
    pagenum : int
    verbose : Boolean
        If True, print current status

    Returns
    -------
    links_all : list of str
        List of urls on the page pagenum
    """
    daily_datarange = pd.date_range(start=begin_date,
                                end=end_date,
                                freq='b')
    
    for date in daily_datarange:
        
        base_url = "https://sputniknews.com/services/{}/more.html?date={}&tags=1"
        url = base_url.format(section, date.strftime("%Y%m%d"))
        print(url)
        soup = get_soup(url)
        old_url = set()

        sub_urls = soup.find_all('div', class_ = 'list__content')
        daily_url = set(['https://sputniknews.com'+ a.find('a')['href'] for a in sub_urls])
        ## remove duplicate url
        links = list(daily_url - old_url)
        ## update new urls
        old_url = set()
        old_url.update(daily_url)

        for url in links:
            print(url)
            news_json = parse_page(url)           
            yield news_json
            time.sleep(sleep)
       

In [5]:
def save(json_obj, directory):
    date = json_obj.get('date', '')
    title = json_obj.get('title', '')
    filepath = '{}/{}_{}.json'.format(directory, date, re.sub('[^a-zA-Z ]+',"", title[:50]))
    if not os.path.exists(filepath):
        with open(filepath, 'w', encoding='utf-8') as fp:
            json.dump(json_obj, fp, indent=2, ensure_ascii=False)
            print('scraped {}'.format(json_obj['title']))
    else:
        print('this {}'.format(json_obj['title'])+ 'is already scraped')

In [6]:
directory ='C:/Users/13a71/Documents/crawling output/sputnik_outcome'

In [13]:
import argparse
import json
import os
import re
import time
import datetime
for article in yield_latest_article('2022-03-17', '2022-04-02', 0.001, "world"):
    try:
        save(article, directory)
        print('scraped {}'.format(article.get('url'), ''))
    except Exception as e:
        print(e)
        continue

https://sputniknews.com/services/world/more.html?date=20220317&tags=1
https://sputniknews.com/20220316/us-president-joe-biden-delivers-remarks-on-assistance-to-ukraine--1093929888.html
scraped US President Joe Biden Delivers Remarks on Assistance to Ukraine 
scraped https://sputniknews.com/20220316/us-president-joe-biden-delivers-remarks-on-assistance-to-ukraine--1093929888.html
https://sputniknews.com/20220317/live-updates-russias-emercom-supplying-2000-tonnes-of-humanitarian-aid-to-donbass-ukraine-1093944944.html
scraped LIVE UPDATES: Pentagon Official Says US Cannot Confirm Which Forces Struck Mariupol Theater
scraped https://sputniknews.com/20220317/live-updates-russias-emercom-supplying-2000-tonnes-of-humanitarian-aid-to-donbass-ukraine-1093944944.html
https://sputniknews.com/20220317/russian-fm-lavrov--uae-counterpart-al-nahyan-hold-presser-after-talks-in-moscow-1093951371.html
scraped Russian FM Lavrov & UAE Counterpart Al Nahyan Hold Presser After Talks in Moscow
scraped https:

ConnectionError: HTTPSConnectionPool(host='sputniknews.com', port=443): Max retries exceeded with url: /20220331/ukraine-threatens-to-sink-any-foreign-ship-trying-to-leave-its-ports-russian-mod-says-1094355972.html (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DD01823A30>: Failed to establish a new connection: [WinError 10060] 연결된 구성원으로부터 응답이 없어 연결하지 못했거나, 호스트로부터 응답이 없어 연결이 끊어졌습니다'))

In [None]:
sub_urls = soup.find('div', class_ = 'b-plainlist').find_all('a')
each_urls = [ 'https://sputniknews.com/'+ a['href'] for a in sub_urls]

In [None]:
get_soup(urls[0]).find_all('div', clsss_='b-plainlist')

In [None]:
a= set(each_urls )

In [None]:
soup = get_soup(urls[1])
sub_urls = soup.find('div', class_ = 'b-plainlist').find_all('a')
each_urls = [ 'https://sputniknews.com/'+ a['href'] for a in sub_urls]
b= list(set(each_urls ))

In [None]:
old_urls = set()

In [None]:
old_urls.update(a)

In [None]:
urls = list(a-b)

In [None]:
urls[0]

'https://sputniknews.com//military/201906301076103760-russian-aerobatic-teams-stunts-army-exhibition/'

In [118]:
parse_page(urls[0])

AttributeError: 'str' object has no attribute 'text'

In [28]:
print(dt_index)

DatetimeIndex(['2019-07-01', '2019-07-02', '2019-07-03', '2019-07-04',
               '2019-07-05', '2019-07-06', '2019-07-07', '2019-07-08',
               '2019-07-09', '2019-07-10',
               ...
               '2020-06-22', '2020-06-23', '2020-06-24', '2020-06-25',
               '2020-06-26', '2020-06-27', '2020-06-28', '2020-06-29',
               '2020-06-30', '2020-07-01'],
              dtype='datetime64[ns]', length=367, freq='D')


In [26]:
dt_index = pd.date_range(start= '2019-07-01', end='2020-07-01')

In [120]:
soup.find('div' , class_ ='b-article').find_all('p')[:-1]

[<p>The Army-2019 military expo concludes in the Moscow Region's Alabino with a closing show featuring real battle simulations using the newest weapons, tanks, and aircraft.</p>,
 <p>The 5th forum, Army-2019, which showcases modern arms and military equipment, kicked off at Patriot Park in the Moscow Region on Tuesday. The forum will run until Sunday.</p>,
 <p>The forum is a multifaceted event that apart from the main exposition, includes special projects, such as an international exhibition of high-throughput equipment and technology for re-equipping enterprises in the military-industrial complex, Intellectual Technology Exhibition 2019, and Innovation Club special exhibition.</p>]

In [123]:
content = ''
for paragraph in content_list:
    content = content + to_string(paragraph) + ' '

In [124]:
content

"The Army-2019 military expo concludes in the Moscow Region's Alabino with a closing show featuring real battle simulations using the newest weapons, tanks, and aircraft. The 5th forum, Army-2019, which showcases modern arms and military equipment, kicked off at Patriot Park in the Moscow Region on Tuesday. The forum will run until Sunday. The forum is a multifaceted event that apart from the main exposition, includes special projects, such as an international exhibition of high-throughput equipment and technology for re-equipping enterprises in the military-industrial complex, Intellectual Technology Exhibition 2019, and Innovation Club special exhibition. "