In [2]:
import re
import requests
from datetime import datetime
from bs4 import BeautifulSoup, Tag
from time import gmtime, strftime
from bs4 import BeautifulSoup, NavigableString
import pandas as pd
def now():
    """
    Returns
    -------
    Current time : str
        eg: 2018-11-22 13:35:23
    """
    return strftime("%Y-%m-%d-%W %H:%M:%S", gmtime())

def get_soup(url, headers=None):
    """
    Arguments
    ---------
    url : str
        Web page url
    headers : dict
        Headers for requests. If None, use Mozilla/5.0 as default user-agent

    Returns
    -------
    soup : bs4.BeautifulSoup
        Soup format web page
    """

    if headers is None:
        headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
    r = requests.get(url, headers=headers)
    html = r.text
    page = BeautifulSoup(html, 'lxml')
    return page

doublespace_pattern = re.compile('\s+')
lineseparator_pattern = re.compile('\n+')

def normalize_text(text):
    text = text.replace('\t', ' ')
    text = text.replace('\r', ' ')
    text = lineseparator_pattern.sub('\n', text)
    text = doublespace_pattern.sub(' ', text)
    return text.strip()


In [3]:
def to_string(instance):
    final_string = ''
    if isinstance(instance, NavigableString):
        return instance
    for contents in instance.contents:
        final_string += to_string(contents)
    return final_string

In [4]:
from dateutil.parser import parse

In [5]:
def parse_page(url):
    """
    Argument
    --------
    url : str
        Web page url

    Returns
    -------
    json_object : dict
        JSON format web page contents
        It consists of
            date : publication date of article
            title : article title
            subtitle : none
            content : text 
            category : World
            source : Guardian
            url : web page url
            scrap_time : scrapped time
    """
    soup = get_soup(url)

    content_list = soup.find('div', class_ = 'content').find_all('p')
    content = ''
    for paragraph in content_list:
        content = content + to_string(paragraph) + ' '
        content = content.replace(u'\xa0', u' ').replace('\n', '').replace('\'s', '`s')


    
    title = soup.find('h1')
    title = title.text.replace('\r\n','')
        
    time = soup.find('p', class_ = 'time').text
    time = parse(time).strftime("%Y-%m-%d")
    json_object = {
        'date': time,
        'title': title,
        'time' : time,
        'subtitle': '',
        'content': content,
        'source': 'Xinhua',
        'url': url,
        'scrap_time': now()
        }
    return json_object

In [7]:
get_soup('https://search.news.cn/getNews?sortField=0&searchFields=0&keyword=olympic&curPage=3&lang=en')

<html><body><p>{"code":200,"content":{"recommendation":null,"keyword":"olympic","sortField":"0","optionsSearchTypes":null,"curPage":3,"results":[{"code":null,"contentId":"202202174c218def613b40ba9b0a1e76d7d15cc7_ea5462a0768d4c3b94ddbf661c1932cc","des":null,"imgUrl":null,"keyword":null,"listResult":null,"pubtime":"2022-02-17 00:47:00","sitename":"Ëã±ÊñáÈ¢ëÈÅì","title":"China¬†keeps¬†semifinal¬†hope¬†alive¬†in¬†<font color="red">Olympic</font>¬†women's¬†team¬†curling","url":"https://english.news.cn/20220217/4c218def613b40ba9b0a1e76d7d15cc7/c.html"},{"code":null,"contentId":"202202174c218def613b40ba9b0a1e76d7d15cc7_a8e397e9d2224c0aa3855cf94e9fda0a","des":null,"imgUrl":null,"keyword":null,"listResult":null,"pubtime":"2022-02-17 00:47:00","sitename":"Ëã±ÊñáÈ¢ëÈÅì","title":"China¬†keeps¬†semifinal¬†hope¬†alive¬†in¬†<font color="red">Olympic</font>¬†women's¬†team¬†curling","url":"https://english.news.cn/20220217/4c218def613b40ba9b0a1e76d7d15cc7/c.html"},{"code":null,"contentId":"202202174c218

In [8]:
get_soup('http://search.news.cn/?lang=en#search/0/OLYMPIC/6/').find_all('a')

[<a href="http://english.news.cn/" target="_blank">Xinhuanet</a>,
 <a href="http://english.news.cn/" target="_blank"><img src="/font/images/index/logo.png"/></a>,
 <a class="advanceSearch" href="javascript:;">Advanced Search</a>,
 <a class="typeBtn selected" data-type="0" href="javascript:;">Stories</a>,
 <a class="typeBtn" data-type="1" href="javascript:;">Photos</a>,
 <a class="typeBtn" data-type="2" href="javascript:;">Videos</a>,
 <a class="advanceSearch" href="javascript:;" id="advanceSearch">Advanced Search</a>,
 <a href="http://english.news.cn/" target="_blank">Xinhuanet</a>,
 <a href="javascript:;" id="seniorSubmit">Enter</a>]

In [9]:
def yield_latest_article(begin_date, max_num=10, sleep=0.1):
    """
    Artuments
    ---------
    begin_date : str
        eg. 2018-07-01
    end_date :str
        eg. 2019-03-31
    max_num : int
        Maximum number of news to be scraped
    sleep : float
        Sleep time. Default 1.0 sec

    It yields
    ---------
    news : json object
    """

    # prepare parameters
    n_news = 0
    outdate = False

    for page in range(139800541, 130000000, -1):

        # check number of scraped news
        # get urls
        page = str(page)
        ymonth = parse(begin_date).strftime("%Y-%m/%d")
        url = "http://www.news.cn/english/{}/c_{}.htm".format(ymonth, page)

        print(url)
        try:
            news_json = parse_page(url)
            return news_json
            # check date of scraped news

        except:
            begin_date = (datetime.strptime(begin_date, "%Y-%m-%d") +  timedelta(days=1)).strftime("%Y-%m/%d")
            print(afterday)
            url = "http://www.news.cn/english/{}/c_{}.htm".format(afterday, page)
            news_json = parse_page(url)
            return news_json

In [6]:
begin_date = "2021-03-11"

In [7]:
parse_page('https://english.news.cn/20220218/abfbbf15cf6e4cf78b1235a3ceb6322b/c.html')

{'date': '2022-02-18',
 'title': "Chinese Wisdom in Xi's Words: Fragrant plum blossoms in bitter cold",
 'time': '2022-02-18',
 'subtitle': '',
 'content': '   BEIJING, Feb. 18 (Xinhua) -- "There will be no fragrant plum blossoms without freezing cold weather."    Chinese President Xi Jinping has quoted this line from a poem of the Tang Dynasty (618-907) to encourage athletes to train hard to achieve good results in the ongoing Beijing 2022 Winter Olympics.    The Chinese share a particular fondness for plum blossoms because they bloom in the winter when almost all other plants wither in the bitter cold. They admire the flower for its perseverance and tenacity in the face of adversity.    Writers of later generations continued to use the similar metaphor. For instance, a popular couplet paralleled plum blossoms with the blade of swords -- "good honing gives a sharp edge to a sword; bitter cold adds fragrance to plum blossoms."    For most athletes, Beijing 2022 saw their years of dedic

In [7]:
for page in range(139800541, 139800500, -1):
    page = str(page)
    ymonth = parse(begin_date).strftime("%Y-%m/%d")
    url = "http://www.news.cn/english/{}/c_{}.htm".format(ymonth, page)

    print(url)
    try:
        news_json = parse_page(url)
        print(news_json)
        # check date of scraped news

    except:
        try:
            begin_date = (datetime.strptime(begin_date, "%Y-%m-%d") +  timedelta(days=1)).strftime("%Y-%m/%d")
            url = "http://www.news.cn/english/{}/c_{}.htm".format(begin_date, page)
            print(url)
            news_json = parse_page(url)
            print(news_json)
        except:
            print("This url is not available")

http://www.news.cn/english/2021-03/11/c_139800541.htm
This url is not available
http://www.news.cn/english/2021-03/11/c_139800540.htm
This url is not available
http://www.news.cn/english/2021-03/11/c_139800539.htm
This url is not available
http://www.news.cn/english/2021-03/11/c_139800538.htm
This url is not available
http://www.news.cn/english/2021-03/11/c_139800537.htm
This url is not available
http://www.news.cn/english/2021-03/11/c_139800536.htm
This url is not available
http://www.news.cn/english/2021-03/11/c_139800535.htm
This url is not available
http://www.news.cn/english/2021-03/11/c_139800534.htm
This url is not available
http://www.news.cn/english/2021-03/11/c_139800533.htm
This url is not available
http://www.news.cn/english/2021-03/11/c_139800532.htm
This url is not available
http://www.news.cn/english/2021-03/11/c_139800531.htm
This url is not available
http://www.news.cn/english/2021-03/11/c_139800530.htm
This url is not available
http://www.news.cn/english/2021-03/11/c_

In [190]:
yield_latest_article("2021-03-11", max_num=10, sleep=0.1)

http://www.news.cn/english/2021-03/11/c_139800541.htm


{'date': '2021-03-11',
 'title': 'Xinhua Photos of the Day (March 10) ',
 'time': '2021-03-11',
 'subtitle': '',
 'content': 'BEIJING, March 10 (Xinhua) -- The world in the past 24hrs. A selection of the best daily press photos from Xinhua.  The closing meeting of the fourth session of the 13th National Committee of the Chinese People`s Political Consultative Conference (CPPCC) is held at the Great Hall of the People in Beijing, capital of China, March 10, 2021. Xi Jinping, Li Keqiang, Li Zhanshu, Wang Huning, Zhao Leji, Han Zheng and Wang Qishan attended the closing meeting. Wang Yang presided over the closing meeting and delivered a speech. (Xinhua/Huang Jingwen)  The closing meeting of the fourth session of the 13th National Committee of the Chinese People`s Political Consultative Conference (CPPCC) is held at the Great Hall of the People in Beijing, capital of China, March 10, 2021. (Xinhua/Li Xiang)  The closing meeting of the fourth session of the 13th National Committee of the C

In [8]:
yield_latest_allnews('the', "2021-08-01")

NameError: name 'yield_latest_allnews' is not defined

In [66]:
d_begin = parse('2021-08-01')

In [69]:
for url in real_url:
            print(url)
            news_json = parse_page(url)
            print(news_json)
            time.sleep(1)
            d_news = parse(news_json['date'])
            if d_begin > d_news:
                outdate = True
                print('Stop scrapping. {} news was scrapped'.format(n_news))
                print('The oldest news has been created after {}'.format(begin_date))
                break

http://www.xinhuanet.com/english/asiapacific/2021-08/27/c_1310152878.htm
{'date': '2021-08-27', 'title': 'China urges Japan not to mislead the next generation ', 'time': '2021-08-27', 'subtitle': '', 'content': 'BEIJING, Aug. 27 (Xinhua) -- China firmly opposes Japan unveiling a defense white paper for children, and has lodged solemn representations with Japan, urging Japan not to mislead the next generation, a foreign ministry spokesperson said on Friday. Japan`s defense ministry on Aug. 16 released a 30-page white paper for elementary and junior high school students on its website, which covers China`s military development, the Diaoyu Islands and the South China Sea issues. Chinese Foreign Ministry spokesperson Zhao Lijian told a press briefing that China already stated its solemn position on Japan`s 2021 defense white paper before, saying China firmly opposes Japan`s unreasonable accusations against China`s regular national defense construction, irresponsible remarks on China`s legi

http://www.news.cn/english/northamerica/2021-08/27/c_1310152594.htm
{'date': '2021-08-27', 'title': 'China opposes politicizing the origins of COVID-19: New York Consulate General ', 'time': '2021-08-27', 'subtitle': '', 'content': 'NEW YORK, Aug. 26 (Xinhua) -- China opposes politicizing efforts to uncover the origins of the coronavirus, the Chinese Consulate General in New York has said. "Some U.S. politicians and media never stop their stigmatizing and politicizing campaign" but origins tracing is a serious scientific matter, the consulate`s spokesperson said in a Boston Herald editorial Thursday. The U.S. intelligence community isn\'t qualified to reach any scientific conclusion tracing the origins of the virus, and its report on the issue will only "interfere in and undermine the global efforts of origins tracing and cooperation on pandemic response," added the spokesperson. The joint report on origins tracing done by the World Health Organization and Chinese scientists in early 2

KeyboardInterrupt: 

In [9]:
import os

In [10]:

def save(json_obj, directory):
    date = json_obj.get('date', '')
    title = json_obj.get('title', '')
    filepath = '{}/{}_{}.json'.format(directory, date, re.sub('[^a-zA-Z ]+',"", title[:50]))
    if not os.path.exists(filepath):
        with open(filepath, 'w', encoding='utf-8') as fp:
            json.dump(json_obj, fp, indent=2, ensure_ascii=False)
            print('scraped {}'.format(json_obj['title'][:10]))
    else:
        print('this {}'.format(json_obj['title'])+ 'is already scraped')

In [11]:
 directory = 'C:/Users/13a71/Documents/crawling output/xinhua'

In [14]:
save(json_obj, directory)

this The "incredible" U.S. intelligence: targeted killings is already scraped


In [13]:
json_obj = parse_page('http://www.news.cn/english/2021-08/28/c_1310154451.htm')

In [23]:
yield_latest_allnews("the", '2021-08-01', 2, 1.0)

['http://www.news.cn/english/2021-08/29/c_1310155794.htm', 'http://www.news.cn/english/2021-08/29/c_1310155546.htm', 'http://www.news.cn/english/2021-08/29/c_1310155498.htm', 'http://www.news.cn/english/2021-08/29/c_1310155497.htm', 'http://www.news.cn/english/2021-08/29/c_1310155497.htm', 'http://www.news.cn/english/2021-08/29/c_1310155358.htm', 'http://www.news.cn/english/2021-08/29/c_1310155150.htm', 'http://www.news.cn/english/2021-08/29/c_1310154979.htm', 'http://www.news.cn/english/2021-08/29/c_1310154673.htm', 'http://www.news.cn/english/2021-08/28/c_1310154454.htm']
{'date': '2021-08-29', 'title': 'GLOBALink | Canadian painter:"The Dream of Red Chamber" is my favorite ', 'time': '2021-08-29', 'subtitle': '', 'content': '  NANCHANG, Aug. 29 (Xinhua) -- Brandon Collins-Green, a canadian painter, came to China to study ancient Chinese literature as he is interested in the famous Chinese novel "The Dream of the Red Chamber". Check out his story. #GLOBALink  Produced by Xinhua Globa

{'date': '2021-08-29', 'title': 'Firing the Chengni inkstone ', 'time': '2021-08-29', 'subtitle': '', 'content': ' As one of the "four treasures in the study" in ancient China, inkstone is so much more than just a container for ink. In this final episode of four, English teacher Mark Dinning from the UK tries his hand at making Chengni inkstone, the only kind made from mud√¢\x96  ', 'source': 'Xinhua', 'url': 'http://www.news.cn/english/2021-08/29/c_1310154673.htm', 'scrap_time': '2021-08-29-34 22:34:05'}
{'date': '2021-08-28', 'title': 'The "incredible" U.S. intelligence: torture and coverup ', 'time': '2021-08-28', 'subtitle': '', 'content': 'CIA tortures and lies...  ', 'source': 'Xinhua', 'url': 'http://www.news.cn/english/2021-08/28/c_1310154454.htm', 'scrap_time': '2021-08-29-34 22:34:05'}


In [34]:
article 

In [33]:
article = yield_latest_allnews("the", '2021-08-01', 2, 1.0)

['http://www.news.cn/english/2021-08/29/c_1310155794.htm', 'http://www.news.cn/english/2021-08/29/c_1310155546.htm', 'http://www.news.cn/english/2021-08/29/c_1310155498.htm', 'http://www.news.cn/english/2021-08/29/c_1310155497.htm', 'http://www.news.cn/english/2021-08/29/c_1310155497.htm', 'http://www.news.cn/english/2021-08/29/c_1310155358.htm', 'http://www.news.cn/english/2021-08/29/c_1310155150.htm', 'http://www.news.cn/english/2021-08/29/c_1310154979.htm', 'http://www.news.cn/english/2021-08/29/c_1310154673.htm', 'http://www.news.cn/english/2021-08/28/c_1310154454.htm']
{'date': '2021-08-29', 'title': 'GLOBALink | Canadian painter:"The Dream of Red Chamber" is my favorite ', 'time': '2021-08-29', 'subtitle': '', 'content': '  NANCHANG, Aug. 29 (Xinhua) -- Brandon Collins-Green, a canadian painter, came to China to study ancient Chinese literature as he is interested in the famous Chinese novel "The Dream of the Red Chamber". Check out his story. #GLOBALink  Produced by Xinhua Globa

In [8]:
def yield_latest_allnews(keyword, begin_date, end_page=10, sleep=1.0):
    """
    Artuments
    ---------
    section : str
        eg. "world, asia, opinion"
    begin_date : str
        eg. 2018-01-01
    max_num : int
        Maximum number of news to be scraped
    sleep : float
        Sleep time. Default 1.0 sec
    It yields
    ---------
    news : json object
    """
    base_url = 'https://search.news.cn/getNews?sortField=0&searchFields=1&keyword={}&curPage={}&lang=en'

    # prepare parameters
    d_begin = parse(begin_date)
    n_news = 0
    outdate = False

    for page in range(1, end_page):

        # check number of scraped news
        if outdate:
            break

        # get urls
        url = base_url.format(keyword, page)
        soup = get_soup(url)
        
        url_list = soup.text.split(',{')
        real_url = []
        for url in url_list: 
            end = re.search("url", url).end()+3
            real_url.append(url[end:-2])

        real_url[-1] = real_url[-1][:-38]
        print(real_url)

        for url in real_url:
            news_json = parse_page(url)
            print(news_json)
                
            d_news = parse(news_json['date'])
            if d_begin > d_news:
                outdate = True
                print('Stop scrapping. {} news was scrapped'.format(n_news))
                print('The oldest news has been created after {}'.format(begin_date))
                break
                # yield
                return news_json
                n_news += 1

In [9]:
end = re.search("url", soup.text.split(',{')[0]).end()+3

NameError: name 'soup' is not defined

In [10]:
soup.text.split(',{')[0][end:-2]

NameError: name 'soup' is not defined

In [109]:
end

389

In [14]:

def yield_latest_article(section, begin_date, end_date, verbose=True):
    """
    Arguments
    ---------
    begin & end : datetime.datetime
    pagenum : int
    verbose : Boolean
        If True, print current status

    Returns
    -------
    links_all : list of str
        List of urls on the page pagenum
    """
    section =str(section)
    base_url = 'https://sputniknews.com/{}/{}/'
    old_url = set()

    dt_index = pd.date_range(start=begin_date, end=end_date)

    for format_date in dt_index:
        date = format_date.strftime("%Y%m%d")
        url = base_url.format(section, date)
        soup = get_soup(url)

        sub_urls = soup.find('div', class_ = 'b-plainlist').find_all('a')
        daily_url = set(['https://sputniknews.com'+ a['href'] for a in sub_urls])
        ## remove duplicate url
        links = list(daily_url - old_url)
        ## update new urls
        old_url = set()
        old_url.update(daily_url)

        for url in links:
            try:
                news_json = parse_page(url)           
                print(news_json)
                time.sleep(sleep)
            except:
                print("this {} is not checked".format(url))

In [15]:
yield_latest_article('world', 20200701, 20200715, verbose=True)

{'date': '2020-07-19', 'title': 'TikTok Suspends Talks on Opening Headquarters in London, Reports Suggest', 'subtitle': '', 'content': 'MOSCOW (Sputnik) - ByteDance, the parent company of social media giant TikTok has broken off negotiations to open a global headquarters in the United Kingdom over worsening relations between Beijing and London, The Sunday Times reported. The decision to halt negotiations was made against the backdrop of a "wider geopolitical context", a source told the newspaper. Chinese media and tech firms have been subject to increasing attacks and restrictions led by the United States. ByteDance had been in talks for several months to launch operations in the UK which could create 3,000 jobs, according to The Sunday Times. ', 'category': 'Business', 'source': 'Sputnik', 'url': 'https://sputniknews.com/business/202007191079927606-tiktok-owner-halts-negotiations-to-open-headquarters-in-london-amid-tensions---reports/'}
this https://sputniknews.com/business/2020071910

In [16]:
url = 'https://sputniknews.com/us/202007191079927652-portland-protesters-break-into-police-association-hq-building-setting-it-on-fire-video/'

In [18]:
url = 'https://sputniknews.com/india/202007191079927681-mumbai-police-register-fir-against-two-for-rape-murder-threats-to-bollywood-star-rhea-chakraborty/'

In [20]:
url = 'https://sputniknews.com/india/202007191079927681-mumbai-police-register-fir-against-two-for-rape-murder-threats-to-bollywood-star-rhea-chakraborty/'

In [21]:
parse_page(url)

{'date': '2020-07-19',
 'title': 'Mumbai Police Register FIR Against Two for Rape, Murder Threats to Bollywood Star Rhea Chakraborty',
 'subtitle': '',
 'content': 'New Delhi (Sputnik): Bollywood actress Rhea Chakraborty was a close friend of actor Sushant Singh Rajput, who allegedly committed suicide last month. Rhea has also been questioned by the Mumbai Police in Rajput‚Äôs death case. The late actor was found hanging at his Mumbai residence on 16 June. Mumbai Police have registered a First Information Report (FIR) against two Instagram users for allegedly threatening Bollywood actress Rhea Chakraborty, an official said on Sunday. Chakraborty, who had been receiving messages with rape and murder threats since the death of her close friend and actor Sushant Singh Rajput, approached the cybercrime cell of the Mumbai Police on Thursday with a request to look into the threats that she was sent. Rajput, 34, was found hanging in his Mumbai apartment on 14 June, sending shockwaves across B

In [55]:
urls = []
for a in dt_index:
    date = a.strftime("%Y%m%d")
    urls.append(url.format(date))

In [60]:
get_soup(urls[0])

<!DOCTYPE html SYSTEM "about:legacy-compat">
<html class="responsive" lang="en"><head><title>Military &amp; Intelligence - News of the Day - 01.07.2019 | Sputnik International</title><meta content="IE=edge" http-equiv="X-UA-Compatible"/><meta content="Military &amp; Intelligence - News of the Day - 01.07.2019 | Sputnik International. For more news check our website Sputnik" name="description"/><meta content="News about Military, New Military Technologies, News about Army, News about Air Force, News about Navy, military news now, war news now, russian military news, us military news, military news, military weapons news, us defense news, russian defence news, intelligence news" name="keywords"/><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/><meta content="all" name="robots"/><meta content="telephone=no" name="format-detection"/>
<meta content="ahrefs-site-verification_899ad24abeb428773ae480d902877f72151b81b12d4b7293cbfa1c452c76adcd" name="ahrefs-site-verification"/>

In [81]:
soup = get_soup(urls[0])

In [82]:
sub_urls = soup.find('div', class_ = 'b-plainlist').find_all('a')
each_urls = [ 'https://sputniknews.com/'+ a['href'] for a in sub_urls]

In [83]:
get_soup(urls[0]).find_all('div', clsss_='b-plainlist')

[]

In [84]:
a= set(each_urls )

In [85]:
soup = get_soup(urls[1])
sub_urls = soup.find('div', class_ = 'b-plainlist').find_all('a')
each_urls = [ 'https://sputniknews.com/'+ a['href'] for a in sub_urls]
b= list(set(each_urls ))

In [89]:
old_urls = set()

In [90]:
old_urls.update(a)

In [99]:
urls = list(a-b)

In [117]:
urls[0]

'https://sputniknews.com//military/201906301076103760-russian-aerobatic-teams-stunts-army-exhibition/'

In [118]:
parse_page(urls[0])

AttributeError: 'str' object has no attribute 'text'

In [28]:
print(dt_index)

DatetimeIndex(['2019-07-01', '2019-07-02', '2019-07-03', '2019-07-04',
               '2019-07-05', '2019-07-06', '2019-07-07', '2019-07-08',
               '2019-07-09', '2019-07-10',
               ...
               '2020-06-22', '2020-06-23', '2020-06-24', '2020-06-25',
               '2020-06-26', '2020-06-27', '2020-06-28', '2020-06-29',
               '2020-06-30', '2020-07-01'],
              dtype='datetime64[ns]', length=367, freq='D')


In [26]:
dt_index = pd.date_range(start= '2019-07-01', end='2020-07-01')

In [120]:
soup.find('div' , class_ ='b-article').find_all('p')[:-1]

[<p>The Army-2019 military expo concludes in the Moscow Region's Alabino with a closing show featuring real battle simulations using the newest weapons, tanks, and aircraft.</p>,
 <p>The 5th forum, Army-2019, which showcases modern arms and military equipment, kicked off at Patriot Park in the Moscow Region on Tuesday. The forum will run until Sunday.</p>,
 <p>The forum is a multifaceted event that apart from the main exposition, includes special projects, such as an international exhibition of high-throughput equipment and technology for re-equipping enterprises in the military-industrial complex, Intellectual Technology Exhibition 2019, and Innovation Club special exhibition.</p>]

In [123]:
content = ''
for paragraph in content_list:
    content = content + to_string(paragraph) + ' '

In [124]:
content

"The Army-2019 military expo concludes in the Moscow Region's Alabino with a closing show featuring real battle simulations using the newest weapons, tanks, and aircraft. The 5th forum, Army-2019, which showcases modern arms and military equipment, kicked off at Patriot Park in the Moscow Region on Tuesday. The forum will run until Sunday. The forum is a multifaceted event that apart from the main exposition, includes special projects, such as an international exhibition of high-throughput equipment and technology for re-equipping enterprises in the military-industrial complex, Intellectual Technology Exhibition 2019, and Innovation Club special exhibition. "