# TLS-COVID19 Dataset

## Collect data

### Requirements

In [None]:
# Selenium (required to collect CNN news)
!apt update
!apt install chromium-chromedriver

!pip install selenium

# Date handling
!pip install dateparser

In [None]:
# Packages from Python standard library
import os
import time
import re
import json
import html
from datetime import datetime
from pathlib import Path
from urllib.parse import urlencode

# Third-party packages
import requests
import dateparser
from bs4 import BeautifulSoup
from selenium import webdriver

### Sources

In [None]:
# The root directory where the datasets will be placed
DATA_DIR = 'data/'
# Create dir if does not exist
Path(DATA_DIR).mkdir(parents=True, exist_ok=True)

# Used in requests
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}

# Functions used across all liveblog collectors

def clean_html(text):
  """Return ``text`` with HTML entities unescaped and HTML tags removed.

  ``text`` may be a string or any object whose ``str()`` is markup (the
  collectors also pass BeautifulSoup nodes). ``None`` yields an empty string.
  """
  if text is None:
    return ''
  # Coerce first: html.unescape() only handles real strings, so entities in
  # non-string inputs would otherwise stay escaped (or raise)
  text = html.unescape(str(text))
  # Non-greedy match so each tag is removed on its own and the text between
  # tags is kept
  return re.sub(r'<.*?>', '', text)

def pre_proc(text):
  """Strip HTML from ``text`` and collapse every whitespace run to one space."""
  # split() without arguments discards \r, \t, \n and repeated spaces at once
  tokens = clean_html(text).split()
  return ' '.join(tokens)

def format_date(date_str):
  """Return ``date_str`` reformatted as a ``yyyy-mm-dd hh:mm`` string.

  Raises:
    ValueError: if dateparser cannot interpret ``date_str``.
  """
  parsed = dateparser.parse(date_str)
  # dateparser.parse() returns None for input it cannot interpret; fail with
  # a message that names the offending value instead of an AttributeError
  if parsed is None:
    raise ValueError('Could not parse date: ' + repr(date_str))
  return parsed.strftime('%Y-%m-%d %H:%M')

def write_json(file_path, json_data):
  """Serialise ``json_data`` to ``file_path`` as UTF-8 JSON with 4-space indent."""
  print('Writing ' + file_path)
  # ensure_ascii=False keeps accented characters readable in the output file
  serialized = json.dumps(json_data, ensure_ascii=False, indent=4)
  with open(file_path, mode='w', encoding='utf8') as out_file:
      out_file.write(serialized)

# Per-source collection statistics: number of liveblogs, number of news, number of key moments, and first and last dates
collection_stats = {}

def get_source_stats(num_lbs, num_news, num_kms, first_date, last_date):
  """Bundle the collection figures of one source into a single dict."""
  return dict(
      num_lbs=num_lbs,
      num_news=num_news,
      num_kms=num_kms,
      first_date=first_date,
      last_date=last_date,
  )

#### Publico

In [None]:
# Place publico data under data/publico
DATA_DIR_PUBLICO = os.path.join(DATA_DIR, 'publico/')
# Create dir if does not exist
Path(DATA_DIR_PUBLICO).mkdir(parents=True, exist_ok=True)


def get_blogs_publico(timeout=30):
    """Traverse the Publico liveblog listing API and collect every liveblog.

    Pages are requested one after another until an empty page is returned.

    Args:
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of dicts, one per liveblog, with its 'date' (yyyy-mm-dd),
        'id' and 'url'.
    """

    print('===== Collecting Publico liveblogs =====')

    url_lbs_listing = 'https://www.publico.pt/api/list/coronavirus-ao-minuto'
    payload = {'page': 0}

    list_blogs = []
    while True:

        # Without a timeout a stalled connection would block the collection forever
        r = requests.get(url_lbs_listing, headers=HEADERS, params=payload, timeout=timeout)
        response_json = r.json()

        # Exit if there are no more entries
        if not response_json:
            break

        # For each liveblog collect its date, id and url
        for lb in response_json:
            lb_dict = {
                # Keep only the yyyy-mm-dd part of the formatted date
                'date': format_date(lb['data']).split()[0],
                'id': lb['id'],
                'url': lb['shareUrl'],
            }
            print(lb_dict['url'])
            list_blogs.append(lb_dict)

        payload['page'] += 1

    # 287 lbs as of 18/12
    print('# lbs:', len(list_blogs))
    print()

    return list_blogs

def get_news_publico(timeout=30):
    """Collect the news of every Publico liveblog through the liveblog API.

    Also stores the collection statistics under collection_stats['publico'].

    Args:
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A tuple (list_news, list_kms), both sorted by descending date. Each
        item is a dict with 'title', 'text', 'date', 'is_km' and 'url'. The
        key moments are the same dict objects that appear in list_news.
    """

    lbs = get_blogs_publico()

    print('===== Collecting Publico liveblogs news =====')

    url_liveblog_api = 'https://api.publico.pt/liveblog/'

    list_news, list_kms = [], []
    for i, lb in enumerate(lbs):
        lb_url = lb['url']

        print(str(i+1) + '/' + str(len(lbs)))
        print(lb_url)

        lb_api_url = url_liveblog_api + str(lb['id'])
        # Without a timeout a stalled connection would block the collection forever
        r = requests.get(lb_api_url, headers=HEADERS, timeout=timeout)
        response_json = r.json()

        # For each liveblog article collect its title, text, date and url
        list_news_lb, list_kms_lb = [], []
        for article in response_json:
            news = {
                'title': clean_html(article['titulo']),
                'text': pre_proc(article['texto']),
                'date': format_date(article['data']),
                'is_km': str(article['isDestaque']),
                'url': lb_url + '#' + str(article['id']),
            }
            list_news_lb.append(news)

            # If article is key moment (isDestaque) append it to key moments list also
            if article['isDestaque'] == True:
                list_kms_lb.append(news)

        list_news.extend(list_news_lb)
        list_kms.extend(list_kms_lb)

        print('# news:', len(list_news_lb))
        print('# kms:', len(list_kms_lb))
        print()

    # Sort news by descending date (the yyyy-mm-dd hh:mm strings sort chronologically)
    list_news.sort(key=lambda item: item['date'], reverse=True)
    list_kms.sort(key=lambda item: item['date'], reverse=True)

    if list_news:
        first_date = list_news[-1]['date'].split()[0]
        last_date = list_news[0]['date'].split()[0]
    else:
        # Nothing was collected, so there is no date range to report
        first_date = last_date = None

    # Generate and store stats of the collection
    source_stats = get_source_stats(len(lbs), len(list_news), len(list_kms), first_date, last_date)
    collection_stats['publico'] = source_stats

    print('Stats: ')
    for k, v in source_stats.items():
        print(k + ': ' + str(v))
    print()

    return list_news, list_kms

news_publico, kms_publico = get_news_publico()

# Creates a json file with the liveblog news under data/publico/
publico_news_path = os.path.join(DATA_DIR_PUBLICO, 'news_publico.json')
write_json(publico_news_path, news_publico)

# Creates a json file with the keymoments news under data/publico/
publico_kms_path = os.path.join(DATA_DIR_PUBLICO, 'kms_publico.json')
write_json(publico_kms_path, kms_publico)

#### Observador

In [None]:
# Place observador data under data/observador
DATA_DIR_OBSERVADOR = os.path.join(DATA_DIR, 'observador/')
# Create dir if does not exist
Path(DATA_DIR_OBSERVADOR).mkdir(parents=True, exist_ok=True)


def get_blogs_ids_observador(timeout=30):
    """Collect the ids of all Observador coronavirus liveblogs.

    The section grid is requested starting at today's date; each response
    carries the offset to use for the next request. Paging stops once the
    offset is earlier than 2020-01-29.

    Args:
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A sorted list of unique liveblog ids.
    """

    print('===== Collecting Observador liveblogs ids =====')

    # 47748 is the id of the section/topic "coronavirus" with the associated url "seccao/saude/saude-publica-saude/coronavirus-saude-publica-saude/"
    url_grid = 'https://observador.pt/wp-json/obs_api/v4/grids/filter/category/47748'

    date_format = '%Y%m%d'

    first_date_str = '20200129'
    today_date_str = datetime.today().strftime(date_format)

    lb_ids = []
    offset = today_date_str
    while True:

        payload = {'render': 'JSON', 'offset': offset}

        # Without a timeout a stalled connection would block the collection forever
        r = requests.get(url_grid, headers=HEADERS, params=payload, timeout=timeout)
        response_json = r.json()

        # Keep only the grid modules that are liveblogs
        for entry in response_json['rendered']['modules']:
            for module in entry['modules']:
                if module['module'] == 'obs_liveblog-1':
                    lb_ids.append(module['meta-id'])

        next_offset = response_json['rendered']['offset']

        print('From: ' + next_offset + ' To: ' + offset)

        # An unchanged offset would repeat the very same request forever
        if next_offset == offset:
            break

        offset = next_offset

        # Offsets are yyyymmdd strings, so string comparison orders them by date
        if offset < first_date_str:
            break

    lb_ids = sorted(set(lb_ids))

    # 316 lbs as of 18/12
    print('# lbs:', len(lb_ids))
    print()

    return lb_ids

def get_blogs_urls_observador(timeout=30):
    """Resolve every Observador coronavirus liveblog id to its public url.

    Args:
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of dicts, one per liveblog, with its 'id', 'url' and 'date'
        (yyyy-mm-dd hh:mm).
    """

    blogs_ids = get_blogs_ids_observador()

    print('===== Collecting Observador liveblogs urls =====')

    url_news_endpoint = 'https://api.observador.pt/wp/items/id/'
    # Alternative
    # url_news_endpoint = 'https://observador.pt/observador_api/req/3_0/items/id/'

    # List of dicts. Each dict contains id, url and date
    lbs = []
    # Named lb_id so the builtin id() is not shadowed
    for i, lb_id in enumerate(blogs_ids):
        lb_url = url_news_endpoint + str(lb_id)

        print(str(i+1) + '/' + str(len(blogs_ids)))
        print(lb_url)

        # Without a timeout a stalled connection would block the collection forever
        r = requests.get(lb_url, headers=HEADERS, timeout=timeout)
        response_json = r.json()

        lbs.append({
            'id': lb_id,
            'url': response_json['links']['webUri'],
            'date': format_date(response_json['pubDate']),
        })

    # 316 lbs as of 18/12
    print('# lbs:', len(lbs))
    print()

    return lbs

def get_news_observador(timeout=30):
    """Collect the news and key moments of every Observador liveblog page.

    Liveblogs whose page lacks the embedded liveblog data are skipped; pages
    without a highlights section simply yield no key moments. Also stores the
    collection statistics under collection_stats['observador'].

    Args:
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A tuple (list_news, list_kms), both sorted by descending date. News
        dicts carry 'title', 'text', 'date', 'is_km' and 'url'; key-moment
        dicts only carry 'title', 'date' and 'url'.
    """

    lbs = get_blogs_urls_observador()

    print('===== Collecting Observador liveblogs news =====')

    list_news, list_kms = [], []
    for i, lb in enumerate(lbs):
        lb_url = lb['url']

        print(str(i+1) + '/' + str(len(lbs)))
        print(lb_url)

        # Without a timeout a stalled connection would block the collection forever
        r = requests.get(lb_url, headers=HEADERS, timeout=timeout)

        soup = BeautifulSoup(r.text, 'html.parser')

        # Liveblog entries are read from a JSON <script> element of the page
        lb_script = soup.find('script', id='liveblog-data', type='application/json')
        if lb_script is None:
            # One broken page should not abort the whole collection
            print('Could not collect liveblog for this url\n')
            continue

        lb_json = json.loads(lb_script.find(text=True))

        list_news_lb = []
        for entry in lb_json:
            article = {
                'title': pre_proc(entry['title']),
                'text': pre_proc(entry['content']),
                'date': format_date(entry['date']['datetime']),
                'is_km': 'False',
                'url': lb_url + '#liveblog-entry-' + str(entry['id'])
            }
            list_news_lb.append(article)

        # Key moments
        highlights = soup.find('details', {'class': 'liveblog-highlights-wrapper'})
        highlights_list = None
        if highlights is not None:
            highlights_list = highlights.find('ul', {'class': 'liveblog-highlights-list'})

        if highlights_list is None:
            print('There are no key moments for this url')
            key_moments = []
        else:
            key_moments = highlights_list.find_all('li')

        list_kms_lb = []
        for km in key_moments:
            article = {
                'title': pre_proc(km.find('span', {'class': 'liveblog-highlights-item-title'})),
                'date': format_date(km.find('time')['datetime']),
                'url': lb_url + '#liveblog-entry-' + str(km['data-id'])
            }
            list_kms_lb.append(article)

        # Flag key moments in news: an entry is a key moment when title, date
        # and url all match. A set makes each lookup constant time.
        km_keys = {(km['title'], km['date'], km['url']) for km in list_kms_lb}
        for news in list_news_lb:
            if (news['title'], news['date'], news['url']) in km_keys:
                news['is_km'] = 'True'

        list_news.extend(list_news_lb)
        list_kms.extend(list_kms_lb)

        print('# news:', len(list_news_lb))
        print('# kms:', len(list_kms_lb))
        print()

    # Sort news by descending date (the yyyy-mm-dd hh:mm strings sort chronologically)
    list_news.sort(key=lambda item: item['date'], reverse=True)
    list_kms.sort(key=lambda item: item['date'], reverse=True)

    if list_news:
        first_date = list_news[-1]['date'].split()[0]
        last_date = list_news[0]['date'].split()[0]
    else:
        # Nothing was collected, so there is no date range to report
        first_date = last_date = None

    # Generate and store stats of the collection
    source_stats = get_source_stats(len(lbs), len(list_news), len(list_kms), first_date, last_date)
    collection_stats['observador'] = source_stats

    print('Stats: ')
    for k, v in source_stats.items():
        print(k + ': ' + str(v))
    print()

    return list_news, list_kms


news_observador, kms_observador = get_news_observador()

# Creates a json file with the liveblog news under data/observador/
observador_news_path = os.path.join(DATA_DIR_OBSERVADOR, 'news_observador.json')
write_json(observador_news_path, news_observador)

# Creates a json file with the keymoments news under data/observador/
observador_kms_path = os.path.join(DATA_DIR_OBSERVADOR, 'kms_observador.json')
write_json(observador_kms_path, kms_observador)

#### CNN

In [None]:
# Place cnn data under data/cnn
DATA_DIR_CNN = os.path.join(DATA_DIR, 'cnn/')
# Create dir if does not exist
Path(DATA_DIR_CNN).mkdir(parents=True, exist_ok=True)

def get_chrome_driver():
  """Build the headless selenium Chrome driver used to collect CNN urls."""
  chrome_options = webdriver.ChromeOptions()
  # Flags for running Chrome without a display (NOTE(review): presumably
  # required by the Colab container — confirm)
  for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(flag)

  chrome = webdriver.Chrome('chromedriver', options=chrome_options)
  # Element lookups keep retrying for up to this long before failing
  chrome.implicitly_wait(30)

  return chrome


def get_blogs_urls_cnn(browser):
    """Collect the urls of CNN "coronavirus news" liveblogs from CNN search.

    The search result pages are loaded through ``browser`` one after another
    until a page shows fewer results than a full page. The browser window is
    closed before returning, also when the collection fails.

    Args:
        browser: A selenium WebDriver.

    Returns:
        A sorted list of unique liveblog urls.
    """

    print('===== Collecting CNN liveblogs urls =====')

    search_url = 'https://edition.cnn.com/search?'
    payload = {'q': '"coronavirus news"', 'size':20, 'page': 1, 'from': 0}

    lb_urls = set()
    try:
        while True:

            url = search_url + urlencode(payload)
            print(url)

            browser.get(url)

            # NOTE(review): presumably waits for the results to be rendered — confirm
            time.sleep(3)

            soup = BeautifulSoup(browser.page_source, 'html.parser')

            results_list = soup.find('div', {'class': 'cnn-search__results-list'})

            # No results container at all: there is nothing (more) to page through
            if results_list is None:
                break

            for result in results_list:
                try:
                    result_content = result.find_next('div', {'class': 'cnn-search__result-contents'})
                    headline = result_content.find_next('h3', {'class': 'cnn-search__result-headline'})
                    a = headline.find_next('a')
                    lb_url = 'https:' + a['href']
                    if 'coronavirus news' in a.get_text() and 'live-news' in lb_url:
                        lb_urls.add(lb_url)

                except (AttributeError, KeyError, TypeError):
                    # Child node that is not a complete search result
                    # (missing container, headline, link or href): skip it
                    continue

            # Stop if last page (pagination did not display maximum number of contents).
            # len() counts all child nodes of the container; the factor 2
            # presumably accounts for non-result nodes between results — TODO confirm
            if len(results_list) < 2*payload['size']:
                break

            payload['page'] += 1
            payload['from'] = (payload['page'] - 1) * payload['size']
    finally:
        # Always release the browser window, even if a page failed to load
        browser.close()

    lb_urls = sorted(lb_urls)

    # 266 lbs as of 18/12
    print('# lbs:', len(lb_urls))
    print()

    return lb_urls

def get_news_cnn(browser, timeout=30):
    """Collect the news and key moments of every CNN liveblog.

    Liveblogs whose page lacks the liveblog schema are skipped. Key moments
    carry no date on the page, so each one gets the date of the middle entry
    of its liveblog. Also stores the collection statistics under
    collection_stats['cnn'].

    Args:
        browser: A selenium WebDriver, used to collect the liveblog urls.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A tuple (list_news, list_kms), both sorted by descending date. Each
        item is a dict with 'title', 'text', 'date', 'is_km' and 'url'.
    """

    lbs = get_blogs_urls_cnn(browser)

    print('===== Collecting CNN liveblogs news =====')

    list_news, list_kms = [], []
    for i, lb_url in enumerate(lbs):

        print(str(i+1) + '/' + str(len(lbs)))
        print(lb_url)

        # Without a timeout a stalled connection would block the collection forever
        r = requests.get(lb_url, headers=HEADERS, timeout=timeout)

        soup = BeautifulSoup(r.text, 'html.parser')
        try:
            lb_data = soup.find('script', id='liveBlog-schema', type='application/ld+json').find(text=True)
        except AttributeError:
            # soup.find() returned None: the page has no liveblog schema
            print('Could not collect liveblog for this url\n')
            continue

        lb_json = json.loads(lb_data)

        list_news_lb = []
        for entry in lb_json['liveBlogUpdate']:

            try:
                title = pre_proc(entry['headline'])
            except (KeyError, TypeError):
                # Entry without a (usable) headline
                title = ''

            article = {
                'title': title,
                'text': pre_proc(entry['articleBody']),
                'date': format_date(entry['datePublished']),
                'is_km': 'False',
                'url': entry['url']
            }
            list_news_lb.append(article)

        list_news.extend(list_news_lb)

        try:
            kms_data = soup.find('aside', id='ls-rail').find('div', class_='sc-dnqmqq render-stellar-contentstyles__List-sc-9v7nwy-1 eUPcFX').find_next('ul')
        except AttributeError:
            kms_data = None

        # find_next() itself may also come back empty-handed
        if kms_data is None:
            print('There are no key moments for this url')
            kms_data = []

        list_kms_lb = []
        # Without entries there is no date to approximate the key moments with
        if list_news_lb:
            # Date is an approximation as key moments do not refer to any date
            approx_date = list_news_lb[len(list_news_lb)//2]['date']
            for km in kms_data:
                clean_text = pre_proc(km)
                # Child nodes without any text are not key moments
                if not clean_text:
                    continue
                article = {
                    'title': clean_text,
                    'text': clean_text,
                    'date': approx_date,
                    'is_km': 'True',
                    'url': lb_url
                }
                list_kms_lb.append(article)

        list_kms.extend(list_kms_lb)

        print('# news:', len(list_news_lb))
        print('# kms:', len(list_kms_lb))
        print()

    # Sort news by descending date (the yyyy-mm-dd hh:mm strings sort chronologically)
    list_news.sort(key=lambda item: item['date'], reverse=True)
    list_kms.sort(key=lambda item: item['date'], reverse=True)

    if list_news:
        first_date = list_news[-1]['date'].split()[0]
        last_date = list_news[0]['date'].split()[0]
    else:
        # Nothing was collected, so there is no date range to report
        first_date = last_date = None

    # Generate and store stats of the collection
    source_stats = get_source_stats(len(lbs), len(list_news), len(list_kms), first_date, last_date)
    collection_stats['cnn'] = source_stats

    print('Stats: ')
    for k, v in source_stats.items():
        print(k + ': ' + str(v))
    print()

    return list_news, list_kms


news_cnn, kms_cnn = get_news_cnn(get_chrome_driver())

# Creates a json file with the liveblog news under data/cnn/
cnn_news_path = os.path.join(DATA_DIR_CNN, 'news_cnn.json')
write_json(cnn_news_path, news_cnn)

# Creates a json file with the keymoments news under data/cnn/
cnn_kms_path = os.path.join(DATA_DIR_CNN, 'kms_cnn.json')
write_json(cnn_kms_path, kms_cnn)

#### Guardian

In [None]:
# Place guardian data under data/guardian
DATA_DIR_GUARDIAN = os.path.join(DATA_DIR, 'guardian/')
# Create dir if does not exist
Path(DATA_DIR_GUARDIAN).mkdir(parents=True, exist_ok=True)


def get_blogs_urls_guardian(timeout=30):
    """Collect the urls of all Guardian coronavirus liveblogs.

    The series listing is requested page by page until the request ends up
    back on the bare listing url.

    Args:
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A sorted list of unique liveblog urls.
    """

    print('===== Collecting Guardian liveblogs urls =====')

    # This url contains a listing of all coronavirus liveblogs
    search_url = 'https://www.theguardian.com/world/series/coronavirus-live'
    payload = {'page': 1}

    lb_urls = set()
    while True:
        # Without a timeout a stalled connection would block the collection forever
        r = requests.get(search_url, headers=HEADERS, params=payload, timeout=timeout)

        # If we try to get a page that does not exist the url returns to the search_url
        if r.url == search_url:
            break

        soup = BeautifulSoup(r.text, 'html.parser')

        lbs_list = soup.find('div', class_='u-cf index-page', role='main')

        lbs_alinks = lbs_list.find_all('a', class_='fc-item__link')

        for a in lbs_alinks:
            url = a['href']
            lb_urls.add(url)
            print(url)

        payload['page'] += 1

    lb_urls = sorted(lb_urls)

    # 335 lbs as of 18/12
    print('# lbs:', len(lb_urls))
    print()

    return lb_urls

def get_news_guardian(timeout=30):
    """Collect the news and key moments of every Guardian liveblog.

    Each liveblog is followed through its "older" pagination links. Key
    moments are read from the first page only and the news sharing a url with
    a key moment are flagged. Also stores the collection statistics under
    collection_stats['guardian'].

    Args:
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A tuple (list_news, list_kms), both sorted by descending date. News
        dicts carry 'title', 'text', 'date', 'is_km' and 'url'; key-moment
        dicts carry 'title', 'date', 'is_km' and 'url'.
    """

    lbs = get_blogs_urls_guardian()

    print('===== Collecting Guardian liveblogs news =====')

    base_url_guardian = 'https://www.theguardian.com'

    list_news, list_kms = [], []
    # Iterate over liveblogs
    for i, lb_url in enumerate(lbs):

        print(str(i+1) + '/' + str(len(lbs)))
        print(lb_url)

        list_news_lb, list_kms_lb = [], []
        url = lb_url
        # Iterate over each liveblog page
        while True:
            # Without a timeout a stalled connection would block the collection forever
            r = requests.get(url, headers=HEADERS, timeout=timeout)

            soup = BeautifulSoup(r.text, 'html.parser')

            lb = soup.find('div', class_='js-article__container')

            posts = lb.find_all('div', itemprop='liveBlogUpdate')
            for p in posts:

                title = pre_proc(p.find('meta', itemprop='headline')['content'])

                content_pars = p.find('div', class_='block-elements', itemprop='articleBody').find_all('p')
                # Clean paragraphs and join them into the same string
                text = ' '.join([pre_proc(cp) for cp in content_pars])

                date = format_date(p.find('time', class_='js-timestamp')['datetime'])

                news_url = base_url_guardian + p.find('a', class_='block-time__link', itemprop='url')['href']

                list_news_lb.append({
                    'title': title,
                    'text': text,
                    'date': date,
                    'is_km': 'False',
                    'url': news_url
                })

            # Key moments are the same for all pages, inside the same liveblog
            # Only non first pages contain '#liveblog-navigation' in the url
            if '#liveblog-navigation' not in url:
                key_moments_data = soup.find_all('li', class_='timeline__item')

                for km in key_moments_data:

                    title = pre_proc(km.find('span', class_='timeline__title u-underline').get_text())

                    date = format_date(km.find('time', class_='js-timestamp')['datetime'])

                    news_url = base_url_guardian + km.find('a', class_='timeline__link')['href']

                    list_kms_lb.append({
                        'title': title,
                        'date': date,
                        'is_km': 'True',
                        'url': news_url
                    })

            pagination = soup.find('div', id='liveblog-navigation')

            # If there is no pagination (only one page) exit loop
            if pagination is None:
                break

            pagination_older = pagination.find('div', class_='liveblog-navigation__older')

            # A navigation block without an "older" section has no next page either
            if pagination_older is None:
                break

            next_page_a = pagination_older.find('a', class_='liveblog-navigation__link liveblog-navigation__link--primary')

            # If there is no next page exit loop
            if next_page_a is None:
                break

            url = base_url_guardian + next_page_a['href']

        # Flag key moments in news
        # kms and news with same url have is_km=True
        km_urls = {km['url'] for km in list_kms_lb}
        for news in list_news_lb:
            if news['url'] in km_urls:
                news['is_km'] = 'True'

        list_news.extend(list_news_lb)
        list_kms.extend(list_kms_lb)

        print('# news:', len(list_news_lb))
        print('# kms:', len(list_kms_lb))
        print()

    # Sort news by descending date (the yyyy-mm-dd hh:mm strings sort chronologically)
    list_news.sort(key=lambda item: item['date'], reverse=True)
    list_kms.sort(key=lambda item: item['date'], reverse=True)

    if list_news:
        first_date = list_news[-1]['date'].split()[0]
        last_date = list_news[0]['date'].split()[0]
    else:
        # Nothing was collected, so there is no date range to report
        first_date = last_date = None

    # Generate and store stats of the collection
    source_stats = get_source_stats(len(lbs), len(list_news), len(list_kms), first_date, last_date)
    collection_stats['guardian'] = source_stats

    print('Stats: ')
    for k, v in source_stats.items():
        print(k + ': ' + str(v))
    print()

    return list_news, list_kms


news_guardian, kms_guardian = get_news_guardian()

# Creates a json file with the liveblog news under data/guardian/
guradian_news_path = os.path.join(DATA_DIR_GUARDIAN, 'news_guardian.json')
write_json(guradian_news_path, news_guardian)

# Creates a json file with the keymoments news under data/guardian/
guardian_kms_path = os.path.join(DATA_DIR_GUARDIAN, 'kms_guardian.json')
write_json(guardian_kms_path, kms_guardian)

### Stats

In [None]:
import seaborn as sns

In [None]:
for source, stats in collection_stats.items():
  print(source)
  print(stats)

In [None]:
# One bar per source with its number of liveblogs
sources = list(collection_stats)
n_lbs = [stats['num_lbs'] for stats in collection_stats.values()]

ax = sns.barplot(x=sources, y=n_lbs)
ax.set_title('Number of liveblogs per source');

In [None]:
sns.set_theme(style="whitegrid")

sources = list(collection_stats)

# Total news per source, drawn first with the pastel colour codes
sns.set_color_codes("pastel")
n_news = [stats['num_news'] for stats in collection_stats.values()]
ax = sns.barplot(x=sources, y=n_news, label="Total news", color="b")

# Key moments per source, drawn afterwards with the muted colour codes
sns.set_color_codes("muted")
n_kms = [stats['num_kms'] for stats in collection_stats.values()]
ax = sns.barplot(x=sources, y=n_kms, label="Key moments", color="b")

ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax.set_title('Number of news and key moments per source');

### Zip data and save to GDrive

In [None]:
zip_file_name = 'data_' + datetime.today().strftime("%Y-%m-%d") + '.zip'

!zip -r $zip_file_name $DATA_DIR

In [None]:
from google.colab import drive

DRIVE_MOUNT_DIR = '/content/drive'
drive.mount(DRIVE_MOUNT_DIR)

In [None]:
DATA_STORE_DIR = os.path.join(DRIVE_MOUNT_DIR, 'MyDrive/inesc/tls_covid')

!cp -r $zip_file_name $DATA_STORE_DIR

In [None]:
!cp -r drive/MyDrive/inesc/tls_covid/data_2021-01-07.zip .
!unzip data_2021-01-07.zip

## Preprocess datasets

### Requirements

In [None]:
!pip install -U pandas

In [None]:
import os
from pathlib import Path
import pandas as pd

In [None]:
pd.set_option('display.max_colwidth', 2000)

In [None]:
# The root directory where the datasets are placed
DATA_DIR = 'data/'

# Create python vars with the datasets location
DATA_DIR_PUBLICO = os.path.join(DATA_DIR, 'publico/')
NEWS_PUBLICO = os.path.join(DATA_DIR_PUBLICO, 'news_publico.json')

DATA_DIR_OBSERVADOR = os.path.join(DATA_DIR, 'observador/')
NEWS_OBSERVADOR = os.path.join(DATA_DIR_OBSERVADOR, 'news_observador.json')

DATA_DIR_CNN = os.path.join(DATA_DIR, 'cnn/')
NEWS_CNN = os.path.join(DATA_DIR_CNN, 'news_cnn.json')
KMS_CNN = os.path.join(DATA_DIR_CNN, 'kms_cnn.json')

DATA_DIR_GUARDIAN = os.path.join(DATA_DIR, 'guardian/')
NEWS_GUARDIAN = os.path.join(DATA_DIR_GUARDIAN, 'news_guardian.json')

In [None]:
def pre_proc(df_news, noisy_strs_title, noisy_strs_text):
  """Clean a dataframe of news and return the result as a new dataframe.

  Rows with a missing, empty or noisy title are dropped, noisy strings are
  removed from the text, curly quotation marks are replaced by straight ones
  and duplicated news are removed. The 'text' steps are skipped when the
  dataframe has no 'text' column.

  Args:
    df_news: DataFrame with 'title' and 'date' columns and, optionally, a
      'text' column. It is not modified.
    noisy_strs_title: Regex patterns; a row is dropped when its title matches
      any of them (case-insensitive).
    noisy_strs_text: Regex patterns; every match is removed from the text.

  Returns:
    The cleaned DataFrame, with a fresh 0..n-1 index.
  """
  # Some key moment datasets only have a title, no text column
  has_text = 'text' in df_news.columns

  # Remove rows with NaN in title or text. Work on a copy so the caller's
  # dataframe stays untouched and the column assignments below never target
  # a slice of another dataframe.
  required_cols = ['title', 'text'] if has_text else ['title']
  df_news = df_news.dropna(subset=required_cols).copy()

  # Remove rows containing noisy strings in title. An empty pattern list
  # would build the pattern '' that matches (and drops) every row.
  if noisy_strs_title:
    is_noisy = df_news['title'].str.contains('|'.join(noisy_strs_title), case=False, regex=True, na=False)
    df_news = df_news[~is_noisy].copy()

  # Remove noisy strings from news content. regex=True is explicit because
  # pandas 2.0 changed the default of str.replace to literal matching.
  if has_text and noisy_strs_text:
    df_news['text'] = df_news['text'].str.replace('|'.join(noisy_strs_text), '', regex=True)

  # Uniformize quotation marks
  df_news['title'] = df_news['title'].str.replace('“|”', '"', regex=True)
  if has_text:
    df_news['text'] = df_news['text'].str.replace('“|”', '"', regex=True)

  # Remove rows with empty title or text
  df_news = df_news[df_news['title'] != '']
  if has_text:
    df_news = df_news[df_news['text'] != '']

  # Drop duplicated news with same title, text and date, keeping first occurrence
  dedup_cols = ['title', 'text', 'date'] if has_text else ['title', 'date']
  df_news = df_news.drop_duplicates(subset=dedup_cols)

  # Reorder index
  return df_news.reset_index(drop=True)


def write_df_to_json(df, file_path):
  """Write ``df`` to ``file_path`` as an indented JSON array of records.

  The 'date' column is written as 'yyyy-mm-dd hh:mm' strings. ``df`` itself
  is left unchanged, so the function can be called again on the same
  dataframe (formatting the column in place would break the second call).

  Args:
    df: DataFrame with a datetime 'date' column.
    file_path: Destination of the JSON file.
  """
  # assign() returns a new dataframe with the formatted column in place of the original
  formatted = df.assign(date=df['date'].dt.strftime('%Y-%m-%d %H:%M'))
  formatted.to_json(file_path, orient='records', force_ascii=False, indent=4)


# The root directory where the preprocessed datasets will be placed
DATA_CLEAN_DIR = 'data_clean/'
# Create dir if does not exist
Path(DATA_CLEAN_DIR).mkdir(parents=True, exist_ok=True)

### PT

In the data collection step we generated two .json files per source (news and kms) for the sake of separation. The news file contains all the standard liveblog news as well as the key moments, so news.json contains every news article. To standardize and avoid redundancy, the preprocessing step works only with news.json.

#### Publico


In [None]:
noisy_strs_title_publico = [
                        'A primeira página do PÚBLICO',
                        'Fim do acompanhamento',
                        'Quatro opiniões para ler',
                        'Ponto de situação',
                        'Encerramento da cobertura',
                        'Os números da pandemia',
                        'Encerramento do acompanhamento',
                        'Os destaques',
                        'para começar o dia',
                        'a ler',
                        'Vale a pena ler',
                        'o que precisa de saber',
                        'o que deve saber',
                        'Os números actualizados da pandemia',
                        'Os números mundiais da pandemia',
                        'Os últimos números da pandemia',
                        'O número de casos do novo coronavírus',
                        'Acompanhe em directo a conferência',
                        'Veja a conferência de imprensa',
                        'Acompanhe a conferência de imprensa',
                        'Veja em directo a conferência',
                        'Bom dia',
                        'Boa noite',
                        'Resumo dos acontecimentos',
                        'resumo do dia',
                        'resumo da manhã',
                        '^resumo$',
                        '^encerramento$'
]

noisy_strs_text_publico = [
                        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                        r'<span(.*?)>',
                        r'Normal 0 21(.*)',

                        r'^Leia(.*)aqui(.*)',
                        r' Leia(.*)aqui(.*)',
                        r' Leia a entrevista(.*)',
                        r' Leia a reportagem(.*)',
                        r' Leia o resto da reportagem(.*)',
                        r' Leia o artigo(.*)',
                        r' Leia a notícia(.*)',
                        r' Leia o texto(.*)',
                        r' Leia mais(.*)',
                        r' Saiba mais(.*)',
                        r' Ler mais(.*)',
                        r' Leia também(.*)',
                        r'pic.twitter.com/[a-zA-Z0-9]+(\s|$)'
]

In [None]:
# Load json dataset into pandas dataframe
df_publico_lb = pd.read_json(NEWS_PUBLICO)

In [None]:
# Preprocess dataset
df_publico_lb = pre_proc(df_publico_lb, noisy_strs_title_publico, noisy_strs_text_publico)

In [None]:
df_publico_lb.describe()

In [None]:
df_publico_lb.head()

In [None]:
# Place publico data under data_clean/publico
DATA_CLEAN_DIR_PUBLICO = os.path.join(DATA_CLEAN_DIR, 'publico/')
# Create dir if does not exist
Path(DATA_CLEAN_DIR_PUBLICO).mkdir(parents=True, exist_ok=True)

publico_news_clean_path = os.path.join(DATA_CLEAN_DIR_PUBLICO, 'news_publico.json')
write_df_to_json(df_publico_lb, publico_news_clean_path)

#### Observador

In [None]:
noisy_strs_title_observador = [
                        'Ponto da situação',
                        'Ponto de situação no mundo',
                        'Os pontos mais importantes do dia até ao momento'
]

noisy_strs_text_observador = [
                        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                        r' Leia a notícia(.*)',
                        r' Pode ler mais(.*)',
                        r' Ler mais(.*)',
                        r' Saiba mais(.*)',
                        r'Leia mais aqui(.)*',
                        r'\(?([A-Z]+(\s)*)*/OBSERVADOR\)?',
                        r'(\(?Agência\)?\s*)?Lusa\)?$',
                        r'pic.twitter.com/[a-zA-Z0-9]+(\s|$)'
]

In [None]:
# Load json dataset into pandas dataframe
df_observador_lb = pd.read_json(NEWS_OBSERVADOR)

In [None]:
# Preprocess dataset
df_observador_lb = pre_proc(df_observador_lb, noisy_strs_title_observador, noisy_strs_text_observador)

In [None]:
df_observador_lb.describe()

In [None]:
df_observador_lb.head()

In [None]:
# Place observador data under data_clean/observador
DATA_CLEAN_DIR_OBSERVADOR = os.path.join(DATA_CLEAN_DIR, 'observador/')
# Create dir if does not exist
Path(DATA_CLEAN_DIR_OBSERVADOR).mkdir(parents=True, exist_ok=True)

observador_news_clean_path = os.path.join(DATA_CLEAN_DIR_OBSERVADOR, 'news_observador.json')
write_df_to_json(df_observador_lb, observador_news_clean_path)

### EN

#### CNN
Here we have to process the two JSON files separately, because news_cnn.json does not contain the news in kms_cnn.json.

In [None]:
noisy_strs_title_cnn = [
                        'Follow live updates',
                        'Go here for latest updates',
                        'What you need to know',
                        'the latest on the pandemic',
                        'Watch the entire CNN coronavirus town hall',
                        'coronavirus town hall has ended',
                        'global town hall on coronavirus will start soon',
                        'the latest coronavirus update',
                        'the latest coronavirus numbers',
                        'what you may have missed',
                        'Catch up:'
]

noisy_strs_text_cnn = [
                        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                        r'pic.twitter.com/[a-zA-Z0-9]+(\s|$)'
]

In [None]:
# Load json datasets into pandas dataframes
df_cnn_km = pd.read_json(KMS_CNN)
df_cnn_lb = pd.read_json(NEWS_CNN)

In [None]:
# Preprocess datasets
df_cnn_km = pre_proc(df_cnn_km, noisy_strs_title_cnn, noisy_strs_text_cnn)
df_cnn_lb = pre_proc(df_cnn_lb, noisy_strs_title_cnn, noisy_strs_text_cnn)

In [None]:
df_cnn_km.describe()

In [None]:
df_cnn_km.head()

In [None]:
df_cnn_lb.describe()

In [None]:
df_cnn_lb.head()

In [None]:
# Place cnn data under data_clean/cnn
DATA_CLEAN_DIR_CNN = os.path.join(DATA_CLEAN_DIR, 'cnn/')
# Create dir if does not exist
Path(DATA_CLEAN_DIR_CNN).mkdir(parents=True, exist_ok=True)

cnn_news_clean_path = os.path.join(DATA_CLEAN_DIR_CNN, 'news_cnn.json')
write_df_to_json(df_cnn_lb, cnn_news_clean_path)

cnn_km_clean_path = os.path.join(DATA_CLEAN_DIR_CNN, 'kms_cnn.json')
write_df_to_json(df_cnn_km, cnn_km_clean_path)

#### Guardian

In [None]:
noisy_strs_title_guardian = [
                        'Summary',
                        'Key developments in the global coronavirus',
                        'What we know so far'
]

noisy_strs_text_guardian = [
                        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                        r'Hi everyone, this is(.*)',
                        r'Hi, Helen Sullivan(.*)',
                        r'Good evening from(.*)',
                        r'We’ve launched a(.*)',
                        r'We’ve fired up a(.*)',
                        r'That’s it for this blog(.*)',
                        r'Read more.*$',
                        r'More info.*$',
                        r'pic.twitter.com/[a-zA-Z0-9]+(\s|$)'
]

In [None]:
# Load json datasets into pandas dataframes
df_guardian_lb = pd.read_json(NEWS_GUARDIAN)

In [None]:
# Preprocess datasets
df_guardian_lb = pre_proc(df_guardian_lb, noisy_strs_title_guardian, noisy_strs_text_guardian)

In [None]:
df_guardian_lb.describe()

In [None]:
df_guardian_lb.head()

In [None]:
# Place guardian data under data_clean/guardian
DATA_CLEAN_DIR_GUARDIAN = os.path.join(DATA_CLEAN_DIR, 'guardian/')
# Create dir if does not exist
Path(DATA_CLEAN_DIR_GUARDIAN).mkdir(parents=True, exist_ok=True)

guardian_news_clean_path = os.path.join(DATA_CLEAN_DIR_GUARDIAN, 'news_guardian.json')
write_df_to_json(df_guardian_lb, guardian_news_clean_path)

### Zip data and save to GDrive

In [None]:
zip_file_name = 'data-clean_' + datetime.today().strftime("%Y-%m-%d") + '.zip'

!zip -r $zip_file_name $DATA_CLEAN_DIR

In [None]:
from google.colab import drive

DRIVE_MOUNT_DIR = '/content/drive'
drive.mount(DRIVE_MOUNT_DIR)

In [None]:
DATA_STORE_DIR = os.path.join(DRIVE_MOUNT_DIR, 'MyDrive/inesc/tls_covid')

!cp -r $zip_file_name $DATA_STORE_DIR

In [None]:
!cp -r drive/MyDrive/inesc/tls_covid/data-clean_2020-12-31.zip .
!unzip data-clean_2020-12-31.zip

## Generate TLS dataset

[Timeline17](http://www.l3s.de/~gtran/timeline/) structure

Use key moments to identify entities and keywords of interest. We refer to the set of entities and keywords as topics.

### Requirements

In [None]:
# Not necessary to run this cell
# Just to check which GPU is allocated (P100 > T4 > P4 > K80) and CUDA version
!nvidia-smi

In [None]:
!pip install -U pandas
!pip install -U spacy[cuda101]
!python -m spacy download en_core_web_sm
!python -m spacy download pt_core_news_sm
!pip install git+https://github.com/LIAAD/yake
!pip install unidecode
!pip install syntok

In [None]:
import os
import json
import re
from pathlib import Path

import pandas as pd
from tqdm import tqdm
tqdm.pandas()

import spacy
spacy_gpu = spacy.prefer_gpu()
print('spaCy GPU enabled:', spacy_gpu)
import pt_core_news_sm
import en_core_web_sm

import yake

import unidecode

import syntok.segmenter as segmenter

In [None]:
pd.set_option('display.max_colwidth', 512)

In [None]:
# Settings for the YAKE keyword extractor.
# 'language' is filled in later, right before each extractor is created.
# NOTE(review): the cells that build the extractors only pass 'language' and
# 'numOfKeywords' to yake.KeywordExtractor; the other entries are not read
# there, so YAKE presumably uses its own defaults for them - confirm intended.
# NOTE(review): 'deduplication_thresold' looks like a misspelling of
# "threshold"; the key is left untouched because it is runtime data.
YAKE_PARAMS = {
    'language': '',
    'max_ngram_size': 4,
    'deduplication_thresold': 0.9,
    'deduplication_algo': 'seqm',
    'windowSize': 1,
    'numOfKeywords': 3
}

LAN_PT = 'pt'
LAN_EN = 'en'

In [None]:
# The root directory where the datasets are placed
DATA_DIR = 'data_clean/'

# Create python vars with the datasets location
DATA_DIR_PUBLICO = os.path.join(DATA_DIR, 'publico/')
NEWS_PUBLICO = os.path.join(DATA_DIR_PUBLICO, 'news_publico.json')

DATA_DIR_OBSERVADOR = os.path.join(DATA_DIR, 'observador/')
NEWS_OBSERVADOR = os.path.join(DATA_DIR_OBSERVADOR, 'news_observador.json')

DATA_DIR_CNN = os.path.join(DATA_DIR, 'cnn/')
NEWS_CNN = os.path.join(DATA_DIR_CNN, 'news_cnn.json')
KMS_CNN = os.path.join(DATA_DIR_CNN, 'kms_cnn.json')

DATA_DIR_GUARDIAN = os.path.join(DATA_DIR, 'guardian/')
NEWS_GUARDIAN = os.path.join(DATA_DIR_GUARDIAN, 'news_guardian.json')

### Functions

In [None]:
def concatenate_title_text(df_news):
  """Add a 'title_and_text' column and drop the rows that repeat it.

  The column holds '<title>. <text>'. If building it raises a KeyError
  (e.g. the frame has no 'text' column) the title alone is used instead.
  The column is added to the frame that was passed in; the returned frame is
  the result of drop_duplicates on that column, keeping the first occurrence.
  """
  try:
    combined = df_news['title'] + '. ' + df_news['text']
  except KeyError:
    combined = df_news['title']
  df_news['title_and_text'] = combined

  return df_news.drop_duplicates(subset=['title_and_text'])

def get_ents(nlp, text):
  """Return the distinct, lower-cased entity texts that `nlp` finds in `text`.

  Duplicates are collapsed because only the presence of an entity matters,
  not how many times it appears. The result comes from a set, so its order
  is not defined. If a TypeError is raised along the way an empty list is
  returned.
  """
  try:
    unique_ents = {ent.text.lower() for ent in nlp(text).ents}
  except TypeError:
    return []

  return list(unique_ents)

def get_kws(kw_extractor, text):
  """Return the distinct keywords that `kw_extractor` extracts from `text`.

  Only the first element of each item returned by extract_keywords is kept
  (the score is dropped). The result comes from a set, so its order is not
  defined. A ValueError or AttributeError raised along the way yields an
  empty list.
  """
  try:
    scored_kws = kw_extractor.extract_keywords(text)
    unique_kws = {scored[0] for scored in scored_kws}
  except (ValueError, AttributeError):
    return []

  return list(unique_kws)

def parse_name(text):
  """Turn a topic into a name for files and directories.

  Spaces become underscores, the result is lower-cased and then passed
  through unidecode.
  """
  underscored = text.replace(' ', '_')
  return unidecode.unidecode(underscored.lower())

def write_json(file_path, json_data):
    """Dump `json_data` to `file_path` as UTF-8 JSON.

    The output is indented by 4 spaces and non-ASCII characters are written
    as-is rather than escaped. An existing file is overwritten.
    """
    with open(file_path, mode='w', encoding='utf8') as out_file:
        json.dump(json_data, out_file, indent=4, ensure_ascii=False)

In [None]:
# Functions related to writing the datasets

# Valid languages
LANGUAGES = ['pt', 'en']


def get_sents(text):
  """Split `text` into sentences with the syntok segmenter.

  Returns one flat list with a string per sentence, across all paragraphs.
  Each sentence is rebuilt from its token values, every token followed by a
  single space, so each string ends with a trailing space.
  """
  return [
      ''.join(token.value + ' ' for token in sentence)
      for paragraph in segmenter.process(text)
      for sentence in paragraph
  ]


def get_inputDocs(df, topics):
  """Group the news of `df` by topic and by date.

  Args:
    df: DataFrame with the columns 'date', 'title', 'text', 'url' and
      'topics'. Each 'topics' cell must support the `in` operator.
    topics: list of lists. Each inner list holds a topic followed by its
      synonyms; the first element gives the group its name.

  Returns:
    A dict {topic_name: {date: [(title, text, url), ...]}}. A news item
    belongs to a group when any topic of the group is in its 'topics' cell.
    Rows are taken in reverse frame order, one synonym after the other.
  """
  news_cols = ['date', 'title', 'text', 'url']
  input_docs = {}

  for i, topic in enumerate(topics):
    print(str(i+1) + '/' + str(len(topics)))
    print(topic)
    topic_name = topic[0]

    # One reversed selection per topic/synonym. Only the 'topics' column is
    # inspected, so the mask is built from that column instead of from every
    # full row. astype(bool) keeps the mask boolean when df has no rows.
    dfs = []
    for st in topic:
      mask = df['topics'].apply(lambda news_topics: st in news_topics).astype(bool)
      dfs.append(df.loc[mask, news_cols].iloc[::-1])

    if len(dfs) == 1:
      # Single topic, without synonyms
      df_loc = dfs[0]
    else:
      # Merge synonyms and remove the duplicates the merge may create
      df_loc = pd.concat(dfs).drop_duplicates(subset=news_cols)

    # Convert to format {date: [list_news]}
    by_date = {}
    for news in df_loc.to_dict(orient='records'):
      by_date.setdefault(news['date'], []).append((news['title'], news['text'], news['url']))

    input_docs[topic_name] = by_date

  return input_docs


def write_inputDocs(dataset, lan, source_name, with_json=False):
  """Write the input documents of every topic to disk, one file per news.

  Text files go to
  dataset_<lan>/txt/<topic>_<source_name>/input_docs/<date>/<n>.txt with one
  sentence per line. With `with_json` the same sentences are also stored as
  a JSON list under dataset_<lan>/json/.../<n>.json.

  Args:
    dataset: dict {topic: {date: [(title, text, url), ...]}}.
    lan: language code; when it is not in LANGUAGES a message is printed and
      nothing is written.
    source_name: appended to the topic directory name after an underscore.
    with_json: also write the JSON variant.
  """
  if lan not in LANGUAGES:
    print('Please provide a valid language')
    return

  dataset_dir = 'dataset_' + lan

  dataset_dir_txt = os.path.join(dataset_dir, 'txt/')
  Path(dataset_dir_txt).mkdir(parents=True, exist_ok=True)

  if with_json:
    dataset_dir_json = os.path.join(dataset_dir, 'json/')
    Path(dataset_dir_json).mkdir(parents=True, exist_ok=True)

  source_name_suffix = '_' + source_name

  for i, topic in enumerate(dataset):
    print(str(i+1) + '/' + str(len(dataset)))
    print(topic)

    topic_dir_name = parse_name(topic) + source_name_suffix

    # TXT
    ent_inputdocs_txt = os.path.join(dataset_dir_txt, topic_dir_name, 'input_docs/')
    Path(ent_inputdocs_txt).mkdir(parents=True, exist_ok=True)

    if with_json:
      # JSON
      ent_inputdocs_json = os.path.join(dataset_dir_json, topic_dir_name, 'input_docs/')
      Path(ent_inputdocs_json).mkdir(parents=True, exist_ok=True)

    for date, news in dataset[topic].items():

      day_dir_txt = os.path.join(ent_inputdocs_txt, date)
      Path(day_dir_txt).mkdir(parents=True, exist_ok=True)

      if with_json:
        day_dir_json = os.path.join(ent_inputdocs_json, date)
        Path(day_dir_json).mkdir(parents=True, exist_ok=True)

      for counter, n in enumerate(news):
        # n[1] is text; segment it once and reuse it for both formats
        news_sents = get_sents(n[1])

        # Overwrite instead of append so that re-running the cell does not
        # duplicate the sentences, and pin the encoding so that non-ASCII
        # text does not depend on the platform default (write_json uses
        # utf8 as well).
        file_path_txt = os.path.join(day_dir_txt, str(counter) + '.txt')
        with open(file_path_txt, 'w', encoding='utf8') as txt_file:
          txt_file.writelines(ns + '\n' for ns in news_sents)

        if with_json:
          file_path_json = os.path.join(day_dir_json, str(counter) + '.json')
          write_json(file_path_json, news_sents)
        

def get_timelines(df, topics):
  """Group the key-moment news of `df` by topic and by date.

  Args:
    df: DataFrame with the columns 'date', 'title', 'text', 'url' and
      'topics'. Each 'topics' cell must support the `in` operator.
    topics: list of lists. Each inner list holds a topic followed by its
      synonyms; the first element gives the group its name.

  Returns:
    A dict {topic_name: {date: [(title, text, url), ...]}}. A news item
    belongs to a group when any topic of the group is in its 'topics' cell.
    Rows are taken in reverse frame order, one synonym after the other.
  """
  news_cols = ['date', 'title', 'text', 'url']
  timelines = {}

  for i, topic in enumerate(topics):
    print(str(i+1) + '/' + str(len(topics)))
    print(topic)
    topic_name = topic[0]

    # One reversed selection per topic/synonym. Only the 'topics' column is
    # inspected, so the mask is built from that column instead of from every
    # full row. astype(bool) keeps the mask boolean when df has no rows.
    dfs = []
    for st in topic:
      mask = df['topics'].apply(lambda news_topics: st in news_topics).astype(bool)
      dfs.append(df.loc[mask, news_cols].iloc[::-1])

    if len(dfs) == 1:
      # Single topic, without synonyms
      df_loc = dfs[0]
    else:
      # Merge synonyms and remove the duplicates the merge may create
      df_loc = pd.concat(dfs).drop_duplicates(subset=news_cols)

    # Convert to format {date: [list_news]}
    by_date = {}
    for news in df_loc.to_dict(orient='records'):
      by_date.setdefault(news['date'], []).append((news['title'], news['text'], news['url']))

    timelines[topic_name] = by_date

  return timelines


def write_timelines(dataset, lan, source_name, with_json=False):
  """Write one timeline file per topic, built from the news titles.

  The text file dataset_<lan>/txt/<topic>_<source_name>/timelines/<topic>.txt
  holds, for every date, the date, the titles of that date (one per line)
  and a delimiter line. With `with_json` a {date: [titles]} JSON file is
  written to the matching directory under dataset_<lan>/json/.

  Args:
    dataset: dict {topic: {date: [(title, text, url), ...]}}.
    lan: language code; when it is not in LANGUAGES a message is printed and
      nothing is written.
    source_name: appended to the topic directory name after an underscore.
    with_json: also write the JSON variant.
  """
  if lan not in LANGUAGES:
    print('Please provide a valid language')
    return

  dataset_dir = 'dataset_' + lan

  dataset_dir_txt = os.path.join(dataset_dir, 'txt/')
  Path(dataset_dir_txt).mkdir(parents=True, exist_ok=True)

  if with_json:
    dataset_dir_json = os.path.join(dataset_dir, 'json/')
    Path(dataset_dir_json).mkdir(parents=True, exist_ok=True)

  source_name_suffix = '_' + source_name

  txt_delimiter = '--------------------------------'

  for i, topic in enumerate(dataset):
    print(str(i+1) + '/' + str(len(dataset)))
    print(topic)

    file_name = parse_name(topic)
    topic_timeline = dataset[topic]

    # TXT
    ent_timelines_keymoments_txt = os.path.join(dataset_dir_txt, file_name + source_name_suffix, 'timelines')
    Path(ent_timelines_keymoments_txt).mkdir(parents=True, exist_ok=True)

    file_path_txt = os.path.join(ent_timelines_keymoments_txt, file_name + '.txt')

    # As before, no file is created for a topic without dates. The file is
    # opened once and overwritten, so re-running the cell does not duplicate
    # the timeline, and the encoding is pinned so that non-ASCII titles do
    # not depend on the platform default.
    if topic_timeline:
      with open(file_path_txt, 'w', encoding='utf8') as txt_file:
        for date, news_list in topic_timeline.items():
          txt_file.write(date + '\n')
          for news in news_list:
            # news[0] is title
            txt_file.write(news[0] + '\n')
          txt_file.write(txt_delimiter + '\n')

    if with_json:
      # JSON
      ent_timelines_keymoments_json = os.path.join(dataset_dir_json, file_name + source_name_suffix, 'timelines')
      Path(ent_timelines_keymoments_json).mkdir(parents=True, exist_ok=True)

      file_path_json = os.path.join(ent_timelines_keymoments_json, file_name + '.json')

      json_data_to_write = {
          date: [news[0] for news in news_list]
          for date, news_list in topic_timeline.items()
      }

      write_json(file_path_json, json_data_to_write)


def data_to_cmh(data, source, lan):
  """Flatten {topic: {date: [(title, text, url), ...]}} into a list of records.

  Every news item becomes a dict with the keys 'source', 'lan', 'topic',
  'date' (converted with str), 'title', 'news' and 'url'. Nothing is written
  to disk here; the caller decides what to do with the list.
  """
  return [
      {
          'source': source,
          'lan': lan,
          'topic': topic,
          'date': str(date),
          'title': item[0],
          'news': item[1],
          'url': item[2],
      }
      for topic, by_date in data.items()
      for date, items in by_date.items()
      for item in items
  ]

### PT

In [None]:
# Load Spacy pt ner model
nlp_ner_pt = pt_core_news_sm.load(disable=['tagger', 'parser'])

# YAKE pt
YAKE_PARAMS['language'] = LAN_PT

kw_extractor_pt = yake.KeywordExtractor(
            lan=YAKE_PARAMS['language'], top=YAKE_PARAMS['numOfKeywords'])

#### Publico

In [None]:
df_publico_lb = pd.read_json(NEWS_PUBLICO)

# Create a new column with the concatenation of title and text (used to find topics)
df_publico_lb = concatenate_title_text(df_publico_lb)

##### Get topics

In [None]:
print('Getting spaCy entities')
df_publico_lb['ents'] = df_publico_lb['title_and_text'].progress_apply(lambda t: get_ents(nlp_ner_pt, t))

print('Getting yake keywords')
df_publico_lb['kws'] = df_publico_lb['title_and_text'].progress_apply(lambda t: get_kws(kw_extractor_pt, t))

print('Merging entities and keywords')
df_publico_lb['topics'] = df_publico_lb.progress_apply(lambda r: list(set(r['ents'] + r['kws'])), axis=1)

#### Observador

In [None]:
df_observador_lb = pd.read_json(NEWS_OBSERVADOR)

# Create a new column with the concatenation of title and text (used to find topics)
df_observador_lb = concatenate_title_text(df_observador_lb)

##### Get topics

In [None]:
print('Getting spaCy entities')
df_observador_lb['ents'] = df_observador_lb['title_and_text'].progress_apply(lambda t: get_ents(nlp_ner_pt, t))

print('Getting yake keywords')
df_observador_lb['kws'] = df_observador_lb['title_and_text'].progress_apply(lambda t: get_kws(kw_extractor_pt, t))

print('Merging entities and keywords')
df_observador_lb['topics'] = df_observador_lb.progress_apply(lambda r: list(set(r['ents'] + r['kws'])), axis=1)

#### Topics selection

##### Count topic occurrences

In [None]:
# Topics are chosen based on their occurrence in key moments.
# Explicit copies are taken because the 'date' column of these frames is
# reassigned in the timelines section; assigning into a .loc selection of
# another frame triggers pandas' SettingWithCopyWarning.
# NOTE(review): 'is_km' is compared with the string 'True', not the boolean
# True - confirm that this is how the flag is stored in the JSON files.

df_publico_km = df_publico_lb.loc[df_publico_lb['is_km'] == 'True'].copy()

df_observador_km = df_observador_lb.loc[df_observador_lb['is_km'] == 'True'].copy()

In [None]:
# For each source create a list of lists with topics. Each inner list corresponds to a news. Used to keep track of the number of news in which each topic appears

publico_topics_per_news_km = df_publico_km.topics.tolist()

observador_topics_per_news_km = df_observador_km.topics.tolist()

In [None]:
# List topics for Portuguese

publico_km_topics = set([a for b in publico_topics_per_news_km for a in b])

observador_km_topics = set([a for b in observador_topics_per_news_km for a in b])

pt_km_topics = list(set(publico_km_topics | observador_km_topics))

In [None]:
# For each topic count the number of key moments per source in which it appears

from collections import Counter

# Count every topic once per key moment in a single pass over the news,
# instead of re-scanning all the news lists for each topic.
publico_km_counter = Counter(
    topic for list_of_topics in publico_topics_per_news_km for topic in set(list_of_topics))
observador_km_counter = Counter(
    topic for list_of_topics in observador_topics_per_news_km for topic in set(list_of_topics))

pt_km_topics_count = {}

for topic in tqdm(pt_km_topics):

  # A Counter returns 0 for a topic that a source never mentions
  pt_km_topics_count[topic] = {
      'publico': publico_km_counter[topic],
      'observador': observador_km_counter[topic]
  }

len(pt_km_topics_count)

In [None]:
# Filter topics by minimum number of occurrences

MIN_NUM_KEY_MOMENTS = 5

for topic, counts in list(pt_km_topics_count.items()):

  # delete topic if there are less than MIN_NUM_KEY_MOMENTS key moments in one of the sources
  if (counts['publico'] < MIN_NUM_KEY_MOMENTS or counts['observador'] < MIN_NUM_KEY_MOMENTS):
    del pt_km_topics_count[topic]

len(pt_km_topics_count)

##### Keymoments/liveblog ratio

In [None]:
# For each source create a list of lists with topics. Each inner list corresponds to a news item. Used to keep track of the number of news in which each topic appears

publico_topics_per_news_lb = df_publico_lb.topics.tolist()

observador_topics_per_news_lb = df_observador_lb.topics.tolist()

In [None]:
# For each topic compute, per source, the ratio between its occurrences in key moments and in the whole liveblog

from collections import Counter

# Count every topic once per liveblog news in a single pass, instead of
# re-scanning the whole liveblog for each topic.
publico_lb_counter = Counter(
    topic for list_of_topics in publico_topics_per_news_lb for topic in set(list_of_topics))
observador_lb_counter = Counter(
    topic for list_of_topics in observador_topics_per_news_lb for topic in set(list_of_topics))

ratio_dict_pt = {}

for topic in tqdm(pt_km_topics_count):

  # Occurrences in liveblog (0 when the source never mentions the topic)
  publico_lb_count = publico_lb_counter[topic]
  observador_lb_count = observador_lb_counter[topic]

  publico_km_count = pt_km_topics_count[topic]['publico']
  observador_km_count = pt_km_topics_count[topic]['observador']

  inner_dict = {}

  inner_dict['publico_lb_count'] = publico_lb_count
  inner_dict['publico_km_count'] = publico_km_count
  # Avoid dividing by zero when the topic is absent from the liveblog
  if publico_lb_count < 1:
    inner_dict['publico_ratio_km_lb'] = 0
  else:
    inner_dict['publico_ratio_km_lb'] = publico_km_count/publico_lb_count

  inner_dict['observador_lb_count'] = observador_lb_count
  inner_dict['observador_km_count'] = observador_km_count
  if observador_lb_count < 1:
    inner_dict['observador_ratio_km_lb'] = 0
  else:
    inner_dict['observador_ratio_km_lb'] = observador_km_count/observador_lb_count

  ratio_dict_pt[topic] = inner_dict

len(ratio_dict_pt)

In [None]:
RATIO = 0.5

for topic, counts in list(ratio_dict_pt.items()):
  if counts['publico_ratio_km_lb'] > RATIO or counts['observador_ratio_km_lb'] > RATIO:
    del ratio_dict_pt[topic]

len(ratio_dict_pt)

In [None]:
rdp = pd.DataFrame.from_dict(ratio_dict_pt, orient='index')
rdp = rdp.sort_index(ascending=True)
rdp.to_excel("ratio_dict_pt.xlsx")

In [None]:
# Sort ratio dict by key (topic)
ratio_dict_pt = {k: ratio_dict_pt[k] for k in sorted(ratio_dict_pt)}

# Use list of lists to handle synonyms
topics_pt = [[t] for t in list(ratio_dict_pt.keys())]
print(sum(len(t) for t in topics_pt))

In [None]:
write_json('ratio_dict_pt.json', ratio_dict_pt)

with open('topics_pt.txt', 'w') as f:
  for topic in topics_pt:
    f.write("%s\n" % topic[0])

##### Filter queries manually

In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
with open('topics_pt.txt', 'r') as fp:
  # Skip blank lines (e.g. a trailing empty line left while editing the
  # file); they would otherwise turn into an empty topic.
  topics_pt = [line.rstrip() for line in fp if line.strip()]

# There may be one topic per line, or multiple topics (synonyms) per line separated by comma
# NOTE(review): synonyms are not stripped, so "a, b" yields ' b' - keep the file free of spaces after commas
topics_pt = [t.split(',') for t in topics_pt]

# Check if all topics exist in the original topics identification
for topics in topics_pt:
  for t in topics:
    if t not in ratio_dict_pt:
      print(t + ' does not exist')

print(sum(len(t) for t in topics_pt))
print(len(topics_pt))
print(topics_pt)

#### Write dataset

##### Input Docs

News texts from whole liveblog

In [None]:
# Remove time from date and convert it to string

df_publico_lb['date'] = pd.to_datetime(df_publico_lb['date']).dt.date.astype(str)

df_observador_lb['date'] = pd.to_datetime(df_observador_lb['date']).dt.date.astype(str)

In [None]:
inputDocs_publico = get_inputDocs(df_publico_lb, topics_pt)

inputDocs_observador = get_inputDocs(df_observador_lb, topics_pt)

In [None]:
# Generate TLS dataset

print('Writing publico input docs dataset')
write_inputDocs(inputDocs_publico, 'pt', 'publico', with_json=False)

print('Writing observador input docs dataset')
write_inputDocs(inputDocs_observador, 'pt', 'observador', with_json=False)

In [None]:
# Generate CMH dataset

publico_lb_json = data_to_cmh(inputDocs_publico, 'publico', 'pt')
write_json('publico_lb.json', publico_lb_json)

observador_lb_json = data_to_cmh(inputDocs_observador, 'observador', 'pt')
write_json('observador_lb.json', observador_lb_json)

##### Timelines

News titles from key moments

In [None]:
# Remove time from date and convert it to string

df_publico_km['date'] = pd.to_datetime(df_publico_km['date']).dt.date.astype(str)

df_observador_km['date'] = pd.to_datetime(df_observador_km['date']).dt.date.astype(str)

In [None]:
timelines_publico = get_timelines(df_publico_km, topics_pt)

timelines_observador = get_timelines(df_observador_km, topics_pt)

In [None]:
# Generate TLS dataset

print('Writing publico timeline dataset')
write_timelines(timelines_publico, 'pt', 'publico', with_json=False)

print('Writing observador timeline dataset')
write_timelines(timelines_observador, 'pt', 'observador', with_json=False)

In [None]:
# Generate CMH dataset

publico_km_json = data_to_cmh(timelines_publico, 'publico', 'pt')
write_json('publico_km.json', publico_km_json)

observador_km_json = data_to_cmh(timelines_observador, 'observador', 'pt')
write_json('observador_km.json', observador_km_json)

In [None]:
from datetime import datetime

# Save data to drive
TLS_DATA_DIR = 'dataset_pt/'

zip_file_name = 'dataset_pt-' + datetime.today().strftime("%Y-%m-%d") + '.zip'

!zip -r $zip_file_name $TLS_DATA_DIR


DATA_STORE_DIR = os.path.join(DRIVE_MOUNT_DIR, 'MyDrive/inesc/tls_covid')

!cp -r $zip_file_name $DATA_STORE_DIR

### EN

In [None]:
# Load Spacy en ner model
nlp_ner_en = en_core_web_sm.load(disable=['tagger', 'parser'])

# YAKE en
YAKE_PARAMS['language'] = LAN_EN

kw_extractor_en = yake.KeywordExtractor(
            lan=YAKE_PARAMS['language'], top=YAKE_PARAMS['numOfKeywords'])

#### CNN

In [None]:
df_cnn_lb = pd.read_json(NEWS_CNN)

# Create a new column with the concatenation of title and text (used to find topics)
df_cnn_lb = concatenate_title_text(df_cnn_lb)

df_cnn_km = pd.read_json(KMS_CNN)

##### Get topics

In [None]:
print('Getting spaCy entities')
df_cnn_km['ents'] = df_cnn_km['title'].progress_apply(lambda t: get_ents(nlp_ner_en, t))

print('Getting yake keywords')
df_cnn_km['kws'] = df_cnn_km['title'].progress_apply(lambda t: get_kws(kw_extractor_en, t))

print('Merging entities and keywords')
df_cnn_km['topics'] = df_cnn_km.progress_apply(lambda r: list(set(r['ents'] + r['kws'])), axis=1)

In [None]:
print('Getting spaCy entities')
df_cnn_lb['ents'] = df_cnn_lb['title_and_text'].progress_apply(lambda t: get_ents(nlp_ner_en, t))

print('Getting yake keywords')
df_cnn_lb['kws'] = df_cnn_lb['title_and_text'].progress_apply(lambda t: get_kws(kw_extractor_en, t))

print('Merging entities and keywords')
df_cnn_lb['topics'] = df_cnn_lb.progress_apply(lambda r: list(set(r['ents'] + r['kws'])), axis=1)

#### Guardian

In [None]:
df_guardian_lb = pd.read_json(NEWS_GUARDIAN)

# Create a new column with the concatenation of title and text (used to find topics)
df_guardian_lb = concatenate_title_text(df_guardian_lb)

##### Get topics

In [None]:
print('Getting spaCy entities')
df_guardian_lb['ents'] = df_guardian_lb['title_and_text'].progress_apply(lambda t: get_ents(nlp_ner_en, t))

print('Getting yake keywords')
df_guardian_lb['kws'] = df_guardian_lb['title_and_text'].progress_apply(lambda t: get_kws(kw_extractor_en, t))

print('Merging entities and keywords')
df_guardian_lb['topics'] = df_guardian_lb.progress_apply(lambda r: list(set(r['ents'] + r['kws'])), axis=1)

#### Topics selection

##### Count topic occurrences

In [None]:
# Topics are chosen based on their occurrence in key moments.

# CNN key moments were already loaded from their own file (KMS_CNN) into
# df_cnn_km, so there is nothing to select for CNN here.

# Guardian key moments are flagged inside the liveblog. An explicit copy is
# taken because the 'date' column of this frame is reassigned in the
# timelines section; assigning into a .loc selection of another frame
# triggers pandas' SettingWithCopyWarning.
# NOTE(review): 'is_km' is compared with the string 'True', not the boolean
# True - confirm that this is how the flag is stored in the JSON file.
df_guardian_km = df_guardian_lb.loc[df_guardian_lb['is_km'] == 'True'].copy()

In [None]:
# For each source create a list of lists with topics. Each inner list corresponds to a news. Used to keep track of the number of news in which each topic appears

cnn_topics_per_news_km = df_cnn_km.topics.tolist()

guardian_topics_per_news_km = df_guardian_km.topics.tolist()

In [None]:
# List topics for English

cnn_km_topics = set([a for b in cnn_topics_per_news_km for a in b])

guardian_km_topics = set([a for b in guardian_topics_per_news_km for a in b])

en_km_topics = list(set(cnn_km_topics | guardian_km_topics))

In [None]:
# For each topic count the number of key moments per source in which it appears

from collections import Counter

# Count every topic once per key moment in a single pass over the news,
# instead of re-scanning all the news lists for each topic.
cnn_km_counter = Counter(
    topic for list_of_topics in cnn_topics_per_news_km for topic in set(list_of_topics))
guardian_km_counter = Counter(
    topic for list_of_topics in guardian_topics_per_news_km for topic in set(list_of_topics))

en_km_topics_count = {}

for topic in tqdm(en_km_topics):

  # A Counter returns 0 for a topic that a source never mentions
  en_km_topics_count[topic] = {
      'cnn': cnn_km_counter[topic],
      'guardian': guardian_km_counter[topic]
  }

len(en_km_topics_count)

In [None]:
# Filter topics by minimum number of occurrences in key moments

MIN_NUM_KEY_MOMENTS = 5

for topic, counts in list(en_km_topics_count.items()):

  # delete topic if there are less than MIN_NUM_KEY_MOMENTS key moments in one of the sources
  if (counts['cnn'] < MIN_NUM_KEY_MOMENTS or counts['guardian'] < MIN_NUM_KEY_MOMENTS):
    del en_km_topics_count[topic]

len(en_km_topics_count)

##### Keymoments/liveblog ratio

In [None]:
# For each source create a list of lists with topics. Each inner list corresponds to a news item. Used to keep track of the number of news in which each topic appears

cnn_topics_per_news_lb = df_cnn_lb.topics.tolist()

guardian_topics_per_news_lb = df_guardian_lb.topics.tolist()

In [None]:
# For each topic compute, per source, the ratio between its occurrences in key moments and in the whole liveblog

from collections import Counter

# Count every topic once per liveblog news in a single pass, instead of
# re-scanning the whole liveblog for each topic.
cnn_lb_counter = Counter(
    topic for list_of_topics in cnn_topics_per_news_lb for topic in set(list_of_topics))
guardian_lb_counter = Counter(
    topic for list_of_topics in guardian_topics_per_news_lb for topic in set(list_of_topics))

ratio_dict_en = {}

for topic in tqdm(en_km_topics_count):

  # Occurrences in liveblog (0 when the source never mentions the topic)
  cnn_lb_count = cnn_lb_counter[topic]
  guardian_lb_count = guardian_lb_counter[topic]

  cnn_km_count = en_km_topics_count[topic]['cnn']
  guardian_km_count = en_km_topics_count[topic]['guardian']

  inner_dict = {}

  inner_dict['cnn_lb_count'] = cnn_lb_count
  inner_dict['cnn_km_count'] = cnn_km_count
  # Avoid dividing by zero when the topic is absent from the liveblog
  if cnn_lb_count < 1:
    inner_dict['cnn_ratio_km_lb'] = 0
  else:
    inner_dict['cnn_ratio_km_lb'] = cnn_km_count/cnn_lb_count

  inner_dict['guardian_lb_count'] = guardian_lb_count
  inner_dict['guardian_km_count'] = guardian_km_count
  if guardian_lb_count < 1:
    inner_dict['guardian_ratio_km_lb'] = 0
  else:
    inner_dict['guardian_ratio_km_lb'] = guardian_km_count/guardian_lb_count

  ratio_dict_en[topic] = inner_dict

len(ratio_dict_en)

In [None]:
RATIO = 0.5

for topic, counts in list(ratio_dict_en.items()):
  if counts['cnn_ratio_km_lb'] > RATIO or counts['guardian_ratio_km_lb'] > RATIO:
    del ratio_dict_en[topic]

len(ratio_dict_en)

In [None]:
rde = pd.DataFrame.from_dict(ratio_dict_en, orient='index')
rde = rde.sort_index(ascending=True)
rde.to_excel("ratio_dict_en.xlsx") 

In [None]:
# Sort ratio dict by key (topic)
ratio_dict_en = {k: ratio_dict_en[k] for k in sorted(ratio_dict_en)}

# Use list of lists to handle synonyms
topics_en = [[t] for t in list(ratio_dict_en.keys())]

In [None]:
write_json('ratio_dict_en.json', ratio_dict_en)

with open('topics_en.txt', 'w') as f:
  for topic in topics_en:
    f.write("%s\n" % topic[0])

##### Filter queries manually

In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
with open('topics_en.txt', 'r') as fp:
  # Skip blank lines (e.g. a trailing empty line left while editing the
  # file); they would otherwise turn into an empty topic.
  topics_en = [line.rstrip() for line in fp if line.strip()]

# There may be one topic per line, or multiple topics (synonyms) per line separated by comma
# NOTE(review): synonyms are not stripped, so "a, b" yields ' b' - keep the file free of spaces after commas
topics_en = [t.split(',') for t in topics_en]

# Check if all topics exist in the original topics identification
for topics in topics_en:
  for t in topics:
    if t not in ratio_dict_en:
      print(t + ' does not exist')

print(sum(len(t) for t in topics_en))
print(len(topics_en))
print(topics_en)

#### Write dataset

##### Input Docs

News texts from whole liveblog

In [None]:
# Remove time from date and convert it to string

df_cnn_lb['date'] = pd.to_datetime(df_cnn_lb['date']).dt.date.astype(str)

df_guardian_lb['date'] = pd.to_datetime(df_guardian_lb['date']).dt.date.astype(str)

In [None]:
inputDocs_cnn = get_inputDocs(df_cnn_lb, topics_en)

inputDocs_guardian = get_inputDocs(df_guardian_lb, topics_en)

In [None]:
# Generate TLS dataset

print('Writing cnn input docs dataset')
write_inputDocs(inputDocs_cnn, 'en', 'cnn', with_json=False)

print('Writing guardian input docs dataset')
write_inputDocs(inputDocs_guardian, 'en', 'guardian', with_json=False)

In [None]:
# Generate CMH dataset

cnn_lb_json = data_to_cmh(inputDocs_cnn, 'cnn', 'en')
write_json('cnn_lb.json', cnn_lb_json)

guardian_lb_json = data_to_cmh(inputDocs_guardian, 'guardian', 'en')
write_json('guardian_lb.json', guardian_lb_json)

##### Timelines

News titles from key moments

In [None]:
# Remove time from date and convert it to string

df_cnn_km['date'] = pd.to_datetime(df_cnn_km['date']).dt.date.astype(str)

df_guardian_km['date'] = pd.to_datetime(df_guardian_km['date']).dt.date.astype(str)

In [None]:
timelines_cnn = get_timelines(df_cnn_km, topics_en)

timelines_guardian = get_timelines(df_guardian_km, topics_en)

In [None]:
# Generate TLS dataset

print('Writing cnn timeline dataset')
write_timelines(timelines_cnn, 'en', 'cnn', with_json=False)

print('Writing guardian timeline dataset')
write_timelines(timelines_guardian, 'en', 'guardian', with_json=False)

In [None]:
# Generate CMH dataset

cnn_km_json = data_to_cmh(timelines_cnn, 'cnn', 'en')
write_json('cnn_km.json', cnn_km_json)

guardian_km_json = data_to_cmh(timelines_guardian, 'guardian', 'en')
write_json('guardian_km.json', guardian_km_json)

In [None]:
from datetime import datetime

# Save data to drive
TLS_DATA_DIR = 'dataset_en/'

zip_file_name = 'dataset_en-' + datetime.today().strftime("%Y-%m-%d") + '.zip'

!zip -r $zip_file_name $TLS_DATA_DIR


# DATA_STORE_DIR = os.path.join(DRIVE_MOUNT_DIR, 'MyDrive/inesc/tls_covid')

# !cp -r $zip_file_name $DATA_STORE_DIR

## Statistics

Statistics provided in paper

In [None]:
stats_dict = {}

def get_topics_freqs(list_of_topics, ratio_dict, lan):
  """Return {topic_name: frequency} for every topic group.

  A group is a list holding a topic and its synonyms; its frequency is
  stored under the first element. The frequency adds up, for every member
  of the group, the key-moment and liveblog counts of both sources of the
  language ('pt': publico and observador, 'en': cnn and guardian). For any
  other `lan` the frequency is 0.
  """
  count_keys_by_lan = {
      'pt': ('publico_km_count', 'publico_lb_count', 'observador_km_count', 'observador_lb_count'),
      'en': ('cnn_km_count', 'cnn_lb_count', 'guardian_km_count', 'guardian_lb_count'),
  }
  count_keys = count_keys_by_lan.get(lan, ())

  topic_freq = {}
  for topic in list_of_topics:
    # Aggregate all synonym topics into first one
    topic_freq[topic[0]] = sum(ratio_dict[t][key] for t in topic for key in count_keys)

  return topic_freq

def get_num_sents(text):
  """Return how many sentences the syntok segmenter finds in `text`.

  Sentences are counted across all paragraphs.
  """
  return sum(
      1
      for paragraph in segmenter.process(text)
      for _sentence in paragraph
  )

def count_sents_timelines(news_dict):
  """Total number of sentences in the first element of every news tuple.

  `news_dict` is walked as {topic: {date: [news, ...]}}; get_num_sents is
  applied to news[0] of each item and the results are added up.
  """
  return sum(
      get_num_sents(news[0])
      for by_date in news_dict.values()
      for news_list in by_date.values()
      for news in news_list
  )

def count_sents_inputDocs(news_dict):
  """Total number of sentences in the second element of every news tuple.

  `news_dict` is walked as {topic: {date: [news, ...]}}; get_num_sents is
  applied to news[1] of each item and the results are added up.
  """
  return sum(
      get_num_sents(news[1])
      for by_date in news_dict.values()
      for news_list in by_date.values()
      for news in news_list
  )

def get_dates_list(news_dict):
  """Return the dates of every topic as one flat list.

  `news_dict` is walked as {topic: {date: ...}}. A date shared by several
  topics appears once per topic.
  """
  return [date for by_date in news_dict.values() for date in by_date]

### WordClouds

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

#### PT

In [None]:
# Summed publico + observador counts of each Portuguese topic group, keyed by the group's first topic
topic_freq_pt = get_topics_freqs(topics_pt, ratio_dict_pt, 'pt')

In [None]:
# Build a word cloud from the Portuguese topic frequencies
wordcloud = WordCloud(
    background_color='white',
    max_font_size=100,
    width=512,
    height=512,
).generate_from_frequencies(topic_freq_pt)

# Show the cloud as an image with the axes hidden
plt.figure(figsize=(16, 9))
plt.imshow(wordcloud)
plt.axis("off")

#### EN

In [None]:
# Summed cnn + guardian counts of each English topic group, keyed by the group's first topic
topic_freq_en = get_topics_freqs(topics_en, ratio_dict_en, 'en')

In [None]:
# Build a word cloud from the English topic frequencies
wordcloud = WordCloud(
    background_color='white',
    min_font_size=20,
    relative_scaling=0,
    max_font_size=100,
    prefer_horizontal=1,
    scale=10,
    width=512,
    height=512,
).generate_from_frequencies(topic_freq_en)

# Show the cloud as an image with the axes hidden
plt.figure(figsize=(16, 9))
plt.imshow(wordcloud)
plt.axis("off")

### PT

In [None]:
# Topic groups, and total topic names once every group's synonyms are counted
num_topics_pt = len(topics_pt)
num_topics_pt_expanded = sum(map(len, topics_pt))

print(f'Number of topics in portuguese: {num_topics_pt} ({num_topics_pt_expanded})')

#### Publico

In [None]:
# Date of the last row of the publico liveblog frame
# NOTE(review): reported as the "first date", so the frame is presumably ordered newest-first - confirm
print('Publico first date:', df_publico_lb.iloc[-1].date)

In [None]:
# Liveblog (input documents): sentences per topic
num_sents_lb_publico = count_sents_inputDocs(inputDocs_publico)
avg_sents_topic_lb_publico = num_sents_lb_publico / num_topics_pt
print(f'Average sentences per topic in liveblog publico: {round(avg_sents_topic_lb_publico, 2)}')

# Liveblog: dates per topic (a date is counted once for each topic it appears under)
num_dates_lb_publico = len(get_dates_list(inputDocs_publico))
avg_dates_topic_lb_publico = num_dates_lb_publico / num_topics_pt
print(f'Average dates per topic in liveblog publico: {round(avg_dates_topic_lb_publico, 2)}')

# Liveblog: sentences per date
avg_sentences_dates_lb_publico = avg_sents_topic_lb_publico / avg_dates_topic_lb_publico
print(f'Average senteces/dates per topic in liveblog publico: {round(avg_sentences_dates_lb_publico, 2)}')

In [None]:
# Timeline: sentences per topic
num_sents_tl_publico = count_sents_timelines(timelines_publico)
avg_sents_topic_tl_publico = num_sents_tl_publico / num_topics_pt
print(f'Average sentences per topic in timeline publico: {round(avg_sents_topic_tl_publico, 2)}')

# Timeline: dates per topic (a date is counted once for each topic it appears under)
num_dates_tl_publico = len(get_dates_list(timelines_publico))
avg_dates_topic_tl_publico = num_dates_tl_publico / num_topics_pt
print(f'Average dates per topic in timeline publico: {round(avg_dates_topic_tl_publico, 2)}')

# Timeline: sentences per date
avg_sentences_dates_tl_publico = avg_sents_topic_tl_publico / avg_dates_topic_tl_publico
print(f'Average senteces/dates per topic in timeline publico: {round(avg_sentences_dates_tl_publico, 2)}')

In [None]:
# Sentence compression: timeline average as a percentage of the liveblog average
compression_sents_publico = (avg_sents_topic_tl_publico / avg_sents_topic_lb_publico) * 100
print(f'Compression of sentences publico: {round(compression_sents_publico, 2)}')

# Date compression: timeline average as a percentage of the liveblog average
compression_dates_publico = (avg_dates_topic_tl_publico / avg_dates_topic_lb_publico) * 100
print(f'Compression of dates publico: {round(compression_dates_publico, 2)}')

In [None]:
# Number of input docs: entries summed over every topic and date
n_inputDocs_publico = sum(
    len(news_list)
    for date_news in inputDocs_publico.values()
    for news_list in date_news.values()
)

print(f'Number of input docs in publico: {n_inputDocs_publico}')

In [None]:
# Number of timeline entries: entries summed over every topic and date
n_timelines_publico = sum(
    len(news_list)
    for date_news in timelines_publico.values()
    for news_list in date_news.values()
)

print(f'Number of timeline entries in publico: {n_timelines_publico}')

In [None]:
# Summary row for publico; averages and compression ratios are rounded to 2 decimals
stats_publico = dict(
    n_topics=num_topics_pt,
    lang='pt',
    n_inputDocs=n_inputDocs_publico,
    inputDocs_avg_sents=round(avg_sents_topic_lb_publico, 2),
    inputDocs_avg_dates=round(avg_dates_topic_lb_publico, 2),
    inputdocs_avg_sents_per_dates=round(avg_sentences_dates_lb_publico, 2),
    timeline_avg_sents=round(avg_sents_topic_tl_publico, 2),
    timeline_avg_dates=round(avg_dates_topic_tl_publico, 2),
    timeline_avg_sents_per_dates=round(avg_sentences_dates_tl_publico, 2),
    compression_sents=round(compression_sents_publico, 2),
    compression_dates=round(compression_dates_publico, 2),
)

stats_dict['publico'] = stats_publico

#### Observador

In [None]:
# Date of the last row of the observador liveblog frame
# NOTE(review): reported as the "first date", so the frame is presumably ordered newest-first - confirm
print('Observador first date:', df_observador_lb.iloc[-1].date)

In [None]:
# Liveblog (input documents): sentences per topic
num_sents_lb_observador = count_sents_inputDocs(inputDocs_observador)
avg_sents_topic_lb_observador = num_sents_lb_observador / num_topics_pt
print(f'Average sentences per topic in liveblog observador: {round(avg_sents_topic_lb_observador, 2)}')

# Liveblog: dates per topic (a date is counted once for each topic it appears under)
num_dates_lb_observador = len(get_dates_list(inputDocs_observador))
avg_dates_topic_lb_observador = num_dates_lb_observador / num_topics_pt
print(f'Average dates per topic in liveblog observador: {round(avg_dates_topic_lb_observador, 2)}')

# Liveblog: sentences per date
avg_sentences_dates_lb_observador = avg_sents_topic_lb_observador / avg_dates_topic_lb_observador
print(f'Average senteces/dates per topic in liveblog observador: {round(avg_sentences_dates_lb_observador, 2)}')

In [None]:
# Timeline: sentences per topic
num_sents_tl_observador = count_sents_timelines(timelines_observador)
avg_sents_topic_tl_observador = num_sents_tl_observador / num_topics_pt
print(f'Average sentences per topic in timeline observador: {round(avg_sents_topic_tl_observador, 2)}')

# Timeline: dates per topic (a date is counted once for each topic it appears under)
num_dates_tl_observador = len(get_dates_list(timelines_observador))
avg_dates_topic_tl_observador = num_dates_tl_observador / num_topics_pt
print(f'Average dates per topic in timeline observador: {round(avg_dates_topic_tl_observador, 2)}')

# Timeline: sentences per date
avg_sentences_dates_tl_observador = avg_sents_topic_tl_observador / avg_dates_topic_tl_observador
print(f'Average senteces/dates per topic in timeline observador: {round(avg_sentences_dates_tl_observador, 2)}')

In [None]:
# Sentence compression: timeline average as a percentage of the liveblog average
compression_sents_observador = (avg_sents_topic_tl_observador / avg_sents_topic_lb_observador) * 100
print(f'Compression of sentences observador: {round(compression_sents_observador, 2)}')

# Date compression: timeline average as a percentage of the liveblog average
compression_dates_observador = (avg_dates_topic_tl_observador / avg_dates_topic_lb_observador) * 100
print(f'Compression of dates observador: {round(compression_dates_observador, 2)}')

In [None]:
# Number of input docs: entries summed over every topic and date
n_inputDocs_observador = sum(
    len(news_list)
    for date_news in inputDocs_observador.values()
    for news_list in date_news.values()
)

print(f'Number of input docs in observador: {n_inputDocs_observador}')

In [None]:
# Number of timeline entries: entries summed over every topic and date
n_timelines_observador = sum(
    len(news_list)
    for date_news in timelines_observador.values()
    for news_list in date_news.values()
)

print(f'Number of timeline entries in observador: {n_timelines_observador}')

In [None]:
# Summary row for observador; averages and compression ratios are rounded to 2 decimals
stats_observador = dict(
    n_topics=num_topics_pt,
    lang='pt',
    n_inputDocs=n_inputDocs_observador,
    inputDocs_avg_sents=round(avg_sents_topic_lb_observador, 2),
    inputDocs_avg_dates=round(avg_dates_topic_lb_observador, 2),
    inputdocs_avg_sents_per_dates=round(avg_sentences_dates_lb_observador, 2),
    timeline_avg_sents=round(avg_sents_topic_tl_observador, 2),
    timeline_avg_dates=round(avg_dates_topic_tl_observador, 2),
    timeline_avg_sents_per_dates=round(avg_sentences_dates_tl_observador, 2),
    compression_sents=round(compression_sents_observador, 2),
    compression_dates=round(compression_dates_observador, 2),
)

stats_dict['observador'] = stats_observador

#### Total

In [None]:
# Portuguese totals: publico and observador combined
print(f'Number of input docs in portuguese: {n_inputDocs_publico + n_inputDocs_observador}')
print(f'Number of timelines in portuguese: {n_timelines_publico + n_timelines_observador}')

### EN

In [None]:
# Topic groups, and total topic names once every group's synonyms are counted
num_topics_en = len(topics_en)
num_topics_en_expanded = sum(map(len, topics_en))

print(f'Number of topics in english: {num_topics_en} ({num_topics_en_expanded})')

#### CNN

In [None]:
# Date of the last row of the cnn liveblog frame
# NOTE(review): reported as the "first date", so the frame is presumably ordered newest-first - confirm
print('CNN first date:', df_cnn_lb.iloc[-1].date)

In [None]:
# Liveblog (input documents): sentences per topic
num_sents_lb_cnn = count_sents_inputDocs(inputDocs_cnn)
avg_sents_topic_lb_cnn = num_sents_lb_cnn / num_topics_en
print(f'Average sentences per topic in liveblog cnn: {round(avg_sents_topic_lb_cnn, 2)}')

# Liveblog: dates per topic (a date is counted once for each topic it appears under)
num_dates_lb_cnn = len(get_dates_list(inputDocs_cnn))
avg_dates_topic_lb_cnn = num_dates_lb_cnn / num_topics_en
print(f'Average dates per topic in liveblog cnn: {round(avg_dates_topic_lb_cnn, 2)}')

# Liveblog: sentences per date
avg_sentences_dates_lb_cnn = avg_sents_topic_lb_cnn / avg_dates_topic_lb_cnn
print(f'Average senteces/dates per topic in liveblog cnn: {round(avg_sentences_dates_lb_cnn, 2)}')

In [None]:
# Timeline: sentences per topic
num_sents_tl_cnn = count_sents_timelines(timelines_cnn)
avg_sents_topic_tl_cnn = num_sents_tl_cnn / num_topics_en
print(f'Average sentences per topic in timeline cnn: {round(avg_sents_topic_tl_cnn, 2)}')

# Timeline: dates per topic (a date is counted once for each topic it appears under)
num_dates_tl_cnn = len(get_dates_list(timelines_cnn))
avg_dates_topic_tl_cnn = num_dates_tl_cnn / num_topics_en
print(f'Average dates per topic in timeline cnn: {round(avg_dates_topic_tl_cnn, 2)}')

# Timeline: sentences per date
avg_sentences_dates_tl_cnn = avg_sents_topic_tl_cnn / avg_dates_topic_tl_cnn
print(f'Average senteces/dates per topic in timeline cnn: {round(avg_sentences_dates_tl_cnn, 2)}')

In [None]:
# Sentence compression: timeline average as a percentage of the liveblog average
compression_sents_cnn = (avg_sents_topic_tl_cnn / avg_sents_topic_lb_cnn) * 100
print(f'Compression of sentences cnn: {round(compression_sents_cnn, 2)}')

# Date compression: timeline average as a percentage of the liveblog average
compression_dates_cnn = (avg_dates_topic_tl_cnn / avg_dates_topic_lb_cnn) * 100
print(f'Compression of dates cnn: {round(compression_dates_cnn, 2)}')

In [None]:
# Number of input docs: entries summed over every topic and date
n_inputDocs_cnn = sum(
    len(news_list)
    for date_news in inputDocs_cnn.values()
    for news_list in date_news.values()
)

print(f'Number of input docs in cnn: {n_inputDocs_cnn}')

In [None]:
# Number of timeline entries: entries summed over every topic and date
n_timelines_cnn = sum(
    len(news_list)
    for date_news in timelines_cnn.values()
    for news_list in date_news.values()
)

print(f'Number of timeline entries in cnn: {n_timelines_cnn}')

In [None]:
# Summary row for cnn; averages and compression ratios are rounded to 2 decimals
stats_cnn = dict(
    n_topics=num_topics_en,
    lang='en',
    n_inputDocs=n_inputDocs_cnn,
    inputDocs_avg_sents=round(avg_sents_topic_lb_cnn, 2),
    inputDocs_avg_dates=round(avg_dates_topic_lb_cnn, 2),
    inputdocs_avg_sents_per_dates=round(avg_sentences_dates_lb_cnn, 2),
    timeline_avg_sents=round(avg_sents_topic_tl_cnn, 2),
    timeline_avg_dates=round(avg_dates_topic_tl_cnn, 2),
    timeline_avg_sents_per_dates=round(avg_sentences_dates_tl_cnn, 2),
    compression_sents=round(compression_sents_cnn, 2),
    compression_dates=round(compression_dates_cnn, 2),
)

stats_dict['cnn'] = stats_cnn

#### Guardian

In [None]:
# Date of the last row of the guardian liveblog frame
# NOTE(review): reported as the "first date", so the frame is presumably ordered newest-first - confirm
print('Guardian first date:', df_guardian_lb.iloc[-1].date)

In [None]:
# Liveblog (input documents): sentences per topic
num_sents_lb_guardian = count_sents_inputDocs(inputDocs_guardian)
avg_sents_topic_lb_guardian = num_sents_lb_guardian / num_topics_en
print(f'Average sentences per topic in liveblog guardian: {round(avg_sents_topic_lb_guardian, 2)}')

# Liveblog: dates per topic (a date is counted once for each topic it appears under)
num_dates_lb_guardian = len(get_dates_list(inputDocs_guardian))
avg_dates_topic_lb_guardian = num_dates_lb_guardian / num_topics_en
print(f'Average dates per topic in liveblog guardian: {round(avg_dates_topic_lb_guardian, 2)}')

# Liveblog: sentences per date
avg_sentences_dates_lb_guardian = avg_sents_topic_lb_guardian / avg_dates_topic_lb_guardian
print(f'Average senteces/dates per topic in liveblog guardian: {round(avg_sentences_dates_lb_guardian, 2)}')

In [None]:
# Timeline: sentences per topic
num_sents_tl_guardian = count_sents_timelines(timelines_guardian)
avg_sents_topic_tl_guardian = num_sents_tl_guardian / num_topics_en
print(f'Average sentences per topic in timeline guardian: {round(avg_sents_topic_tl_guardian, 2)}')

# Timeline: dates per topic (a date is counted once for each topic it appears under)
num_dates_tl_guardian = len(get_dates_list(timelines_guardian))
avg_dates_topic_tl_guardian = num_dates_tl_guardian / num_topics_en
print(f'Average dates per topic in timeline guardian: {round(avg_dates_topic_tl_guardian, 2)}')

# Timeline: sentences per date
avg_sentences_dates_tl_guardian = avg_sents_topic_tl_guardian / avg_dates_topic_tl_guardian
print(f'Average senteces/dates per topic in timeline guardian: {round(avg_sentences_dates_tl_guardian, 2)}')

In [None]:
# Sentence compression: timeline average as a percentage of the liveblog average
compression_sents_guardian = (avg_sents_topic_tl_guardian / avg_sents_topic_lb_guardian) * 100
print(f'Compression of sentences guardian: {round(compression_sents_guardian, 2)}')

# Date compression: timeline average as a percentage of the liveblog average
compression_dates_guardian = (avg_dates_topic_tl_guardian / avg_dates_topic_lb_guardian) * 100
print(f'Compression of dates guardian: {round(compression_dates_guardian, 2)}')

In [None]:
# Number of input docs: entries summed over every topic and date
n_inputDocs_guardian = sum(
    len(news_list)
    for date_news in inputDocs_guardian.values()
    for news_list in date_news.values()
)

print(f'Number of input docs in guardian: {n_inputDocs_guardian}')

In [None]:
# Number of timeline entries: entries summed over every topic and date
n_timelines_guardian = sum(
    len(news_list)
    for date_news in timelines_guardian.values()
    for news_list in date_news.values()
)

print(f'Number of timeline entries in guardian: {n_timelines_guardian}')

In [None]:
# Summary row for guardian; averages and compression ratios are rounded to 2 decimals
stats_guardian = dict(
    n_topics=num_topics_en,
    lang='en',
    n_inputDocs=n_inputDocs_guardian,
    inputDocs_avg_sents=round(avg_sents_topic_lb_guardian, 2),
    inputDocs_avg_dates=round(avg_dates_topic_lb_guardian, 2),
    inputdocs_avg_sents_per_dates=round(avg_sentences_dates_lb_guardian, 2),
    timeline_avg_sents=round(avg_sents_topic_tl_guardian, 2),
    timeline_avg_dates=round(avg_dates_topic_tl_guardian, 2),
    timeline_avg_sents_per_dates=round(avg_sentences_dates_tl_guardian, 2),
    compression_sents=round(compression_sents_guardian, 2),
    compression_dates=round(compression_dates_guardian, 2),
)

stats_dict['guardian'] = stats_guardian

#### Total

In [None]:
# English totals: cnn and guardian combined
print(f'Number of input docs in english: {n_inputDocs_cnn + n_inputDocs_guardian}')
print(f'Number of timelines in english: {n_timelines_cnn + n_timelines_guardian}')

### Export stats

In [None]:
# One row per source (the keys of stats_dict), one column per statistic
stats_df = pd.DataFrame.from_dict(stats_dict, orient='index')
# Written as a relative path, i.e. into the current working directory
stats_df.to_excel("dataset_stats.xlsx")

In [None]:
# Display the per-source statistics table
stats_df

### Timeline chart (**ToDo**)

In [None]:
def gt_to_list(gt):
  """Flatten a two-level mapping of lists into (inner key, item) pairs.

  gt is walked as {outer key: {inner key: [item, ...]}}. The outer keys are
  dropped and the pairs keep the iteration order of gt.

  Returns:
    list of (inner key, item) tuples, one per item.
  """
  return [
      (date, item)
      for by_date in gt.values()
      for date, items in by_date.items()
      for item in items
  ]

In [None]:
# One (date, news) row per item of gt_publico, tagged with its source
p_l = gt_to_list(gt_publico)
dfp = pd.DataFrame(p_l, columns=['date', 'news']).assign(source='publico')

In [None]:
# One (date, news) row per item of gt_observador, tagged with its source
o_l = gt_to_list(gt_observador)
dfo = pd.DataFrame(o_l, columns=['date', 'news']).assign(source='observador')

In [None]:
# One (date, news) row per item of gt_cnn, tagged with its source
c_l = gt_to_list(gt_cnn)
dfc = pd.DataFrame(c_l, columns=['date', 'news']).assign(source='cnn')

In [None]:
# One (date, news) row per item of gt_guardian, tagged with its source
g_l = gt_to_list(gt_guardian)
dfg = pd.DataFrame(g_l, columns=['date', 'news']).assign(source='guardian')

In [None]:
# Stack the four per-source frames into a single frame
# NOTE(review): ignore_index is not passed, so each source's own row labels are kept and repeat in df_total
dfs = [dfp, dfo, dfc, dfg]

df_total = pd.concat(dfs)

In [None]:
# Per-source table of how many rows fall on each date (columns: dates, counts, source)
p_vc = dfp['date'].value_counts().rename_axis('dates').reset_index(name='counts')
p_vc['source'] = 'publico'

o_vc = dfo['date'].value_counts().rename_axis('dates').reset_index(name='counts')
o_vc['source'] = 'observador'

c_vc = dfc['date'].value_counts().rename_axis('dates').reset_index(name='counts')
c_vc['source'] = 'cnn'

g_vc = dfg['date'].value_counts().rename_axis('dates').reset_index(name='counts')
g_vc['source'] = 'guardian'

dfs_vc = [p_vc, o_vc, c_vc, g_vc]

# reset_index() numbers every per-source table from 0, so a plain concat would
# repeat row labels. A fresh index keeps them unique; plotting code that aligns
# data on the index can fail on duplicate labels.
df_vc_total = pd.concat(dfs_vc, ignore_index=True)

In [None]:
# Order the rows by the 'dates' column, descending
df_vc_total = df_vc_total.sort_values(by='dates', ascending=False)

In [None]:
# sns.set() is called right after the import so the plots below use seaborn's styling
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt

# Earlier single-source attempt, left disabled
#xxx = dfp['date'].value_counts().plot()
# One line per source: number of rows (counts) for each date
ax = sns.lineplot(x="dates", y="counts", hue='source', data=df_vc_total)
# Left disabled: rotate the x tick labels by 45 degrees
#ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

In [None]:
dfp = dfp.sort_values(by='date', ascending=True)

# value_counts() orders its result by count, which discards the date order set
# above; re-sort by the index (the dates) so the x axis runs in date order.
dfp['date'].value_counts().sort_index().plot(rot=90);

In [None]:
# Display the publico rows, sorted by date in the previous cell
dfp

In [None]:
dfo = dfo.sort_values(by='date', ascending=True)

# value_counts() orders its result by count, which discards the date order set
# above; re-sort by the index (the dates) so the x axis runs in date order.
dfo['date'].value_counts().sort_index().plot(rot=90);

In [None]:
# Display the observador rows, sorted by date in the previous cell
dfo

In [None]:
dfc = dfc.sort_values(by='date', ascending=True)

# value_counts() orders its result by count, which discards the date order set
# above; re-sort by the index (the dates) so the x axis runs in date order.
dfc['date'].value_counts().sort_index().plot(rot=90);

In [None]:
# Display the cnn rows, sorted by date in the previous cell
dfc

In [None]:
dfg = dfg.sort_values(by='date', ascending=True)

# value_counts() orders its result by count, which discards the date order set
# above; re-sort by the index (the dates) so the x axis runs in date order.
dfg['date'].value_counts().sort_index().plot(rot=90);

In [None]:
# Display the guardian rows, sorted by date in the previous cell
dfg