## Scraping from News Source

In [None]:
%%capture
!apt update
!apt install chromium-chromedriver
!pip install selenium
!pip install dateparser

In [None]:
import os
import time
import re
import json
import html
from datetime import datetime
from pathlib import Path
from urllib.parse import urlencode
import requests
import dateparser
from bs4 import BeautifulSoup
from selenium import webdriver

In [None]:
# Root directory for the raw scraped data; created immediately if it does not exist.
DATA_DIR = 'data/'
Path(DATA_DIR).mkdir(parents=True, exist_ok=True)
# Browser-like User-Agent passed as `headers=` to the requests.get calls in the scraping functions.
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}

# Compiled once: matches the shortest '<...>' run on a single line.
_TAGS_REGEX = re.compile('<.*?>')


def clean_html(text):
    """Return *text* as plain text: HTML entities decoded, then tags removed.

    The input is converted with ``str()`` *before* entity decoding, so
    non-string inputs (e.g. parsed HTML nodes) are decoded the same way plain
    strings are. ``None`` yields an empty string.

    Because entities are decoded before tags are stripped, escaped markup such
    as ``&lt;b&gt;`` is removed as well; any ``<`` ... ``>`` pair on one line is
    treated as a tag.
    """
    if text is None:
        return ''
    decoded = html.unescape(str(text))
    return _TAGS_REGEX.sub('', decoded)

def pre_proc(text):
    """Strip markup from *text* with ``clean_html`` and collapse every whitespace run to one space."""
    words = clean_html(text).split()
    return ' '.join(words)

def format_date(date_str):
    """Parse *date_str* with ``dateparser`` and format it as ``'YYYY-MM-DD HH:MM'``.

    Raises:
        ValueError: if ``dateparser.parse`` returns ``None`` for *date_str*
            (previously this surfaced as an ``AttributeError`` on ``None``).
    """
    parsed = dateparser.parse(date_str)
    if parsed is None:
        raise ValueError(f'Could not parse date: {date_str!r}')
    return parsed.strftime('%Y-%m-%d %H:%M')

def write_json(file_path, json_data):
    """Print a 'Writing <path>' notice, then dump *json_data* to *file_path*.

    The file is written as UTF-8 with a 4-space indent and non-ASCII
    characters kept as-is (not ``\\u`` escaped).
    """
    print('Writing ' + file_path)
    with open(file_path, mode='w', encoding='utf8') as out_file:
        json.dump(json_data, out_file, indent=4, ensure_ascii=False)

# Per-source summary statistics, filled in by the scraping functions under the source's name.
collection_stats = {}


def get_source_stats(num_lbs, num_news, num_kms, first_date, last_date):
    """Bundle the five collection figures into a dict keyed by parameter name."""
    return dict(
        num_lbs=num_lbs,
        num_news=num_news,
        num_kms=num_kms,
        first_date=first_date,
        last_date=last_date,
    )

In [None]:
# Root directory under which each news source gets its own sub-directory.
# NOTE(review): DATA_DIR is assigned the same value in an earlier cell.
DATA_DIR = 'data/'

# Per-source directory and JSON file locations (paths only; nothing is created here).
# NOTE(review): the PUBLICO and OBSERVADOR paths are not referenced in the visible part of the notebook.
DATA_DIR_PUBLICO = os.path.join(DATA_DIR, 'publico/')
NEWS_PUBLICO = os.path.join(DATA_DIR_PUBLICO, 'news_publico.json')

DATA_DIR_OBSERVADOR = os.path.join(DATA_DIR, 'observador/')
NEWS_OBSERVADOR = os.path.join(DATA_DIR_OBSERVADOR, 'news_observador.json')

# CNN: all live-blog posts (news) and key moments (kms) are stored in separate files.
DATA_DIR_CNN = os.path.join(DATA_DIR, 'cnn/')
NEWS_CNN = os.path.join(DATA_DIR_CNN, 'news_cnn.json')
KMS_CNN = os.path.join(DATA_DIR_CNN, 'kms_cnn.json')

# The Guardian: same news / key-moments split as CNN.
DATA_DIR_GUARDIAN = os.path.join(DATA_DIR, 'guardian/')
NEWS_GUARDIAN = os.path.join(DATA_DIR_GUARDIAN, 'news_guardian.json')
KMS_GUARDIAN = os.path.join(DATA_DIR_GUARDIAN, 'kms_guardian.json')

In [None]:
def pre_proc(df_news, noisy_strs_title, noisy_strs_text):
    """Clean a news DataFrame: drop noisy/empty rows, scrub text, de-duplicate.

    NOTE(review): this definition rebinds the name ``pre_proc``, which an
    earlier cell defines as a one-argument text cleaner. Any scraping function
    that calls ``pre_proc(text)`` after this cell has run gets this
    three-argument version instead; consider renaming one of the two.

    Args:
        df_news: DataFrame with a ``title`` column and a ``date`` column
            (``date`` is needed for de-duplication); ``text`` is optional.
        noisy_strs_title: regex patterns; a row is dropped when its title
            matches any of them (case-insensitive). An empty list drops nothing.
        noisy_strs_text: regex patterns; every match is removed from ``text``.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index. The input frame is not
        modified.
    """
    has_text = 'text' in df_news.columns

    # Rows need a title (and a text, when that column exists).
    keep = df_news['title'].notna()
    if has_text:
        keep &= df_news['text'].notna()
    # Joining an empty list gives the pattern '', which matches every title
    # and would discard the whole frame, so only filter when patterns exist.
    if noisy_strs_title:
        title_noise = '|'.join(noisy_strs_title)
        keep &= ~df_news['title'].str.contains(title_noise, case=False, regex=True, na=False)
    # Copy so the column assignments below never write into the caller's frame.
    df_news = df_news[keep].copy()

    # regex=True is explicit: the patterns are regular expressions, and the
    # default of Series.str.replace differs between pandas versions.
    df_news['title'] = df_news['title'].str.replace('“|”', '"', regex=True)
    if has_text:
        if noisy_strs_text:
            text_noise = '|'.join(noisy_strs_text)
            df_news['text'] = df_news['text'].str.replace(text_noise, '', regex=True)
        df_news['text'] = df_news['text'].str.replace('“|”', '"', regex=True)

    non_empty = df_news['title'] != ''
    if has_text:
        non_empty &= df_news['text'] != ''
    df_news = df_news[non_empty]

    dedupe_cols = ['title', 'text', 'date'] if has_text else ['title', 'date']
    df_news = df_news.drop_duplicates(subset=dedupe_cols)
    return df_news.reset_index(drop=True)

def write_df_to_json(df, file_path):
    """Write *df* to *file_path* as a JSON array of records.

    The ``date`` column (which must be datetime-typed, since ``.dt`` is used)
    is rendered as ``'YYYY-MM-DD HH:MM'`` strings in the output. The
    formatting is applied to a copy, so the caller's frame keeps its datetime
    column and can be written again.
    """
    formatted = df.assign(date=df['date'].dt.strftime('%Y-%m-%d %H:%M'))
    formatted.to_json(file_path, orient='records', force_ascii=False, indent=4)

# Root directory for the cleaned datasets (kept separate from the raw 'data/' tree); created if missing.
DATA_CLEAN_DIR = 'data_clean/'
Path(DATA_CLEAN_DIR).mkdir(parents=True, exist_ok=True)

### CNN

In [None]:
# Directory for the raw CNN output; created (with parents) if it does not exist yet.
DATA_DIR_CNN = os.path.join(DATA_DIR, 'cnn/')
Path(DATA_DIR_CNN).mkdir(parents=True, exist_ok=True)

def get_chrome_driver():
    """Create a headless Chrome WebDriver with a 30-second implicit wait.

    The browser is started with ``--headless``, ``--no-sandbox`` and
    ``--disable-dev-shm-usage``. The caller owns the returned driver and is
    responsible for closing it.
    """
    options = webdriver.ChromeOptions()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        options.add_argument(flag)
    # The driver path is no longer passed positionally: current Selenium 4
    # releases reject it, while older ones already default to the
    # 'chromedriver' found on PATH, so omitting it works on both.
    driver = webdriver.Chrome(options=options)
    driver.implicitly_wait(30)
    return driver


def get_blogs_urls_cnn(browser):
    """Collect CNN "coronavirus news" live-blog URLs from the CNN search pages.

    Pages through ``https://edition.cnn.com/search`` with *browser* and keeps
    every result whose link text contains 'coronavirus news' and whose URL
    contains 'live-news'.

    Args:
        browser: a Selenium WebDriver. It is closed before this function
            returns, including when an error is raised.

    Returns:
        Sorted list of unique live-blog URLs.
    """
    print('===== Collecting CNN liveblogs urls =====')
    search_url = 'https://edition.cnn.com/search?'
    payload = {'q': '"coronavirus news"', 'size': 20, 'page': 1, 'from': 0}
    lb_urls = set()
    try:
        while True:
            url = search_url + urlencode(payload)
            print(url)
            browser.get(url)
            # Fixed pause after navigation (presumably to let the results render).
            time.sleep(3)
            soup = BeautifulSoup(browser.page_source, 'html.parser')
            results_list = soup.find('div', {'class': 'cnn-search__results-list'})
            if results_list is None:
                # Page layout changed or nothing was rendered: stop instead of crashing.
                print('No search results container found; stopping')
                break
            for result in results_list:
                try:
                    result_content = result.find_next('div', {'class': 'cnn-search__result-contents'})
                    headline = result_content.find_next('h3', {'class': 'cnn-search__result-headline'})
                    a = headline.find_next('a')
                    lb_url = a['href']
                    if 'https:' not in lb_url:
                        lb_url = 'https:' + a['href']
                    if 'coronavirus news' in a.get_text() and 'live-news' in lb_url:
                        lb_urls.add(lb_url)
                except (AttributeError, TypeError, KeyError):
                    # Missing container/headline/link or a link without href: skip this result.
                    continue
            # len() counts the container's direct children; the stop threshold is
            # twice the page size (presumably because whitespace nodes sit
            # between the result elements — confirm against the live markup).
            if len(results_list) < 2*payload['size']:
                break
            payload['page'] += 1
            payload['from'] = (payload['page']*payload['size']) - payload['size']
    finally:
        browser.close()
    lb_urls = sorted(lb_urls)
    print('# lbs:', len(lb_urls))
    print()
    return lb_urls

def get_news_cnn(browser):
    """Scrape every CNN coronavirus live blog into post and key-moment records.

    Live-blog URLs come from ``get_blogs_urls_cnn(browser)``. Posts are read
    from each page's ``liveBlog-schema`` JSON-LD script; key moments are read
    from the ``ls-rail`` sidebar list and are dated with the date of the
    middle post of the same live blog. Summary statistics are stored in
    ``collection_stats['cnn']`` and printed.

    Args:
        browser: Selenium WebDriver handed to ``get_blogs_urls_cnn``.

    Returns:
        ``(list_news, list_kms)``: two lists of dicts with keys ``title``,
        ``text``, ``date``, ``is_km`` (the strings ``'True'``/``'False'``) and
        ``url``, both sorted by date, newest first.
    """
    lbs = get_blogs_urls_cnn(browser)
    print('===== Collecting CNN liveblogs news =====')
    list_news, list_kms = [], []
    for i, lb_url in enumerate(lbs):
        print(str(i+1) + '/' + str(len(lbs)))
        print(lb_url)
        r = requests.get(lb_url, headers=HEADERS)
        soup = BeautifulSoup(r.text, 'html.parser')
        try:
            lb_data = soup.find('script', id='liveBlog-schema', type='application/ld+json').find(text=True)
            lb_json = json.loads(lb_data)
            updates = lb_json['liveBlogUpdate']
        except (AttributeError, TypeError, ValueError, KeyError):
            # Script tag missing, empty, not valid JSON, or without any updates.
            print('Could not collect liveblog for this url\n')
            continue
        list_news_lb = []
        for entry in updates:
            try:
                title = pre_proc(entry['headline'])
            except (KeyError, TypeError):
                # Posts without a usable headline get an empty title.
                title = ''
            article = {
                'title': title,
                'text': pre_proc(entry['articleBody']),
                'date': format_date(entry['datePublished']),
                'is_km': 'False',
                'url': entry['url']
            }
            list_news_lb.append(article)
        list_news.extend(list_news_lb)
        try:
            kms_data = soup.find('aside', id='ls-rail').find('div', class_='sc-dnqmqq render-stellar-contentstyles__List-sc-9v7nwy-1 eUPcFX').find_next('ul')
        except AttributeError:
            kms_data = None
        if kms_data is None:
            print('There are no key moments for this url')
            kms_data = []
        elif not list_news_lb:
            # Key moments borrow their date from a post; without posts they cannot be dated.
            print('Skipping key moments: no posts to take a date from')
            kms_data = []
        list_kms_lb = []
        for km in kms_data:
            clean_text = pre_proc(km)
            article = {
                'title': clean_text,
                'text': clean_text,
                'date': list_news_lb[len(list_news_lb)//2]['date'],
                'is_km': 'True',
                'url': lb_url
            }
            list_kms_lb.append(article)
        list_kms.extend(list_kms_lb)
        print('# news:', len(list_news_lb))
        print('# kms:', len(list_kms_lb))
        print()

    list_news.sort(key=lambda item: item['date'], reverse=True)
    list_kms.sort(key=lambda item: item['date'], reverse=True)
    # With nothing collected there is no date range to report.
    first_date = list_news[-1]['date'].split()[0] if list_news else None
    last_date = list_news[0]['date'].split()[0] if list_news else None
    source_stats = get_source_stats(len(lbs), len(list_news), len(list_kms), first_date, last_date)
    collection_stats['cnn'] = source_stats
    print('Stats: ')
    for k, v in source_stats.items():
        print(k + ': ' + str(v))
    print()
    return list_news, list_kms

# Run the CNN scrape with a fresh headless Chrome driver, then save posts and key moments
# as two JSON files inside DATA_DIR_CNN.
news_cnn, kms_cnn = get_news_cnn(get_chrome_driver())
cnn_news_path = os.path.join(DATA_DIR_CNN, 'news_cnn.json')
write_json(cnn_news_path, news_cnn)
cnn_kms_path = os.path.join(DATA_DIR_CNN, 'kms_cnn.json')
write_json(cnn_kms_path, kms_cnn)

### The Guardian

In [None]:
# Directory for the raw Guardian output; created (with parents) if it does not exist yet.
DATA_DIR_GUARDIAN = os.path.join(DATA_DIR, 'guardian/')
Path(DATA_DIR_GUARDIAN).mkdir(parents=True, exist_ok=True)

def get_blogs_urls_guardian():
    """Collect the URLs of all Guardian 'coronavirus-live' series live blogs.

    Requests the series index page by page (``?page=N``) and gathers the
    ``href`` of every ``fc-item__link`` anchor. Paging stops when the final
    response URL equals the bare series URL (presumably a redirect for page
    numbers past the end — confirm), or when the index container is missing
    from the page.

    Returns:
        Sorted list of unique live-blog URLs.
    """
    print('===== Collecting Guardian liveblogs urls =====')
    search_url = 'https://www.theguardian.com/world/series/coronavirus-live'
    payload = {'page': 1}
    lb_urls = set()
    while True:
        r = requests.get(search_url, headers=HEADERS, params=payload)
        if r.url == search_url:
            break

        soup = BeautifulSoup(r.text, 'html.parser')
        lbs_list = soup.find('div', class_='u-cf index-page', role='main')
        if lbs_list is None:
            # Unexpected page (layout change, error page): stop paging instead of crashing.
            print('Could not find the liveblog index on ' + r.url)
            break
        for a in lbs_list.find_all('a', class_='fc-item__link'):
            url = a['href']
            lb_urls.add(url)
            print(url)
        payload['page'] += 1
    lb_urls = sorted(lb_urls)
    print('# lbs:', len(lb_urls))
    print()
    return lb_urls

def get_news_guardian():
    """Scrape every Guardian coronavirus live blog into post and key-event records.

    For each URL from ``get_blogs_urls_guardian()`` the live blog is followed
    page by page through its "older" navigation link. Every ``liveBlogUpdate``
    block becomes a post record. Key-event blocks are collected only from
    pages whose URL does not contain ``#liveblog-navigation`` (i.e. not from
    the follow-up pages reached through the navigation link). Posts whose URL
    matches a key event are flagged ``is_km = 'True'``.

    If scraping a live blog fails part-way, the error is printed and the
    records gathered so far for that live blog are kept. Summary statistics
    are stored in ``collection_stats['guardian']`` and printed.

    Returns:
        ``(list_news, list_kms)``: two lists of dicts with keys ``title``,
        ``text``, ``date``, ``is_km`` (the strings ``'True'``/``'False'``) and
        ``url``, both sorted by date, newest first.
    """
    lbs = get_blogs_urls_guardian()
    print('===== Collecting Guardian liveblogs news =====')
    base_url_guardian = 'https://www.theguardian.com'
    list_news, list_kms = [], []
    # Iterate over liveblogs
    for i, lb_url in enumerate(lbs):
        print(str(i+1) + '/' + str(len(lbs)))
        print(lb_url)
        list_news_lb, list_kms_lb = [], []
        url = lb_url
        try:
            while True:
                r = requests.get(url, headers=HEADERS)
                soup = BeautifulSoup(r.text, 'html.parser')
                lb = soup.find('div', class_='js-article__container')
                posts = lb.find_all('div', itemprop='liveBlogUpdate')
                for p in posts:
                    title = pre_proc(p.find('meta', itemprop='headline')['content'])
                    content_pars = p.find('div', class_='block-elements', itemprop='articleBody').find_all('p')
                    text = ' '.join([pre_proc(cp) for cp in content_pars])
                    date = format_date(p.find('time', class_='js-timestamp')['datetime'])
                    is_km = 'False'
                    news_url = base_url_guardian + p.find('a', class_='block-time__link', itemprop='url')['href']
                    article = {
                        'title': title,
                        'text': text,
                        'date': date,
                        'is_km': is_km,
                        'url': news_url
                    }
                    list_news_lb.append(article)

                if '#liveblog-navigation' not in url:
                    key_moments_data = soup.find_all('div', class_=' block block--content is-key-event')
                    for km in key_moments_data:
                        print('#'*40)
                        title = pre_proc(km.find('meta', itemprop='headline')['content'])
                        print(f'title: {title}')
                        content_pars = km.find('div', class_='block-elements block-elements--no-byline', itemprop='articleBody').find_all('p')
                        text = ' '.join([pre_proc(cp) for cp in content_pars])
                        print(f'text: {text}')
                        date = format_date(km.find('time', class_='js-timestamp')['datetime'])
                        news_url = base_url_guardian + km.find('a')['href']
                        is_km = 'True'
                        article = {
                            'title': title,
                            'date': date,
                            'text': text,
                            'is_km': is_km,
                            'url': news_url
                        }
                        list_kms_lb.append(article)
                pagination = soup.find('div', id='liveblog-navigation')
                if not pagination:
                    break
                pagination_older = pagination.find('div', class_='liveblog-navigation__older')
                next_page_a = pagination_older.find('a', class_='liveblog-navigation__link liveblog-navigation__link--primary')
                if not next_page_a:
                    break
                url = base_url_guardian + next_page_a['href']
        except Exception as err:
            # Best effort per live blog: report the cause and keep what was
            # collected. Catching Exception (not a bare except) lets
            # KeyboardInterrupt stop the whole run.
            print('error: ' + repr(err))

        # Flag posts that also appear as key events (matched by URL).
        km_urls = {km['url'] for km in list_kms_lb}
        for news in list_news_lb:
            if news['url'] in km_urls:
                news['is_km'] = 'True'
        list_news.extend(list_news_lb)
        list_kms.extend(list_kms_lb)
        print('# news:', len(list_news_lb))
        print('# kms:', len(list_kms_lb))
        print()
    list_news.sort(key=lambda item: item['date'], reverse=True)
    list_kms.sort(key=lambda item: item['date'], reverse=True)
    # With nothing collected there is no date range to report.
    first_date = list_news[-1]['date'].split()[0] if list_news else None
    last_date = list_news[0]['date'].split()[0] if list_news else None
    source_stats = get_source_stats(len(lbs), len(list_news), len(list_kms), first_date, last_date)
    collection_stats['guardian'] = source_stats
    print('Stats: ')
    for k, v in source_stats.items():
        print(k + ': ' + str(v))
    print()
    return list_news, list_kms

# Run the Guardian scrape, then save posts and key events as two JSON files inside DATA_DIR_GUARDIAN.
# NOTE(review): `guradian_news_path` is misspelled ("guradian"); left as-is in case later cells reference it.
news_guardian, kms_guardian = get_news_guardian()
guradian_news_path = os.path.join(DATA_DIR_GUARDIAN, 'news_guardian.json')
write_json(guradian_news_path, news_guardian)
guardian_kms_path = os.path.join(DATA_DIR_GUARDIAN, 'kms_guardian.json')
write_json(guardian_kms_path, kms_guardian)

In [None]:
# Title fragments that mark CNN housekeeping posts. They are passed to the three-argument
# pre_proc, which joins them with '|' and drops rows whose title matches (case-insensitive).
noisy_strs_title_cnn = [
                        'Follow live updates',
                        'Go here for latest updates',
                        'What you need to know',
                        'the latest on the pandemic',
                        'Watch the entire CNN coronavirus town hall',
                        'coronavirus town hall has ended',
                        'global town hall on coronavirus will start soon',
                        'the latest coronavirus update',
                        'the latest coronavirus numbers',
                        'what you may have missed',
                        'Catch up:'
]
# Regex patterns whose matches are removed from the text column: http(s) URLs and
# pic.twitter.com links (the '.' in the second pattern is unescaped, so it matches any character).
noisy_strs_text_cnn = [
                        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                        r'pic.twitter.com/[a-zA-Z0-9]+(\s|$)'
]
# NOTE(review): df_cnn_km and df_cnn_lb are not created anywhere in the visible notebook;
# presumably they are DataFrames loaded from the scraped CNN JSON files — confirm.
df_cnn_km = pre_proc(df_cnn_km, noisy_strs_title_cnn, noisy_strs_text_cnn)
df_cnn_lb = pre_proc(df_cnn_lb, noisy_strs_title_cnn, noisy_strs_text_cnn)

In [None]:
# Title fragments that mark Guardian summary/recap posts. They are passed to the three-argument
# pre_proc, which joins them with '|' and drops rows whose title matches (case-insensitive).
noisy_strs_title_guardian = [
                        'Summary',
                        'Key developments in the global coronavirus',
                        'What we know so far'
]
# Regex patterns whose matches are removed from the text column. Patterns ending in '(.*)' or
# '.*$' remove everything from the matched phrase up to the end of the line.
noisy_strs_text_guardian = [
                        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                        r'Hi everyone, this is(.*)',
                        r'Hi, Helen Sullivan(.*)',
                        r'Good evening from(.*)',
                        r'We’ve launched a(.*)',
                        r'We’ve fired up a(.*)',
                        r'That’s it for this blog(.*)',
                        r'Read more.*$',
                        r'More info.*$',
                        r'pic.twitter.com/[a-zA-Z0-9]+(\s|$)'
]
# NOTE(review): df_guardian_lb and df_guardian_km are not created anywhere in the visible notebook;
# presumably they are DataFrames loaded from the scraped Guardian JSON files — confirm.
df_guardian_lb = pre_proc(df_guardian_lb, noisy_strs_title_guardian, noisy_strs_text_guardian)
df_guardian_km = pre_proc(df_guardian_km, noisy_strs_title_guardian, noisy_strs_text_guardian)

In [None]:
# Output directory for the cleaned CNN data; created if missing.
DATA_CLEAN_DIR_CNN = os.path.join(DATA_CLEAN_DIR, 'cnn/')
Path(DATA_CLEAN_DIR_CNN).mkdir(parents=True, exist_ok=True)

# Cleaned live-blog posts.
cnn_news_clean_path = os.path.join(DATA_CLEAN_DIR_CNN, 'news_cnn.json')
write_df_to_json(df_cnn_lb, cnn_news_clean_path)

# Cleaned key moments.
cnn_km_clean_path = os.path.join(DATA_CLEAN_DIR_CNN, 'kms_cnn.json')
write_df_to_json(df_cnn_km, cnn_km_clean_path)

In [None]:
# Output directory for the cleaned Guardian data; created if missing.
DATA_CLEAN_DIR_GUARDIAN = os.path.join(DATA_CLEAN_DIR, 'guardian/')
Path(DATA_CLEAN_DIR_GUARDIAN).mkdir(parents=True, exist_ok=True)

# Cleaned live-blog posts.
guardian_news_clean_path = os.path.join(DATA_CLEAN_DIR_GUARDIAN, 'news_guardian.json')
write_df_to_json(df_guardian_lb, guardian_news_clean_path)

# Cleaned key events.
guardian_kms_clean_path = os.path.join(DATA_CLEAN_DIR_GUARDIAN, 'kms_guardian.json')
write_df_to_json(df_guardian_km, guardian_kms_clean_path)

## Spark NLP Colab Setup

In [None]:
import json, os
from google.colab import files

# Prompt for a file upload in Colab; the result is a dict keyed by uploaded file name.
license_keys = files.upload()

# Re-open the first uploaded file from disk and parse it as JSON; from here on
# `license_keys` holds the parsed key/value pairs rather than the upload result.
with open(list(license_keys.keys())[0]) as f:
    license_keys = json.load(f)

# Defining license key-value pairs as local variables
# NOTE: this relies on the cell running at module (notebook) level, where locals() is the
# global namespace; inside a function the update would not create variables.
locals().update(license_keys)

# Adding license key-value pairs to environment variables
# (the `$PUBLIC_VERSION`, `$JSL_VERSION` and `$SECRET` references in the pip cell use these names).
os.environ.update(license_keys)

In [None]:
# Installing pyspark and spark-nlp
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

In [None]:
import json
import os
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

# Spark configuration overrides handed to sparknlp_jsl.start: driver memory, Kryo buffer
# limit and maximum collected result size.
params = {"spark.driver.memory":"16G",
"spark.kryoserializer.buffer.max":"2000M",
"spark.driver.maxResultSize":"2000M"}

# Start the licensed Spark NLP session using the SECRET value from the uploaded license JSON.
spark = sparknlp_jsl.start(license_keys['SECRET'],params=params)

# Report the library versions that were actually loaded.
print ("Spark NLP Version :", sparknlp.version())
print ("Spark NLP_JSL Version :", sparknlp_jsl.version())

In [None]:
# Enable Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.enabled", True)
# Alternative configuration key, currently disabled:
# spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [None]:
# Read the setting back so the notebook displays the value that is in effect.
spark.conf.get("spark.sql.execution.arrow.enabled")

## NER & Assertion Pipeline

In [None]:
from sparknlp_jsl.annotator import *

# --- Text preparation: document -> sentences -> tokens -> clinical embeddings ---
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# --- Four NER models, each followed by a converter that turns its tags into chunks ---
clinical_ner_large = (
    MedicalNerModel.pretrained("ner_clinical_large", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner_clinical")
)

clinical_ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ner_clinical"])
    .setOutputCol("ner_clinical_chunk")
)

events_clinical_ner = (
    MedicalNerModel.pretrained("ner_events_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("events_ner")
)

events_clinical_ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "events_ner"])
    .setOutputCol("events_ner_chunk")
)

jsl_greedy_ner = (
    MedicalNerModel.pretrained("ner_jsl_greedy", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("jsl_greedy_ner")
)

jsl_greedy_ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "jsl_greedy_ner"])
    .setOutputCol("jsl_greedy_ner_chunk")
)

deid_ner = (
    MedicalNerModel.pretrained("ner_deid_sd_large", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("deid_ner")
)

deid_ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "deid_ner"])
    .setOutputCol("deid_ner_chunk")
)

# --- Merge the four chunk columns into one, then run assertion on the merged chunks ---
chunk_merger = (
    ChunkMergeApproach()
    .setInputCols(["ner_clinical_chunk", "events_ner_chunk", "jsl_greedy_ner_chunk", "deid_ner_chunk"])
    .setOutputCol("all_ner_chunks")
)

clinical_assertion = (
    AssertionDLModel.pretrained("assertion_dl", "en", "clinical/models")
    .setInputCols(["sentence", "all_ner_chunks", "embeddings"])
    .setOutputCol("clinical_assertion")
)

# Stage order follows the column dependencies declared above.
ner_stages = [
    documentAssembler,
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner_large,
    clinical_ner_converter,
    events_clinical_ner,
    events_clinical_ner_converter,
    jsl_greedy_ner,
    jsl_greedy_ner_converter,
    deid_ner,
    deid_ner_converter,
    chunk_merger,
    clinical_assertion,
]
pipe = Pipeline(stages=ner_stages)

# Load the news CSV into Spark, then fit and apply the pipeline on that same frame.
pd_df = pd.read_csv('news.csv')
sdf_pos = spark.createDataFrame(pd_df)

fitted_pipe = pipe.fit(sdf_pos)
ner_results = fitted_pipe.transform(sdf_pos)

In [None]:
# `F` is used below, but no explicit import of it is visible in this notebook
# (the star imports may or may not provide it), so import it here.
import pyspark.sql.functions as F

# Flatten the pipeline output to one pandas row per merged NER chunk:
# zip chunk text, chunk metadata and assertion result position by position,
# explode the zipped array, then pull the individual fields out.
pd_ner_results = (
    ner_results
    .select(
        "id",
        F.explode(
            F.arrays_zip('all_ner_chunks.result', 'all_ner_chunks.metadata', 'clinical_assertion.result')
        ).alias("cols"),
    )
    .select(
        "id",
        F.expr("cols['0']").alias("ner_chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
        F.expr("cols['1']['sentence']").alias("sent_id"),
        F.expr("cols['2']").alias("clinical_assertion"),
    )
    .toPandas()
)

## ADE Pipeline

In [None]:
# --- Text preparation: document -> sentences -> tokens -> clinical embeddings ---
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# --- Adverse drug events: only chunks labelled "ADE" are kept ---
ade_ner = (
    MedicalNerModel.pretrained("ner_ade_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ade_ner")
)

ade_ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ade_ner"])
    .setOutputCol("ade_ner_chunk")
    .setWhiteList(["ADE"])
)

# --- Vaccine mentions from two models: only chunks labelled "Vaccine" are kept ---
covid_ner = (
    MedicalNerModel.pretrained("ner_covid_trials", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("covid_ner")
)

covid_ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "covid_ner"])
    .setOutputCol("covid_ner_chunk")
    .setWhiteList(["Vaccine"])
)

jsl_ner = (
    MedicalNerModel.pretrained("ner_jsl", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("jsl_ner")
)

jsl_ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "jsl_ner"])
    .setOutputCol("jsl_ner_chunk")
    .setWhiteList(["Vaccine"])
)

# --- Merge the three filtered chunk columns into a single "ner_chunk" column ---
chunk_merger = (
    ChunkMergeApproach()
    .setInputCols("ade_ner_chunk","covid_ner_chunk","jsl_ner_chunk")
    .setOutputCol("ner_chunk")
)

# Stage order follows the column dependencies declared above.
ade_stages = [
    documentAssembler,
    sentenceDetector,
    tokenizer,
    word_embeddings,
    ade_ner,
    ade_ner_converter,
    covid_ner,
    covid_ner_converter,
    jsl_ner,
    jsl_ner_converter,
    chunk_merger,
]
ner_pipeline = Pipeline(stages=ade_stages)

In [None]:
# Reload the news CSV and convert it to a Spark DataFrame
# (the same file and variable names are used in the NER & Assertion section).
pd_df = pd.read_csv('news.csv')
sdf_pos = spark.createDataFrame(pd_df)

# Fit the ADE/vaccine pipeline on the data and apply it to that same data.
ade_ner_results = ner_pipeline.fit(sdf_pos).transform(sdf_pos)

In [None]:
# `F` is used below, but no explicit import of it is visible in this notebook
# (the star imports may or may not provide it), so import it here.
import pyspark.sql.functions as F

# Flatten the pipeline output to one pandas row per merged chunk, keeping the
# chunk text together with the entity label and confidence from its metadata.
pd_ade_ner_results = (
    ade_ner_results
    .select(
        'id',
        F.explode(F.arrays_zip("ner_chunk.result", "ner_chunk.metadata")).alias("cols"),
    )
    .select(
        'id',
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("entity"),
        F.expr("cols['1']['confidence']").alias("confidence"),
    )
    .toPandas()
)