In [1]:
import requests
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Search-results page listing the EUvsDisinfo cases that match "coronavirus".
url = 'https://euvsdisinfo.eu/disinformation-cases/?text=coronavirus&date='
print(url)

# A single session is reused by the later cells, so every request carries the
# same browser-style User-Agent header.
# NOTE(review): presumably the site rejects the default `requests` User-Agent - confirm.
s = requests.Session()
s.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 Chrome/34.0.1847.116 Safari/537.36'
r = s.get(url)

# Always show the response status, then stop right here on a 4xx/5xx answer
# instead of silently handing an error page to the parsing cells below.
print(r)
r.raise_for_status()

https://euvsdisinfo.eu/disinformation-cases/?text=coronavirus&date=
<Response [200]>


In [2]:
# Build a navigable tree from the raw bytes of the first search-results page.
# (Call soup.prettify() to inspect the parsed markup while debugging.)
soup = BeautifulSoup(markup=r.content, features='html')

In [3]:
# Page size used to turn the total hit count into a number of pages and,
# in the scraping loop, into the `offset` query parameter.
results_per_page = 10

# The total number of hits is the text of the first <span> inside the
# 'disinfo-db-results' container; it is kept as the string the page shows.
results_box = soup.find('div', class_='disinfo-db-results')
num_results = results_box.find('span').text

# Round up so a partially filled last page is still visited.
pages_needed = np.ceil(float(num_results) / results_per_page)
num_pages = int(pages_needed)
print(f'There are {num_results} across {num_pages}')

There are 226 across 23


In [4]:
def _column_cell(res, column):
    """Return the element of listing row `res` whose data-column attribute equals `column`."""
    return res.find(attrs={'data-column': column})


def find_title(res):
    """Return the stripped text of the row's Title cell."""
    return _column_cell(res, 'Title').text.strip()


def find_link(res):
    """Return the href of the first link inside the row's Title cell."""
    return _column_cell(res, 'Title').find('a', href=True)['href']


def find_date(res):
    """Return the stripped text of the row's Date cell."""
    return _column_cell(res, 'Date').text.strip()


def find_outlets(res):
    """Return the stripped text of the row's Outlets cell."""
    return _column_cell(res, 'Outlets').text.strip()


def find_country(res):
    """Return the stripped text of the row's Country cell."""
    return _column_cell(res, 'Country').text.strip()


# One [title, link, date, outlet, country] list per case found on the listing pages.
dataset = []
for page_num in range(num_pages):
    # Listing pages are addressed by result offset, not by page number.
    offset = page_num * results_per_page
    url = f'https://euvsdisinfo.eu/disinformation-cases/?text=coronavirus&date=&offset={offset}'
    r = s.get(url)
    if not r.ok:
        # Skip this page; parsing an error response would yield garbage or crash.
        print(f'Unable to parse {url}')
        continue

    print(f'Parsing {url}')
    soup = BeautifulSoup(r.content, 'html')
    re_soup = soup.find_all('div', class_='disinfo-db-post')

    print(f'Found {len(re_soup)} tags at {url}')
    for result in re_soup:
        dataset.append([
            find_title(result),
            find_link(result),
            find_date(result),
            find_outlets(result),
            find_country(result),
        ])

Parsing https://euvsdisinfo.eu/disinformation-cases/?text=coronavirus&date=&offset=0
Found 10 tags at https://euvsdisinfo.eu/disinformation-cases/?text=coronavirus&date=&offset=0
Parsing https://euvsdisinfo.eu/disinformation-cases/?text=coronavirus&date=&offset=10
Found 10 tags at https://euvsdisinfo.eu/disinformation-cases/?text=coronavirus&date=&offset=10
Parsing https://euvsdisinfo.eu/disinformation-cases/?text=coronavirus&date=&offset=20
Found 10 tags at https://euvsdisinfo.eu/disinformation-cases/?text=coronavirus&date=&offset=20
Parsing https://euvsdisinfo.eu/disinformation-cases/?text=coronavirus&date=&offset=30
Found 10 tags at https://euvsdisinfo.eu/disinformation-cases/?text=coronavirus&date=&offset=30
Parsing https://euvsdisinfo.eu/disinformation-cases/?text=coronavirus&date=&offset=40
Found 10 tags at https://euvsdisinfo.eu/disinformation-cases/?text=coronavirus&date=&offset=40
Parsing https://euvsdisinfo.eu/disinformation-cases/?text=coronavirus&date=&offset=50
Found 10 ta

In [5]:
# Tabulate the listing rows collected so far; the column order mirrors the
# order in which each entry list was filled.
listing_columns = ['Title', 'Link', 'Date', 'Outlet', 'Country']
df = pd.DataFrame(dataset, columns=listing_columns)
display(df)

Unnamed: 0,Title,Link,Date,Outlet,Country
0,"Blaming China for COVID-19 is a ploy, like bla...",https://euvsdisinfo.eu/report/blaming-china-fo...,01.04.2020,Journal of New Eastern Outlook,"EU, Russia, Ukraine, US, The Netherlands, Syria"
1,USAID is a CIA-affiliated terror group that cr...,https://euvsdisinfo.eu/report/usaid-is-a-cia-a...,01.04.2020,Journal of New Eastern Outlook,"China, US"
2,Authoritarian trends in the COVID-19 lockdown ...,https://euvsdisinfo.eu/report/authoritarian-tr...,01.04.2020,RT English,UK
3,The Italian Parliament lowers the flag of the ...,https://euvsdisinfo.eu/report/the-italian-parl...,01.04.2020,fondsk.ru,"EU, Italy"
4,Environmentalists are overjoyed by the fear ar...,https://euvsdisinfo.eu/report/environmentalist...,01.04.2020,RT English,"EU, UK, US"
...,...,...,...,...,...
221,China is preparing to resist a US attack of ma...,https://euvsdisinfo.eu/report/coronavirus-chin...,24.01.2020,60 Minut @ Rossiya 1 [22:35 - 23:39],"China, US"
222,China coronavirus was predicted by Nostradamus,https://euvsdisinfo.eu/report/china-coronaviru...,24.01.2020,Ukraina.ru,China
223,The new coronavirus is a US biological weapon,https://euvsdisinfo.eu/report/the-new-corona-v...,23.01.2020,"Sputnik web Arabic, tvzvezda, tsargrad, akhbar...","China, US"
224,A new Chinese coronavirus was likely elaborate...,https://euvsdisinfo.eu/report/a-new-chinese-co...,22.01.2020,Sputnik Belarus @ Novosti Shiolkovogo Puti tim...,"China, US"


In [6]:
# Each entry starts with the five listing fields (title, link, date, outlet,
# country); the ten article-detail fields gathered below are stored after them.
num_listing_fields = 5
num_article_fields = 10

for entry in dataset:
    article_url = entry[1]

    r = s.get(article_url)
    if not r.ok:
        # Do not parse an error page. Blank out the article fields so every
        # row keeps the same width for the DataFrame built afterwards.
        print(f'Unable to parse {article_url}')
        entry[num_listing_fields:] = [''] * num_article_fields
        continue

    print(f'Parsing {article_url}')
    soup = BeautifulSoup(r.content, 'html')
    summary = soup.find('div', class_='b-report__summary-text').text.strip()
    disproof = soup.find('div', class_='b-report__disproof-text').text.strip()

    article_title = soup.find('h1', class_='b-catalog__report-title').text.strip()

    # get the article source link; some reports have none, in which case both
    # fields stay empty. Only the lookup failures are caught (missing <div>,
    # missing <a>, missing href) so unrelated errors and Ctrl-C still surface.
    try:
        link_box = soup.find('div', class_='b-catalog__link')
        article_source_link = link_box.find('a')['href']
        article_source_media = link_box.text.strip().replace(' (Archived', '')
    except (AttributeError, TypeError, KeyError):
        article_source_link = ''
        article_source_media = ''

    # get the optional article metadata as {label: value}
    metadata_list = {}
    meta_list = soup.find('ul', class_='b-catalog__repwidget-list').find_all('li')
    for metadata in meta_list:
        label_tag = metadata.find('b')
        if label_tag is None:
            # No bold label to key this item on, so skip it.
            continue
        # extract() detaches the <b> label, so the item's remaining text is
        # just the value.
        extracted_label = label_tag.extract().text.strip()
        metadata_list[extracted_label] = metadata.text.strip()

    reported_in = metadata_list.get('Reported in:', '')
    publication_date = metadata_list.get('DATE OF PUBLICATION:', '')
    target_audience = metadata_list.get('Language/target audience:', '')
    country = metadata_list.get('Country:', '')
    keywords = metadata_list.get('Keywords:', '')

    article_entry = [summary, disproof, article_title, article_source_link, article_source_media, reported_in, publication_date, target_audience, country, keywords]
    # Slice assignment instead of extend(): re-running this cell replaces the
    # article fields rather than appending a second copy of them.
    entry[num_listing_fields:] = article_entry

Parsing https://euvsdisinfo.eu/report/blaming-china-for-covid-19-is-a-ploy-like-blaming-russia-for-mh17/
Parsing https://euvsdisinfo.eu/report/usaid-is-a-cia-affiliated-terror-group-that-created-the-coronavirus/
Parsing https://euvsdisinfo.eu/report/authoritarian-trends-in-the-covid-19-lockdown-are-a-harbinger-of-the-future/
Parsing https://euvsdisinfo.eu/report/the-italian-parliament-lowers-the-flag-of-the-european-union/
Parsing https://euvsdisinfo.eu/report/environmentalists-are-overjoyed-by-the-fear-around-covid-19-they-see-it-as-an-opportunity/
Parsing https://euvsdisinfo.eu/report/covid-19-a-large-scale-sociological-experiment-to-see-how-much-repression-the-people-will-accept/
Parsing https://euvsdisinfo.eu/report/it-is-too-early-to-tell-whether-any-extra-people-will-die-because-of-covid-19/
Parsing https://euvsdisinfo.eu/report/eu-has-collapsed-countries-are-building-borders/
Parsing https://euvsdisinfo.eu/report/european-identity-and-solidarity-are-in-crisis/
Parsing https://eu

In [7]:
# Column names for the enriched rows: the five listing fields first, then the
# ten article-detail fields in the order they were added to each entry.
result_columns = [
    'Title', 'Link', 'Date', 'Outlet', 'Country',
    'Summary', 'Disproof', 'Article Title', 'Article Source', 'Article Media',
    'Reported In', 'Publication Date', 'Audience', 'Article Country', 'Keywords',
]
df = pd.DataFrame(dataset, columns=result_columns)
display(df)

# Save the full table without the positional index column.
df.to_csv('euvsdisinfo_results.csv', index=False)

Unnamed: 0,Title,Link,Date,Outlet,Country,Summary,Disproof,Article Title,Article Source,Article Media,Reported In,Publication Date,Audience,Article Country,Keywords
0,"Blaming China for COVID-19 is a ploy, like bla...",https://euvsdisinfo.eu/report/blaming-china-fo...,01.04.2020,Journal of New Eastern Outlook,"EU, Russia, Ukraine, US, The Netherlands, Syria",The assumption that China created COVID 19 is ...,Conspiracy theory with no evidence given; a st...,"Disinfo: Blaming China for COVID-19 is a ploy,...",https://journal-neo.org/2020/03/31/covid-and-t...,Journal of New Eastern Outlook,Issue190,01/04/2020,English,"EU, Russia, Ukraine, US, The Netherlands, Syria","coronavirus, Conspiracy theory, Chemical weapo..."
1,USAID is a CIA-affiliated terror group that cr...,https://euvsdisinfo.eu/report/usaid-is-a-cia-a...,01.04.2020,Journal of New Eastern Outlook,"China, US","With the accusation against China, all that is...","This is a conspiracy theory, presented with no...",Disinfo: USAID is a CIA-affiliated terror grou...,https://journal-neo.org/2020/03/31/covid-and-t...,Journal of New Eastern Outlook,Issue190,01/04/2020,English,"China, US","coronavirus, Conspiracy theory, Biological wea..."
2,Authoritarian trends in the COVID-19 lockdown ...,https://euvsdisinfo.eu/report/authoritarian-tr...,01.04.2020,RT English,UK,What will 6 months of Covid-19 do to our socie...,Conspiracy theory with no evidence given; this...,Disinfo: Authoritarian trends in the COVID-19 ...,https://www.rt.com/op-ed/484652-covid-future-s...,RT English,Issue190,01/04/2020,English,UK,"coronavirus, Conspiracy theory"
3,The Italian Parliament lowers the flag of the ...,https://euvsdisinfo.eu/report/the-italian-parl...,01.04.2020,fondsk.ru,"EU, Italy",The Italian Parliament lowers the flag of the ...,A misleading headline to support a recurring n...,Disinfo: The Italian Parliament lowers the fla...,https://www.fondsk.ru/news/2020/03/31/v-italja...,fondsk.ru,Issue190,01/04/2020,Russian,"EU, Italy","coronavirus, EU disintegration"
4,Environmentalists are overjoyed by the fear ar...,https://euvsdisinfo.eu/report/environmentalist...,01.04.2020,RT English,"EU, UK, US",Many hardline environmentalists are overjoyed ...,Recurring pro-Kremlin conspiracy theory about ...,Disinfo: Environmentalists are overjoyed by th...,https://www.rt.com/op-ed/484627-covid-climate-...,RT English,Issue190,01/04/2020,English,"EU, UK, US","coronavirus, Conspiracy theory, Climate, Consp..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221,China is preparing to resist a US attack of ma...,https://euvsdisinfo.eu/report/coronavirus-chin...,24.01.2020,60 Minut @ Rossiya 1 [22:35 - 23:39],"China, US",China is preparing to resist an attack of weap...,Conspiracy theory.\nIn response to the outbrea...,Disinfo: China is preparing to resist a US att...,,,Issue 182,24/01/2020,Russian,"China, US","coronavirus, Lugar Laboratory, Biological weapons"
222,China coronavirus was predicted by Nostradamus,https://euvsdisinfo.eu/report/china-coronaviru...,24.01.2020,Ukraina.ru,China,A deciphered prediction of Nostradamus about a...,Conspiracy theory. This is a pro-Kremlin narra...,Disinfo: China coronavirus was predicted by No...,,,Issue 181,24/01/2020,Russian,China,"coronavirus, Conspiracy theory, Biological wea..."
223,The new coronavirus is a US biological weapon,https://euvsdisinfo.eu/report/the-new-corona-v...,23.01.2020,"Sputnik web Arabic, tvzvezda, tsargrad, akhbar...","China, US",The appearance of coronavirus could be the res...,This is not the first time the claims that the...,Disinfo: The new coronavirus is a US biologica...,,,Issue 181,23/01/2020,"Arabic, Russian","China, US","coronavirus, Conspiracy theory, Virus / bacter..."
224,A new Chinese coronavirus was likely elaborate...,https://euvsdisinfo.eu/report/a-new-chinese-co...,22.01.2020,Sputnik Belarus @ Novosti Shiolkovogo Puti tim...,"China, US",A new coronavirus coincides with a number of s...,This is a conspiracy theory aimed to cast a sh...,Disinfo: A new Chinese coronavirus was likely ...,,,Issue 181,22/01/2020,Russian,"China, US","coronavirus, Conspiracy theory, Virus / bacter..."


In [8]:
# Columns that come from the individual report pages (as opposed to the
# search-results listing).
article_data_cols = [
    'Summary',
    'Disproof',
    'Article Title',
    'Article Source',
    'Article Media',
    'Reported In',
    'Publication Date',
    'Audience',
    'Article Country',
    'Keywords',
]

# Keep only those columns, show them, and save them to their own CSV file
# without the positional index column.
df_articles = df[article_data_cols]
display(df_articles)
df_articles.to_csv('euvsdisinfo_articles.csv', index=False)

Unnamed: 0,Summary,Disproof,Article Title,Article Source,Article Media,Reported In,Publication Date,Audience,Article Country,Keywords
0,The assumption that China created COVID 19 is ...,Conspiracy theory with no evidence given; a st...,"Disinfo: Blaming China for COVID-19 is a ploy,...",https://journal-neo.org/2020/03/31/covid-and-t...,Journal of New Eastern Outlook,Issue190,01/04/2020,English,"EU, Russia, Ukraine, US, The Netherlands, Syria","coronavirus, Conspiracy theory, Chemical weapo..."
1,"With the accusation against China, all that is...","This is a conspiracy theory, presented with no...",Disinfo: USAID is a CIA-affiliated terror grou...,https://journal-neo.org/2020/03/31/covid-and-t...,Journal of New Eastern Outlook,Issue190,01/04/2020,English,"China, US","coronavirus, Conspiracy theory, Biological wea..."
2,What will 6 months of Covid-19 do to our socie...,Conspiracy theory with no evidence given; this...,Disinfo: Authoritarian trends in the COVID-19 ...,https://www.rt.com/op-ed/484652-covid-future-s...,RT English,Issue190,01/04/2020,English,UK,"coronavirus, Conspiracy theory"
3,The Italian Parliament lowers the flag of the ...,A misleading headline to support a recurring n...,Disinfo: The Italian Parliament lowers the fla...,https://www.fondsk.ru/news/2020/03/31/v-italja...,fondsk.ru,Issue190,01/04/2020,Russian,"EU, Italy","coronavirus, EU disintegration"
4,Many hardline environmentalists are overjoyed ...,Recurring pro-Kremlin conspiracy theory about ...,Disinfo: Environmentalists are overjoyed by th...,https://www.rt.com/op-ed/484627-covid-climate-...,RT English,Issue190,01/04/2020,English,"EU, UK, US","coronavirus, Conspiracy theory, Climate, Consp..."
...,...,...,...,...,...,...,...,...,...,...
221,China is preparing to resist an attack of weap...,Conspiracy theory.\nIn response to the outbrea...,Disinfo: China is preparing to resist a US att...,,,Issue 182,24/01/2020,Russian,"China, US","coronavirus, Lugar Laboratory, Biological weapons"
222,A deciphered prediction of Nostradamus about a...,Conspiracy theory. This is a pro-Kremlin narra...,Disinfo: China coronavirus was predicted by No...,,,Issue 181,24/01/2020,Russian,China,"coronavirus, Conspiracy theory, Biological wea..."
223,The appearance of coronavirus could be the res...,This is not the first time the claims that the...,Disinfo: The new coronavirus is a US biologica...,,,Issue 181,23/01/2020,"Arabic, Russian","China, US","coronavirus, Conspiracy theory, Virus / bacter..."
224,A new coronavirus coincides with a number of s...,This is a conspiracy theory aimed to cast a sh...,Disinfo: A new Chinese coronavirus was likely ...,,,Issue 181,22/01/2020,Russian,"China, US","coronavirus, Conspiracy theory, Virus / bacter..."
