In [None]:
import spacy
spacy.load('en_core_web_sm')
from epitator.annotator import AnnoDoc
from epitator.geoname_annotator import GeonameAnnotator
from epitator.resolved_keyword_annotator import ResolvedKeywordAnnotator
from epitator.count_annotator import CountAnnotator
from epitator.date_annotator import DateAnnotator
from boilerpipe.extract import Extractor
from itertools import groupby
import datetime
import sys
import pandas as pd
import re
from tqdm import tqdm_notebook as tqdm
import numpy as np
import epitator

# Scrape

In [None]:
from who_scraper import *
# Collect the links of all WHO DONs published in 2018 (no proxies are used)
all_links = scrape(proxies=None, years=['2018'])
# Extract the main text of the given links
from boilerpipe.extract import Extractor
def extract(list_of_links):
    """Extracts the main content from one or more links and returns a list of texts (str)

    list_of_links -- a single URL (str) or an iterable of URLs of the webpages
                     to get the main content from

    Returns one text per URL, in input order, with all newline characters removed.
    """
    # A bare string would otherwise be iterated character by character.
    # isinstance (instead of an exact type comparison) also recognises str
    # subclasses such as numpy.str_.
    if isinstance(list_of_links, str):
        list_of_links = [list_of_links]
    texts = []
    for url in tqdm(list_of_links):
        article = Extractor(extractor='ArticleExtractor', url=url)
        texts.append(article.getText().replace('\n', ''))
    return texts
# Extract the text behind every scraped link, annotate it and tabulate the annotations
who_texts = extract(all_links)
who_annotations = create_annotated_database(who_texts)
parsed_whos_df = pd.DataFrame.from_dict(who_annotations)
parsed_whos_df.head()


## Compare with the Ereignisdatenbank (incident database). The code from here on has not been moved into a .py module.

In [None]:
# Read the whole Ereignisdatenbank CSV (semicolon-separated);
# the columns mentioning sources are selected in a later cell
ereignisdatenbank = pd.read_csv(
    "Ereignisse_utf8.csv",
    sep=";",
)

In [None]:
# The columns at positions 15 to 25 are the ones mentioning sources
source_columns = ereignisdatenbank.iloc[:, 15:26]
# Drop the rows without any source (the empty rows at the end) and renumber from 0
sources = source_columns.dropna(how="all").reset_index(drop=True)
sources.head()

In [None]:
# Keep only the rows in which at least one source column mentions both "who" and "don".
# Previously the "don" test overwrote the result of the "who" test, so "who" was
# never actually checked.
mask = pd.DataFrame(index=sources.index) # Create a mask for filtering
for column in sources.columns:
    mentions_who = sources[column].str.contains('who', na=False)
    mentions_don = sources[column].str.contains('don', na=False)
    mask[column] = mentions_who & mentions_don
# Boolean row selection keeps the index labels of the matching rows,
# exactly like dropping all non-matching rows would
sources_filtered = sources[mask.any(axis=1)]

In [None]:
from datenbank_clean import *
# Apply edb_to_timestamp to every column of the filtered sources
# (DataFrame.apply works column by column by default).
# edb_to_timestamp is not defined in this notebook; presumably it comes from the
# star import of datenbank_clean and converts date entries to timestamps — verify.
# NOTE(review): sources_filtered is overwritten, so re-running this cell applies
# the conversion to already converted values — confirm that this is harmless.
sources_filtered = sources_filtered.apply(edb_to_timestamp)

In [None]:
# Idea which is not correct and not complete:
# collect, per date column, the positions of the WHO reports whose truncated date
# string also occurs among the truncated date strings of that column.
# The truncated WHO dates do not depend on the column or the row, so they are
# computed once instead of once per row and column.
who_dates = [str(date)[:-3] for date in parsed_whos_df["date"].tolist()]
date_matches = {}
for column in sources_filtered.columns[::-3]: # Use only the columns mentioning dates
    column_dates = {str(date)[:-12] for date in sources_filtered[column].tolist()}
    date_matches[column] = [i for i, who_date in enumerate(who_dates)
                            if who_date in column_dates]

In [None]:
import itertools
# Merge the matches of all date columns and remove duplicate positions
matches_per_column = date_matches.values()
unique_positions = set(itertools.chain(*matches_per_column))
indices_that_matched = list(unique_positions)

In [None]:
# Show the columns at positions 3, 6, 7 and 9 of the Ereignisdatenbank rows whose sources passed the filter.
# NOTE(review): the index labels of sources_filtered are used as row positions here; this only
# lines up if dropna() removed nothing but trailing rows when `sources` was built — TODO confirm
ereignisdatenbank.iloc[sources_filtered.index.tolist(),[3,6,7,9]]

In [None]:
# Matched WHO reports (all columns except the first one), ordered by date
matched_whos = parsed_whos_df.iloc[indices_that_matched, 1:]
matched_whos.sort_values(by="date")

In [None]:
# Prettify the link description: take the part of each link between "don/" and "/en",
# turn its first two hyphens into spaces, the next two into ", " and the rest into spaces
link_description = [
    re.search(r'don/(.*)/en', link)[1]
    .replace('-', ' ', 2)
    .replace('-', ', ', 2)
    .replace('-', ' ')
    for link in all_links
]

In [None]:
# Extract the most important columns (positions 1, 3 and 4) into an independent copy
important_columns = [1, 3, 4]
compare = parsed_whos_df.iloc[:, important_columns].copy()
# Attach the prettified descriptions row by row
descriptions = pd.Series(link_description, index=compare.index)
compare['link_description'] = descriptions

# To present

In [None]:
# Show the "date" entry of the first row as an example
compare["date"].iloc[0]

In [None]:
# These are the links that were faulty during annotation: their "keyword" entry is missing
keyword_missing = compare["keyword"].isnull()
to_check = compare.loc[keyword_missing].index.values
# Get the links that caused the bad annotations
links_as_array = np.asarray(all_links)
links_to_check = links_as_array[to_check]

In [None]:
# Extract and annotate the faulty links once more, this time with raw=True
faulty_texts = extract(links_to_check)
annotated_faulty_text = create_annotated_database(faulty_texts, raw=True)

In [None]:
# Tabulate the annotations of the faulty links, leaving out the first column
faulty_table = pd.DataFrame.from_dict(annotated_faulty_text)
faulty_df = faulty_table.iloc[:, 1:]
faulty_df

In [None]:
# Entries whose "geoname" value has more than one element,
# i.e. a geographical entity occurred more than once
geoname_counts = parsed_whos_df["geoname"].str.len()
parsed_whos_df[geoname_counts > 1]

# Geo Tests

In [None]:
import pycountry

In [None]:
# Names of all countries in the pycountry database.
# Iterating the database directly avoids rebuilding the complete country list
# for every single index, which the previous version did.
country_names = [country.name for country in pycountry.countries]

In [None]:
# Display the country names in sorted order (country_names itself stays unchanged)
sorted(country_names)

In [None]:
from googletrans import Translator

In [None]:
import geograpy
# Try geograpy on an example news article
url = 'http://www.bbc.com/news/world-europe-26919928'
# presumably this downloads the page and extracts the places mentioned in it — verify against the geograpy docs
places = geograpy.get_place_context(url=url)

# Goodness Test

In [None]:
# Preview the filtered sources before they are used for the comparison
sources_filtered.head()

In [None]:
# Replace missing values with the string "nan" so that the link selection
# in a later cell can test every entry with string operations
sources_filtered = sources_filtered.fillna("nan")

In [None]:
# Use the first source link unless it mentions "pdf" or is missing ("nan");
# in those cases fall back to the second source link
first_links = sources_filtered["Link zur Quelle 1"]
second_links = sources_filtered["Link zur Quelle 2"]
links_from_sources_filtered = [
    first_link if ("pdf" not in first_link) and (first_link != "nan") else second_link
    for first_link, second_link in zip(first_links, second_links)
]

In [None]:
# Extract the main text behind every selected source link (one text per link)
extracted = extract(links_from_sources_filtered)

In [None]:
# Annotate the extracted texts with raw=True
# (create_annotated_database is not defined in this notebook; presumably it comes from a star import — verify)
parsed_links_ereignisdatenbank = create_annotated_database(extracted,raw=True)

In [None]:
# Tabulate the annotations of the Ereignisdatenbank sources, leaving out the first column
df = pd.DataFrame.from_dict(parsed_links_ereignisdatenbank)
df = df.iloc[:,1:]
# `links` is not defined in any visible cell of this notebook. The annotated texts were
# extracted from links_from_sources_filtered, so that list holds the matching links.
df["links"] = links_from_sources_filtered

In [None]:
# Rows of the Ereignisdatenbank that belong to the filtered sources, restricted to
# the columns (by position) used in the comparison and renumbered from 0
rows_to_compare = sources_filtered.index.tolist()
columns_to_compare = [3,6,7,8,10,11,12,13,14]
selected_entries = ereignisdatenbank.iloc[rows_to_compare, columns_to_compare]
to_compare_from_ereignisdatenbank = selected_entries.reset_index().drop("index", axis=1)

In [None]:
# Show the probable case counts as integers while keeping missing values.
# The previous lambda likely failed for two reasons: np.isnan() raises a TypeError for
# entries that are not floats (e.g. strings), and a column that mixes ints with NaN is
# stored as float again anyway. Parsing the entries first and then switching to the
# nullable integer dtype "Int64" handles both; unparsable entries become missing values.
pd.to_numeric(to_compare_from_ereignisdatenbank["Warcheinlische Fälle"], errors="coerce").astype("Int64")

In [None]:
# Try the extraction on a single ProMED-mail post; a bare string is accepted and yields a one-element list
extract("http://www.promedmail.org/post/5838919")

In [None]:
# Put the annotations and the Ereignisdatenbank columns side by side (aligned on the index)
frames_to_join = [df, to_compare_from_ereignisdatenbank]
comparison_df = pd.concat(frames_to_join, axis="columns", sort=False)

In [None]:
# Store the "date" column with the generic object dtype
comparison_df["date"] = comparison_df["date"].astype(object) # To restore lists

## Date comparison

In [None]:
# ereignisdatenbank_to_timestamp is not defined in this notebook; presumably it comes from
# datenbank_clean and converts the Ereignisdatenbank date columns to timestamps — verify
comparison_df = ereignisdatenbank_to_timestamp(comparison_df)
# Collect the columns at positions 7 and 8 into one list per row
comparison_df["combined_dates"] = comparison_df.iloc[:,7:9].values.tolist()

In [None]:
# Collect the columns at positions 10 and 12 into one list per row
comparison_df["combined_counts"] = comparison_df.iloc[:,[10,12]].values.tolist()

In [None]:
# Keep only the columns at positions 0, 1, 2, 3, 5, 13 and 14.
# NOTE(review): comparison_df is overwritten with fewer columns, so re-running this
# cell selects by position from the already reduced table — run it only once per kernel.
comparison_df = comparison_df.iloc[:,[0,1,2,3,5,13,14]]

In [None]:
# Preview the reduced comparison table
comparison_df.head()

In [None]:
# For every row, test for each combined date / count whether it is contained in
# at least one entry of the row's "date" / "confirmed_cases" value
matches = {"date": [], "cases": []}
for _, row in comparison_df.iterrows():
    row_counts = row['combined_counts']
    print(row_counts)

    date_hits = []
    for date in row['combined_dates']:
        date_hits.append(any(date in epi_date for epi_date in row["date"]))
    matches["date"].append(date_hits)

    case_hits = []
    for count in row_counts:
        case_hits.append(any(count in epi_case for epi_case in row["confirmed_cases"]))
    matches["cases"].append(case_hits)

In [None]:
# Show the per-row results of the case count comparison
matches["cases"]

In [None]:
# Hand-written example: a list of date strings to search in
epi = [
    '2018-04-01', '2018-03-02', '2018-01-17', '2018-04-06',
    '2018-01-17', '2018-03-02', '2018-03-03', '2018-03-07',
    '2018-03-08', '2018-03-08', '2018-03-09', '2018-03-08',
    '2018-04-04', '2018-04-02',
]

In [None]:
# Hand-written example: the entries to look for, one date string and one "nan" placeholder
erg = [
    '2018-01-17',
    'nan',
]

In [None]:
# Minimal reproduction of the date matching: for each entry of erg, test whether
# it is contained (as a substring) in at least one of the strings in epi
[
    any(wanted in found_date for found_date in epi)
    for wanted in erg
]