# Generation of Classical Swine Fever outbreak dataset - Japan 2018/2019

This workbook generates a .csv-file with data on the current outbreaks of classical swine fever in Japan. 

The data is scraped from the following site: http://www.oie.int/wahis_2/public/wahid.php/Reviewreport/Review?reportid=27871

Every week, give or take, a new report is published. The entire data collection process is rather slow - maybe 10-20 min. in all.

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from datetime import datetime
from time import sleep
from random import randint

from geopy.geocoders import Nominatim
import os

A single URL, such as http://www.oie.int/wahis_2/public/wahid.php/Reviewreport/Review?page_refer=MapFullEventReport&reportid=29772, only contains the data of one report. To collect the data from all reports, I have to scrape the URL of every existing report. Luckily, every report contains links to all the other reports.

In [2]:
## To collect all urls I start by scraping the html from one of the reports. Which report should be unimportant.
url_random = 'http://www.oie.int/wahis_2/public/wahid.php/Reviewreport/Review?page_refer=MapFullEventReport&reportid=29772'

## A timeout is set so the cell cannot hang forever on a server that never answers,
## and raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url_random, timeout=60)
response.raise_for_status()

html = response.text
soup = BeautifulSoup(html, 'html')

In [3]:
## Every <a> tag in the scraped page is visited and its 'href' attribute is stored.
## Tags without an 'href' give None.
anchors = soup.find_all('a')
links = [anchor.get('href') for anchor in anchors]

print('Number of hyperlinks in the initial webpage:', len(links))
print(links[:2])

## The first entry does not point to a report, so it is dropped:
links = links[1:]
print('Number of relevant hyperlinks:', len(links))
links[:2]

Number of hyperlinks in the initial webpage: 29
[None, "javascript:open_report('/wahis_2/public/wahid.php/Reviewreport/Review?', '27871')"]
Number of relevant hyperlinks: 28


["javascript:open_report('/wahis_2/public/wahid.php/Reviewreport/Review?', '27871')",
 "javascript:open_report('/wahis_2/public/wahid.php/Reviewreport/Review?', '27924')"]

In [4]:
## From each hyperlink I collect the unique report-ID.
import re

## The hyperlinks look like:
##   javascript:open_report('/wahis_2/public/wahid.php/Reviewreport/Review?', '27871')
## The ID is the quoted run of digits that closes the open_report(...) call. Matching on
## that pattern (instead of slicing at fixed positions and checking the first digit)
## keeps the cell working when the ID gets longer or starts with another digit.
report_id_pattern = re.compile(r"open_report\([^)]*'(\d+)'\s*\)")

report_number = []

for link in links:
    ## <a> tags without an href give None (or an empty string); they hold no report-ID.
    if not link:
        continue
    found = report_id_pattern.search(link)
    if found:
        report_number.append(found.group(1))

print('Number of report numbers:', len(report_number))
report_number[0:2]

Number of report numbers: 28


['27871', '27924']

In [5]:
## Each report-ID from the report_number list is appended to the shared URL prefix,
## which gives the full URL of that report.
base_url = 'http://www.oie.int/wahis_2/public/wahid.php/Reviewreport/Review?page_refer=MapFullEventReport&reportid='

all_links = []
for report_id in report_number:
    all_links.append(base_url + report_id)

print('Number of links:', len(all_links))

all_links[0:2]

Number of links: 28


['http://www.oie.int/wahis_2/public/wahid.php/Reviewreport/Review?page_refer=MapFullEventReport&reportid=27871',
 'http://www.oie.int/wahis_2/public/wahid.php/Reviewreport/Review?page_refer=MapFullEventReport&reportid=27924']

In [6]:
## Here I loop through all the URLs to make a list of the parsed html of all the reports.
## Takes around 10 minutes to load due to slow response from oie.int and a time delay.
##(The delay might be adjusted without problems though)

all_html = []
for link in all_links:
    ## Random pause of 8-15 seconds before every request.
    sleep(randint(8,15))

    ## The timeout keeps a stalled connection from blocking the loop forever.
    response = requests.get(link, timeout=60)
    ## A failed download stops the loop here; otherwise an error page would be stored
    ## as if it were a report and only fail later, when the tables are read.
    response.raise_for_status()

    html = response.text
    soup = BeautifulSoup(html, 'html')
    all_html.append(soup)

print('Number of html scrapes:', len(all_html))

Number of html scrapes: 28


In [35]:
## Now I sort and alter the data into lists to get my data columns.

## One list per data column. Every outbreak appends exactly one value to each list,
## so the lists stay aligned row by row.
outbreak_id = []
city = []
prefecture = []
outbreak_date = []
species = []
no_susceptible = []
no_cases = []
no_deaths = []

## Looping through all the report html scrapes
for report in all_html:
    ## The report number: the last two characters of the first <td width="30%"> cell.
    report_no = report.find('td', {'width':'30%'}).text[-2:]

    ## Every table with class 'TableFoyers' is flattened into a list of its stripped cell texts.
    ## A table counts as an outbreak when a cell contains 'Outbreak' together with a digit.
    ## any() makes sure a table is stored only once, even if several of its cells match
    ## (appending once per matching cell would duplicate the outbreak).
    outbreaks = []
    for table in report.find_all('table', {'class':'TableFoyers'}):
        row = [cell.text.strip() for cell in table.find_all('td')]
        if any('Outbreak' in text and any(char.isdigit() for char in text) for text in row):
            outbreaks.append(row)

    ## Empty cells are replaced by '0'
    outbreaks = [[element or '0' for element in outbreak] for outbreak in outbreaks]
    for outbreak in outbreaks:

        ## outbreak_id: the report number, a dot, and the first cell from character 9 onwards
        ## (presumably what follows the 9 characters of 'Outbreak ' - verify against the page).
        outbreak_id.append(report_no + '.' + outbreak[0][9:100])

        ## The city and prefecture are read from the second cell. Hyphens become spaces
        ## and the text is split on single spaces; the first word is used as the city.
        words = outbreak[1].replace('-',' ').split(" ")
        city.append(words[0])
        if words[2] == 'City,':                                     # Must be added because one is called "Higashi Osaka City, Osaka"
            prefecture.append(words[3])
        else:
            prefecture.append(words[2])

        ## Date of outbreak (day/month/year in the report)
        outbreak_date.append(datetime.strptime(outbreak[3], '%d/%m/%Y').date())

        ## NOTE: the cell positions 16-19 below assume the table layout is the same
        ## in every report - to be re-checked if the website changes.

        ## Species
        species.append(outbreak[16])

        ## Number of susceptible animals
        no_susceptible.append(outbreak[17])

        ## Number of cases
        no_cases.append(outbreak[18])

        ## Number of deaths
        no_deaths.append(outbreak[19])


In [36]:
# Quick check of the parsed data: the length of the city list is the number of outbreaks.
outbreak_count = len(city)
print('Number of outbreaks:', outbreak_count)

# For a closer look, print any of the other column lists:
# outbreak_id, city, prefecture, outbreak_date, species,
# no_susceptible, no_cases, no_deaths

Number of outbreaks: 291


Finally, I build the dataset from all the data collected from the OIE:

In [38]:
## Each column list gets its column name; the order below is the column order of the dataframe.
columns = {
    'ID': outbreak_id,
    'City': city,
    'Prefecture': prefecture,
    'Date': outbreak_date,
    'Species': species,
    'Susceptibles': no_susceptible,
    'Cases': no_cases,
    'Deaths': no_deaths,
}
csf = pd.DataFrame(columns)

## Overview of the columns and a look at the first rows
print(csf.info())
print(csf.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291 entries, 0 to 290
Data columns (total 8 columns):
ID              291 non-null object
City            291 non-null object
Prefecture      291 non-null object
Date            291 non-null object
Species         291 non-null object
Susceptibles    291 non-null object
Cases           291 non-null object
Deaths          291 non-null object
dtypes: object(8)
memory usage: 18.3+ KB
None
          ID  City Prefecture        Date                       Species  \
0  on.1  (1)  Gifu       Gifu  2018-09-03                         Swine   
1        1.1  Gifu       Gifu  2018-09-13  Wild boar:Sus scrofa(Suidae)   
2        2.1  Gifu       Gifu  2018-09-15  Wild boar:Sus scrofa(Suidae)   
3        2.2  Gifu       Gifu  2018-09-18  Wild boar:Sus scrofa(Suidae)   
4        2.3  Gifu       Gifu  2018-09-21  Wild boar:Sus scrofa(Suidae)   

  Susceptibles Cases Deaths  
0          610    29     29  
1            0     1      1  
2            0     1 

### Collection of outbreak coordinates
I want to be able to make a map, so I also collect geospatial data. For this I use the Nominatim geocoder from the geopy package.

In [39]:
## The city names are cleaned up so they can be used as search terms for the geocoder.
## Only the first whitespace-separated word of each city name is kept.
csf['City'] = csf['City'].str.split().str[0]

## 'Ohmihachiman' does not work with the geocoder, so cells holding exactly that value
## (in any column) are replaced by 'Omihachiman'.
csf = csf.replace(to_replace='Ohmihachiman', value='Omihachiman')

## Report 26 states two outbreaks in Tahara, Gifu, but Tahara is in Aichi
is_tahara = csf['City'] == 'Tahara'
csf['Prefecture'] = np.where(is_tahara, 'Aichi', csf['Prefecture'])

## The search term for the geocoder: city and prefecture separated by a space
csf['Geo_Lookup'] = csf['City'] + ' ' + csf['Prefecture']



In [40]:
### OBS: Slow - every distinct place costs one web request. To avoid being cut off I include a timeout ###

geolocator = Nominatim(user_agent="school project")

## Many outbreaks share the same place, so each distinct search term is looked up only
## once and the coordinates are reused for the remaining rows.
geocoded = {}

latitude = []
longitude = []
## The loop variable is called 'lookup' so that the 'city' list made earlier is not overwritten.
for lookup in csf['Geo_Lookup']:
    if lookup not in geocoded:
        try:
            location = geolocator.geocode(lookup, timeout=50)
        except Exception:
            print(lookup)      # If an error occurs the whole loop stops. This is included to know what throws the error.
            raise
        ## geocode() gives None when nothing is found; fail with a clear message
        ## instead of an AttributeError on None.
        if location is None:
            print(lookup)
            raise ValueError('No coordinates found for: ' + str(lookup))
        geocoded[lookup] = (location.latitude, location.longitude)
        sleep(1)               # Pause between real requests to stay within the public server's rate limit

    lat, lon = geocoded[lookup]
    latitude.append(lat)
    longitude.append(lon)
  

In [41]:
# Both coordinate lists should have one entry per observation, so their lengths are printed:
for coordinate_list in (latitude, longitude):
    print(len(coordinate_list))

291
291


In [42]:
# The two coordinate lists become the columns of a separate dataframe
coordinates = pd.DataFrame(data={'Latitude': latitude, 'Longitude': longitude})

# The overview of both dataframes is printed to compare the number of rows
for frame in (coordinates, csf):
    print(frame.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291 entries, 0 to 290
Data columns (total 2 columns):
Latitude     291 non-null float64
Longitude    291 non-null float64
dtypes: float64(2)
memory usage: 4.6 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291 entries, 0 to 290
Data columns (total 9 columns):
ID              291 non-null object
City            291 non-null object
Prefecture      291 non-null object
Date            291 non-null object
Species         291 non-null object
Susceptibles    291 non-null object
Cases           291 non-null object
Deaths          291 non-null object
Geo_Lookup      291 non-null object
dtypes: object(9)
memory usage: 20.5+ KB
None


In [43]:
# Both coordinate columns are copied into the big dataset
for column in ('Latitude', 'Longitude'):
    csf[column] = coordinates[column]

# Prints all rows except the last four
print(csf.head(-4))

                ID          City Prefecture        Date  \
0        on.1  (1)          Gifu       Gifu  2018-09-03   
1              1.1          Gifu       Gifu  2018-09-13   
2              2.1          Gifu       Gifu  2018-09-15   
3              2.2          Gifu       Gifu  2018-09-18   
4              2.3          Gifu       Gifu  2018-09-21   
5              3.1          Gifu       Gifu  2018-09-26   
6              3.2  Kakamigahara       Gifu  2018-09-27   
7              3.3  Kakamigahara       Gifu  2018-09-28   
8              3.4          Gifu       Gifu  2018-09-28   
9              4.1          Gifu       Gifu  2018-09-30   
10             4.2          Gifu       Gifu  2018-10-02   
11             4.3  Kakamigahara       Gifu  2018-10-03   
12             4.4  Kakamigahara       Gifu  2018-10-03   
13             4.5          Gifu       Gifu  2018-10-04   
14             4.6          Gifu       Gifu  2018-10-05   
15             4.7          Gifu       Gifu  2018-10-05 

In [44]:
# The current dataset is written to the working directory (without the index column).
export_csv = csf.to_csv('CSF_Japan_data.csv', index = False)

# All generated datasets are also kept in a separate folder, just for safe keeping.
# The folder is created the first time the notebook runs.
archive_folder = 'Old datasets'
if not os.path.exists(archive_folder):
    os.mkdir(archive_folder)

# The archived copy carries today's date (day, month, year) in its file name.
todays_date = datetime.now().strftime('%d%m%Y')
archive_file = 'Old datasets/CSF_Outbreaks_Japan_' + todays_date + '.csv'
export_csv_date = csf.to_csv(archive_file, index = False)