## Capstone - Data Scraping

In [224]:
# Import Libraries

import pandas as pd
import numpy as np
import json
import urllib
import requests
import re
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import urllib2
from urllib2 import urlopen
from bs4 import BeautifulSoup
import html5lib
from selenium import webdriver
import time
import string

### Country Code Scraping

In [28]:
# Country Code Scraping - Country codes are needed in order to scrape data for individual countries

country_url = 'http://www.nationsonline.org/oneworld/country_code_list.htm'
driver = webdriver.Chrome('/Users/jayecribb/Downloads/chromedriver')
driver.get(country_url)
htmlSource = driver.page_source
soup = BeautifulSoup(htmlSource,'html5lib' )
# Get codes from soup
# Country codes are contained within a td tag with "text-align:center"
tds = soup.findAll('td', style = "text-align:center" ) 
country_codes = []

for td in tds[1::3]: # Country codes are in every third position 
    country_codes.append(td.text)

### Global Forest Watch Scraping

This webpage has dynamical content so in order to scrape it we need to use the library Selenium. Using the Webdriver, it will automatically open up the webpage as though we are using a browser and scrape through the dynamical content

#### Scraping functions for specified data

In [47]:
def tree_cover_value(soup):
    try:
        return soup.find('div', class_='tree-cover').find('span').string + soup.find('div', class_='tree-cover').find('span').findNext('span').text
    except:
        return np.nan

In [48]:
def total_area_percent(soup):
    try:
        return soup.find('div', class_='total-area').find('span', class_='amount').string
    except:
        return np.nan    

In [49]:
def total_loss(soup):
    try:
        return soup.find('div', class_='total-loss').contents[0]
    except:
        return np.nan

In [50]:
def total_loss_14(soup):
    try:
        return soup.find('div', class_='graph-amount').find('span').text
    except:
        return np.nan

In [51]:
def total_gain(soup):
    try:
        return soup.find('div', class_="graph-amount graph-gain-amount").find('span').text
    except:
        return np.nan

In [52]:
def natural(soup):
    try:
        return soup.find('div', class_ = 'forests-type-graph').findAll('text')[0].text
    except:
        return np.nan

In [53]:
def primary(soup):
    try:
        return soup.find('div', class_ = 'forests-type-graph').findAll('text')[1].text
    except:
        return np.nan

In [54]:
def planted(soup):
    try:
        return soup.find('div', class_ = 'forests-type-graph').findAll('text')[2].text
    except:
        return np.nan

In [55]:
def loss_outside_plantation(soup):
    try:
        return soup.find('p', class_ = "plantations-loss").text
    except:
        return np.nan

In [56]:
def loss_outside_plantation_per(soup):
    try:
        return soup.find('p', class_ = "plantations-percentage").text
    except:
        return np.nan

In [57]:
def certified_forest(soup):
    sections = soup.findAll('h3', class_="section-title" )
    try:
        return sections[1].find('span').text
    except:
        return np.nan    

In [458]:
def total_economy(soup):
    sections = soup.findAll('h3', class_="section-title" )
    try:
        return sections[2].findAll('span')[0].text
    except:
        return np.nan

In [459]:
def percentage_economy(soup):
    sections = soup.findAll('h3', class_="section-title" )
    try:
        return sections[2].findAll('span')[1].text
    except:
        return np.nan

In [460]:
def employment(soup):
    sections = soup.findAll('h3', class_="section-title" )
    try:
        return sections[3].text
    except:
        return np.nan

In [461]:
def reforestation_rate_10(soup):
    sections = soup.findAll('h3', class_="section-title" )
    try:
        return sections[4].find('span').text
    except:
        return np.nan

In [462]:
def carbon(soup):
    sections = soup.findAll('h3', class_="section-title" )
    try:
        return sections[5].find('span').text
    except:
        return np.nan

In [463]:
def GHG_emissions(soup):
    sections = soup.findAll('h3', class_="section-title" )
    try:
        return sections[6].find('span').text
    except:
        return np.nan 

In [176]:
def conventions(soup):
    
    convention = []
    
    try:
        conventions = soup.find('section', class_="country-section country-conventions-section" ).findAll('span')
        for i in conventions:
            convention.append(i.text)

        return convention
    except:
        return [np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan]  

### Scraping

Scraping needs to be implented in batches due to the computing power of my laptop. Scraping is done in batches of 20, each time creating a new data frame and saving to csv. Due to the output of the Conventions function, this will need to be scraped to a seperate data frame and eventually joined to the others.

In [77]:
# Scrape Function

data = []

for country in country_codes[240:]:
    url_template = 'http://www.globalforestwatch.org/country/'+ country
    driver = webdriver.Chrome('/Users/jayecribb/Downloads/chromedriver')
    
    try:
        driver.get(url_template)
        htmlSource = driver.page_source
        soup = BeautifulSoup(htmlSource,'html5lib' )
        data.append([country, tree_cover_value(soup), total_area_percent(soup), total_loss_14(soup),
              total_gain(soup), natural(soup), primary(soup), planted(soup), loss_outside_plantation(soup),
              loss_outside_plantation_per(soup),certified_forest(soup), total_economy(soup),
              percentage_economy(soup), employment(soup), reforestation_rate_10(soup),
              carbon(soup), GHG_emissions(soup)])
        time.sleep(2)
    except:
        pass
        
df = pd.DataFrame(data)
df.to_csv('data13.csv', encoding = 'utf-8', header = False)    

In [193]:
# Scrape conventions seperatley

data = []

for country in country_codes[240:]:
    url_template = 'http://www.globalforestwatch.org/country/'+ country
    driver = webdriver.Chrome('/Users/jayecribb/Downloads/chromedriver')
    
    try:
        driver.get(url_template)
        htmlSource = driver.page_source
        soup = BeautifulSoup(htmlSource,'html5lib' )
        data.append(conventions(soup))
        time.sleep(2)
    except:
        pass
        
df = pd.DataFrame(data)
df.to_csv('convention13.csv', encoding = 'utf-8', header = False)    

In [497]:
# Read and concatinate scraped files
df01 = pd.read_csv('data01.csv', header = None)
df02 = pd.read_csv('data02.csv', header = None)
df03 = pd.read_csv('data03.csv', header = None)
df04 = pd.read_csv('data04.csv', header = None)
df05 = pd.read_csv('data05.csv', header = None)
df06 = pd.read_csv('data06.csv', header = None)
df07 = pd.read_csv('data07.csv', header = None)
df08 = pd.read_csv('data08.csv', header = None)
df09 = pd.read_csv('data09.csv', header = None)
df10 = pd.read_csv('data10.csv', header = None)
df11 = pd.read_csv('data11.csv', header = None)
df12 = pd.read_csv('data12.csv', header = None)
df13 = pd.read_csv('data13.csv', header = None)
all_data = pd.concat([df01,df02,df03,df04,df05,df06,df07,df08,df09,df10,df11,df12,df13 ])
all_data.drop(0, axis = 1, inplace = True)

In [243]:
# Defining feature names

all_data.columns = ['country', 'tree_cover', 'tree_cover_%_2000', 'tree_cover_loss_2014',
                       'cover_gain', 'forest_type_nat', 'forest_type_pri', 'forest_type_pla',
                       'loss_outside_plant', 'loss_outside_plant_%','certified_forest', 'total_economy',
                       'percentage_economy', 'employment', 'reforestation_rate_10','carbon',
                       'GHG_emissions']

ValueError: Length mismatch: Expected axis has 18 elements, new values have 17 elements

In [122]:
# Save concatinated data frames to csv. After inspecting this data frame a few features needed ammending. 
# This was implemented in Excel as the data frame wasn't too large 

all_data.to_csv('all_data.csv')

In [498]:
# Loading in ammended data frame

all_data = pd.read_excel('new_data.xlsx')

In [486]:
# Read and concatinate convention data frames

dfc01 = pd.read_csv('convention01.csv', header = None)
dfc02 = pd.read_csv('convention02.csv', header = None)
dfc03 = pd.read_csv('convention03.csv', header = None)
dfc04 = pd.read_csv('convention04.csv', header = None)
dfc05 = pd.read_csv('convention05.csv', header = None)
dfc06 = pd.read_csv('convention06.csv', header = None)
dfc07 = pd.read_csv('convention07.csv', header = None)
dfc08 = pd.read_csv('convention08.csv', header = None)
dfc09 = pd.read_csv('convention09.csv', header = None)
dfc10 = pd.read_csv('convention10.csv', header = None)
dfc11 = pd.read_csv('convention11.csv', header = None)
dfc12 = pd.read_csv('convention12.csv', header = None)
dfc13 = pd.read_csv('convention13.csv', header = None)
conventions = pd.concat([dfc01,dfc02,dfc03,dfc04,dfc05,dfc06,dfc07,dfc08,dfc09,dfc10,dfc11,dfc12,dfc13 ])
conventions.drop(0, axis = 1, inplace = True)
conventions.columns = ['CBD', 'UNFCCC', 'KYOTO', 'UNCCD', 'ITTA', 'CITES', 'RAMSAR', 'WORLD HERITAGE',
                      'NLBI', 'ILO 169']

In [487]:
# Merging the two data frames by index

conventions.reset_index(inplace=True)
conventions.drop('index', inplace = True, axis = 1)
all_data.reset_index(inplace=True)
first_merge = pd.concat([all_data, conventions], axis = 1)

In [488]:
conventions

Unnamed: 0,CBD,UNFCCC,KYOTO,UNCCD,ITTA,CITES,RAMSAR,WORLD HERITAGE,NLBI,ILO 169
0,Ratification,Non-Annex I Party,Accession,Accession,Non Member,Accession,Non Party,Ratification,Member State,Not Ratified
1,,,,,,,,,,
2,Accession,Non-Annex I Party,Accession,Accession,Consuming Member,Accession,Contracting Party,Ratification,Member State,Not Ratified
3,Ratification,Non-Annex I Party,Accession,Ratification,Non Member,Accession,Contracting Party,Ratification,Member State,Not Ratified
4,,,,,,,,,,
5,,,,,,,,,,
6,Ratification,Non-Annex I Party,Accession,Ratification,Non Member,Non Party,Non Party,Ratification,Member State,Not Ratified
7,,,,,,,,,,
8,,,,,,,,,,
9,,,,,,,,,,


In [409]:
#all_data.to_csv('country_names_data.csv', encoding = 'utf-8')

Another data set to be included is economic data, aggregated by country. This data was collected from http://data.worldbank.org/ and downloaded as a csv file

In [467]:
# Reading economic data
economics = pd.read_csv('economic_data.csv')

In [468]:
# Merge and save to csv
second_merge = first_merge.merge(economics, how = 'inner', left_on='country_name', right_on='Country' )
second_merge.to_csv('final_data.csv', encoding = 'utf-8')

In [495]:
#all_data.to_csv('country_names_data.csv', encoding = 'utf-8')


In [494]:
# Import data to Local Postgres Database
import psycopg2
import sqlalchemy
from sqlalchemy import create_engine

engine = create_engine('postgresql://jayecribb@localhost:5432/capstone')
economics.to_sql('economics',  engine, if_exists='replace')
conventions.to_sql('conventions',  engine, if_exists='replace')
all_data.to_sql('forests',  engine, if_exists='replace')

In [500]:
all_data[all_data['country_code']=='GBR']

Unnamed: 0,country_name,country_code,tree_cover,tree_cover_%_2000,tree_cover_loss_2014,cover_gain,forest_type_nat,forest_type_pri,forest_type_pla,loss_outside_plant,loss_outside_plant_%,certified_forest,total_economy,percentage_economy,employment,reforestation_rate_10,carbon,GHG_emissions
233,United Kingdom,GBR,3.6MHa,,,,0.23,,0.77,,,"2,949,464 ha",USD 9.5 billion,0.4% of the GDP.,134 thousand people are directly employed by t...,14 kha/year,237 million metric tons of carbon stocks,


In [503]:
data = []


url_template = 'http://www.globalforestwatch.org/country/'+ 'SDN'
driver = webdriver.Chrome('/Users/jayecribb/Downloads/chromedriver')

try:
    driver.get(url_template)
    htmlSource = driver.page_source
    soup = BeautifulSoup(htmlSource,'html5lib' )
    data.append([country, tree_cover_value(soup), total_area_percent(soup), total_loss_14(soup),
          total_gain(soup), natural(soup), primary(soup), planted(soup), loss_outside_plantation(soup),
          loss_outside_plantation_per(soup),certified_forest(soup), total_economy(soup),
          percentage_economy(soup), employment(soup), reforestation_rate_10(soup),
          carbon(soup), GHG_emissions(soup)])
    time.sleep(2)
except:
    pass

In [504]:
data

[[u'ZWE',
  u'74KHa',
  u'0',
  u'4',
  u'8',
  u'61%',
  u'7%',
  u'32%',
  nan,
  nan,
  u'0 ha ',
  u'USD 117.2 million',
  u'0.2% of the GDP.',
  u'5 thousand people are directly employed by the forestry sector, according to 2011 FAO data.',
  u'5940 kha/year',
  u'153 million metric tons of carbon stocks',
  u'16.2% of GHG emissions']]