## Web Scraping
### Henrique Martins Prado
##### References: https://towardsdatascience.com/web-scraping-5649074f3ead

#### Bibliotecas

In [1]:
import requests
from bs4 import BeautifulSoup

### Etapa 1

#### Requests URLs

In [2]:
# A primeira coisa a ser feita é fazer o download da página e para isso é utilizada a bilbioteca requests
page = requests.get("https://www.fasttrack.co.uk/league-tables/tech-track-100/league-table/")

#### Fetch Web Page

In [3]:
# Utilizo a biblioteca BeautifulSoup para analisar o HTML. Em outras palavras, extrair os dados que vou precisar
soup = BeautifulSoup(page.content, 'html.parser')

# Apenas apresentar o HTML captado. Se torna uma boa prática esta etapa a fim de verificar se foram captados corretamente os dados
print(soup.prettify())

<!-- Template Name: League Table page
-->
<!DOCTYPE html>
<!--[if lt IE 7 ]> <html class="ie ie6 no-js" lang="en-GB"> <![endif]-->
<!--[if IE 7 ]>    <html class="ie ie7 no-js" lang="en-GB"> <![endif]-->
<!--[if IE 8 ]>    <html class="ie ie8 no-js" lang="en-GB"> <![endif]-->
<!--[if IE 9 ]>    <html class="ie ie9 no-js" lang="en-GB"> <![endif]-->
<!--[if gt IE 9]><!-->
<html class="no-js" lang="en-GB">
 <!--<![endif]-->
 <!-- the "no-js" class is for Modernizr. -->
 <head id="live2-fasttrack-com">
  <link data-minify="1" href="https://www.fasttrack.co.uk/wp-content/cache/min/1/421e5c571d078ac24efe54d8be938161.css" rel="stylesheet"/>
  <meta charset="utf-8"/>
  <!-- Always force latest IE rendering engine (even in intranet) & Chrome Frame -->
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <title>
   League table - Fast Track
  </title>
  <meta content="League table - Fast Track" name="title"/>
  <meta content="" name="description"/>
  <meta content="" name="keyword

### Etapa 2

#### Scraping

In [4]:
name = soup.find('div', {'class': 'fi-p_name'})
print(name)

None


In [5]:
# import libraries
from bs4 import BeautifulSoup
import urllib.request
import csv

In [6]:
# specify the url
urlpage =  'http://www.fasttrack.co.uk/league-tables/tech-track-100/league-table/'

In [7]:
# query the website and return the html to the variable 'page'
page = urllib.request.urlopen(urlpage)

# parse the html using beautiful soup and store in variable 'soup'
soup = BeautifulSoup(page, 'html.parser')

In [8]:
print(soup)

<!-- Template Name: League Table page
-->
<!DOCTYPE html>

<!--[if lt IE 7 ]> <html class="ie ie6 no-js" lang="en-GB"> <![endif]-->
<!--[if IE 7 ]>    <html class="ie ie7 no-js" lang="en-GB"> <![endif]-->
<!--[if IE 8 ]>    <html class="ie ie8 no-js" lang="en-GB"> <![endif]-->
<!--[if IE 9 ]>    <html class="ie ie9 no-js" lang="en-GB"> <![endif]-->
<!--[if gt IE 9]><!-->
<html class="no-js" lang="en-GB">
<!--<![endif]-->
<!-- the "no-js" class is for Modernizr. -->
<head id="live2-fasttrack-com"><link data-minify="1" href="https://www.fasttrack.co.uk/wp-content/cache/min/1/421e5c571d078ac24efe54d8be938161.css" rel="stylesheet"/>
<meta charset="utf-8"/>
<!-- Always force latest IE rendering engine (even in intranet) & Chrome Frame -->
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<title>
        League table - Fast Track    </title>
<meta content="League table - Fast Track" name="title"/>
<meta content="" name="description"/>
<meta content="" name="keyword"/>
<meta con

In [9]:
# find results within table
table = soup.find('table', attrs={'class': 'tableSorter'})
#results = table.find_all('tr')
#print('Number of results', len(results))

In [10]:
# create and write headers to a list 
rows = []
rows.append(['Rank', 'Company Name', 'Webpage', 'Description', 'Location', 'Year end', 'Annual sales rise over 3 years', 'Sales £000s', 'Staff', 'Comments'])
print(rows)

[['Rank', 'Company Name', 'Webpage', 'Description', 'Location', 'Year end', 'Annual sales rise over 3 years', 'Sales £000s', 'Staff', 'Comments']]


In [11]:
# import libraries
import urllib.request
from bs4 import BeautifulSoup
import csv


# specify the url
urlpage =  'http://www.fasttrack.co.uk/league-tables/tech-track-100/league-table/' 
print(urlpage)
# query the website and return the html to the variable 'page'
page = urllib.request.urlopen(urlpage)
# parse the html using beautiful soup and store in variable 'soup'
soup = BeautifulSoup(page, 'html.parser')
# find results within table
table = soup.find('table', attrs={'class': 'tableSorter'})
results = table.find_all('tr')
print('Number of results', len(results))

# create and write headers to a list 
rows = []
rows.append(['Rank', 'Company Name', 'Webpage', 'Description', 'Location', 'Year end', 'Annual sales rise over 3 years', 'Sales £000s', 'Staff', 'Comments'])

# loop over results
for result in results:
    # find all columns per result
    data = result.find_all('td')
    # check that columns have data 
    if len(data) == 0: 
        continue
    
    # write columns to variables
    rank = data[0].getText()
    company = data[1].getText()
    location = data[2].getText()
    yearend = data[3].getText()
    salesrise = data[4].getText()
    sales = data[5].getText()
    staff = data[6].getText()
    comments = data[7].getText()
    
    # print('Company is', company)
    # Company is WonderblyPersonalised children's books
    # print('Sales', sales)
    # Sales *25,860

    # extract description from the name
    companyname = data[1].find('span', attrs={'class':'company-name'}).getText()    
    description = company.replace(companyname, '')
    
    # remove unwanted characters
    sales = sales.strip('*').strip('†').replace(',','')
    
    # go to link and extract company website
    url = data[1].find('a').get('href')
    page = urllib.request.urlopen(url)
    # parse the html using beautiful soup and store in variable 'soup'
    soup = BeautifulSoup(page, 'html.parser')
    # find the last result in the table and get the link
    try:
        tableRow = soup.find('table').find_all('tr')[-1]
        webpage = tableRow.find('a').get('href')
    except:
        webpage = None
    
    # write each result to rows
    rows.append([rank, companyname, webpage, description, location, yearend, salesrise, sales, staff, comments])


print(rows)

    
## Create csv and write rows to output file
with open('techtrack100.csv','w', newline='') as f_output:
    csv_output = csv.writer(f_output)
    csv_output.writerows(rows)

http://www.fasttrack.co.uk/league-tables/tech-track-100/league-table/


AttributeError: 'NoneType' object has no attribute 'find_all'