# Web scraping

In [1]:
import os
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import pandas as pd
import requests
import scrapy
from scrapy.crawler import CrawlerProcess
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By

# Exercici 1

Realitza web scraping d'una pàgina de la borsa de Madrid (https://www.bolsamadrid.es) utilitzant BeautifulSoup i Selenium.

Buscarem els valors de les accions de les diferents companyies de l'IBEX 35 amb BeautifulSoup i Selenium.

## BeautifulSoup

In [2]:
# Base address of the Bolsa de Madrid website (starting point for the scraping)
url = "https://www.bolsamadrid.es"

In [3]:
# Download the HTML of the landing page.
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of letting the cells
# below parse an error page.
html = requests.get(url, timeout=30)
html.raise_for_status()

In [4]:
# Build the parse tree from the raw response bytes
soup = BeautifulSoup(markup=html.content, features='html.parser')

In [5]:
# Find every anchor whose text is 'Acciones' and keep only its href attribute
acciones_anchors = soup.find_all('a', string='Acciones')
links = [anchor.get('href') for anchor in acciones_anchors]

In [6]:
# Acciones link: resolve the first matching href against the base URL.
# urljoin copes with root-relative, relative and absolute hrefs, whereas plain
# string concatenation only produces a valid URL for root-relative ones.
if not links:
    # Same exception type as indexing an empty list, but with an actionable message
    raise IndexError("No 'Acciones' link was found on the landing page")
url_acciones = urljoin(url, links[0])

In [7]:
# Download the HTML of the Acciones page.
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of letting the cells
# below parse an error page.
html_acciones = requests.get(url_acciones, timeout=30)
html_acciones.raise_for_status()

In [8]:
# Build the parse tree of the Acciones page from the raw response bytes
soup_acciones = BeautifulSoup(markup=html_acciones.content, features='html.parser')

In [9]:
# Locate the prices table through its element id
table_acciones = soup_acciones.find(attrs={'id': 'ctl00_Contenido_tblAcciones'})

In [10]:
# Pretty-print the table markup to inspect its row/cell structure
table_markup = table_acciones.prettify()
print(table_markup)

<table cellpadding="3" cellspacing="0" class="TblPort" id="ctl00_Contenido_tblAcciones" width="100%">
 <tr align="center">
  <th scope="col">
   Nombre
  </th>
  <th scope="col">
   Últ.
  </th>
  <th scope="col">
   % Dif.
  </th>
  <th scope="col">
   Máx.
  </th>
  <th scope="col">
   Mín.
  </th>
  <th scope="col">
   Volumen
  </th>
  <th scope="col">
   Efectivo (miles €)
  </th>
  <th scope="col">
   Fecha
  </th>
  <th class="Ult" scope="col">
   Hora
  </th>
 </tr>
 <tr align="right">
  <td align="left" class="DifFlSb">
   <a href="/esp/aspx/Empresas/FichaValor.aspx?ISIN=ES0125220311">
    ACCIONA
   </a>
  </td>
  <td>
   183,3000
  </td>
  <td class="DifClSb">
   0,71
  </td>
  <td>
   183,8000
  </td>
  <td>
   181,1000
  </td>
  <td>
   38.549
  </td>
  <td>
   7.033,22
  </td>
  <td align="center">
   07/12/2022
  </td>
  <td align="center" class="Ult">
   17:20:12
  </td>
 </tr>
 <tr align="right">
  <td align="left" class="DifFlSb">
   <a href="/esp/aspx/Empresas/FichaV

In [11]:
# Parse columns and row values.
# Header rows (made of <th> cells) provide the column names; every other row
# with <td> cells becomes one list of cell texts.
columns = None  # reset first so a header left over from another cell run is never reused
values = []
for row in table_acciones.find_all('tr'):
    # Look up each kind of cell only once per row
    header_cells = row.find_all('th')
    if header_cells:
        columns = [cell.get_text() for cell in header_cells]
        continue
    data_cells = row.find_all('td')
    if data_cells:
        values.append([cell.get_text() for cell in data_cells])

In [12]:
# Assemble the parsed rows into a DataFrame labelled with the header texts
acciones = pd.DataFrame(data=values, columns=columns)

In [13]:
# View the dataframe built from the BeautifulSoup results; as the last
# expression of the cell it is rendered by the notebook
acciones

Unnamed: 0,Nombre,Últ.,% Dif.,Máx.,Mín.,Volumen,Efectivo (miles €),Fecha,Hora
0,ACCIONA,1833000,71,1838000,1811000,38.549,"7.033,22",07/12/2022,17:20:12
1,ACCIONA ENER,374400,32,376200,370600,95.664,"3.566,64",07/12/2022,17:20:02
2,ACERINOX,93520,-118,94080,92860,230.505,"2.154,53",07/12/2022,17:20:14
3,ACS,272500,7,272900,270500,173.451,"4.709,06",07/12/2022,17:20:43
4,AENA,1248000,-115,1280500,1244500,97.824,"12.317,46",07/12/2022,17:20:20
5,AMADEUS,504200,-12,508200,501600,221.081,"11.164,26",07/12/2022,17:20:23
6,ARCELORMIT.,253100,-152,255000,250050,309.135,"7.805,20",07/12/2022,17:20:43
7,B.SANTANDER,28080,-12,28320,27825,17.025.287,"47.702,77",07/12/2022,17:20:41
8,BA.SABADELL,8630,33,8736,8572,8.735.729,"7.567,01",07/12/2022,17:20:19
9,BANKINTER,59800,-17,60420,59080,1.265.246,"7.544,66",07/12/2022,17:20:24


## Selenium

In [14]:
# Open browser: create the Chrome driver instance that the following cells control
browser = Chrome()

In [15]:
# Get the page: navigate the browser to the address stored in `url`
# (the same base URL used in the BeautifulSoup section)
browser.get(url)

In [16]:
# Collect every link element whose visible text is 'Acciones'
links = browser.find_elements(by=By.LINK_TEXT, value='Acciones')

In [17]:
# Follow the first 'Acciones' link to open the prices page
acciones_link = links[0]
acciones_link.click()

In [18]:
# Locate the prices table through its element id
table_acciones = browser.find_element(by=By.ID, value='ctl00_Contenido_tblAcciones')

In [19]:
# Parse columns and row values.
# Header rows (made of <th> cells) provide the column names; every other row
# with <td> cells becomes one list of cell texts.
columns = None  # reset first so a header left over from another cell run is never reused
values = []
for row in table_acciones.find_elements(By.TAG_NAME, 'tr'):
    # Look up each kind of cell only once per row instead of repeating the query
    header_cells = row.find_elements(By.TAG_NAME, 'th')
    if header_cells:
        columns = [cell.text for cell in header_cells]
        continue
    data_cells = row.find_elements(By.TAG_NAME, 'td')
    if data_cells:
        values.append([cell.text for cell in data_cells])

# All required text is now held in plain Python lists, so close the browser
# session rather than leaving it running after the notebook finishes.
# NOTE: re-running this cell afterwards requires re-running the Selenium cells
# from "Open browser" first.
browser.quit()

In [20]:
# Assemble the parsed rows into a DataFrame labelled with the header texts
acciones = pd.DataFrame(data=values, columns=columns)

In [21]:
# View the dataframe built from the Selenium results; as the last expression
# of the cell it is rendered by the notebook
acciones

Unnamed: 0,Nombre,Últ.,% Dif.,Máx.,Mín.,Volumen,Efectivo (miles €),Fecha,Hora
0,ACCIONA,1832000,66,1838000,1811000,38.972,"7.110,69",07/12/2022,17:21:30
1,ACCIONA ENER,374200,27,376200,370600,97.310,"3.628,22",07/12/2022,17:21:13
2,ACERINOX,93460,-125,94080,92860,230.827,"2.157,54",07/12/2022,17:21:11
3,ACS,272400,4,272900,270500,173.686,"4.715,46",07/12/2022,17:21:03
4,AENA,1248500,-111,1280500,1244500,98.449,"12.395,50",07/12/2022,17:21:26
5,AMADEUS,504200,-12,508200,501600,221.311,"11.175,86",07/12/2022,17:21:04
6,ARCELORMIT.,253100,-152,255000,250050,309.135,"7.805,20",07/12/2022,17:20:43
7,B.SANTANDER,28065,-18,28320,27825,17.055.884,"47.788,65",07/12/2022,17:21:38
8,BA.SABADELL,8626,28,8736,8572,8.737.835,"7.568,83",07/12/2022,17:21:04
9,BANKINTER,59800,-17,60420,59080,1.265.997,"7.549,15",07/12/2022,17:21:04


# Exercici 2

Documenta en un word el teu conjunt de dades generat amb la informació que tenen els diferents arxius de Kaggle.

### Títol

Accions de l'IBEX-35 (24/01/2022)

### Descripció

Accions de les 35 companyies de l'IBEX-35 del dia 24/01/2022.

### Llicència

[CC0: Public Domain](https://creativecommons.org/publicdomain/zero/1.0/)

### Context

L'IBEX-35 (Índice Bursátil Español) és l'índex borsari de referència de la Borsa de Madrid i està format per 35 empreses. Aquest conjunt de dades conté les cotitzacions del 24 de gener del 2022.

### Content
Head of the dataset:
<img src="acciones_head.png" width="700px">

### Acknowledgements

Origen: https://www.bolsamadrid.es/esp/aspx/Mercados/Precios.aspx?indice=ESI100000000

# Exercici 3

Tria una pàgina web que tu vulguis i realitza web scraping mitjançant la llibreria Scrapy.

In [22]:
import scrapy
from scrapy.crawler import CrawlerProcess

In [23]:
class QuotesSpider(scrapy.Spider):
    """Spider that saves the quotes of quotes.toscrape.com, one text file per page.

    Crawling starts at page 1 and follows the "next" pagination link until
    no such link is present. Each page is written to
    ``quotes/quotes-<page>.txt`` with one ``author: quote`` line per quote.
    """

    name = "quotes"

    def start_requests(self):
        """Yield the initial request(s), each handled by :meth:`parse`."""
        urls = [
            'http://quotes.toscrape.com/page/1/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Write the quotes of one page to disk and follow the next-page link.

        Args:
            response: the downloaded page passed in by Scrapy.

        Yields:
            A follow-up request for the next page, when the page has one.
        """
        # URLs look like .../page/<n>/, so the page number is the
        # second-to-last piece when splitting on '/'
        page = response.url.split('/')[-2]
        filename = f'quotes/quotes-{page}.txt'

        # Create the output folder on first use; open() does not create
        # missing parent directories
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        # Explicit UTF-8 so non-ASCII characters in the scraped text are
        # written the same way on every platform
        with open(filename, 'w', encoding='utf-8') as fw:
            for quote in response.css('div.quote'):
                author = quote.xpath('span/small/text()').get()
                # Separate name for the text so the loop variable is not rebound
                text = quote.css('span.text::text').get()
                fw.write(f"{author}: {text}\n")
        self.log(f'Saved file {filename}')

        next_page = response.css('li.next a::attr("href")').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)
            

In [24]:
# Create the process that will host the crawl (default settings)
process = CrawlerProcess()

# Register the spider class to be run
process.crawl(QuotesSpider)

# Run the crawl; the call blocks until the spider has finished.
# NOTE(review): re-running this cell in the same kernel is expected to fail
# because the underlying reactor cannot be started twice — confirm, and
# restart the kernel before a second run.
process.start(stop_after_crawl=True)

2022-12-07 17:37:22 [scrapy.utils.log] INFO: Scrapy 2.7.1 started (bot: scrapybot)
2022-12-07 17:37:22 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.13, cssselect 1.2.0, parsel 1.7.0, w3lib 2.1.0, Twisted 22.10.0, Python 3.10.5 (v3.10.5:f377153967, Jun  6 2022, 12:36:10) [Clang 13.0.0 (clang-1300.0.29.30)], pyOpenSSL 22.1.0 (OpenSSL 3.0.7 1 Nov 2022), cryptography 38.0.4, Platform macOS-13.0.1-arm64-arm-64bit
2022-12-07 17:37:22 [scrapy.crawler] INFO: Overridden settings:
{}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2022-12-07 17:37:22 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-12-07 17:37:22 [scrapy.extensions.telnet] INFO: Telnet Password: 10a3fc99d28665f2
2022-12-07 17:37:22 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.