# Web Scraping

In [1]:
# importing libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import requests

In [2]:
# URL of the Fundamentus page to scrape; the query string selects papel=ENGI3 and tipo=2.
# NOTE(review): "proventos" presumably is the dividend/payout history for the ticker — confirm.
url = "https://www.fundamentus.com.br/proventos.php?papel=ENGI3&tipo=2"

In [3]:
# Browser-like request headers sent with the page request.
# NOTE(review): presumably the site rejects the default `requests` User-Agent — confirm.
headers = {
    'User-Agent'      : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
    'Accept'          : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language' : 'en-US,en;q=0.5',
    'DNT'             : '1', # Do Not Track Request Header
    'Connection'      : 'close'
}

# Download the page, giving up after 5 seconds.
response = requests.get(url, headers=headers, timeout=5)
# Fail fast on 4xx/5xx responses instead of silently parsing an error page.
response.raise_for_status()

get_data = response.text
soup = BeautifulSoup(get_data, "html.parser")

In [4]:
# looking for the table on the page
table = soup.find('table')  # first <table> element in the parsed document, or None

# Stop here with a clear message rather than failing later on `table.tbody`.
if table is None:
    raise ValueError("No <table> element found in the downloaded page")

In [5]:
# Start from an empty DataFrame that only carries the original (Portuguese) column labels.
source_columns = [
    'Data',
    'Valor',
    'Tipo',
    'Data de Pagamento',
    'Por quantas ações',
]
extracted_data = pd.DataFrame(columns=source_columns)

In [6]:
# Translate the Portuguese column labels to their English equivalents.
english_labels = {
    'Data': 'Date',
    'Valor': 'Value',
    'Tipo': 'Type',
    'Data de Pagamento': 'Day of payment',
    'Por quantas ações': 'How many shares',
}
extracted_data = extracted_data.rename(columns=english_labels)

In [7]:
# Collect one record per table row and build the DataFrame in a single call.
# Appending with pd.concat inside the loop copies the whole frame for every row
# and duplicates the rows whenever this cell is re-run.
column_names = ['Date', 'Value', 'Type', 'Day of payment', 'How many shares']
records = []

# getting all table rows
for row in table.tbody.find_all('tr'):
    # getting all columns in each row
    cells = row.find_all('td')
    # Skip rows that do not carry the five expected data cells
    # (rows with no <td> at all, or malformed/short rows).
    if len(cells) < len(column_names):
        continue
    # Map the first five cells, in order, onto the column names (whitespace stripped).
    records.append({name: cell.text.strip() for name, cell in zip(column_names, cells)})

# `columns=` keeps the expected layout even when no rows were found.
extracted_data = pd.DataFrame(records, columns=column_names)

extracted_data.head(20)

Unnamed: 0,Date,Value,Type,Day of payment,How many shares
0,23/08/2022,2320,DIVIDENDO,01/09/2022,1
1,30/12/2021,4400,DIVIDENDO,02/03/2022,1
2,17/08/2021,1300,DIVIDENDO,29/09/2021,1
3,19/03/2021,2200,DIVIDENDO,30/03/2021,1
4,18/08/2020,560,DIVIDENDO,26/08/2020,1
5,27/02/2020,640,DIVIDENDO,03/04/2020,1
6,13/08/2019,560,DIVIDENDO,23/08/2019,1
7,22/03/2019,300,DIVIDENDO,10/04/2019,1
8,27/02/2019,1300,DIVIDENDO,11/03/2019,1
9,14/08/2018,560,DIVIDENDO,04/09/2018,1


In [8]:
# refactoring the dataframe: convert the scraped strings to proper dtypes

# Date: parse dd/mm/YYYY strings. errors='ignore' is deprecated in pandas and
# leaves the whole column as strings on any bad value; 'coerce' turns only the
# unparseable entries into NaT.
extracted_data['Date'] = pd.to_datetime(extracted_data['Date'], format="%d/%m/%Y", errors='coerce')

# Value: swap the decimal comma for a dot, then convert to float.
# astype(str) keeps this step working if the column is already numeric (cell re-run).
extracted_data['Value'] = (
    extracted_data['Value']
    .astype(str)
    .str.replace(',', '.', regex=False)
    .astype(float)
)

# Day of payment: entries that parse as dd/mm/YYYY become date objects;
# entries that do not parse keep their original text.
payment_dates = pd.to_datetime(extracted_data["Day of payment"], format="%d/%m/%Y", errors='coerce')
extracted_data["Day of payment"] = extracted_data["Day of payment"].where(
    payment_dates.isna(), payment_dates.dt.date
)

# Type: normalise to upper case
extracted_data['Type'] = extracted_data['Type'].str.upper()

# How many shares: convert to integer

extracted_data = extracted_data.astype({"How many shares": int})


In [9]:
# Write the cleaned frame to a CSV file, leaving out the row index.
output_file = 'engi3.csv'
extracted_data.to_csv(output_file, index=False)