## 1.0 Bibliotecas e Funções

### 1.1. Bibliotecas

In [4]:

# data manipulation libs
import numpy as np
import pandas as pd
# date and time libs
import datetime as dt
from datetime import timedelta
# using a soup lib to scrapp the page
import requests
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
# lib to pass cookies
import http.cookiejar
from lxml.html import fragment_fromstring
import re
# libs to clean data exported data
from collections import OrderedDict
from decimal import Decimal
from functools import reduce


### 1.2. Funções

In [5]:
# classe contendo todas as funções personalizadas
class functions(object):

    def __init__(self) -> None:
        pass

    def format_currency(x):
        return "R${:,.2f}".format(x)

    def format_perc( x):
        return "{}%".format(x)

    def today():
        return dt.date.today()

    def replace_nan(df,column,to_replace,repl):
        df[column] = df[column].replace(to_replace,repl)
    
    def replace_nan_str(df,column:str,to_replace:str,repl:str):
        df[column] = df[column].str.replace(to_replace,repl)

    def change_type(df,column,type):
        df[column] = df[column].astype(type, errors='ignore')

    def options():
        global pd_options
        pd_options = pd.options.mode.chained_assignment = None

    def column_index(df, query_cols):
        cols = df.columns.values
        sidx = np.argsort(cols)
        return sidx[np.searchsorted(cols,query_cols,sorter=sidx)]

    def inicio_mes():
        hoje = dt.datetime.today() 
        inicio_mes_data = hoje - timedelta(hoje.day)+ timedelta(days=1)
        return inicio_mes_data
        
    def round_data(df,columns_to_round):
        df[columns_to_round] = np.round(df[columns_to_round],2)
    
    def centralizar_valor(valor):
        return f'{valor:^10}'

    def merge_all_dfs(dfs,name:str,type_of_merge:str):
        df = reduce(lambda left, right: pd.merge(left,right, on=name, how=type_of_merge), dfs)
        return df
    
    def decimal_point_thousand(df, column):
        df[column] = df[column].apply(lambda x: str(x).replace('.', '', 1))

## 2.0 Extração

In [6]:
# url de extracao
base_url = r"https://www.fundamentus.com.br/resultado.php"

In [7]:
# desabilitar cookies
cookie_jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'),
                         ('Accept', 'text/html, text/plain, text/css, text/sgml, */*;q=0.01')]

### 2.1 Extracao de Html

In [12]:
# usar o opener para acessar a url base
html = opener.open(base_url)
# decodificar em ISO8859
html_content = html.read().decode('ISO-8859-1')

In [13]:
# Fazer o soup
soup = BeautifulSoup(html_content,'html.parser')

In [14]:
#extrair a tabela do html
table = soup.find_all(
   'table'
)

In [15]:
tables = table[0]

In [16]:
thead = tables.find('thead')

In [17]:
headers_cells = thead.find_all('th')

In [18]:
headers = []
for cell in headers_cells:
    headers.append(cell.get_text(strip=True))

In [19]:
acoes_data = []

In [20]:
rows = tables.find_all('tr')

In [21]:
for row in rows[1:]:
    cells = row.find_all('td')
    nome_acao = cells[0].a.get_text(strip=True)  # Obter o texto da tag <a>
    cotacao = cells[1].get_text(strip=True)  # Obter o texto da tag <td>
    p_l = cells[2].get_text(strip=True) 
    p_vp = cells[3].get_text(strip=True)  # Obter o texto da tag <td>
    psr = cells[4].get_text(strip=True) 
    dividend_yield = cells[5].get_text(strip=True) 
    p_ativo = cells[6].get_text(strip=True)  # Obter o texto da tag <td>
    p_cap_giro= cells[7].get_text(strip=True)
    p_ebit = cells[8].get_text(strip=True)
    p_ativ_circ_liq = cells[9].get_text(strip=True)
    ev_ebit = cells[10].get_text(strip=True)
    ev_ebitda = cells[11].get_text(strip=True)
    mrg_ebit = cells[12].get_text(strip=True)
    mrg_liq = cells[13].get_text(strip=True)
    liq_corr = cells[14].get_text(strip=True)
    roic = cells[15].get_text(strip=True)
    roe = cells[16].get_text(strip=True)
    liq_2meses = cells[17].get_text(strip=True)
    patrim_liq = cells[18].get_text(strip=True)
    div_brut_patrimv = cells[19].get_text(strip=True)
    cresc_rec_5av= cells[20].get_text(strip=True)

# criar um dicionario com os valores encontrados

    acoes_data.append({'papel':nome_acao, 
                       'cotacao':cotacao, 
                       'p_l':p_l, 
                       'p_vp':p_vp, 
                       'psr':psr, 
                       'div_yield':dividend_yield, 
                       'p_ativo':p_ativo,
                        'p_cap_giro':p_cap_giro, 
                        'p_ebit':p_ebit, 
                        'p_ativ_circ_liq':p_ativ_circ_liq,
                        'ev_ebit':ev_ebit,
                        'ev_ebitda':ev_ebitda,
                        'mrg_ebit':mrg_ebit,
                        'mrg_liq':mrg_liq,
                        'liq_corr':liq_corr,
                        'roic':roic,
                        'roe':roe, 
                        'liq_2meses':liq_2meses,
                        'patrim_liq':patrim_liq, 
                        'div_brut_patrim':div_brut_patrimv, 
                        'cresc_rec_5a':cresc_rec_5av
    })

In [107]:
stocks_df = pd.DataFrame.from_dict(
    acoes_data
    )

In [108]:
columns_to_replace_perc_ = ['cotacao', 'p_l', 'p_vp', 'psr', 'div_yield', 'p_ativo', 'p_cap_giro', 
                            'p_ebit', 'p_ativ_circ_liq', 'ev_ebit', 'ev_ebitda','div_yield',
                            'mrg_ebit', 'mrg_liq','liq_corr','roic','roe', 'cresc_rec_5a','roe', 'liq_2meses',
       'patrim_liq', 'div_brut_patrim', 'cresc_rec_5a']


In [109]:
for perc in columns_to_replace_perc_:
    functions.replace_nan_str(
        stocks_df,
        perc,
        '%',
        ''
    )

    functions.decimal_point_thousand(
        stocks_df,
        perc
    )

    functions.replace_nan_str(
        stocks_df,
        perc,
        ',',
        '.'
    )
    

In [110]:
for type in columns_to_replace_perc_:
    functions.change_type(
        stocks_df,
        type,
        'float64'
    )

In [111]:
stocks_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 987 entries, 0 to 986
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   papel            987 non-null    object 
 1   cotacao          987 non-null    float64
 2   p_l              987 non-null    float64
 3   p_vp             987 non-null    float64
 4   psr              987 non-null    float64
 5   div_yield        987 non-null    float64
 6   p_ativo          987 non-null    float64
 7   p_cap_giro       987 non-null    float64
 8   p_ebit           987 non-null    float64
 9   p_ativ_circ_liq  987 non-null    float64
 10  ev_ebit          987 non-null    float64
 11  ev_ebitda        987 non-null    float64
 12  mrg_ebit         987 non-null    float64
 13  mrg_liq          987 non-null    float64
 14  liq_corr         987 non-null    float64
 15  roic             987 non-null    float64
 16  roe              987 non-null    float64
 17  liq_2meses      

In [17]:
columns_to_cg = ['cotacao','p_l','p_vp','psr','div_yield','p_ativo','p_cap_giro','p_ebit','p_ativ_circ_liq']

In [18]:
for x in columns_to_cg:
    functions.change_type(stocks_df,x,str)
    functions.replace_nan(stocks_df,x,'.','')

stocks_df.columns

Index(['papel', 'cotacao', 'p_l', 'p_vp', 'psr', 'div_yield', 'p_ativo',
       'p_cap_giro', 'p_ebit', 'p_ativ_circ_liq', 'ev_ebit', 'ev_ebitda',
       'mrg_ebit', 'mrg_liq', 'liq_corr', 'roic', 'roe', 'liq_2meses',
       'patrim_liq', 'div_brut_patrim', 'cresc_rec_5a'],
      dtype='object')

In [22]:
stocks_df[['div_yield','mrg_ebit','mrg_liq','roic','roe','cresc_rec_5a']]

Unnamed: 0,div_yield,mrg_ebit,mrg_liq,roic,roe,cresc_rec_5a
0,0.00,0.00,0.00,0.00,4.10,37.74
1,0.00,40.85,28.98,22.40,20.11,31.91
2,0.00,-208.15,-362.66,-13.50,145.70,-41.11
3,0.00,-208.15,-362.66,-13.50,145.70,-41.11
4,0.00,0.00,0.00,0.00,-2.08,13.66
...,...,...,...,...,...,...
982,0.00,0.00,0.00,0.00,8.02,-6.01
983,0.00,0.00,0.00,0.00,0.33,10.58
984,0.00,0.00,0.00,0.00,0.33,10.58
985,0.00,0.00,0.00,0.00,0.33,10.58


Unnamed: 0,div_yield,mrg_ebit,mrg_liq,roic,roe,cresc_rec_5a
0,000,000,000,000,-208,1366
1,000,888,1072,1768,3215,814
2,000,-20815,-36266,-1350,14570,-4111
3,000,4085,2898,2240,2011,3191
4,000,4085,2898,2240,2011,3191
...,...,...,...,...,...,...
1969,000,000,000,000,802,-601
1970,000,000,000,000,033,1058
1971,000,000,000,000,033,1058
1972,000,000,000,000,033,1058
