In [1]:
import os
import re
import pandas as pd
from datetime import date
from dateutil.relativedelta import relativedelta
from tqdm.notebook import tqdm as log_progress
from pandas_profiling import ProfileReport
import plotly.express as px

## Extração do preço de ações da B3 
___

In [2]:
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

# Root folders used by the notebook: BASE_PATH is read from, RES_PATH is
# written to. Raw strings keep the Windows backslashes readable.
BASE_PATH = r"E:\data-science\raw-data"
RES_PATH = r"E:\data-science\datalake"


def extract(path, sep=";"):
    """Load every file in a directory as CSV and stack them into one DataFrame.

    Parameters
    ----------
    path : str
        Directory whose entries are each passed to ``pandas.read_csv``.
    sep : str, default ";"
        Field delimiter forwarded to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Rows of all files stacked in file-name order. Each file keeps its own
        row index, so index labels repeat across files. An empty DataFrame is
        returned when the directory has no entries.
    """
    # os.listdir yields entries in arbitrary order; sorting makes the stacked
    # row order reproducible between runs and machines.
    file_names = sorted(os.listdir(path))

    # os.path.join replaces the hard-coded "\\" separator so the helper also
    # works outside Windows.
    frames = [
        pd.read_csv(os.path.join(path, file_name), sep=sep)
        for file_name in log_progress(file_names)
    ]

    if not frames:
        # pd.concat raises ValueError on an empty list; keep the original
        # contract of returning an empty frame.
        return pd.DataFrame()

    # One concat at the end instead of growing the frame inside the loop,
    # which re-copied all accumulated rows on every iteration.
    return pd.concat(frames)

In [3]:
# Load the comma-separated files under the historical-quotes folder.
quotes_dir = f"{BASE_PATH}\\b3\\historical-quotes\\ia"
b3_prices = extract(path=quotes_dir, sep=",")

  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Report how many rows were loaded.
n_prices = len(b3_prices)
print(f"Coletados {n_prices} preços da B3")

Coletados 3266749 preços da B3


In [5]:
# Preview the first five rows of the loaded quotes.
b3_prices.head(n=5)

Unnamed: 0,register_type,date,name,company,share_type,forward_market_deadline,currency,open_price,max_price,min_price,...,volume,exercise_price,price_correction_indicator,expiration_date,price_factor,exercise_price_points,isin_code,distribution_share_number,type,folder
0,1,2021-01-04,A1AP34,ADVANCE AUTO,DRN,,R$,51.1,52.03,50.62,...,333558.7,0.0,0.0,,1.0,0.0,BRA1APBDR001,105.0,,acoes
1,1,2021-01-04,BOVVM116,BOVVE FM,CI,0.0,R$,1.07,1.07,1.07,...,19274.9,116.0,0.0,2021-01-18,1.0,0.0,BRBOVVCTF009,100.0,PUT,opcoes
2,1,2021-01-04,BOVVM118,BOVVE FM,CI,0.0,R$,1.52,1.52,1.52,...,24110.2,118.0,0.0,2021-01-18,1.0,0.0,BRBOVVCTF009,100.0,PUT,opcoes
3,1,2021-01-04,MULTM225,MULTE /EJ,ON N2,0.0,R$,0.5,0.5,0.5,...,50.0,22.11,0.0,2021-01-18,1.0,0.0,BRMULTACNOR5,127.0,PUT,opcoes
4,1,2021-01-04,SAPR4T,SANEPAR,PN EJ N2,91.0,R$,5.11,5.12,5.02,...,2030.5,0.0,0.0,,1.0,0.0,BRSAPRACNPR6,151.0,,termo


In [6]:
# Show the column labels of the loaded quotes frame.
b3_prices.columns

Index(['register_type', 'date', 'name', 'company', 'share_type',
       'forward_market_deadline', 'currency', 'open_price', 'max_price',
       'min_price', 'average_price', 'close_price', 'best_buy_price',
       'best_sell_price', 'transactions', 'quantity', 'volume',
       'exercise_price', 'price_correction_indicator', 'expiration_date',
       'price_factor', 'exercise_price_points', 'isin_code',
       'distribution_share_number', 'type', 'folder'],
      dtype='object')

In [7]:
# Convert the "date" column to datetime values, replacing it in place.
parsed_dates = pd.to_datetime(b3_prices["date"])
b3_prices["date"] = parsed_dates

## Filtragem dos campos e seleção apenas das cotas de ações
___

In [8]:
# Source column -> output column, for the only fields kept from here on.
cols = {"name": "id", "folder": "type", "date": "date", "close_price": "close"}

# Select the source columns first, then apply the new names.
selected = b3_prices[list(cols)]
raw_prices = selected.rename(columns=cols)

In [9]:
# Preview the first five rows after selecting and renaming columns.
raw_prices.head(n=5)

Unnamed: 0,id,type,date,close
0,A1AP34,acoes,2021-01-04,51.9
1,BOVVM116,opcoes,2021-01-04,1.07
2,BOVVM118,opcoes,2021-01-04,1.52
3,MULTM225,opcoes,2021-01-04,0.5
4,SAPR4T,termo,2021-01-04,5.03


In [10]:
# Keep only the rows whose type is "acoes".
is_stock = raw_prices["type"] == "acoes"
stocks = raw_prices[is_stock]

In [11]:
# Report how many distinct ids remain after the filter.
n_stocks = len(stocks["id"].unique())
print(f"{n_stocks} ações encontradas")

2125 ações encontradas


In [12]:
# Write the filtered frame as a parquet file under RES_PATH.
output_file = f"{RES_PATH}/stock_prices.snappy.parquet"
stocks.to_parquet(output_file)