In [65]:
from urllib.request import urlretrieve
import zipfile
import os
from datetime import date, datetime
import pandas as pd

In [42]:
def download_year(year):
    """Download the yearly COTAHIST archive for ``year`` into ``files/``.

    Args:
        year: Year interpolated into both the remote file name
            (``COTAHIST_A{year}.ZIP``) and the local one (``files/{year}.zip``).

    Returns:
        str: Path of the downloaded zip file.
    """
    file_path = f'files/{year}.zip'
    # urlretrieve opens the target for writing and does not create missing
    # parent folders, so make sure files/ exists (fresh checkout / fresh kernel).
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    urlretrieve(f'http://bvmf.bmfbovespa.com.br/InstDados/SerHist/COTAHIST_A{year}.ZIP', file_path)
    return file_path

In [11]:
def unzip(file_path):
    """Extract every member of the zip archive at ``file_path`` into ``files/``."""
    archive = zipfile.ZipFile(file_path, mode='r')
    try:
        archive.extractall(path='files/')
    finally:
        # Always release the file handle, even if extraction fails.
        archive.close()

In [12]:
def _to_float(s):
    return float(s[:-2] + '.' + s[-2:])

In [81]:
def _get_info(line):
    """Extract the fields used by this notebook from one fixed-width quote record.

    Offsets are 0-based, end-exclusive slices. Per the published COTAHIST
    record layout the price fields are contiguous 13-digit numbers and the
    total volume is an 18-digit number, all with two implied decimals; the
    trading code and short name are 12 characters wide. Slicing each field at
    its full width keeps the leading digit of every price and the last digit
    of the volume (a 17-character volume slice yields a value 10x too small).

    Args:
        line: One quote record (not the file header or trailer).

    Returns:
        dict: ``date`` as a ``YYYYMMDD`` string, ``trading_code`` and
        ``short_name`` as stripped strings, and ``open``, ``high``, ``low``,
        ``close``, ``bid``, ``ask`` and ``volume`` as floats.
    """
    def amount(start, end):
        # Numeric columns go through the shared implied-decimal conversion.
        return _to_float(line[start:end].strip())

    return {
        'date': line[2:10],
        'trading_code': line[12:24].strip(),
        'short_name': line[27:39].strip(),
        'open': amount(56, 69),
        'high': amount(69, 82),
        'low': amount(82, 95),
        # 95:108 (average price) is intentionally not extracted
        'close': amount(108, 121),
        'bid': amount(121, 134),
        'ask': amount(134, 147),
        'volume': amount(170, 188),
    }

In [14]:
# Years to process: 2016 through 2021 (the end of range() is exclusive).
years = list(range(2016, 2022))

In [None]:
# Download and extract each year's archive. Years whose extracted TXT is
# already in files/ are skipped so that re-running the notebook does not hit
# the network again; delete the TXT file to force a fresh download.
for year in years:
    if os.path.exists(f'files/COTAHIST_A{year}.TXT'):
        continue
    file = download_year(year)
    unzip(file)

In [44]:
# List files/ to check that the extracted TXT files are there.
!ls files

COTAHIST_A2016.TXT COTAHIST_A2018.TXT COTAHIST_A2020.TXT
COTAHIST_A2017.TXT COTAHIST_A2019.TXT COTAHIST_A2021.TXT


In [45]:
# Paths of the extracted COTAHIST text files, one per entry in `years`.
files = [f'files/COTAHIST_A{year}.TXT' for year in years]

In [68]:
def parse_file(path):
    """Parse one extracted COTAHIST text file into a DataFrame.

    The first non-blank line (header) and the last non-blank line (footer) are
    discarded; every line in between is parsed with ``_get_info``.

    Args:
        path: Path to the extracted ``.TXT`` file.

    Returns:
        pandas.DataFrame: Indexed by ``(date, ticker)`` with float columns
        ``open``, ``high``, ``low``, ``close``, ``bid``, ``ask`` and ``volume``.
    """
    with open(path, 'r') as file:
        # Ignore blank lines so that trimming the header/footer below does not
        # depend on whether the file ends with exactly one trailing newline.
        lines = [raw.rstrip('\n') for raw in file if raw.strip()]

    records = lines[1:-1]   # drop header and footer
    df = pd.DataFrame([_get_info(record) for record in records])
    # Vectorised parse of the YYYYMMDD strings.
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
    return (
        df.rename(columns={'trading_code': 'ticker'})
        .drop(columns='short_name')
        .set_index(['date', 'ticker'])
    )

In [87]:
def make_df(files, ticker='ITUB4'):
    """Build a date-indexed DataFrame of one ticker's quotes across ``files``.

    Args:
        files: Iterable of paths to extracted COTAHIST text files.
        ticker: Trading code to keep. Defaults to ``'ITUB4'``, chosen because
            it pays a lot of dividends.

    Returns:
        pandas.DataFrame: Rows for ``ticker`` from every file, indexed by
        ``date`` and sorted by it. An empty DataFrame when ``files`` is empty.
    """
    frames = []
    for file in files:
        quotes = parse_file(file).reset_index()
        frames.append(quotes[quotes['ticker'] == ticker].set_index('date'))

    if not frames:
        # pd.concat raises on an empty list; keep the "no files -> empty frame" result.
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop with
    # DataFrame.append, which no longer exists in pandas 2.x.
    return pd.concat(frames, sort=True).sort_index()

### I'll start with a single year (2021)

In [88]:
# Build the ITUB4 quotes frame from the 2021 file only.
df = make_df(['files/COTAHIST_A2021.TXT'])

In [93]:
# Build the overnight-return table as a chain of new frames, so no column is
# assigned into a slice of another frame (which triggers SettingWithCopyWarning).
df = (
    df[['open', 'close']]
    # next session's opening price, placed on the current session's row
    .assign(next_open=lambda frame: frame['open'].shift(-1))
    # the last session has no following open, so its row is dropped
    .dropna()
    # overnight return: next open relative to this session's close
    .assign(overnight=lambda frame: frame['next_open'] / frame['close'] - 1)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [95]:
# Possible dividend-date candidates, just out of curiosity: rows whose
# overnight return is more than 2 standard deviations below the mean.
df[df['overnight'] < df['overnight'].mean() - 2 * df['overnight'].std()]

Unnamed: 0_level_0,open,close,next_open,overnight
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-02-19,27.89,27.62,26.55,-0.03874
2021-10-01,29.12,29.67,24.75,-0.165824


Checking other sources, there was indeed a dividend on 2021-10-01, but not on 2021-02-19, and there were many others that didn't show up with this simple filter, so it is not a great filter.

In [97]:
# NOTE(review): the destination path is an empty string — confirm the intended
# output file name. The table recorded under this cell looks like a DataFrame
# display rather than the result of this call.
df.to_parquet('')

Unnamed: 0_level_0,open,close,next_open,overnight
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-01-04,31.98,30.90,30.73,-0.005502
2021-01-05,30.73,30.70,30.98,0.009121
2021-01-06,30.98,31.55,31.65,0.003170
2021-01-07,31.65,32.83,32.93,0.003046
2021-01-08,32.93,32.82,32.47,-0.010664
...,...,...,...,...
2021-12-22,21.23,21.23,21.33,0.004710
2021-12-23,21.33,21.34,21.48,0.006560
2021-12-27,21.48,21.56,21.63,0.003247
2021-12-28,21.63,21.48,21.53,0.002328
