# Getting Brazilian funds' holdings from the CVM website

**What?** Brazilian funds are regulated by a government agency called CVM (Comissão de Valores Mobiliários). Under local law, every regulated fund must disclose its portfolio holdings, ticker by ticker, with a 90-day delay. This regulation promotes transparency in the Brazilian financial market and provides valuable information for analyzing fund strategies, supporting informed decisions about where to allocate money.

**Why ?** 

**How ?** 


<img src="https://lh3.googleusercontent.com/d/1UerOWXdGizjakNJVlGww2hKPmRUr4Eba" alt="icon_mutual_funds_holdings" width="300" align="center">

## Import Libraries

In [1]:
import pandas as pd
import requests
import zipfile
import csv
import re
import io
import sqlite3
from sqlite3 import Error
from datetime import datetime
from datetime import date
import os

In [None]:
# Render floats with four decimal places whenever pandas displays them.
pd.options.display.float_format = '{:.4f}'.format

# Extraction date stamp (YYYYMMDD); it is embedded in the download folder
# name below and in the database file name.
today = f"{date.today():%Y%m%d}"  # current data extraction
today = '20240410'  # ******* manual parameter for the test phase (overrides the line above)


# path folders
path_database = r"C:/Users/lucas/OneDrive/FundsExplore/Databases/"
path_cda_hist_files = rf"C:/Users/lucas/OneDrive/FundsExplore/cda_hist_files_{today}/"


#%%
#% ##############################################################################################################
#%                                      1 -  CAPTURA COMPOSIÇÃO E DIVERSIFICAÇÃO DE CARTEIRA (CDA)
#% ##############################################################################################################

# Year / month strings; they match the newest entry of lista_datas.
ano = '2024'
mes = '03'

# Base URL of the CVM CDA data folder and the common prefix of the zip names.
url_raiz = 'https://dados.cvm.gov.br/dados/FI/DOC/CDA/DADOS/'
url_file = 'cda_fi_'


# Periods (YYYYMM) to download, newest first: 2024-03 back to 2023-01.
lista_datas = (
    [f'2024{month:02d}' for month in range(3, 0, -1)]
    + [f'2023{month:02d}' for month in range(12, 0, -1)]
)

#%% requests files from CVM web site and saving the ".zip" files

# The target folder name carries the extraction date, so it may not exist yet;
# create it up front, otherwise open() below fails with FileNotFoundError.
os.makedirs(path_cda_hist_files, exist_ok=True)

for data in lista_datas:

    # proxies = {"https": "http://10.0.0.24:8080"}
    # A timeout keeps a stalled connection from hanging the whole run.
    download = requests.get(url_raiz + url_file + data + '.zip', timeout=120)  # proxies = proxies)

    # Stop on HTTP errors instead of saving an error page under a ".zip" name,
    # which would only surface later as a corrupt archive.
    download.raise_for_status()

    # save the zip file in the download folder
    with open(path_cda_hist_files + url_file + data + '.zip', "wb") as arquivo_cvm:
        arquivo_cvm.write(download.content)


#%% appending files on a final dataframe

# Empty accumulators, one per CDA dataset read from the archives.
df_blc1_appd = pd.DataFrame()
df_blc2_appd = pd.DataFrame()
df_blc3_appd = pd.DataFrame()
df_blc4_appd = pd.DataFrame()
df_blc5_appd = pd.DataFrame()
df_blc6_appd = pd.DataFrame()
df_blc7_appd = pd.DataFrame()
df_blc8_appd = pd.DataFrame()
df_conf_appd = pd.DataFrame()
df_fi_PL_appd = pd.DataFrame()
df_fiim_confid_appd = pd.DataFrame()
df_fiim_appd = pd.DataFrame()

# listing all zip files path inside the reference folder
all_files = os.listdir(path_cda_hist_files)
zip_files_with_paths = [os.path.join(path_cda_hist_files, file) for file in all_files if file.endswith('.zip')]

# Collect the ZipInfo entries (and their file names) of every downloaded
# archive. Each archive is opened here explicitly: no zip handle exists at
# this point of the script, so reading one from a global raised NameError.
zip_info_list = []
for zip_path in zip_files_with_paths:
    with zipfile.ZipFile(zip_path, 'r') as zip_archive:
        zip_info_list.extend(zip_archive.filelist)
file_zip_names = [zip_info.filename for zip_info in zip_info_list]

#%%

def read_not_mandatory_csv_files(filename,arquivo_zip):
    """Read an optional ';'-separated CSV member from an open zip archive.

    Args:
        filename: Name of the CSV member inside the archive.
        arquivo_zip: An open ``zipfile.ZipFile`` to read the member from.

    Returns:
        The parsed DataFrame, or an empty DataFrame when the archive has no
        member called *filename* (a notice is printed in that case).
    """
    try:
        csv_member = arquivo_zip.open(filename)
    except KeyError as e:
        # ZipFile.open raises KeyError for a missing member; optional files
        # are allowed to be absent, so report it and hand back an empty frame
        # that callers can concatenate like any other.
        print("**** "+filename+" not found ****",e)
        return pd.DataFrame()

    # Parse once and close the member handle when done.
    with csv_member:
        return pd.read_csv(csv_member, sep=";", encoding="ISO-8859-1", low_memory=False)




#%%

# Parsing options shared by every CSV member read directly in this cell.
_CDA_CSV_KWARGS = {"sep": ";", "encoding": "ISO-8859-1", "low_memory": False}


def _read_cda_member(zip_archive, member_name):
    """Parse one CSV member of an open zip archive into a DataFrame."""
    return pd.read_csv(zip_archive.open(member_name), **_CDA_CSV_KWARGS)


for cda_files in zip_files_with_paths:

    with zipfile.ZipFile(cda_files, 'r') as arquivo_zip:
        # The six digits right before ".zip" in the archive name are part of
        # every member name inside it.
        data = re.search(r'(\d{6})\.zip', arquivo_zip.filename).group(1)
        csv_suffix = data + '.csv'

        # mandatory data
        df_blc1 = _read_cda_member(arquivo_zip, 'cda_fi_BLC_1_' + csv_suffix)
        df_blc2 = _read_cda_member(arquivo_zip, 'cda_fi_BLC_2_' + csv_suffix)
        df_blc3 = _read_cda_member(arquivo_zip, 'cda_fi_BLC_3_' + csv_suffix)
        df_blc4 = _read_cda_member(arquivo_zip, 'cda_fi_BLC_4_' + csv_suffix)
        df_blc5 = _read_cda_member(arquivo_zip, 'cda_fi_BLC_5_' + csv_suffix)
        df_blc6 = _read_cda_member(arquivo_zip, 'cda_fi_BLC_6_' + csv_suffix)
        df_blc7 = _read_cda_member(arquivo_zip, 'cda_fi_BLC_7_' + csv_suffix)
        df_blc8 = _read_cda_member(arquivo_zip, 'cda_fi_BLC_8_' + csv_suffix)
        df_fi_PL = _read_cda_member(arquivo_zip, 'cda_fi_PL_' + csv_suffix)

        # not mandatory or recently added data
        df_conf = read_not_mandatory_csv_files('cda_fi_CONFID_' + csv_suffix, arquivo_zip)
        df_fiim_confid = read_not_mandatory_csv_files('cda_fiim_CONFID_' + csv_suffix, arquivo_zip)
        df_fiim = read_not_mandatory_csv_files('cda_fiim_' + csv_suffix, arquivo_zip)

        # Rows of the archive just read are placed ahead of the rows
        # accumulated from the archives processed before it.
        df_blc1_appd = pd.concat([df_blc1, df_blc1_appd], ignore_index=True)
        df_blc2_appd = pd.concat([df_blc2, df_blc2_appd], ignore_index=True)
        df_blc3_appd = pd.concat([df_blc3, df_blc3_appd], ignore_index=True)
        df_blc4_appd = pd.concat([df_blc4, df_blc4_appd], ignore_index=True)
        df_blc5_appd = pd.concat([df_blc5, df_blc5_appd], ignore_index=True)
        df_blc6_appd = pd.concat([df_blc6, df_blc6_appd], ignore_index=True)
        df_blc7_appd = pd.concat([df_blc7, df_blc7_appd], ignore_index=True)
        df_blc8_appd = pd.concat([df_blc8, df_blc8_appd], ignore_index=True)
        df_fi_PL_appd = pd.concat([df_fi_PL, df_fi_PL_appd], ignore_index=True)

        df_conf_appd = pd.concat([df_conf, df_conf_appd], ignore_index=True)
        df_fiim_confid_appd = pd.concat([df_fiim_confid, df_fiim_confid_appd], ignore_index=True)
        df_fiim_appd = pd.concat([df_fiim, df_fiim_appd], ignore_index=True)
  
#%% ############################################################################################################
#%                                      2 -  CAPTURA CADASTRO ATUALIZADO DOS FUNDOS
#% ##############################################################################################################

url_raiz_cad_fi = 'https://dados.cvm.gov.br/dados/FI/CAD/DADOS/cad_fi.csv'

# Fetch the CSV; the timeout avoids hanging on a stalled connection and
# raise_for_status() stops the run before an error page gets parsed as data.
resposta_cad_fi = requests.get(url_raiz_cad_fi, timeout=120)
resposta_cad_fi.raise_for_status()

# Decode the raw bytes explicitly. read_csv's `encoding` argument has no
# effect on an already-decoded text buffer, and `.text` depends on the
# encoding requests infers from the response.
s = resposta_cad_fi.content.decode("ISO-8859-1")

df_cad_fi = pd.read_csv(io.StringIO(s), sep=";")


#%%
#% ##############################################################################################################
#%                                      3 -  CRIANDO DATABASE SQL LITE
#% ##############################################################################################################

# creating the SQLite database
def create_connection(db_file):
    """Open and then close a connection to the SQLite database at *db_file*.

    Prints the version of the SQLite library on success, or the error when
    ``sqlite3.connect`` raises a ``sqlite3.Error``. The connection, if one
    was opened, is always closed before returning. Returns None.
    """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
        # sqlite3.version is deprecated and removed in Python 3.14, where it
        # would raise an uncaught AttributeError; sqlite_version is the
        # supported attribute and reports the SQLite library version.
        print(sqlite3.sqlite_version)
    except Error as e:
        print(e)
    finally:
        if conn:
            conn.close()

if __name__ == '__main__':
    # Open and immediately close a connection to the dated database file.
    create_connection(f"{path_database}CVM_Database_{today}.db")

#%%
# saving the CVM data as tables in this database
conn = sqlite3.connect(path_database + "CVM_Database_"+today+".db") # opening the database connection

# Destination table name -> DataFrame stored under that name.
tabelas_cvm = {
    'CDA_bloco_1': df_blc1_appd,
    'CDA_bloco_2': df_blc2_appd,
    'CDA_bloco_3': df_blc3_appd,
    'CDA_bloco_4': df_blc4_appd,
    'CDA_bloco_5': df_blc5_appd,
    'CDA_bloco_6': df_blc6_appd,
    'CDA_bloco_7': df_blc7_appd,
    'CDA_bloco_8': df_blc8_appd,
    'CDA_confidenc': df_conf_appd,
    'CDA_fiim_confid': df_fiim_confid_appd,
    'CDA_fiim': df_fiim_appd,
    'CDA_fi_pl': df_fi_PL_appd,
    'CAD_FI_cad_fi_atual': df_cad_fi,
}

try:
    for nome_tabela, df_tabela in tabelas_cvm.items():
        if df_tabela.shape[1] == 0:
            # A frame without columns (e.g. an optional dataset that was never
            # found) cannot be written as a table; skip it and say so.
            # NOTE(review): assumes to_sql fails on a zero-column frame - confirm.
            print("**** " + nome_tabela + " skipped: no columns to write ****")
            continue
        df_tabela.to_sql(nome_tabela, conn, if_exists='replace', index=False)

    conn.commit()
finally:
    # Close the connection even when one of the writes fails.
    conn.close()

#%%
#r_df = pd.read_sql("select * from bloco_4",conn)
#print(r_df)


#### Dealing with the SQLite database

In [None]:
# Store the dataframe as the 'B3_companies_sectors' table, replacing any
# existing table with that name.
# NOTE(review): `df` is not defined in the visible cells; presumably it is
# created elsewhere in the notebook - confirm before running.
conn = sqlite3.connect('D:/finance_data/finance_database.db')

df.to_sql(name='B3_companies_sectors', con=conn, if_exists='replace', index=False)

In [None]:
# Query sqlite_master for the name of every table in the connected database
cursor = conn.cursor()
tables = cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()

# Each fetched row is a one-element tuple holding the table name
for table in tables:
    print(table[0])