# Getting Brazilian funds' holdings from the CVM website

**What ?** The Brazilian funds are regulated by a governmental institution called CVM (Comissão de Valores Mobiliários). According to local law, every regulated fund must disclose its portfolio holdings, ticker by ticker, within a 90-day delay. This regulation promotes transparency in the Brazilian funds industry and provides valuable information for analyzing fund strategies, thereby facilitating informed decisions regarding money allocation.

**Why ?** Fund holdings are useful in showing how funds are behaving in the markets, including what they are buying, selling, or holding over time. As fund managers are highly qualified money allocators, having access to this information and the insights that come through them is extremely valuable for regular investors who do not have the same level of knowledge or information as a large asset management team. 

**How ?** The holdings dataset is available on the CVM website in a section called CDA - Composição e Diversificação das Aplicações ([link here!](https://dados.cvm.gov.br/dataset/fi-doc-cda)). On this page, there are eight packages of .csv files for each month, updated daily. These files contain the ticker-by-ticker holdings of Brazilian funds by asset class. For example, package 1 contains government securities, while package 4 includes cash equity and derivatives such as stocks, debentures, and options positions. To see what is included in each package, visit [this link](https://cvmweb.cvm.gov.br/SWB/Sistemas/SCW/PadroesXML/PadraoXMLCDANetV4.aspx). In this code, we will download the files for every month using the requests library and append each package to a pandas DataFrame. At the end, we will save the full DataFrames to a local SQLite database for further analysis.



<img src="https://lh3.googleusercontent.com/d/1UerOWXdGizjakNJVlGww2hKPmRUr4Eba" alt="icon_mutual_funds_holdings" width="300" align="center">

### Import Libraries

In [15]:
import pandas as pd
import csv
import os
import tempfile
import re
import io
from datetime import datetime
from datetime import date

import requests
import zipfile
import sqlite3
from sqlite3 import Error

pd.options.display.float_format = '{:.4f}'.format  # display floats with 4 decimal places

### Download files from CVM website

In [None]:
# Root URL of the CVM open-data folder for the CDA (Composição e Diversificação das
# Aplicações) dataset, and the common prefix of the monthly archive names
url_raiz = 'https://dados.cvm.gov.br/dados/FI/DOC/CDA/DADOS/'
url_file = 'cda_fi_'

# Months (YYYYMM) to be downloaded **** (Manual Parameter) ****
lista_datas = [
    '202404', '202403', '202402', '202401',
    '202312', '202311', '202310', '202309', '202308', '202307',
    '202306', '202305', '202304', '202303', '202302', '202301',
]

# Subfolder that receives the downloaded archives; exist_ok makes re-runs safe
subfolder_name = "temp_folder"
os.makedirs(subfolder_name, exist_ok=True)
file_path = os.path.join(subfolder_name)

# Request each monthly ".zip" archive from the CVM website and save it in the subfolder
for data in lista_datas:
    zip_name = url_file + data + '.zip'
    # A timeout keeps the cell from hanging forever on a stalled connection
    download = requests.get(url_raiz + zip_name, timeout=120)
    # Stop on an HTTP error instead of saving the error response body under a
    # ".zip" name, which would only fail later when the archive is opened
    download.raise_for_status()
    with open(os.path.join(file_path, zip_name), "wb") as arquivo_cvm:
        arquivo_cvm.write(download.content)

# Paths of every zip file found in the folder; sorted because os.listdir returns
# entries in arbitrary order and the processing order defines the final row order
all_files = os.listdir(file_path)
zip_files_with_paths = sorted(
    os.path.join(file_path, file) for file in all_files if file.endswith('.zip')
)

zip_files_with_paths

### Append monthly datasets into a single database

In [17]:
# One empty accumulator per CDA table; each name receives its own, independent
# DataFrame that the monthly data is later stacked onto.
(
    df_blc1_appd,
    df_blc2_appd,
    df_blc3_appd,
    df_blc4_appd,
    df_blc5_appd,
    df_blc6_appd,
    df_blc7_appd,
    df_blc8_appd,
    df_conf_appd,
    df_fi_PL_appd,
    df_fiim_confid_appd,
    df_fiim_appd,
) = (pd.DataFrame() for _ in range(12))

# function to read files that are not mandatory in the dataset
def read_not_mandatory_csv_files(filename,arquivo_zip):
    """Read an optional CSV member of an open zip archive.

    Parameters
    ----------
    filename : str
        Name of the CSV member inside the archive.
    arquivo_zip : zipfile.ZipFile
        Archive already opened for reading.

    Returns
    -------
    pandas.DataFrame
        The parsed member (';' separator, ISO-8859-1 encoding), or an empty
        DataFrame when the archive has no member named ``filename``. In that
        case a "not found" message is printed.
    """
    try:
        # ZipFile.open raises KeyError when the member is absent; only that case
        # is reported as "not found", so real parsing errors are not hidden
        csv_file = arquivo_zip.open(filename)
    except KeyError as e:
        print("**** "+filename+" not found ****",e)
        return pd.DataFrame()

    # Parse once and close the member handle afterwards
    with csv_file:
        return pd.read_csv(csv_file, sep=";", encoding="ISO-8859-1", low_memory=False)

# Read every monthly archive and stack each table across months.
# The monthly frames are collected per table and concatenated once at the end:
# calling pd.concat inside the loop copies the whole accumulated table again on
# every iteration.

def _read_mandatory_csv(arquivo_zip, filename):
    """Read a ';'-separated, ISO-8859-1 CSV member of an open zip archive."""
    with arquivo_zip.open(filename) as csv_file:
        return pd.read_csv(csv_file, sep=";", encoding="ISO-8859-1", low_memory=False)


def _stack_months(monthly_frames):
    """Concatenate monthly frames, last-read month first; empty frame if none."""
    # Reversed so the row order matches prepending each new month to the total;
    # None entries (optional file missing for that month) are left out
    available = [frame for frame in reversed(monthly_frames) if frame is not None]
    if not available:
        return pd.DataFrame()
    return pd.concat(available, ignore_index=True)


# File-name prefix of each table inside the monthly archive
mandatory_prefixes = {f"blc{i}": f"cda_fi_BLC_{i}_" for i in range(1, 9)}
mandatory_prefixes["fi_PL"] = "cda_fi_PL_"

# Not mandatory or recently added data (absent from some months)
optional_prefixes = {
    "conf": "cda_fi_CONFID_",
    "fiim_confid": "cda_fiim_CONFID_",
    "fiim": "cda_fiim_",
}

monthly_frames = {key: [] for key in (*mandatory_prefixes, *optional_prefixes)}

for cda_files in zip_files_with_paths:

    with zipfile.ZipFile(cda_files, 'r') as arquivo_zip:
        # The reference month (YYYYMM) is taken from the archive file name
        month_match = re.search(r'(\d{6})\.zip', arquivo_zip.filename)
        if month_match is None:
            print("**** " + str(arquivo_zip.filename) + " skipped: no YYYYMM in file name ****")
            continue
        data = month_match.group(1)

        # mandatory data
        for key, prefix in mandatory_prefixes.items():
            monthly_frames[key].append(_read_mandatory_csv(arquivo_zip, prefix + data + '.csv'))

        # not mandatory or recently added data
        for key, prefix in optional_prefixes.items():
            monthly_frames[key].append(read_not_mandatory_csv_files(prefix + data + '.csv', arquivo_zip))

df_blc1_appd = _stack_months(monthly_frames["blc1"])
df_blc2_appd = _stack_months(monthly_frames["blc2"])
df_blc3_appd = _stack_months(monthly_frames["blc3"])
df_blc4_appd = _stack_months(monthly_frames["blc4"])
df_blc5_appd = _stack_months(monthly_frames["blc5"])
df_blc6_appd = _stack_months(monthly_frames["blc6"])
df_blc7_appd = _stack_months(monthly_frames["blc7"])
df_blc8_appd = _stack_months(monthly_frames["blc8"])
df_fi_PL_appd = _stack_months(monthly_frames["fi_PL"])

df_conf_appd        = _stack_months(monthly_frames["conf"])
df_fiim_confid_appd = _stack_months(monthly_frames["fiim_confid"])
df_fiim_appd        = _stack_months(monthly_frames["fiim"])
  

**** cda_fi_CONFID_202301.csv not found **** "There is no item named 'cda_fi_CONFID_202301.csv' in the archive"
**** cda_fiim_CONFID_202301.csv not found **** "There is no item named 'cda_fiim_CONFID_202301.csv' in the archive"
**** cda_fi_CONFID_202302.csv not found **** "There is no item named 'cda_fi_CONFID_202302.csv' in the archive"
**** cda_fiim_CONFID_202302.csv not found **** "There is no item named 'cda_fiim_CONFID_202302.csv' in the archive"
**** cda_fi_CONFID_202303.csv not found **** "There is no item named 'cda_fi_CONFID_202303.csv' in the archive"
**** cda_fiim_CONFID_202303.csv not found **** "There is no item named 'cda_fiim_CONFID_202303.csv' in the archive"
**** cda_fi_CONFID_202304.csv not found **** "There is no item named 'cda_fi_CONFID_202304.csv' in the archive"
**** cda_fiim_CONFID_202304.csv not found **** "There is no item named 'cda_fiim_CONFID_202304.csv' in the archive"
**** cda_fi_CONFID_202305.csv not found **** "There is no item named 'cda_fi_CONFID_2023

NOTE: Some of the files, such as cda_fiim_CONFID and cda_fi_CONFID, are not present in every monthly dataset; they are usually present only in the most recent months. For more information, see https://dados.cvm.gov.br/dataset/fi-doc-cda 

#### Data preview

In [18]:
# Show the first rows of the accumulated cda_fi_BLC_4 table
df_blc4_appd.head()

Unnamed: 0,TP_FUNDO,CNPJ_FUNDO,DENOM_SOCIAL,DT_COMPTC,TP_APLIC,TP_ATIVO,EMISSOR_LIGADO,TP_NEGOC,QT_VENDA_NEGOC,VL_VENDA_NEGOC,...,VL_AQUIS_NEGOC,QT_POS_FINAL,VL_MERC_POS_FINAL,VL_CUSTO_POS_FINAL,DT_CONFID_APLIC,CD_ATIVO,DS_ATIVO,CD_ISIN,DT_INI_VIGENCIA,DT_FIM_VIGENCIA
0,FAPI,02.010.153/0001-45,BB FAPI FUNDO DE APOSENTADORIA PROGRAMADA INDI...,2024-04-30,Ações,Ação preferencial,N,Para negociação,0.0,0.0,...,95324.38,3000.0,94080.0,,,ITUB4,ITAUUNIBANCO PN N1,BRITUBACNPR1,2009-05-20,
1,FAPI,02.010.153/0001-45,BB FAPI FUNDO DE APOSENTADORIA PROGRAMADA INDI...,2024-04-30,Ações,Ação preferencial,N,Para negociação,2500.0,97525.0,...,103640.27,2500.0,105050.0,,,PETR4,PETROBRAS PN,BRPETRACNPR6,1997-03-24,
2,FAPI,02.661.252/0001-97,FUNDO DE APOSENTADORIA PROGRAMADA INDIVIDUAL A...,2024-04-30,Ações,Ação ordinária,N,Para negociação,0.0,0.0,...,323.6,487.0,7465.71,,,LREN3,LOJAS RENNER ON NM,BRLRENACNOR1,1973-07-24,
3,FAPI,02.661.252/0001-97,FUNDO DE APOSENTADORIA PROGRAMADA INDIVIDUAL A...,2024-04-30,Ações,Ação ordinária,N,Para negociação,0.0,0.0,...,498.8,410.0,19680.0,,,PRIO3,PETRORIO ON NM,BRPRIOACNOR1,2015-06-26,
4,FAPI,02.661.252/0001-97,FUNDO DE APOSENTADORIA PROGRAMADA INDIVIDUAL A...,2024-04-30,Ações,Ação ordinária,N,Para negociação,0.0,0.0,...,0.0,100.0,1909.0,,,SLCE3,SLC AGRICOLA ON NM,BRSLCEACNOR2,2007-06-15,


In [19]:
# Print the index range, column dtypes and memory usage of the accumulated cda_fi_BLC_4 table
df_blc4_appd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2675312 entries, 0 to 2675311
Data columns (total 21 columns):
 #   Column              Dtype  
---  ------              -----  
 0   TP_FUNDO            object 
 1   CNPJ_FUNDO          object 
 2   DENOM_SOCIAL        object 
 3   DT_COMPTC           object 
 4   TP_APLIC            object 
 5   TP_ATIVO            object 
 6   EMISSOR_LIGADO      object 
 7   TP_NEGOC            object 
 8   QT_VENDA_NEGOC      float64
 9   VL_VENDA_NEGOC      float64
 10  QT_AQUIS_NEGOC      float64
 11  VL_AQUIS_NEGOC      float64
 12  QT_POS_FINAL        float64
 13  VL_MERC_POS_FINAL   float64
 14  VL_CUSTO_POS_FINAL  float64
 15  DT_CONFID_APLIC     object 
 16  CD_ATIVO            object 
 17  DS_ATIVO            object 
 18  CD_ISIN             object 
 19  DT_INI_VIGENCIA     object 
 20  DT_FIM_VIGENCIA     object 
dtypes: float64(7), object(14)
memory usage: 428.6+ MB


### Write the dataframe into the SQLite database

In [20]:
# Connect to the local SQLite database file
conn = sqlite3.connect('D:/finance_data/finance_database.db')

# Destination table name -> accumulated DataFrame stored in it
tables_to_write = {
    'CVM_cda_b1': df_blc1_appd,
    'CVM_cda_b2': df_blc2_appd,
    'CVM_cda_b3': df_blc3_appd,
    'CVM_cda_b4': df_blc4_appd,
    'CVM_cda_b5': df_blc5_appd,
    'CVM_cda_b6': df_blc6_appd,
    'CVM_cda_b7': df_blc7_appd,
    'CVM_cda_b8': df_blc8_appd,
    'CVM_cda_confidenc': df_conf_appd,
    'CVM_cda_fiim_confid': df_fiim_confid_appd,
    'CVM_cda_fiim': df_fiim_appd,
    'CVM_fi_pl': df_fi_PL_appd,
}

try:
    for table_name, frame in tables_to_write.items():
        # if_exists='replace' overwrites the table on every run
        frame.to_sql(table_name, conn, if_exists='replace', index=False)
    conn.commit()
finally:
    # Always release the database file, even when one of the writes fails
    conn.close()