# Getting Brazilian companies' segment, sector and governance categorization

**What?** Code that scrapes the segment, sector and governance-level classifications of the companies listed on the Brazilian stock market, for each ticker.

**Why?** A company's sector is valuable information for market analyses, such as price correlations between sectors and company risk levels according to governance status.

**How?** The sector classification is available for download on the website of the Brazilian stock exchange (B3). The zip file is downloaded with the `requests` library, and the spreadsheet containing the sector information is parsed to extract the final structured table. Finally, the table is saved to a local SQL database using the SQLite engine.

In [113]:
import pandas as pd
import numpy as np
import sqlite3

import requests
import zipfile
import os

#### Download the zip file from the B3 website

In [108]:
def download_file(url, local_filename, timeout=30):
    """Stream the resource at ``url`` into the file ``local_filename``.

    The response body is written in 8 KiB chunks so the whole payload never
    has to be held in memory at once.

    Args:
        url: Address of the file to download.
        local_filename: Path of the local file to write. It is opened in
            ``'wb'`` mode, so an existing file is overwritten.
        timeout: Value forwarded to ``requests.get`` as ``timeout`` (seconds).
            Without it, a server that stops answering would block the call
            forever.

    Returns:
        ``local_filename``, unchanged.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Fail early on 4xx/5xx instead of saving an error page as the file
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename


# Source archive on the B3 website and the local path it is stored under
url = (
    'https://www.b3.com.br/data/files/57/E6/AA/A1/68C7781064456178AC094EA8/'
    'ClassifSetorial.zip'
)
local_filename = 'file.zip'

# Fetch the archive; being the last expression of the cell, the returned
# file name is echoed by the notebook
download_file(url=url, local_filename=local_filename)

'file.zip'

#### Extracting the Excel file content from the zip file

In [109]:
# Folder that temporarily receives the extracted workbook
extract_dir = 'temp_folder'  # Replace 'temp_folder' with your desired extraction path

# Start from a known state so a stale `df` from a previous run is never reused
df = None

with zipfile.ZipFile(local_filename, 'r') as zip_ref:

    for file_name in zip_ref.namelist():
        # Only Excel workbooks (.xlsx) are of interest; ignore extension case
        if not file_name.lower().endswith('.xlsx'):
            continue

        # extract() returns the path it actually wrote to, which is safer than
        # rebuilding it by string concatenation (member names are normalised)
        extracted_path = zip_ref.extract(file_name, path=extract_dir)
        try:
            # Load sheet 'Plan3' into a DataFrame, skipping its first 6 rows
            df = pd.read_excel(extracted_path, skiprows=6, sheet_name='Plan3')
        finally:
            # Delete the extracted file even when reading it fails
            os.remove(extracted_path)

        # Show the raw table that was just loaded
        print(df)

if df is None:
    # Fail with an explicit message instead of a NameError further down
    raise FileNotFoundError(f"No .xlsx file found inside '{local_filename}'")

df.head()

                                       SETOR ECONÔMICO  \
0                                                  NaN   
1                      Petróleo, Gás e Biocombustíveis   
2                                                  NaN   
3                                                  NaN   
4                                                  NaN   
..                                                 ...   
569  As informações recebidas das empresas admitida...   
570                          nosso site www.b3.com.br.   
571  Para mais esclarecimentos, sugerimos procurar ...   
572  potenciais das negociações com valores mobiliá...   
573                    B3 S.A. - Brasil, Bolsa, Balcão   

                            SUBSETOR                           SEGMENTO  \
0                                NaN                                NaN   
1    Petróleo, Gás e Biocombustíveis  Exploração, Refino e Distribuição   
2                                NaN                       3R PETROLEUM   
3  

Unnamed: 0,SETOR ECONÔMICO,SUBSETOR,SEGMENTO,LISTAGEM,Unnamed: 4
0,,,,CÓDIGO,SEGMENTO
1,"Petróleo, Gás e Biocombustíveis","Petróleo, Gás e Biocombustíveis","Exploração, Refino e Distribuição",,
2,,,3R PETROLEUM,RRRP,NM
3,,,COSAN,CSAN,NM
4,,,ENAUTA PART,ENAT,NM


#### Parsing the B3 data file into a structured table

In [110]:
# Map the original spreadsheet headers to the column names of the final table
new_columns_names = {'SETOR ECONÔMICO':'economic_sector',
                     'SUBSETOR':'subsector',
                     'SEGMENTO':'company_name',
                     'LISTAGEM':'ticker_root',
                     'Unnamed: 4':'governance'}

df = df.rename(columns=new_columns_names)  # change columns name
df = df.drop(df.index[0]).reset_index(drop=True)  # deleting first row that was a header

# The same source column carries both segment titles (rows without a ticker)
# and company names (rows with a ticker): copy it, then blank the company rows
df['segment'] = df['company_name']
df.loc[df['ticker_root'].notnull(), 'segment'] = np.nan

# Propagate the last seen sector / subsector / segment down to the company rows.
# `.ffill()` replaces the deprecated `fillna(method='ffill')`.
for column in ('economic_sector', 'subsector', 'segment'):
    df[column] = df[column].ffill()

# Strip the ticker BEFORE measuring its length, otherwise a padded value such
# as 'ABCD ' would be rejected and a whitespace-only cell would be accepted
df['ticker_root'] = df['ticker_root'].str.strip()

# Keep only rows with a real ticker root (1 to 4 characters); missing values
# have a NaN length and are therefore dropped as well
df = df[df['ticker_root'].str.len().between(1, 4)].reset_index(drop=True)

# Reorder columns; `.copy()` makes the result an independent frame so the
# assignments below do not raise SettingWithCopyWarning
df = df[['ticker_root','company_name','segment','economic_sector','subsector','governance']].copy()

# Eliminate surrounding white space in the remaining text columns
for column in ('segment', 'economic_sector', 'subsector', 'governance'):
    df[column] = df[column].str.strip()


# Governance codes and their full names; codes missing from this mapping
# (or missing values) yield NaN in `governance_names`
new_gov_names = {'NM': 'Cia. Novo Mercado',
'N1': 'Cia. Nível 1 de Governança Corporativa',
'N2': 'Cia. Nível 2 de Governança Corporativa',
'MA': 'Cia. Bovespa Mais',
'M2': 'Cia. Bovespa Mais Nível 2',
'MB': 'Cia. Balcão Org. Tradicional',
'DR1': 'BDR Nível 1',
'DR2': 'BDR Nível 2',
'DR3': 'BDR Nível 3',
'DRE': 'BDR de ETF',
'DRN': 'BDR Não Patrocinado'}

df['governance_names'] = df['governance'].map(new_gov_names)

df.head()

Unnamed: 0,ticker_root,company_name,segment,economic_sector,subsector,governance,governance_names
0,RRRP,3R PETROLEUM,"Exploração, Refino e Distribuição","Petróleo, Gás e Biocombustíveis","Petróleo, Gás e Biocombustíveis",NM,Cia. Novo Mercado
1,CSAN,COSAN,"Exploração, Refino e Distribuição","Petróleo, Gás e Biocombustíveis","Petróleo, Gás e Biocombustíveis",NM,Cia. Novo Mercado
2,ENAT,ENAUTA PART,"Exploração, Refino e Distribuição","Petróleo, Gás e Biocombustíveis","Petróleo, Gás e Biocombustíveis",NM,Cia. Novo Mercado
3,RPMG,PET MANGUINH,"Exploração, Refino e Distribuição","Petróleo, Gás e Biocombustíveis","Petróleo, Gás e Biocombustíveis",,
4,PETR,PETROBRAS,"Exploração, Refino e Distribuição","Petróleo, Gás e Biocombustíveis","Petróleo, Gás e Biocombustíveis",N2,Cia. Nível 2 de Governança Corporativa


#### Saving the DataFrame to a local database file with SQLite

In [119]:
# Open (or create) the local SQLite database file
conn = sqlite3.connect('D:/finance_data/finance_database.db')
try:
    # Replace any existing 'B3_companies_sectors' table with the current
    # DataFrame; the DataFrame index is not written as a column
    rows_written = df.to_sql('B3_companies_sectors',conn,if_exists='replace',index=False)
finally:
    # Always release the database file handle, even if the write fails
    conn.close()

# Echo the value returned by to_sql as the cell output
rows_written

434