# üè¶ Banco Insights 2.0 - Comprehensive EDA

**Objective**: Explore BACEN data structure to understand:
- Column names, data types, and categorical variables
- Sample data from each dataset (first 10 rows)
- Foreign keys, identifiers, and unique constraints
- Categorical values and business meanings
- Data quality and completeness

**Output**: Complete data dictionary for Banco Insights 2.0

## üìã Setup and Imports

In [1]:
import pandas as pd
import numpy as np
import json
import warnings
from pathlib import Path
import os
from datetime import datetime

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)

# Set up paths
DATA_PATH = Path('../bacen_project_v1/data')
RAW_DATA_PATH = DATA_PATH / 'data_raw_reports'

print("üöÄ Banco Insights 2.0 EDA - Setup Complete!")
print(f"üìÅ Data path: {DATA_PATH.absolute()}")
print(f"üìä Raw reports: {len(list(RAW_DATA_PATH.glob('*.csv')))} quarterly files found")

üöÄ Banco Insights 2.0 EDA - Setup Complete!
üìÅ Data path: /Users/iagoaffonso/code/IagoAffonso/banco-insights-2.0/EDA/../bacen_project_v1/data
üìä Raw reports: 47 quarterly files found


## üìä Part 1: Data Discovery - File Structure Analysis

In [2]:
# Discover all data files
print("üìÅ DATA FILES DISCOVERY")
print("=" * 50)

# List all CSV files in data directory
csv_files = list(DATA_PATH.glob('*.csv'))
json_files = list(DATA_PATH.glob('*.json'))
raw_files = list(RAW_DATA_PATH.glob('*.csv'))

print(f"\nüìã Main Data Directory ({len(csv_files)} CSV files):")
for file in csv_files:
    try:
        # Get file size safely
        size_mb = file.stat().st_size / (1024 * 1024)
        print(f"  ‚Ä¢ {file.name} ({size_mb:.1f} MB)")
    except Exception as e:
        print(f"  ‚Ä¢ {file.name} (size unknown)")

print(f"\nüìã JSON Files ({len(json_files)} files):")
for file in json_files:
    try:
        size_mb = file.stat().st_size / (1024 * 1024)
        print(f"  ‚Ä¢ {file.name} ({size_mb:.1f} MB)")
    except Exception as e:
        print(f"  ‚Ä¢ {file.name} (size unknown)")

print(f"\nüìã Raw Reports Directory ({len(raw_files)} files):")
print(f"  ‚Ä¢ Raw quarterly files from {raw_files[0].stem.split('_')[1]} to {raw_files[-1].stem.split('_')[1]}")
print(f"  ‚Ä¢ Pattern: data_YYYYMM_Tipo2_RelatorioT.csv")

# Focus on main consolidated files for analysis
main_files = {
    'consolidated_cleaned': 'consolidated_cleaned.csv',
    'consolidated_reports': 'consolidated_reports.csv',
    'financial_metrics': 'financial_metrics.csv',
    'financial_metrics_processed': 'financial_metrics_processed.csv',
    'market_metrics': 'market_metrics.csv',
    'credit_data': 'credit_data.csv',
    'cred_pf': 'cred_pf.csv',
    'cred_pj': 'cred_pj.csv'
}

print(f"\nüéØ MAIN FILES FOR ANALYSIS:")
for name, filename in main_files.items():
    file_path = DATA_PATH / filename
    if file_path.exists():
        try:
            size_mb = file_path.stat().st_size / (1024 * 1024)
            print(f"  ‚úÖ {name}: {filename} ({size_mb:.1f} MB)")
        except Exception as e:
            print(f"  ‚úÖ {name}: {filename} (size unknown)")
    else:
        print(f"  ‚ùå {name}: {filename} (NOT FOUND)")

üìÅ DATA FILES DISCOVERY

üìã Main Data Directory (8 CSV files):
  ‚Ä¢ financial_metrics_processed.csv (235.0 MB)
  ‚Ä¢ cred_pf.csv (89.1 MB)
  ‚Ä¢ credit_data.csv (242.3 MB)
  ‚Ä¢ consolidated_cleaned.csv (4561.8 MB)
  ‚Ä¢ market_metrics.csv (346.9 MB)
  ‚Ä¢ financial_metrics.csv (1110.8 MB)
  ‚Ä¢ consolidated_reports.csv (3704.5 MB)
  ‚Ä¢ cred_pj.csv (153.2 MB)

üìã JSON Files (1 files):
  ‚Ä¢ consolidated_institutions.json (2.7 MB)

üìã Raw Reports Directory (47 files):
  ‚Ä¢ Raw quarterly files from 201412 to 201703
  ‚Ä¢ Pattern: data_YYYYMM_Tipo2_RelatorioT.csv

üéØ MAIN FILES FOR ANALYSIS:
  ‚úÖ consolidated_cleaned: consolidated_cleaned.csv (4561.8 MB)
  ‚úÖ consolidated_reports: consolidated_reports.csv (3704.5 MB)
  ‚úÖ financial_metrics: financial_metrics.csv (1110.8 MB)
  ‚úÖ financial_metrics_processed: financial_metrics_processed.csv (235.0 MB)
  ‚úÖ market_metrics: market_metrics.csv (346.9 MB)
  ‚úÖ credit_data: credit_data.csv (242.3 MB)
  ‚úÖ cred_pf: cred_pf.csv

## üìä Part 2: Smart Data Loading (Sample-Based Analysis)

In [3]:
def safe_load_sample(file_path, sample_size=10000):
    """
    Safely load a sample of data from a CSV file to avoid memory issues
    """
    try:
        # First, read just the header to understand structure
        header_df = pd.read_csv(file_path, nrows=0)
        total_cols = len(header_df.columns)

        # Read a sample of rows
        sample_df = pd.read_csv(file_path, nrows=sample_size, low_memory=False)

        # Get file info
        file_size_mb = file_path.stat().st_size / (1024 * 1024)

        return {
            'data': sample_df,
            'total_columns': total_cols,
            'sample_rows': len(sample_df),
            'file_size_mb': file_size_mb,
            'success': True,
            'error': None
        }
    except Exception as e:
        return {
            'data': None,
            'total_columns': 0,
            'sample_rows': 0,
            'file_size_mb': 0,
            'success': False,
            'error': str(e)
        }

# Load samples from main files
datasets = {}
print("üìä LOADING DATA SAMPLES (10K rows max per file)")
print("=" * 60)

for name, filename in main_files.items():
    file_path = DATA_PATH / filename
    if file_path.exists():
        result = safe_load_sample(file_path, sample_size=10000)
        if result['success']:
            datasets[name] = result['data']
            print(f"‚úÖ {name}: {result['sample_rows']} rows √ó {result['total_columns']} cols ({result['file_size_mb']:.1f} MB)")
        else:
            print(f"‚ùå {name}: Error loading - {result['error']}")
    else:
        print(f"‚ö†Ô∏è {name}: File not found")

print(f"\nüìä Successfully loaded {len(datasets)} datasets for analysis")

üìä LOADING DATA SAMPLES (10K rows max per file)
‚úÖ consolidated_cleaned: 10000 rows √ó 15 cols (4561.8 MB)
‚úÖ consolidated_reports: 10000 rows √ó 10 cols (3704.5 MB)
‚úÖ financial_metrics: 10000 rows √ó 15 cols (1110.8 MB)
‚úÖ financial_metrics_processed: 10000 rows √ó 10 cols (235.0 MB)
‚úÖ market_metrics: 10000 rows √ó 15 cols (346.9 MB)
‚úÖ credit_data: 10000 rows √ó 15 cols (242.3 MB)
‚úÖ cred_pf: 10000 rows √ó 15 cols (89.1 MB)
‚úÖ cred_pj: 10000 rows √ó 15 cols (153.2 MB)

üìä Successfully loaded 8 datasets for analysis


## üìä Part 3: Column Analysis and Data Types

In [4]:
# Comprehensive column analysis
print("üîç COLUMN ANALYSIS & DATA TYPES")
print("=" * 60)

column_analysis = {}

for name, df in datasets.items():
    print(f"\nüìã {name.upper()}")
    print("-" * 40)
    print(f"Shape: {df.shape[0]:,} rows √ó {df.shape[1]} columns")

    # Column details
    col_info = []
    for col in df.columns:
        dtype = str(df[col].dtype)
        non_null = df[col].count()
        null_count = df[col].isnull().sum()
        null_pct = (null_count / len(df)) * 100
        unique_count = df[col].nunique()

        # Classify column type
        if dtype in ['int64', 'float64', 'int32', 'float32']:
            col_type = 'Numerical'
        elif 'datetime' in dtype:
            col_type = 'DateTime'
        else:
            col_type = 'Categorical/Text'

        col_info.append({
            'column': col,
            'dtype': dtype,
            'type_category': col_type,
            'non_null': non_null,
            'null_count': null_count,
            'null_pct': null_pct,
            'unique_values': unique_count
        })

    column_analysis[name] = col_info

    # Display summary
    col_df = pd.DataFrame(col_info)
    print("\nColumns:")
    for _, row in col_df.iterrows():
        print(f"  ‚Ä¢ {row['column']:<25} | {row['type_category']:<15} | {row['dtype']:<10} | Unique: {row['unique_values']:<8} | Null: {row['null_pct']:.1f}%")

    # Type summary
    type_summary = col_df['type_category'].value_counts()
    print(f"\nType Summary: {dict(type_summary)}")

üîç COLUMN ANALYSIS & DATA TYPES

üìã CONSOLIDATED_CLEANED
----------------------------------------
Shape: 10,000 rows √ó 15 columns

Columns:
  ‚Ä¢ TipoInstituicao           | Numerical       | int64      | Unique: 1        | Null: 0.0%
  ‚Ä¢ CodInst                   | Categorical/Text | object     | Unique: 586      | Null: 0.0%
  ‚Ä¢ AnoMes                    | Categorical/Text | object     | Unique: 1        | Null: 0.0%
  ‚Ä¢ NomeRelatorio             | Categorical/Text | object     | Unique: 2        | Null: 0.0%
  ‚Ä¢ NumeroRelatorio           | Numerical       | int64      | Unique: 2        | Null: 0.0%
  ‚Ä¢ Grupo                     | Categorical/Text | object     | Unique: 3        | Null: 0.0%
  ‚Ä¢ Conta                     | Numerical       | int64      | Unique: 29       | Null: 0.0%
  ‚Ä¢ NomeColuna                | Categorical/Text | object     | Unique: 30       | Null: 0.0%
  ‚Ä¢ DescricaoColuna           | Categorical/Text | object     | Unique: 29       | Null:

## üìä Part 4: Sample Data Preview (First 10 Rows)

In [5]:
# Show first 10 rows for each dataset
print("üëÄ SAMPLE DATA PREVIEW (First 10 rows)")
print("=" * 60)

for name, df in datasets.items():
    print(f"\nüìã {name.upper()} - Sample Data")
    print("-" * 50)

    # Show first 10 rows
    sample_data = df.head(10)
    print(f"Shape: {sample_data.shape}")
    print("\nFirst 10 rows:")
    display(sample_data)

    # Show column list for reference
    print(f"\nColumn Names: {list(df.columns)}")

üëÄ SAMPLE DATA PREVIEW (First 10 rows)

üìã CONSOLIDATED_CLEANED - Sample Data
--------------------------------------------------
Shape: (10, 15)

First 10 rows:


Unnamed: 0,TipoInstituicao,CodInst,AnoMes,NomeRelatorio,NumeroRelatorio,Grupo,Conta,NomeColuna,DescricaoColuna,Saldo,AnoMes_M,AnoMes_Q,AnoMes_Y,NomeRelatorio_Grupo_Coluna,NomeInstituicao
0,2,253448,2014-12-01,Ativo,2,nagroup,78202,Ativo Total Ajustado \n(i) = (a) + (b) + (c) +...,[11000006] + [12000005] + [13000004] + [1600...,178797000.0,2014-12,2014Q4,2014,Ativo_nagroup_Ativo Total Ajustado \n(i) = (a)...,BANCO NEON S.A.
1,2,253448,2014-12-01,Ativo,2,nagroup,78196,Credores por Antecipa√ß√£o de Valor Residual \n(j),[49908008],0.0,2014-12,2014Q4,2014,Ativo_nagroup_Credores por Antecipa√ß√£o de Valo...,BANCO NEON S.A.
2,2,253448,2014-12-01,Ativo,2,nagroup,78182,Ativo Total \n(k) = (i) - (j),[10000007]+[20000004],178797000.0,2014-12,2014Q4,2014,Ativo_nagroup_Ativo Total \n(k) = (i) - (j),BANCO NEON S.A.
3,2,259231,2014-12-01,Ativo,2,nagroup,78188,Disponibilidades \n(a),[11000006],711288.6,2014-12,2014Q4,2014,Ativo_nagroup_Disponibilidades \n(a),SICOOB UNIMAIS METROPOLITANA - COOPERATIVA DE ...
4,2,259231,2014-12-01,Ativo,2,nagroup,78189,Aplica√ß√µes Interfinanceiras de Liquidez \n(b),[12000005],0.0,2014-12,2014Q4,2014,Ativo_nagroup_Aplica√ß√µes Interfinanceiras de L...,SICOOB UNIMAIS METROPOLITANA - COOPERATIVA DE ...
5,2,259231,2014-12-01,Ativo,2,nagroup,78190,TVM e Instrumentos Financeiros Derivativos \n(c),[13000004],2362857.0,2014-12,2014Q4,2014,Ativo_nagroup_TVM e Instrumentos Financeiros D...,SICOOB UNIMAIS METROPOLITANA - COOPERATIVA DE ...
6,2,259231,2014-12-01,Ativo,2,Opera√ß√µes de Cr√©dito,78191,Opera√ß√µes de Cr√©dito \n(d1),[16000001]-[16900008],186003100.0,2014-12,2014Q4,2014,Ativo_Opera√ß√µes de Cr√©dito_Opera√ß√µes de Cr√©dit...,SICOOB UNIMAIS METROPOLITANA - COOPERATIVA DE ...
7,2,259231,2014-12-01,Ativo,2,Opera√ß√µes de Cr√©dito,78192,Provis√£o sobre Opera√ß√µes de Cr√©dito \n(d2),[16900008],-7838668.0,2014-12,2014Q4,2014,Ativo_Opera√ß√µes de Cr√©dito_Provis√£o sobre Oper...,SICOOB UNIMAIS METROPOLITANA - COOPERATIVA DE ...
8,2,259231,2014-12-01,Ativo,2,Opera√ß√µes de Cr√©dito,78193,Opera√ß√µes de Cr√©dito L√≠quidas de Provis√£o \n(d),[16000001],178164400.0,2014-12,2014Q4,2014,Ativo_Opera√ß√µes de Cr√©dito_Opera√ß√µes de Cr√©dit...,SICOOB UNIMAIS METROPOLITANA - COOPERATIVA DE ...
9,2,259231,2014-12-01,Ativo,2,Arrendamento Mercantil,78194,Arrendamento Mercantil a Receber \n(e1),[17000000]-[17900007],0.0,2014-12,2014Q4,2014,Ativo_Arrendamento Mercantil_Arrendamento Merc...,SICOOB UNIMAIS METROPOLITANA - COOPERATIVA DE ...



Column Names: ['TipoInstituicao', 'CodInst', 'AnoMes', 'NomeRelatorio', 'NumeroRelatorio', 'Grupo', 'Conta', 'NomeColuna', 'DescricaoColuna', 'Saldo', 'AnoMes_M', 'AnoMes_Q', 'AnoMes_Y', 'NomeRelatorio_Grupo_Coluna', 'NomeInstituicao']

üìã CONSOLIDATED_REPORTS - Sample Data
--------------------------------------------------
Shape: (10, 10)

First 10 rows:


Unnamed: 0,TipoInstituicao,CodInst,AnoMes,NomeRelatorio,NumeroRelatorio,Grupo,Conta,NomeColuna,DescricaoColuna,Saldo
0,2,253448,201412,Ativo,2,,78202,Ativo Total Ajustado \n(i) = (a) + (b) + (c) +...,[11000006] + [12000005] + [13000004] + [1600...,17879704686
1,2,253448,201412,Ativo,2,,78196,Credores por Antecipa√ß√£o de Valor Residual \n(j),[49908008],0
2,2,253448,201412,Ativo,2,,78182,Ativo Total \n(k) = (i) - (j),[10000007]+[20000004],17879704686
3,2,259231,201412,Ativo,2,,78188,Disponibilidades \n(a),[11000006],71128856
4,2,259231,201412,Ativo,2,,78189,Aplica√ß√µes Interfinanceiras de Liquidez \n(b),[12000005],0
5,2,259231,201412,Ativo,2,,78190,TVM e Instrumentos Financeiros Derivativos \n(c),[13000004],236285671
6,2,259231,201412,Ativo,2,Opera√ß√µes de Cr√©dito,78191,Opera√ß√µes de Cr√©dito \n(d1),[16000001]-[16900008],1860031139
7,2,259231,201412,Ativo,2,Opera√ß√µes de Cr√©dito,78192,Provis√£o sobre Opera√ß√µes de Cr√©dito \n(d2),[16900008],-78386682
8,2,259231,201412,Ativo,2,Opera√ß√µes de Cr√©dito,78193,Opera√ß√µes de Cr√©dito L√≠quidas de Provis√£o \n(d),[16000001],1781644457
9,2,259231,201412,Ativo,2,Arrendamento Mercantil,78194,Arrendamento Mercantil a Receber \n(e1),[17000000]-[17900007],0



Column Names: ['TipoInstituicao', 'CodInst', 'AnoMes', 'NomeRelatorio', 'NumeroRelatorio', 'Grupo', 'Conta', 'NomeColuna', 'DescricaoColuna', 'Saldo']

üìã FINANCIAL_METRICS - Sample Data
--------------------------------------------------
Shape: (10, 15)

First 10 rows:


Unnamed: 0,TipoInstituicao,CodInst,AnoMes,NomeRelatorio,NumeroRelatorio,Grupo,Conta,NomeColuna,DescricaoColuna,Saldo,AnoMes_M,AnoMes_Q,AnoMes_Y,NomeRelatorio_Grupo_Coluna,NomeInstituicao
0,2,C0051389,2014-12-01,Demonstra√ß√£o de Resultado,4,Resultado de Intermedia√ß√£o Financeira - Receit...,78204,Rendas de Opera√ß√µes de Arrendamento Mercantil ...,[71200004]+[81940000],819733.6,2014-12,2014Q4,2014,Demonstra√ß√£o de Resultado_Resultado de Interme...,BONCRED
1,2,C0051389,2014-12-01,Demonstra√ß√£o de Resultado,4,Resultado de Intermedia√ß√£o Financeira - Receit...,78205,Rendas de Opera√ß√µes com TVM \n(a3),[71400000]+[71500003]-[71580009]+[71940003]+[7...,828141.5,2014-12,2014Q4,2014,Demonstra√ß√£o de Resultado_Resultado de Interme...,BONCRED
2,2,C0051389,2014-12-01,Demonstra√ß√£o de Resultado,4,Resultado de Intermedia√ß√£o Financeira - Receit...,78206,Rendas de Opera√ß√µes com Instrumentos Financeir...,[71580009]+[81550005]+[71990266]+[81830268],0.0,2014-12,2014Q4,2014,Demonstra√ß√£o de Resultado_Resultado de Interme...,BONCRED
3,2,C0051389,2014-12-01,Demonstra√ß√£o de Resultado,4,Resultado de Intermedia√ß√£o Financeira - Receit...,78207,Resultado de Opera√ß√µes de C√¢mbio \n(a5),[71300007]+[81400007],0.0,2014-12,2014Q4,2014,Demonstra√ß√£o de Resultado_Resultado de Interme...,BONCRED
4,2,C0051389,2014-12-01,Demonstra√ß√£o de Resultado,4,Resultado de Intermedia√ß√£o Financeira - Receit...,78231,Rendas de Aplica√ß√µes Compuls√≥rias \n(a6),[71955005]+[71960007]+[71965002]+[71990125]+[8...,0.0,2014-12,2014Q4,2014,Demonstra√ß√£o de Resultado_Resultado de Interme...,BONCRED
5,2,C0051389,2014-12-01,Demonstra√ß√£o de Resultado,4,Resultado de Intermedia√ß√£o Financeira - Receit...,78208,Receitas de Intermedia√ß√£o Financeira \n(a) = (...,"Somat√≥rio de Rendas de Opera√ß√µes de Cr√©dito, R...",1668139.0,2014-12,2014Q4,2014,Demonstra√ß√£o de Resultado_Resultado de Interme...,BONCRED
6,2,C0051389,2014-12-01,Demonstra√ß√£o de Resultado,4,Resultado de Intermedia√ß√£o Financeira - Despes...,78209,Despesas de Capta√ß√£o \n(b1),[81100008]+[81980008]+[81986002]+[81912007],0.0,2014-12,2014Q4,2014,Demonstra√ß√£o de Resultado_Resultado de Interme...,BONCRED
7,2,C0051389,2014-12-01,Demonstra√ß√£o de Resultado,4,Resultado de Intermedia√ß√£o Financeira - Despes...,78210,Despesas de Obriga√ß√µes por Empr√©stimos e Repas...,[81200001]+[81960004],-54.87,2014-12,2014Q4,2014,Demonstra√ß√£o de Resultado_Resultado de Interme...,BONCRED
8,2,C0051389,2014-12-01,Demonstra√ß√£o de Resultado,4,Resultado de Intermedia√ß√£o Financeira - Despes...,78211,Despesas de Opera√ß√µes de Arrendamento Mercanti...,[81300004]+[81830550],-688261.4,2014-12,2014Q4,2014,Demonstra√ß√£o de Resultado_Resultado de Interme...,BONCRED
9,2,C0051396,2014-12-01,Demonstra√ß√£o de Resultado,4,Resultado de Intermedia√ß√£o Financeira - Receit...,78203,Rendas de Opera√ß√µes de Cr√©dito \n(a1),[71100001]+[71910002]+[71915007]+[71920009]+[7...,348196100.0,2014-12,2014Q4,2014,Demonstra√ß√£o de Resultado_Resultado de Interme...,HONDA



Column Names: ['TipoInstituicao', 'CodInst', 'AnoMes', 'NomeRelatorio', 'NumeroRelatorio', 'Grupo', 'Conta', 'NomeColuna', 'DescricaoColuna', 'Saldo', 'AnoMes_M', 'AnoMes_Q', 'AnoMes_Y', 'NomeRelatorio_Grupo_Coluna', 'NomeInstituicao']

üìã FINANCIAL_METRICS_PROCESSED - Sample Data
--------------------------------------------------
Shape: (10, 10)

First 10 rows:


Unnamed: 0,NomeInstituicao,AnoMes_Q,AnoMes,ComponentType,Component,ValueAbsolute,ValuePercentRevenue,ValuePerClient,NumClients,ReceitaOperacional
0,ABC-BRASIL,2014Q2,2014-06-01,revenue_buildup,Rendas de Opera√ß√µes de Cr√©dito \n(a1),481738900.0,50.85095,294281.550037,1637.0,947354800.0
1,ABC-BRASIL,2014Q2,2014-06-01,revenue_buildup,Rendas de Opera√ß√µes com TVM \n(a3),293853200.0,31.018286,179507.155345,1637.0,947354800.0
2,ABC-BRASIL,2014Q2,2014-06-01,revenue_buildup,Outras Receitas Intermedia√ß√£o,-91106030.0,-9.616887,-55654.266542,1637.0,947354800.0
3,ABC-BRASIL,2014Q2,2014-06-01,revenue_buildup,Rendas de Presta√ß√£o de Servi√ßos \n(d1),76548730.0,8.08026,46761.593867,1637.0,947354800.0
4,ABC-BRASIL,2014Q2,2014-06-01,revenue_buildup,Rendas de Tarifas Banc√°rias \n(d2),6281238.0,0.663029,3837.042205,1637.0,947354800.0
5,ABC-BRASIL,2014Q2,2014-06-01,revenue_buildup,Outras Receitas Operacionais \n(d7),180038700.0,19.004362,109980.89584,1637.0,947354800.0
6,ABC-BRASIL,2014Q2,2014-06-01,revenue_buildup,Receita Operacional,947354800.0,100.0,578713.970751,1637.0,947354800.0
7,ABC-BRASIL,2014Q2,2014-06-01,pl_decomposition,Receita Operacional,947354800.0,100.0,578713.970751,1637.0,947354800.0
8,ABC-BRASIL,2014Q2,2014-06-01,pl_decomposition,Despesas de Intermedia√ß√£o Financeira \n(b) = (...,-499658000.0,-52.742434,-305227.834282,1637.0,947354800.0
9,ABC-BRASIL,2014Q2,2014-06-01,pl_decomposition,Despesas de Pessoal \n(d3),-73068760.0,-7.712924,-44635.769933,1637.0,947354800.0



Column Names: ['NomeInstituicao', 'AnoMes_Q', 'AnoMes', 'ComponentType', 'Component', 'ValueAbsolute', 'ValuePercentRevenue', 'ValuePerClient', 'NumClients', 'ReceitaOperacional']

üìã MARKET_METRICS - Sample Data
--------------------------------------------------
Shape: (10, 15)

First 10 rows:


Unnamed: 0,TipoInstituicao,CodInst,AnoMes,NomeRelatorio,NumeroRelatorio,Grupo,Conta,NomeColuna,DescricaoColuna,Saldo,AnoMes_M,AnoMes_Q,AnoMes_Y,NomeRelatorio_Grupo_Coluna,NomeInstituicao
0,2,9526594,2014-12-01,Carteira de cr√©dito ativa - quantidade de clie...,10,nagroup,113786,Quantidade de clientes com opera√ß√µes ativas,Quantidade de clientes com opera√ß√µes ativas,1171.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa - quantidade de clie...,BANCO MASTER DE INVESTIMENTO S.A.
1,2,9527069,2014-12-01,Carteira de cr√©dito ativa - quantidade de clie...,10,nagroup,113786,Quantidade de clientes com opera√ß√µes ativas,Quantidade de clientes com opera√ß√µes ativas,413.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa - quantidade de clie...,COOPERATIVA DE ECONOMIA E CR√âDITO M√öTUO DE RIO...
2,2,9552111,2014-12-01,Carteira de cr√©dito ativa - quantidade de clie...,10,nagroup,113786,Quantidade de clientes com opera√ß√µes ativas,Quantidade de clientes com opera√ß√µes ativas,437.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa - quantidade de clie...,"COOPERATIVA DE CR√âDITO DOS MAGISTRADOS, SERVID..."
3,2,9576038,2014-12-01,Carteira de cr√©dito ativa - quantidade de clie...,10,nagroup,113786,Quantidade de clientes com opera√ß√µes ativas,Quantidade de clientes com opera√ß√µes ativas,374.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa - quantidade de clie...,"COOPERATIVA DE CR√âDITO, POUPAN√áA E INVESTIMENT..."
4,2,9576849,2014-12-01,Carteira de cr√©dito ativa - quantidade de clie...,10,nagroup,113786,Quantidade de clientes com opera√ß√µes ativas,Quantidade de clientes com opera√ß√µes ativas,193.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa - quantidade de clie...,COOPERATIVA DE CR√âDITO DE LIVRE ADMISS√ÉO DE CE...
5,2,9579249,2014-12-01,Carteira de cr√©dito ativa - quantidade de clie...,10,nagroup,113786,Quantidade de clientes com opera√ß√µes ativas,Quantidade de clientes com opera√ß√µes ativas,282.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa - quantidade de clie...,COOPERATIVA DE ECONOMIA E CR√âDITO M√öTUO DOS EM...
6,2,9590601,2014-12-01,Carteira de cr√©dito ativa - quantidade de clie...,10,nagroup,113786,Quantidade de clientes com opera√ß√µes ativas,Quantidade de clientes com opera√ß√µes ativas,799.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa - quantidade de clie...,COOPERATIVA DE CR√âDITO DA SERRA CATARINENSE - ...
7,2,9639338,2014-12-01,Carteira de cr√©dito ativa - quantidade de clie...,10,nagroup,113786,Quantidade de clientes com opera√ß√µes ativas,Quantidade de clientes com opera√ß√µes ativas,304.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa - quantidade de clie...,COOPERATIVA DE ECONOMIA E CR√âDITO M√öTUO DE PRA...
8,2,9720794,2014-12-01,Carteira de cr√©dito ativa - quantidade de clie...,10,nagroup,113786,Quantidade de clientes com opera√ß√µes ativas,Quantidade de clientes com opera√ß√µes ativas,187.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa - quantidade de clie...,COOPERATIVA DE CR√âDITO M√öTUO DOS INTEGRANTES D...
9,2,10013534,2014-12-01,Carteira de cr√©dito ativa - quantidade de clie...,10,nagroup,113786,Quantidade de clientes com opera√ß√µes ativas,Quantidade de clientes com opera√ß√µes ativas,4.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa - quantidade de clie...,COOPERATIVA CENTRAL DE CR√âDITO COM INTERA√á√ÉO S...



Column Names: ['TipoInstituicao', 'CodInst', 'AnoMes', 'NomeRelatorio', 'NumeroRelatorio', 'Grupo', 'Conta', 'NomeColuna', 'DescricaoColuna', 'Saldo', 'AnoMes_M', 'AnoMes_Q', 'AnoMes_Y', 'NomeRelatorio_Grupo_Coluna', 'NomeInstituicao']

üìã CREDIT_DATA - Sample Data
--------------------------------------------------
Shape: (10, 15)

First 10 rows:


Unnamed: 0,TipoInstituicao,CodInst,AnoMes,NomeRelatorio,NumeroRelatorio,Grupo,Conta,NomeColuna,DescricaoColuna,Saldo,AnoMes_M,AnoMes_Q,AnoMes_Y,NomeRelatorio_Grupo_Coluna,NomeInstituicao
0,2,C0031873,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,nagroup,24452,Total da Carteira de Pessoa F√≠sica,Volume de cr√©dito disponibilizado a pessoas f√≠...,48796780.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,SOFISA
1,2,C0031873,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,Empr√©stimo com Consigna√ß√£o em Folha,23227,Total,Total do grupo,2143650.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,SOFISA
2,2,C0031873,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,Empr√©stimo sem Consigna√ß√£o em Folha,23235,Total,Total do grupo,24514270.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,SOFISA
3,2,C0031873,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,Ve√≠culos,23251,Total,Total do grupo,3764477.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,SOFISA
4,2,C0031873,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,Outros Cr√©ditos,23275,Total,Total do grupo,18374380.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,SOFISA
5,2,C0031976,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,nagroup,24452,Total da Carteira de Pessoa F√≠sica,Volume de cr√©dito disponibilizado a pessoas f√≠...,7101888000.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,BRB
6,2,C0031976,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,Empr√©stimo com Consigna√ß√£o em Folha,23227,Total,Total do grupo,4115148000.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,BRB
7,2,C0031976,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,Empr√©stimo sem Consigna√ß√£o em Folha,23235,Total,Total do grupo,1452908000.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,BRB
8,2,C0031976,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,Ve√≠culos,23251,Total,Total do grupo,181313900.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,BRB
9,2,C0031976,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,Habita√ß√£o,23259,Total,Total do grupo,445100600.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,BRB



Column Names: ['TipoInstituicao', 'CodInst', 'AnoMes', 'NomeRelatorio', 'NumeroRelatorio', 'Grupo', 'Conta', 'NomeColuna', 'DescricaoColuna', 'Saldo', 'AnoMes_M', 'AnoMes_Q', 'AnoMes_Y', 'NomeRelatorio_Grupo_Coluna', 'NomeInstituicao']

üìã CRED_PF - Sample Data
--------------------------------------------------
Shape: (10, 15)

First 10 rows:


Unnamed: 0,TipoInstituicao,CodInst,AnoMes,NomeRelatorio,NumeroRelatorio,Grupo,Conta,NomeColuna,DescricaoColuna,Saldo,AnoMes_M,AnoMes_Q,AnoMes_Y,NomeRelatorio_Grupo_Coluna,NomeInstituicao
0,2,C0031873,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,nagroup,24452,Total da Carteira de Pessoa F√≠sica,Volume de cr√©dito disponibilizado a pessoas f√≠...,48796780.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,SOFISA
1,2,C0031873,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,Empr√©stimo com Consigna√ß√£o em Folha,23227,Total,Total do grupo,2143650.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,SOFISA
2,2,C0031873,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,Empr√©stimo sem Consigna√ß√£o em Folha,23235,Total,Total do grupo,24514270.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,SOFISA
3,2,C0031873,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,Ve√≠culos,23251,Total,Total do grupo,3764477.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,SOFISA
4,2,C0031873,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,Outros Cr√©ditos,23275,Total,Total do grupo,18374380.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,SOFISA
5,2,C0031976,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,nagroup,24452,Total da Carteira de Pessoa F√≠sica,Volume de cr√©dito disponibilizado a pessoas f√≠...,7101888000.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,BRB
6,2,C0031976,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,Empr√©stimo com Consigna√ß√£o em Folha,23227,Total,Total do grupo,4115148000.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,BRB
7,2,C0031976,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,Empr√©stimo sem Consigna√ß√£o em Folha,23235,Total,Total do grupo,1452908000.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,BRB
8,2,C0031976,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,Ve√≠culos,23251,Total,Total do grupo,181313900.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,BRB
9,2,C0031976,2014-12-01,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,11,Habita√ß√£o,23259,Total,Total do grupo,445100600.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa F√≠sica - moda...,BRB



Column Names: ['TipoInstituicao', 'CodInst', 'AnoMes', 'NomeRelatorio', 'NumeroRelatorio', 'Grupo', 'Conta', 'NomeColuna', 'DescricaoColuna', 'Saldo', 'AnoMes_M', 'AnoMes_Q', 'AnoMes_Y', 'NomeRelatorio_Grupo_Coluna', 'NomeInstituicao']

üìã CRED_PJ - Sample Data
--------------------------------------------------
Shape: (10, 15)

First 10 rows:


Unnamed: 0,TipoInstituicao,CodInst,AnoMes,NomeRelatorio,NumeroRelatorio,Grupo,Conta,NomeColuna,DescricaoColuna,Saldo,AnoMes_M,AnoMes_Q,AnoMes_Y,NomeRelatorio_Grupo_Coluna,NomeInstituicao
0,2,C0049944,2014-12-01,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,13,Opera√ß√µes com Receb√≠veis,23307,Total,Total do grupo,994431600.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,BTG PACTUAL
1,2,C0049944,2014-12-01,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,13,Com√©rcio Exterior,23339,Total,Total do grupo,744190300.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,BTG PACTUAL
2,2,C0049944,2014-12-01,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,13,Outros Cr√©ditos,23347,Total,Total do grupo,110058300.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,BTG PACTUAL
3,2,C0049944,2014-12-01,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,13,Financiamento de Infraestrutura/Desenvolviment...,23315,Total,Total do grupo,1687922000.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,BTG PACTUAL
4,2,C0050071,2014-12-01,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,13,nagroup,24453,Total da Carteira de Pessoa Jur√≠dica,Volume de cr√©dito disponibilizado a pessoas ju...,1634524000.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,JOHN DEERE
5,2,C0050071,2014-12-01,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,13,Capital de Giro,23283,Total,Total do grupo,25754440.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,JOHN DEERE
6,2,C0050071,2014-12-01,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,13,Investimento,23299,Total,Total do grupo,1106191000.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,JOHN DEERE
7,2,C0050071,2014-12-01,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,13,Capital de Giro Rotativo,23291,Total,Total do grupo,101844.5,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,JOHN DEERE
8,2,C0050071,2014-12-01,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,13,Outros Cr√©ditos,23347,Total,Total do grupo,120624700.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,JOHN DEERE
9,2,C0050071,2014-12-01,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,13,Financiamento de Infraestrutura/Desenvolviment...,23315,Total,Total do grupo,16973100.0,2014-12,2014Q4,2014,Carteira de cr√©dito ativa Pessoa Jur√≠dica - mo...,JOHN DEERE



Column Names: ['TipoInstituicao', 'CodInst', 'AnoMes', 'NomeRelatorio', 'NumeroRelatorio', 'Grupo', 'Conta', 'NomeColuna', 'DescricaoColuna', 'Saldo', 'AnoMes_M', 'AnoMes_Q', 'AnoMes_Y', 'NomeRelatorio_Grupo_Coluna', 'NomeInstituicao']


## üìä Part 5: Categorical Variables Analysis

In [6]:
# Analyze categorical variables and their values
print("üè∑Ô∏è CATEGORICAL VARIABLES ANALYSIS")
print("=" * 60)

categorical_analysis = {}

for name, df in datasets.items():
    print(f"\nüìã {name.upper()}")
    print("-" * 40)

    # Get categorical columns
    categorical_cols = df.select_dtypes(include=['object']).columns
    categorical_data = {}

    for col in categorical_cols:
        unique_values = df[col].value_counts()
        unique_count = len(unique_values)

        print(f"\nüè∑Ô∏è {col} ({unique_count} unique values):")

        # Show top 10 most frequent values
        top_values = unique_values.head(10)
        for value, count in top_values.items():
            percentage = (count / len(df)) * 100
            print(f"  ‚Ä¢ {str(value)[:50]:<50} | Count: {count:>6} ({percentage:.1f}%)")

        if unique_count > 10:
            print(f"  ... and {unique_count - 10} more unique values")

        categorical_data[col] = {
            'unique_count': unique_count,
            'top_values': dict(top_values.head(10)),
            'value_counts': dict(unique_values)
        }

    categorical_analysis[name] = categorical_data

    if len(categorical_cols) == 0:
        print("  No categorical columns found in this dataset")

üè∑Ô∏è CATEGORICAL VARIABLES ANALYSIS

üìã CONSOLIDATED_CLEANED
----------------------------------------

üè∑Ô∏è CodInst (586 unique values):
  ‚Ä¢ C0051468                                           | Count:     27 (0.3%)
  ‚Ä¢ C0051815                                           | Count:     27 (0.3%)
  ‚Ä¢ C0051482                                           | Count:     26 (0.3%)
  ‚Ä¢ C0051516                                           | Count:     25 (0.2%)
  ‚Ä¢ C0051750                                           | Count:     24 (0.2%)
  ‚Ä¢ C0051781                                           | Count:     24 (0.2%)
  ‚Ä¢ C0051884                                           | Count:     23 (0.2%)
  ‚Ä¢ C0051774                                           | Count:     23 (0.2%)
  ‚Ä¢ 00075847                                           | Count:     23 (0.2%)
  ‚Ä¢ 00106180                                           | Count:     23 (0.2%)
  ... and 576 more unique values

üè∑Ô∏è AnoMes (1 uni

## üìä Part 6: Identifier and Foreign Key Analysis

In [7]:
# Identify potential keys and identifiers
print("üîë IDENTIFIER & FOREIGN KEY ANALYSIS")
print("=" * 60)

identifier_analysis = {}

for name, df in datasets.items():
    print(f"\nüìã {name.upper()}")
    print("-" * 40)

    potential_identifiers = []

    for col in df.columns:
        unique_count = df[col].nunique()
        total_count = len(df)
        uniqueness_ratio = unique_count / total_count

        # Classify based on uniqueness and naming patterns
        col_lower = col.lower()

        # Check for identifier patterns
        identifier_type = None
        if uniqueness_ratio == 1.0:
            identifier_type = "Unique Identifier (100% unique)"
        elif uniqueness_ratio > 0.8:
            identifier_type = "High Uniqueness (>80%)"
        elif 'cod' in col_lower or 'id' in col_lower or 'codigo' in col_lower:
            identifier_type = "Code/ID Pattern"
        elif 'nome' in col_lower and uniqueness_ratio > 0.1:
            identifier_type = "Name Field (High Uniqueness)"
        elif 'anomes' in col_lower or 'data' in col_lower or 'ano' in col_lower:
            identifier_type = "Date/Time Identifier"
        elif uniqueness_ratio < 0.1:
            identifier_type = "Low Uniqueness (<10%) - Categorical"

        if identifier_type:
            potential_identifiers.append({
                'column': col,
                'type': identifier_type,
                'unique_count': unique_count,
                'total_count': total_count,
                'uniqueness_ratio': uniqueness_ratio
            })

    identifier_analysis[name] = potential_identifiers

    # Display results
    print("\nPotential Identifiers and Keys:")
    for item in potential_identifiers:
        print(f"  ‚Ä¢ {item['column']:<25} | {item['type']:<35} | {item['unique_count']:>6}/{item['total_count']:<6} ({item['uniqueness_ratio']:.1%})")

    # Common identifier patterns across datasets
    common_cols = ['CodInst', 'NomeInstituicao', 'AnoMes', 'TipoInstituicao']
    print("\nCommon Identifier Columns:")
    for col in common_cols:
        if col in df.columns:
            unique_count = df[col].nunique()
            print(f"  ‚úÖ {col}: {unique_count} unique values")
        else:
            print(f"  ‚ùå {col}: Not found")

üîë IDENTIFIER & FOREIGN KEY ANALYSIS

üìã CONSOLIDATED_CLEANED
----------------------------------------

Potential Identifiers and Keys:
  ‚Ä¢ TipoInstituicao           | Low Uniqueness (<10%) - Categorical |      1/10000  (0.0%)
  ‚Ä¢ CodInst                   | Code/ID Pattern                     |    586/10000  (5.9%)
  ‚Ä¢ AnoMes                    | Date/Time Identifier                |      1/10000  (0.0%)
  ‚Ä¢ NomeRelatorio             | Low Uniqueness (<10%) - Categorical |      2/10000  (0.0%)
  ‚Ä¢ NumeroRelatorio           | Low Uniqueness (<10%) - Categorical |      2/10000  (0.0%)
  ‚Ä¢ Grupo                     | Low Uniqueness (<10%) - Categorical |      3/10000  (0.0%)
  ‚Ä¢ Conta                     | Low Uniqueness (<10%) - Categorical |     29/10000  (0.3%)
  ‚Ä¢ NomeColuna                | Low Uniqueness (<10%) - Categorical |     30/10000  (0.3%)
  ‚Ä¢ DescricaoColuna           | Low Uniqueness (<10%) - Categorical |     29/10000  (0.3%)
  ‚Ä¢ AnoMes_M         

## üìä Part 7: Numerical Variables Summary

In [8]:
# Analyze numerical variables
print("üìä NUMERICAL VARIABLES ANALYSIS")
print("=" * 60)

numerical_analysis = {}

for name, df in datasets.items():
    print(f"\nüìã {name.upper()}")
    print("-" * 40)

    # Get numerical columns
    numerical_cols = df.select_dtypes(include=[np.number]).columns

    if len(numerical_cols) > 0:
        print(f"\nNumerical Columns ({len(numerical_cols)}):")

        numerical_data = {}

        for col in numerical_cols:
            stats = df[col].describe()

            # Additional statistics
            null_count = df[col].isnull().sum()
            null_pct = (null_count / len(df)) * 100
            zero_count = (df[col] == 0).sum()
            negative_count = (df[col] < 0).sum()

            print(f"\nüìä {col}:")
            print(f"  ‚Ä¢ Count: {stats['count']:,.0f} | Null: {null_count} ({null_pct:.1f}%)")
            print(f"  ‚Ä¢ Mean: {stats['mean']:,.2f} | Std: {stats['std']:,.2f}")
            print(f"  ‚Ä¢ Min: {stats['min']:,.2f} | Max: {stats['max']:,.2f}")
            print(f"  ‚Ä¢ Q1: {stats['25%']:,.2f} | Median: {stats['50%']:,.2f} | Q3: {stats['75%']:,.2f}")
            print(f"  ‚Ä¢ Zero values: {zero_count} | Negative values: {negative_count}")

            numerical_data[col] = {
                'stats': dict(stats),
                'null_count': null_count,
                'null_pct': null_pct,
                'zero_count': zero_count,
                'negative_count': negative_count
            }

        numerical_analysis[name] = numerical_data
    else:
        print("  No numerical columns found in this dataset")
        numerical_analysis[name] = {}

üìä NUMERICAL VARIABLES ANALYSIS

üìã CONSOLIDATED_CLEANED
----------------------------------------

Numerical Columns (5):

üìä TipoInstituicao:
  ‚Ä¢ Count: 10,000 | Null: 0 (0.0%)
  ‚Ä¢ Mean: 2.00 | Std: 0.00
  ‚Ä¢ Min: 2.00 | Max: 2.00
  ‚Ä¢ Q1: 2.00 | Median: 2.00 | Q3: 2.00
  ‚Ä¢ Zero values: 0 | Negative values: 0

üìä NumeroRelatorio:
  ‚Ä¢ Count: 10,000 | Null: 0 (0.0%)
  ‚Ä¢ Mean: 2.09 | Std: 0.67
  ‚Ä¢ Min: 2.00 | Max: 7.00
  ‚Ä¢ Q1: 2.00 | Median: 2.00 | Q3: 2.00
  ‚Ä¢ Zero values: 0 | Negative values: 0

üìä Conta:
  ‚Ä¢ Count: 10,000 | Null: 0 (0.0%)
  ‚Ä¢ Mean: 77,203.92 | Std: 7,274.75
  ‚Ä¢ Min: 23,367.00 | Max: 78,202.00
  ‚Ä¢ Q1: 78,191.00 | Median: 78,195.00 | Q3: 78,198.00
  ‚Ä¢ Zero values: 0 | Negative values: 0

üìä Saldo:
  ‚Ä¢ Count: 10,000 | Null: 0 (0.0%)
  ‚Ä¢ Mean: 577,694,952.95 | Std: 17,633,068,008.59
  ‚Ä¢ Min: -26,520,346,905.80 | Max: 1,064,674,795,911.33
  ‚Ä¢ Q1: 0.00 | Median: 551.23 | Q3: 4,908,623.16
  ‚Ä¢ Zero values: 4452 | Negative valu

## üìä Part 8: Data Quality Assessment

In [9]:
# Comprehensive data quality assessment
print("üîç DATA QUALITY ASSESSMENT")
print("=" * 60)

quality_summary = {}

for name, df in datasets.items():
    print(f"\nüìã {name.upper()}")
    print("-" * 40)

    # Basic quality metrics
    total_rows = len(df)
    total_columns = len(df.columns)
    total_cells = total_rows * total_columns

    # Missing values
    missing_cells = df.isnull().sum().sum()
    missing_pct = (missing_cells / total_cells) * 100

    # Duplicate rows
    duplicate_rows = df.duplicated().sum()
    duplicate_pct = (duplicate_rows / total_rows) * 100

    # Data type distribution
    numeric_cols = len(df.select_dtypes(include=[np.number]).columns)
    text_cols = len(df.select_dtypes(include=['object']).columns)
    datetime_cols = len(df.select_dtypes(include=['datetime']).columns)

    quality_info = {
        'total_rows': total_rows,
        'total_columns': total_columns,
        'total_cells': total_cells,
        'missing_cells': missing_cells,
        'missing_pct': missing_pct,
        'duplicate_rows': duplicate_rows,
        'duplicate_pct': duplicate_pct,
        'numeric_columns': numeric_cols,
        'text_columns': text_cols,
        'datetime_columns': datetime_cols
    }

    quality_summary[name] = quality_info

    # Display quality metrics
    print(f"üìä Shape: {total_rows:,} rows √ó {total_columns} columns")
    print(f"üìä Column types: {numeric_cols} numeric, {text_cols} text, {datetime_cols} datetime")
    print(f"üìä Missing values: {missing_cells:,} cells ({missing_pct:.2f}%)")
    print(f"üìä Duplicate rows: {duplicate_rows:,} ({duplicate_pct:.2f}%)")

    # Quality flags
    quality_flags = []
    if missing_pct > 10:
        quality_flags.append(f"‚ö†Ô∏è High missing data: {missing_pct:.1f}%")
    if duplicate_pct > 5:
        quality_flags.append(f"‚ö†Ô∏è High duplicates: {duplicate_pct:.1f}%")
    if missing_pct < 5 and duplicate_pct < 5:
        quality_flags.append("‚úÖ Good data quality")

    for flag in quality_flags:
        print(f"üìä {flag}")

# Overall quality summary
print(f"\n{'='*60}")
print("üìä OVERALL DATA QUALITY SUMMARY")
print(f"{'='*60}")

quality_df = pd.DataFrame(quality_summary).T
print("\nQuality Metrics by Dataset:")
display(quality_df[['total_rows', 'total_columns', 'missing_pct', 'duplicate_pct']].round(2))

üîç DATA QUALITY ASSESSMENT

üìã CONSOLIDATED_CLEANED
----------------------------------------
üìä Shape: 10,000 rows √ó 15 columns
üìä Column types: 5 numeric, 10 text, 0 datetime
üìä Missing values: 0 cells (0.00%)
üìä Duplicate rows: 0 (0.00%)
üìä ‚úÖ Good data quality

üìã CONSOLIDATED_REPORTS
----------------------------------------
üìä Shape: 10,000 rows √ó 10 columns
üìä Column types: 4 numeric, 6 text, 0 datetime
üìä Missing values: 6,172 cells (6.17%)
üìä Duplicate rows: 0 (0.00%)

üìã FINANCIAL_METRICS
----------------------------------------
üìä Shape: 10,000 rows √ó 15 columns
üìä Column types: 5 numeric, 10 text, 0 datetime
üìä Missing values: 0 cells (0.00%)
üìä Duplicate rows: 0 (0.00%)
üìä ‚úÖ Good data quality

üìã FINANCIAL_METRICS_PROCESSED
----------------------------------------
üìä Shape: 10,000 rows √ó 10 columns
üìä Column types: 5 numeric, 5 text, 0 datetime
üìä Missing values: 0 cells (0.00%)
üìä Duplicate rows: 0 (0.00%)
üìä ‚úÖ Good d

Unnamed: 0,total_rows,total_columns,missing_pct,duplicate_pct
consolidated_cleaned,10000.0,15.0,0.0,0.0
consolidated_reports,10000.0,10.0,6.17,0.0
financial_metrics,10000.0,15.0,0.0,0.0
financial_metrics_processed,10000.0,10.0,0.0,0.0
market_metrics,10000.0,15.0,0.0,0.0
credit_data,10000.0,15.0,0.0,0.0
cred_pf,10000.0,15.0,0.0,0.0
cred_pj,10000.0,15.0,0.0,0.0


## üìã Part 9: Data Dictionary Generation

In [10]:
# Generate comprehensive data dictionary
print("üìñ GENERATING DATA DICTIONARY")
print("=" * 60)

data_dictionary = {
    'metadata': {
        'analysis_date': datetime.now().isoformat(),
        'total_datasets': len(datasets),
        'data_source': 'BACEN IFDATA - Brazilian Banking Data'
    },
    'datasets': {}
}

for name, df in datasets.items():
    dataset_info = {
        'description': f'BACEN dataset: {name}',
        'shape': {'rows': len(df), 'columns': len(df.columns)},
        'quality': quality_summary[name],
        'columns': {},
        'sample_data': df.head(3).to_dict('records')
    }

    # Column details
    for col in df.columns:
        col_info = {
            'data_type': str(df[col].dtype),
            'unique_values': int(df[col].nunique()),
            'null_count': int(df[col].isnull().sum()),
            'null_percentage': float((df[col].isnull().sum() / len(df)) * 100)
        }

        # Add type-specific information
        if df[col].dtype in ['int64', 'float64', 'int32', 'float32']:
            col_info['category'] = 'numerical'
            stats = df[col].describe()
            col_info['statistics'] = {
                'min': float(stats['min']) if not pd.isna(stats['min']) else None,
                'max': float(stats['max']) if not pd.isna(stats['max']) else None,
                'mean': float(stats['mean']) if not pd.isna(stats['mean']) else None,
                'std': float(stats['std']) if not pd.isna(stats['std']) else None
            }
        else:
            col_info['category'] = 'categorical'
            # Get top 5 values for categorical columns
            top_values = df[col].value_counts().head(5)
            col_info['top_values'] = dict(top_values.astype(str))

        # Business meaning inference
        col_lower = col.lower()
        if 'cod' in col_lower or 'codigo' in col_lower:
            col_info['business_meaning'] = 'Identifier/Code'
        elif 'nome' in col_lower:
            col_info['business_meaning'] = 'Name/Description'
        elif 'anomes' in col_lower or 'data' in col_lower:
            col_info['business_meaning'] = 'Date/Time Reference'
        elif 'saldo' in col_lower or 'valor' in col_lower:
            col_info['business_meaning'] = 'Financial Amount'
        elif 'tipo' in col_lower:
            col_info['business_meaning'] = 'Classification/Type'
        elif 'relatorio' in col_lower:
            col_info['business_meaning'] = 'Report Information'
        else:
            col_info['business_meaning'] = 'To be determined'

        dataset_info['columns'][col] = col_info

    data_dictionary['datasets'][name] = dataset_info

print("‚úÖ Data dictionary generated successfully!")
print(f"üìä Analyzed {len(datasets)} datasets with {sum(len(d.columns) for d in datasets.values())} total columns")

üìñ GENERATING DATA DICTIONARY
‚úÖ Data dictionary generated successfully!
üìä Analyzed 8 datasets with 110 total columns


## üìä Part 10: Key Findings Summary

In [11]:
# Summarize key findings
print("üéØ KEY FINDINGS SUMMARY")
print("=" * 60)

# Dataset overview
total_records = sum(len(df) for df in datasets.values())
total_columns = sum(len(df.columns) for df in datasets.values())

print(f"\nüìä DATASET OVERVIEW:")
print(f"  ‚Ä¢ {len(datasets)} main datasets analyzed")
print(f"  ‚Ä¢ {total_records:,} total records (sample)")
print(f"  ‚Ä¢ {total_columns} total columns across all datasets")

# Common identifier columns
common_identifiers = ['CodInst', 'NomeInstituicao', 'AnoMes', 'TipoInstituicao']
print(f"\nüîë COMMON IDENTIFIERS:")
for identifier in common_identifiers:
    datasets_with_id = [name for name, df in datasets.items() if identifier in df.columns]
    if datasets_with_id:
        print(f"  ‚Ä¢ {identifier}: Found in {len(datasets_with_id)} datasets ({', '.join(datasets_with_id)})")

# Data quality insights
print(f"\nüîç DATA QUALITY INSIGHTS:")
high_quality_datasets = [name for name, info in quality_summary.items()
                        if info['missing_pct'] < 5 and info['duplicate_pct'] < 5]
print(f"  ‚Ä¢ {len(high_quality_datasets)} datasets have good quality (low missing/duplicates)")

# Business insights
print(f"\nüíº BUSINESS INSIGHTS:")
financial_columns = []
for name, df in datasets.items():
    for col in df.columns:
        if 'saldo' in col.lower() or 'valor' in col.lower() or 'carteira' in col.lower():
            financial_columns.append(f"{name}.{col}")

print(f"  ‚Ä¢ {len(financial_columns)} financial amount columns identified")
print(f"  ‚Ä¢ Time series data available (AnoMes columns)")
print(f"  ‚Ä¢ Institution-level data perfect for benchmarking")
print(f"  ‚Ä¢ Multiple report types enable comprehensive analysis")

# Recommendations
print(f"\nüöÄ RECOMMENDATIONS FOR BANCO INSIGHTS 2.0:")
recommendations = [
    "Use CodInst as primary institution identifier",
    "Implement time series analysis on AnoMes fields",
    "Create market share calculations using Saldo columns",
    "Build institution profiles using NomeInstituicao",
    "Develop peer comparison features",
    "Implement data quality monitoring",
    "Create automated ETL for raw quarterly reports"
]

for i, rec in enumerate(recommendations, 1):
    print(f"  {i}. {rec}")

print(f"\n{'='*60}")
print("üèÜ EDA COMPLETE - Ready for Banco Insights 2.0 Development!")
print(f"{'='*60}")

üéØ KEY FINDINGS SUMMARY

üìä DATASET OVERVIEW:
  ‚Ä¢ 8 main datasets analyzed
  ‚Ä¢ 80,000 total records (sample)
  ‚Ä¢ 110 total columns across all datasets

üîë COMMON IDENTIFIERS:
  ‚Ä¢ CodInst: Found in 7 datasets (consolidated_cleaned, consolidated_reports, financial_metrics, market_metrics, credit_data, cred_pf, cred_pj)
  ‚Ä¢ NomeInstituicao: Found in 7 datasets (consolidated_cleaned, financial_metrics, financial_metrics_processed, market_metrics, credit_data, cred_pf, cred_pj)
  ‚Ä¢ AnoMes: Found in 8 datasets (consolidated_cleaned, consolidated_reports, financial_metrics, financial_metrics_processed, market_metrics, credit_data, cred_pf, cred_pj)
  ‚Ä¢ TipoInstituicao: Found in 7 datasets (consolidated_cleaned, consolidated_reports, financial_metrics, market_metrics, credit_data, cred_pf, cred_pj)

üîç DATA QUALITY INSIGHTS:
  ‚Ä¢ 7 datasets have good quality (low missing/duplicates)

üíº BUSINESS INSIGHTS:
  ‚Ä¢ 7 financial amount columns identified
  ‚Ä¢ Time series da

## üìÅ Part 11: Export Data Dictionary

In [12]:
# Save data dictionary to JSON file
output_file = 'banco_insights_data_dictionary.json'

# Convert any numpy types to native Python types for JSON serialization
def convert_types(obj):
    if isinstance(obj, dict):
        return {k: convert_types(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_types(item) for item in obj]
    elif isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif pd.isna(obj):
        return None
    else:
        return obj

# Clean data dictionary for JSON export
clean_dictionary = convert_types(data_dictionary)

# Save to file
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(clean_dictionary, f, indent=2, ensure_ascii=False)

print(f"üíæ Data dictionary saved to: {output_file}")
print(f"üìä Dictionary contains {len(clean_dictionary['datasets'])} datasets")
print(f"üîç Total columns documented: {sum(len(ds['columns']) for ds in clean_dictionary['datasets'].values())}")

# Also create a simplified summary CSV
summary_data = []
for dataset_name, dataset_info in clean_dictionary['datasets'].items():
    for col_name, col_info in dataset_info['columns'].items():
        summary_data.append({
            'dataset': dataset_name,
            'column': col_name,
            'data_type': col_info['data_type'],
            'category': col_info['category'],
            'business_meaning': col_info['business_meaning'],
            'unique_values': col_info['unique_values'],
            'null_percentage': round(col_info['null_percentage'], 2)
        })

summary_df = pd.DataFrame(summary_data)
summary_csv = 'banco_insights_columns_summary.csv'
summary_df.to_csv(summary_csv, index=False)

print(f"üìã Column summary saved to: {summary_csv}")
print("\n‚úÖ EDA Complete! Files generated:")
print(f"  ‚Ä¢ {output_file} - Complete data dictionary")
print(f"  ‚Ä¢ {summary_csv} - Column summary table")

üíæ Data dictionary saved to: banco_insights_data_dictionary.json
üìä Dictionary contains 8 datasets
üîç Total columns documented: 110
üìã Column summary saved to: banco_insights_columns_summary.csv

‚úÖ EDA Complete! Files generated:
  ‚Ä¢ banco_insights_data_dictionary.json - Complete data dictionary
  ‚Ä¢ banco_insights_columns_summary.csv - Column summary table
