In [1]:
# %%
from load_path import *
from classes.data_loader import DataLoader
from classes.process_dataframe import DataFrameProcessor
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
import json
from pathlib import Path
from sdv.evaluation.single_table import run_diagnostic
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import get_column_plot





In [2]:
# %%
# Initialize DataLoader and DataFrameProcessor
loader = DataLoader()
processor = DataFrameProcessor()


In [3]:
# %%
# Load metadata
metadata_path = Path.cwd().parent / "config" / "ctgan_metadata" / "dividas_e_onus.json"
with open(metadata_path, 'r') as f:
    metadata = json.load(f)


In [4]:
# %%
# Create SingleTableMetadata
table_metadata = SingleTableMetadata()
for column, sdtype in metadata['sdtypes'].items():
    table_metadata.add_column(column_name=column, sdtype=sdtype)


In [5]:
# %%
# Initialize CTGAN with metadata
synthesizer = CTGANSynthesizer(
    table_metadata,
    enforce_min_max_values=True,
    enforce_rounding=True,
    epochs=1000
)




In [6]:
# %%
# Set NA handling
processor.set_fill_na(True)


In [7]:
# %%
# Load raw data
df_raw = loader.load_single_csv("dividas_e_onus.csv")



=== Validation Results for dividas_e_onus.csv ===

Overall Status: ✓ Valid

Loading Integrity:
  ✓ Row count matches
  ✓ All values match original CSV

Data Quality:
  ✓ No null values found
  ✓ All numeric columns contain valid numbers

Value Ranges:
  ✓ No negative monetary values found
  ✓ All values within expected ranges



In [8]:
# %%
# Get normalized column names
normalized_cols = processor.get_columns(df_raw)
print("Normalized columns:", normalized_cols)


Normalized columns: ['ano_calendario', 'emprestimos_contraidos_no_exterior', 'estabelecimento_bancario_comercial', 'outras_dividas_e_onus_reais', 'outras_pessoas_juridicas', 'pessoas_fisicas', 'soc_de_credito_financiamento_e_investimento', 'outros', 'invalido']


In [9]:
# %%
# Get original column names
original_cols = processor.get_columns(df_raw, normalized=False)
print("Original columns:", original_cols)


Original columns: ['Ano Calendário', 'Empréstimos contraídos no exterior', 'Estabelecimento bancário comercial', 'Outras dívidas e ônus reais', 'Outras pessoas jurídicas', 'Pessoas físicas', 'Soc. de crédito, financiamento e investimento', 'Outros', 'Inválido']


In [10]:
# %%
# Set up column data types
column_dtypes = {
    'ano_calendario': 'float64',
    'emprestimos_contraidos_no_exterior': 'float64',
    'estabelecimento_bancario_comercial': 'float64',
    'outras_dividas_e_onus_reais': 'float64',
    'outras_pessoas_juridicas': 'float64',
    'pessoas_fisicas': 'float64',
    'soc_de_credito_financiamento_e_investimento': 'float64',
    'outros': 'float64',
    'invalido': 'float64'
}
processor.set_column_dtypes(column_dtypes)


In [11]:
# %%
# Process the DataFrame with types
df = processor.normalize_columns(df_raw)


In [12]:
# %%
# Check data types of all columns
print("\nDataframe dtypes:")
df.info()



Dataframe dtypes:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 9 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   ano_calendario                               14 non-null     float64
 1   emprestimos_contraidos_no_exterior           14 non-null     float64
 2   estabelecimento_bancario_comercial           14 non-null     float64
 3   outras_dividas_e_onus_reais                  14 non-null     float64
 4   outras_pessoas_juridicas                     14 non-null     float64
 5   pessoas_fisicas                              14 non-null     float64
 6   soc_de_credito_financiamento_e_investimento  14 non-null     float64
 7   outros                                       14 non-null     float64
 8   invalido                                     14 non-null     float64
dtypes: float64(9)
memory usage: 1.1 KB


In [13]:
# %%
print("Training CTGAN model...")
synthesizer.fit(df)


Training CTGAN model...


In [14]:
# %%
print("Training complete!")

Training complete!


In [15]:
# %%
# Generate 100 synthetic samples
synthetic_data = synthesizer.sample(num_rows=1000)




In [16]:
# %%
# Display first few rows of synthetic data
print("\nFirst few rows of synthetic data:")
synthetic_data


First few rows of synthetic data:


Unnamed: 0,ano_calendario,emprestimos_contraidos_no_exterior,estabelecimento_bancario_comercial,outras_dividas_e_onus_reais,outras_pessoas_juridicas,pessoas_fisicas,soc_de_credito_financiamento_e_investimento,outros,invalido
0,2011.0,6181.191364,159369.450064,17662.872079,47418.379832,86151.020996,49212.055894,328.661672,5.794374
1,2014.0,5400.678482,138864.844654,32180.005558,39405.083476,91132.224025,98059.684706,511.523133,0.000000
2,2012.0,23919.365128,267630.531599,54643.786030,87028.902516,158221.647153,86725.184271,11.501529,0.000000
3,2010.0,2294.622562,74760.228220,26355.172864,33269.087460,76258.745227,39701.761206,404.192506,63.853194
4,2018.0,23183.323471,372045.116134,51172.889285,78037.731663,150782.663450,67432.639935,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
995,2014.0,14930.043097,241433.108108,42523.777381,80314.345691,144328.152353,73902.614770,7.598206,0.000000
996,2008.0,2294.622562,82344.966708,9814.670524,35293.996577,39700.641850,30268.354273,485.680379,16.197245
997,2007.0,2294.622562,104561.168700,13241.351163,33269.087460,39700.641850,21222.633457,551.241707,91.948488
998,2020.0,17272.922115,338542.990711,44087.236455,107091.169750,138851.313857,100624.372982,126.932621,0.000000


In [17]:
# %%
diagnostic_report = run_diagnostic(
    real_data=df,
    synthetic_data=synthetic_data,
    metadata=table_metadata)


Generating report ...

(1/2) Evaluating Data Validity: |██████████| 9/9 [00:00<00:00, 1878.42it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 817.13it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



In [18]:
# %%
diagnostic_report.get_details(property_name='Data Validity')


Unnamed: 0,Column,Metric,Score
0,ano_calendario,BoundaryAdherence,1.0
1,emprestimos_contraidos_no_exterior,BoundaryAdherence,1.0
2,estabelecimento_bancario_comercial,BoundaryAdherence,1.0
3,outras_dividas_e_onus_reais,BoundaryAdherence,1.0
4,outras_pessoas_juridicas,BoundaryAdherence,1.0
5,pessoas_fisicas,BoundaryAdherence,1.0
6,soc_de_credito_financiamento_e_investimento,BoundaryAdherence,1.0
7,outros,BoundaryAdherence,1.0
8,invalido,BoundaryAdherence,1.0


In [19]:
# %%
quality_report = evaluate_quality(
    real_data=df,
    synthetic_data=synthetic_data,
    metadata=table_metadata)


Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 9/9 [00:00<00:00, 1123.84it/s]|
Column Shapes Score: 79.49%

(2/2) Evaluating Column Pair Trends: |██████████| 36/36 [00:00<00:00, 413.34it/s]|
Column Pair Trends Score: 92.84%

Overall Score (Average): 86.17%



In [20]:
# %%
quality_report.get_details(property_name='Column Shapes')


Unnamed: 0,Column,Metric,Score
0,ano_calendario,KSComplement,0.930857
1,emprestimos_contraidos_no_exterior,KSComplement,0.739571
2,estabelecimento_bancario_comercial,KSComplement,0.827143
3,outras_dividas_e_onus_reais,KSComplement,0.788143
4,outras_pessoas_juridicas,KSComplement,0.841
5,pessoas_fisicas,KSComplement,0.878857
6,soc_de_credito_financiamento_e_investimento,KSComplement,0.713143
7,outros,KSComplement,0.776286
8,invalido,KSComplement,0.659286


In [21]:
# %%
fig = get_column_plot(
    real_data=df,
    synthetic_data=synthetic_data,
    metadata=table_metadata,
    column_name='invalido'
)
    
fig.show()

In [22]:
# %%
