# Explora√ß√£o de Dados - Olist E-Commerce
## Bronze Layer - Gera√ß√£o do Data Dictionary

In [35]:
# imports e setups
import pandas as pd
import numpy as np
from pathlib import Path

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

DATA_PATH = Path('../data/raw')
print("Setup finished!")

Setup finished!


In [36]:
# dicion√°rio com todos os datasets
datasets = {
    'orders': 'olist_orders_dataset.csv',
    'order_items': 'olist_order_items_dataset.csv',
    'customers': 'olist_customers_dataset.csv',
    'products': 'olist_products_dataset.csv',
    'sellers': 'olist_sellers_dataset.csv',
    'reviews': 'olist_order_reviews_dataset.csv',
    'payments': 'olist_order_payments_dataset.csv',
    'geolocation': 'olist_geolocation_dataset.csv',
    'category_translation': 'product_category_name_translation.csv'
}

# carregar os datasets
dfs = {}
for name, filename in datasets.items():
    dfs[name] = pd.read_csv(DATA_PATH / filename, nrows=10000)
    print(f"{name}: {dfs[name].shape}")

print("\nDatasets carregados!")

orders: (10000, 8)
order_items: (10000, 7)
customers: (10000, 5)
products: (10000, 9)
sellers: (3095, 4)
reviews: (10000, 7)
payments: (10000, 5)
geolocation: (10000, 5)
category_translation: (71, 2)

Datasets carregados!


In [37]:
## gerar dicion√°rio de dados autom√°tico
def generate_data_dictionary(df, dataset_name):
    """
    Gera dicion√°rio de dados autom√°tico para um dataset
    """
    print(f"\n{'='*80}")
    print(f"üìä DATASET: {dataset_name.upper()}")
    print(f"{'='*80}\n")
    
    # Informa√ß√µes gerais
    print(f"**Total de Linhas**: {len(df):,}")
    print(f"**Total de Colunas**: {len(df.columns)}")
    print(f"**Mem√≥ria**: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB\n")
    
    # Tabela de colunas
    print("### Colunas\n")
    print("| Coluna | Tipo | N√£o-Nulos | Nulos | Nulos % | √önicos | Exemplo |")
    print("|--------|------|-----------|-------|---------|--------|---------|")
    
    for col in df.columns:
        dtype = str(df[col].dtype)
        non_null = df[col].count()
        null_count = df[col].isnull().sum()
        null_pct = (null_count / len(df) * 100)
        unique = df[col].nunique()
        
        # Pegar exemplo (primeiro valor n√£o-nulo)
        example = df[col].dropna().iloc[0] if df[col].count() > 0 else "N/A"
        if isinstance(example, (int, float)):
            example = f"{example:.2f}" if isinstance(example, float) else str(example)
        else:
            example = str(example)[:20]  # Limitar tamanho
        
        print(f"| {col} | {dtype} | {non_null:,} | {null_count:,} | {null_pct:.1f}% | {unique:,} | {example} |")
    
    # Estat√≠sticas descritivas para colunas num√©ricas
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        print("\n### Estat√≠sticas Num√©ricas\n")
        print(df[numeric_cols].describe().to_markdown())
    
    # Primeiras linhas
    print("\n### Sample Data (primeiras 3 linhas)\n")
    print(df.head(3).to_markdown())
    
    print("\n" + "="*80 + "\n")

In [38]:
## gerar dicion√°rio de dados para cada dataset
for name, df in dfs.items():
    generate_data_dictionary(df, name)


üìä DATASET: ORDERS

**Total de Linhas**: 10,000
**Total de Colunas**: 8
**Mem√≥ria**: 5.32 MB

### Colunas

| Coluna | Tipo | N√£o-Nulos | Nulos | Nulos % | √önicos | Exemplo |
|--------|------|-----------|-------|---------|--------|---------|
| order_id | object | 10,000 | 0 | 0.0% | 10,000 | e481f51cbdc54678b7cc |
| customer_id | object | 10,000 | 0 | 0.0% | 10,000 | 9ef432eb6251297304e7 |
| order_status | object | 10,000 | 0 | 0.0% | 7 | delivered |
| order_purchase_timestamp | object | 10,000 | 0 | 0.0% | 9,995 | 2017-10-02 10:56:33 |
| order_approved_at | object | 9,979 | 21 | 0.2% | 9,868 | 2017-10-02 11:07:15 |
| order_delivered_carrier_date | object | 9,834 | 166 | 1.7% | 9,511 | 2017-10-04 19:55:00 |
| order_delivered_customer_date | object | 9,720 | 280 | 2.8% | 9,710 | 2017-10-10 21:25:13 |
| order_estimated_delivery_date | object | 10,000 | 0 | 0.0% | 421 | 2017-10-18 00:00:00 |

### Sample Data (primeiras 3 linhas)

|    | order_id                         | customer_id 

In [39]:
## an√°lise de relacionamentos
print("\n" + "="*80)
print("üîó AN√ÅLISE DE RELACIONAMENTOS")
print("="*80 + "\n")

# Definir relacionamentos (tabela origem, coluna FK, tabela destino, coluna PK)
relationships = [
    ('orders', 'customer_id', 'customers', 'customer_id'),
    ('order_items', 'order_id', 'orders', 'order_id'),
    ('order_items', 'product_id', 'products', 'product_id'),
    ('order_items', 'seller_id', 'sellers', 'seller_id'),
    ('reviews', 'order_id', 'orders', 'order_id'),
    ('payments', 'order_id', 'orders', 'order_id'),
]

print("| Tabela Origem | Coluna FK | Tabela Destino | Coluna PK | Match % |")
print("|---------------|-----------|----------------|-----------|---------|")

for source_table, source_col, target_table, target_col in relationships:
    source_df = dfs[source_table]
    target_df = dfs[target_table]
    
    matches = source_df[source_col].isin(target_df[target_col]).sum()
    total = len(source_df)
    match_pct = (matches / total * 100)
    
    print(f"| {source_table} | {source_col} | {target_table} | {target_col} | {match_pct:.1f}% |")


üîó AN√ÅLISE DE RELACIONAMENTOS

| Tabela Origem | Coluna FK | Tabela Destino | Coluna PK | Match % |
|---------------|-----------|----------------|-----------|---------|
| orders | customer_id | customers | customer_id | 10.4% |
| order_items | order_id | orders | order_id | 9.7% |
| order_items | product_id | products | product_id | 30.4% |
| order_items | seller_id | sellers | seller_id | 100.0% |
| reviews | order_id | orders | order_id | 10.0% |
| payments | order_id | orders | order_id | 10.3% |


In [40]:
## an√°lise de cardinalidade
print("\n" + "="*80)
print("üìä CARDINALIDADE")
print("="*80 + "\n")

print(f"Clientes √önicos:    {dfs['customers']['customer_id'].nunique():,}")
print(f"Pedidos √önicos:     {dfs['orders']['order_id'].nunique():,}")
print(f"Produtos √önicos:    {dfs['products']['product_id'].nunique():,}")
print(f"Vendedores √önicos:  {dfs['sellers']['seller_id'].nunique():,}")

print(f"\nItens por Pedido:   {len(dfs['order_items']) / len(dfs['orders']):.2f} (m√©dia)")
print(f"Pedidos por Cliente: {len(dfs['orders']) / dfs['customers']['customer_id'].nunique():.2f} (m√©dia)")


üìä CARDINALIDADE

Clientes √önicos:    10,000
Pedidos √önicos:     10,000
Produtos √önicos:    10,000
Vendedores √önicos:  3,095

Itens por Pedido:   1.00 (m√©dia)
Pedidos por Cliente: 1.00 (m√©dia)


In [41]:
## 7. Qualidade dos Dados
print("\n" + "="*80)
print("üîç QUALIDADE DOS DADOS")
print("="*80 + "\n")

for name, df in dfs.items():
    total_cells = df.shape[0] * df.shape[1]
    missing_cells = df.isnull().sum().sum()
    missing_pct = (missing_cells / total_cells * 100)
    duplicates = df.duplicated().sum()
    
    print(f"\nüìã {name.upper()}")
    print(f"    C√©lulas totais:     {total_cells:,}")
    print(f"    C√©lulas faltando:   {missing_cells:,} ({missing_pct:.2f}%)")
    print(f"    Linhas duplicadas: {duplicates:,}")
    
    # Mostrar colunas com dados faltantes
    missing_cols = df.columns[df.isnull().any()].tolist()
    if missing_cols:
        print(f"    Colunas com nulls: {', '.join(missing_cols)}")


üîç QUALIDADE DOS DADOS


üìã ORDERS
    C√©lulas totais:     80,000
    C√©lulas faltando:   467 (0.58%)
    Linhas duplicadas: 0
    Colunas com nulls: order_approved_at, order_delivered_carrier_date, order_delivered_customer_date

üìã ORDER_ITEMS
    C√©lulas totais:     70,000
    C√©lulas faltando:   0 (0.00%)
    Linhas duplicadas: 0

üìã CUSTOMERS
    C√©lulas totais:     50,000
    C√©lulas faltando:   0 (0.00%)
    Linhas duplicadas: 0

üìã PRODUCTS
    C√©lulas totais:     90,000
    C√©lulas faltando:   756 (0.84%)
    Linhas duplicadas: 0
    Colunas com nulls: product_category_name, product_name_lenght, product_description_lenght, product_photos_qty, product_weight_g, product_length_cm, product_height_cm, product_width_cm

üìã SELLERS
    C√©lulas totais:     12,380
    C√©lulas faltando:   0 (0.00%)
    Linhas duplicadas: 0

üìã REVIEWS
    C√©lulas totais:     70,000
    C√©lulas faltando:   14,630 (20.90%)
    Linhas duplicadas: 0
    Colunas com nulls: review_c

In [42]:
## 8. M√©tricas de Neg√≥cio
print("\n" + "="*80)
print("üíº M√âTRICAS DE NEG√ìCIO")
print("="*80 + "\n")

# Per√≠odo dos dados
dfs['orders']['order_purchase_timestamp'] = pd.to_datetime(dfs['orders']['order_purchase_timestamp'])
print(f"üìÖ Per√≠odo dos Dados:")
print(f"    Primeira venda: {dfs['orders']['order_purchase_timestamp'].min()}")
print(f"    √öltima venda:    {dfs['orders']['order_purchase_timestamp'].max()}")

# Status dos pedidos
print(f"\nüì¶ Status dos Pedidos:")
print(dfs['orders']['order_status'].value_counts())

# Distribui√ß√£o geogr√°fica
print(f"\nüåé Top 5 Estados (Clientes):")
print(dfs['customers']['customer_state'].value_counts().head())

# Avalia√ß√µes
print(f"\n‚≠ê Distribui√ß√£o de Avalia√ß√µes:")
print(dfs['reviews']['review_score'].value_counts().sort_index())
print(f"\n    M√©dia: {dfs['reviews']['review_score'].mean():.2f}/5.0")

# Pagamentos
print(f"\nüí≥ M√©todos de Pagamento:")
print(dfs['payments']['payment_type'].value_counts())

# Receita
total_product = dfs['order_items']['price'].sum()
total_freight = dfs['order_items']['freight_value'].sum()
print(f"\nüí∞ Receita:")
print(f"    Produtos: R$ {total_product:,.2f}")
print(f"    Frete:    R$ {total_freight:,.2f}")
print(f"    Total:    R$ {total_product + total_freight:,.2f}")


üíº M√âTRICAS DE NEG√ìCIO

üìÖ Per√≠odo dos Dados:
    Primeira venda: 2016-09-04 21:15:19
    √öltima venda:    2018-09-20 13:54:16

üì¶ Status dos Pedidos:
order_status
delivered      9719
shipped         106
canceled         58
unavailable      55
processing       32
invoiced         28
created           2
Name: count, dtype: int64

üåé Top 5 Estados (Clientes):
customer_state
SP    4224
RJ    1352
MG    1171
RS     542
PR     516
Name: count, dtype: int64

‚≠ê Distribui√ß√£o de Avalia√ß√µes:
review_score
1    1102
2     323
3     804
4    1950
5    5821
Name: count, dtype: int64

    M√©dia: 4.11/5.0

üí≥ M√©todos de Pagamento:
payment_type
credit_card    7399
boleto         1870
voucher         574
debit_card      157
Name: count, dtype: int64

üí∞ Receita:
    Produtos: R$ 1,204,861.29
    Frete:    R$ 200,984.08
    Total:    R$ 1,405,845.37


In [43]:
## 9. Exportar Data Dictionary para Markdown
# Criar arquivo markdown com o dicion√°rio
output_path = Path('../bronze/dict_draft.md')

with open(output_path, 'w', encoding='utf-8') as f:
    f.write("# Bronze Layer - Data Dictionary\n\n")
    f.write("## üìä Olist Brazilian E-Commerce Dataset\n\n")
    f.write("*Gerado automaticamente via notebook de explora√ß√£o*\n\n")
    f.write("---\n\n")
    
    # Para cada dataset
    for name, df in dfs.items():
        f.write(f"## Dataset: {name.upper()}\\n\\n")
        f.write(f"**Linhas**: {len(df):,}  \n")
        f.write(f"**Colunas**: {len(df.columns)}  \n")
        f.write(f"**Arquivo**: `{datasets[name]}`  \n\n")
        
        # Tabela de colunas
        f.write("### Estrutura\n\n")
        f.write("| Coluna | Tipo | N√£o-Nulos | Nulos % | √önicos |\n")
        f.write("|--------|------|-----------|---------|--------|\n")
        
        for col in df.columns:
            dtype = str(df[col].dtype)
            non_null = df[col].count()
            null_pct = (df[col].isnull().sum() / len(df) * 100)
            unique = df[col].nunique()
            f.write(f"| {col} | {dtype} | {non_null:,} | {null_pct:.1f}% | {unique:,} |\n")
        
        f.write("\n---\n\n")

print(f"\n‚úÖ Data Dictionary salvo em: {output_path}")


‚úÖ Data Dictionary salvo em: ..\bronze\dict_draft.md


In [44]:
## 10. Resumo Final
print("\n" + "="*80)
print("üìù RESUMO DA EXPLORA√á√ÉO")
print("="*80)

print(f"\n‚úÖ Datasets Analisados: {len(dfs)}")
print(f"‚úÖ Total de Colunas: {sum(len(df.columns) for df in dfs.values())}")
print(f"‚úÖ Total de Registros: {sum(len(df) for df in dfs.values()):,}")

print(f"\nüìä Principais Entidades:")
print(f"    ‚Ä¢ {len(dfs['orders']):,} pedidos")
print(f"    ‚Ä¢ {dfs['customers']['customer_id'].nunique():,} clientes √∫nicos")
print(f"    ‚Ä¢ {dfs['products']['product_id'].nunique():,} produtos √∫nicos")
print(f"    ‚Ä¢ {dfs['sellers']['seller_id'].nunique():,} vendedores")

print(f"\nüí∞ Receita Total: R$ {total_product + total_freight:,.2f}")
print(f"‚≠ê Avalia√ß√£o M√©dia: {dfs['reviews']['review_score'].mean():.2f}/5.0")

print(f"\nüìÑ Data Dictionary gerado em: bronze/data_dictionary.md")
print(f"\nüéØ Bronze Layer completa!")
print("="*80)


üìù RESUMO DA EXPLORA√á√ÉO

‚úÖ Datasets Analisados: 9
‚úÖ Total de Colunas: 52
‚úÖ Total de Registros: 73,166

üìä Principais Entidades:
    ‚Ä¢ 10,000 pedidos
    ‚Ä¢ 10,000 clientes √∫nicos
    ‚Ä¢ 10,000 produtos √∫nicos
    ‚Ä¢ 3,095 vendedores

üí∞ Receita Total: R$ 1,405,845.37
‚≠ê Avalia√ß√£o M√©dia: 4.11/5.0

üìÑ Data Dictionary gerado em: bronze/data_dictionary.md

üéØ Bronze Layer completa!
