In [52]:
# Importando as bibliotecas necessárias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import sqlite3
from datetime import datetime

In [53]:
# Plot and DataFrame display configuration for the whole notebook

# Plot styling
# NOTE(review): sns.set(...) runs after plt.style.use('ggplot'), so seaborn's settings
# presumably take precedence over the ggplot style — confirm which look is intended.
plt.style.use('ggplot')
sns.set(style="whitegrid")

# Show every column (no truncation) and allow wide rows when pandas renders a DataFrame
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)


## Função de Carregamento e Limpeza de Dados

In [54]:
def _currency_to_float(series):
    """Convert currency text such as '$1,095.54' into floats.

    The values are cast to text first so the helper also accepts a column that
    pandas already parsed as numeric. `regex=False` makes '$' a literal dollar
    sign instead of the end-of-string regex anchor, independent of the pandas
    version's default for `regex`.
    """
    return (
        series.astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )


def load_and_clean_data(base_path=r'C:\Users\louis\datatech\Dados'):
    """Load the five raw CSV files and return them as cleaned DataFrames.

    Cleaning steps:
      * `order_amount` (orders) and `price` (products, when present) are
        converted from currency text to float.
      * `date` is parsed to datetime.
      * `delivery_hour` ("H:MM:SS") is split into the integer columns
        `delivery_hour_only`, `delivery_minute` and `delivery_second`.
      * `period_of_day` is derived from the hour: 'Manhã' for 5-11,
        'Tarde' for 12-17, otherwise 'Noite'.
      * A `produc_id` column in the products file is renamed to `product_id`.
      * Exact duplicate rows are dropped from every frame.

    Row counts and missing-value totals are printed before any cleaning.

    Parameters
    ----------
    base_path : str, optional
        Directory that contains the CSV files. Defaults to the original
        author's local data folder.

    Returns
    -------
    tuple of pandas.DataFrame
        (orders_df, drivers_df, customers_df, missing_items_df, products_df)
    """
    print("Carregando e limpando os dados...")

    # Load the raw files
    orders_df = pd.read_csv(os.path.join(base_path, '5qPZ8EyPSau2UNVvdRak_orders.csv'))
    drivers_df = pd.read_csv(os.path.join(base_path, 'DASNKm5LTPy2hXX0dM0D_drivers_data.csv'))
    customers_df = pd.read_csv(os.path.join(base_path, 'i7WiftZQm2ToVfzHFBBW_customers_data.csv'))
    missing_items_df = pd.read_csv(os.path.join(base_path, 'LKyEGqe9QsWdRFCujqRc_missing_items_data.csv'))
    products_df = pd.read_csv(os.path.join(base_path, 'PGqj7HULTByfy23R8vxN_products_data.csv'))

    # General information (counts are taken before duplicates are removed)
    print(f"Número de pedidos: {len(orders_df)}")
    print(f"Número de motoristas: {len(drivers_df)}")
    print(f"Número de clientes: {len(customers_df)}")
    print(f"Número de registros de itens faltantes: {len(missing_items_df)}")
    print(f"Número de produtos: {len(products_df)}")

    # Missing values
    print("\nValores ausentes em cada DataFrame:")
    print(f"Orders: {orders_df.isnull().sum().sum()}")
    print(f"Drivers: {drivers_df.isnull().sum().sum()}")
    print(f"Customers: {customers_df.isnull().sum().sum()}")
    print(f"Missing Items: {missing_items_df.isnull().sum().sum()}")
    print(f"Products: {products_df.isnull().sum().sum()}")

    # Cleaning
    orders_df['order_amount'] = _currency_to_float(orders_df['order_amount'])
    orders_df['date'] = pd.to_datetime(orders_df['date'])

    # Split "H:MM:SS" once and reuse the pieces for the three derived columns
    time_parts = orders_df['delivery_hour'].str.split(':')
    orders_df['delivery_hour_only'] = time_parts.str[0].astype('int64')
    orders_df['delivery_minute'] = time_parts.str[1].astype('int64')
    orders_df['delivery_second'] = time_parts.str[2].astype('int64')

    def categorize_time(hour):
        """Map an hour of the day (0-23) to its period label."""
        if 5 <= hour < 12:
            return 'Manhã'
        elif 12 <= hour < 18:
            return 'Tarde'
        else:
            return 'Noite'

    orders_df['period_of_day'] = orders_df['delivery_hour_only'].apply(categorize_time)

    # The products file spells the key column 'produc_id'; normalise the name
    if 'produc_id' in products_df.columns:
        products_df = products_df.rename(columns={'produc_id': 'product_id'})

    # Remove exact duplicate rows
    orders_df = orders_df.drop_duplicates()
    drivers_df = drivers_df.drop_duplicates()
    customers_df = customers_df.drop_duplicates()
    missing_items_df = missing_items_df.drop_duplicates()
    products_df = products_df.drop_duplicates()

    if 'price' in products_df.columns:
        products_df['price'] = _currency_to_float(products_df['price'])

    # Print one id from each table so their formats can be compared by eye
    print("\nVerificando a estrutura dos IDs de pedido:")
    print(f"Primeiro ID na tabela orders: {orders_df['order_id'].iloc[0]}")
    print(f"Primeiro ID na tabela missing_items: {missing_items_df['order_id'].iloc[0]}")

    return orders_df, drivers_df, customers_df, missing_items_df, products_df


## Função de Criação do Banco SQLite

In [55]:
def create_sqlite_database(orders_df, drivers_df, customers_df, missing_items_df, products_df,
                           db_dir=r'C:\Users\louis\datatech\Database'):
    """Write the five DataFrames into a SQLite file and return the open connection.

    The database file is `walmart_fraudes.db` inside `db_dir`; the directory is
    created when it does not exist. Each frame is stored as one table
    (`orders`, `drivers`, `customers`, `missing_items`, `products`) without
    its index. An existing file is reused, but a table with the same name is
    replaced.

    Parameters
    ----------
    orders_df, drivers_df, customers_df, missing_items_df, products_df : pandas.DataFrame
        Frames to persist.
    db_dir : str, optional
        Target directory. Defaults to the original author's local folder.

    Returns
    -------
    sqlite3.Connection
        Open connection to the database; the caller is responsible for closing it.

    Raises
    ------
    Exception
        Any error raised while writing a table is propagated after the
        connection has been closed.
    """
    print("\nCriando banco de dados SQLite...")

    os.makedirs(db_dir, exist_ok=True)
    db_path = os.path.join(db_dir, 'walmart_fraudes.db')

    # Opens the database file, creating it if it does not exist yet
    conn = sqlite3.connect(db_path)

    tables = {
        'orders': orders_df,
        'drivers': drivers_df,
        'customers': customers_df,
        'missing_items': missing_items_df,
        'products': products_df,
    }
    try:
        for table_name, frame in tables.items():
            frame.to_sql(table_name, conn, if_exists='replace', index=False)
    except Exception:
        # Do not hand back (or leak) a connection when the export failed part-way
        conn.close()
        raise

    print(f"✅ Banco de dados SQLite salvo com sucesso em: {db_path}")
    return conn


## Execução Principal do Pipeline

In [56]:
# Run the pipeline: load and clean the CSV files, then persist the cleaned frames to SQLite.
# `conn` is left open on purpose: the following cells query through it until it is closed explicitly.
orders_df, drivers_df, customers_df, missing_items_df, products_df = load_and_clean_data()
conn = create_sqlite_database(orders_df, drivers_df, customers_df, missing_items_df, products_df)


Carregando e limpando os dados...
Número de pedidos: 10000
Número de motoristas: 1247
Número de clientes: 1239
Número de registros de itens faltantes: 1501
Número de produtos: 314

Valores ausentes em cada DataFrame:
Orders: 0
Drivers: 0
Customers: 0
Missing Items: 2841
Products: 0

Verificando a estrutura dos IDs de pedido:
Primeiro ID na tabela orders: c9da15aa-be24-4871-92a3-dfa7746fff69
Primeiro ID na tabela missing_items: c7a343f7-3f1d-497c-8004-b9ede2d48fb1

Criando banco de dados SQLite...
✅ Banco de dados SQLite salvo com sucesso em: C:\Users\louis\datatech\Database\walmart_fraudes.db


## Visualização dos Dados no Banco

In [57]:
# Ask SQLite's schema catalog for the name of every table in the database
cursor = conn.cursor()
tables = cursor.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()

print("\nTabelas criadas no banco de dados SQLite:")
for table in tables:
    # every fetched row is a 1-tuple holding the table name
    print(table[0])



Tabelas criadas no banco de dados SQLite:
orders
drivers
customers
missing_items
products


## Sumário Estatístico das Tabelas

In [58]:
# One (heading, aggregate query) pair per table; rendered in this order.
summary_queries = [
    ("\nSumário da tabela orders:",
     "SELECT COUNT(*) as count, AVG(order_amount) as avg_amount FROM orders"),
    ("\nSumário da tabela drivers:",
     "SELECT COUNT(*) as count, AVG(age) as avg_age FROM drivers"),
    ("\nSumário da tabela customers:",
     "SELECT COUNT(*) as count, AVG(customer_age) as avg_age FROM customers"),
    ("\nSumário da tabela missing_items:",
     "SELECT COUNT(*) as count FROM missing_items"),
    ("\nSumário da tabela products:",
     "SELECT COUNT(*) as count, AVG(price) as avg_price FROM products"),
]

# Print each heading, then show the one-row result of its query
for heading, query in summary_queries:
    print(heading)
    display(pd.read_sql(query, conn))



Sumário da tabela orders:


Unnamed: 0,count,avg_amount
0,10000,283.302238



Sumário da tabela drivers:


Unnamed: 0,count,avg_age
0,1247,34.17081



Sumário da tabela customers:


Unnamed: 0,count,avg_age
0,1239,54.421308



Sumário da tabela missing_items:


Unnamed: 0,count
0,1501



Sumário da tabela products:


Unnamed: 0,count,avg_price
0,314,59.129936


## Prévia das Tabelas

In [59]:
# Show the first five rows of every cleaned in-memory frame, in pipeline order
print("\nPrévia de cada DataFrame:")
display(
    orders_df.head(),
    drivers_df.head(),
    customers_df.head(),
    missing_items_df.head(),
    products_df.head(),
)



Prévia de cada DataFrame:


Unnamed: 0,date,order_id,order_amount,region,items_delivered,items_missing,delivery_hour,driver_id,customer_id,delivery_hour_only,delivery_minute,delivery_second,period_of_day
0,2023-01-01,c9da15aa-be24-4871-92a3-dfa7746fff69,1095.54,Winter Park,10,1,8:37:28,WDID10627,WCID5031,8,37,28,Manhã
1,2023-01-01,ccacc183-09f8-4fd5-af35-009d18656326,659.11,Altamonte Springs,11,1,9:31:17,WDID10533,WCID5794,9,31,17,Manhã
2,2023-01-01,f4e1d30b-c3d1-413f-99b8-93c0b46d68bf,251.45,Winter Park,18,1,10:43:49,WDID10559,WCID5599,10,43,49,Manhã
3,2023-01-01,993d31f4-9358-41f0-a371-0021e55cef5d,598.83,Altamonte Springs,12,1,9:48:33,WDID10622,WCID5005,9,48,33,Manhã
4,2023-01-01,3e0a8f1b-3cd6-4d64-90e3-6b38dc368925,27.18,Clermont,3,1,10:09:49,WDID10654,WCID5114,10,9,49,Manhã


Unnamed: 0,driver_id,driver_name,age,Trips
0,WDID09873,Pamela Moore,18,64
1,WDID09874,Billy Lawson,18,37
2,WDID09875,Stephen Randolph,18,64
3,WDID09876,Jordan Daniel,18,53
4,WDID09877,James White,18,14


Unnamed: 0,customer_id,customer_name,customer_age
0,WCID5170,Elijah Taylor,30
1,WCID5901,Alexis Ross,58
2,WCID5652,Carla Knox,23
3,WCID5578,Matthew Beard,61
4,WCID5905,Julie Reese,80


Unnamed: 0,order_id,product_id_1,product_id_2,product_id_3
0,c7a343f7-3f1d-497c-8004-b9ede2d48fb1,PWPX0982761090982,PWPX0982761090982,PWPX0982761090982
1,20698293-8399-4fda-af1e-b61a9ebb8a0a,PWPX0982761090983,PWPX0982761090983,PWPX0982761090983
2,d7f690a0-c1c2-4b36-b05f-2b7e641ea1ac,PWPX0982761090984,PWPX0982761090984,PWPX0982761090984
3,d5cea1eb-7016-451b-9426-51973f4d6e14,PWPX0982761090985,PWPX0982761090985,PWPX0982761090985
4,2f7cbda8-793a-4a1d-bb66-3a514bee5dc4,PWPX0982761090986,PWPX0982761090986,PWPX0982761090986


Unnamed: 0,product_id,product_name,category,price
0,PWPX0982761090982,Kellogg's Frosties,Supermarket,12.53
1,PWPX0982761090983,Uncured Bacon,Supermarket,4.67
2,PWPX0982761090984,Whole Milk,Supermarket,9.95
3,PWPX0982761090985,Organic Bananas,Supermarket,3.94
4,PWPX0982761090986,Sourdough Bread,Supermarket,19.77


## Fechando a Conexão com o Banco

In [60]:
# Close the connection returned by create_sqlite_database; the helper cells below
# open (and close) their own connections to the database file.
conn.close()


## Consultar tabelas

In [61]:
def consultar_tabela(nome_tabela, limit=5,
                     db_path=r'C:\Users\louis\datatech\Database\walmart_fraudes.db'):
    """
    Return the first rows of one table from the local SQLite database.

    The table name is embedded as a quoted SQL identifier and the row limit is
    bound as a query parameter, so neither argument can inject extra SQL.
    The connection is closed even when the query fails.

    Parameters:
    - nome_tabela (str): Name of the table to query.
    - limit (int): Number of rows to return (default: 5).
    - db_path (str): Path of the SQLite file (default: the notebook's
      'walmart_fraudes.db').

    Returns:
    - DataFrame with the rows, or None when the database file does not exist
      or the query fails (a message is printed in both cases).
    """
    # Local imports keep this helper usable in a fresh kernel without the first cell
    import os
    import sqlite3
    from contextlib import closing

    import pandas as pd

    if not os.path.exists(db_path):
        print(f"❌ Banco de dados não encontrado em: {db_path}")
        return None

    try:
        # Identifiers cannot be bound as parameters, so quote the name and
        # double any embedded quote to keep it a single identifier.
        quoted_table = '"' + str(nome_tabela).replace('"', '""') + '"'
        query = f"SELECT * FROM {quoted_table} LIMIT ?;"
        with closing(sqlite3.connect(db_path)) as conn:
            df = pd.read_sql(query, conn, params=(int(limit),))
    except Exception as e:
        # Documented contract: report the problem and return None instead of raising
        print(f"❌ Erro ao consultar a tabela '{nome_tabela}': {e}")
        return None

    print(f"✅ Tabela '{nome_tabela}' consultada com sucesso ({len(df)} linhas retornadas):")
    return df


Exemplo de uso no notebook:

In [62]:
consultar_tabela("orders", limit=10)


✅ Tabela 'orders' consultada com sucesso (10 linhas retornadas):


Unnamed: 0,date,order_id,order_amount,region,items_delivered,items_missing,delivery_hour,driver_id,customer_id,delivery_hour_only,delivery_minute,delivery_second,period_of_day
0,2023-01-01 00:00:00,c9da15aa-be24-4871-92a3-dfa7746fff69,1095.54,Winter Park,10,1,8:37:28,WDID10627,WCID5031,8,37,28,Manhã
1,2023-01-01 00:00:00,ccacc183-09f8-4fd5-af35-009d18656326,659.11,Altamonte Springs,11,1,9:31:17,WDID10533,WCID5794,9,31,17,Manhã
2,2023-01-01 00:00:00,f4e1d30b-c3d1-413f-99b8-93c0b46d68bf,251.45,Winter Park,18,1,10:43:49,WDID10559,WCID5599,10,43,49,Manhã
3,2023-01-01 00:00:00,993d31f4-9358-41f0-a371-0021e55cef5d,598.83,Altamonte Springs,12,1,9:48:33,WDID10622,WCID5005,9,48,33,Manhã
4,2023-01-01 00:00:00,3e0a8f1b-3cd6-4d64-90e3-6b38dc368925,27.18,Clermont,3,1,10:09:49,WDID10654,WCID5114,10,9,49,Manhã
5,2023-01-01 00:00:00,d0b8162c-2dfc-4038-93e2-fbe240dba870,494.32,Winter Park,5,1,19:40:57,WDID10666,WCID5857,19,40,57,Noite
6,2023-01-01 00:00:00,97fd47af-79b1-4177-b0be-2d26a2f8c288,220.13,Winter Park,1,1,16:38:31,WDID10504,WCID6084,16,38,31,Tarde
7,2023-01-01 00:00:00,7e8fb589-7a61-43c1-aacb-c30e53dc1acc,468.53,Sanford,2,0,8:44:46,WDID10562,WCID5428,8,44,46,Manhã
8,2023-01-01 00:00:00,cf227256-5702-40f0-9e6e-a8aa231bf842,336.68,Apopka,10,0,16:12:50,WDID11003,WCID5821,16,12,50,Tarde
9,2023-01-01 00:00:00,11b841e4-c021-41eb-9f81-dd90c15d6557,275.13,Clermont,14,0,17:55:37,WDID10010,WCID5069,17,55,37,Tarde


In [63]:
def listar_tabelas(db_path=None):
    """
    Print the names of the tables available in the SQLite database.

    Parameters:
    - db_path (str, optional): Path of the SQLite file. When omitted, the
      relative path 'Database/walmart_fraudes.db' is used, resolved against
      the current working directory.

    Returns:
    - None. When the file does not exist a message is printed and nothing is
      created (sqlite3.connect would otherwise create an empty database).
    """
    # Local imports keep this helper usable in a fresh kernel without the first cell
    import os
    import sqlite3
    from contextlib import closing

    if db_path is None:
        # NOTE(review): the other cells use the absolute path
        # C:\Users\louis\datatech\Database\walmart_fraudes.db; this relative default
        # only points at the same file when the notebook runs from that project folder.
        db_path = os.path.join('Database', 'walmart_fraudes.db')

    if not os.path.exists(db_path):
        print(f"❌ Banco de dados não encontrado em: {db_path}")
        return None

    # closing() guarantees the connection is released even if the query fails
    with closing(sqlite3.connect(db_path)) as conn:
        rows = conn.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
    tabelas = [row[0] for row in rows]

    print("📂 Tabelas disponíveis no banco de dados:")
    for t in tabelas:
        print(f"• {t}")


In [1]:
# Standalone check: list the tables stored in the SQLite file. The cell carries its
# own import and connection, so it does not depend on names from earlier cells.
import sqlite3

conn = sqlite3.connect(r'C:\Users\louis\datatech\Database\walmart_fraudes.db')
try:
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tabelas = cursor.fetchall()
finally:
    # Release the database file even when the query raises
    conn.close()

print("Tabelas encontradas no banco:")
for t in tabelas:
    # every fetched row is a 1-tuple holding the table name
    print(f"• {t[0]}")


Tabelas encontradas no banco:
• orders
• drivers
• customers
• missing_items
• products
