# TESTE TRIGGOAI  

*git lfs ajustado, database.db consegue ser pushado

# 0: Importando Bibliotecas


In [1]:
import pandas as pd
import sqlalchemy
import plotly.express as px

# 1: Tratamento do dataset
Tratando as informações dos clientes

In [2]:
df_customers = pd.read_csv(r'dataset\olist_customers_dataset.csv', sep=',', encoding='utf-8')
df_customers['customer_id'].drop_duplicates(inplace=True) #Eu posso retirar os duplicados aqui?
df_customers.dropna(inplace=True)
df_customers['customer_city'] = df_customers['customer_city'].str.lower().str.strip().str.replace(' ', '_')

Tratando os dados de Geolocalização

In [3]:
df_geolocation = pd.read_csv(r'dataset\olist_geolocation_dataset.csv', sep=',', encoding='utf-8')
df_geolocation.drop_duplicates(inplace=True)
df_geolocation.dropna(inplace=True)
df_geolocation['geolocation_city'] = df_geolocation['geolocation_city'].str.lower().str.strip().str.replace(' ', '_')

Tratamento dados dos produtos comprados

In [4]:
df_order_item = pd.read_csv(r'dataset\olist_order_items_dataset.csv', sep=',', encoding='utf-8')
df_order_item.drop_duplicates(inplace=True)
df_order_item.dropna(inplace=True)
df_order_item['shipping_limit_date'] = pd.to_datetime(df_order_item['shipping_limit_date'], format='%Y-%m-%d %H:%M:%S')

order_item_float = [
    'price',
    'freight_value'
]

df_order_item[order_item_float] = df_order_item[order_item_float].astype(float)

Tratamento dos dados de pagamento


In [5]:
df_order_payment = pd.read_csv(r'dataset\olist_order_payments_dataset.csv', sep=',', encoding='utf-8')
df_order_payment.drop_duplicates(inplace=True)
df_order_payment.dropna(inplace=True)
df_order_payment = df_order_payment[df_order_payment['payment_sequential'] < 15]
df_order_payment['payment_type'] = df_order_payment['payment_type'].str.lower().str.strip().str.replace(' ', '_')
df_order_payment['payment_value'] = df_order_payment['payment_value'].astype(float)

order_payment_int = [
    'payment_sequential',
    'payment_installments'
]

df_order_payment[order_payment_int] = df_order_payment[order_payment_int].astype(int)   

Tratando os dados de reviews dos pedidos

In [6]:
df_order_reviews = pd.read_csv(r'dataset\olist_order_reviews_dataset.csv', sep=',', encoding='utf-8')
df_order_reviews.drop_duplicates(inplace=True)
df_order_reviews.dropna(subset=['review_id','order_id','review_score','review_creation_date','review_answer_timestamp'],inplace=True)
df_order_reviews['review_score'] = df_order_reviews['review_score'].astype(int)
df_order_reviews['review_comment_title'] = df_order_reviews['review_comment_title'].fillna('')
df_order_reviews['review_comment_message'] = df_order_reviews['review_comment_message'].fillna('')

reviews_datetime = [
    'review_creation_date',
    'review_answer_timestamp'
]
df_order_reviews[reviews_datetime] = df_order_reviews[reviews_datetime].apply(pd.to_datetime, format='%Y-%m-%d %H:%M:%S')

Tratando os dados de pedidos


In [7]:
df_orders = pd.read_csv(r'dataset\olist_orders_dataset.csv', sep=',', encoding='utf-8')
df_orders.drop_duplicates(inplace=True)
df_orders.dropna(inplace=True)
df_orders['order_id'] = df_orders['order_id'].str.strip()
df_orders['customer_id'] = df_orders['customer_id'].str.strip()
df_orders['order_status'] = df_orders['order_status'].str.strip().str.lower().astype('category')

orders_datetime = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date'
]
df_orders[orders_datetime] = df_orders[orders_datetime].apply(pd.to_datetime)



Tratando o dataset de produtos

In [8]:
df_products = pd.read_csv(r'dataset\olist_products_dataset.csv', sep=',', encoding='utf-8')
df_products.drop_duplicates(inplace=True)
df_products.dropna(inplace=True)
df_products['product_id'] = df_products['product_id'].str.strip()
df_products['product_category_name'] = df_products['product_category_name'].str.lower().str.strip().str.replace(' ', '_')

products_int = [
    'product_name_lenght',
    'product_description_lenght',
    'product_photos_qty',
    'product_weight_g',
    'product_length_cm',
    'product_height_cm'
]

df_products[products_int] = df_products[products_int].astype(int)

Tratando o dataset de vendedores

In [9]:
df_sellers = pd.read_csv(r'dataset\olist_sellers_dataset.csv', sep=',', encoding='utf-8')
df_sellers.drop_duplicates(inplace=True)
df_sellers.dropna(inplace=True)
df_sellers['seller_id'] = df_sellers['seller_id'].str.strip()
df_sellers['seller_zip_code_prefix'] = df_sellers['seller_zip_code_prefix'].astype(str)
df_sellers['seller_city'] = df_sellers['seller_city'].str.lower().str.strip().str.replace(' ', '_')
df_sellers['seller_state'] = df_sellers['seller_state'].str.strip()

df = df[df['data_entrega'] >= df['data_compra']] para usar no dataset de pedidos



# 2. Preparando o Modelo Relacional com SQLAlchemy

Criando a Engine do SQLAlchemy

In [10]:
engine = sqlalchemy.create_engine('sqlite:///datasets.db')
print('Conectando ao banco de dados...')
try:
    conn = engine.connect()
    print('Conexão estabelecida com sucesso!')
except sqlalchemy.exc.OperationalError as e:
    print(f'Erro ao conectar ao banco de dados: {e}')
    exit(1)

Conectando ao banco de dados...
Conexão estabelecida com sucesso!


Carregando os Dataframes no banco 

In [11]:
print('Criando tabelas no banco de dados...')
df_customers.to_sql('customers', con=engine, if_exists='replace', index=False)
df_geolocation.to_sql('geolocation', con=engine, if_exists='replace', index=False)
df_order_item.to_sql('order_item', con=engine, if_exists='replace', index=False)
df_order_payment.to_sql('order_payment', con=engine, if_exists='replace', index=False)
df_order_reviews.to_sql('order_reviews', con=engine, if_exists='replace', index=False)
df_orders.to_sql('orders', con=engine, if_exists='replace', index=False)
df_products.to_sql('products', con=engine, if_exists='replace', index=False)
df_sellers.to_sql('sellers', con=engine, if_exists='replace', index=False)
print('Tabelas criadas com sucesso!')
conn.close()

Criando tabelas no banco de dados...
Tabelas criadas com sucesso!


Criando Índices nas tabelas para melhor performance do código

In [12]:
with engine.connect() as conn:
    conn.execute(sqlalchemy.text('CREATE INDEX IF NOT EXISTS idx_orders_customer_id ON orders(customer_id)'))
    conn.execute(sqlalchemy.text('CREATE INDEX IF NOT EXISTS idx_orders_order_id ON orders(order_id)'))
    conn.execute(sqlalchemy.text('CREATE INDEX IF NOT EXISTS idx_order_item_product_id ON order_item(product_id)'))
    conn.execute(sqlalchemy.text('CREATE INDEX IF NOT EXISTS idx_order_item_product_id ON order_item(seller_id)'))
    conn.execute(sqlalchemy.text('CREATE INDEX IF NOT EXISTS idx_geolocation_geolocation_zip_code_prefix ON geolocation(geolocation_zip_code_prefix)'))

In [13]:
query = """
SELECT a.order_id, b.customer_id, a.order_status, c.product_id, c.price
FROM orders a
JOIN customers b ON a.customer_id = b.customer_id
JOIN order_item c ON a.order_id = c.order_id
"""
df_joined = pd.read_sql(query, con=engine)

df_joined.head


<bound method NDFrame.head of                                 order_id                       customer_id  \
0       00e7ee1b050b8499577073aeb2a297a1  06b8999e2fba1a1fbc88172c00ba8bc7   
1       29150127e6685892b6eab3eec79f59c7  18955e83d337fd6b2def6b18a428ac77   
2       b2059ed67ce144a36e2aa97d2c9e9ad2  4e7b3e00288586ebd08712fdd0374a03   
3       951670f92359f4fe4a63112aa7306eba  b2b6027bc5c5109e529d4dc6358b12c3   
4       6b7d50bd145f6fc7f33cebabd7e49d0f  4f2d8ab171c80ec8364f7c12e35b23ad   
...                                  ...                               ...   
110175  6760e20addcf0121e9d58f2f1ff14298  17ddf5dd5d51696bb3d7c6291687be6f   
110176  9ec0c8947d973db4f4e8dcf1fbfa8f1b  e7b71a9017aa05c9a7fd292d714858e8   
110177  fed4434add09a6f332ea398efd656a5c  5e28dfe12db7fb50a4b2f691faecea5e   
110178  e31ec91cea1ecf97797787471f98a8c2  56b18e2166679b8a959d72dd06da27f9   
110179  28db69209a75e59f20ccbb5c36a20b90  274fa6071e5e17fe303b9748641082c8   

       order_status              

## 2.0 Análise exploratória dos dados
Qual o volume de pedidos no mês? Existe sazonalidade nas vendas?

In [15]:
df_pedidos_mes = pd.read_sql('''
SELECT strftime('%m/%Y', order_purchase_timestamp) AS mes, COUNT(*) AS total_pedidos
FROM orders
GROUP BY mes
ORDER BY order_purchase_timestamp
''', con=engine)

df_pedidos_mes['total_pedidos'] = df_pedidos_mes['total_pedidos'].astype(int)

fig = px.line(df_pedidos_mes, x='mes', y='total_pedidos', title='Pedidos por Mês')
fig.update_layout(xaxis_title='Mês', yaxis_title='Total de Pedidos')
fig.show()


2.1: Qual a distribuição do tempo de entrega dos pedidos?

In [None]:
df_time_delivery = pd.read_sql('SELECT order_delivered_customer_date, order_purchase_timestamp FROM orders WHERE order_delivered_customer_date IS NOT NULL', con=engine)
df_time_delivery['tempo_entrega'] = (pd.to_datetime(df_time_delivery['order_delivered_customer_date']) - pd.to_datetime(df_time_delivery['order_purchase_timestamp'])).dt.days

df_time_delivery = df_time_delivery[df_time_delivery['tempo_entrega'] < 100] #Retirando um outlier

fig = px.histogram(df_time_delivery, x='tempo_entrega', nbins=30, title='Distribuição do Tempo de Entrega (em dias)')
fig.update_layout(xaxis_title='Dias para entrega', yaxis_title='Número de pedidos')
fig.show()


2.2: Quais são as categorias de produtos mais vendidas em termos de faturamento?

In [29]:
query = '''
SELECT
  p.product_category_name,
  SUM(o.price) AS faturamento_total
FROM
  order_item o
JOIN
  products p ON o.product_id = p.product_id
GROUP BY
  p.product_category_name
ORDER BY
  faturamento_total DESC
LIMIT 10;
'''

df_faturamento = pd.read_sql(query, con=engine)

fig = px.bar(df_faturamento, x='product_category_name', y='faturamento_total',
             title='Top 10 Categorias por Faturamento',
             labels={'product_category_name': 'Categoria', 'faturamento': 'Faturamento (R$)'},
             text_auto=True)
fig.show()


2.3: Quais estados brasileiros possuem o maior valor médio de pedido?

In [38]:
query = '''
SELECT
  c.customer_state,
  AVG(oi.total_order_value) AS valor_medio_pedido
FROM
  orders o
JOIN customers c ON o.customer_id = c.customer_id
JOIN (
  SELECT
    order_id,
    SUM(price + freight_value) AS total_order_value
  FROM
    order_item
  GROUP BY
    order_id
) oi ON o.order_id = oi.order_id
GROUP BY
  c.customer_state
ORDER BY
  valor_medio_pedido DESC
LIMIT 10;
'''

df_estado_valor_medio = pd.read_sql(query, con=engine)

fig = px.bar(
    df_estado_valor_medio,
    x='customer_state',
    y='valor_medio_pedido',
    title='Valor Médio de Pedido por Estado',
    labels={'customer_state': 'Estado', 'valor_medio_pedido': 'Valor Médio (R$)'}
)
fig.update_layout(xaxis_tickangle=-45)
fig.show()

# 3.0: Solução de Problemas de Negócio