# **PREPARANDO EL AMBIENTE**

In [None]:
import sys
import os
import pandas as pd
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
import os
from sqlalchemy.orm import declarative_base, relationship
from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.types import DateTime

sys.path.append(os.path.abspath(os.path.join('..')))

In [None]:


load_dotenv()  # Carga las variables del archivo .env

config = {
    'host': os.getenv('HOST'),
    'user': os.getenv('USER'),
    'password': os.getenv('PASSWORD'),
    'database': os.getenv('DATABASE'),
    'port': os.getenv('PORT')
}

In [None]:
# Crear la URL de conexión para SQLAlchemy
db_url = f"mysql+pymysql://{config['user']}:{config['password']}@{config['host']}:{config['port']}/caso_6"

# Crear el motor de SQLAlchemy
engine = create_engine(db_url, echo=True)



In [None]:

Base = declarative_base()


# **1. GENERACIÓN DE TABLAS**

In [5]:
class EventIdentifier(Base):

    __tablename__ = 'event_identifier'
    __table_args__ = {'extend_existing': True} 

    id = Column(Integer, primary_key=True) 
    nombre = Column(String(50))  


class CampaignIdentifier(Base):

    __tablename__ = 'campaign_identifier'
    __table_args__= {'extend_existing': True}

    # Columnas
    id = Column(Integer, primary_key=True, autoincrement=True)  # nueva PK
    campaign_id = Column(Integer, primary_key=True)
    products = Column(String(3))
    campaign_name = Column(String(33))
    start_date = Column(DateTime)
    end_date = Column(DateTime)


class PageHierarchy(Base):
     
    __tablename__ = 'page_hierarchy'
    __table_args__ = {'extend_existing': True}

    # Columnas

    page_id = Column(Integer, primary_key=True)
    page_name = Column(String(14))
    product_category = Column(String(9))
    product_id = Column(Integer)



class Users(Base):

    __tablename__= 'users'
    __table_args__={'extend_existing': True}

    # Columnas
    id = Column(Integer, primary_key=True, autoincrement=True)  # nueva PK
    user_id = Column(Integer)
    cookie_id = Column(String(6))
    start_date = Column(DateTime)


class Events(Base):

    __tablename__= 'events'
    __table_args_= {'extend_existing': True}


    # Columnas
    id = Column(Integer, primary_key=True, autoincrement=True)  # nueva PK
    visit_id = Column(String(6))
    cookie_id = Column(String(6))
    page_id = Column(Integer)
    event_type =Column(Integer)
    sequence_number = Column(Integer)
    event_time = Column(DateTime)


# Crear la tabla en la base de datos
Base.metadata.create_all(engine)


2025-04-25 14:13:27,622 INFO sqlalchemy.engine.Engine SELECT DATABASE()
2025-04-25 14:13:27,622 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:27,624 INFO sqlalchemy.engine.Engine SELECT @@sql_mode
2025-04-25 14:13:27,624 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:27,625 INFO sqlalchemy.engine.Engine SELECT @@lower_case_table_names
2025-04-25 14:13:27,626 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:27,628 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:27,628 INFO sqlalchemy.engine.Engine DESCRIBE `caso_6`.`event_identifier`
2025-04-25 14:13:27,629 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:27,631 INFO sqlalchemy.engine.Engine DESCRIBE `caso_6`.`campaign_identifier`
2025-04-25 14:13:27,632 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:27,633 INFO sqlalchemy.engine.Engine DESCRIBE `caso_6`.`page_hierarchy`
2025-04-25 14:13:27,633 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:27,634

## **1.1 CARGANDO LOS DATOS**

In [6]:
from utils import funciones_utiles

event_identifier_data = funciones_utiles.load_text_as_tuples('datos_tablas/event_identifier_data.txt')
campaign_identifier_data = funciones_utiles.load_text_as_tuples('datos_tablas/campaign_identifier_data.txt')
page_hierarchy_data =funciones_utiles.load_text_as_tuples('datos_tablas/page_hierarchy_data.txt')
users_data = funciones_utiles.load_text_as_tuples('datos_tablas/users_data.txt')
events_data = funciones_utiles.load_text_as_tuples('datos_tablas/events_data.txt')

In [7]:
event_identifier_data

[('1', 'Page View'),
 ('2', 'Add to Cart'),
 ('3', 'Purchase'),
 ('4', 'Ad Impression'),
 ('5', 'Ad Click')]

In [8]:
from sqlalchemy.orm import sessionmaker

# Crear la sesión
Session = sessionmaker(bind=engine)
session = Session()


In [9]:
from sqlalchemy.orm import sessionmaker



# Insertar los datos
for eid, name in event_identifier_data:
    
    eid = eid.strip('\'')
    name = name.strip('\'')

    print(eid)
    print(name)

    exists_query = session.query(
        session.query(EventIdentifier).filter_by(id=int(eid)).exists()
    ).scalar()

    if not exists_query:
        event = EventIdentifier(id=int(eid), nombre=name)
        session.add(event)


# Confirmar los cambios en la base de datos
session.commit()

# Cerrar sesión
session.close()

1
Page View
2025-04-25 14:13:28,350 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:28,354 INFO sqlalchemy.engine.Engine SELECT EXISTS (SELECT 1 
FROM event_identifier 
WHERE event_identifier.id = %(id_1)s) AS anon_1
2025-04-25 14:13:28,355 INFO sqlalchemy.engine.Engine [generated in 0.00129s] {'id_1': 1}
2
Add to Cart
2025-04-25 14:13:28,363 INFO sqlalchemy.engine.Engine INSERT INTO event_identifier (id, nombre) VALUES (%(id)s, %(nombre)s)
2025-04-25 14:13:28,364 INFO sqlalchemy.engine.Engine [generated in 0.00109s] {'id': 1, 'nombre': 'Page View'}
2025-04-25 14:13:28,373 INFO sqlalchemy.engine.Engine SELECT EXISTS (SELECT 1 
FROM event_identifier 
WHERE event_identifier.id = %(id_1)s) AS anon_1
2025-04-25 14:13:28,374 INFO sqlalchemy.engine.Engine [cached since 0.02047s ago] {'id_1': 2}
3
Purchase
2025-04-25 14:13:28,376 INFO sqlalchemy.engine.Engine INSERT INTO event_identifier (id, nombre) VALUES (%(id)s, %(nombre)s)
2025-04-25 14:13:28,377 INFO sqlalchemy.engine.En

In [10]:
   
for eid, name in event_identifier_data:
    
    eid = eid.strip('\'')
    name = name.strip('\'')

    print(eid)
    print(name)

    exists_query = session.query(
        session.query(EventIdentifier).filter_by(id=int(eid)).exists()
    ).scalar()

    if not exists_query:
        event = EventIdentifier(id=int(eid), nombre=name)
        session.add(event)


# Confirmar los cambios en la base de datos
session.commit()


1
Page View
2025-04-25 14:13:28,410 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:28,411 INFO sqlalchemy.engine.Engine SELECT EXISTS (SELECT 1 
FROM event_identifier 
WHERE event_identifier.id = %(id_1)s) AS anon_1
2025-04-25 14:13:28,412 INFO sqlalchemy.engine.Engine [cached since 0.05894s ago] {'id_1': 1}
2
Add to Cart
2025-04-25 14:13:28,414 INFO sqlalchemy.engine.Engine SELECT EXISTS (SELECT 1 
FROM event_identifier 
WHERE event_identifier.id = %(id_1)s) AS anon_1
2025-04-25 14:13:28,414 INFO sqlalchemy.engine.Engine [cached since 0.06111s ago] {'id_1': 2}
3
Purchase
2025-04-25 14:13:28,416 INFO sqlalchemy.engine.Engine SELECT EXISTS (SELECT 1 
FROM event_identifier 
WHERE event_identifier.id = %(id_1)s) AS anon_1
2025-04-25 14:13:28,416 INFO sqlalchemy.engine.Engine [cached since 0.0628s ago] {'id_1': 3}
4
Ad Impression
2025-04-25 14:13:28,417 INFO sqlalchemy.engine.Engine SELECT EXISTS (SELECT 1 
FROM event_identifier 
WHERE event_identifier.id = %(id_1)s) AS an

In [11]:
campaign_identifier_data

[('1', '1-3', 'BOGOF - Fishing For Compliments', '2020-01-01', '2020-01-14'),
 ('2', '4-5', '25% Off - Living The Lux Life', '2020-01-15', '2020-01-28'),
 ('3', '6-8', 'Half Off - Treat Your Shellf(ish)', '2020-02-01', '2020-03-31')]

In [12]:
from datetime import datetime

objects = [
    CampaignIdentifier(
        campaign_id=int(cid),
        products=prod,
        campaign_name=name,
        start_date=datetime.strptime(start, '%Y-%m-%d'),
        end_date=datetime.strptime(end, '%Y-%m-%d')
    )
    for cid, prod, name, start, end in campaign_identifier_data
]

session.bulk_save_objects(objects)
session.commit()

2025-04-25 14:13:28,459 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:28,460 INFO sqlalchemy.engine.Engine INSERT INTO campaign_identifier (campaign_id, products, campaign_name, start_date, end_date) VALUES (%(campaign_id)s, %(products)s, %(campaign_name)s, %(start_date)s, %(end_date)s)
2025-04-25 14:13:28,461 INFO sqlalchemy.engine.Engine [generated in 0.00064s] [{'campaign_id': 1, 'products': '1-3', 'campaign_name': 'BOGOF - Fishing For Compliments', 'start_date': datetime.datetime(2020, 1, 1, 0, 0), 'end_date': datetime.datetime(2020, 1, 14, 0, 0)}, {'campaign_id': 2, 'products': '4-5', 'campaign_name': '25% Off - Living The Lux Life', 'start_date': datetime.datetime(2020, 1, 15, 0, 0), 'end_date': datetime.datetime(2020, 1, 28, 0, 0)}, {'campaign_id': 3, 'products': '6-8', 'campaign_name': 'Half Off - Treat Your Shellf(ish)', 'start_date': datetime.datetime(2020, 2, 1, 0, 0), 'end_date': datetime.datetime(2020, 3, 31, 0, 0)}]
2025-04-25 14:13:28,463 INFO sqlalchem

In [13]:
page_hierarchy_data

[('1', 'Home Page', None, None),
 ('2', 'All Products', None, None),
 ('3', 'Salmon', 'Fish', '1'),
 ('4', 'Kingfish', 'Fish', '2'),
 ('5', 'Tuna', 'Fish', '3'),
 ('6', 'Russian Caviar', 'Luxury', '4'),
 ('7', 'Black Truffle', 'Luxury', '5'),
 ('8', 'Abalone', 'Shellfish', '6'),
 ('9', 'Lobster', 'Shellfish', '7'),
 ('10', 'Crab', 'Shellfish', '8'),
 ('11', 'Oyster', 'Shellfish', '9'),
 ('12', 'Checkout', None, None),
 ('13', 'Confirmation', None, None)]

In [14]:
from datetime import datetime

objects = [
    PageHierarchy(
        page_id=int(page_id),
        page_name=page_name,
        product_category=product_category,
        product_id=int(product_id) if product_id is not None else None,
    )
    for page_id, page_name, product_category, product_id in page_hierarchy_data
]

session.bulk_save_objects(objects)
session.commit()

2025-04-25 14:13:28,505 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:28,507 INFO sqlalchemy.engine.Engine INSERT INTO page_hierarchy (page_id, page_name) VALUES (%(page_id)s, %(page_name)s)
2025-04-25 14:13:28,507 INFO sqlalchemy.engine.Engine [generated in 0.00063s] [{'page_id': 1, 'page_name': 'Home Page'}, {'page_id': 2, 'page_name': 'All Products'}]
2025-04-25 14:13:28,510 INFO sqlalchemy.engine.Engine INSERT INTO page_hierarchy (page_id, page_name, product_category, product_id) VALUES (%(page_id)s, %(page_name)s, %(product_category)s, %(product_id)s)
2025-04-25 14:13:28,510 INFO sqlalchemy.engine.Engine [generated in 0.00062s] [{'page_id': 3, 'page_name': 'Salmon', 'product_category': 'Fish', 'product_id': 1}, {'page_id': 4, 'page_name': 'Kingfish', 'product_category': 'Fish', 'product_id': 2}, {'page_id': 5, 'page_name': 'Tuna', 'product_category': 'Fish', 'product_id': 3}, {'page_id': 6, 'page_name': 'Russian Caviar', 'product_category': 'Luxury', 'product_id'

In [15]:
from datetime import datetime

objects = [
    Users(
        user_id=int(user_id),
        cookie_id=cookie_id,
        start_date=datetime.strptime(start_date, '%Y-%m-%d')
    )
    for user_id, cookie_id, start_date in users_data
]

session.bulk_save_objects(objects)
session.commit()

2025-04-25 14:13:28,581 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:28,594 INFO sqlalchemy.engine.Engine INSERT INTO users (user_id, cookie_id, start_date) VALUES (%(user_id)s, %(cookie_id)s, %(start_date)s)
2025-04-25 14:13:28,595 INFO sqlalchemy.engine.Engine [generated in 0.00632s] [{'user_id': 1, 'cookie_id': 'c4ca42', 'start_date': datetime.datetime(2020, 2, 4, 0, 0)}, {'user_id': 2, 'cookie_id': 'c81e72', 'start_date': datetime.datetime(2020, 1, 18, 0, 0)}, {'user_id': 3, 'cookie_id': 'eccbc8', 'start_date': datetime.datetime(2020, 2, 21, 0, 0)}, {'user_id': 4, 'cookie_id': 'a87ff6', 'start_date': datetime.datetime(2020, 2, 22, 0, 0)}, {'user_id': 5, 'cookie_id': 'e4da3b', 'start_date': datetime.datetime(2020, 2, 1, 0, 0)}, {'user_id': 6, 'cookie_id': '167909', 'start_date': datetime.datetime(2020, 1, 25, 0, 0)}, {'user_id': 7, 'cookie_id': '8f14e4', 'start_date': datetime.datetime(2020, 2, 9, 0, 0)}, {'user_id': 8, 'cookie_id': 'c9f0f8', 'start_date': datetim

In [16]:
from datetime import datetime

objects = [
    Events(
        visit_id=visit_id,
        cookie_id=cookie_id,
        page_id=int(page_id),
        event_type = event_type,
        sequence_number = sequence_number,
        event_time =  event_time   
    )
    for visit_id, cookie_id, page_id, event_type, sequence_number, event_time in events_data
]

session.bulk_save_objects(objects)
session.commit()

2025-04-25 14:13:29,275 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:29,529 INFO sqlalchemy.engine.Engine INSERT INTO events (visit_id, cookie_id, page_id, event_type, sequence_number, event_time) VALUES (%(visit_id)s, %(cookie_id)s, %(page_id)s, %(event_type)s, %(sequence_number)s, %(event_time)s)
2025-04-25 14:13:29,529 INFO sqlalchemy.engine.Engine [generated in 0.07973s] [{'visit_id': 'ccf365', 'cookie_id': 'c4ca42', 'page_id': 1, 'event_type': '1', 'sequence_number': '1', 'event_time': '2020-02-04 19:16:09.182546'}, {'visit_id': 'ccf365', 'cookie_id': 'c4ca42', 'page_id': 2, 'event_type': '1', 'sequence_number': '2', 'event_time': '2020-02-04 19:16:17.358191'}, {'visit_id': 'ccf365', 'cookie_id': 'c4ca42', 'page_id': 6, 'event_type': '1', 'sequence_number': '3', 'event_time': '2020-02-04 19:16:58.454669'}, {'visit_id': 'ccf365', 'cookie_id': 'c4ca42', 'page_id': 9, 'event_type': '1', 'sequence_number': '4', 'event_time': '2020-02-04 19:16:58.609142'}, {'visit_id

<center><h1>PREGUNTAS</h1></center>

# **1. Cuantos usuarios hay ?**

In [17]:
query = '''

    SELECT
        COUNT(DISTINCT user_id) AS total_usuarios
    FROM
        users

'''


pd.read_sql_query(query, con=engine)

2025-04-25 14:13:30,213 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:30,214 INFO sqlalchemy.engine.Engine 

    SELECT
        COUNT(DISTINCT user_id) AS total_usuarios
    FROM
        users


2025-04-25 14:13:30,214 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:30,224 INFO sqlalchemy.engine.Engine ROLLBACK


Unnamed: 0,total_usuarios
0,500


- Existe un total de 500 usuarios unicos

# **2. Cuantos cookies en promedio existen por usuario ?**

In [18]:
query = '''
    WITH cuenta_por_usuario AS (
    SELECT
        user_id,
        COUNT(cookie_id) as total_cookies
    FROM
        users
    GROUP BY
        user_id )


    SELECT 
        AVG(total_cookies) AS Promedio_cookies_por_usuario
    FROM
        cuenta_por_usuario

'''


pd.read_sql_query(query, con=engine)

2025-04-25 14:13:30,245 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:30,246 INFO sqlalchemy.engine.Engine 
    WITH cuenta_por_usuario AS (
    SELECT
        user_id,
        COUNT(cookie_id) as total_cookies
    FROM
        users
    GROUP BY
        user_id )


    SELECT 
        AVG(total_cookies) AS Promedio_cookies_por_usuario
    FROM
        cuenta_por_usuario


2025-04-25 14:13:30,247 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:30,250 INFO sqlalchemy.engine.Engine ROLLBACK


Unnamed: 0,Promedio_cookies_por_usuario
0,3.564


# **3. Cúal es el número de visitas unicas de todos los usuarios por mes?**



In [19]:
query = '''
    SELECT
        EXTRACT(MONTH FROM start_date) AS mes,
       COUNT(DISTINCT  visit_id) AS total_usuarios_por_mes
    FROM
        users u
    JOIN 
        events  e
     ON u.cookie_id = e.cookie_id
    GROUP BY 
        mes



'''


pd.read_sql_query(query, con=engine)

2025-04-25 14:13:30,262 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:30,263 INFO sqlalchemy.engine.Engine 
    SELECT
        EXTRACT(MONTH FROM start_date) AS mes,
       COUNT(DISTINCT  visit_id) AS total_usuarios_por_mes
    FROM
        users u
    JOIN 
        events  e
     ON u.cookie_id = e.cookie_id
    GROUP BY 
        mes




2025-04-25 14:13:30,263 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:30,321 INFO sqlalchemy.engine.Engine ROLLBACK


Unnamed: 0,mes,total_usuarios_por_mes
0,1,876
1,2,1488
2,3,916
3,4,248
4,5,36



# **4. Cúal es el numero de eventos de cada tipo?**




In [20]:
query = '''
    SELECT
        i.nombre,
        COUNT(*) AS cantidad_de_eventos
    FROM
        events  e
    JOIN 
        event_identifier i
    
     ON e.event_type = i.id
    GROUP BY
        i.nombre
  


'''


pd.read_sql_query(query, con=engine)

2025-04-25 14:13:30,343 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:30,344 INFO sqlalchemy.engine.Engine 
    SELECT
        i.nombre,
        COUNT(*) AS cantidad_de_eventos
    FROM
        events  e
    JOIN 
        event_identifier i
    
     ON e.event_type = i.id
    GROUP BY
        i.nombre
  



2025-04-25 14:13:30,345 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:30,402 INFO sqlalchemy.engine.Engine ROLLBACK


Unnamed: 0,nombre,cantidad_de_eventos
0,Page View,20928
1,Add to Cart,8451
2,Purchase,1777
3,Ad Impression,876
4,Ad Click,702


# **5. Cúal es el procentaje de visitas que tiene un evento de purchase ?**



In [21]:
query = '''
    
   
    SELECT
        100 * COUNT(DISTINCT e.visit_id)/(SELECT COUNT(DISTINCT visit_id) FROM events) AS porcentaje_compras
    FROM
        events  e
    JOIN 
        event_identifier i
    
     ON e.event_type = i.id
    WHERE i.nombre = 'Purchase'

  
  
  


'''


pd.read_sql_query(query, con=engine)

2025-04-25 14:13:30,420 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:30,421 INFO sqlalchemy.engine.Engine 
    
   
    SELECT
        100 * COUNT(DISTINCT e.visit_id)/(SELECT COUNT(DISTINCT visit_id) FROM events) AS porcentaje_compras
    FROM
        events  e
    JOIN 
        event_identifier i
    
     ON e.event_type = i.id
    WHERE i.nombre = 'Purchase'

  
  
  



2025-04-25 14:13:30,422 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:30,468 INFO sqlalchemy.engine.Engine ROLLBACK


Unnamed: 0,porcentaje_compras
0,49.8597


# **6. Cual es el procentaje de vistitantes que vieron la pagina de checkout pero no compraron (purchase event)**




In [22]:
query = '''
    
    WITH tabla_resumen AS (
    SELECT
        e.visit_id,
        SUM((CASE WHEN i.nombre = 'Page View' AND ph.page_name = 'Checkout' THEN 1 ELSE 0 END)) AS checkout,
        SUM((CASE WHEN i.nombre = 'Purchase' THEN 1 ELSE 0 END)) AS purchase


    FROM
        events  e
    JOIN 
        event_identifier i
    
     ON e.event_type = i.id
       
    JOIN 
        page_hierarchy ph
    
     ON e.page_id = ph.page_id
    GROUP BY
        e.visit_id )

    
    SELECT
        ROUND(100 * (1-(SUM(purchase)/SUM(checkout))),2) AS percentage_checkout_view_with_no_purchase
    FROM
        tabla_resumen
  
  
  
  


'''


pd.read_sql_query(query, con=engine)

2025-04-25 14:13:30,482 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:30,483 INFO sqlalchemy.engine.Engine 
    
    WITH tabla_resumen AS (
    SELECT
        e.visit_id,
        SUM((CASE WHEN i.nombre = 'Page View' AND ph.page_name = 'Checkout' THEN 1 ELSE 0 END)) AS checkout,
        SUM((CASE WHEN i.nombre = 'Purchase' THEN 1 ELSE 0 END)) AS purchase


    FROM
        events  e
    JOIN 
        event_identifier i
    
     ON e.event_type = i.id
       
    JOIN 
        page_hierarchy ph
    
     ON e.page_id = ph.page_id
    GROUP BY
        e.visit_id )

    
    SELECT
        ROUND(100 * (1-(SUM(purchase)/SUM(checkout))),2) AS percentage_checkout_view_with_no_purchase
    FROM
        tabla_resumen
  
  
  
  



2025-04-25 14:13:30,484 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:30,571 INFO sqlalchemy.engine.Engine ROLLBACK


Unnamed: 0,percentage_checkout_view_with_no_purchase
0,15.5


# **7. Cuales son los tops 3 paginas por numero de visitas?**






In [23]:
query = '''
    
   

    SELECT
        ph.page_name,
        COUNT(visit_id) AS total_visitas
    FROM
        events e
    JOIN 
        page_hierarchy ph
    ON e.page_id = ph.page_id
    WHERE e.event_type = 1
    GROUP BY 
        ph.page_name
    ORDER BY
        total_visitas DESC
  
  


'''


pd.read_sql_query(query, con=engine)

2025-04-25 14:13:30,591 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:30,592 INFO sqlalchemy.engine.Engine 
    
   

    SELECT
        ph.page_name,
        COUNT(visit_id) AS total_visitas
    FROM
        events e
    JOIN 
        page_hierarchy ph
    ON e.page_id = ph.page_id
    WHERE e.event_type = 1
    GROUP BY 
        ph.page_name
    ORDER BY
        total_visitas DESC
  
  



2025-04-25 14:13:30,593 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:30,640 INFO sqlalchemy.engine.Engine ROLLBACK


Unnamed: 0,page_name,total_visitas
0,All Products,3174
1,Checkout,2103
2,Home Page,1782
3,Oyster,1568
4,Crab,1564
5,Russian Caviar,1563
6,Kingfish,1559
7,Salmon,1559
8,Lobster,1547
9,Abalone,1525


# **8. ¿Cuál es el número de visualizaciones y agregaciones al carrito para cada categoría de producto?**



In [24]:
query = '''
    
   

    SELECT
        ph.product_category,
        SUM(CASE WHEN ei.nombre='Add to Cart' THEN 1 ELSE 0 END) AS carrito,
        SUM(CASE WHEN ei.nombre='Page View' THEN 1 ELSE 0 END) AS visitas
    FROM
        events e
    JOIN 
        page_hierarchy ph
    ON e.page_id = ph.page_id
    JOIN 
        event_identifier ei
    ON e.event_type = ei.id
    WHERE ph.product_category IS NOT NULL
    GROUP BY
        ph.product_category
    ORDER BY
        visitas DESC
  


'''


pd.read_sql_query(query, con=engine)

2025-04-25 14:13:30,656 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:30,657 INFO sqlalchemy.engine.Engine 
    
   

    SELECT
        ph.product_category,
        SUM(CASE WHEN ei.nombre='Add to Cart' THEN 1 ELSE 0 END) AS carrito,
        SUM(CASE WHEN ei.nombre='Page View' THEN 1 ELSE 0 END) AS visitas
    FROM
        events e
    JOIN 
        page_hierarchy ph
    ON e.page_id = ph.page_id
    JOIN 
        event_identifier ei
    ON e.event_type = ei.id
    WHERE ph.product_category IS NOT NULL
    GROUP BY
        ph.product_category
    ORDER BY
        visitas DESC
  



2025-04-25 14:13:30,658 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:30,738 INFO sqlalchemy.engine.Engine ROLLBACK


Unnamed: 0,product_category,carrito,visitas
0,Shellfish,3792.0,6204.0
1,Fish,2789.0,4633.0
2,Luxury,1870.0,3032.0


# **9. ¿Cuáles son los 3 productos principales por compras?**



In [25]:
query = '''
    
    -- Obtener todas las ids que fueron compras

    WITH all_ids_purchase AS (
    SELECT
        DISTINCT visit_id
    FROM
        events
    WHERE
        event_type = 3 ),

    -- Filtrar la tabla evento por todas aquellas ids donde hubo compras

    purchase_table AS (

    SELECT
        *
    FROM
        events
    WHERE
        visit_id IN (SELECT * FROM all_ids_purchase) ) 

    
    -- Una vez filtrado agrupamos por producto y contamos

    SELECT
        ph.page_name,
        COUNT(*) AS total_compras
    FROM
        purchase_table pt
    JOIN
       page_hierarchy ph
    ON pt.page_id = ph.page_id
    WHERE
        event_type = 2
    GROUP BY
        ph.page_name
    ORDER BY
        total_compras
    
    

'''


pd.read_sql_query(query, con=engine)

2025-04-25 14:13:30,766 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:30,767 INFO sqlalchemy.engine.Engine 
    
    -- Obtener todas las ids que fueron compras

    WITH all_ids_purchase AS (
    SELECT
        DISTINCT visit_id
    FROM
        events
    WHERE
        event_type = 3 ),

    -- Filtrar la tabla evento por todas aquellas ids donde hubo compras

    purchase_table AS (

    SELECT
        *
    FROM
        events
    WHERE
        visit_id IN (SELECT * FROM all_ids_purchase) ) 

    
    -- Una vez filtrado agrupamos por producto y contamos

    SELECT
        ph.page_name,
        COUNT(*) AS total_compras
    FROM
        purchase_table pt
    JOIN
       page_hierarchy ph
    ON pt.page_id = ph.page_id
    WHERE
        event_type = 2
    GROUP BY
        ph.page_name
    ORDER BY
        total_compras
    
    


2025-04-25 14:13:30,769 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:30,819 INFO sqlalchemy.engine.Engine ROLLBACK


Unnamed: 0,page_name,total_compras
0,Tuna,697
1,Russian Caviar,697
2,Abalone,699
3,Kingfish,707
4,Black Truffle,707
5,Salmon,711
6,Crab,719
7,Oyster,726
8,Lobster,754


# **3. Análisis del Embudo de Producto**

Usando una sola consulta SQL, crea una nueva tabla de salida que contenga los siguientes detalles:

- ¿Cuántas veces se visualizó cada producto?
- ¿Cuántas veces se añadió cada producto al carrito?
- ¿Cuántas veces se añadió cada producto al carrito pero no se compró (abandonado)?
- ¿Cuántas veces se compró cada producto?

In [26]:
query = '''
    WITH temporal_resume AS (
    SELECT
        e.visit_id,
        e.cookie_id,
        e.page_id,
        e.event_type,
        e.sequence_number,
        e.event_time,
        ei.nombre,
        ph.page_name,
        ph.product_category,
        ph.product_id,
        ph.page_name as product_name
    FROM
        events e
    JOIN event_identifier ei
    ON  e.event_type = ei.id
    JOIN page_hierarchy ph
    ON e.page_id = ph.page_id)


    SELECT 
        product_name,
        product_category,
        
        SUM((CASE WHEN nombre = 'Page View' THEN 1 ELSE 0 END )) AS total_vistar_por_producto,
        SUM((CASE WHEN nombre = 'Add to Cart' THEN 1 ELSE 0 END )) AS total_agregados_carro,
        SUM((CASE WHEN nombre = 'Add to Cart'  AND visit_id NOT IN (SELECT DISTINCT visit_id FROM temporal_resume WHERE nombre='Purchase') THEN 1 ELSE 0 END)) AS agregado_pero_no_comprados,
        SUM((CASE WHEN nombre = 'Add to Cart'  AND visit_id  IN (SELECT DISTINCT visit_id FROM temporal_resume WHERE nombre='Purchase') THEN 1 ELSE 0 END)) AS Comprados

    FROM temporal_resume

    GROUP BY 
        product_name, product_category

 



'''


pd.read_sql_query(query, con=engine)

2025-04-25 14:13:30,843 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:30,843 INFO sqlalchemy.engine.Engine 
    WITH temporal_resume AS (
    SELECT
        e.visit_id,
        e.cookie_id,
        e.page_id,
        e.event_type,
        e.sequence_number,
        e.event_time,
        ei.nombre,
        ph.page_name,
        ph.product_category,
        ph.product_id,
        ph.page_name as product_name
    FROM
        events e
    JOIN event_identifier ei
    ON  e.event_type = ei.id
    JOIN page_hierarchy ph
    ON e.page_id = ph.page_id)


    SELECT 
        product_name,
        product_category,
        
        SUM((CASE WHEN nombre = 'Page View' THEN 1 ELSE 0 END )) AS total_vistar_por_producto,
        SUM((CASE WHEN nombre = 'Add to Cart' THEN 1 ELSE 0 END )) AS total_agregados_carro,
        SUM((CASE WHEN nombre = 'Add to Cart'  AND visit_id NOT IN (SELECT DISTINCT visit_id FROM temporal_resume WHERE nombre='Purchase') THEN 1 ELSE 0 END)) AS agregado_

Unnamed: 0,product_name,product_category,total_vistar_por_producto,total_agregados_carro,agregado_pero_no_comprados,Comprados
0,Home Page,,1782.0,0.0,0.0,0.0
1,All Products,,3174.0,0.0,0.0,0.0
2,Russian Caviar,Luxury,1563.0,946.0,249.0,697.0
3,Lobster,Shellfish,1547.0,968.0,214.0,754.0
4,Crab,Shellfish,1564.0,949.0,230.0,719.0
5,Oyster,Shellfish,1568.0,943.0,217.0,726.0
6,Checkout,,2103.0,0.0,0.0,0.0
7,Confirmation,,0.0,0.0,0.0,0.0
8,Kingfish,Fish,1559.0,920.0,213.0,707.0
9,Tuna,Fish,1515.0,931.0,234.0,697.0


In [27]:
query = ''' 

CREATE TABLE product_funnel_analysis AS
WITH temporal_resume AS (
    SELECT
        e.visit_id,
        e.cookie_id,
        e.page_id,
        e.event_type,
        e.sequence_number,
        e.event_time,
        ei.nombre,
        ph.page_name,
        ph.product_category,
        ph.product_id,
        ph.page_name as product_name
    FROM
        events e
    JOIN event_identifier ei
    ON  e.event_type = ei.id
    JOIN page_hierarchy ph
    ON e.page_id = ph.page_id)


    SELECT 
        product_name,
        product_category,
        
        SUM((CASE WHEN nombre = 'Page View' THEN 1 ELSE 0 END )) AS total_vistar_por_producto,
        SUM((CASE WHEN nombre = 'Add to Cart' THEN 1 ELSE 0 END )) AS total_agregados_carro,
        SUM((CASE WHEN nombre = 'Add to Cart'  AND visit_id NOT IN (SELECT DISTINCT visit_id FROM temporal_resume WHERE nombre='Purchase') THEN 1 ELSE 0 END)) AS agregado_pero_no_comprados,
        SUM((CASE WHEN nombre = 'Add to Cart'  AND visit_id  IN (SELECT DISTINCT visit_id FROM temporal_resume WHERE nombre='Purchase') THEN 1 ELSE 0 END)) AS Comprados

    FROM temporal_resume

    GROUP BY 
        product_name, product_category
        
        
    '''


with engine.begin() as conn:
    conn.execute(text(query))

2025-04-25 14:13:31,063 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:31,065 INFO sqlalchemy.engine.Engine  

CREATE TABLE product_funnel_analysis AS
WITH temporal_resume AS (
    SELECT
        e.visit_id,
        e.cookie_id,
        e.page_id,
        e.event_type,
        e.sequence_number,
        e.event_time,
        ei.nombre,
        ph.page_name,
        ph.product_category,
        ph.product_id,
        ph.page_name as product_name
    FROM
        events e
    JOIN event_identifier ei
    ON  e.event_type = ei.id
    JOIN page_hierarchy ph
    ON e.page_id = ph.page_id)


    SELECT 
        product_name,
        product_category,
        
        SUM((CASE WHEN nombre = 'Page View' THEN 1 ELSE 0 END )) AS total_vistar_por_producto,
        SUM((CASE WHEN nombre = 'Add to Cart' THEN 1 ELSE 0 END )) AS total_agregados_carro,
        SUM((CASE WHEN nombre = 'Add to Cart'  AND visit_id NOT IN (SELECT DISTINCT visit_id FROM temporal_resume WHERE nombre='Purch

# **1. Qué producto tuvo más visualizaciones, añadidos al carrito y compras?**

In [28]:
query = '''
    SELECT
        *

    FROM 
        product_funnel_analysis
    WHERE 
        product_category IS NOT NULL

    ORDER BY total_vistar_por_producto DESC,
            total_agregados_carro DESC,
            Comprados DESC



'''


pd.read_sql_query(query, con=engine)

2025-04-25 14:13:31,452 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:31,453 INFO sqlalchemy.engine.Engine 
    SELECT
        *

    FROM 
        product_funnel_analysis
    WHERE 
        product_category IS NOT NULL

    ORDER BY total_vistar_por_producto DESC,
            total_agregados_carro DESC,
            Comprados DESC




2025-04-25 14:13:31,453 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:31,456 INFO sqlalchemy.engine.Engine ROLLBACK


Unnamed: 0,product_name,product_category,total_vistar_por_producto,total_agregados_carro,agregado_pero_no_comprados,Comprados
0,Oyster,Shellfish,1568.0,943.0,217.0,726.0
1,Crab,Shellfish,1564.0,949.0,230.0,719.0
2,Russian Caviar,Luxury,1563.0,946.0,249.0,697.0
3,Salmon,Fish,1559.0,938.0,227.0,711.0
4,Kingfish,Fish,1559.0,920.0,213.0,707.0
5,Lobster,Shellfish,1547.0,968.0,214.0,754.0
6,Abalone,Shellfish,1525.0,932.0,233.0,699.0
7,Tuna,Fish,1515.0,931.0,234.0,697.0
8,Black Truffle,Luxury,1469.0,924.0,217.0,707.0


- **Oyster**

# **2. ¿Qué producto es más propenso a ser abandonado?**

In [29]:
query = '''
    SELECT
        *

    FROM 
        product_funnel_analysis
    WHERE 
        product_category IS NOT NULL

    ORDER BY
            agregado_pero_no_comprados DESC
    LIMIT 
        1;



'''


pd.read_sql_query(query, con=engine)

2025-04-25 14:13:31,486 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:31,488 INFO sqlalchemy.engine.Engine 
    SELECT
        *

    FROM 
        product_funnel_analysis
    WHERE 
        product_category IS NOT NULL

    ORDER BY
            agregado_pero_no_comprados DESC
    LIMIT 
        1;




2025-04-25 14:13:31,489 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:31,491 INFO sqlalchemy.engine.Engine ROLLBACK


Unnamed: 0,product_name,product_category,total_vistar_por_producto,total_agregados_carro,agregado_pero_no_comprados,Comprados
0,Russian Caviar,Luxury,1563.0,946.0,249.0,697.0


# **3. ¿Qué producto tuvo el mayor porcentaje de visualizaciones a compras?**

In [30]:
query = '''
    SELECT
        *,
        ROUND(Comprados *100 / total_vistar_por_producto, 2) AS porcentaje_ventas_por_visitas

    FROM 
        product_funnel_analysis
    WHERE 
        product_category IS NOT NULL
    ORDER BY
        porcentaje_ventas_por_visitas DESC;





'''


pd.read_sql_query(query, con=engine)

2025-04-25 14:13:31,517 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:31,518 INFO sqlalchemy.engine.Engine 
    SELECT
        *,
        ROUND(Comprados *100 / total_vistar_por_producto, 2) AS porcentaje_ventas_por_visitas

    FROM 
        product_funnel_analysis
    WHERE 
        product_category IS NOT NULL
    ORDER BY
        porcentaje_ventas_por_visitas DESC;






2025-04-25 14:13:31,519 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:31,522 INFO sqlalchemy.engine.Engine ROLLBACK


Unnamed: 0,product_name,product_category,total_vistar_por_producto,total_agregados_carro,agregado_pero_no_comprados,Comprados,porcentaje_ventas_por_visitas
0,Lobster,Shellfish,1547.0,968.0,214.0,754.0,48.74
1,Black Truffle,Luxury,1469.0,924.0,217.0,707.0,48.13
2,Oyster,Shellfish,1568.0,943.0,217.0,726.0,46.3
3,Tuna,Fish,1515.0,931.0,234.0,697.0,46.01
4,Crab,Shellfish,1564.0,949.0,230.0,719.0,45.97
5,Abalone,Shellfish,1525.0,932.0,233.0,699.0,45.84
6,Salmon,Fish,1559.0,938.0,227.0,711.0,45.61
7,Kingfish,Fish,1559.0,920.0,213.0,707.0,45.35
8,Russian Caviar,Luxury,1563.0,946.0,249.0,697.0,44.59


# **4. ¿Cuál es la tasa de conversión promedio de visualización a añadir al carrito?**

# **5. ¿Cuál es la tasa de conversión promedio de añadir al carrito a compra?**



In [31]:
query = '''
    SELECT
        AVG(ROUND(total_agregados_carro *100 / total_vistar_por_producto, 2)) AS conversion_promedio_view_to_add_cart,
        AVG(ROUND(Comprados	 *100 / total_agregados_carro, 2)) AS conversion_promedio_view_to_add_cart


    FROM 
        product_funnel_analysis
    WHERE 
        product_category IS NOT NULL





'''


pd.read_sql_query(query, con=engine)

2025-04-25 14:13:31,545 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 14:13:31,546 INFO sqlalchemy.engine.Engine 
    SELECT
        AVG(ROUND(total_agregados_carro *100 / total_vistar_por_producto, 2)) AS conversion_promedio_view_to_add_cart,
        AVG(ROUND(Comprados	 *100 / total_agregados_carro, 2)) AS conversion_promedio_view_to_add_cart


    FROM 
        product_funnel_analysis
    WHERE 
        product_category IS NOT NULL






2025-04-25 14:13:31,546 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-04-25 14:13:31,548 INFO sqlalchemy.engine.Engine ROLLBACK


Unnamed: 0,conversion_promedio_view_to_add_cart,conversion_promedio_view_to_add_cart.1
0,60.95,75.928889


# **3. Análisis de Campañas**

Genera una tabla que tenga 1 única fila por cada registro único de `visit_id` y que contenga las siguientes columnas:

- **user_id**: Identificador del usuario.
- **visit_id**: Identificador de la visita.
- **visit_start_time**: El `event_time` más temprano para cada visita.
- **page_views**: Conteo de vistas de página para cada visita.
- **cart_adds**: Conteo de eventos de agregar producto al carrito para cada visita.
- **purchase**: Indicador 1/0 si existe un evento de compra para cada visita.
- **campaign_name**: Asigna la visita a una campaña si el `visit_start_time` está entre la `start_date` y la `end_date` de la campaña.
- **impression**: Conteo de impresiones de anuncios para cada visita.
- **click**: Conteo de clics en anuncios para cada visita.
- **(Columna opcional) cart_products**: Un valor de texto separado por comas con los productos agregados al carrito, ordenados según el orden en que fueron agregados (pista: usa el `sequence_number`).

In [65]:
query = '''
   
SELECT 
    u.user_id, 
    e.visit_id, 
    MIN(e.event_time) AS visit_start_time,
    SUM(CASE WHEN e.event_type = 1 THEN 1 ELSE 0 END) AS page_views,
    SUM(CASE WHEN e.event_type = 2 THEN 1 ELSE 0 END) AS cart_adds,
    SUM(CASE WHEN e.event_type = 3 THEN 1 ELSE 0 END) AS purchase,
    c.campaign_name,
    SUM(CASE WHEN e.event_type = 4 THEN 1 ELSE 0 END) AS impression, 
    SUM(CASE WHEN e.event_type = 5 THEN 1 ELSE 0 END) AS click, 
    GROUP_CONCAT(
        CASE 
            WHEN p.product_id IS NOT NULL AND e.event_type = 2 THEN p.page_name 
            ELSE NULL 
        END 
        ORDER BY e.sequence_number
        SEPARATOR ', '
    ) AS cart_products
FROM 
    users AS u
INNER JOIN 
    events AS e ON u.cookie_id = e.cookie_id
LEFT JOIN 
    campaign_identifier AS c ON e.event_time BETWEEN c.start_date AND c.end_date
LEFT JOIN 
    page_hierarchy AS p ON e.page_id = p.page_id
GROUP BY 
    u.user_id, e.visit_id, c.campaign_name;






'''


pd.read_sql_query(query, con=engine)

2025-04-25 15:20:33,231 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-04-25 15:20:33,232 INFO sqlalchemy.engine.Engine 
   
SELECT 
    u.user_id, 
    e.visit_id, 
    MIN(e.event_time) AS visit_start_time,
    SUM(CASE WHEN e.event_type = 1 THEN 1 ELSE 0 END) AS page_views,
    SUM(CASE WHEN e.event_type = 2 THEN 1 ELSE 0 END) AS cart_adds,
    SUM(CASE WHEN e.event_type = 3 THEN 1 ELSE 0 END) AS purchase,
    c.campaign_name,
    SUM(CASE WHEN e.event_type = 4 THEN 1 ELSE 0 END) AS impression, 
    SUM(CASE WHEN e.event_type = 5 THEN 1 ELSE 0 END) AS click, 
    GROUP_CONCAT(
        CASE 
            WHEN p.product_id IS NOT NULL AND e.event_type = 2 THEN p.page_name 
            ELSE NULL 
        END 
        ORDER BY e.sequence_number
        SEPARATOR ', '
    ) AS cart_products
FROM 
    users AS u
INNER JOIN 
    events AS e ON u.cookie_id = e.cookie_id
LEFT JOIN 
    campaign_identifier AS c ON e.event_time BETWEEN c.start_date AND c.end_date
LEFT JOIN 
    page_hierar

Unnamed: 0,user_id,visit_id,visit_start_time,page_views,cart_adds,purchase,campaign_name,impression,click,cart_products
0,1,02a5d5,2020-02-26 16:57:26,4.0,0.0,0.0,Half Off - Treat Your Shellf(ish),0.0,0.0,
1,1,0826dc,2020-02-26 05:58:38,1.0,0.0,0.0,Half Off - Treat Your Shellf(ish),0.0,0.0,
2,1,0fc437,2020-02-04 17:49:50,10.0,6.0,1.0,Half Off - Treat Your Shellf(ish),1.0,1.0,"Tuna, Russian Caviar, Black Truffle, Abalone, ..."
3,1,30b94d,2020-03-15 13:12:54,9.0,7.0,1.0,Half Off - Treat Your Shellf(ish),1.0,1.0,"Salmon, Kingfish, Tuna, Russian Caviar, Abalon..."
4,1,41355d,2020-03-25 00:11:18,6.0,1.0,0.0,Half Off - Treat Your Shellf(ish),0.0,0.0,Lobster
...,...,...,...,...,...,...,...,...,...,...
3559,499,e6794c,2020-01-29 04:55:56,8.0,4.0,1.0,,0.0,0.0,"Russian Caviar, Abalone, Lobster, Crab"
3560,500,13668d,2020-02-28 02:16:15,8.0,5.0,1.0,Half Off - Treat Your Shellf(ish),1.0,1.0,"Salmon, Tuna, Russian Caviar, Abalone, Crab"
3561,500,29e5f8,2020-02-28 22:06:31,8.0,3.0,1.0,Half Off - Treat Your Shellf(ish),0.0,0.0,"Kingfish, Russian Caviar, Abalone"
3562,500,4cdfb7,2020-02-28 00:31:15,1.0,0.0,0.0,Half Off - Treat Your Shellf(ish),0.0,0.0,
