# Lectura de datos exploratoria 
- **Objetivo**: Prototipar la lectura desde MySQL, MongoDB, ADLS y Amazon S3.

Este notebook sirve como prueba inicial de conectividad y lectura.

In [1]:
import os
import pandas as pd
from pandas import DataFrame
from dotenv import load_dotenv

# Load environment variables from the .env file so later cells can read
# settings such as MONGO_URI and AZURE_STORAGE_CONNECTION_STRING via os.getenv.
# The trailing `;` suppresses the cell's output.
load_dotenv();

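DataFrames que se cargan en este notebook:

- `categories_df` (MySQL y Amazon S3)
- `customers_df` (MySQL y Amazon S3)
- `departments_df` (MongoDB)
- `order_items_df` (MongoDB)
- `orders_df` (ADLS)
- `products_df` (ADLS)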

## MySQL

In [2]:
from sqlalchemy import create_engine, text

In [3]:
# Connect to MySQL.
# The full SQLAlchemy URL (user, password, host, port, database) is read from
# the MYSQL_URI environment variable so that real credentials live in .env and
# not in the notebook. The fallback is the local development URL that used to
# be hard-coded here; override it for any non-local environment.
mysql_uri = os.getenv("MYSQL_URI", "mysql+pymysql://root:root@mysql:3306/retail_db")
engine = create_engine(mysql_uri)
# `conn` is shared by the MySQL read cells below.
conn = engine.connect()

In [4]:
# Read the whole `categories` table from MySQL and preview its first row
categories_query = text("SELECT * FROM categories")
categories_df = pd.read_sql_query(sql=categories_query, con=conn)
categories_df.head(n=1)

Unnamed: 0,category_id,category_department_id,category_name
0,1,2,Football


In [5]:
# Read the whole `customers` table from MySQL and preview its first row
customers_query = text("SELECT * FROM customers")
customers_df = pd.read_sql_query(sql=customers_query, con=conn)
customers_df.head(n=1)

Unnamed: 0,customer_id,customer_fname,customer_lname,customer_email,customer_password,customer_street,customer_city,customer_state,customer_zipcode
0,1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521


## MongoDB

In [6]:
from pymongo import MongoClient

In [7]:
def get_database():
    """Return a handle to the ``retail_db`` MongoDB database.

    The connection string is read from the ``MONGO_URI`` environment variable.
    One ``MongoClient`` is created per distinct connection string and reused on
    later calls, so calling this function from several cells does not build a
    new client each time.

    Returns:
        The ``retail_db`` database object of the cached client.

    Raises:
        RuntimeError: If ``MONGO_URI`` is unset or empty.
    """
    connection_string = os.getenv("MONGO_URI")
    if not connection_string:
        # Without this guard MongoClient(None) would quietly target its
        # default host instead of reporting the missing configuration.
        raise RuntimeError(
            "MONGO_URI is not set; define it in the environment or the .env file"
        )

    # Cache clients on the function object, keyed by connection string.
    clients = get_database.__dict__.setdefault("_clients", {})
    if connection_string not in clients:
        clients[connection_string] = MongoClient(connection_string)

    return clients[connection_string]['retail_db']

In [8]:
# Load every document of the `departments` collection from MongoDB
dbname = get_database()
collection_name = dbname.get_collection("departments")
departments = collection_name.find(filter={})

# Materialise the cursor into a DataFrame and preview the first rows
departments_df = pd.DataFrame(departments)
departments_df.head(n=5)

Unnamed: 0,_id,department_id,department_name
0,68958d94ef1d1c6340e365a8,7,Fan Shop
1,68958d94ef1d1c6340e365a6,5,Golf
2,68958d94ef1d1c6340e365a4,3,Footwear
3,68958d94ef1d1c6340e365a3,2,Fitness
4,68958d94ef1d1c6340e365a7,6,Outdoors


In [9]:
# Load every document of the `order_items` collection from MongoDB
dbname = get_database()
collection_name = dbname.get_collection("order_items")
order_items = collection_name.find(filter={})

# Materialise the cursor into a DataFrame and preview the first rows
order_items_df = pd.DataFrame(order_items)
order_items_df.head(n=5)

Unnamed: 0,_id,order_item_id,order_item_order_id,order_item_product_id,order_item_quantity,order_item_subtotal,order_item_product_price
0,68958d95ef1d1c6340e365bf,22,9,1073,1,199.99,199.99
1,68958d95ef1d1c6340e365d0,39,13,276,4,127.96,31.99
2,68958d95ef1d1c6340e365da,49,16,365,5,299.95,59.99
3,68958d95ef1d1c6340e365fa,81,28,191,1,99.99,99.99
4,68958d95ef1d1c6340e36603,90,33,403,1,129.99,129.99


## ADLS

In [10]:
import io
from io import StringIO
from azure.storage.blob import ContainerClient

In [11]:
# Read the storage connection string from the environment
conn_str = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
if not conn_str:
    # Fail early with an actionable message rather than inside the Azure SDK
    raise RuntimeError(
        "AZURE_STORAGE_CONNECTION_STRING is not set; "
        "define it in the environment or the .env file"
    )
container_name = "retail-source"

# Connect to the container that the CSV blobs are read from below
container_client = ContainerClient.from_connection_string(
    conn_str=conn_str,
    container_name=container_name
)

In [12]:
# Read orders from ADLS: download the blob as UTF-8 text and parse it as CSV.
blob_client = container_client.get_blob_client('orders.csv')

# content_as_text() is deprecated in azure-storage-blob v12; the supported
# form is to pass the encoding to download_blob() and call readall(), which
# then returns the decoded text.
csv_content = blob_client.download_blob(encoding="utf-8").readall()
orders_df = pd.read_csv(StringIO(csv_content))
orders_df.head()

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,1,2013-07-25,11599,CLOSED
1,2,2013-07-25,256,PENDING_PAYMENT
2,3,2013-07-25,12111,COMPLETE
3,4,2013-07-25,8827,CLOSED
4,5,2013-07-25,11318,COMPLETE


In [13]:
# Read products from ADLS: download the blob as UTF-8 text and parse it as CSV.
blob_client = container_client.get_blob_client('products.csv')

# content_as_text() is deprecated in azure-storage-blob v12; the supported
# form is to pass the encoding to download_blob() and call readall(), which
# then returns the decoded text.
csv_content = blob_client.download_blob(encoding="utf-8").readall()
products_df = pd.read_csv(StringIO(csv_content))
products_df.head()

Unnamed: 0,product_id,product_category_id,product_name,product_description,product_price,product_image
0,1,2,Quest Q64 10 FT. x 10 FT. Slant Leg Instant U,,59.98,http://images.acmesports.sports/Quest+Q64+10+F...
1,2,2,Under Armour Men's Highlight MC Football Clea,,129.99,http://images.acmesports.sports/Under+Armour+M...
2,3,2,Under Armour Men's Renegade D Mid Football Cl,,89.99,http://images.acmesports.sports/Under+Armour+M...
3,4,2,Under Armour Men's Renegade D Mid Football Cl,,89.99,http://images.acmesports.sports/Under+Armour+M...
4,5,2,Riddell Youth Revolution Speed Custom Footbal,,199.99,http://images.acmesports.sports/Riddell+Youth+...


## Amazon S3

In [14]:
import boto3

In [15]:
# Build the S3 client used by the cells below; no credentials are passed
# here, so boto3 resolves them itself.
s3_client = boto3.client(service_name='s3')

In [16]:
# Location of the categories CSV in S3
bucket_name = "retail-landing"
file_key = "categories.csv"

# Fetch the object and decode its payload as UTF-8 text
csv_obj = s3_client.get_object(Key=file_key, Bucket=bucket_name)
body = str(csv_obj['Body'].read(), 'utf-8')

# Parse the CSV text into a DataFrame
# (this rebinds `categories_df`, replacing the frame loaded from MySQL)
categories_df = pd.read_csv(io.StringIO(body))

# Preview the first rows
categories_df.head(n=5)

Unnamed: 0,category_id,category_department_id,category_name
0,1,2,Football
1,2,2,Soccer
2,3,2,Baseball & Softball
3,4,2,Basketball
4,5,2,Lacrosse


In [17]:
# Location of the customers CSV in S3
bucket_name = "retail-landing"
file_key = "customers.csv"

# Fetch the object and decode its payload as UTF-8 text
csv_obj = s3_client.get_object(Key=file_key, Bucket=bucket_name)
body = str(csv_obj['Body'].read(), 'utf-8')

# Parse the CSV text into a DataFrame
# (this rebinds `customers_df`, replacing the frame loaded from MySQL)
customers_df = pd.read_csv(io.StringIO(body))

# Preview the first rows
customers_df.head(n=5)

Unnamed: 0,customer_id,customer_fname,customer_lname,customer_email,customer_password,customer_street,customer_city,customer_state,customer_zipcode
0,1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521
1,2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126
2,3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,3422 Blue Pioneer Bend,Caguas,PR,725
3,4,Mary,Jones,XXXXXXXXX,XXXXXXXXX,8324 Little Common,San Marcos,CA,92069
4,5,Robert,Hudson,XXXXXXXXX,XXXXXXXXX,10 Crystal River Mall,Caguas,PR,725
