# Northwind

Extenderemos nuestro ejemplo modelando nuestra base de datos en su formato dimensional o de estrella.

In [17]:
import sqlite3

# Connect to the DB
conn = sqlite3.connect("northwind.db")
c = conn.cursor();

In [18]:
c.execute("""SELECT name FROM sqlite_master WHERE type='table';""").fetchall()

[('Categories',),
 ('sqlite_sequence',),
 ('Customers',),
 ('Employees',),
 ('Shippers',),
 ('Suppliers',),
 ('Products',),
 ('Orders',),
 ('OrderDetails',),
 ('Retired',),
 ('Customer_DIM',),
 ('Order_DIM',),
 ('Product_DIM',),
 ('Orderline_FACT',)]

Crearemos las tablas de dimensiones con la información por la que querremos cortar o agrupar los datos. En este caso nuestras dimensiones serán las de **Cliente**, **Order** y **Producto**.

In [19]:
c.execute("""CREATE TABLE Customer_DIM AS SELECT CustomerID, CustomerName, Country, City FROM Customers;""").fetchall()
c.execute("""CREATE TABLE Order_DIM AS SELECT OrderID, datetime(OrderDate) as Date, ShipperName FROM Orders INNER JOIN Shippers""").fetchall()
c.execute("""CREATE TABLE Product_DIM AS SELECT ProductID, ProductName, Unit, Price FROM Products""").fetchall()

OperationalError: table Customer_DIM already exists

Y los hechos a analizar, la información relativa a las **ordenes** (cuando, a qué precio, qué cantidad, etc...).

In [20]:
c.execute("""CREATE TABLE Orderline_FACT AS SELECT od.*, CustomerID FROM Orders o INNER JOIN OrderDetails od ON od.OrderID = o.OrderID""").fetchall()

OperationalError: table Orderline_FACT already exists

Vemos que ahora disponemos de nuevas tablas en nuestro sistema. Habitualmente estas tablas se encontrarán en otra base de datos, nuestro sistema informacional o _data warehouse_.

In [21]:
c.execute("""SELECT name FROM sqlite_master WHERE type='table';""").fetchall()

[('Categories',),
 ('sqlite_sequence',),
 ('Customers',),
 ('Employees',),
 ('Shippers',),
 ('Suppliers',),
 ('Products',),
 ('Orders',),
 ('OrderDetails',),
 ('Retired',),
 ('Customer_DIM',),
 ('Order_DIM',),
 ('Product_DIM',),
 ('Orderline_FACT',)]

In [None]:
# Utiliza SQLAlchemy para conectarse a una base de datos SQLite

In [23]:
!pip install sqlalchemy 



In [24]:
from sqlalchemy import create_engine # La función create_engine se utiliza para configurar la conexión a la base de datos 

engine = create_engine('sqlite:///C:/Users/rodri/OneDrive/Escritorio/DATA_SCIENCE/CODIGO_CLASES/TheBridge_DSPT-1/2-Data_Analysis/6-Bases_de_datos/Práctica/datanorthwind.db', echo=False) 
connection = engine.connect()  

# Aquí se crea un engine para conectarse a una base de datos SQLite. El prefijo 'sqlite:///' indica que se trata de una base de
# datos SQLite. Se debe proporcionar la ruta correcta al archivo de base de datos (path/to/northwind.db). El argumento echo=False es opcional
# y, si se establece como True, SQLAlchemy imprimirá todas las consultas SQL generadas, lo cual es útil para depuración. 

Podemos observar la información relativa a las ordenes, en este caso solo disponemos de cantidad de producto vendida en cada orden.

In [25]:
from sqlalchemy import inspect

# Crear un inspector para listar las tablas
inspector = inspect(engine)
tables = inspector.get_table_names()

# Imprimir la lista de tablas
print("Tablas en la base de datos:", tables) 

Tablas en la base de datos: []


In [26]:
import pandas as pd

fact_table = pd.read_sql_table('Orderline_FACT', con=engine)
fact_table 

ValueError: Table Orderline_FACT not found

Pero combinando con la información de contexto podemos obtener información significativa a las preguntas que nos haciamos antes.

In [27]:
import pandas as pd

query = """
SELECT 	strftime('%m', Date) AS Month, CustomerID AS Client, ProductID AS Product, count(*) AS Howmany 
FROM Orderline_FACT
JOIN Order_DIM 
GROUP BY Month, Client, Product
"""
pd.read_sql(query, con=connection)

OperationalError: (sqlite3.OperationalError) no such table: Orderline_FACT
[SQL: 
SELECT 	strftime('%m', Date) AS Month, CustomerID AS Client, ProductID AS Product, count(*) AS Howmany 
FROM Orderline_FACT
JOIN Order_DIM 
GROUP BY Month, Client, Product
]
(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [28]:
query = """
SELECT 	strftime('%m', Date) AS Month, CustomerID AS Client, ProductName, count(*) AS Howmany 
FROM Orderline_FACT
JOIN Order_DIM 
JOIN Product_DIM p
GROUP BY Month, Client, ProductName
"""
pd.read_sql(query, con=connection)

OperationalError: (sqlite3.OperationalError) no such table: Orderline_FACT
[SQL: 
SELECT 	strftime('%m', Date) AS Month, CustomerID AS Client, ProductName, count(*) AS Howmany 
FROM Orderline_FACT
JOIN Order_DIM 
JOIN Product_DIM p
GROUP BY Month, Client, ProductName
]
(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [7]:
query = """
SELECT strftime('%Y', Date) AS Year, 
       strftime('%m', Date) AS Month,
       CustomerName AS Client, 
       ProductName AS Product, 
       count(*) AS Howmany
FROM Orderline_FACT fact
JOIN Order_DIM o ON o.OrderID = fact.OrderID
JOIN Product_DIM p ON p.ProductID = fact.ProductID
JOIN Customer_DIM c ON c.CustomerID = fact.CustomerID
GROUP BY Year, Month, Client, Product
"""
pd.read_sql(query, con=connection)

Unnamed: 0,Year,Month,Client,Product,Howmany
0,1996,07,Blondel père et fils,Alice Mutton,3
1,1996,07,Blondel père et fils,Outback Lager,3
2,1996,07,Centro comercial Moctezuma,Gravad lax,3
3,1996,07,Centro comercial Moctezuma,Sir Rodney's Scones,3
4,1996,07,Chop-suey Chinese,Guaraná Fantástica,3
...,...,...,...,...,...
507,1997,02,Save-a-lot Markets,Thüringer Rostbratwurst,3
508,1997,02,Toms Spezialitäten,Ravioli Angelo,3
509,1997,02,Toms Spezialitäten,Sasquatch Ale,3
510,1997,02,Toms Spezialitäten,Teatime Chocolate Biscuits,3


Estas consultas sin disponer de mucho dato pueden ser pesadas. En estos casos suele ser recomendable, si no nos urge la información de rabiosa actualidad, que estos procesos sean ejecutados durante la noche para disponer de información ya agregada a la mañana siguiente.

# Cubos

Se definen como cubos agregaciones como la anterior ya consolidadas como tablas donde podemos cortar las dimensiones en base a los datos de corte dimensional seleccionados.

In [10]:
query = """
SELECT strftime('%Y', Date) AS Year, 
       strftime('%m', Date) AS Month,
       CustomerName AS Client, 
       ProductName AS Product, 
       count(*) AS Howmany
FROM Orderline_FACT fact
JOIN Order_DIM o ON o.OrderID = fact.OrderID
JOIN Product_DIM p ON p.ProductID = fact.ProductID
JOIN Customer_DIM c ON c.CustomerID = fact.CustomerID
GROUP BY Year, Month, Client, Product
"""
productivity = pd.read_sql(query, con=connection)
productivity.to_sql("Productivity_CUBE", con=engine)

512

In [13]:
# Now it should be faster to get our insights
# Two most productive months
query = """SELECT Month, SUM(Howmany) AS Cuantos
FROM Productivity_CUBE 
GROUP BY Month
ORDER BY 2 DESC
LIMIT 2"""
pd.read_sql(query, con=connection)

Unnamed: 0,Month,Cuantos
0,1,255
1,12,243


In [14]:
# Three most productive months-client
query = """SELECT Month, Client, SUM(Howmany) AS Cuantos
FROM Productivity_CUBE 
GROUP BY Month, Client
ORDER BY 3 DESC
LIMIT 3"""
pd.read_sql(query, con=connection)

Unnamed: 0,Month,Client,Cuantos
0,8,QUICK-Stop,30
1,9,Hungry Owl All-Night Grocers,27
2,10,Frankenversand,27


In [15]:
conn.close()