# Cargar Librerías

In [30]:
from pathlib import Path
import sqlite3
import pandas as pd
import numpy as np
from datetime import datetime

# Cargamos datos desde SQLite

In [5]:
# Location of the cleaned SQLite database produced by the earlier pipeline step.
DB_DIR = Path("../db")
DB_PATH = DB_DIR / "online_retail_clean.db"
print(DB_PATH)

# Fail fast with a clear message. sqlite3.connect() would otherwise create an
# empty database file and the query below would die with "no such table".
if not DB_PATH.is_file():
    raise FileNotFoundError(f"SQLite database not found: {DB_PATH}")

# `with sqlite3.connect(...)` only wraps a transaction; it does not close the
# connection, so close it explicitly once the table has been read.
conn = sqlite3.connect(DB_PATH)
try:
    df = pd.read_sql_query("SELECT * FROM transactions", conn)
finally:
    conn.close()

..\db\online_retail_clean.db


# Tipos recomendados y columnas derivadas

In [6]:
# Parse the timestamp column; values that cannot be parsed become NaT.
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"], errors="coerce")

# Force the numeric columns to numbers; values that cannot be parsed become NaN.
for numeric_col in ("Quantity", "UnitPrice"):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

# Derived column: line-level revenue = quantity times unit price.
df["Revenue"] = df["Quantity"].mul(df["UnitPrice"])

print("Shape:", df.shape)
df.head()

Shape: (397884, 9)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34


# Fecha mínima y máxima

In [15]:
# Dataset dimensions and the list of column names.
rows, cols = df.shape
columns = list(df.columns)

# Earliest and latest invoice timestamps in the table.
invoice_dates = df["InvoiceDate"]
date_min, date_max = invoice_dates.min(), invoice_dates.max()

for label, stamp in (("Fecha Maxima", date_max), ("Fecha Minima", date_min)):
    print(label, stamp)

Fecha Maxima 2011-12-09 12:50:00
Fecha Minima 2010-12-01 08:26:00


# Clientes, Países, Productos, Facturas

In [14]:
# Distinct counts of invoices, products and countries.
n_invoices, n_products, n_countries = (
    df[col].nunique() for col in ("InvoiceNo", "StockCode", "Country")
)
# dropna=True keeps missing customer ids out of the customer count.
n_customers = df["CustomerID"].nunique(dropna=True)

for label, count in (
    ("# Paises", n_countries),
    ("# Clientes", n_customers),
    ("# Facturas", n_invoices),
    ("# Productos", n_products),
):
    print(label, count)

# Paises 37
# Clientes 4338
# Facturas 18532
# Productos 3665


# Métricas de ventas

In [24]:
# Total revenue across every line item; nansum ignores NaN revenue values.
total_revenue = np.nansum(df["Revenue"])

# A single groupby object feeds both per-invoice aggregations.
per_invoice = df.groupby("InvoiceNo", dropna=True)

# Revenue per invoice and its central tendency (order value).
order_revenue = per_invoice["Revenue"].sum()
avg_order_value = order_revenue.mean()
median_order_value = order_revenue.median()

# Units per invoice and their central tendency.
items_per_invoice = per_invoice["Quantity"].sum()
avg_items_per_inv = items_per_invoice.mean()
median_items_per_inv = items_per_invoice.median()

for label, value in (
    ("Total Ganacia", total_revenue),
    ("Ordenes ", order_revenue),
    ("Promedio Ventas", avg_order_value),
    ("Mediana Ventas", median_order_value),
    ("Promedio items por factura", avg_items_per_inv),
    ("Mediana items por factura", median_items_per_inv),
    ("Items por Factura", items_per_invoice),
):
    print(label, value)




Total Ganacia 8911407.904
Ordenes  InvoiceNo
536365    139.12
536366     22.20
536367    278.73
536368     70.05
536369     17.85
           ...  
581583    124.60
581584    140.64
581585    329.05
581586    339.20
581587    249.45
Name: Revenue, Length: 18532, dtype: float64
Promedio Ventas 480.8659563997409
Mediana Ventas 303.03999999999996
Promedio items por factura 278.8588387653788
Mediana items por factura 155.0
Items por Factura InvoiceNo
536365     40
536366     12
536367     83
536368     15
536369      3
         ... 
581583     76
581584    120
581585    278
581586     66
581587    105
Name: Quantity, Length: 18532, dtype: int64


# Top países / productos

In [26]:
# Revenue summed per country, sorted from highest to lowest, first ten kept.
revenue_by_country = df.groupby("Country", dropna=True)["Revenue"].sum()
top_countries = revenue_by_country.sort_values(ascending=False).head(10)

# Units summed per product description, sorted the same way, first ten kept.
units_by_product = df.groupby("Description", dropna=True)["Quantity"].sum()
top_products_by_qty = units_by_product.sort_values(ascending=False).head(10)

print(top_countries)
print(top_products_by_qty)



Country
United Kingdom    7308391.554
Netherlands        285446.340
EIRE               265545.900
Germany            228867.140
France             209024.050
Australia          138521.310
Spain               61577.110
Switzerland         56443.950
Belgium             41196.340
Sweden              38378.330
Name: Revenue, dtype: float64
Description
PAPER CRAFT , LITTLE BIRDIE           80995
MEDIUM CERAMIC TOP STORAGE JAR        77916
WORLD WAR 2 GLIDERS ASSTD DESIGNS     54415
JUMBO BAG RED RETROSPOT               46181
WHITE HANGING HEART T-LIGHT HOLDER    36725
ASSORTED COLOUR BIRD ORNAMENT         35362
PACK OF 72 RETROSPOT CAKE CASES       33693
POPCORN HOLDER                        30931
RABBIT NIGHT LIGHT                    27202
MINI PAINT SET VINTAGE                26076
Name: Quantity, dtype: int64


# Nulos y duplicados

In [27]:
# Missing values per column, most affected columns first.
null_counts = df.isna().sum()
nulls_by_col = null_counts.sort_values(ascending=False)

# Number of rows that exactly repeat an earlier row.
dupe_rows = df.duplicated().sum()

# Plain-Python snapshot of the headline numbers computed in the cells above.
summary_preview = dict(
    rows=rows,
    cols=cols,
    date_min=str(date_min),
    date_max=str(date_max),
    n_invoices=n_invoices,
    n_customers=n_customers,
    n_products=n_products,
    n_countries=n_countries,
    total_revenue=float(total_revenue),
    avg_order_value=float(avg_order_value),
    median_order_value=float(median_order_value),
    avg_items_per_invoice=float(avg_items_per_inv),
    median_items_per_invoice=float(median_items_per_inv),
    dup_rows=int(dupe_rows),
)
summary_preview

{'rows': 397884,
 'cols': 9,
 'date_min': '2010-12-01 08:26:00',
 'date_max': '2011-12-09 12:50:00',
 'n_invoices': 18532,
 'n_customers': 4338,
 'n_products': 3665,
 'n_countries': 37,
 'total_revenue': 8911407.904,
 'avg_order_value': 480.8659563997409,
 'median_order_value': 303.03999999999996,
 'avg_items_per_invoice': 278.8588387653788,
 'median_items_per_invoice': 155.0,
 'dup_rows': 5192}

# Path para archivo txt

In [None]:
# Output folder for the plain-text EDA report; created here when it is missing.
REPORTS_DIR = Path("../reports")
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Full path of the report file written by the next cell.
OUT_TXT = REPORTS_DIR.joinpath("online_retail_clean.txt")
print(OUT_TXT)

# Generación de archivo txt

In [31]:


# Human-readable description of each column, keyed by column name.
# The report falls back to a placeholder for any column not listed here.
field_desc = dict(
    InvoiceNo="Identificador de factura (string). En el raw, facturas anuladas suelen iniciar con 'C'.",
    StockCode="Código del producto (string).",
    Description="Nombre del producto (texto).",
    Quantity="Cantidad vendida en la línea (entero; en el raw, valores negativos indican devoluciones).",
    InvoiceDate="Fecha y hora de la transacción (datetime).",
    UnitPrice="Precio unitario del producto (numérico).",
    CustomerID="Identificador de cliente (categórico; en el raw hay nulos).",
    Country="País del cliente (categórico).",
    Revenue="Campo derivado = Quantity × UnitPrice (numérico).",
)

def fmt_money(x):
    """Format a number with thousands separators and two decimal places."""
    return format(x, ",.2f")
# Build the top-N listing
def list_block(title, series, top_n=10, money=False):
    """Render the first ``top_n`` entries of ``series`` as a titled bullet list.

    Args:
        title: Heading text; a colon is appended to it on the first line.
        series: Object exposing ``head(n)`` and ``items()`` (e.g. a pandas Series).
        top_n: Number of leading entries to include.
        money: When true, values go through ``fmt_money``; otherwise they are
            formatted with a thousands separator only.

    Returns:
        A single string with one line for the title and one per entry.
    """
    render = fmt_money if money else (lambda value: f"{value:,}")
    bullets = [
        f"  - {key}: {render(value)}"
        for key, value in series.head(top_n).items()
    ]
    return "\n".join([f"{title}:", *bullets])

# High-level dataset facts, pre-formatted as bullet lines for section 3 of the report.
metadata_list = [
    f"- Rango temporal: {date_min:%Y-%m-%d} → {date_max:%Y-%m-%d}",
    f"- Filas × columnas: {rows:,} × {cols}",
    f"- Facturas únicas: {n_invoices:,}",
    f"- Clientes únicos: {n_customers:,}",
    f"- Productos únicos: {n_products:,}",
    f"- Países: {n_countries:,}",
    f"- Revenue total (según UnitPrice): {fmt_money(total_revenue)}",
    f"- Valor promedio por factura (AOV): {fmt_money(avg_order_value)} (mediana: {fmt_money(median_order_value)})",
    f"- Ítems promedio por factura: {avg_items_per_inv:,.2f} (mediana: {median_items_per_inv:,.2f})",
    f"- Filas duplicadas (exact match): {dupe_rows:,}",
]

# Number of report lines echoed to the notebook after the file is written.
PREVIEW_LINES = 40

# Header. The source path comes from DB_PATH so the report always names the
# database that was actually read; as_posix() keeps forward slashes on Windows.
txt = [
    "=== EDA Summary – Online Retail ===",
    f"Generated at: {datetime.now():%Y-%m-%d %H:%M:%S}",
    f"Source table: transactions (SQLite @ {DB_PATH.as_posix()})",
    "",
    "1) What does the dataset contain?",
    "   Transactional e-commerce records with invoice, product, quantity, pricing, timestamp, customer, and country information.",
    "",
]

# Section 2: one line per column with its dtype, description and null count.
txt.append("2) Field descriptions:")
for c in df.columns:
    desc = field_desc.get(c, "(sin descripción)")
    dtype = str(df[c].dtype)
    nulls = int(nulls_by_col.get(c, 0))
    txt.append(f"   - {c} [{dtype}] – {desc} | nulls: {nulls:,}")

# Section 3: the pre-formatted metadata bullets, indented.
txt.append("")
txt.append("3) High-level metadata:")
txt.extend(f"   {line}" for line in metadata_list)

# Section 4: revenue per country (series already limited to the top 10).
txt.append("")
txt.append("4) Top countries by revenue (Top 10):")
txt.extend(f"   - {k}: {fmt_money(v)}" for k, v in top_countries.items())

# Section 5: units per product (series already limited to the top 10).
txt.append("")
txt.append("5) Top products by quantity (Top 10):")
txt.extend(f"   - {k}: {int(v):,}" for k, v in top_products_by_qty.items())

# Section 6: static list of follow-up analyses.
txt.extend([
    "",
    "6) Suggested additional metadata:",
    "   - Ventas mensuales por país y global.",
    "   - Recencia (última compra por cliente), frecuencia (nº facturas) y valor monetario (RFM).",
    "   - Distribución por día de la semana y hora del día.",
    "   - Ticket promedio por país y dispersión de precios unitarios.",
    "   - Porcentaje del revenue concentrado en Top-N productos/países (curva de Pareto).",
])

# Make sure the target folder exists so this cell does not depend on the
# path cell's mkdir having run in the same session.
OUT_TXT.parent.mkdir(parents=True, exist_ok=True)
OUT_TXT.write_text("\n".join(txt), encoding="utf-8")
print(f"✅ Resumen EDA guardado en: {OUT_TXT}")

# Echo the beginning of the report; the ellipsis is shown only when lines
# were actually left out of the preview.
print("\n--- Vista previa ---\n")
if len(txt) > PREVIEW_LINES:
    print("\n".join(txt[:PREVIEW_LINES]), "\n...\n")
else:
    print("\n".join(txt))


✅ Resumen EDA guardado en: ..\reports\online_retail_clean.txt

--- Vista previa ---

=== EDA Summary – Online Retail ===
Generated at: 2025-09-25 18:51:07
Source table: transactions (SQLite @ ../db/online_retail_clean.db)

1) What does the dataset contain?
   Transactional e-commerce records with invoice, product, quantity, pricing, timestamp, customer, and country information.

2) Field descriptions:
   - InvoiceNo [object] – Identificador de factura (string). En el raw, facturas anuladas suelen iniciar con 'C'. | nulls: 0
   - StockCode [object] – Código del producto (string). | nulls: 0
   - Description [object] – Nombre del producto (texto). | nulls: 0
   - Quantity [int64] – Cantidad vendida en la línea (entero; en el raw, valores negativos indican devoluciones). | nulls: 0
   - InvoiceDate [datetime64[ns]] – Fecha y hora de la transacción (datetime). | nulls: 0
   - UnitPrice [float64] – Precio unitario del producto (numérico). | nulls: 0
   - CustomerID [float64] – Identificador