In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime

# Notebook-wide configuration. Both paths are relative to the working directory.
DATA_PATH = '../data/online_retail.csv'
IMG_DIR = '../images'  # presumably where figures get saved later; verify against later cells

# Create the image directory up front; exist_ok makes re-running this cell safe.
os.makedirs(name=IMG_DIR, exist_ok=True)

# Allow up to 50 columns in DataFrame displays.
pd.options.display.max_columns = 50
print(f"pandas {pd.__version__}")

In [None]:
# Load the dataset; InvoiceDate is parsed as a datetime column during the read.
df = pd.read_csv(
    filepath_or_buffer=DATA_PATH,
    parse_dates=['InvoiceDate'],
)
df.head(5)

In [None]:
# Print a concise summary of the freshly loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Summary statistics for every column (numeric and non-numeric), one row per column.
df.describe(include='all').transpose()

In [None]:
# Data cleaning: drop rows that are exact duplicates across every column.
# The result is reassigned rather than using inplace=True, so the step reads
# as an explicit transformation of `df`.
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(f"Filas eliminadas por duplicados: {before - after}")

In [None]:
# Customer clean-up in one chain: keep only rows that have a CustomerID,
# label missing descriptions as 'Unknown', and cast the ID to int (the cast
# is only possible once the null IDs are gone).
before = len(df)
df = (
    df.dropna(subset=['CustomerID'])
      .assign(
          Description=lambda frame: frame['Description'].fillna('Unknown'),
          CustomerID=lambda frame: frame['CustomerID'].astype(int),
      )
)
after = len(df)
print(f"Filas eliminadas por CustomerID nulo: {before - after}")

In [None]:
# Keep only rows with a strictly positive unit price.
# A single mask drives both the report and the filter, so the printed count
# always equals the number of rows removed. A NaN price compares False
# against `> 0`, so it is counted as invalid as well as dropped.
valid_price = df['UnitPrice'] > 0
invalid_price = (~valid_price).sum()
print('Filas con precio inválido:', invalid_price)
df = df[valid_price]

In [None]:
# Derive calendar features from InvoiceDate and the per-line revenue
# (Quantity x UnitPrice). assign() adds the columns in the order listed.
df = df.assign(
    Year=lambda frame: frame['InvoiceDate'].dt.year,
    Month=lambda frame: frame['InvoiceDate'].dt.month,
    DayOfWeek=lambda frame: frame['InvoiceDate'].dt.day_name(),
    Hour=lambda frame: frame['InvoiceDate'].dt.hour,
    Revenue=lambda frame: frame['Quantity'].mul(frame['UnitPrice']),
)
df.head()

In [None]:
# Descriptive analysis with pandas: total revenue per country, largest first,
# rounded to two decimals. (This header line previously began with '@'
# instead of '#', which is a SyntaxError and stopped the cell from running.)
sales_by_country = (
    df.groupby('Country')['Revenue']
      .sum()
      .sort_values(ascending=False)
      .round(2)
)
sales_by_country.head(10)

In [None]:
# Ten (StockCode, Description) pairs with the largest summed Quantity.
# Rows whose Quantity is not strictly positive are excluded before summing.
top_products_qty = (
    df.loc[df['Quantity'].gt(0)]
      .groupby(['StockCode', 'Description'])['Quantity']
      .sum()
      .sort_values(ascending=False)
      .iloc[:10]
)
top_products_qty

In [None]:
# Ten (StockCode, Description) pairs with the largest summed Revenue,
# rounded to two decimals. No Quantity filter is applied here.
top_products_rev = (
    df.groupby(['StockCode', 'Description'])['Revenue']
      .sum()
      .sort_values(ascending=False)
      .iloc[:10]
      .round(2)
)
top_products_rev

In [None]:
# Ten customers with the largest summed Revenue, rounded to two decimals.
top_customers = (
    df.groupby('CustomerID')['Revenue']
      .sum()
      .sort_values(ascending=False)
      .iloc[:10]
      .round(2)
)
top_customers

In [None]:
# Revenue summed per (Year, Month), returned as a flat frame in
# chronological order with Year and Month as regular columns.
monthly_revenue = (
    df.groupby(['Year', 'Month'], as_index=False)['Revenue']
      .sum()
      .sort_values(['Year', 'Month'])
)
monthly_revenue.head()