# Minería de patrones - Online Retail

Itemsets frecuentes, reglas de asociación y patrones emergentes sobre el dataset limpio.


## Objetivos
- Preparar transacciones con un único espacio de ítems (encoder único)
- Obtener itemsets frecuentes y reglas (soporte, confianza, lift)
- Comparar particiones temporales (H1 vs H2, con corte en 2011-06-01) mediante growth rate
- Exportar resultados a disco y generar visualizaciones básicas


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

from mlxtend.frequent_patterns import fpgrowth, association_rules, fpmax
from mlxtend.preprocessing import TransactionEncoder

# Notebook-wide display defaults: pandas renders floats with thousands
# separators and four decimals; seaborn applies its "whitegrid" theme.
pd.set_option("display.float_format", "{:,.4f}".format)
sns.set_theme(style="whitegrid")


## Configuración


In [None]:
# Filesystem layout: the cleaned dataset is read from, and every result file is
# written to, the same data/processed directory (created here if missing).
DATA_DIR = Path("data")
RESULTS_DIR = DATA_DIR / "processed"
PROCESSED_PATH = RESULTS_DIR / "cleaned_online_retail.csv"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Mining parameters
MIN_SUPPORT = 0.01     # default minimum support used by run_fpgrowth
MIN_CONFIDENCE = 0.3   # confidence threshold used when deriving rules
MAX_LEN = None         # e.g. 3 to cap the itemset length
SEED = 42

# Seed NumPy's global random generator.
np.random.seed(SEED)


## Carga del dataset procesado


In [None]:
# Fail early, pointing at the notebook that generates the cleaned CSV.
if not PROCESSED_PATH.exists():
    raise FileNotFoundError("Ejecuta primero eda.ipynb para generar cleaned_online_retail.csv")

# Load the cleaned data and parse InvoiceDate into datetimes so it can be
# compared against timestamps when partitioning.
df = pd.read_csv(PROCESSED_PATH).assign(
    InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate'])
)

row_count = len(df)
item_count = df['Description'].nunique()
invoice_count = df['InvoiceNo'].nunique()
print(f"Filas: {row_count:,} | Items unicos: {item_count} | Facturas: {invoice_count}")
df.head()


## Particiones temporales
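Se separa el dataset en dos particiones con corte en el 2011-06-01: H1 contiene las facturas anteriores a esa fecha y H2 las de esa fecha en adelante, para observar cambios estacionales.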


In [None]:
# Split on the cutoff date: rows strictly before it go to part1 (H1), rows on
# or after it go to part2 (H2).
cutoff = pd.Timestamp('2011-06-01')
invoice_dates = df['InvoiceDate']
part1 = df.loc[invoice_dates.lt(cutoff)]
part2 = df.loc[invoice_dates.ge(cutoff)]

# Report row and distinct-invoice counts for each partition.
for label, part in (("H1", part1), ("H2", part2)):
    print(f"{label}: {len(part):,} filas, {part['InvoiceNo'].nunique()} facturas")


## Preparación de transacciones (encoder único)
Se ajusta `TransactionEncoder` con todas las transacciones para garantizar el mismo espacio de ítems en cada partición.


In [None]:
def build_transactions(df_part):
    """Group the rows of *df_part* into one item list per invoice.

    Returns a list with one entry per distinct ``InvoiceNo``, in the sorted key
    order produced by ``groupby``. Each entry is the list of ``Description``
    values of that invoice, in row order and with repeats kept.
    """
    descriptions_by_invoice = df_part.groupby('InvoiceNo')['Description']
    return [list(items) for _, items in descriptions_by_invoice]

# One transaction list for the full dataset and one per temporal partition.
transactions_all, transactions1, transactions2 = (
    build_transactions(frame) for frame in (df, part1, part2)
)

# Fit the encoder once on the full dataset so that every one-hot frame below
# shares the same item columns.
te = TransactionEncoder()
te.fit(transactions_all)

te_ary_all = te.transform(transactions_all)
te_ary_1 = te.transform(transactions1)
te_ary_2 = te.transform(transactions2)

onehot_all, onehot_1, onehot_2 = (
    pd.DataFrame(encoded, columns=te.columns_)
    for encoded in (te_ary_all, te_ary_1, te_ary_2)
)

print(f"Items en espacio comun: {onehot_all.shape[1]}")


## Funciones auxiliares para itemsets


In [None]:
def run_fpgrowth(df_oh, min_support=MIN_SUPPORT, max_len=MAX_LEN):
    """Mine frequent itemsets from a one-hot transaction frame with FP-Growth.

    Args:
        df_oh: One-hot encoded transactions, passed straight to ``fpgrowth``.
        min_support: Minimum support forwarded to ``fpgrowth``.
        max_len: Maximum itemset length forwarded to ``fpgrowth``.

    Returns:
        The itemsets frame (item names as labels) ordered by descending
        ``support``, with a fresh 0..n-1 index.
    """
    itemsets = fpgrowth(
        df_oh,
        min_support=min_support,
        use_colnames=True,
        max_len=max_len,
    )
    ranked = itemsets.sort_values(by='support', ascending=False)
    return ranked.reset_index(drop=True)


def top_summary(freq, name, n=5):
    """Print a header line followed by the first *n* rows of *freq*.

    Args:
        freq: Frame of frequent itemsets; only its first ``n`` rows are shown.
        name: Label of the dataset or partition, used in the header.
        n: Number of rows to print.
    """
    header = f"Top {n} itemsets frecuentes - {name}"
    print(header, freq.head(n), sep="\n")

# Mine the full dataset and each partition with the default thresholds.
freq_all, freq_p1, freq_p2 = map(run_fpgrowth, (onehot_all, onehot_1, onehot_2))

# Print the highest-support itemsets of each result.
for label, freq in (("Dataset completo", freq_all), ("H1", freq_p1), ("H2", freq_p2)):
    top_summary(freq, label)


## Reglas de asociación (dataset completo)


In [None]:
# Derive rules from the full-dataset itemsets, keep those reaching the
# confidence threshold, and rank them by descending lift.
rules = (
    association_rules(freq_all, metric='confidence', min_threshold=MIN_CONFIDENCE)
    .sort_values(by='lift', ascending=False)
)
print(f"Reglas generadas: {len(rules)}")

summary_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
rules[summary_columns].head()


## Patrones emergentes (growth rate H2 vs H1)
Para cada itemset se calcula el growth rate como el cociente entre su soporte en H2 y su soporte en H1. Los itemsets con growth rate infinito se tratan como Jumping Emerging Patterns (JEP).


In [None]:
def _exact_support(onehot, itemset):
    """Return the fraction of rows of *onehot* in which every item of *itemset* is present."""
    if len(onehot) == 0:
        return 0.0
    return float(onehot[list(itemset)].all(axis=1).mean())


# Outer-join the frequent itemsets of both partitions. A missing support after
# the join only means the itemset fell below the mining threshold in that
# partition, not that it never occurred there. Filling it with 0 would label
# near-threshold itemsets as JEPs (or as vanished), so the missing side is
# recomputed from the partition's one-hot matrix instead.
merged = pd.merge(freq_p1, freq_p2, on='itemsets', how='outer', suffixes=('_1', '_2'))
for support_col, onehot_part in (('support_1', onehot_1), ('support_2', onehot_2)):
    missing = merged[support_col].isna()
    if missing.any():
        merged.loc[missing, support_col] = [
            _exact_support(onehot_part, itemset)
            for itemset in merged.loc[missing, 'itemsets']
        ]

# Growth rate = support in H2 / support in H1; infinite when the itemset does
# not occur at all in H1.
merged['growth_rate'] = np.where(
    merged['support_1'] == 0,
    np.inf,
    merged['support_2'] / merged['support_1']
)

merged = merged.sort_values(by='growth_rate', ascending=False)

# Segments (mutually exclusive, so the printed counts do not double-count):
#   jep    - infinite growth rate
#   inc    - finite growth rate above 1.2
#   stable - growth rate within [0.8, 1.2]
#   dec    - growth rate below 0.8, still occurring in H2 (itemsets absent from
#            H2 altogether are not included in this segment)
is_jep = merged['growth_rate'] == np.inf
jep = merged[is_jep].sort_values(by='support_2', ascending=False)
inc = merged[(merged['growth_rate'] > 1.2) & ~is_jep].sort_values(by='growth_rate', ascending=False)
stable = merged[merged['growth_rate'].between(0.8, 1.2)].sort_values(by='support_2', ascending=False)
dec = merged[(merged['growth_rate'] < 0.8) & (merged['support_2'] > 0)].sort_values(by='growth_rate')

print(f"Itemsets comparados: {len(merged)}")
print(f"JEPs: {len(jep)} | Incrementan: {len(inc)} | Estables: {len(stable)} | Decrecen: {len(dec)}")

merged.head()


## Visualizaciones de patrones


In [None]:
def iset_to_str(s):
    """Render an itemset as a comma-separated label with its items sorted.

    Args:
        s: An iterable of strings, e.g. a frozenset of item names.

    Returns:
        ``', '.join(sorted(s))``, or ``str(s)`` when *s* cannot be sorted and
        joined as strings (not iterable, or holding non-string or mutually
        unorderable items).
    """
    try:
        return ', '.join(sorted(s))
    except TypeError:
        # Only a type mismatch is an expected, recoverable case here; any other
        # exception points to a real problem and is allowed to surface.
        return str(s)

# Bar chart of the first ten rows of `inc` (finite growth rates only).
top_inc = inc.head(10).assign(
    item=lambda frame: frame['itemsets'].apply(iset_to_str)
)

fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(top_inc['item'], top_inc['growth_rate'], color='teal')
# Tilt the item labels and right-align them.
plt.setp(ax.get_xticklabels(), rotation=60, ha='right')
ax.set_ylabel('Growth rate (H2/H1)')
ax.set_title('Top 10 Emerging Patterns (sin JEP)')
fig.tight_layout()
plt.show()

# One horizontal bar chart per partition with its first ten itemsets.
partition_itemsets = {"H1": freq_p1, "H2": freq_p2}
for name, freq_df in partition_itemsets.items():
    top_support = freq_df.head(10).assign(
        item=lambda frame: frame['itemsets'].apply(iset_to_str)
    )
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.barplot(x=top_support['support'], y=top_support['item'], palette='crest', ax=ax)
    ax.set_title(f'Top 10 itemsets por soporte - {name}')
    ax.set_xlabel('Support')
    fig.tight_layout()
    plt.show()


## Exportar resultados


In [None]:
# Write every result table as CSV (without the index) into RESULTS_DIR.
exports = {
    'frequent_itemsets_all.csv': freq_all,
    'frequent_itemsets_h1.csv': freq_p1,
    'frequent_itemsets_h2.csv': freq_p2,
    'association_rules_all.csv': rules,
    'growth_rates_h1_vs_h2.csv': merged,
}
for filename, table in exports.items():
    table.to_csv(RESULTS_DIR / filename, index=False)

# Report the directory actually written to, so the message cannot drift from
# the RESULTS_DIR setting.
print(f"Archivos guardados en {RESULTS_DIR.as_posix()}/")
