# EDA: RAW Pokémon data from MongoDB

Exploratory Data Analysis (EDA) sobre los datos RAW de Pokémon guardados en MongoDB por la etapa EXTRACT.

- Origen de datos: colección `pokemon_raw` en MongoDB.
- Estadísticas: mínimo, máximo, media y mediana para `height`, `weight`, `base_experience`.
- Visualización: top 10 de tipos primarios (gráfica de barras).


In [None]:
# Configuración y conexión a MongoDB
from dotenv import load_dotenv
from pymongo import MongoClient
import os

# Pull settings from a project-level .env file when one exists; any variable
# that is still unset falls back to the default given here.
load_dotenv()
MONGO_URI = os.environ.get('MONGO_URI', 'mongodb://localhost:27017')
MONGO_DB = os.environ.get('MONGO_DB', 'etl_demo')
MONGO_RAW_COLLECTION = os.environ.get('MONGO_RAW_COLLECTION', 'pokemon_raw')

# Open the client, then resolve the database and the raw collection handle.
client = MongoClient(MONGO_URI)
db = client.get_database(MONGO_DB)
raw_col = db.get_collection(MONGO_RAW_COLLECTION)
raw_col  # trailing bare expression so the cell displays the collection


In [None]:
# Cargar datos necesarios (solo campos usados en el EDA)
from typing import Dict, List, Optional
from collections import Counter

# Fetch every document, projecting only the fields this EDA reads
# (and dropping Mongo's `_id`).
docs = list(
    raw_col.find(
        filter={},
        projection={
            '_id': 0,
            'types': 1,
            'height': 1,
            'weight': 1,
            'base_experience': 1,
        },
    )
)
total = len(docs)
total


In [None]:
# Funciones auxiliares para estadísticas y tipo primario
from statistics import mean, median

def safe_numbers(docs: List[Dict], key: str) -> List[float]:
    """Collect the numeric values stored under ``key`` across ``docs``.

    Documents where the key is missing or holds a non-numeric value
    (``None``, strings, ...) are skipped. Booleans are skipped as well:
    ``bool`` is a subclass of ``int``, so a plain
    ``isinstance(v, (int, float))`` check would let ``True``/``False``
    leak into the statistics as 1/0.

    Args:
        docs: Documents (dicts) to scan.
        key: Field name to read from each document.

    Returns:
        The int/float values found, in document order.
    """
    candidates = (doc.get(key) for doc in docs)
    return [
        v for v in candidates
        if isinstance(v, (int, float)) and not isinstance(v, bool)
    ]

def summary(values):
    """Describe a collection of numbers with a few basic statistics.

    Args:
        values: Sized collection of numbers (e.g. a list).

    Returns:
        ``{'count': 0}`` when ``values`` is empty; otherwise a dict with
        ``count``, ``min``, ``max``, ``mean`` (rounded to 2 decimals) and
        ``median`` (not rounded), in that key order.
    """
    stats = {'count': 0}
    if values:
        stats.update(
            count=len(values),
            min=min(values),
            max=max(values),
            mean=round(mean(values), 2),
            median=median(values),
        )
    return stats

def primary_type(doc: Dict) -> Optional[str]:
    """Return the name of the type entry with the lowest ``slot`` in ``doc``.

    Reads ``doc['types']``, a list of dicts that may carry a ``slot`` number
    and a nested ``type`` dict with a ``name``. Entries without a ``slot``
    (or with ``slot`` explicitly ``None``) are ranked last, as if their slot
    were 99. On ties the earliest entry in the list wins.

    Args:
        doc: Raw document to inspect.

    Returns:
        The ``type.name`` of the selected entry, or ``None`` when the
        document has no types or the selected entry has no usable
        ``type``/``name``.
    """
    types: List[Dict] = doc.get('types') or []
    if not types:
        return None

    def slot_rank(entry: Dict):
        # An explicit None cannot be compared with ints; treat it as missing.
        slot = entry.get('slot')
        return 99 if slot is None else slot

    # min() returns the first minimal element, which matches taking the head
    # of a stable sort without sorting the whole list.
    first = min(types, key=slot_rank)
    # `or {}` also covers a 'type' key that is present but set to None.
    return (first.get('type') or {}).get('name')


In [None]:
# Simple statistics: extract each numeric field, then summarise it.
heights, weights, base_exp = (
    safe_numbers(docs, field)
    for field in ('height', 'weight', 'base_experience')
)

height_sum, weight_sum, base_exp_sum = map(
    summary, (heights, weights, base_exp)
)

print('Total RAW Pokémon:', total)
print('Height summary:', height_sum)
print('Weight summary:', weight_sum)
print('Base experience summary:', base_exp_sum)


In [None]:
# Distribution of primary types (top 10).
# Documents for which no primary type could be resolved are left out.
types = [name for name in map(primary_type, docs) if name]
counter = Counter(types)
counter.most_common(10)


In [None]:
# Gráfica de barras (inline)
import matplotlib.pyplot as plt
top = counter.most_common(10)
if not top:
    # Nothing was counted, so there is nothing to draw.
    print('No hay tipos para graficar.')
else:
    # Split the (type, count) pairs into two parallel tuples for plotting.
    labels, values = zip(*top)
    plt.figure(figsize=(8, 5))
    plt.bar(labels, values, color='#3B82F6')
    plt.title('Top primary types (count)')
    plt.xlabel('Type')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()


## Notas
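- Asegúrate de que MongoDB esté corriendo. El archivo `.env` es opcional: si no define `MONGO_URI`, `MONGO_DB` o `MONGO_RAW_COLLECTION`, se usan los valores por defecto (`mongodb://localhost:27017`, `etl_demo` y `pokemon_raw`).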
- Este EDA es intencionalmente simple y claro para docencia.
