# W02 – SQL esencial I en DuckDB (SELECT/WHERE/GROUP BY/NULLs)

## Conexión con DDIA
- **DDIA Cap. 2**: modelos de datos y lenguajes de consulta (SQL como herramienta central).
- Aquí convertimos *preguntas* en *consultas* sobre un dataset real (NASA Exoplanet Archive).

## Prerrequisitos
- Haber hecho W01A y W01B (o al menos tener Python + dependencias instaladas).
- Si no tienes `data/raw/pscomppars.csv`, el notebook lo descargará.

## Objetivos
- Crear una **vista** `raw_ps` desde un CSV.
- Usar SQL básico: `SELECT`, `WHERE`, `ORDER BY`, `LIMIT`, `GROUP BY`, `HAVING`.
- Entender `NULL`: `COUNT(*)` vs `COUNT(col)` y `COALESCE`.

## Checklist de evidencias
- [ ] Output de: `SELECT count(*) FROM raw_ps`
- [ ] 6 consultas resueltas en la sección **TU TURNO**
- [ ] 10 consultas adicionales (tarea) guardadas al final


In [1]:
import os
os.chdir("..")
os.getcwd()

'c:\\Users\\nancy\\Documents\\Ingenieria_datos'

In [3]:
# Setup común (cross-platform)
import sys, subprocess
from pathlib import Path
import duckdb

DB_PATH = Path("data/exoplanets.duckdb")
DB_PATH.parent.mkdir(parents=True, exist_ok=True)
con = duckdb.connect(str(DB_PATH))

def run_module(mod: str, *args: str):
    cmd = [sys.executable, "-m", mod, *args]
    print("Running:", " ".join(cmd))
    subprocess.check_call(cmd)

raw_csv = Path("data/raw/pscomppars.csv")
if not raw_csv.exists():
    # Para clase: descarga razonable. Quita --limit si quieres el subset completo.
    run_module("src.ingest.download_exoplanets", "--format", "csv")

# DuckDB no permite parámetros preparados en DDL (ej. CREATE VIEW).
# Insertamos la ruta como literal SQL, escapando comillas simples.
def sql_quote(s: str) -> str:
    return "'" + s.replace("'", "''") + "'"

raw_csv_abs = raw_csv.resolve()
con.execute(
    f"CREATE OR REPLACE VIEW raw_ps AS SELECT * FROM read_csv_auto({sql_quote(raw_csv_abs.as_posix())})"
)
con.execute("SELECT count(*) AS n_rows FROM raw_ps").fetchall()


[(6107,)]

## DEMO

In [4]:
# DEMO 1: inspección rápida
con.sql("DESCRIBE raw_ps").show()


┌─────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│   column_name   │ column_type │  null   │   key   │ default │  extra  │
│     varchar     │   varchar   │ varchar │ varchar │ varchar │ varchar │
├─────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ pl_name         │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ hostname        │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ discoverymethod │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ disc_year       │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ sy_snum         │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ sy_pnum         │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ sy_dist         │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ ra              │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ dec             │ DOUBLE      │ YES     │ NULL    │ NULL    │ NULL    │
│ pl_orbper       │ DOUBLE      │ YES 

In [5]:
# DEMO 2: SELECT + LIMIT (muestra pequeña)
con.sql("""
SELECT pl_name, hostname, discoverymethod, disc_year
FROM raw_ps
LIMIT 10
""").show()


┌─────────────────────────┬───────────────────────┬─────────────────┬───────────┐
│         pl_name         │       hostname        │ discoverymethod │ disc_year │
│         varchar         │        varchar        │     varchar     │   int64   │
├─────────────────────────┼───────────────────────┼─────────────────┼───────────┤
│ 11 Com b                │ 11 Com                │ Radial Velocity │      2007 │
│ 11 UMi b                │ 11 UMi                │ Radial Velocity │      2009 │
│ 14 And b                │ 14 And                │ Radial Velocity │      2008 │
│ 14 Her b                │ 14 Her                │ Radial Velocity │      2002 │
│ 16 Cyg B b              │ 16 Cyg B              │ Radial Velocity │      1996 │
│ 17 Sco b                │ 17 Sco                │ Radial Velocity │      2020 │
│ 18 Del b                │ 18 Del                │ Radial Velocity │      2008 │
│ 1RXS J160929.1-210524 b │ 1RXS J160929.1-210524 │ Imaging         │      2008 │
│ 24 Boo b      

In [6]:
# DEMO 3: WHERE + ORDER BY (evita NULL)
con.sql("""
SELECT pl_name, pl_orbper, pl_rade
FROM raw_ps
WHERE pl_orbper IS NOT NULL
ORDER BY pl_orbper ASC
LIMIT 10
""").show()


┌──────────────────┬─────────────┬─────────────┐
│     pl_name      │  pl_orbper  │   pl_rade   │
│     varchar      │   double    │   double    │
├──────────────────┼─────────────┼─────────────┤
│ PSR J1719-1438 b │ 0.090706293 │        NULL │
│ ZTF J1828+2308 b │   0.1120067 │ 11.13051784 │
│ M62H b           │ 0.132935028 │        NULL │
│ KOI-1843.03      │   0.1768913 │        0.61 │
│ K2-137 b         │    0.179719 │        0.64 │
│ KIC 10001893 b   │      0.2197 │        NULL │
│ ZTF J1230-2655 b │  0.23597766 │ 13.78704626 │
│ TOI-6255 b       │  0.23818244 │       1.079 │
│ KOI-55 b         │    0.240104 │       0.759 │
│ TOI-6324 b       │    0.279221 │       1.059 │
├──────────────────┴─────────────┴─────────────┤
│ 10 rows                            3 columns │
└──────────────────────────────────────────────┘



In [7]:
# DEMO 4: NULLs — COUNT(*) vs COUNT(col)
con.sql("""
SELECT
  COUNT(*)                    AS total_rows,
  COUNT(pl_rade)              AS non_null_radius,
  COUNT(*) - COUNT(pl_rade)   AS null_radius
FROM raw_ps
""").show()


┌────────────┬─────────────────┬─────────────┐
│ total_rows │ non_null_radius │ null_radius │
│   int64    │      int64      │    int64    │
├────────────┼─────────────────┼─────────────┤
│       6107 │            6057 │          50 │
└────────────┴─────────────────┴─────────────┘



In [8]:
# DEMO 5: GROUP BY + agregados
con.sql("""
SELECT
  discoverymethod,
  COUNT(*) AS n_planets,
  AVG(pl_rade) AS avg_radius_earth
FROM raw_ps
GROUP BY 1
ORDER BY n_planets DESC
LIMIT 10
""").show()


┌───────────────────────────────┬───────────┬────────────────────┐
│        discoverymethod        │ n_planets │  avg_radius_earth  │
│            varchar            │   int64   │       double       │
├───────────────────────────────┼───────────┼────────────────────┤
│ Transit                       │      4501 │  4.368151792099998 │
│ Radial Velocity               │      1166 │  9.759872661948641 │
│ Microlensing                  │       266 │  9.850563909774435 │
│ Imaging                       │        92 │ 15.612215793749998 │
│ Transit Timing Variations     │        39 │  6.493408364210528 │
│ Eclipse Timing Variations     │        17 │ 12.893333333333338 │
│ Orbital Brightness Modulation │         9 │            9.64504 │
│ Pulsar Timing                 │         8 │  5.411333333333334 │
│ Astrometry                    │         6 │ 12.450000000000001 │
│ Pulsation Timing Variations   │         2 │              12.75 │
├───────────────────────────────┴───────────┴─────────────────

In [9]:
# DEMO 6: HAVING (filtra grupos después de agrupar)
con.sql("""
SELECT
  disc_year,
  COUNT(*) AS n
FROM raw_ps
WHERE disc_year IS NOT NULL
GROUP BY 1
HAVING COUNT(*) >= 200
ORDER BY disc_year ASC
""").show()


┌───────────┬───────┐
│ disc_year │   n   │
│   int64   │ int64 │
├───────────┼───────┤
│      2014 │   869 │
│      2016 │  1496 │
│      2018 │   315 │
│      2020 │   234 │
│      2021 │   564 │
│      2022 │   369 │
│      2023 │   324 │
│      2024 │   259 │
│      2025 │   240 │
└───────────┴───────┘



### Resumen
- `GROUP BY` cambia el grano: ya no son planetas, son **grupos**.
- `COUNT(*)` cuenta filas; `COUNT(col)` ignora `NULL`.
- `HAVING` filtra **después** de agrupar (a diferencia de `WHERE`).

---

## TU TURNO (práctica guiada)
Resuelve estas consultas. Pega el output (al menos las primeras filas) en cada celda.


### 1) ¿Cuántos planetas hay por año? (top 15 años con más planetas)

In [12]:
# TODO (1): ¿Cuántos planetas hay por año? (top 15 años con más planetas)
# Pistas: usa disc_year, filtra IS NOT NULL, GROUP BY, ORDER BY n DESC, LIMIT 15
query = """
SELECT disc_year, COUNT(*) AS n_planets
FROM raw_ps
WHERE disc_year IS NOT NULL
GROUP BY disc_year
ORDER BY n_planets DESC
LIMIT 15
"""
con.execute(query).fetchall()

[(2016, 1496),
 (2014, 869),
 (2021, 564),
 (2022, 369),
 (2023, 324),
 (2018, 315),
 (2024, 259),
 (2025, 240),
 (2020, 234),
 (2019, 196),
 (2015, 155),
 (2017, 152),
 (2012, 139),
 (2011, 135),
 (2013, 128)]

### 2) Top 10 sistemas (hostname) con más planetas

In [13]:
# TODO (2): Top 10 sistemas (hostname) con más planetas
# Pistas: GROUP BY hostname, cuenta filas, ORDER BY DESC, LIMIT 10
query = """
SELECT hostname, COUNT(*) AS n_planets
FROM raw_ps
WHERE hostname IS NOT NULL
GROUP BY hostname
ORDER BY n_planets DESC
LIMIT 10
"""
con.execute(query).fetchall()

[('KOI-351', 8),
 ('TRAPPIST-1', 7),
 ('TOI-178', 6),
 ('HD 191939', 6),
 ('HIP 41378', 6),
 ('TOI-1136', 6),
 ('Kepler-11', 6),
 ('Kepler-80', 6),
 ('K2-138', 6),
 ('HD 110067', 6)]

### 3) ¿Qué fracción de filas tiene `pl_bmasse` nulo?

In [14]:
# TODO (3): ¿Qué fracción de filas tiene pl_bmasse nulo?
# Pistas: COUNT(*) total, COUNT(pl_bmasse) non_null, nulls = total - non_null
#       fracción = nulls / total (convierte a DOUBLE)
query = """
SELECT
    COUNT(*) AS total,
    COUNT(pl_bmasse) AS non_null,
    COUNT(*) - COUNT(pl_bmasse) AS nulls,
    ROUND((COUNT(*) - COUNT(pl_bmasse))::DOUBLE / COUNT(*), 4) AS frac_null
FROM raw_ps
"""
con.execute(query).fetchall()

[(6107, 6076, 31, 0.0051)]

### 4) 10 planetas con mayor radio (pl_rade) (evita NULL)

In [15]:
# TODO (4): 10 planetas con mayor radio (pl_rade) (evita NULL)
# Pistas: WHERE pl_rade IS NOT NULL, ORDER BY pl_rade DESC, LIMIT 10
query = """
SELECT pl_name, hostname, pl_rade
FROM raw_ps
WHERE pl_rade IS NOT NULL
ORDER BY pl_rade DESC
LIMIT 10
"""
con.execute(query).fetchall()

[('V2376 Ori b', 'V2376 Ori', 87.20586985),
 ('HD 100546 b', 'HD 100546', 77.3421),
 ('GQ Lup b', 'GQ Lup', 33.6),
 ('Kepler-297 d', 'Kepler-297', 32.6),
 ('PDS 70 b', 'PDS 70', 30.48848),
 ('DH Tau b', 'DH Tau', 30.2643),
 ('Kepler-1979 b', 'Kepler-1979', 29.33),
 ('TOI-1408 b', 'TOI-1408', 25.0),
 ('CT Cha b', 'CT Cha', 24.66),
 ('HAT-P-67 b', 'HAT-P-67', 23.9872187)]

### 5) Compara `COUNT(*)` vs `COUNT(disc_year)` por método

In [17]:
# TODO (5): Compara COUNT(*) vs COUNT(disc_year) por método
# Pistas: GROUP BY discoverymethod, calcula total y non_null_year = COUNT(disc_year)
query = """
SELECT
    discoverymethod,
    COUNT(*) AS total,
    COUNT(disc_year) AS non_null_year,
    COUNT(*) - COUNT(disc_year) AS null_year
FROM raw_ps
GROUP BY discoverymethod
ORDER BY total DESC
"""
con.execute(query).fetchall()

[('Transit', 4501, 4500, 1),
 ('Radial Velocity', 1166, 1166, 0),
 ('Microlensing', 266, 266, 0),
 ('Imaging', 92, 92, 0),
 ('Transit Timing Variations', 39, 39, 0),
 ('Eclipse Timing Variations', 17, 17, 0),
 ('Orbital Brightness Modulation', 9, 9, 0),
 ('Pulsar Timing', 8, 8, 0),
 ('Astrometry', 6, 6, 0),
 ('Pulsation Timing Variations', 2, 2, 0),
 ('Disk Kinematics', 1, 1, 0)]

### 6) Resumen: por método, n_planets y mediana de periodo orbital

In [16]:
# TODO (6): Resumen por método: n_planets y mediana del periodo orbital
# Pistas: MEDIAN(pl_orbper) (filtra NULL si aplica), GROUP BY discoverymethod
query = """
SELECT
    discoverymethod,
    COUNT(*) AS n_planets,
    MEDIAN(pl_orbper) AS median_period_days
FROM raw_ps
WHERE pl_orbper IS NOT NULL
GROUP BY discoverymethod
ORDER BY n_planets DESC
"""
con.execute(query).fetchall()

[('Transit', 4501, 8.15872),
 ('Radial Velocity', 1166, 298.895),
 ('Transit Timing Variations', 39, 30.0),
 ('Imaging', 25, 33000.0),
 ('Eclipse Timing Variations', 17, 3160.0),
 ('Microlensing', 12, 3142.5),
 ('Orbital Brightness Modulation', 9, 0.81161),
 ('Pulsar Timing', 7, 25.262),
 ('Astrometry', 6, 334.76),
 ('Pulsation Timing Variations', 2, 1005.0)]

## Para entregar (tarea)
1) **4 consultas adicionales** (tú decides las preguntas), pero deben incluir:
   - 2 consultas de calidad (nulos, rangos, duplicados, outliers simples)
   - 2 consultas científicas: pregunta + 1–2 líneas de interpretación 

2) En `docs/decisions_log.md`: 1 decisión de hoy (con evidencia: conteos o query).

## Reflexión (bitácora)
- ¿Qué consulta te pareció más difícil y por qué?
- Si el dataset creciera 100×, ¿qué consultas crees que empeoran más?


**Entrega sugerida:** crea `docs/w02a_sql_practice.md` y pega tus 6 respuestas (1–6) + 4 consultas adicionales tuyas con resultados.


In [19]:
# Calidad 1: ¿Hay planetas con pl_rade < 0 o pl_orbper <= 0 (valores físicamente imposibles)?
query = """
SELECT COUNT(*) AS outliers
FROM raw_ps
WHERE pl_rade < 0 OR pl_orbper <= 0
"""
con.execute(query).fetchall()

[(0,)]

In [23]:
# Calidad 2: Duplicados por nombre de planeta
query = """
SELECT 
    pl_name,
    COUNT(*) AS n_duplicates,
    COUNT(DISTINCT hostname) AS n_different_hosts
FROM raw_ps
WHERE pl_name IS NOT NULL
GROUP BY pl_name
HAVING COUNT(*) > 1
ORDER BY n_duplicates DESC
"""
con.execute(query).fetchall()

[]

In [26]:
# Científica 1: Zonas habitables
query = """
SELECT 
    pl_name,
    hostname,
    pl_eqt AS temp_equilibrium,
    pl_rade AS radius_earth,
    sy_dist AS distance_parsec,
    ABS(pl_eqt - 288) AS temp_diff_from_earth
FROM raw_ps
WHERE pl_eqt BETWEEN 200 AND 350  
    AND pl_rade BETWEEN 0.5 AND 2.0  
    AND pl_eqt IS NOT NULL
ORDER BY temp_diff_from_earth ASC
LIMIT 20
"""
con.execute(query).fetchall()

[('Kepler-438 b', 'Kepler-438', 288.0, 1.12, 195.94287175, 0.0),
 ('TOI-1266 d', 'TOI-1266', 288.0, 1.74, 36.0118, 0.0),
 ('TRAPPIST-1 d', 'TRAPPIST-1', 286.2, 0.788, 12.42988881, 1.8000000000000114),
 ('Kepler-1410 b', 'Kepler-1410', 290.0, 1.78, 367.0, 2.0),
 ('L 98-59 f', 'L 98-59', 285.0, 1.48, 10.6194, 3.0),
 ('Kepler-138 e', 'Kepler-138', 292.0, 0.797, 66.8624, 4.0),
 ('Proxima Cen d', 'Proxima Cen', 282.0, 0.692, 1.30119, 6.0),
 ('K2-133 e', 'K2-133', 296.0, 1.73, 75.1703, 8.0),
 ('TOI-2095 c', 'TOI-2095', 297.0, 1.33, 41.9176, 9.0),
 ('Kepler-737 b', 'Kepler-737', 298.0, 1.96, 205.118, 10.0),
 ('Kepler-560 b', 'Kepler-560', 298.0, 1.72, 109.308, 10.0),
 ('Kepler-69 c', 'Kepler-69', 299.0, 1.71, 730.625, 11.0),
 ("Teegarden's Star b", "Teegarden's Star", 277.0, 1.05, 3.83078, 11.0),
 ('Kepler-1389 b', 'Kepler-1389', 300.0, 1.77, 497.093, 12.0),
 ('GJ 1132 c', 'GJ 1132', 300.0, 1.43, 12.613, 12.0),
 ('Ross 128 b', 'Ross 128', 301.0, 1.11, 3.37454, 13.0),
 ('Kepler-296 f', 'Kepler

In [25]:
# Científica 2: ¿Los planetas más masivos tienen periodos orbitales más largos?
query = """
SELECT
    CASE
        WHEN pl_bmasse < 10   THEN 'Terrestre (<10)'
        WHEN pl_bmasse < 100  THEN 'Neptuniano (10-100)'
        ELSE 'Gigante (>100)'
    END AS mass_class,
    COUNT(*) AS n,
    MEDIAN(pl_orbper) AS median_period_days
FROM raw_ps
WHERE pl_bmasse IS NOT NULL AND pl_orbper IS NOT NULL
GROUP BY 1
ORDER BY median_period_days
"""
# Interpretación: los gigantes tienden a tener periodos más largos (orbitan más lejos).
con.execute(query).fetchall()

[('Terrestre (<10)', 3124, 9.22686525),
 ('Neptuniano (10-100)', 1071, 16.537944),
 ('Gigante (>100)', 1561, 42.6318)]