In [2]:
import polars as pl
def revisar_base(df: pl.DataFrame, nombre: str, filas_preview: int = 5):
    """Print a quick data-quality report for a Polars DataFrame.

    The report contains the row and column counts, the schema, a per-column
    summary of missing values (nulls for every column, NaN for Float32 and
    Float64 columns) and, when a ``patent_id`` column exists and contains
    nulls, a preview of the affected rows.

    Args:
        df: Frame to inspect.
        nombre: Label shown in the report header.
        filas_preview: Maximum number of rows with a null ``patent_id`` to print.

    Returns:
        None. Everything is written to stdout.
    """
    total_filas = df.height
    total_columnas = df.width
    # Read the schema once instead of re-reading it for every column below.
    schema = df.schema

    print(f"\n📦 Revisión de: {nombre}")
    print(f"Total de filas: {total_filas}")
    print(f"Total de columnas: {total_columnas}")
    print("Tipos de datos:")
    print(schema)

    # Count nulls and NaN per column.
    resumen = []
    nulos_por_columna = {}
    for col in df.columns:
        dtype = schema[col]
        n_null = df.select(pl.col(col).is_null().sum()).item()
        # NaN is only checked for Float32/Float64 columns.
        if dtype in (pl.Float32, pl.Float64):
            n_nan = df.select(pl.col(col).is_nan().sum()).item()
        else:
            n_nan = 0
        total_missing = n_null + n_nan
        # Guard against division by zero on an empty frame.
        pct = (total_missing / total_filas) * 100 if total_filas > 0 else 0.0

        nulos_por_columna[col] = n_null
        resumen.append({
            "columna": col,
            "tipo": str(dtype),
            "n_null": n_null,
            "n_nan": n_nan,
            "total_nulos": total_missing,
            "pct_nulos": round(pct, 2)
        })

    print("\n📊 Resumen de nulos:")
    print(pl.DataFrame(resumen))

    # Show a few rows with a null patent_id, if the column exists. The null
    # count was already computed in the loop above, so the frame is filtered
    # only when there is something to show.
    n_nulos_id = nulos_por_columna.get("patent_id", 0)
    if n_nulos_id > 0:
        print(f"\n🔍 Filas con patent_id nulo ({n_nulos_id}):")
        print(df.filter(pl.col("patent_id").is_null()).head(filas_preview))


In [8]:

# Naive load of g_patent.tsv with the schema fully inferred by Polars.
# The captured output of this cell shows patent_id inferred as Int64 with
# 869242 nulls (9.58% of rows); presumably non-numeric ids are turned into
# nulls by ignore_errors=True. The next cell reloads the file with patent_id
# read as text.
df_patent_raw = pl.read_csv("g_patent.tsv", separator="\t", ignore_errors=True)
revisar_base(df=df_patent_raw, nombre="g_patent")


📦 Revisión de: g_patent
Total de filas: 9075421
Total de columnas: 8
Tipos de datos:
Schema([('patent_id', Int64), ('patent_type', String), ('patent_date', String), ('patent_title', String), ('wipo_kind', String), ('num_claims', Int64), ('withdrawn', Int64), ('filename', String)])

📊 Resumen de nulos:
shape: (8, 6)
┌──────────────┬────────┬────────┬───────┬─────────────┬───────────┐
│ columna      ┆ tipo   ┆ n_null ┆ n_nan ┆ total_nulos ┆ pct_nulos │
│ ---          ┆ ---    ┆ ---    ┆ ---   ┆ ---         ┆ ---       │
│ str          ┆ str    ┆ i64    ┆ i64   ┆ i64         ┆ f64       │
╞══════════════╪════════╪════════╪═══════╪═════════════╪═══════════╡
│ patent_id    ┆ Int64  ┆ 869242 ┆ 0     ┆ 869242      ┆ 9.58      │
│ patent_type  ┆ String ┆ 0      ┆ 0     ┆ 0           ┆ 0.0       │
│ patent_date  ┆ String ┆ 0      ┆ 0     ┆ 0           ┆ 0.0       │
│ patent_title ┆ String ┆ 0      ┆ 0     ┆ 0           ┆ 0.0       │
│ wipo_kind    ┆ String ┆ 0      ┆ 0     ┆ 0           ┆ 0.0 

In [9]:
# Example for g_patent.tsv.
# patent_id is forced to a string column; the captured output of this cell
# reports 0 nulls for patent_id with this override.
patent_schema_overrides = {"patent_id": pl.Utf8}
df_patent_raw = pl.read_csv(
    "g_patent.tsv",
    separator="\t",
    ignore_errors=True,
    schema_overrides=patent_schema_overrides,
)
revisar_base(df=df_patent_raw, nombre="g_patent")


📦 Revisión de: g_patent
Total de filas: 9075421
Total de columnas: 8
Tipos de datos:
Schema([('patent_id', String), ('patent_type', String), ('patent_date', String), ('patent_title', String), ('wipo_kind', String), ('num_claims', Int64), ('withdrawn', Int64), ('filename', String)])

📊 Resumen de nulos:
shape: (8, 6)
┌──────────────┬────────┬────────┬───────┬─────────────┬───────────┐
│ columna      ┆ tipo   ┆ n_null ┆ n_nan ┆ total_nulos ┆ pct_nulos │
│ ---          ┆ ---    ┆ ---    ┆ ---   ┆ ---         ┆ ---       │
│ str          ┆ str    ┆ i64    ┆ i64   ┆ i64         ┆ f64       │
╞══════════════╪════════╪════════╪═══════╪═════════════╪═══════════╡
│ patent_id    ┆ String ┆ 0      ┆ 0     ┆ 0           ┆ 0.0       │
│ patent_type  ┆ String ┆ 0      ┆ 0     ┆ 0           ┆ 0.0       │
│ patent_date  ┆ String ┆ 0      ┆ 0     ┆ 0           ┆ 0.0       │
│ patent_title ┆ String ┆ 0      ┆ 0     ┆ 0           ┆ 0.0       │
│ wipo_kind    ┆ String ┆ 0      ┆ 0     ┆ 0           ┆ 0.0

In [10]:
# Example for g_cpc_current.tsv.
# NOTE(review): the captured output shows patent_id inferred as Int64 here,
# while g_patent is loaded with patent_id as a string; the key types would
# need to match before joining the two tables. Confirm the intended dtype.
df_cpc_raw = pl.read_csv(
    "g_cpc_current.tsv",
    separator="\t",
    ignore_errors=True,
)
revisar_base(df=df_cpc_raw, nombre="g_cpc_current")



📦 Revisión de: g_cpc_current
Total de filas: 56755723
Total de columnas: 7
Tipos de datos:
Schema([('patent_id', Int64), ('cpc_sequence', Int64), ('cpc_section', String), ('cpc_class', String), ('cpc_subclass', String), ('cpc_group', String), ('cpc_type', String)])

📊 Resumen de nulos:
shape: (7, 6)
┌──────────────┬────────┬────────┬───────┬─────────────┬───────────┐
│ columna      ┆ tipo   ┆ n_null ┆ n_nan ┆ total_nulos ┆ pct_nulos │
│ ---          ┆ ---    ┆ ---    ┆ ---   ┆ ---         ┆ ---       │
│ str          ┆ str    ┆ i64    ┆ i64   ┆ i64         ┆ f64       │
╞══════════════╪════════╪════════╪═══════╪═════════════╪═══════════╡
│ patent_id    ┆ Int64  ┆ 0      ┆ 0     ┆ 0           ┆ 0.0       │
│ cpc_sequence ┆ Int64  ┆ 0      ┆ 0     ┆ 0           ┆ 0.0       │
│ cpc_section  ┆ String ┆ 0      ┆ 0     ┆ 0           ┆ 0.0       │
│ cpc_class    ┆ String ┆ 0      ┆ 0     ┆ 0           ┆ 0.0       │
│ cpc_subclass ┆ String ┆ 0      ┆ 0     ┆ 0           ┆ 0.0       │
│ cpc_gr

In [11]:
# Example for g_assignee_disambiguated.tsv (schema inferred by Polars).
df_assignee_raw = pl.read_csv(
    "g_assignee_disambiguated.tsv",
    separator="\t",
    ignore_errors=True,
)
revisar_base(df=df_assignee_raw, nombre="g_assignee_disambiguated")


📦 Revisión de: g_assignee_disambiguated
Total de filas: 8385078
Total de columnas: 8
Tipos de datos:
Schema([('patent_id', String), ('assignee_sequence', Int64), ('assignee_id', String), ('disambig_assignee_individual_name_first', String), ('disambig_assignee_individual_name_last', String), ('disambig_assignee_organization', String), ('assignee_type', Int64), ('location_id', String)])

📊 Resumen de nulos:
shape: (8, 6)
┌─────────────────────────────────┬────────┬────────┬───────┬─────────────┬───────────┐
│ columna                         ┆ tipo   ┆ n_null ┆ n_nan ┆ total_nulos ┆ pct_nulos │
│ ---                             ┆ ---    ┆ ---    ┆ ---   ┆ ---         ┆ ---       │
│ str                             ┆ str    ┆ i64    ┆ i64   ┆ i64         ┆ f64       │
╞═════════════════════════════════╪════════╪════════╪═══════╪═════════════╪═══════════╡
│ patent_id                       ┆ String ┆ 0      ┆ 0     ┆ 0           ┆ 0.0       │
│ assignee_sequence               ┆ Int64  ┆ 0  

In [12]:
# Load g_location_disambiguated.tsv (schema inferred by Polars) and print its
# data-quality report.
df_location_raw = pl.read_csv(
    "g_location_disambiguated.tsv",
    separator="\t",
    ignore_errors=True,
)
revisar_base(df=df_location_raw, nombre="g_location_disambiguated")


📦 Revisión de: g_location_disambiguated
Total de filas: 96968
Total de columnas: 9
Tipos de datos:
Schema([('location_id', String), ('disambig_city', String), ('disambig_state', String), ('disambig_country', String), ('latitude', Float64), ('longitude', Float64), ('county', String), ('state_fips', String), ('county_fips', String)])

📊 Resumen de nulos:
shape: (9, 6)
┌──────────────────┬─────────┬────────┬───────┬─────────────┬───────────┐
│ columna          ┆ tipo    ┆ n_null ┆ n_nan ┆ total_nulos ┆ pct_nulos │
│ ---              ┆ ---     ┆ ---    ┆ ---   ┆ ---         ┆ ---       │
│ str              ┆ str     ┆ i64    ┆ i64   ┆ i64         ┆ f64       │
╞══════════════════╪═════════╪════════╪═══════╪═════════════╪═══════════╡
│ location_id      ┆ String  ┆ 0      ┆ 0     ┆ 0           ┆ 0.0       │
│ disambig_city    ┆ String  ┆ 0      ┆ 0     ┆ 0           ┆ 0.0       │
│ disambig_state   ┆ String  ┆ 0      ┆ 0     ┆ 0           ┆ 0.0       │
│ disambig_country ┆ String  ┆ 0      