# CASEN 2017–2022 Data Exploration
Objective: Inspect variables, confirm structure, and select features for regional aggregation.


In [1]:
import os
import pandas as pd
import numpy as np

# Display settings for the wide survey frames: limit how many columns are
# rendered and widen the text output.
display_options = {
    "display.max_columns": 50,
    "display.width": 120,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Record the pandas version alongside the notebook outputs.
print("Pandas version:", pd.__version__)


Pandas version: 2.3.3


In [2]:
# Raw input files, given as paths relative to the current working directory.
path_casen_2022 = "../data/raw/Base de datos Casen 2022 STATA_18 marzo 2024.dta"
path_casen_2017_factor = "../data/raw/Casen2017_factorCenso2017.dta"

# Report whether each file is present before any attempt to read it.
for label, raw_path in (
    ("CASEN 2022 existe:", path_casen_2022),
    ("Factor CASEN 2017 existe:", path_casen_2017_factor),
):
    print(label, os.path.exists(raw_path))

CASEN 2022 existe: True
Factor CASEN 2017 existe: True


In [3]:
# Read iteratively and take only the first rows of each file, to avoid
# crashing on the full datasets.
# Each reader is opened as a context manager so its file handle is released as
# soon as the sample has been taken, instead of staying open for the rest of
# the session.
SAMPLE_ROWS = 5000  # number of leading rows pulled from each file

with pd.read_stata(
    path_casen_2022,
    iterator=True,
    convert_categoricals=False,
) as reader_2022:
    casen_2022_sample = reader_2022.get_chunk(SAMPLE_ROWS)

with pd.read_stata(
    path_casen_2017_factor,
    iterator=True,
    convert_categoricals=False,
) as reader_2017:
    casen_2017_sample = reader_2017.get_chunk(SAMPLE_ROWS)

# NOTE: both readers are closed at this point; re-open the file with
# pd.read_stata(...) to read further rows.

print("=== CASEN 2022 SAMPLE ===")
casen_2022_sample.info()
display(casen_2022_sample.head())

print("\n=== FACTOR CASEN 2017 SAMPLE ===")
casen_2017_sample.info()
display(casen_2017_sample.head())


One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  casen_2022_sample = reader_2022.get_chunk(5000)


=== CASEN 2022 SAMPLE ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Columns: 918 entries, id_vivienda to allega_int
dtypes: datetime64[ns](1), float64(603), int16(9), int32(120), int8(152), object(33)
memory usage: 27.4+ MB


Unnamed: 0,id_vivienda,folio,id_persona,region,area,cod_upm,nse,estrato,hogar,expr,expr_osig,varstrat,varunit,fecha_entrev,p1,p2,p3,p4,p9,p10,p11,tot_per_h,h1,edad,mes_nac_nna,...,asiste,educ,depen,activ,asal,contrato,cotiza,lugar_nac,pueblos_indigenas,n_ocupados,n_desocupados,n_inactivos,conyuge_jh,numper,numnuc,men18c,may60c,tipohogar,tot_hog,ind_hacina,indsan,ten_viv,ten_viv_f,allega_ext,allega_int
0,1000901,100090101,1,16,2,10009,4,1630324,1,43,54.0,751,12041,2023-01-28,1,2,4,4,3,1.0,,3,1,72,,...,2,1,,3.0,,,0.0,0,0,2,0,1,1,3,1,0,1,3,1,1,1,1,1,0,0
1,1000901,100090101,2,16,2,10009,4,1630324,1,43,,751,12041,2023-01-28,1,2,4,4,3,1.0,,3,1,67,,...,2,1,,1.0,0.0,,1.0,0,0,2,0,1,1,3,1,0,1,3,1,1,1,1,1,0,0
2,1000901,100090101,3,16,2,10009,4,1630324,1,44,122.0,751,12041,2023-01-28,1,2,4,4,3,1.0,,3,1,40,,...,2,8,,1.0,1.0,-88.0,1.0,0,0,2,0,1,1,3,1,0,1,3,1,1,1,1,1,0,0
3,1000902,100090201,1,16,2,10009,4,1630324,1,51,,751,12041,2022-12-29,1,2,4,4,4,1.0,,4,1,56,,...,2,-88,,3.0,,,0.0,0,0,0,0,3,1,4,2,1,1,5,1,1,2,1,1,0,1
4,1000902,100090201,2,16,2,10009,4,1630324,1,51,131.0,751,12041,2022-12-29,1,2,4,4,4,1.0,,4,1,25,,...,2,5,,3.0,,,1.0,0,0,0,0,3,1,4,2,1,1,5,1,1,2,1,1,0,1



=== FACTOR CASEN 2017 SAMPLE ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   folio       5000 non-null   float64
 1   o           5000 non-null   int8   
 2   expr_C2017  5000 non-null   float32
dtypes: float32(1), float64(1), int8(1)
memory usage: 63.6 KB


Unnamed: 0,folio,o,expr_C2017
0,110110000000.0,1,39.0
1,110110000000.0,1,39.0
2,110110000000.0,1,39.0
3,110110000000.0,2,39.0
4,110110000000.0,1,39.0


In [4]:
# Column inventory of the 2022 sample.
cols = list(casen_2022_sample.columns)
# Lower-case every name once so the pattern loop below does not repeat it.
cols_lowered = [(name, name.lower()) for name in cols]

print("Total columnas CASEN 2022:", len(cols))
print("\nPrimeras 50 columnas:")
print(cols[:50])

# Case-insensitive substring probes; a column is reported under every pattern
# that occurs anywhere in its name.
patterns = ["region", "reg", "comuna", "com", "exp", "expr", "pobr", "pobre", "ing", "educ", "edad", "hog", "tam"]

for pattern in patterns:
    matches = []
    for name, lowered in cols_lowered:
        if pattern in lowered:
            matches.append(name)
    print(f"\nPatrón: '{pattern}' → {len(matches)} columnas encontradas")
    # Cap the listing at 30 names to keep the output readable.
    print(matches[:30])


Total columnas CASEN 2022: 918

Primeras 50 columnas:
['id_vivienda', 'folio', 'id_persona', 'region', 'area', 'cod_upm', 'nse', 'estrato', 'hogar', 'expr', 'expr_osig', 'varstrat', 'varunit', 'fecha_entrev', 'p1', 'p2', 'p3', 'p4', 'p9', 'p10', 'p11', 'tot_per_h', 'h1', 'edad', 'mes_nac_nna', 'ano_nac_nna', 'sexo', 'pco1_a', 'pco1_b', 'pco1', 'h5_cp', 'h5_sp', 'h5_b1_1', 'h5_b1_2', 'h5a_2', 'h5_b2_1', 'h5_b2_2', 'h5a_3', 'h5_b3_1', 'h5_b3_2', 'h5a_4', 'h5b', 'ecivil', 'h5_10', 'h5_1a', 'h5_1b', 'h5_20', 'h5_2', 'n_nucleos', 'nucleo']

Patrón: 'region' → 1 columnas encontradas
['region']

Patrón: 'reg' → 61 columnas encontradas
['region', 'e6d_preg', 'y3a_preg', 'y3b_preg', 'y3c_preg', 'y3d_preg', 'y3e_preg', 'y3f_preg', 'y4a_preg', 'y4b_preg', 'y4c_preg', 'y4d_preg', 'y5a_preg', 'y5b_preg', 'y5c_preg', 'y5d_preg', 'y5e_preg', 'y5f_preg', 'y5g_preg', 'y5h_preg', 'y5i_preg', 'y5j_preg', 'y5k_preg', 'y5l_preg', 'y11_preg', 'y12a_preg', 'y12b_preg', 'y13a_preg', 'y13b_preg', 'y13c_preg']


In [5]:
# Inspect candidate income variables (adjust the patterns to refine the search).
# A column qualifies when its name contains "ing" or starts with "y".
# The "y" test is anchored to the start of the name: a plain substring test
# also selects unrelated columns that merely contain the letter, e.g. 'may60c'.
income_candidates = [
    c for c in casen_2022_sample.columns
    if "ing" in c.lower() or c.lower().startswith("y")
]

print("Posibles variables de ingreso (primeras 50):")
print(income_candidates[:50])


Posibles variables de ingreso (primeras 50):
['y1', 'y2_dias', 'y2_hrs', 'y3a_preg', 'y3b_preg', 'y3c_preg', 'y3d_preg', 'y3e_preg', 'y3f_preg', 'y3a', 'y3ap', 'y3b', 'y3bp', 'y3c', 'y3cp', 'y3d', 'y3dp', 'y3e', 'y3ep', 'y3f_esp', 'y3f', 'y3fp', 'y4a_preg', 'y4b_preg', 'y4c_preg', 'y4d_preg', 'y4a', 'y4b', 'y4c', 'y4d_esp', 'y4d', 'y5a_preg', 'y5b_preg', 'y5c_preg', 'y5d_preg', 'y5e_preg', 'y5f_preg', 'y5g_preg', 'y5h_preg', 'y5i_preg', 'y5j_preg', 'y5k_preg', 'y5l_preg', 'y5a', 'y5b', 'y5c', 'y5d', 'y5e', 'y5f', 'y5g']
