# CASEN 2017–2022 Data Exploration
Objective: Inspect variables, confirm structure, and select features for regional aggregation.


In [None]:
import os
import pandas as pd
import numpy as np

# pyreadstat is an optional dependency: record whether it imported so later
# code can choose between pyreadstat and plain pandas.
try:
    import pyreadstat
    HAS_PYREADSTAT = True
except ImportError:
    HAS_PYREADSTAT = False

# Show up to 100 rows / 100 columns and 120 characters per line in output.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 120)

print('pandas', pd.__version__, '| pyreadstat available:', HAS_PYREADSTAT)

# --- Paths ---
DATA_2017 = './data/raw/Casen2017_factorCenso2017.dta'
DATA_2022 = './data/raw/Base de datos Casen 2022 STATA_18 marzo 2024.dta'

# Fail early with an error that names the missing file(s). A bare `assert`
# carries no message and is skipped entirely under `python -O`.
_missing_files = [p for p in (DATA_2017, DATA_2022) if not os.path.exists(p)]
if _missing_files:
    raise FileNotFoundError(f"CASEN data file(s) not found: {_missing_files}")
print("Files found.")


In [None]:
# --- Helper readers ---
def read_stata_with_meta(path, convert_categoricals=False, chunksize=None):
    """Read a Stata ``.dta`` file and return ``(DataFrame, metadata_dict)``.

    Three read strategies are used:

    * ``chunksize`` given: only the first ``chunksize`` rows are read, via
      pandas' iterator interface. No metadata is returned.
    * ``chunksize`` is None and pyreadstat is available: the whole file is
      read with ``pyreadstat.read_dta`` and the metadata dict carries
      ``'column_labels'`` and ``'value_labels'``.
    * otherwise: the whole file is read with ``pd.read_stata`` and the
      metadata dict is empty.

    Args:
        path: Path of the ``.dta`` file to read.
        convert_categoricals: Forwarded as ``apply_value_formats`` to
            pyreadstat, or as ``convert_categoricals`` to pandas.
        chunksize: If not None, number of rows to read from the start of
            the file.

    Returns:
        A ``(df, meta)`` tuple; ``meta`` is ``{}`` whenever pandas is used.
    """
    if chunksize is not None:
        # Use the reader as a context manager so the underlying file handle
        # is closed once the first chunk has been read.
        with pd.read_stata(
            path, iterator=True, convert_categoricals=convert_categoricals
        ) as reader:
            df = reader.get_chunk(chunksize)
        return df, {}

    if HAS_PYREADSTAT:
        df, meta = pyreadstat.read_dta(path, apply_value_formats=convert_categoricals)
        meta_dict = {
            'column_labels': meta.column_names_to_labels,
            'value_labels': meta.value_labels,
        }
        return df, meta_dict

    df = pd.read_stata(path, convert_categoricals=convert_categoricals)
    return df, {}

def find_cols(df, *keywords):
    """Return the columns of ``df`` whose name contains any of ``keywords``.

    Matching is a case-insensitive substring test. Columns are returned in
    their original order, each at most once. With no keywords the result
    is an empty list.
    """
    needles = tuple(word.lower() for word in keywords)
    matches = []
    for column in df.columns:
        for needle in needles:
            if needle in column.lower():
                matches.append(column)
                break  # one hit is enough; avoid duplicate entries
    return matches
