In [1]:
# 1. Importaciones y configuración inicial
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
import pyreadstat
from statsmodels.formula.api import logit

pd.set_option('display.max_rows', None)

# 2. Definición de constantes y configuraciones
EMPLOYMENT_STATUS = {
    "Employed, absent from work": 1,
    "Employed, at work": 1
}

NOC_CATEGORIES = {
    0: "Business, finance and administration occupations, except management",
    1: "Health occupations, except management",
    2: "Management occupations",
    3: "Natural and applied sciences and related occupations, except management",
    4: "Natural resources, agriculture and related production occupations, except management",
    5: "Occupations in art, culture, recreation and sport, except management",
    6: "Occupations in education, law and social, community and government services, except management",
    7: "Occupations in manufacturing and utilities, except management",
    8: "Sales and service occupations, except management",
    9: "Trades, transport and equipment operators and related occupations, except management",
}

# 3. Funciones de utilidad
def calculate_quality_metrics(df):
    """Calcula porcentajes de zeros y NaN en el dataset"""
    zero_pct = (df == 0).mean() * 100
    nan_pct = df.isna().mean() * 100
    return zero_pct, nan_pct

def encode_categorical_vars(df, columns):
    """Codifica variables categóricas preservando NaN"""
    df_encoded = df.copy()
    for col in columns:
        df_encoded[col] = df_encoded[col].astype('category').cat.codes
        if col == 'NOC_10':
            df_encoded.loc[df_encoded[col] == -1, col] = np.nan
    return df_encoded

def get_quarter(month):
    """Mapea el nombre del mes al trimestre correspondiente."""
    if month in ['January', 'February', 'March']:
        return 'Q1'
    elif month in ['April', 'May', 'June']:
        return 'Q2'
    elif month in ['July', 'August', 'September']:
        return 'Q3'
    elif month in ['October', 'November', 'December']:
        return 'Q4'
    else:
        return None


In [2]:
# Lista de años a procesar hasta 2019
anos = [2023,2022,2021,2020,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010]

# Variables de interés
variables = ["Has_a_job", "LFSSTAT", "PROV", "AGE_12", "SEX", 
             "MARSTAT", "EDUC", "IMMIG", "NOC_10"]
categorical_cols = ["SEX", "MARSTAT", "EDUC", "NOC_10"]
feature_cols = ["SEX", "MARSTAT", "EDUC"]


In [3]:
# 4. Bucle para procesar cada archivo y guardar los resultados
for year in anos:
    try:
        print(f"\nIniciando procesamiento del año {year}...")
        
        # Definir el path del archivo actual
        path = fr"C:\Users\wmate\Downloads\SPSS\Compilado\{year}.sav"
        
        # Cargar archivo .sav
        print(f"Cargando archivo {year}.sav...")
        df = pd.read_spss(path)
        
        # 5. Creación de variables
        print("Creando variables...")
        df['Quarter'] = pd.Categorical(df['SURVMNTH'].apply(get_quarter))
        df['Has_a_job'] = df['LFSSTAT'].map(lambda x: EMPLOYMENT_STATUS.get(x, 0))
        
        # 6. Preparación para imputación KNN
        print("Preparando datos para imputación KNN...")
        df_knn = encode_categorical_vars(df[categorical_cols], categorical_cols)
        
        # 7. Imputación KNN para NOC_10
        print("Realizando imputación KNN...")
        X = df_knn[feature_cols]
        mask_train = df_knn['NOC_10'].notna()
        mask_pred = df_knn['NOC_10'].isna()
        
        X_train = X[mask_train]
        y_train = df_knn.loc[mask_train, 'NOC_10']
        X_pred = X[mask_pred]

        # Entrenar y predecir
        knn = KNeighborsRegressor(n_neighbors=3)
        knn.fit(X_train, y_train)
        
        if mask_pred.any():
            y_pred = np.clip(np.round(knn.predict(X_pred)).astype(int), 0, 9)
            df_knn.loc[mask_pred, 'NOC_10'] = y_pred
        
        # Liberar memoria de variables KNN
        del X_train, y_train, X_pred, knn, X
        
        # 8. Agregar categorías de industria
        print("Agregando categorías de industria...")
        df_knn['NOC_Category'] = pd.Categorical(df_knn['NOC_10'].map(NOC_CATEGORIES))

        # 9. Crear dataset final
        print("Creando dataset final...")
        df_final = pd.concat([
            df_knn[["NOC_Category"]],
            df[variables],
            df["Quarter"]
        ], axis=1)
        
        # Liberar memoria de DataFrames grandes
        del df, df_knn

        # 11. Modelo Logit
        print("Ajustando modelo logístico...")
        formula = (
            "Has_a_job ~ "
            "C(PROV, Treatment(reference='Newfoundland and Labrador')) + "
            "C(AGE_12, Treatment(reference='15 to 19 years')) + "
            "C(SEX, Treatment(reference='Female')) + "
            "C(MARSTAT, Treatment(reference='Separated')) + "
            "C(EDUC, Treatment(reference='0 to 8 years')) + "
            "C(IMMIG, Treatment(reference='Non-immigrant')) + "
            "C(NOC_Category, Treatment(reference='Occupations in art, culture, recreation and sport, except management')) + "
            "C(Quarter, Treatment(reference='Q1'))"
        )
        
        md_logit = logit(formula, data=df_final)
        res_logit = md_logit.fit()
        
        # Imprimir resumen
        print("\nResumen del modelo logístico:")
        print(res_logit.summary())
        
        # Guardar el resumen de la regresión en CSV
        print(f"\nGuardando resumen de regresión para {year}...")
        summary = res_logit.summary()
        tables = summary.tables
        
        regression_results = pd.DataFrame()
        for i, table in enumerate(tables):
            df_table = pd.read_html(table.as_html())[0]
            df_table.columns = df_table.columns.map(str)
            df_table['Table'] = f'Table_{i+1}'
            regression_results = pd.concat([regression_results, df_table], axis=0)
        
        regression_results.to_csv(f"Regresion_{year}.csv", index=False)
        print(f"Resumen de regresión guardado en Regresion_{year}.csv")
        
        # Liberar memoria del modelo y resultados de regresión
        del md_logit, df_final, regression_results

        # 12. Calcular odds y probabilidades
        print("\nCalculando odds y probabilidades...")
        params = res_logit.params
        odds_ratios = np.exp(params)
        
        # Crear DataFrame de resultados
        results = []
        
        # Agregar intercepto
        intercept_odds = odds_ratios['Intercept']
        results.append({
            'Variable': 'Intercepto',
            'Odds': intercept_odds,
            'Probabilidad': intercept_odds / (1 + intercept_odds)
        })
        
        # Agregar resto de variables
        for variable, odds_ratio in odds_ratios[1:].items():
            odds = intercept_odds * odds_ratio
            results.append({
                'Variable': variable,
                'Odds': odds,
                'Probabilidad': odds / (1 + odds)
            })
        
        probabilidades_df = pd.DataFrame(results)

        # Guardar resultados en archivo CSV
        print(f"Guardando probabilidades para {year}...")
        probabilidades_df.to_csv(f"Probabilidades_{year}.csv", index=False)
        
        # Liberar memoria final
        del res_logit, probabilidades_df, results, odds_ratios, params
        
        print(f"\nAño {year} completado exitosamente")
        
    except Exception as e:
        print(f"\nError procesando año {year}: {e}")
        continue  # Salta este año y sigue con el siguiente


Iniciando procesamiento del año 2023...
Cargando archivo 2023.sav...
Creando variables...
Preparando datos para imputación KNN...
Realizando imputación KNN...
Agregando categorías de industria...
Creando dataset final...
Ajustando modelo logístico...
Optimization terminated successfully.
         Current function value: 0.388917
         Iterations 7

Resumen del modelo logístico:
                           Logit Regression Results                           
Dep. Variable:              Has_a_job   No. Observations:              1348445
Model:                          Logit   Df Residuals:                  1348398
Method:                           MLE   Df Model:                           46
Date:                Sun, 03 Nov 2024   Pseudo R-squ.:                  0.4249
Time:                        11:02:11   Log-Likelihood:            -5.2443e+05
converged:                       True   LL-Null:                   -9.1182e+05
Covariance Type:            nonrobust   LLR p-value:          

  df_table = pd.read_html(table.as_html())[0]
  df_table = pd.read_html(table.as_html())[0]


Creando variables...
Preparando datos para imputación KNN...
Realizando imputación KNN...
Agregando categorías de industria...
Creando dataset final...
Ajustando modelo logístico...
Optimization terminated successfully.
         Current function value: 0.401819
         Iterations 7

Resumen del modelo logístico:
                           Logit Regression Results                           
Dep. Variable:              Has_a_job   No. Observations:              1306571
Model:                          Logit   Df Residuals:                  1306524
Method:                           MLE   Df Model:                           46
Date:                Sun, 03 Nov 2024   Pseudo R-squ.:                  0.4063
Time:                        11:07:48   Log-Likelihood:            -5.2500e+05
converged:                       True   LL-Null:                   -8.8435e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                     

  df_table = pd.read_html(table.as_html())[0]
  df_table = pd.read_html(table.as_html())[0]


Creando variables...
Preparando datos para imputación KNN...
Realizando imputación KNN...
Agregando categorías de industria...
Creando dataset final...
Ajustando modelo logístico...
Optimization terminated successfully.
         Current function value: 0.399509
         Iterations 7

Resumen del modelo logístico:
                           Logit Regression Results                           
Dep. Variable:              Has_a_job   No. Observations:              1046994
Model:                          Logit   Df Residuals:                  1046947
Method:                           MLE   Df Model:                           46
Date:                Sun, 03 Nov 2024   Pseudo R-squ.:                  0.4153
Time:                        11:10:49   Log-Likelihood:            -4.1828e+05
converged:                       True   LL-Null:                   -7.1543e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                     

  df_table = pd.read_html(table.as_html())[0]
  df_table = pd.read_html(table.as_html())[0]


Creando variables...
Preparando datos para imputación KNN...
Realizando imputación KNN...
Agregando categorías de industria...
Creando dataset final...
Ajustando modelo logístico...
Optimization terminated successfully.
         Current function value: 0.443561
         Iterations 7

Resumen del modelo logístico:
                           Logit Regression Results                           
Dep. Variable:              Has_a_job   No. Observations:              1069011
Model:                          Logit   Df Residuals:                  1068964
Method:                           MLE   Df Model:                           46
Date:                Sun, 03 Nov 2024   Pseudo R-squ.:                  0.3533
Time:                        11:14:01   Log-Likelihood:            -4.7417e+05
converged:                       True   LL-Null:                   -7.3326e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                     

  df_table = pd.read_html(table.as_html())[0]
  df_table = pd.read_html(table.as_html())[0]


Creando variables...
Preparando datos para imputación KNN...
Realizando imputación KNN...
Agregando categorías de industria...
Creando dataset final...
Ajustando modelo logístico...
Optimization terminated successfully.
         Current function value: 0.412091
         Iterations 7

Resumen del modelo logístico:
                           Logit Regression Results                           
Dep. Variable:              Has_a_job   No. Observations:              1203199
Model:                          Logit   Df Residuals:                  1203152
Method:                           MLE   Df Model:                           46
Date:                Sun, 03 Nov 2024   Pseudo R-squ.:                  0.3874
Time:                        11:18:12   Log-Likelihood:            -4.9583e+05
converged:                       True   LL-Null:                   -8.0945e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                     

  df_table = pd.read_html(table.as_html())[0]
  df_table = pd.read_html(table.as_html())[0]


Creando variables...
Preparando datos para imputación KNN...
Realizando imputación KNN...
Agregando categorías de industria...
Creando dataset final...
Ajustando modelo logístico...
Optimization terminated successfully.
         Current function value: 0.419486
         Iterations 7

Resumen del modelo logístico:
                           Logit Regression Results                           
Dep. Variable:              Has_a_job   No. Observations:              1222490
Model:                          Logit   Df Residuals:                  1222443
Method:                           MLE   Df Model:                           46
Date:                Sun, 03 Nov 2024   Pseudo R-squ.:                  0.3764
Time:                        11:22:31   Log-Likelihood:            -5.1282e+05
converged:                       True   LL-Null:                   -8.2238e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                     

  df_table = pd.read_html(table.as_html())[0]
  df_table = pd.read_html(table.as_html())[0]


Creando variables...
Preparando datos para imputación KNN...
Realizando imputación KNN...
Agregando categorías de industria...
Creando dataset final...
Ajustando modelo logístico...
Optimization terminated successfully.
         Current function value: 0.454516
         Iterations 7

Resumen del modelo logístico:
                           Logit Regression Results                           
Dep. Variable:              Has_a_job   No. Observations:              1236558
Model:                          Logit   Df Residuals:                  1236511
Method:                           MLE   Df Model:                           46
Date:                Sun, 03 Nov 2024   Pseudo R-squ.:                  0.3247
Time:                        11:26:57   Log-Likelihood:            -5.6204e+05
converged:                       True   LL-Null:                   -8.3222e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                     

  df_table = pd.read_html(table.as_html())[0]
  df_table = pd.read_html(table.as_html())[0]


Creando variables...
Preparando datos para imputación KNN...
Realizando imputación KNN...
Agregando categorías de industria...
Creando dataset final...
Ajustando modelo logístico...
Optimization terminated successfully.
         Current function value: 0.435016
         Iterations 7

Resumen del modelo logístico:
                           Logit Regression Results                           
Dep. Variable:              Has_a_job   No. Observations:              1229128
Model:                          Logit   Df Residuals:                  1229081
Method:                           MLE   Df Model:                           46
Date:                Sun, 03 Nov 2024   Pseudo R-squ.:                  0.3555
Time:                        11:31:26   Log-Likelihood:            -5.3469e+05
converged:                       True   LL-Null:                   -8.2967e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                     

  df_table = pd.read_html(table.as_html())[0]
  df_table = pd.read_html(table.as_html())[0]


Creando variables...
Preparando datos para imputación KNN...
Realizando imputación KNN...
Agregando categorías de industria...
Creando dataset final...
Ajustando modelo logístico...
Optimization terminated successfully.
         Current function value: 0.425347
         Iterations 7

Resumen del modelo logístico:
                           Logit Regression Results                           
Dep. Variable:              Has_a_job   No. Observations:              1215627
Model:                          Logit   Df Residuals:                  1215580
Method:                           MLE   Df Model:                           46
Date:                Sun, 03 Nov 2024   Pseudo R-squ.:                  0.3680
Time:                        11:35:37   Log-Likelihood:            -5.1706e+05
converged:                       True   LL-Null:                   -8.1819e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                     

  df_table = pd.read_html(table.as_html())[0]
  df_table = pd.read_html(table.as_html())[0]


Creando variables...
Preparando datos para imputación KNN...
Realizando imputación KNN...
Agregando categorías de industria...
Creando dataset final...
Ajustando modelo logístico...
Optimization terminated successfully.
         Current function value: 0.399044
         Iterations 7

Resumen del modelo logístico:
                           Logit Regression Results                           
Dep. Variable:              Has_a_job   No. Observations:              1254181
Model:                          Logit   Df Residuals:                  1254134
Method:                           MLE   Df Model:                           46
Date:                Sun, 03 Nov 2024   Pseudo R-squ.:                  0.4067
Time:                        11:40:05   Log-Likelihood:            -5.0047e+05
converged:                       True   LL-Null:                   -8.4361e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                     

  df_table = pd.read_html(table.as_html())[0]
  df_table = pd.read_html(table.as_html())[0]


Creando variables...
Preparando datos para imputación KNN...
Realizando imputación KNN...
Agregando categorías de industria...
Creando dataset final...
Ajustando modelo logístico...
Optimization terminated successfully.
         Current function value: 0.454904
         Iterations 7

Resumen del modelo logístico:
                           Logit Regression Results                           
Dep. Variable:              Has_a_job   No. Observations:              1252505
Model:                          Logit   Df Residuals:                  1252458
Method:                           MLE   Df Model:                           46
Date:                Sun, 03 Nov 2024   Pseudo R-squ.:                  0.3224
Time:                        11:44:34   Log-Likelihood:            -5.6977e+05
converged:                       True   LL-Null:                   -8.4092e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                     

  df_table = pd.read_html(table.as_html())[0]
  df_table = pd.read_html(table.as_html())[0]


Creando variables...
Preparando datos para imputación KNN...
Realizando imputación KNN...
Agregando categorías de industria...
Creando dataset final...
Ajustando modelo logístico...
Optimization terminated successfully.
         Current function value: 0.421890
         Iterations 7

Resumen del modelo logístico:
                           Logit Regression Results                           
Dep. Variable:              Has_a_job   No. Observations:              1261528
Model:                          Logit   Df Residuals:                  1261481
Method:                           MLE   Df Model:                           46
Date:                Sun, 03 Nov 2024   Pseudo R-squ.:                  0.3710
Time:                        11:49:07   Log-Likelihood:            -5.3223e+05
converged:                       True   LL-Null:                   -8.4617e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                     

  df_table = pd.read_html(table.as_html())[0]
  df_table = pd.read_html(table.as_html())[0]


Creando variables...
Preparando datos para imputación KNN...
Realizando imputación KNN...
Agregando categorías de industria...
Creando dataset final...
Ajustando modelo logístico...
Optimization terminated successfully.
         Current function value: 0.418089
         Iterations 7

Resumen del modelo logístico:
                           Logit Regression Results                           
Dep. Variable:              Has_a_job   No. Observations:              1262077
Model:                          Logit   Df Residuals:                  1262030
Method:                           MLE   Df Model:                           46
Date:                Sun, 03 Nov 2024   Pseudo R-squ.:                  0.3772
Time:                        11:53:38   Log-Likelihood:            -5.2766e+05
converged:                       True   LL-Null:                   -8.4718e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                     

  df_table = pd.read_html(table.as_html())[0]
  df_table = pd.read_html(table.as_html())[0]


Creando variables...
Preparando datos para imputación KNN...
Realizando imputación KNN...
Agregando categorías de industria...
Creando dataset final...
Ajustando modelo logístico...
Optimization terminated successfully.
         Current function value: 0.427258
         Iterations 7

Resumen del modelo logístico:
                           Logit Regression Results                           
Dep. Variable:              Has_a_job   No. Observations:              1261173
Model:                          Logit   Df Residuals:                  1261126
Method:                           MLE   Df Model:                           46
Date:                Sun, 03 Nov 2024   Pseudo R-squ.:                  0.3647
Time:                        11:58:05   Log-Likelihood:            -5.3885e+05
converged:                       True   LL-Null:                   -8.4815e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                                     

  df_table = pd.read_html(table.as_html())[0]
  df_table = pd.read_html(table.as_html())[0]


In [None]:
for year in anos:
    try:
        path = fr"C:\Users\wmate\Downloads\SPSS\Compilado\{year}.sav"
        print(f"Intentando cargar el archivo: {path}")
        df, meta = pyreadstat.read_sav(path)
    except Exception as e:
        print(f"Error al cargar el archivo {year}: {e}")
        continue  # Salta este año y sigue con el siguiente