## Cálculo del número de trabajadores para los últimos periodos tributarios

 ### Se crea sesion de Spark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import pandas as pd
import numpy as np
import time
import os
from pyspark.sql import functions as F
from pyspark.sql.functions import col, expr, col, concat, lit
from pyspark.sql.types import IntegerType
import warnings
# Blanket filter: suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Narrower filter for UserWarning from modules matching "pyspark"; the blanket
# filter above already covers these.
warnings.filterwarnings("ignore", category=UserWarning, module="pyspark")



In [2]:
# Build (or reuse) a local SparkSession named "Test", passing the data-lake
# container URI through the spark.yarn.access.hadoopFileSystems setting.
spark = (
    SparkSession.builder
    .appName("Test")
    .config("spark.yarn.access.hadoopFileSystems", "abfs://data@datalakesii.dfs.core.windows.net/")
    .master("local[*]")
    .getOrCreate()
)

# Turn off the vectorized Parquet reader for this session.
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")

## Calculo de trabajadores segun DJ1887 y DJ1879

### Cantidad de trabajadores con contrato y honorarios de la última declaración (más rentas)

#### DJ1887

Se cargan las DJ1887 para los periodos de estudio (en este caso son las declaraciones asociadas a los años 2020, 2021 y 2022).

In [3]:
# Read the DJ1887 parquet dataset and keep only the three tax periods under
# study (202000, 202100 and 202200).
dj1887 = (
    spark.read.parquet("abfs://data@datalakesii.dfs.core.windows.net/DatosOrigen/DW/DW_TRN_DJR_F1887_E")
    .where(F.col("PERI_AGNO_MES_TRIBUTARIO").isin(202000, 202100, 202200))
)

                                                                                

Se agrupa por rut y periodo, y se calculan las sumas de las rentas de los empleados y la cantidad de estos.

In [4]:
# The first three entries are the grouping keys; every remaining entry is an
# amount column that gets summed per group.
cols = [
    'CONT_RUT', 'CONT_DV', 'PERI_AGNO_MES_TRIBUTARIO',
    'F1887_RENTA_TOTAL_EXENTA', 'F1887_RENTA_NETA', 'F1887_IMPTO_RETE',
    'F1887_MAYOR_RETE', 'F1887_RENTA_EXEN', 'F1887_REBAJA_ZON',
    'F1887_INGRESO_ENE_SA_C37', 'F1887_INGRESO_FEB_SA_C38', 'F1887_INGRESO_MAR_SA_C39',
    'F1887_INGRESO_ABR_SA_C40', 'F1887_INGRESO_MAY_SA_C41', 'F1887_INGRESO_JUN_SA_C42',
    'F1887_INGRESO_JUL_SA_C43', 'F1887_INGRESO_AGO_SA_C44', 'F1887_INGRESO_SEP_SA_C45',
    'F1887_INGRESO_OCT_SA_C46', 'F1887_INGRESO_NOV_SA_C47', 'F1887_INGRESO_DIC_SA_C48',
]

# Per (RUT, DV, period): sum of every amount column, collected into pandas.
# Spark names the resulting columns "sum(<column>)".
rentas = dj1887.select(cols).groupBy(cols[:3]).sum(*cols[3:]).toPandas()

# Per (RUT, DV, period): number of distinct CONT_RUT_INFO values, collected
# into pandas.
count_trabajadores = (
    dj1887
    .groupBy(cols[:3])
    .agg(F.countDistinct('CONT_RUT_INFO').alias("cantidad_trabajadores_dependientes"))
    .toPandas()
)

                                                                                

Obtenemos un único dataframe con el número de trabajadores y la suma de las rentas por campo

In [5]:
# Left-join the summed amounts onto the worker counts, matching on the three
# grouping keys, so every row of count_trabajadores is kept.
df_count_rentas = pd.merge(
    count_trabajadores,
    rentas,
    on=['CONT_RUT', 'CONT_DV', 'PERI_AGNO_MES_TRIBUTARIO'],
    how='left',
)

In [6]:
# Show the merged frame as the cell output (pandas truncates long frames).
df_count_rentas

Unnamed: 0,CONT_RUT,CONT_DV,PERI_AGNO_MES_TRIBUTARIO,cantidad_trabajadores_dependientes,sum(F1887_RENTA_TOTAL_EXENTA),sum(F1887_RENTA_NETA),sum(F1887_IMPTO_RETE),sum(F1887_MAYOR_RETE),sum(F1887_RENTA_EXEN),sum(F1887_REBAJA_ZON),...,sum(F1887_INGRESO_MAR_SA_C39),sum(F1887_INGRESO_ABR_SA_C40),sum(F1887_INGRESO_MAY_SA_C41),sum(F1887_INGRESO_JUN_SA_C42),sum(F1887_INGRESO_JUL_SA_C43),sum(F1887_INGRESO_AGO_SA_C44),sum(F1887_INGRESO_SEP_SA_C45),sum(F1887_INGRESO_OCT_SA_C46),sum(F1887_INGRESO_NOV_SA_C47),sum(F1887_INGRESO_DIC_SA_C48)
0,onZsalPVcKfOXQLxUlw87g==,9,202100,2,0,993497,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,993497.0
1,P/zy0VZxkhyGq1Yj1lW11Q==,1,202100,7,0,23757995,0,0,0,0,...,2023482.0,2020830.0,2020677.0,2020797.0,2009186.0,1697614.0,1697476.0,1721746.0,1744821.0,1387011.0
2,fRFFxQtWGa68VhrCeZ3wCA==,9,202100,3,0,9674327,0,0,0,0,...,961500.0,961500.0,961500.0,961500.0,961500.0,961500.0,961500.0,979500.0,1469500.0,1469500.0
3,YQDeL8kLGgK53fzAlXwiNA==,9,202100,21,0,133625949,1115983,0,16729622,0,...,11064774.0,10782401.0,10903356.0,10903843.0,10904544.0,10904255.0,10899546.0,10900400.0,10725564.0,11174638.0
4,rhgqOj0xAGEhrv3h7Haowg==,8,202100,26,0,20388229,0,0,3856730,0,...,6692735.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1174139,HPdCqv+PoFryk6ApJBqjnQ==,7,202000,1,0,1461873,0,0,0,0,...,,,,,,,,,,
1174140,XfRCwSid/Wzg7YC84miwOw==,K,202200,0,0,3193796,0,0,0,0,...,326500.0,326500.0,326500.0,326500.0,358000.0,358000.0,358000.0,358000.0,358000.0,358000.0
1174141,3Ie7VQe93aEkUCl9ICJk1Q==,0,202200,0,0,654886,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100966.0,275363.0,275363.0
1174142,obiuKE+xZwXVauoU/AIDpA==,3,202000,1,0,4748016,0,0,0,0,...,,,,,,,,,,


Se selecciona la última declaración

In [7]:
# Keep, for each (CONT_RUT, CONT_DV), the single row of its most recent tax period.
#
# GroupBy.first() is deliberately NOT used: it returns the first non-null value
# of each column independently, so a null total in the latest period would be
# silently filled with a value from an older period. Sorting and dropping
# duplicates keeps the whole latest row intact, nulls included.
dj1887_last = (
    df_count_rentas
    # groupby() discards rows whose keys are null; mirror that here.
    .dropna(subset=['CONT_RUT', 'CONT_DV'])
    # Keys ascending (final key order), period descending (latest row first).
    .sort_values(by=['CONT_RUT', 'CONT_DV', 'PERI_AGNO_MES_TRIBUTARIO'],
                 ascending=[True, True, False])
    .drop_duplicates(subset=['CONT_RUT', 'CONT_DV'], keep='first')
    # Same shape groupby() produced: keys in the index, sorted ascending.
    .set_index(['CONT_RUT', 'CONT_DV'])
)


In [8]:
# Move the index levels back into ordinary columns, then show the frame.
dj1887_last = dj1887_last.reset_index()
dj1887_last

Unnamed: 0,CONT_RUT,CONT_DV,PERI_AGNO_MES_TRIBUTARIO,cantidad_trabajadores_dependientes,sum(F1887_RENTA_TOTAL_EXENTA),sum(F1887_RENTA_NETA),sum(F1887_IMPTO_RETE),sum(F1887_MAYOR_RETE),sum(F1887_RENTA_EXEN),sum(F1887_REBAJA_ZON),...,sum(F1887_INGRESO_MAR_SA_C39),sum(F1887_INGRESO_ABR_SA_C40),sum(F1887_INGRESO_MAY_SA_C41),sum(F1887_INGRESO_JUN_SA_C42),sum(F1887_INGRESO_JUL_SA_C43),sum(F1887_INGRESO_AGO_SA_C44),sum(F1887_INGRESO_SEP_SA_C45),sum(F1887_INGRESO_OCT_SA_C46),sum(F1887_INGRESO_NOV_SA_C47),sum(F1887_INGRESO_DIC_SA_C48)
0,+++4/3jzUwtDPSSo3PxUgQ==,K,202200,3,0,100335428,11057741,0,2974910,0,...,7964654.0,8131831.0,8131398.0,8168393.0,8168732.0,7591093.0,8166825.0,8162064.0,8153792.0,8199533.0
1,+++VKgYcn1igYZjkTHXjgA==,5,202200,1,0,1306000,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,++/ILpGAKOZrH+0u+L1tdw==,1,202000,11,0,2025493,0,0,0,0,...,,,,,,,,,,
3,++/UTl2mwL/J484yKcuZrg==,0,202200,2,0,20241701,245967,0,0,0,...,1726589.0,1720037.0,1713484.0,1708570.0,1708570.0,1693826.0,1688912.0,1667617.0,1646321.0,1638130.0
4,++/dFaxQQDYaKe8zO/rKoQ==,2,202200,7,0,11172701,0,0,0,0,...,858125.0,858125.0,858125.0,1325312.0,1292500.0,1713750.0,1713750.0,842500.0,842500.0,1404166.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
533624,zzxiVVlRVmrHIZn8ZjPTVQ==,5,202200,23,0,72350541,362416,0,4623164,0,...,7502194.0,7006238.0,6194317.0,5810360.0,6306282.0,5773516.0,4508210.0,5056608.0,4869184.0,4859469.0
533625,zzydn/q5KUdW0PcyxWSyzg==,8,202200,2,146471,4497040,0,0,0,0,...,582420.0,338642.0,338642.0,338642.0,370784.0,349532.0,349532.0,349532.0,349532.0,349532.0
533626,zzzEfsBUrus3+LpvpwcHig==,9,202200,2,0,7414505,0,0,0,0,...,816250.0,816250.0,816250.0,842500.0,842500.0,842500.0,842500.0,842500.0,842500.0,842500.0
533627,zzzo7RUe8Wp/sBVZrqC1BQ==,9,202200,1,2348275,5534665,0,0,0,0,...,483549.0,481714.0,479820.0,478444.0,478444.0,474315.0,299527.0,466976.0,461013.0,458719.0


In [9]:
# Keep only the declarant identifier and its dependent-worker count.
dj1887_last = dj1887_last.loc[:, ['CONT_RUT', 'cantidad_trabajadores_dependientes']]

Se guarda en un archivo .csv

In [10]:
# Write the latest-period dependent-worker counts to CSV. index=False is not
# passed, so the DataFrame index is written as an unnamed first column.
dj1887_last.to_csv("/home/cdsw/data/trabajadores/dj1887_last_period.csv")


#### DJ1879

Se cargan las DJ1879 para los periodos de estudio

In [11]:
# Select every DJ1879 row whose tax period lies in the inclusive range
# 202000..202200.
EXAMPLE_SQL_QUERY = """
select *
from dw.dw_trn_djr_f1879_e
where PERI_AGNO_MES_TRIBUTARIO BETWEEN 202000 AND 202200
"""
dj1879 = spark.sql(EXAMPLE_SQL_QUERY)
# Expose the result to later SQL cells under the temp-view name 'DJ1879'.
dj1879.createOrReplaceTempView('DJ1879')


Hive Session ID = af170291-2667-46ee-836d-be1f14324aae


In [12]:
# List the column names of the DJ1879 DataFrame.
dj1879.columns

['peri_agno_mes_tributario',
 'header_form_key_vo',
 'tifo_cod_form',
 'tifo_cod_form_version',
 'cont_rut',
 'cont_dv',
 'f1879_rut_decla_vo',
 'f1879_dv_decla_vo',
 'cont_rut_info',
 'cont_dv_info',
 'f1879_rut_info_vo',
 'f1879_dv_info_vo',
 'f1879_rete_hono',
 'f1879_rete_dire10',
 'f1879_rete_dire20',
 'f1879_certificado',
 'f1879_correlativo',
 'f1879_periodo_pago',
 'f1879_rete_hono_artes',
 'f1879_rete_hono_isla',
 'f1879_rut_fncr',
 'f1879_dv_fncr',
 'cont_rut_fncr',
 'cont_dv_fncr',
 'f1879_seve',
 'f1879_borra_vo',
 'f1879_origen_vo',
 'f1879_estado_vx',
 'timo_cod_moneda',
 'f1879_fecha_conversion',
 'f1879_fecha_extraccion_so',
 'f1879_pago_prestamo_c29',
 'f1879_fecha_carga_dw',
 'c33']

Se filtra para el mismo año del F29 asociado, se agrupa por rut y periodo, luego se calculan las retenciones a honorarios y la cantidad de estos.

In [13]:
# Register the raw DJ1879 rows under their own view name so this cell can be
# re-run safely. Reading from 'DJ1879' and then overwriting 'DJ1879' with the
# aggregate made a second execution fail, because the aggregated view no
# longer has CONT_RUT_INFO or F1879_RETE_HONO.
dj1879.createOrReplaceTempView('DJ1879_RAW')

# Per (CONT_RUT, period): number of distinct CONT_RUT_INFO values and the sum
# of F1879_RETE_HONO.
EXAMPLE_SQL_QUERY = """
select CONT_RUT,
PERI_AGNO_MES_TRIBUTARIO,
count(distinct CONT_RUT_INFO) as cantidad_trabajadores_honorarios,
sum(F1879_RETE_HONO) as total_monto_honorario
from DJ1879_RAW
group by CONT_RUT,PERI_AGNO_MES_TRIBUTARIO
"""

# The aggregated result keeps the 'DJ1879' view name that later cells query.
spark.sql(EXAMPLE_SQL_QUERY).createOrReplaceTempView('DJ1879')


In [14]:
# Print a sample of rows from the 'DJ1879' temp view as a sanity check.
spark.sql('select * from DJ1879').show()



+--------------------+------------------------+--------------------------------+---------------------+
|            CONT_RUT|PERI_AGNO_MES_TRIBUTARIO|cantidad_trabajadores_honorarios|total_monto_honorario|
+--------------------+------------------------+--------------------------------+---------------------+
|bXRGmIuD5AhAo3MeL...|                  202200|                               7|              3672562|
|yJleXl/ivjcEGu5CM...|                  202100|                               4|               115903|
|5uO6dpNKFLq7gsMaW...|                  202200|                               1|                35708|
|7jl4v9ibbfEy+tUy3...|                  202200|                             196|             17075945|
|dTbO23O9PszufqMSe...|                  202200|                              70|             20194629|
|JvLmLuwg2DO/9tlM2...|                  202200|                               6|              2505363|
|snLGi2DHJpB+tTDKk...|                  202200|                          

                                                                                

In [15]:
# Collect the full contents of the 'DJ1879' temp view into a pandas DataFrame.
dj1879_pd = spark.sql('select * from DJ1879').toPandas()


                                                                                

Seleccionamos los datos del periodo informado más reciente en cada caso

In [16]:
# Keep, for each CONT_RUT, the single row of its most recent tax period.
#
# GroupBy.first() is deliberately NOT used: it returns the first non-null value
# of each column independently, so a null total_monto_honorario in the latest
# period would be silently filled with the amount from an older period.
dj1879_last = (
    dj1879_pd
    # groupby() discards rows whose key is null; mirror that here.
    .dropna(subset=['CONT_RUT'])
    # Key ascending (final key order), period descending (latest row first).
    .sort_values(by=['CONT_RUT', 'PERI_AGNO_MES_TRIBUTARIO'], ascending=[True, False])
    .drop_duplicates(subset=['CONT_RUT'], keep='first')
    # Same shape groupby() produced: CONT_RUT in the index, sorted ascending.
    .set_index('CONT_RUT')
)

In [17]:
# Move CONT_RUT from the index back into an ordinary column.
dj1879_last = dj1879_last.reset_index()

In [18]:
# Keep only the declarant identifier and its fee-based (honorarios) worker count.
dj1879_last = dj1879_last.loc[:, ['CONT_RUT', 'cantidad_trabajadores_honorarios']]

Se guarda en un archivo .csv

In [19]:
# Write the latest-period honorarios worker counts to CSV. index=False is not
# passed, so the DataFrame index is written as an unnamed first column.
dj1879_last.to_csv("/home/cdsw/data/trabajadores/dj1879_last_period.csv")

También generaremos un dataset que contenga tanto el número de trabajadores dependientes como el número de trabajadores a honorarios.

### Dataset final

In [20]:
# Outer join on 'CONT_RUT' so that declarants present in only one of the two
# sources are still kept.
merged_df = pd.merge(dj1879_last, dj1887_last, how='outer', on='CONT_RUT')

# Columns of interest; a count missing on one side of the join becomes 0.
columns_to_keep = ['CONT_RUT', 'cantidad_trabajadores_honorarios', 'cantidad_trabajadores_dependientes']
final_df = merged_df.loc[:, columns_to_keep].fillna(0)

In [21]:
# Write the combined worker counts to CSV. index=False is not passed, so the
# DataFrame index is written as an unnamed first column.
final_df.to_csv("/home/cdsw/data/trabajadores/trabajadores_last_declaration.csv")