In [1]:

# ## Apiux & SII: calculo de patrimonio en personas juridicas usando como base la malla societaria y patrimonio de personas naturales

# ## Henry Vega (henrry.vega@api-ux.com)
# ## Data analyst

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
import pyspark
#warnings.filterwarnings('ignore', category=DeprecationWarning)

In [2]:
spark = SparkSession.builder \
      .appName("Ejecucion algoritmo XXX")  \
      .config("spark.yarn.access.hadoopFileSystems","abfs://data@datalakesii.dfs.core.windows.net/") \
      .config("spark.executor.memory", "16g") \
      .config("spark.driver.memory", "12g")\
      .config("spark.executor.cores", "2") \
      .config("spark.executor.instances", "10") \
      .config("spark.driver.maxResultSize", "12g") \
      .config("spark.default.parallelism", "20") \
      .getOrCreate()


Setting spark.hadoop.yarn.resourcemanager.principal to hvega.externo


In [3]:
# ## Carga de relaciones societarias(depurada)
# Se carga la data depurada anteriormente de relaciones societarias.

spark.read.options(header=True,inferSchema=True,delimiter=",").csv("/home/cdsw/data/processed/patrimonio/malla_societaria/sociedades_participacion_capital_nozero.csv").createOrReplaceTempView("sociedad")


                                                                                

In [4]:
# Se usara el porcentaje de capital con el patrimonio de cada socio para poder establecer el valor de sociedad

spark.sql("select  RUT_SOCIEDAD, RUT_SOCIO, mean(PORCENTAJE_CAPITAL) as PORCENTAJE_CAPITAL, mean(PORCENTAJE_UTILIDADES) as PORCENTAJE_UTILIDADES from sociedad group by RUT_SOCIEDAD, RUT_SOCIO").createOrReplaceTempView("composicion")

spark.sql("select  RUT_SOCIEDAD, sum(PORCENTAJE_CAPITAL) as TOTAL_CAPITAL, SUM(PORCENTAJE_UTILIDADES) as TOTAL_UTILIDADES from composicion group by RUT_SOCIEDAD").createOrReplaceTempView("composicion")
m=spark.sql("select * from composicion").toPandas()


                                                                                

In [5]:
# Se utilizara el porcentaje de capital para hacer la transmision de patrimonio. En el proceso de limpieza de datos se considero las filas con valores no nulos de PORCENTAJE_CAPITAL. Se utilizara una tabla de composicion para poder normalizar los valores de oscuridad para cuando los valores de porcentajes de capital sumen diferente a 100%.

# ## Tabla de ponderacion para el total de capital.

spark.sql("select RUT_SOCIEDAD as CONT_RUT, TOTAL_CAPITAL as ponderador from composicion").createOrReplaceTempView("composicion")

In [6]:
# ## Lectura de tabla de patrimonio 2022
pat_2022=spark.sql("select * from libsdf.pat_gp_arfi_apiux_2022_e")
pat_2022.createOrReplaceTempView("pat_2022")
spark.sql('select * from pat_2022 ').schema

Hive Session ID = 86be838c-d64d-4297-a4ea-baf90b810551


StructType(List(StructField(CONT_RUT,StringType,true),StructField(CONT_DV,StringType,true),StructField(TRAMO,StringType,true),StructField(PATR_TOTAL_2022,StringType,true)))

In [7]:
spark.sql('SELECT *, CAST(REGEXP_REPLACE(PATR_TOTAL_2022, ",", "") AS INT) AS PATR_TOTAL_2022_entero from pat_2022').createOrReplaceTempView("pat_2022")
spark.sql('SELECT CONT_RUT, PATR_TOTAL_2022_entero as PAT_2022 from pat_2022 where PATR_TOTAL_2022_entero>0').createOrReplaceTempView("pat_2022")
spark.sql("select count(*) from pat_2022").show()

[Stage 8:>                                                          (0 + 1) / 1]

+--------+
|count(1)|
+--------+
|   54161|
+--------+



                                                                                

In [8]:
spark.sql("select cont_rut as CONT_RUT,CAST(patrimonio_2022 AS INT) as PAT_2022 from libsdf.patrimonio_at2022_21_20 where patrimonio_2022>0 ").createOrReplaceTempView("altos_pat")
spark.sql("select count(*) from altos_pat").show()



+--------+
|count(1)|
+--------+
| 4582461|
+--------+



                                                                                

In [9]:
spark.sql('SELECT CONT_RUT, PAT_2022 FROM pat_2022 UNION SELECT CONT_RUT, PAT_2022 FROM altos_pat').createOrReplaceTempView("union_pat_altos_pat")
spark.sql('SELECT CONT_RUT, mean(PAT_2022) as PAT_2022 FROM union_pat_altos_pat group by CONT_RUT').createOrReplaceTempView("pat")

spark.sql("select count(*) from pat").show()




+--------+
|count(1)|
+--------+
| 4587460|
+--------+



                                                                                

In [10]:
spark.sql("select  RUT_SOCIEDAD, RUT_SOCIO, mean(PORCENTAJE_CAPITAL) as PORCENTAJE_CAPITAL, mean(PORCENTAJE_UTILIDADES) as PORCENTAJE_UTILIDADES from sociedad group by RUT_SOCIEDAD, RUT_SOCIO").createOrReplaceTempView("sociedad")

Primera iteracion

Se realiza el cruce de la data societaria con la data de patrimonio 2022 de personas naturales (1re paso de calculo de materia oscura para sociedades completas). Hay sociedades donde no se completan todos los socios con patrimonio, por lo cual se discrimina mediante un contador de nulos de dicho campo para poder agregar la data y obtener un valor de oscuridad para un rut de soiedad donde todas las entradas de sus socios han sido completadas.

In [11]:
#Iteracion 0
spark.sql("select * from sociedad left join pat on sociedad.RUT_SOCIO=pat.CONT_RUT order by sociedad.RUT_SOCIEDAD asc").createOrReplaceTempView("sociedad")

spark.sql("select RUT_SOCIEDAD, RUT_SOCIO, PORCENTAJE_CAPITAL, PAT_2022 from sociedad").createOrReplaceTempView("sociedad")
spark.sql("select * from sociedad ").show()
spark.sql("select RUT_SOCIEDAD as RUT_SOCIEDAD1, count(*) as nulos  from sociedad where PAT_2022 is null group by RUT_SOCIEDAD order by RUT_SOCIEDAD ASC").createOrReplaceTempView("aux")
spark.sql("select * from aux ").show()
spark.sql("select RUT_SOCIEDAD,RUT_SOCIO,PORCENTAJE_CAPITAL, nulos, PAT_2022 from sociedad left join aux on sociedad.RUT_SOCIEDAD=aux.RUT_SOCIEDAD1 order by RUT_SOCIEDAD asc ").createOrReplaceTempView("aux")
spark.sql("select * from aux where nulos is null ").show()

                                                                                

+--------------------+--------------------+------------------+-------------+
|        RUT_SOCIEDAD|           RUT_SOCIO|PORCENTAJE_CAPITAL|     PAT_2022|
+--------------------+--------------------+------------------+-------------+
|+++JjoAr1RQa1kY7r...|xO5P1s+Yeug4wKtb8...|             14.29|         null|
|+++JjoAr1RQa1kY7r...|S2zsiazT64Jiu/sSE...|             85.71|         null|
|+++P6tU1TAUNG0SZv...|8ejqjUOvnKKfMZSzj...|             100.0|         null|
|+++UeiptXaAQD74N1...|5jgmhcvBN2sVp7Gh5...|              72.0|2.147483647E9|
|+++UeiptXaAQD74N1...|QemGi6c6gkD3rdstn...|              28.0|  7.5308125E7|
|+++VKgYcn1igYZjkT...|XRkbvgfaa5MRVblDF...|             100.0|         null|
|+++VWXcqX/471v55m...|t9AEh7tBjjp3FKWzm...|               1.0|9.265492805E8|
|+++VWXcqX/471v55m...|d/Gh1G45HEItM+byD...|              99.0|    3203827.5|
|+++ZYfC4XA28yQEQd...|Rf14vs+327ZlNsQvX...|             100.0|   2.101449E7|
|+++rmzjqUcW56fKfV...|VULdXIkUAxKz8BFIA...|              52.5|2.147483647E9|

                                                                                

+--------------------+-----+
|       RUT_SOCIEDAD1|nulos|
+--------------------+-----+
|+++JjoAr1RQa1kY7r...|    2|
|+++P6tU1TAUNG0SZv...|    1|
|+++VKgYcn1igYZjkT...|    1|
|+++rmzjqUcW56fKfV...|    1|
|++/BNOl5Jzp3/53dg...|    1|
|++/ILpGAKOZrH+0u+...|    2|
|++/O0DVO8dUreJB37...|    2|
|++/dFaxQQDYaKe8zO...|    2|
|++/fV7MkjN3EYCFO8...|    2|
|++/tLigzCfQxU6467...|    2|
|++0TZZ4dLrh7Mcrrq...|    1|
|++0hvpY1Z3Wx9DKo2...|    1|
|++0kBAcf27iNKS5+c...|    1|
|++1JNpbthfwgogYLW...|    2|
|++2DK/e8a0qWOS7QA...|    2|
|++2JWG4I/BtuVkRaz...|    1|
|++2O7mw5x3lYgK44x...|    1|
|++2eRGzgoveOZXA6V...|    1|
|++3FYBfTE6Si69yVW...|    1|
|++4XOUgYVCe4kq6Bg...|    1|
+--------------------+-----+
only showing top 20 rows





+--------------------+--------------------+------------------+-----+--------------+
|        RUT_SOCIEDAD|           RUT_SOCIO|PORCENTAJE_CAPITAL|nulos|      PAT_2022|
+--------------------+--------------------+------------------+-----+--------------+
|+++UeiptXaAQD74N1...|QemGi6c6gkD3rdstn...|              28.0| null|   7.5308125E7|
|+++UeiptXaAQD74N1...|5jgmhcvBN2sVp7Gh5...|              72.0| null| 2.147483647E9|
|+++VWXcqX/471v55m...|d/Gh1G45HEItM+byD...|              99.0| null|     3203827.5|
|+++VWXcqX/471v55m...|t9AEh7tBjjp3FKWzm...|               1.0| null| 9.265492805E8|
|+++ZYfC4XA28yQEQd...|Rf14vs+327ZlNsQvX...|             100.0| null|    2.101449E7|
|++/UTl2mwL/J484yK...|LxuWEPpbFk7A2Iind...|             100.0| null|   4.1990781E7|
|++0fvj6XSxmeV4no1...|mrnoh9uP16vfcYNyh...|              90.0| null|   6.3206213E7|
|++0fvj6XSxmeV4no1...|wZ+qC6VPjjuYqiJ3K...|              10.0| null|   6.0012793E7|
|++2aG5ztBIvGl/E0s...|cP/L5EFxY1um7QZpw...|              10.0| null|   8.259

                                                                                

In [12]:
# Junto con completar la data con los valores de oscuridad para sociedades completas, se agrega un ponderador. De esta forma, si una entidad tiene dos socios con oscuridad y la composicion suma diferente a 100%, se pondera proporcionalmente para que sumen 100%.

In [13]:
#agregar la ponderacion adecuada

spark.sql("select RUT_SOCIEDAD,RUT_SOCIO,PORCENTAJE_CAPITAL, PAT_2022 from aux where nulos is null").createOrReplaceTempView("aux")
spark.sql("select * from aux").show()
spark.sql("select RUT_SOCIEDAD as CONT_RUT, SUM(PORCENTAJE_CAPITAL*PAT_2022*0.01) as othervalue from aux group by RUT_SOCIEDAD").createOrReplaceTempView("pat")
spark.sql("select * from pat").show()
spark.sql("select CONT_RUT, othervalue as Value  from pat").createOrReplaceTempView("pat")
spark.sql("select * from pat").show()
spark.sql("select pat.CONT_RUT as CONT_RUT,Value , ponderador from pat left join composicion on pat.CONT_RUT=composicion.CONT_RUT order by pat.CONT_RUT desc ").createOrReplaceTempView("pat")
spark.sql("select * from pat").show()
spark.sql("select CONT_RUT, Value /ponderador*100 as Value from pat ").createOrReplaceTempView("pat")


                                                                                

+--------------------+--------------------+------------------+--------------+
|        RUT_SOCIEDAD|           RUT_SOCIO|PORCENTAJE_CAPITAL|      PAT_2022|
+--------------------+--------------------+------------------+--------------+
|+++UeiptXaAQD74N1...|5jgmhcvBN2sVp7Gh5...|              72.0| 2.147483647E9|
|+++UeiptXaAQD74N1...|QemGi6c6gkD3rdstn...|              28.0|   7.5308125E7|
|+++VWXcqX/471v55m...|d/Gh1G45HEItM+byD...|              99.0|     3203827.5|
|+++VWXcqX/471v55m...|t9AEh7tBjjp3FKWzm...|               1.0| 9.265492805E8|
|+++ZYfC4XA28yQEQd...|Rf14vs+327ZlNsQvX...|             100.0|    2.101449E7|
|++/UTl2mwL/J484yK...|LxuWEPpbFk7A2Iind...|             100.0|   4.1990781E7|
|++0fvj6XSxmeV4no1...|mrnoh9uP16vfcYNyh...|              90.0|   6.3206213E7|
|++0fvj6XSxmeV4no1...|wZ+qC6VPjjuYqiJ3K...|              10.0|   6.0012793E7|
|++2aG5ztBIvGl/E0s...|cP/L5EFxY1um7QZpw...|              10.0|   8.2592151E7|
|++2aG5ztBIvGl/E0s...|3sTU9CV92iDjQpnyW...|              40.0| 8

                                                                                

+--------------------+---------------+
|            CONT_RUT|     othervalue|
+--------------------+---------------+
|+++UeiptXaAQD74N1...|1.56727450084E9|
|+++VWXcqX/471v55m...|  1.243728203E7|
|+++ZYfC4XA28yQEQd...|     2.101449E7|
|++/UTl2mwL/J484yK...|    4.1990781E7|
|++0fvj6XSxmeV4no1...|    6.2886871E7|
|++2aG5ztBIvGl/E0s...|  8.068689286E8|
|++40/vrBPOK6RXSL8...|    3.3566024E7|
|++4L5alWpdKm9Bt8E...|  6.688419225E7|
|++4bDjJxZJPO468EH...|  2.147483647E9|
|++500+9A6Vmu8wvdi...|    6.1052051E7|
|++7xFy/9FbLHzCtmn...|    1.3088094E7|
|++7yFnLR2jk0Wr0e8...|7.87360835675E8|
|++7yc2t/5eFjyGHnT...|    1.8862183E7|
|++8yrddEhyjEVxjMJ...|    1.7857147E7|
|++ADTgoTIuaqdg7q5...|      3873444.0|
|++ASy6KjK+xIa2zAI...|   8.80692393E8|
|++AiMoL6dLpbU4OeP...|  1.065433326E9|
|++AidU2ndM/UGI/RD...|    1.7915835E8|
|++BCOAZ6qEHTeUBtw...|    3.5178348E7|
|++C5vdn7A2eRP1B3g...|  1.266298685E8|
+--------------------+---------------+
only showing top 20 rows



                                                                                

+--------------------+---------------+
|            CONT_RUT|          Value|
+--------------------+---------------+
|+++UeiptXaAQD74N1...|1.56727450084E9|
|+++VWXcqX/471v55m...|  1.243728203E7|
|+++ZYfC4XA28yQEQd...|     2.101449E7|
|++/UTl2mwL/J484yK...|    4.1990781E7|
|++0fvj6XSxmeV4no1...|    6.2886871E7|
|++2aG5ztBIvGl/E0s...|  8.068689286E8|
|++40/vrBPOK6RXSL8...|    3.3566024E7|
|++4L5alWpdKm9Bt8E...|  6.688419225E7|
|++4bDjJxZJPO468EH...|  2.147483647E9|
|++500+9A6Vmu8wvdi...|    6.1052051E7|
|++7xFy/9FbLHzCtmn...|    1.3088094E7|
|++7yFnLR2jk0Wr0e8...|7.87360835675E8|
|++7yc2t/5eFjyGHnT...|    1.8862183E7|
|++8yrddEhyjEVxjMJ...|    1.7857147E7|
|++ADTgoTIuaqdg7q5...|      3873444.0|
|++ASy6KjK+xIa2zAI...|   8.80692393E8|
|++AiMoL6dLpbU4OeP...|  1.065433326E9|
|++AidU2ndM/UGI/RD...|    1.7915835E8|
|++BCOAZ6qEHTeUBtw...|    3.5178348E7|
|++C5vdn7A2eRP1B3g...|  1.266298685E8|
+--------------------+---------------+
only showing top 20 rows





+--------------------+--------------+----------+
|            CONT_RUT|         Value|ponderador|
+--------------------+--------------+----------+
|zzzEfsBUrus3+Lpvp...|  1.49550806E8|     100.0|
|zzydn/q5KUdW0Pcyx...|  1.36950192E8|     100.0|
|zzycSmylbx7abTn6L...|  4.21330307E8|     100.0|
|zzy4xjR3eyeQeDulk...|   3.9403952E7|     100.0|
|zzxn+AXDxM7o3dq6t...| 1.642731105E8|     100.0|
|zzxiVVlRVmrHIZn8Z...|  1.14940842E8|     100.0|
|zzw82ZqW8spQp27re...|   5.3330875E8|     100.0|
|zzvkn2WcBhsERkZz5...|  4.14586045E7|     100.0|
|zzuRJI0/j+9Kg8XUq...| 2.205758535E8|     100.0|
|zzttlu6AW7fBR+QT5...|   2.3059132E7|     100.0|
|zztiahFY7OZufBZsU...|  5.80273356E7|     100.0|
|zztPoAxGDNkBcKOHb...|  1.75259232E8|     100.0|
|zztIN7SsWnMIqkfIp...|   2.7279001E7|     100.0|
|zzsvRDNgYgL8X9YBS...|  9.73684485E7|     100.0|
|zzsbM16tZoOEyo1Vh...|   3.0671968E7|     100.0|
|zzsHJY/ePJB1G/GW0...|   4.5605857E7|     100.0|
|zzry8vr0qh0CNTySY...|2.0128139775E8|     100.0|
|zzruxQtAd2RIDmgY6..

                                                                                

In [14]:
tabla_auxiliar=spark.sql('select * from pat')
tabla_auxiliar=tabla_auxiliar.toPandas()
tabla_auxiliar.to_csv("/home/cdsw/data/processed/patrimonio/patrimonio_2022_aux.csv", index=False)

                                                                                

In [15]:
pat_aux=spark.sql("select * from pat").toPandas()
pat_aux['iterations']=0

                                                                                

In [16]:
pat_aux

Unnamed: 0,CONT_RUT,Value,iterations
0,zzzEfsBUrus3+LpvpwcHig==,1.495508e+08,0
1,zzydn/q5KUdW0PcyxWSyzg==,1.369502e+08,0
2,zzycSmylbx7abTn6LsudIw==,4.213303e+08,0
3,zzy4xjR3eyeQeDulkMLsBw==,3.940395e+07,0
4,zzxn+AXDxM7o3dq6t1sAbA==,1.642731e+08,0
...,...,...,...
476448,++0fvj6XSxmeV4no1390og==,6.288687e+07,0
476449,++/UTl2mwL/J484yKcuZrg==,4.199078e+07,0
476450,+++ZYfC4XA28yQEQdCy5yw==,2.101449e+07,0
476451,+++VWXcqX/471v55mS2YiA==,1.243728e+07,0


In [17]:
iter=10

In [18]:
for iteration in range(1,iter):
    temp=spark.read.options(header=True,inferSchema=True,delimiter=",").csv("/home/cdsw/data/processed/patrimonio/patrimonio_2022_aux.csv")
    temp.createOrReplaceTempView("pat")                                                       
   # spark.read.parquet("abfs://data@datalakesii.dfs.core.windows.net/DatoOrigen/lr-629/Oscuridad/intermedia/oscuridad_aux").createOrReplaceTempView("oscuridad")
    
    spark.sql("select * from sociedad left join pat on sociedad.RUT_SOCIO=pat.CONT_RUT order by sociedad.RUT_SOCIEDAD asc").createOrReplaceTempView("sociedad")
    spark.sql("select RUT_SOCIEDAD, RUT_SOCIO, PORCENTAJE_CAPITAL, CASE WHEN PAT_2022 is null THEN Value ELSE PAT_2022 END AS PAT_2022 from sociedad").createOrReplaceTempView("sociedad")
    spark.sql("select RUT_SOCIEDAD, RUT_SOCIO, PORCENTAJE_CAPITAL, PAT_2022 from sociedad").createOrReplaceTempView("sociedad")
    spark.sql("select RUT_SOCIEDAD as RUT_SOCIEDAD1, count(*) as nulos from sociedad where PAT_2022 is null group by RUT_SOCIEDAD order by RUT_SOCIEDAD ASC").createOrReplaceTempView("aux")
    spark.sql("select RUT_SOCIEDAD,RUT_SOCIO,PORCENTAJE_CAPITAL, nulos, PAT_2022 from sociedad left join aux on sociedad.RUT_SOCIEDAD=aux.RUT_SOCIEDAD1 order by RUT_SOCIEDAD asc ").createOrReplaceTempView("aux")
    spark.sql("select RUT_SOCIEDAD,RUT_SOCIO,PORCENTAJE_CAPITAL, PAT_2022 from aux where nulos is null").createOrReplaceTempView("aux")
    spark.sql("select RUT_SOCIEDAD as CONT_RUT, SUM(PORCENTAJE_CAPITAL*PAT_2022*0.01) as othervalue from aux group by RUT_SOCIEDAD").createOrReplaceTempView("pat")
    spark.sql("select CONT_RUT, othervalue as Value from pat").createOrReplaceTempView("pat")
    
    spark.sql("select pat.CONT_RUT as CONT_RUT, Value,ponderador from pat left join composicion on pat.CONT_RUT=composicion.CONT_RUT order by pat.CONT_RUT asc ").createOrReplaceTempView("pat")
    #spark.sql("select * from oscuridad ").show()
    spark.sql("select CONT_RUT, Value/ponderador*100 as Value from pat order by Value desc ").createOrReplaceTempView("pat")

    
    #spark.sql("SELECT * FROM oscuridad").write.mode('overwrite').format("parquet").save("abfs://data@datalakesii.dfs.core.windows.net/DatoOrigen/lr-629/Oscuridad/intermedia/oscuridad_aux")
    
    tabla_auxiliar=spark.sql('select * from pat')
    tabla_auxiliar=tabla_auxiliar.toPandas()
    tabla_auxiliar.to_csv("/home/cdsw/data/processed/patrimonio/patrimonio_2022_aux.csv", index=False)
    
    pat=spark.sql("select * from pat").toPandas()
    pat['iterations']=iteration

    pat=pat.merge(pat_aux, on = "CONT_RUT", how = "left")
    pat['iterations']=pat[["iterations_x", "iterations_y"]].min(axis=1)
    pat = pat.rename(columns={'Value_x': 'Value'})
    pat=pat[['CONT_RUT','Value','iterations']]
    pat_aux=pat.iloc[:,:]
    print(pat_aux.describe())



                                                                                

              Value     iterations
count  5.019640e+05  501964.000000
mean   2.464077e+08       0.050822
std    3.970606e+08       0.219635
min    0.000000e+00       0.000000
25%    3.494416e+07       0.000000
50%    8.734232e+07       0.000000
75%    2.530663e+08       0.000000
max    2.147484e+09       1.000000


                                                                                

              Value     iterations
count  5.038290e+05  503829.000000
mean   2.497687e+08       0.058375
std    4.017231e+08       0.251089
min    0.000000e+00       0.000000
25%    3.509012e+07       0.000000
50%    8.796122e+07       0.000000
75%    2.566358e+08       0.000000
max    2.147484e+09       2.000000


                                                                                

              Value     iterations
count  5.044050e+05  504405.000000
mean   2.508833e+08       0.062115
std    4.033789e+08       0.272906
min    0.000000e+00       0.000000
25%    3.513113e+07       0.000000
50%    8.814049e+07       0.000000
75%    2.578198e+08       0.000000
max    2.147484e+09       3.000000


                                                                                

              Value     iterations
count  5.046250e+05  504625.000000
mean   2.514201e+08       0.064014
std    4.041887e+08       0.286744
min    0.000000e+00       0.000000
25%    3.515012e+07       0.000000
50%    8.821384e+07       0.000000
75%    2.584073e+08       0.000000
max    2.147484e+09       4.000000


                                                                                

              Value     iterations
count  5.046940e+05  504694.000000
mean   2.516049e+08       0.064796
std    4.045011e+08       0.293620
min    0.000000e+00       0.000000
25%    3.515608e+07       0.000000
50%    8.823973e+07       0.000000
75%    2.585590e+08       0.000000
max    2.147484e+09       5.000000


                                                                                

              Value     iterations
count  5.047280e+05  504728.000000
mean   2.516967e+08       0.065259
std    4.046939e+08       0.298374
min    0.000000e+00       0.000000
25%    3.515713e+07       0.000000
50%    8.824359e+07       0.000000
75%    2.586183e+08       0.000000
max    2.147484e+09       6.000000


                                                                                

              Value     iterations
count  5.047380e+05  504738.000000
mean   2.517106e+08       0.065396
std    4.047263e+08       0.299964
min    0.000000e+00       0.000000
25%    3.515734e+07       0.000000
50%    8.824359e+07       0.000000
75%    2.586290e+08       0.000000
max    2.147484e+09       7.000000


                                                                                

              Value     iterations
count  5.047470e+05  504747.000000
mean   2.517068e+08       0.065552
std    4.047237e+08       0.302030
min    0.000000e+00       0.000000
25%    3.515816e+07       0.000000
50%    8.824217e+07       0.000000
75%    2.586191e+08       0.000000
max    2.147484e+09       8.000000


                                                                                

              Value     iterations
count  5.047540e+05  504754.000000
mean   2.517038e+08       0.065676
std    4.047217e+08       0.303855
min    0.000000e+00       0.000000
25%    3.515887e+07       0.000000
50%    8.824152e+07       0.000000
75%    2.586095e+08       0.000000
max    2.147484e+09       9.000000


In [19]:
# Convertir 'oscuridad_aux' a una lista de filas (tuplas)
rows = [tuple(x) for x in pat_aux.to_numpy()]

# Crear un DataFrame de Spark a partir de las filas

from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType

# Especificar el esquema manualmente
schema = StructType([
    StructField("CONT_RUT", StringType(), True),
    StructField("Value", DoubleType(), True),
    StructField("iterations", DoubleType(), True)
])
pat_aux = spark.createDataFrame(rows, schema=schema)
pat_aux.write.mode('overwrite').format("parquet").save("abfs://data@datalakesii.dfs.core.windows.net/DatoOrigen/lr-629/Agrupacion_empresas_similares/patrimonio/patrimonios_completos")

                                                                                

In [20]:
pat_aux.filter(pat_aux['Value'] > 0).count()

                                                                                

504752

In [21]:
pat_aux.filter(pat_aux['Value'] == 0).count()

                                                                                

2

In [22]:
sociedad=spark.sql("select * from sociedad")
sociedad.write.mode('overwrite').format("parquet").save("abfs://data@datalakesii.dfs.core.windows.net/DatoOrigen/lr-629/Agrupacion_empresas_similares/patrimonio/patrimonios_incompletos")

                                                                                