### IVA intra grupo y extra grupo en comunidades definidas por persona natural asociada

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
import pyspark
import pandas as pd
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

from pyspark_dist_explore import hist
import matplotlib.pyplot as plt
from pyspark.sql.types import StringType,TimestampType


In [5]:

spark = SparkSession.builder \
  .appName("Test")  \
  .config("spark.yarn.access.hadoopFileSystems","abfs://data@datalakesii.dfs.core.windows.net/") \
  .config("spark.executor.memory", "24g") \
  .config("spark.driver.memory", "12g")\
  .config("spark.executor.cores", "12") \
  .config("spark.driver.maxResultSize", "12g") \
  .getOrCreate()

warnings.filterwarnings('ignore', category=DeprecationWarning)
sc=spark.sparkContext
sc.setLogLevel ('ERROR')
spark.conf.set("spark.sql.parquet.int96RebaseModeInRead", "CORRECTED")



Leemos la data de los arcos comerciales.

In [6]:
spark.read.parquet("abfs://data@datalakesii.dfs.core.windows.net/DatosOrigen/LibSDF/JBA_ARCOS_E").createOrReplaceTempView("arcos")
spark.sql("SELECT PARU_RUT_E0, PARU_RUT_E2, Monto_IVA FROM arcos where Monto_IVA>0 order by PARU_RUT_E2 asc").createOrReplaceTempView("arcos")

In [7]:
spark.sql('select * from arcos').show()



+--------------------+--------------------+---------+
|         PARU_RUT_E0|         PARU_RUT_E2|Monto_IVA|
+--------------------+--------------------+---------+
|NaXazswsZodNytTsS...|+++0Re2TkLe14DpcD...|     4890|
|0Mc7YtTQj2BeLrOxn...|+++4/3jzUwtDPSSo3...|    30122|
|o+qwedorOHCucSLN1...|+++4/3jzUwtDPSSo3...|    29830|
|/+eRts6KZwhgL3x63...|+++4/3jzUwtDPSSo3...|  2281443|
|9toOnP1MK7aI8vDTK...|+++4/3jzUwtDPSSo3...|    70129|
|mNKzwe6Z5TAI7rkgK...|+++4/3jzUwtDPSSo3...|  1092499|
|Uu7ofaB+1aDgfENMi...|+++4/3jzUwtDPSSo3...|  2399086|
|awn1vGJ0+HsBiLukr...|+++4/3jzUwtDPSSo3...| 30457514|
|Va5E3UGP9caIPuMjY...|+++4/3jzUwtDPSSo3...|   764750|
|8cFyR319RFkoK3Srp...|+++4/3jzUwtDPSSo3...|     6163|
|6WKXhsWH5RlOKIi6v...|+++4/3jzUwtDPSSo3...|  1787519|
|NEDDI3b5bkXCP+ae8...|+++4/3jzUwtDPSSo3...| 22352466|
|vMIN0af1h36ymVDyc...|+++4/3jzUwtDPSSo3...|     4332|
|xir6OxC/tCKNSXtX6...|+++4/3jzUwtDPSSo3...|  1414873|
|M9O4fpIGf1Bnwn/XI...|+++4/3jzUwtDPSSo3...|  2878980|
|LD33vBbV2xSePcVnE...|+++4/3

                                                                                

Ahora la data de las sociedades que incluye las caracteristicas asociadadas

In [8]:
contaminados = spark.read.options(header=True,inferSchema=True,delimiter=",").csv("/home/cdsw/data/processed/comunidades_persona_natural/comunidades_natural_sociedades.csv")
contaminados.createOrReplaceTempView("contaminados")

                                                                                

In [9]:
spark.sql('select * from contaminados').show()

+--------------------+--------------------+------------------+--------------------+--------------+-----------+--------------------+------------+--------+--------------+
|       SOCIO_NATURAL|SOCIEDAD_RELACIONADA|PORCENTAJE_CAPITAL|               score|total_pago_f29|   IVA_neto|     unidad_regional|n_documentos|lifetime|alerta_inicial|
+--------------------+--------------------+------------------+--------------------+--------------+-----------+--------------------+------------+--------+--------------+
|+++Dv3VdgQyVqRTgl...|jIjACgvcsprjqLeup...|              24.0|0.052408941789938795|   9.6336447E7|   -5282491|                null|         4.0|   530.0|             0|
|+++YLFFod6As3+EIq...|XdNCugfCv/JNxkjsW...|             16.67| 0.06730458372836294|   1.7331782E7|  -12565236|6TA UNIDAD REGION...|        10.0|  3451.0|             0|
|+++cWi/r31w33XdNY...|dqMQkYCsU9PCaLwUT...|             100.0|  0.0461122114462416|        9500.0|    -955065|                null|         6.0|   633.0|  

Ahora hacemos el cruce de los datos para asignar la comunidad a cada entidad de los arcos comerciales, tanto emisor como receptor. 

In [10]:
spark.sql('select PARU_RUT_E0,contaminados.SOCIO_NATURAL as comunidad_emisor, PARU_RUT_E2, Monto_IVA from arcos left join contaminados on arcos.PARU_RUT_E0=contaminados.SOCIEDAD_RELACIONADA').createOrReplaceTempView('arcos')

In [11]:
spark.sql('select PARU_RUT_E0 as emisor,comunidad_emisor,PARU_RUT_E2 as receptor,contaminados.SOCIO_NATURAL as comunidad_receptor, Monto_IVA from arcos  left join contaminados on arcos.PARU_RUT_E2=contaminados.SOCIEDAD_RELACIONADA').createOrReplaceTempView('arcos')

In [13]:
spark.sql('select * from arcos where comunidad_emisor is not null or comunidad_receptor is not null').createOrReplaceTempView('arcos')
spark.sql('select * from arcos').show()


[Stage 29:>                                                         (0 + 1) / 1]

+--------------------+--------------------+--------------------+--------------------+---------+
|              emisor|    comunidad_emisor|            receptor|  comunidad_receptor|Monto_IVA|
+--------------------+--------------------+--------------------+--------------------+---------+
|DbYpAClXKcn2JRDAg...|8QGOdi60sf6fihvmi...|++0TZZ4dLrh7Mcrrq...|G1NPZeFPbEvK63iXK...|    11800|
|DbYpAClXKcn2JRDAg...|8QGOdi60sf6fihvmi...|++0TZZ4dLrh7Mcrrq...|HuScCDbt9021COD28...|    11800|
|DbYpAClXKcn2JRDAg...|xuLC5pFv8KGOVUCL9...|++0TZZ4dLrh7Mcrrq...|G1NPZeFPbEvK63iXK...|    11800|
|DbYpAClXKcn2JRDAg...|xuLC5pFv8KGOVUCL9...|++0TZZ4dLrh7Mcrrq...|HuScCDbt9021COD28...|    11800|
|FWtmdWaq9VxWjhrzv...|6Wmf/Aknt7B4Ze/Tf...|++0TZZ4dLrh7Mcrrq...|G1NPZeFPbEvK63iXK...|    37278|
|FWtmdWaq9VxWjhrzv...|6Wmf/Aknt7B4Ze/Tf...|++0TZZ4dLrh7Mcrrq...|HuScCDbt9021COD28...|    37278|
|FWtmdWaq9VxWjhrzv...|6Wmf/Aknt7B4Ze/Tf...|++0TZZ4dLrh7Mcrrq...|G1NPZeFPbEvK63iXK...|    37278|
|FWtmdWaq9VxWjhrzv...|6Wmf/Aknt7B4Ze/Tf.

                                                                                

Se calcula la emision intragrupo, extragrupo, recepcion intragrupo y extragrupo

In [15]:
spark.sql('select comunidad_emisor as comunidad, sum(Monto_IVA) as emision_intragrupo from arcos where comunidad_emisor=comunidad_receptor and comunidad_emisor is not null group by comunidad_emisor order by comunidad_emisor asc').createOrReplaceTempView('emision_intra')
spark.sql('select comunidad_emisor as comunidad, sum(Monto_IVA) as emision_extragrupo from arcos where comunidad_emisor<>comunidad_receptor and comunidad_emisor is not null group by comunidad_emisor order by comunidad_emisor asc').createOrReplaceTempView('emision_extra')

In [16]:
spark.sql('select comunidad_receptor as comunidad, sum(Monto_IVA) as recepcion_intragrupo from arcos where comunidad_emisor=comunidad_receptor and comunidad_receptor is not null group by comunidad_receptor order by comunidad_receptor asc').createOrReplaceTempView('recepcion_intra')
spark.sql('select comunidad_receptor as comunidad, sum(Monto_IVA) as recepcion_extragrupo from arcos where comunidad_emisor<>comunidad_receptor and comunidad_receptor is not null group by comunidad_receptor order by comunidad_receptor asc').createOrReplaceTempView('recepcion_extra')

In [17]:
spark.sql('select emision_intra.comunidad as comunidad, emision_intragrupo, emision_extragrupo from emision_intra left join emision_extra on emision_extra.comunidad=emision_intra.comunidad order by emision_intra.comunidad asc').createOrReplaceTempView('emision')
spark.sql('select recepcion_intra.comunidad as comunidad, recepcion_intragrupo, recepcion_extragrupo from recepcion_intra left join recepcion_extra on recepcion_extra.comunidad=recepcion_intra.comunidad order by recepcion_intra.comunidad asc').createOrReplaceTempView('recepcion')

In [18]:
spark.sql('select case when emision_intra.comunidad is null then emision_extra.comunidad else emision_intra.comunidad end as com, emision_intragrupo, emision_extragrupo from emision_intra full outer join emision_extra on emision_extra.comunidad=emision_intra.comunidad').createOrReplaceTempView('emision')
spark.sql('select * from emision').show()
spark.sql('select case when recepcion_intra.comunidad is null then recepcion_extra.comunidad else recepcion_intra.comunidad end as com, recepcion_intragrupo, recepcion_extragrupo from recepcion_intra full outer join recepcion_extra on recepcion_extra.comunidad=recepcion_intra.comunidad order by recepcion_intra.comunidad asc').createOrReplaceTempView('recepcion')
spark.sql('select * from recepcion order by recepcion_intragrupo desc').show()

                                                                                

+--------------------+------------------+------------------+
|                 com|emision_intragrupo|emision_extragrupo|
+--------------------+------------------+------------------+
|++Caq2tCTZVamYIfc...|              null|          13778449|
|++KCc/MVDP3ZynZeM...|              null|           1026000|
|++ZMxopSezccQ68rh...|           3498780|            716429|
|++hTFfaZMZtHqOlN7...|              null|            131725|
|+/IF5eqZiMx47fQ7S...|              null|          84190386|
|+/R99eCcq/2nAFv0F...|              null|           4959000|
|+/pGVOWOBWPL2SXJp...|              null|          62159260|
|+08zX8z3f1EdujiII...|              null|         636858052|
|+0JD7POp6ffyp0PJt...|              null|          11762669|
|+0Nc87eI/yCDtlot+...|              null|            570000|
|+11BMfiAh5lEdW5RS...|          39160067|          94973200|
|+1Ze8nOcYx8/47/LU...|              null|            631903|
|+1qFMTZgSWYfxcKtF...|              null|          30546794|
|+1ryifBASmmOuAhSS...|  



+--------------------+--------------------+--------------------+
|                 com|recepcion_intragrupo|recepcion_extragrupo|
+--------------------+--------------------+--------------------+
|UfGg9ixrG8p2feWLm...|       4083634963740|      18445396318245|
|uh8tD5PEoaxuj3cnb...|       4037774442074|      17790641801428|
|waBLUj802RuChBHz1...|       4037680963610|      17788875126265|
|ztS6xIJHXpmN97spc...|       4037680963610|      17788744642682|
|1t4zCVvjFzlmBNgTX...|        997737051059|       9873467982932|
|hn4wPfnKswm2QpmRU...|        937136159623|       5792937204909|
|R7lHGXYbS/ik1uaOR...|        936993029811|       5792301325366|
|fLcCg9PTTyZVGoHOo...|        736267729078|       3632091518355|
|LThgFHnXs6qXR7CYM...|        629415347925|       4796368743849|
|GUtDd660kpBvQ6xOQ...|        629415347925|       4794223400408|
|ir6Wqb/x+bmjr7k5V...|        508097620400|       1986352594567|
|wMcdnmkumdkkLEIpS...|        401047958414|       2762595581988|
|eSnTpH2uJFseALEoK...|   

                                                                                

In [19]:
aux=spark.sql('select case when emision.com is null then recepcion.com else emision.com end as comunidad, emision_intragrupo, emision_extragrupo,  recepcion_intragrupo, recepcion_extragrupo from emision full outer join recepcion on emision.com=recepcion.com')
aux.show()
aux.createOrReplaceTempView('iva_comunidad')



+--------------------+------------------+------------------+--------------------+--------------------+
|           comunidad|emision_intragrupo|emision_extragrupo|recepcion_intragrupo|recepcion_extragrupo|
+--------------------+------------------+------------------+--------------------+--------------------+
|++Caq2tCTZVamYIfc...|              null|          13778449|                null|            88667307|
|++KCc/MVDP3ZynZeM...|              null|           1026000|                null|             1509331|
|++ZMxopSezccQ68rh...|           3498780|            716429|             3498780|            57674355|
|++hTFfaZMZtHqOlN7...|              null|            131725|                null|             8907030|
|+/GP2pgvQ+KWPs5dT...|              null|              null|                null|             3154320|
|+/IF5eqZiMx47fQ7S...|              null|          84190386|                null|            39164788|
|+/R99eCcq/2nAFv0F...|              null|           4959000|             

                                                                                

Tambien definimos algunas tasas de comparacion entre montos de IVA

In [20]:
spark.sql('select comunidad, emision_extragrupo/(emision_intragrupo+emision_extragrupo)*100 as perct_emision_extra,recepcion_extragrupo/(recepcion_intragrupo+recepcion_extragrupo)*100 as perct_recepcion_extra,emision_extragrupo, recepcion_extragrupo,emision_intragrupo, emision_extragrupo/emision_intragrupo as tasa_emision_extra_intra from iva_comunidad').createOrReplaceTempView('iva_comunidad')

In [21]:
spark.sql('select * from iva_comunidad').show()



+--------------------+-------------------+---------------------+------------------+--------------------+------------------+------------------------+
|           comunidad|perct_emision_extra|perct_recepcion_extra|emision_extragrupo|recepcion_extragrupo|emision_intragrupo|tasa_emision_extra_intra|
+--------------------+-------------------+---------------------+------------------+--------------------+------------------+------------------------+
|++Caq2tCTZVamYIfc...|               null|                 null|          13778449|            88667307|              null|                    null|
|++KCc/MVDP3ZynZeM...|               null|                 null|           1026000|             1509331|              null|                    null|
|++ZMxopSezccQ68rh...|  16.99628654237548|     94.2805285359333|            716429|            57674355|           3498780|       0.204765375359411|
|++hTFfaZMZtHqOlN7...|               null|                 null|            131725|             8907030|  

                                                                                

Ahora ya podemos agregar los datos de IVA del grupo a la estadistica general por grupo

In [22]:
data_comunidades = spark.read.options(header=True,inferSchema=True,delimiter=",").csv("/home/cdsw/data/processed/comunidades_persona_natural/estadistica_comunidades_persona_natural.csv")
data_comunidades.createOrReplaceTempView("data_comunidades")

                                                                                

In [23]:
data=spark.sql('select * from data_comunidades left join iva_comunidad on data_comunidades.SOCIO_NATURAL=iva_comunidad.comunidad')
data=data.toPandas()
data=data.drop('comunidad', axis=1)
data=data.sort_values(by='promedio_score', ascending=False)
data.head(15)

                                                                                ]]

Unnamed: 0,SOCIO_NATURAL,promedio_score,miembros,porcentaje_capital_promedio,contaminados_iniciales,promedio_total_pago_f29,nulos_total_pago_f29,promedio_IVA_neto,nulos_IVA_neto,promedio_n_documentos,...,promedio_lifetime,nulos_lifetime,moda_unidad_regional,frecuencia_unidad_regional,perct_emision_extra,perct_recepcion_extra,emision_extragrupo,recepcion_extragrupo,emision_intragrupo,tasa_emision_extra_intra
693976,EgncfzLr96E5YqclN5apDA==,1.000108,1,100.0,0,1018068.0,0,-601197688.0,0,299.0,...,2324.0,0,SANTIAGO CENTRO,1,,,,940138.0,,
357414,f7Itm9gvmRU9i11YqRwMCg==,1.000082,1,50.0,0,12834338.0,0,-8162942.0,0,833.0,...,1064.0,0,SANTIAGO ORIENTE,1,,,526099062.0,,,
699707,nNhyGxkgL+Kv635SYh8oSA==,1.000082,1,50.0,0,12834338.0,0,-8162942.0,0,833.0,...,1064.0,0,SANTIAGO ORIENTE,1,,,526099062.0,,,
539427,HurmT3qxIftfZhAkan1cug==,1.000079,1,100.0,0,0.0,0,484553.0,0,,...,1371.0,0,SANTIAGO ORIENTE,1,,,,418190.0,,
234988,oeeV712hCnnCh5/wSK9Jtg==,1.000064,1,100.0,1,82553.0,0,-9523758.0,0,24.0,...,719.0,0,SANTIAGO NORTE,1,,,17456201.0,7932443.0,,
736539,MZITWUtgA03WDt4EvpmZtA==,1.00006,1,100.0,0,2282.0,0,6523521.0,0,187.0,...,1526.0,0,SANTIAGO CENTRO,1,,,265607747.0,267287537.0,,
590788,Ef3Ia+0Fyw+jiMd2BPiJjQ==,1.000054,1,100.0,0,,1,-16665332.0,0,9.0,...,906.0,0,SANTIAGO PONIENTE,1,,,11039023.0,2941123.0,,
437452,QI2rHSCeV9OZEnBM4/LDFQ==,1.000052,2,100.0,2,0.0,0,-55527874.5,0,52.5,...,685.0,0,SANTIAGO ORIENTE,2,99.054493,99.314263,139335234.0,192622000.0,1330000.0,104.763334
1000793,m75c+EoweYx/n0rqlB0/BQ==,1.000051,2,100.0,0,59438.0,0,-8729223.0,0,82.0,...,594.0,0,Sin Moda,0,,,128944126.0,182496899.0,,
661753,7989PjN7FYatrx6KTdwJDg==,1.000047,5,80.0,5,0.0,0,-57348364.8,0,19.2,...,723.0,0,SANTIAGO ORIENTE,5,75.950681,0.100479,260109618.0,82840.0,82362122.0,3.158122


Finalmente guardamos el archivo

In [24]:

data.to_csv('/home/cdsw/data/processed/comunidades_persona_natural/estadistica_comunidades_persona_natural_with_iva_balance.csv', index=False)