In [0]:
%run "./Includes/dependencies"

In [0]:
# Session setup: select the LEGACY time parser policy and empty the Spark cache
# so the notebook starts from a clean state.
spark.conf.set('spark.sql.legacy.timeParserPolicy', 'LEGACY')
spark.catalog.clearCache()

## 1. Carga de información

### Carga de toda la información

In [0]:
# Load every bronze parquet source into a notebook-level variable whose name is
# given by the entry's "dataframe_name", and print its row count.
for bronze_path in bronze_paths:
    df_name = bronze_path["dataframe_name"]
    parquet_reader = spark.read.format("parquet").option("recursiveFileLookup", "true")
    loaded_df = parquet_reader.load(bronze_path["path"])
    globals()[df_name] = loaded_df
    print(f'{df_name}\t{loaded_df.count()}')

In [0]:
# Tag every row of each bronze DataFrame with the path of the file it was read
# from (stored in the 'origen' column).
for _df_name in (
    'contacto_df',
    'donaciones_df',
    'grados_df',
    'comunidad_df',
    'eventos_df',
    'revista_df',
    'laboral_df',
    'relaciones_df',
):
    globals()[_df_name] = globals()[_df_name].withColumn('origen', F.input_file_name())

In [0]:
#display(donaciones_df)

### Conteo de registros por origen

In [0]:
# Record count per source file ('origen') across all bronze DataFrames.
schema = StructType([
    StructField('origen', StringType(), True),
    StructField('conteo_registros', LongType(), False),
])

# Start from an empty frame and append one small aggregate per bronze source.
conteos_df = spark.createDataFrame([], schema=schema)
for source in bronze_paths:
    source_df = globals()[source["dataframe_name"]]
    per_origin_df = source_df.select('origen').groupBy('origen').count()
    per_origin_df = per_origin_df.select(
        F.col('origen'),
        F.col('count').alias('conteo_registros'),
    )
    per_origin_df = per_origin_df.orderBy(F.col('origen'), F.col('conteo_registros'))
    conteos_df = conteos_df.union(per_origin_df)

display(conteos_df)


origen,conteo_registros
dbfs:/mnt/donaciones/Bronze/INFO_CONTACTO.parquet,324511
dbfs:/mnt/donaciones/Bronze/INFO_DONACIONES.parquet,120145
dbfs:/mnt/donaciones/Bronze/INFO_GRADOS.parquet,150507
dbfs:/mnt/donaciones/Bronze/INFO_INTERACCION_COMUNIDAD.parquet,72584
dbfs:/mnt/donaciones/Bronze/INFO_INTERACCION_EVENTOS.parquet,382044
dbfs:/mnt/donaciones/Bronze/INFO_INTERACCION_REVISTA.parquet,29347
dbfs:/mnt/donaciones/Bronze/INFO_LABORAL.parquet,324511
dbfs:/mnt/donaciones/Bronze/INFO_RELACIONES.parquet,173194


### Conteo de nulos por origen y columna

In [0]:
# Normalise the literal string 'NULL' to a real null in every column of every
# bronze DataFrame, writing the cleaned frame back under the same variable name.
for bronze_path in bronze_paths:
    df_name = bronze_path["dataframe_name"]
    print(df_name)
    source_df = globals()[df_name]
    # Build all replacements in ONE projection: calling withColumn once per
    # column adds a projection per call and inflates the query plan.
    cleaned_columns = [
        F.when(F.col(column) == 'NULL', None).otherwise(F.col(column)).alias(column)
        for column in source_df.columns
    ]
    # Single write-back per DataFrame (previously repeated on every column).
    globals()[df_name] = source_df.select(cleaned_columns)

In [0]:

# Null statistics per source file and column, accumulated into `nullSdf`.
schema = StructType([
    StructField("source", StringType(), True),
    StructField("col_name", StringType(), True),
    StructField("count", IntegerType(), True),
    StructField("count_null", IntegerType(), True),
    StructField("precent_null", FloatType(), True),
])

nullSdf = spark.createDataFrame([], schema)

for bronze_path in bronze_paths:
    print(bronze_path["dataframe_name"])
    copy_df = globals()[bronze_path["dataframe_name"]]
    # One pass per distinct source file found in this DataFrame.
    source_list = copy_df.select('origen').distinct().collect()
    for source_item in source_list:
        # `null_percent` is not defined in this notebook (presumably provided by
        # ./Includes/dependencies - confirm). The select below relies on it
        # returning col_name, count, count_null and precent_null columns.
        # F.col / F.lit are used to match the `F.` namespace used by every
        # other cell instead of depending on bare `col` / `lit` names.
        tempSdf = null_percent(
            copy_df.where(F.col('origen') == source_item['origen']).drop('origen')
        )
        tempSdf = tempSdf.withColumn("source", F.lit(source_item['origen']))
        tempSdf = tempSdf.select("source", "col_name", "count", "count_null", "precent_null")
        nullSdf = nullSdf.union(tempSdf)


In [0]:
# Show the null statistics grouped by source, highest null percentage first.
null_report_df = nullSdf.orderBy(F.col('source'), F.col('precent_null').desc())
display(null_report_df)

source,col_name,count,count_null,precent_null
dbfs:/mnt/donaciones/Bronze/INFO_CONTACTO.parquet,PUNTAJE_RFM,19899.0,304612.0,93.8680044744246
dbfs:/mnt/donaciones/Bronze/INFO_CONTACTO.parquet,ESTRATO,115868.0,208643.0,64.29458477524645
dbfs:/mnt/donaciones/Bronze/INFO_CONTACTO.parquet,ESTADO_CIVIL,137144.0,187367.0,57.73825848738564
dbfs:/mnt/donaciones/Bronze/INFO_CONTACTO.parquet,GENERACION,166079.0,158432.0,48.821765672041934
dbfs:/mnt/donaciones/Bronze/INFO_CONTACTO.parquet,EDAD,166223.0,158288.0,48.77739121324085
dbfs:/mnt/donaciones/Bronze/INFO_CONTACTO.parquet,FECHA_NACIMIENTO,166223.0,158288.0,48.77739121324085
dbfs:/mnt/donaciones/Bronze/INFO_CONTACTO.parquet,DIRECCION,206899.0,117612.0,36.24283922578896
dbfs:/mnt/donaciones/Bronze/INFO_CONTACTO.parquet,DEPARTAMENTO,240724.0,83787.0,25.819463746991627
dbfs:/mnt/donaciones/Bronze/INFO_CONTACTO.parquet,PAIS,240941.0,83570.0,25.7525939028261
dbfs:/mnt/donaciones/Bronze/INFO_CONTACTO.parquet,CIUDAD,241136.0,83375.0,25.69250348986629


## 2. Consolidación de información

In [0]:
# Register the donations DataFrame as the temp view "donaciones" so the SQL
# cells can query it (the consolidated view itself is created in a later cell).
donaciones_df.createOrReplaceTempView("donaciones")

In [0]:
%sql
-- Preview the rows exposed by the `donaciones` temp view.
select * from donaciones

ID_CONTACTO,ID_DONACION,ID_OPPORTUNIDAD,ID_CUENTA_CONTABLE,CUENTA_CONTABLE,ESTADO_DE_PROMESA,FECHA_DE_PAGO,MONTO_DONADO,origen
0031I00000CM12nQAD,a091I00000ZnGIWQA3,0061I00000QLCegQAH,a0e1I00000DvMLtQAN,FONDO NECESITAMOS PENSAR,Closed Won,2021-08-20,250000,dbfs:/mnt/donaciones/Bronze/INFO_DONACIONES.parquet
0031I00001D60dOQAR,a091I00000Y5Nz7QAF,0061I00000OhzGiQAJ,a0e1I00000DKTtpQAH,FONDO VAMOS PALANTE,Closed Won,2020-10-07,10000,dbfs:/mnt/donaciones/Bronze/INFO_DONACIONES.parquet
0031I00000Aj1OWQAZ,a091I00000ZnGIMQA3,0061I00000QLCebQAH,a0e1I00000DvMLtQAN,FONDO NECESITAMOS PENSAR,Closed Won,2021-08-20,50000,dbfs:/mnt/donaciones/Bronze/INFO_DONACIONES.parquet
0031I00000UvQHWQA3,a091I00000ZmCg1QAF,0061I00000QK7sBQAT,a0e1I000002PsGsQAK,FONDO PALANTE PACÍFICO,Closed Won,2021-06-15,20000,dbfs:/mnt/donaciones/Bronze/INFO_DONACIONES.parquet
0031I00000AislAQAR,a091I00000Y5NxBQAV,0061I00000OhzGgQAJ,a0e1I00000DKTtpQAH,FONDO VAMOS PALANTE,Closed Won,2020-10-06,50000,dbfs:/mnt/donaciones/Bronze/INFO_DONACIONES.parquet
0031I00000AifsHQAR,a091I00000W0diWQAR,0061I00000QK7UEQA1,a0e1I000002PsF4QAK,Quiero Estudiar Beca con Compromiso,Closed Won,2021-07-12,80000,dbfs:/mnt/donaciones/Bronze/INFO_DONACIONES.parquet
0031I00000AixcYQAR,a098W00000ZnofbQAB,0068W00000QM2Z7QAL,a0e1I00000DvcrHQAR,FONDO VAMOS PALANTE 2021,Closed Won,2021-10-15,100000,dbfs:/mnt/donaciones/Bronze/INFO_DONACIONES.parquet
0031I00000Aix9EQAR,a091I00000BgXcDQAV,0061I00000QJPNuQAP,a0e1I000002PsGsQAK,FONDO PALANTE PACÍFICO,Closed Won,2021-05-06,30000,dbfs:/mnt/donaciones/Bronze/INFO_DONACIONES.parquet
0031I00000CMJ0pQAH,a091I00000Y5NtOQAV,0061I00000OiCmJQAV,a0e1I00000DKYXAQA5,FONDO VAMOS PALANTE FRANCISCO LEAL,Closed Won,2020-10-17,8800000,dbfs:/mnt/donaciones/Bronze/INFO_DONACIONES.parquet
0031I00001FEHRFQA5,a098W00000ZnohUQAR,0068W00000QM2bmQAD,a0e1I00000DvcrHQAR,FONDO VAMOS PALANTE 2021,Closed Won,2021-10-21,500000,dbfs:/mnt/donaciones/Bronze/INFO_DONACIONES.parquet


In [0]:
%sql
-- Consolidate donations per contact: total donated amount plus the number of
-- distinct donations and distinct opportunities.
CREATE OR REPLACE TEMP VIEW donacionesconsolidado AS
SELECT
  ID_CONTACTO,
  SUM(MONTO_DONADO) AS MONTO_TOTAL,
  COUNT(DISTINCT ID_DONACION) AS DONACIONES,
  COUNT(DISTINCT ID_OPPORTUNIDAD) AS OPORTUNIDADES
FROM donaciones
GROUP BY ID_CONTACTO

In [0]:
%sql
-- Preview the per-contact consolidated donations view.
SELECT *
FROM donacionesconsolidado

ID_CONTACTO,MONTO_TOTAL,DONACIONES,OPORTUNIDADES
0031I00000CM7dvQAD,300000.0,1,1
0031I00000AizqQQAR,2370000.0,8,73
0031I00000CMDXUQA5,2395000.0,6,100
0031I00000Aiv8gQAB,15629652.0,8,46
0031I00000AiyJ0QAJ,1020000.0,3,34
0031I00000CMG0tQAH,1665000.0,3,48
0031I00000CMIPyQAP,5503924.0,18,123
0031I00000AisI6QAJ,1620000.0,17,139
0031I00000CMJP5QAP,2800000.0,3,51
0031I00000AilMCQAZ,47000000.0,4,4


In [0]:
# Expose the consolidated donations view as a DataFrame so it can be joined
# with the contact and employment data.
donacionesconsol_df= spark.sql('select * from donacionesconsolidado')

In [0]:
# Build the consolidated frame: contact data left-joined with employment data
# and with the per-contact donation totals, all keyed by ID_CONTACTO.
contacto_laboral_df = contacto_df.join(laboral_df, 'ID_CONTACTO', 'left')
# The file-origin metadata is not wanted in the consolidated result.
contacto_laboral_df = contacto_laboral_df.drop('origen')
final_df = contacto_laboral_df.join(donacionesconsol_df, 'ID_CONTACTO', 'left')

# Number of distinct contacts present after the joins.
distinct_contacts = final_df.select('ID_CONTACTO').distinct().count()
print(distinct_contacts)

In [0]:
# Preview the joined result.
# NOTE(review): the rendered rows include personal data (birth dates, street
# addresses); consider clearing this output before sharing the notebook.
display(final_df)

ID_CONTACTO,ESTADO_CIVIL,FECHA_NACIMIENTO,EDAD,GENERACION,GENERO,PAIS,CIUDAD,DEPARTAMENTO,DIRECCION,ES_GRADUADO,ES_EXALUMNO,ES_ANTIGUO,ES_ESTUDIANTE,ES_MIEMBRO_CONCEJO,ES_PADRE_MADRE,ES_DONANTE,ES_RESPONSABLE_FINANCIERO,ES_ADMINISTRATIVO,ES_PROFESOR,ES_ANTIGUO_MIEMBRO_CONSEJO_SUPERIOR,ES_DONANTE_LOS_ANDES_FOUNDATION,ES_PROFESOR_ADMINISTRATIVO_RETIRADO,VOLUNTARIO_UNIANDES,ES_EGRESADO_CERTIFICADO,ESTRATO,PUNTAJE_RFM,CARGO,NIVEL_JERARQUICO,SECTOR_ECONOMICO_1,SECTOR_ECONOMICO_2,EMPLEADOR,NIT,Empleador__c,MONTO_TOTAL,DONACIONES,OPORTUNIDADES
0031I00000AibubQAB,S - SOLTERO,1986-03-01,36.0,Generación Y (Millennials),Masculino,COLOMBIA,BOGOTA D.C.,BOGOTA,CL 56 71 54,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,,ASESOR FINANCIERO,Profesional,Actividades jurídicas y de contabilidad,"Actividades profesionales, científicas y técnicas",CARRIZOSA CONSULTORES SAS,9005276702,0011I00001PhB8jQAF,,,
0031I00000AibweQAB,,,,,Femenino,COLOMBIA,BOGOTA D.C.,BOGOTA,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,
0031I00000AibwHQAR,,,,,Femenino,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,
0031I00000AibwkQAB,,,,,Masculino,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,
0031I00000AibwLQAR,S - SOLTERO,1998-09-30,24.0,Generación Z (Centennials),Femenino,COLOMBIA,CAJICA,CUNDINAMARCA,VRD 12 MCP CAJICA CA 304 CN PUERTA DEL SOL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,,,,,,,,,,,
0031I00000AibwNQAR,,,,,Masculino,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,
0031I00000AibwsQAB,,,,,Femenino,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,
0031I00000Aibx1QAB,S - SOLTERO,1998-01-30,24.0,Generación Z (Centennials),Masculino,COLOMBIA,BOGOTA D.C.,BOGOTA,CL 147 7 B 37 AP 1303,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,6.0,111.0,,,,,,,,4000.0,1.0,1.0
0031I00000Aibx4QAB,,,,,Femenino,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,
0031I00000Aibx6QAB,,,,,Masculino,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,


### Validación de valores únicos por columna

In [0]:
# Total row count, used as the denominator of every percentage below.
total_registros = final_df.count()

# Show the value distribution (count and percentage) of each column.
# Iterate over the column NAMES: iterating the DataFrame object itself yields
# Column objects, so the heading would render the Column repr, not the name.
for column in final_df.columns:
    displayHTML(f'<h3>{column}</h3>')
    display(final_df.select(column).groupBy(column).count().withColumn('porcentaje', F.round((F.col('count') / total_registros)*100.0, 2)))


ID_CONTACTO,count,porcentaje
0031I00000CMFbUQAX,1,0.0
0031I00000UvQI8QAN,1,0.0
0031I00000H2L4FQAV,1,0.0
0031I00000tX3OaQAK,1,0.0
0031I00000tX3tWQAS,1,0.0
0031I00000rJQbWQAW,1,0.0
0031I00000rJQC8QAO,1,0.0
0031I00000rJQCnQAO,1,0.0
0031I00000jTAAfQAO,1,0.0
0031I00000UuX3fQAF,1,0.0


ESTADO_CIVIL,count,porcentaje
,187367,57.74
C - CASADO,38518,11.87
U - UNION LIBRE,1841,0.57
S - SOLTERO,95067,29.3
D - DIVORCIADO,766,0.24
P - SEPARADO,779,0.24
V - VIUDO,173,0.05


FECHA_NACIMIENTO,count,porcentaje
1996-04-10,18,0.01
1989-07-06,19,0.01
2003-02-22,9,0.0
1983-10-12,13,0.0
1984-03-02,16,0.0
1985-04-02,10,0.0
1980-02-02,6,0.0
1991-05-24,14,0.0
1993-11-09,15,0.0
1989-11-06,21,0.01


EDAD,count,porcentaje
51.0,2390,0.74
7.0,25,0.01
15.0,89,0.03
54.0,2025,0.62
-1.0,1,0.0
101.0,2,0.0
11.0,43,0.01
29.0,4600,1.42
69.0,666,0.21
42.0,3916,1.21


GENERACION,count,porcentaje
Baby Boomer,19145,5.9
,158432,48.82
Generación X,41434,12.77
Silent,1178,0.36
Grandiosa,50,0.02
Generación Z (Centennials),43227,13.32
Generación Y (Millennials),60734,18.72
Alpha,311,0.1


GENERO,count,porcentaje
,4473,1.38
Femenino,149020,45.92
Indefinido,17,0.01
Masculino,171001,52.69


PAIS,count,porcentaje
BAHAMAS,1,0.0
BELGICA,58,0.02
SUDAFRICA,8,0.0
RUSIA,2,0.0
LETONIA,1,0.0
España,2,0.0
CATAR,3,0.0
NORUEGA,14,0.0
COREA DEL SUR,9,0.0
TURQUIA,3,0.0


CIUDAD,count,porcentaje
ARMENIA,585,0.18
PRINCETON,5,0.0
OLD TOWN,2,0.0
ELLINGTON,1,0.0
SANTANDER,3,0.0
SAN PABLO,1,0.0
HIGH WYCOMBE,1,0.0
BRICEÑO,1,0.0
CHIQUIZA,2,0.0
EL ROSAL,13,0.0


DEPARTAMENTO,count,porcentaje
SANTANDER,4789,1.48
TACHIRA,11,0.0
FRIUL VENECIA JULIA,2,0.0
BASILEA CIUDAD,5,0.0
AYACUCHO,8,0.0
MECKLEMBURGO POMERANIA OCCIDENTAL,1,0.0
INGLATERRA,352,0.11
VIRGINIA OCCIDENTAL,14,0.0
SERBIA CENTRAL,1,0.0
RIO DE JANEIRO,20,0.01


DIRECCION,count,porcentaje
3318 PENA DE LOS BANOS VENUSTIANO CARRANZA,1,0.0
CL 56 85 I 06 BL 10 AP 502,1,0.0
CR 19 A 85 31 AP 301,1,0.0
CL 54 9 15 AP 202 ED JORDAN,1,0.0
CL 97 21 95,1,0.0
CR 76 A 80 72,1,0.0
CL 48 73 61,1,0.0
CL 104 A 21 77,3,0.0
CR 51 104 B 70 AP 303,2,0.0
CR 96 F 23 A 60 TO 11 AP 601,1,0.0


ES_GRADUADO,count,porcentaje
0,227938,70.24
1,96573,29.76


ES_EXALUMNO,count,porcentaje
0,316437,97.51
1,8074,2.49


ES_ANTIGUO,count,porcentaje
0,306003,94.3
1,18508,5.7


ES_ESTUDIANTE,count,porcentaje
0,300340,92.55
1,24171,7.45


ES_MIEMBRO_CONCEJO,count,porcentaje
0,324455,99.98
1,56,0.02


ES_PADRE_MADRE,count,porcentaje
0,274763,84.67
1,49748,15.33


ES_DONANTE,count,porcentaje
0,307093,94.63
1,17418,5.37


ES_RESPONSABLE_FINANCIERO,count,porcentaje
0,308824,95.17
1,15687,4.83


ES_ADMINISTRATIVO,count,porcentaje
0,321117,98.95
1,3394,1.05


ES_PROFESOR,count,porcentaje
0,322699,99.44
1,1812,0.56


ES_ANTIGUO_MIEMBRO_CONSEJO_SUPERIOR,count,porcentaje
0,324392,99.96
1,119,0.04


ES_DONANTE_LOS_ANDES_FOUNDATION,count,porcentaje
0,324175,99.9
1,336,0.1


ES_PROFESOR_ADMINISTRATIVO_RETIRADO,count,porcentaje
0,306905,94.57
1,17606,5.43


VOLUNTARIO_UNIANDES,count,porcentaje
0,315578,97.25
1,8933,2.75


ES_EGRESADO_CERTIFICADO,count,porcentaje
0,319185,98.36
1,5326,1.64


ESTRATO,count,porcentaje
3.0,20052,6.18
0.0,1353,0.42
,208643,64.29
5.0,23416,7.22
6.0,26684,8.22
1.0,2826,0.87
4.0,34567,10.65
2.0,6970,2.15


PUNTAJE_RFM,count,porcentaje
544.0,391,0.12
334.0,229,0.07
442.0,25,0.01
232.0,210,0.06
234.0,91,0.03
155.0,90,0.03
132.0,156,0.05
154.0,64,0.02
422.0,433,0.13
433.0,477,0.15


CARGO,count,porcentaje
RYD BPC MANAGER,1,0.0
JEFE DE PLANEACION FINANCIERA,16,0.0
LIDER DE COMPRAS,4,0.0
RESEARCHER,28,0.01
INGENIERO PROYECTOS,9,0.0
REVENUE MANAGEMENT EXECUTIVE,1,0.0
INDEPENDENT PRODUCER AND SHOWRUNNER,1,0.0
BUSINESS ENTERPRISE SENIOR CONSULTANT,1,0.0
VICEPRESIDENTE FINANCIERO,36,0.01
CONTROL FINACIERO DE PROYECTOS,1,0.0


NIVEL_JERARQUICO,count,porcentaje
Independiente,4578,1.41
No clasificado,19230,5.93
Ejecutivo,31802,9.8
,194843,60.04
Soporte,2239,0.69
Jubilado o Pensionado,1696,0.52
Desempleado,4195,1.29
Profesional,49180,15.16
Directivo,16748,5.16


SECTOR_ECONOMICO_1,count,porcentaje
"Acondicionamiento de edificaciones, carreteras y obras de ingeniería civil",739,0.23
Extracción de madera,5,0.0
Caza,3,0.0
Transporte terrestre y/o por tuberías,1188,0.37
Actividades de servicios auxiliares de la intermediación financiera,1534,0.47
"Construcción, terminación, acabados de edificación",122,0.04
Explotación de minerales no metálicos,132,0.04
Actividades de agencia de empleo,415,0.13
Transporte aéreo,568,0.18
Saneamiento ambiental,156,0.05


SECTOR_ECONOMICO_2,count,porcentaje
Otras actividades de servicios,4898,1.51
Comercio al por mayor y al por menor,12479,3.85
Servicios de comidas,678,0.21
"Comercio al por mayor y al por menor, reparación de vehículos automotores y motocicletas",89,0.03
"Actividades profesionales, científicas y técnicas",12911,3.98
Actividades de organizaciones y entidades extraterritoriales,1096,0.34
Alojamiento,467,0.14
"Actividades artísticas, de entretenimiento y recreación",1117,0.34
Comercio al por mayor y al por menor; reparación de vehículos automotores y motocicletas,22,0.01
,220413,67.92


EMPLEADOR,count,porcentaje
THE BREAKTHROUGH S.A.,3,0.0
FUNDACION UNIVERSITARIA SALESIANA,4,0.0
STOCKHOLM RESILIENCE CENTRE,1,0.0
UNIVERSIDAD DE GOTTINGEN,2,0.0
ESCUELA NORMAL SUPERIOR DE IBAGUE,2,0.0
BOSTON CHILDREN'S HOSPITAL,1,0.0
BIODIVERSITY HERITAGE LIBRARY,1,0.0
EMPRESA DE ACUEDUCTO Y ALCANTARILLADO DE PEREIRA S.A. E.S.P.,4,0.0
CORPOAMAZONIA,2,0.0
GRUPO JULIA,1,0.0


NIT,count,porcentaje
CI00000003116,8,0.0
9005350183,7,0.0
9007697461,9,0.0
8060050085,3,0.0
200052487,3,0.0
8600791743,58,0.02
CI00000008956,43,0.01
8600007538,22,0.01
8300001672,24,0.01
201726977,4,0.0


Empleador__c,count,porcentaje
0011I00001PhOCEQA3,1,0.0
0011I00000Gv4EsQAJ,55,0.02
0011I00000Gv4aKQAR,21,0.01
0011I00000Gv7TMQAZ,19,0.01
0011I00000Gv8GSQAZ,85,0.03
0011I00001JhIwPQAV,2,0.0
0011I00000GvBOuQAN,1,0.0
0011I00000Gv8EkQAJ,15,0.0
0011I00000Gv5j9QAB,111,0.03
0011I00000Gv6itQAB,2,0.0


MONTO_TOTAL,count,porcentaje
300000.0,320,0.1
330000.0,39,0.01
4800.0,1,0.0
495000.0,2,0.0
7460000.0,1,0.0
5010000.0,2,0.0
1185000.0,2,0.0
41380000.0,1,0.0
10750000.0,1,0.0
74500000.0,1,0.0


DONACIONES,count,porcentaje
26.0,8,0.0
29.0,9,0.0
19.0,12,0.0
54.0,1,0.0
0.0,362,0.11
22.0,12,0.0
7.0,260,0.08
34.0,5,0.0
32.0,3,0.0
31.0,4,0.0


OPORTUNIDADES,count,porcentaje
29.0,23,0.01
26.0,26,0.01
65.0,15,0.0
191.0,1,0.0
54.0,20,0.01
19.0,28,0.01
113.0,4,0.0
155.0,1,0.0
112.0,2,0.0
167.0,1,0.0


### Conversión de tipos de datos de columnas

In [0]:
# Columns to convert: `enteros` lists the columns cast to integer, `fechas`
# lists the date columns together with the pattern used to parse them.
enteros = [
    'MONTO_TOTAL',
    'ES_ADMINISTRATIVO',
    'ES_ANTIGUO',
    'ES_ESTUDIANTE',
    'ES_EXALUMNO',
    'ES_GRADUADO',
    'ES_MIEMBRO_CONCEJO',
    'ES_PADRE_MADRE',
    'ES_PROFESOR',
    'ES_RESPONSABLE_FINANCIERO',
    'EDAD',
]
fechas = [{'columna': 'FECHA_NACIMIENTO', 'formato': 'yyyy-MM-dd'}]


In [0]:
# Cast every column listed in `enteros` to the Spark 'int' type, one at a time.
# NOTE(review): 'int' is 32-bit; MONTO_TOTAL is a sum of donations - confirm no
# per-contact total can exceed that range (otherwise 'long' would be needed).
for int_column_name in enteros:
    as_int = F.col(int_column_name).cast('int')
    final_df = final_df.withColumn(int_column_name, as_int)

In [0]:
# Register the consolidated frame as the temp view TBLFIN for the SQL check below.
final_df.createOrReplaceTempView('TBLFIN')

In [0]:
%sql
-- Sanity check: grand total of MONTO_TOTAL over the consolidated view.
-- NOTE(review): the disabled projection on the last line wraps names in single
-- quotes, i.e. it would sum string literals rather than columns.
select SUM (MONTO_TOTAL)FROM TBLFIN 
-- ,SUM('ES_ADMINISTRATIVO'),SUM('ES_ANTIGUO'),SUM('ES_ESTUDIANTE'),SUM('ES_EXALUMNO'),SUM('ES_GRADUADO'),SUM('ES_MIEMBRO_CONCEJO'),SUM('ES_PADRE_MADRE'),SUM('ES_PROFESOR'),SUM('ES_RESPONSABLE_FINANCIERO'),SUM('ASISTIO'),SUM('INSCRITO')  FROM TBLFIN

sum(MONTO_TOTAL)
22137091978


In [0]:
# Parse the date columns: strip every whitespace character from the value, then
# convert it to a date using the format declared for that column in `fechas`.
for date_spec in fechas:
    date_column = date_spec['columna']
    without_whitespace = F.regexp_replace(F.col(date_column), r'\s', '')
    parsed_date = F.to_date(without_whitespace, date_spec['formato'])
    final_df = final_df.withColumn(date_column, parsed_date)

### Valores únicos por columna (datos transformados)

In [0]:
# Total row count, used as the denominator of every percentage below.
total_registros = final_df.count()

# Show the value distribution of every column of the transformed data
# (all columns are included, ID_CONTACTO among them).
for column_name in final_df.columns:
    displayHTML(f'<h3>{column_name}</h3>')
    value_counts_df = final_df.select(column_name).groupBy(column_name).count()
    percentage = F.round((F.col('count') / total_registros)*100.0, 2)
    display(value_counts_df.withColumn('porcentaje', percentage))

ID_CONTACTO,count,porcentaje
0031I00000CMFbUQAX,1,0.0
0031I00000UvQI8QAN,1,0.0
0031I00000H2L4FQAV,1,0.0
0031I00000tX3OaQAK,1,0.0
0031I00000tX3tWQAS,1,0.0
0031I00000rJQbWQAW,1,0.0
0031I00000rJQC8QAO,1,0.0
0031I00000rJQCnQAO,1,0.0
0031I00000jTAAfQAO,1,0.0
0031I00000UuX3fQAF,1,0.0


ESTADO_CIVIL,count,porcentaje
,187367,57.74
C - CASADO,38518,11.87
U - UNION LIBRE,1841,0.57
S - SOLTERO,95067,29.3
D - DIVORCIADO,766,0.24
P - SEPARADO,779,0.24
V - VIUDO,173,0.05


FECHA_NACIMIENTO,count,porcentaje
1946-05-02,1,0.0
1982-05-17,10,0.0
1992-12-20,12,0.0
1987-07-08,11,0.0
1988-03-25,21,0.01
1988-03-21,12,0.0
1958-08-15,3,0.0
1988-02-16,9,0.0
1991-03-26,13,0.0
1991-07-30,21,0.01


EDAD,count,porcentaje
148.0,1,0.0
31.0,4918,1.52
85.0,58,0.02
137.0,1,0.0
65.0,1053,0.32
53.0,2098,0.65
78.0,153,0.05
34.0,4934,1.52
101.0,2,0.0
81.0,78,0.02


GENERACION,count,porcentaje
Baby Boomer,19145,5.9
,158432,48.82
Generación X,41434,12.77
Silent,1178,0.36
Grandiosa,50,0.02
Generación Z (Centennials),43227,13.32
Generación Y (Millennials),60734,18.72
Alpha,311,0.1


GENERO,count,porcentaje
,4473,1.38
Femenino,149020,45.92
Indefinido,17,0.01
Masculino,171001,52.69


PAIS,count,porcentaje
BAHAMAS,1,0.0
BELGICA,58,0.02
SUDAFRICA,8,0.0
RUSIA,2,0.0
LETONIA,1,0.0
España,2,0.0
CATAR,3,0.0
NORUEGA,14,0.0
COREA DEL SUR,9,0.0
TURQUIA,3,0.0


CIUDAD,count,porcentaje
ARMENIA,585,0.18
PRINCETON,5,0.0
OLD TOWN,2,0.0
ELLINGTON,1,0.0
SANTANDER,3,0.0
SAN PABLO,1,0.0
HIGH WYCOMBE,1,0.0
BRICEÑO,1,0.0
CHIQUIZA,2,0.0
EL ROSAL,13,0.0


DEPARTAMENTO,count,porcentaje
SANTANDER,4789,1.48
TACHIRA,11,0.0
FRIUL VENECIA JULIA,2,0.0
BASILEA CIUDAD,5,0.0
AYACUCHO,8,0.0
MECKLEMBURGO POMERANIA OCCIDENTAL,1,0.0
INGLATERRA,352,0.11
VIRGINIA OCCIDENTAL,14,0.0
SERBIA CENTRAL,1,0.0
RIO DE JANEIRO,20,0.01


DIRECCION,count,porcentaje
3318 PENA DE LOS BANOS VENUSTIANO CARRANZA,1,0.0
CL 56 85 I 06 BL 10 AP 502,1,0.0
CR 19 A 85 31 AP 301,1,0.0
CL 54 9 15 AP 202 ED JORDAN,1,0.0
CL 97 21 95,1,0.0
CR 76 A 80 72,1,0.0
CL 48 73 61,1,0.0
CL 104 A 21 77,3,0.0
CR 51 104 B 70 AP 303,2,0.0
CR 96 F 23 A 60 TO 11 AP 601,1,0.0


ES_GRADUADO,count,porcentaje
1,96573,29.76
0,227938,70.24


ES_EXALUMNO,count,porcentaje
1,8074,2.49
0,316437,97.51


ES_ANTIGUO,count,porcentaje
1,18508,5.7
0,306003,94.3


ES_ESTUDIANTE,count,porcentaje
1,24171,7.45
0,300340,92.55


ES_MIEMBRO_CONCEJO,count,porcentaje
1,56,0.02
0,324455,99.98


ES_PADRE_MADRE,count,porcentaje
1,49748,15.33
0,274763,84.67


ES_DONANTE,count,porcentaje
0,307093,94.63
1,17418,5.37


ES_RESPONSABLE_FINANCIERO,count,porcentaje
1,15687,4.83
0,308824,95.17


ES_ADMINISTRATIVO,count,porcentaje
1,3394,1.05
0,321117,98.95


ES_PROFESOR,count,porcentaje
1,1812,0.56
0,322699,99.44


ES_ANTIGUO_MIEMBRO_CONSEJO_SUPERIOR,count,porcentaje
0,324392,99.96
1,119,0.04


ES_DONANTE_LOS_ANDES_FOUNDATION,count,porcentaje
0,324175,99.9
1,336,0.1


ES_PROFESOR_ADMINISTRATIVO_RETIRADO,count,porcentaje
0,306905,94.57
1,17606,5.43


VOLUNTARIO_UNIANDES,count,porcentaje
0,315578,97.25
1,8933,2.75


ES_EGRESADO_CERTIFICADO,count,porcentaje
0,319185,98.36
1,5326,1.64


ESTRATO,count,porcentaje
3.0,20052,6.18
0.0,1353,0.42
,208643,64.29
5.0,23416,7.22
6.0,26684,8.22
1.0,2826,0.87
4.0,34567,10.65
2.0,6970,2.15


PUNTAJE_RFM,count,porcentaje
544.0,391,0.12
334.0,229,0.07
442.0,25,0.01
232.0,210,0.06
234.0,91,0.03
155.0,90,0.03
132.0,156,0.05
154.0,64,0.02
422.0,433,0.13
433.0,477,0.15


CARGO,count,porcentaje
RYD BPC MANAGER,1,0.0
JEFE DE PLANEACION FINANCIERA,16,0.0
LIDER DE COMPRAS,4,0.0
RESEARCHER,28,0.01
INGENIERO PROYECTOS,9,0.0
REVENUE MANAGEMENT EXECUTIVE,1,0.0
INDEPENDENT PRODUCER AND SHOWRUNNER,1,0.0
BUSINESS ENTERPRISE SENIOR CONSULTANT,1,0.0
VICEPRESIDENTE FINANCIERO,36,0.01
CONTROL FINACIERO DE PROYECTOS,1,0.0


NIVEL_JERARQUICO,count,porcentaje
Independiente,4578,1.41
No clasificado,19230,5.93
Ejecutivo,31802,9.8
,194843,60.04
Soporte,2239,0.69
Jubilado o Pensionado,1696,0.52
Desempleado,4195,1.29
Profesional,49180,15.16
Directivo,16748,5.16


SECTOR_ECONOMICO_1,count,porcentaje
"Acondicionamiento de edificaciones, carreteras y obras de ingeniería civil",739,0.23
Extracción de madera,5,0.0
Caza,3,0.0
Transporte terrestre y/o por tuberías,1188,0.37
Actividades de servicios auxiliares de la intermediación financiera,1534,0.47
"Construcción, terminación, acabados de edificación",122,0.04
Explotación de minerales no metálicos,132,0.04
Actividades de agencia de empleo,415,0.13
Transporte aéreo,568,0.18
Saneamiento ambiental,156,0.05


SECTOR_ECONOMICO_2,count,porcentaje
Otras actividades de servicios,4898,1.51
Comercio al por mayor y al por menor,12479,3.85
Servicios de comidas,678,0.21
"Comercio al por mayor y al por menor, reparación de vehículos automotores y motocicletas",89,0.03
"Actividades profesionales, científicas y técnicas",12911,3.98
Actividades de organizaciones y entidades extraterritoriales,1096,0.34
Alojamiento,467,0.14
"Actividades artísticas, de entretenimiento y recreación",1117,0.34
Comercio al por mayor y al por menor; reparación de vehículos automotores y motocicletas,22,0.01
,220413,67.92


EMPLEADOR,count,porcentaje
THE BREAKTHROUGH S.A.,3,0.0
FUNDACION UNIVERSITARIA SALESIANA,4,0.0
STOCKHOLM RESILIENCE CENTRE,1,0.0
UNIVERSIDAD DE GOTTINGEN,2,0.0
ESCUELA NORMAL SUPERIOR DE IBAGUE,2,0.0
BOSTON CHILDREN'S HOSPITAL,1,0.0
BIODIVERSITY HERITAGE LIBRARY,1,0.0
EMPRESA DE ACUEDUCTO Y ALCANTARILLADO DE PEREIRA S.A. E.S.P.,4,0.0
CORPOAMAZONIA,2,0.0
GRUPO JULIA,1,0.0


NIT,count,porcentaje
CI00000003116,8,0.0
9005350183,7,0.0
9007697461,9,0.0
8060050085,3,0.0
200052487,3,0.0
8600791743,58,0.02
CI00000008956,43,0.01
8600007538,22,0.01
8300001672,24,0.01
201726977,4,0.0


Empleador__c,count,porcentaje
0011I00001PhOCEQA3,1,0.0
0011I00000Gv4EsQAJ,55,0.02
0011I00000Gv4aKQAR,21,0.01
0011I00000Gv7TMQAZ,19,0.01
0011I00000Gv8GSQAZ,85,0.03
0011I00001JhIwPQAV,2,0.0
0011I00000GvBOuQAN,1,0.0
0011I00000Gv8EkQAJ,15,0.0
0011I00000Gv5j9QAB,111,0.03
0011I00000Gv6itQAB,2,0.0


MONTO_TOTAL,count,porcentaje
135000.0,12,0.0
113000.0,2,0.0
4155000.0,1,0.0
5300.0,1,0.0
450000.0,70,0.02
870000.0,7,0.0
85000.0,21,0.01
3550000.0,2,0.0
1205000.0,1,0.0
234705.0,1,0.0


DONACIONES,count,porcentaje
26.0,8,0.0
29.0,9,0.0
19.0,12,0.0
54.0,1,0.0
0.0,362,0.11
22.0,12,0.0
7.0,260,0.08
34.0,5,0.0
32.0,3,0.0
31.0,4,0.0


OPORTUNIDADES,count,porcentaje
29.0,23,0.01
26.0,26,0.01
65.0,15,0.0
191.0,1,0.0
54.0,20,0.01
19.0,28,0.01
113.0,4,0.0
155.0,1,0.0
112.0,2,0.0
167.0,1,0.0


## 3. Guardado de información en silver

In [0]:
# List the column names of the consolidated frame before exporting it.
final_df.columns

In [0]:
# Convert the Spark DataFrame to pandas.
# NOTE(review): toPandas() collects every row onto the driver; confirm the
# consolidated frame fits in driver memory.
final_pd_df = final_df.toPandas()

In [0]:
# Preview the first rows of the pandas frame.
final_pd_df.head()

Unnamed: 0,ID_CONTACTO,ESTADO_CIVIL,FECHA_NACIMIENTO,EDAD,GENERACION,GENERO,PAIS,CIUDAD,DEPARTAMENTO,DIRECCION,ES_GRADUADO,ES_EXALUMNO,ES_ANTIGUO,ES_ESTUDIANTE,ES_MIEMBRO_CONCEJO,ES_PADRE_MADRE,ES_DONANTE,ES_RESPONSABLE_FINANCIERO,ES_ADMINISTRATIVO,ES_PROFESOR,ES_ANTIGUO_MIEMBRO_CONSEJO_SUPERIOR,ES_DONANTE_LOS_ANDES_FOUNDATION,ES_PROFESOR_ADMINISTRATIVO_RETIRADO,VOLUNTARIO_UNIANDES,ES_EGRESADO_CERTIFICADO,ESTRATO,PUNTAJE_RFM,CARGO,NIVEL_JERARQUICO,SECTOR_ECONOMICO_1,SECTOR_ECONOMICO_2,EMPLEADOR,NIT,Empleador__c,MONTO_TOTAL,DONACIONES,OPORTUNIDADES
0,0031I00000AibubQAB,S - SOLTERO,1986-03-01,36.0,Generación Y (Millennials),Masculino,COLOMBIA,BOGOTA D.C.,BOGOTA,CL 56 71 54,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0,,ASESOR FINANCIERO,Profesional,Actividades jurídicas y de contabilidad,"Actividades profesionales, científicas y técnicas",CARRIZOSA CONSULTORES SAS,9005276702.0,0011I00001PhB8jQAF,,,
1,0031I00000AibweQAB,,,,,Femenino,COLOMBIA,BOGOTA D.C.,BOGOTA,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,
2,0031I00000AibwHQAR,,,,,Femenino,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,
3,0031I00000AibwkQAB,,,,,Masculino,,,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,,,,,,,
4,0031I00000AibwLQAR,S - SOLTERO,1998-09-30,24.0,Generación Z (Centennials),Femenino,COLOMBIA,CAJICA,CUNDINAMARCA,VRD 12 MCP CAJICA CA 304 CN PUERTA DEL SOL,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,,,,,,,,,,,


In [0]:
# Output directory for the silver layer, addressed through the /dbfs local mount.
# `base_path` is not defined in this notebook (presumably set by ./Includes/dependencies - confirm).
save_path = f'/dbfs{base_path}/silver/consolidacion_donaciones'

In [0]:
# Make sure the output directory exists before writing.
# exist_ok=True replaces the separate exists() check: the directory is created
# when missing and left untouched when already present, with no window between
# the check and the creation.
import os
os.makedirs(f'{save_path}/', exist_ok=True)

In [0]:
# Write the consolidated data as a single parquet file inside `save_path`.
final_pd_df.to_parquet(f'{save_path}/consolidado_donaciones.parquet')