In [0]:
# Start from a clean slate: drop every widget currently defined in this notebook
# so the declarations further down are the only ones present.
dbutils.widgets.removeAll()

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F

In [0]:
# Notebook parameters: (widget name, default value) pairs, declared in one place.
for _nombre_widget, _valor_defecto in (
    ("catalogo", "catalog_jptq"),
    ("esquema_source", "bronze_jptq"),
    ("esquema_stage", "silver_jptq"),
):
    dbutils.widgets.text(_nombre_widget, _valor_defecto)

In [0]:
# Read the current value of each widget into a same-named Python variable.
catalogo, esquema_source, esquema_stage = (
    dbutils.widgets.get(nombre_widget)
    for nombre_widget in ("catalogo", "esquema_source", "esquema_stage")
)

In [0]:
def fn_transform_director_pelicula(catalogo: str, esquema_source: str):
   """Return the distinct language codes found in TBL_PELICULAS with a description.

   The query upper-cases DES_IDIOMA_PELICULA to produce COD_IDIOMA_PELICULA and
   maps each known two-letter code to a Spanish language name; any code that is
   not listed in the CASE expression is described as 'OTRO'. Rows are
   de-duplicated and sorted by code.

   NOTE(review): despite its name, this function does not read any director
   column; it only derives language columns. Confirm that this is the intended
   content for the director stage table.

   Args:
      catalogo: Catalog that holds the source schema.
      esquema_source: Schema that holds TBL_PELICULAS.

   Returns:
      The DataFrame produced by ``spark.sql`` for the query, with columns
      COD_IDIOMA_PELICULA and DES_IDIOMA_PELICULA.
   """
   # The identifiers are interpolated as-is; they come from the notebook widgets.
   consulta_idiomas = f"""SELECT
                        DISTINCT
                        UPPER(DES_IDIOMA_PELICULA) AS COD_IDIOMA_PELICULA,
                        CASE
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'EN' THEN 'INGLES' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'RO' THEN 'RUMANO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'PL' THEN 'POLACO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'CN' THEN 'CHINO (NO ESTANDAR; SUELE USARSE ZH)' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'PT' THEN 'PORTUGUES' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'GL' THEN 'GALLEGO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'SH' THEN 'SERBOCROATA (OBSOLETO)' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'KO' THEN 'COREANO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'UK' THEN 'UCRANIANO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'CS' THEN 'CHECO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'SR' THEN 'SERBIO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'TR' THEN 'TURCO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'DE' THEN 'ALEMAN' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'IS' THEN 'ISLANDES' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'ES' THEN 'ESPAÑOL' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'EU' THEN 'EUSKERA / VASCO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'EL' THEN 'GRIEGO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'IT' THEN 'ITALIANO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'TE' THEN 'TELUGU' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'AR' THEN 'ARABE' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'SU' THEN 'SUNDANES' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'SV' THEN 'SUECO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'NL' THEN 'NEERLANDES / HOLANDES' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'KM' THEN 'JEMER / KHMER' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'BN' THEN 'BENGALI' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'HU' THEN 'HUNGARO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'TN' THEN 'SETSUANA / TSWANA' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'RU' THEN 'RUSO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'ML' THEN 'MALAYALAM' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'TH' THEN 'TAILANDES' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'GA' THEN 'IRLANDES' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'LT' THEN 'LITUANO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'FA' THEN 'PERSA / FARSI' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'NO' THEN 'NORUEGO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'HI' THEN 'HINDI' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'BS' THEN 'BOSNIO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'ET' THEN 'ESTONIO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'ZH' THEN 'CHINO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'FR' THEN 'FRANCES' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'JA' THEN 'JAPONES' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'ID' THEN 'INDONESIO' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'LA' THEN 'LATIN' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'DA' THEN 'DANES' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'FI' THEN 'FINES' 
                        WHEN UPPER(DES_IDIOMA_PELICULA) = 'HE' THEN 'HEBREO' 
                        ELSE 'OTRO' END AS DES_IDIOMA_PELICULA
                        FROM {catalogo}.{esquema_source}.TBL_PELICULAS
                        ORDER BY COD_IDIOMA_PELICULA ASC"""
   return spark.sql(consulta_idiomas)

In [0]:
%sql
-- Diagnostic query: lists the directors for which no COD_DIRECTOR can be
-- derived, i.e. the name pattern below does not match and every extracted
-- piece comes back as an empty string.
-- NOTE(review): the source table is hard-coded here and does not follow the
-- `catalogo` / `esquema_source` widgets used by the Python cells.
WITH DIRECTORES AS (
  -- One row per director; the lowercase name particles are stripped once here
  -- so the three extractions below all work on exactly the same cleaned text.
  SELECT
    DES_DIRECTOR,
    regexp_replace(DES_DIRECTOR, '\\b(van|de|del|von|la|el)\\b', '') AS DES_DIRECTOR_LIMPIO
  FROM catalog_jptq.bronze_jptq.tbl_detalle_peliculas
  GROUP BY DES_DIRECTOR
),
TRANSFORMATION AS (
  SELECT
    -- Code = group 1 (leading uppercase letter of the name)
    --      + group 2 (an uppercase letter that follows whitespace)
    --      + group 3 (the two letters right after group 2).
    concat(
      regexp_extract(
        DES_DIRECTOR_LIMPIO,
        '\\b([A-ZÀ-ÖØ-ÝĀ-ž])[A-Za-zÀ-ÖØ-öø-ÿĀ-ž.\\s-]*\\s+([A-ZÀ-ÖØ-ÝĀ-ž])([A-Za-zÀ-ÖØ-öø-ÿĀ-ž]{2})',
        1
      ),
      regexp_extract(
        DES_DIRECTOR_LIMPIO,
        '\\b([A-ZÀ-ÖØ-ÝĀ-ž])[A-Za-zÀ-ÖØ-öø-ÿĀ-ž.\\s-]*\\s+([A-ZÀ-ÖØ-ÝĀ-ž])([A-Za-zÀ-ÖØ-öø-ÿĀ-ž]{2})',
        2
      ),
      regexp_extract(
        DES_DIRECTOR_LIMPIO,
        '\\b([A-ZÀ-ÖØ-ÝĀ-ž])[A-Za-zÀ-ÖØ-öø-ÿĀ-ž.\\s-]*\\s+([A-ZÀ-ÖØ-ÝĀ-ž])([A-Za-zÀ-ÖØ-öø-ÿĀ-ž]{2})',
        3
      )
    ) AS COD_DIRECTOR,
    DES_DIRECTOR
  FROM DIRECTORES
)
SELECT
  *
FROM TRANSFORMATION
WHERE COD_DIRECTOR = ''
-- Sort in the outermost query: an ORDER BY inside a CTE does not guarantee
-- the order of the final result.
ORDER BY DES_DIRECTOR ASC

In [0]:
# Run the transformation against the catalog / source schema chosen in the widgets.
df_tmp_director_pelicula = fn_transform_director_pelicula(
    catalogo=catalogo,
    esquema_source=esquema_source,
)

In [0]:
# Stamp every row with the load timestamp (column FEC_CARGA).
df_tmp_director_pelicula = df_tmp_director_pelicula.withColumn(
    "FEC_CARGA",
    F.current_timestamp(),
)

In [0]:
# Preview the transformed DataFrame (including FEC_CARGA) in the notebook output.
display(df_tmp_director_pelicula)

In [0]:
# Persist the result as the stage table, replacing any previous contents.
(
    df_tmp_director_pelicula.write
    .mode("overwrite")
    .saveAsTable(f"{catalogo}.{esquema_stage}.TBL_TMP_DIRECTOR_PELICULA")
)

In [0]:
# Read back the stage table to verify the load.
# The table name is built from the same `catalogo` / `esquema_stage` widget
# values used by the write step, so this check always targets the table that
# was actually written instead of a hard-coded catalog and schema.
display(spark.table(f"{catalogo}.{esquema_stage}.TBL_TMP_DIRECTOR_PELICULA"))