# Ingestion del archivo person.json

In [0]:
# Declare a text widget named "p_environment" with an empty default value.
dbutils.widgets.text("p_environment", "")
# Read the widget's current value; it is written to the "env" column further down.
v_env = dbutils.widgets.get("p_environment")
# Bare expression: echoes the value as this cell's output.
v_env

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

### Paso 1 - Leer el archivo JSON usando "DataFrameReader" de Spark

In [0]:
from pyspark.sql.types import StructType, StringType, StructField, IntegerType, StringType

In [0]:
# Schema of the nested "personName" struct: two nullable string fields.
name_schema = (
    StructType()
    .add('forename', StringType(), True)
    .add('surname', StringType(), True)
)

In [0]:
# Top-level record schema: an integer "personId" declared non-nullable,
# plus the nested "personName" struct described by name_schema.
person_schema = (
    StructType()
    .add(StructField('personId', IntegerType(), False))
    .add(StructField('personName', name_schema, True))
)

In [0]:
# Read person.json from under bronze_folder_path, applying the explicit
# schema instead of letting Spark infer one.
persons_df = (
    spark.read
    .format('json')
    .schema(person_schema)
    .load(f'{bronze_folder_path}/person.json')
)

In [0]:
# Inspection only: shows the Python type of persons_df as the cell output.
type(persons_df)

In [0]:
# Inspection only: prints the schema tree of persons_df to the cell output.
persons_df.printSchema()

### Paso 2 - Renombrar las columnas y adicionar nuevas columnas
1. Renombrar personId a person_id
2. Agregar las columnas "ingestion_date" y "env"
3. Agregar la columna "name" a partir de la concatenación de "forename" y "surname"

In [0]:
from pyspark.sql.functions import col, concat, concat_ws, current_timestamp, lit

In [0]:
# Build the output columns:
#   - rename personId -> person_id
#   - apply add_ingestion_date (defined outside this cell; presumably adds the
#     "ingestion_date" column — confirm in the included helper notebook)
#   - env:  the value read from the p_environment widget
#   - name: forename and surname joined by a single space
#
# concat_ws is used instead of concat: concat returns NULL as soon as any input
# is NULL, so a person missing either name part would end up with a NULL name.
# concat_ws skips NULL inputs and keeps the part that is present (it yields an
# empty string only when both parts are NULL).
persons_with_columns_df = (persons_df
    .withColumnRenamed('personId', 'person_id')
    .transform(add_ingestion_date)
    .withColumn('env', lit(v_env))
    .withColumn('name',
                concat_ws(' ',
                          col('personName.forename'),
                          col('personName.surname'))
               )
)

### Paso 3 - Eliminar columna no requerida

In [0]:
# Drop the nested "personName" struct; its fields were already combined into "name".
persons_final_df = persons_with_columns_df.drop('personName')

### Paso 4 - Escribir datos en el Data Lake en formato Parquet

In [0]:
# Overwrite the table movie_silver.persons with the final DataFrame, stored as parquet.
# (An earlier path-based variant wrote directly to f'{silver_folder_path}/persons'.)
(
    persons_final_df.write
    .format('parquet')
    .mode('overwrite')
    .saveAsTable('movie_silver.persons')
)

In [0]:
%python
# List the files under the persons output directory to check that the write produced data.
# NOTE(review): this path is hard-coded; it presumably matches the storage location of
# movie_silver.persons — confirm, or derive it from the shared configuration.
display(dbutils.fs.ls("/mnt/moviehistory/silver/persons"))

In [0]:
# Read the data back to verify the write. The table is read by the same name
# that saveAsTable used earlier in this notebook, instead of a hard-coded mount
# path, so the check always looks at the data that was just written regardless
# of where the movie_silver database is stored.
df = spark.read.table('movie_silver.persons')
display(df)

In [0]:
# Return a status string to whatever invoked this notebook.
# NOTE(review): "Sucess" looks like a misspelling of "Success". It is left unchanged
# because callers may compare against this exact value — confirm and fix both sides together.
dbutils.notebook.exit("Sucess")