# Ingestión del archivo country.json

In [0]:
# Declare a text widget named "p_environment" with an empty default value.
dbutils.widgets.text("p_environment", "")
# Read the widget's current value; it is later written to every row as the "env" column.
v_env = dbutils.widgets.get("p_environment")
# Bare expression: echoes the resolved value as the cell output.
v_env

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

### Paso 1 - Leer el archivo JSON usando "DataFrameReader" de Spark

In [0]:
from pyspark.sql.types import StructType, StringType, StructField, IntegerType, DoubleType, StringType, DateType

In [0]:
# DDL-style schema for country.json, one fragment per column; the adjacent
# string literals are concatenated by Python into a single DDL string.
countries_schema = (
    "countryId INT, "
    "CountryIsoCode STRING, "
    "countryName STRING"
)

In [0]:
# Load country.json from the bronze folder, applying the declared schema.
# bronze_folder_path is not defined in this notebook — presumably it comes from
# the included configuration notebook; confirm.
countries_df = spark.read.json(
    f'{bronze_folder_path}/country.json',
    schema=countries_schema,
)

In [0]:
# Sanity check: show the Python type of the object returned by the reader.
type(countries_df)

In [0]:
# Print the schema of the loaded DataFrame to verify the declared columns and types were applied.
countries_df.printSchema()

### Paso 2 - Eliminar las columnas no deseadas

In [0]:
from pyspark.sql.functions import col

In [0]:
# Variant 1 of 3: drop the ISO-code column by passing a Column built with col().
# The next two cells reassign countries_dropped_df using other drop() call forms.
countries_dropped_df = countries_df.drop(col('CountryIsoCode'))

In [0]:
# Variant 2 of 3: drop the ISO-code column by passing its name as a plain string.
# Reassigns the same variable as the previous cell.
countries_dropped_df = countries_df.drop('CountryIsoCode')

In [0]:
# Variant 3 of 3: drop the ISO-code column via a Column reference taken from the DataFrame itself.
# When the notebook runs top to bottom, this is the assignment the following cells use.
countries_dropped_df = countries_df.drop(countries_df['CountryIsoCode'])

### Paso 3 - Cambiar el nombre de las columnas según lo requerido y agregar las columnas "ingestion_date" y "env" al DataFrame

In [0]:
from pyspark.sql.functions import current_timestamp, lit

In [0]:
# Stage 1: rename the camelCase source columns to snake_case.
countries_renamed_df = (
    countries_dropped_df
    .withColumnRenamed('countryId', 'country_id')
    .withColumnRenamed('countryName', 'country_name')
)
# Stage 2: apply add_ingestion_date (not defined here — presumably provided by the
# included common_functions notebook; confirm), then tag every row with the
# environment value read from the widget.
countries_final_df = (
    countries_renamed_df
    .transform(add_ingestion_date)
    .withColumn('env', lit(v_env))
)

### Paso 4 - Escribir datos en el DataLake en formato parquet

In [0]:
# Save the final DataFrame as the table movie_silver.countries in parquet format,
# replacing any existing contents (overwrite mode).
# Path-based alternative kept for reference:
#   write.mode('overwrite').parquet(f'{silver_folder_path}/countries')
countries_final_df.write.saveAsTable(
    "movie_silver.countries",
    format="parquet",
    mode="overwrite",
)

In [0]:
%python
# List the files under the silver "countries" folder and render the listing.
# NOTE(review): the mount path is hard-coded; this presumably is where the
# movie_silver.countries table stores its files — confirm.
display(dbutils.fs.ls("/mnt/moviehistory/silver/countries"))

In [0]:
# Read the parquet files back from the mount path and render them for a visual check.
# NOTE(review): the path is hard-coded rather than derived from the table written
# above — confirm it matches the storage location of movie_silver.countries.
df = spark.read.parquet('/mnt/moviehistory/silver/countries')
display(df)

In [0]:
# End the notebook run and hand a status string back to the caller; a parent
# notebook receives it as the return value of dbutils.notebook.run().
# The exit value is spelled "Success"; callers that compare the result should
# match this literal.
dbutils.notebook.exit("Success")