# Ingesta del archivo language.csv

In [0]:
# Declare a text widget named "p_environment" with an empty default value,
# then read its current value into v_env (used later to fill the "env" column).
dbutils.widgets.text("p_environment", "")
v_env = dbutils.widgets.get("p_environment")
# Bare expression on the last line of the cell so the notebook shows the value.
v_env

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

### Paso 1 - Leer el archivo CSV usando "DataFrameReader" de Spark

In [0]:
from pyspark.sql.types import StructType, StringType, StructField, IntegerType, DoubleType, StringType, DateType

In [0]:
# Explicit schema for language.csv: an integer id declared as non-nullable,
# plus two nullable string columns (code and name).
language_schema = (
    StructType()
    .add("languageId", IntegerType(), nullable=False)
    .add("languageCode", StringType(), nullable=True)
    .add("languageName", StringType(), nullable=True)
)

In [0]:
# Load language.csv from the bronze folder. The file's first line is treated
# as a header row, and the explicit schema is applied to the columns.
language_df = (
    spark.read
    .schema(language_schema)
    .option("header", True)
    .csv(f"{bronze_folder_path}/language.csv")
)

In [0]:
# Inspect the Python type of the object returned by the reader.
type(language_df)

In [0]:
# Print the DataFrame's schema to check the columns and their types.
language_df.printSchema()

### Paso 2 - Seleccionar solo las columnas "requeridas"

In [0]:
from pyspark.sql.functions import col 

In [0]:
# Keep only the id and name columns; languageCode is not carried forward.
languages_selected_df = language_df.select("languageId", "languageName")

### Paso 3 - Cambiar el nombre de las columnas según lo requerido

In [0]:
# Rename the camelCase source columns to snake_case.
languages_renamed_df = (
    languages_selected_df
    .withColumnRenamed("languageId", "language_id")
    .withColumnRenamed("languageName", "language_name")
)

### Paso 4 - Agregar la columna "ingestion_date" al DataFrame

In [0]:
from pyspark.sql.functions import current_timestamp, lit

In [0]:
# First approach.
# add_ingestion_date is not defined in this notebook (presumably it comes from
# one of the %run includes); per this step's heading it is expected to add the
# "ingestion_date" column -- confirm against its definition.
# lit() builds a literal-valued column, so every row gets the same "env" value
# taken from the p_environment widget.
languages_final_df = (
    add_ingestion_date(languages_renamed_df)
    .withColumn("env", lit(v_env))
)

### Paso 5 - Escribir datos en el DataLake en formato parquet

In [0]:
# Persist the result as the Parquet-format table movie_silver.languages,
# overwriting whatever the table held before.
# (An earlier variant wrote plain Parquet files to
# f'{silver_folder_path}/languages' instead of registering a table.)
languages_final_df.write.saveAsTable(
    "movie_silver.languages",
    format="parquet",
    mode="overwrite",
)

In [0]:
%python
# List the files under the silver "languages" folder to confirm the write.
# NOTE(review): the path is hard-coded; this assumes the movie_silver.languages
# table is stored at /mnt/moviehistory/silver/languages -- confirm.
display(dbutils.fs.ls("/mnt/moviehistory/silver/languages"))

In [0]:
# Sanity check: read back what was just written and display it.
# The data was saved with saveAsTable("movie_silver.languages"), so it is read
# back by table name. This avoids depending on a hard-coded mount path, which
# would only show this data if the database happens to be located there.
df = spark.read.table("movie_silver.languages")
display(df)

In [0]:
# End the notebook run and hand this string back as its exit value.
# NOTE(review): "Sucess" looks like a typo for "Success"; left unchanged because
# a calling notebook may compare against this exact value -- confirm, then fix.
dbutils.notebook.exit("Sucess")