### Ingestión del archivo genre.csv

In [0]:
# Notebook parameter: declare the 'p_enviroment' text widget (empty default)
# and read its current value. The value is later written to the "enviroment"
# column of the output DataFrame.
dbutils.widgets.text('p_enviroment', '')
v_enviroment = dbutils.widgets.get('p_enviroment')

In [0]:
# Notebook parameter: declare the 'p_file_date' text widget (default
# '2024-12-16') and read its current value. The value selects the input
# sub-folder for genre.csv and is also stored in the "file_date" column.
dbutils.widgets.text('p_file_date', '2024-12-16')
v_file_date = dbutils.widgets.get('p_file_date')

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

## Paso 1 - Leer el archivo CSV usando "DataFrameReader" de Spark

In [0]:
from pyspark.sql.types import *

In [0]:
# Explicit schema for genre.csv: an integer id declared non-nullable and a
# nullable string name.
# NOTE(review): Spark may not enforce nullable=False when reading files - confirm
# if null ids must be rejected.
genre_schema = (
    StructType()
    .add('genreId', IntegerType(), False)
    .add('genreName', StringType(), True)
)

In [0]:
# Read genre.csv from the v_file_date sub-folder of bronze_folder_path.
# The first line of the file is treated as a header, and the explicit
# genre_schema is applied instead of inferring column types.
genre_df = spark.read.csv(
    f"{bronze_folder_path}/{v_file_date}/genre.csv",
    schema=genre_schema,
    header=True,
)

In [0]:
from pyspark.sql.functions import col, lit

# Project the two source columns by name.
genre_selected_df = genre_df.select('genreId', 'genreName')

In [0]:
# Rename the camelCase source columns to snake_case.
genre_renamed_df = (
    genre_selected_df
    .withColumnRenamed("genreId", "genre_id")
    .withColumnRenamed("genreName", "genre_name")
)

In [0]:
from pyspark.sql.functions import current_timestamp, lit

### Agregar las columnas "ingestion_date", "enviroment" y "file_date" al DataFrame

In [0]:
# Attach load metadata to every row:
#   - add_ingestion_date comes from ../includes/common_functions; presumably it
#     adds the "ingestion_date" column (TODO confirm against that notebook)
#   - "enviroment" holds the p_enviroment widget value
#   - "file_date" holds the p_file_date widget value
genre_final_df = (
    add_ingestion_date(genre_renamed_df)
    .withColumn("enviroment", lit(v_enviroment))
    .withColumn("file_date", lit(v_file_date))
)

### Escribir datos en el Delta Lake en formato DELTA

In [0]:
# Persist the result as the Delta table movie_silver.genres. "overwrite" mode
# replaces whatever the table held before with this run's rows.
(
    genre_final_df.write
    .mode("overwrite")
    .format("delta")
    .saveAsTable("movie_silver.genres")
)

In [0]:
%sql
-- Sanity check: number of rows stored in the silver table for each file_date.
SELECT
  file_date,
  COUNT(1)
FROM movie_silver.genres
GROUP BY file_date;

In [0]:
# Render the in-memory final DataFrame for visual inspection.
display(genre_final_df)

In [0]:
# Read the Delta files back from the silver folder and render them.
# NOTE(review): assumes movie_silver.genres is stored at
# {silver_folder_path}/genres - confirm against the database location.
display(
    spark.read
    .format("delta")
    .load(f"{silver_folder_path}/genres")
)

In [0]:
# End the notebook run and hand the string "Success" back as its exit value.
dbutils.notebook.exit("Success")