### Ingesta del archivo "movie.csv"

#### Paso 1 - Leer el archivo CSV usando "DataFrameReader" de Spark

In [0]:
# Shows the built-in usage help for dbutils.widgets. Informational only:
# its result is not used by any later cell.
dbutils.widgets.help()

In [0]:
# Declare the "p_enviroment" text widget (empty default) and read its current
# value; it is later stamped onto every row as the "enviroment" column.
# NOTE(review): "enviroment" is misspelled, but the widget name is what callers
# pass in, so renaming it here alone would break them.
dbutils.widgets.text("p_enviroment", "")
v_enviroment = dbutils.widgets.get("p_enviroment")


In [0]:
# Declare the "p_file_date" text widget (default "2024-12-30") and read its
# current value. The value selects the bronze sub-folder to read from and is
# also written to the "file_date" column.
dbutils.widgets.text("p_file_date", "2024-12-30")
v_file_date = dbutils.widgets.get("p_file_date")


In [0]:
# Echo the widget value as the cell output (visual check only).
v_enviroment

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Echo bronze_folder_path as the cell output (visual check only).
# It is not defined in this notebook; presumably it comes from the
# "../includes/configuration" notebook run above - verify there.
bronze_folder_path

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType


In [0]:
# Explicit schema for movie.csv.
# Each tuple is (column name, Spark data type, whether nulls are allowed).
movie_schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in (
        ("movieId", IntegerType(), False),
        ("title", StringType(), True),
        ("budget", DoubleType(), True),
        ("homepage", StringType(), True),
        ("overview", StringType(), True),
        ("popularity", DoubleType(), True),
        ("yearReleaseDate", IntegerType(), True),
        ("releaseDate", DateType(), True),
        ("revenue", DoubleType(), True),
        ("durationTime", IntegerType(), True),
        ("movieStatus", StringType(), True),
        ("tagline", StringType(), True),
        ("voteAverage", DoubleType(), True),
        ("voteCount", IntegerType(), True),
    )
])

In [0]:
# Read movie.csv from the bronze folder for the requested file date.
# The first row is treated as a header and the explicit schema above is
# applied instead of letting Spark infer types.
movie_df = (
    spark.read
    .option("header", True)
    .schema(movie_schema)
    .csv(f"{bronze_folder_path}/{v_file_date}/movie.csv")
)

#### Paso 2 - Seleccionar solo las columnas "requeridas"

In [0]:
from pyspark.sql.functions import col #Importo col para seleccionar columnas

In [0]:
# Keep only the required columns; homepage, overview, movieStatus and tagline
# are left out.
# col() yields Column objects, so further column functions could be chained
# onto each entry (which plain column-name strings do not allow).
movies_selected_df = movie_df.select(
    col("movieId"),
    col("title"),
    col("budget"),
    col("popularity"),
    col("yearReleaseDate"),
    col("releaseDate"),
    col("revenue"),
    col("durationTime"),
    col("voteAverage"),
    col("voteCount"),
)

#### Paso 3 - Cambiar el nombre a las columnas según lo "requerido"

In [0]:
# Rename the camelCase source columns to snake_case, one call per column.
# Columns not listed here (title, budget, popularity, revenue) keep their names.
movies_renamed_df = (
    movies_selected_df
    .withColumnRenamed("movieId", "movie_id")
    .withColumnRenamed("yearReleaseDate", "year_release_date")
    .withColumnRenamed("releaseDate", "release_date")
    .withColumnRenamed("durationTime", "duration_time")
    .withColumnRenamed("voteAverage", "vote_average")
    .withColumnRenamed("voteCount", "vote_count")
)

In [0]:
# Same camelCase -> snake_case rename done in a single call with an
# {old_name: new_name} mapping. This rebinds movies_renamed_df starting again
# from movies_selected_df, so it yields the same columns as the chained
# withColumnRenamed version.
movies_renamed_df = movies_selected_df.withColumnsRenamed(
    {
        "movieId": "movie_id",
        "yearReleaseDate": "year_release_date",
        "releaseDate": "release_date",
        "durationTime": "duration_time",
        "voteAverage": "vote_average",
        "voteCount": "vote_count",
    }
)

#### Paso 4 - Agregar una columna "ingestion_date" al DataFrame

In [0]:
from pyspark.sql.functions import current_timestamp, lit

In [0]:
# add_ingestion_date is not defined in this notebook; presumably it comes from
# "../includes/common_functions" and appends the ingestion timestamp column -
# verify there.
# lit() wraps the plain widget strings as literal Columns, which is the form
# withColumn expects for the new "enviroment" and "file_date" columns.
movies_final_df = (
    add_ingestion_date(movies_renamed_df)
    .withColumn("enviroment", lit(v_enviroment))
    .withColumn("file_date", lit(v_file_date))
)

#### Paso 5 - Escribir datos en el datalake en formato "Delta"

https://spark.apache.org/docs/latest/sql-ref-syntax.html#ddl-statements

In [0]:
# Disabled call kept for reference; this cell does nothing when run.
# NOTE(review): looks like an earlier partition-overwrite write path that the
# merge cell below replaces - confirm, then delete this cell.
#ovewrite_partition(movies_final_df,"movie_silver","movies","file_date")


In [0]:
# Rows are matched between target (tgt) and source (src) on movie id and file date.
merge_condition = 'tgt.movie_id = src.movie_id AND tgt.file_date = src.file_date'

# merge_delta_lake is not defined in this notebook; presumably it comes from
# "../includes/common_functions" - verify its argument order there.
merge_delta_lake(
    movies_final_df,
    'movie_silver',
    'movies',
    silver_folder_path,
    merge_condition,
    'file_date',
)

In [0]:
# List the contents of the silver "movies" folder (same listing as `%fs ls`).
# NOTE(review): the mount path is hard-coded; if silver_folder_path points at
# the same location, prefer building the path from it - confirm first.
display(dbutils.fs.ls("/mnt/historialpeliculas/silver/movies/"))

In [0]:
%sql
-- Row count per file_date in movie_silver.movies.
SELECT file_date, COUNT(1)
FROM movie_silver.movies
GROUP BY file_date;

In [0]:
# End the notebook run and hand "Success" back as its exit value.
dbutils.notebook.exit("Success")