# Ingestión del archivo movie.csv

In [0]:
# Declare a text widget named "p_environment" (default: empty string) and read
# its current value; v_env is later written to every row as the "env" column.
dbutils.widgets.text("p_environment", "")
v_env = dbutils.widgets.get("p_environment")
# Bare expression: the notebook echoes the value as the cell output.
v_env

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

### Paso 1 - Leer el archivo CSV usando "DataFrameReader" de Spark

In [0]:
from pyspark.sql.types import StructType, StringType, StructField, IntegerType, DoubleType, StringType, DateType

In [0]:
# Explicit schema for the movie CSV, passed to the reader instead of relying on
# type inference. Each entry below is (column name, Spark type, nullable).
movie_schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in (
        ("movieId", IntegerType(), False),
        ("title", StringType(), True),
        ("budget", DoubleType(), True),
        ("homePage", StringType(), True),
        ("overview", StringType(), True),
        ("popularity", DoubleType(), True),
        ("yearReleaseDate", IntegerType(), True),
        ("releaseDate", DateType(), True),
        ("revenue", DoubleType(), True),
        ("durationTime", IntegerType(), True),
        ("movieStatus", StringType(), True),
        ("tagline", StringType(), True),
        ("voteAverage", DoubleType(), True),
        ("voteCount", IntegerType(), True),
    )
])

In [0]:
# Load the bronze-layer movie CSV: the first line is a header row and the
# columns are typed by movie_schema.
movie_df = (
    spark.read
    .schema(movie_schema)
    .option("header", True)
    .csv(f"{bronze_folder_path}/movie.csv")
)

In [0]:
# Inspection only: show the Python type of the object returned by the reader.
type(movie_df)

In [0]:
# Inspection only: print the schema of the loaded DataFrame.
movie_df.printSchema()

### Paso 2 - Seleccionar solo las columnas "requeridas"

In [0]:
# First approach: select the required columns by name, as plain strings.
# homePage, overview, movieStatus and tagline are dropped by not being listed.
movies_selected_df = movie_df.select("movieId", "title", "budget", "popularity", "yearReleaseDate", "releaseDate", "revenue", "durationTime",  "voteAverage", "voteCount")

In [0]:
# Second approach: same selection, referencing each column as an attribute of
# the DataFrame (movie_df.<column>). Overwrites movies_selected_df.
movies_selected_df = movie_df.select(movie_df.movieId, movie_df.title, movie_df.budget, movie_df.popularity, movie_df.yearReleaseDate, movie_df.releaseDate, movie_df.revenue, movie_df.durationTime, movie_df.voteAverage, movie_df.voteCount)

In [0]:
# Third approach: same selection, referencing each column by indexing the
# DataFrame (movie_df["<column>"]). Overwrites movies_selected_df.
movies_selected_df = movie_df.select(movie_df["movieId"], movie_df["title"], movie_df["budget"], movie_df["popularity"], movie_df["yearReleaseDate"], movie_df["releaseDate"], movie_df["revenue"], movie_df["durationTime"], movie_df["voteAverage"], movie_df["voteCount"])

In [0]:
from pyspark.sql.functions import col 

In [0]:
# Fourth approach: same selection, using the col() function from
# pyspark.sql.functions. Overwrites movies_selected_df; this is the version
# the following cells consume when the notebook runs top to bottom.
movies_selected_df = movie_df.select(col("movieId"), col("title"), col("budget"), col("popularity"), col("yearReleaseDate"), col("releaseDate"), col("revenue"), col("durationTime"), col("voteAverage"), col("voteCount"))

### Paso 3 - Cambiar el nombre de las columnas según lo requerido

In [0]:
# Rename the camelCase source columns to snake_case, one column per call.
# title, budget, popularity and revenue keep their original names.
movies_renamed_df = (
    movies_selected_df
    .withColumnRenamed("movieId", "movie_id")
    .withColumnRenamed("yearReleaseDate", "year_release_date")
    .withColumnRenamed("releaseDate", "release_date")
    .withColumnRenamed("durationTime", "duration_time")
    .withColumnRenamed("voteAverage", "vote_average")
    .withColumnRenamed("voteCount", "vote_count")
)

In [0]:
# Same renaming done in a single call with an {old_name: new_name} mapping.
# NOTE(review): withColumnsRenamed needs Spark 3.4 or later — confirm the
# cluster runtime.
movies_renamed_df = movies_selected_df.withColumnsRenamed(
    {
        "movieId": "movie_id",
        "yearReleaseDate": "year_release_date",
        "releaseDate": "release_date",
        "durationTime": "duration_time",
        "voteAverage": "vote_average",
        "voteCount": "vote_count",
    }
)

### Paso 4 - Agregar las columnas "ingestion_date" y "env" al DataFrame

In [0]:
from pyspark.sql.functions import current_timestamp, lit

In [0]:
# First approach: delegate to the add_ingestion_date helper, then add "env".
# add_ingestion_date is not defined in this notebook; presumably it comes from
# ../includes/common_functions and adds an ingestion_date column — confirm.
# lit() wraps v_env as a literal column, so every row gets the same value.
movies_final_df = (
    add_ingestion_date(movies_renamed_df)
    .withColumn("env", lit(v_env))
)

In [0]:
# Second approach: add both columns in one withColumns call.
# ingestion_date is the current timestamp; env is the literal value of v_env.
# Overwrites the movies_final_df built by the first approach.
movies_final_df = movies_renamed_df.withColumns(
    {
        "ingestion_date": current_timestamp(),
        "env": lit(v_env),
    }
)

### Paso 5 - Escribir datos en el DataLake en formato parquet

Los datos quedan disponibles de dos maneras: como archivos parquet en la ruta especificada y como la tabla `movie_silver.movies`. Este comportamiento se repite en todos los notebooks de ingestión.

In [0]:
# Persist the final DataFrame as the parquet-backed table movie_silver.movies,
# replacing any existing contents.
#
# Earlier path-based write, kept disabled for reference:
# movies_final_df.write.mode('overwrite').partitionBy('year_release_date').parquet(f'{silver_folder_path}/movies')
#
# NOTE(review): the disabled write partitioned by year_release_date; the table
# write below does not. Confirm that dropping the partitioning is intended.
(
    movies_final_df.write
    .mode("overwrite")
    .format("parquet")
    .saveAsTable("movie_silver.movies")
)

In [0]:
%sql
-- Inspection only: query every row of the table written by the previous cell.
SELECT * FROM movie_silver.movies

In [0]:
%python
# Inspection only: list the files under {silver_folder_path}/movies.
# NOTE(review): presumably the movie_silver database is located at
# silver_folder_path, so the table's files land here — confirm.
display(dbutils.fs.ls(f"{silver_folder_path}/movies"))

In [0]:
# Inspection only: read the parquet files back from the silver path and render
# them in the notebook.
df = spark.read.parquet(f"{silver_folder_path}/movies")
display(df)

In [0]:
# Inspection only: render the in-memory DataFrame that was written.
display(movies_final_df)

In [0]:
# End the notebook and hand an exit value back to whatever ran it.
# The value was previously misspelled "Sucess"; any caller that compares the
# exit value should expect "Success".
dbutils.notebook.exit("Success")