In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [0]:
# Catalog (first part of the three-level table name) that the bronze tables
# written further down are saved under.
catalog_name = "movies"

# IMDb Movie Lists

In [0]:
# Location of the raw IMDb movie-list CSV that the read cell below loads.
raw_data_path = '/Volumes/movies/source_data/raw_data/imdb_list.csv'

In [0]:
df_raw = (spark.read.option("header", "true").option("delimiter", ",").csv(raw_data_path))
display(df_raw.limit(5))

In [0]:
df_raw = df_raw.drop(df_raw.columns[0])

In [0]:
# Preview the first rows after the leading column was dropped. Uses display(),
# like every other preview in this notebook, to get the rich table rendering
# instead of show()'s plain-text output.
display(df_raw.limit(3))

In [0]:
# Target type for each column kept in the bronze movie table, in output order.
movie_column_types = {
    "id": "string",
    "title": "string",
    "rating": "double",
    "genre": "string",
    "year": "int",
}
typed_movie_columns = [
    F.col(column_name).cast(target_type)
    for column_name, target_type in movie_column_types.items()
]

# Keep only the typed columns, then attach lineage: the file each row was read
# from and the time this ingestion ran.
df_movies = (
    df_raw
    .select(*typed_movie_columns)
    .withColumn("_source_file", F.col("_metadata.file_path"))
    .withColumn("ingested_at", F.current_timestamp())
)

display(df_movies.limit(5))

In [0]:
# Save the typed movie list as a Delta table; "overwrite" replaces whatever the
# table held before.
movies_table = f'{catalog_name}.bronze.bronze_imdb_movies'
(
    df_movies.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(movies_table)
)


# IMDb Reviews

In [0]:
# Explicit schema for the reviews CSV, so review_rating is parsed as an integer
# at read time instead of being loaded as a string.
# NOTE(review): Spark's file-based readers generally relax a user-supplied schema
# to nullable, so nullable=False on imdb_id documents intent but is presumably
# not enforced by this read — add an explicit null check if it must hold.
imdb_reviews_schema = StructType([
    StructField("imdb_id", StringType(), False),
    StructField("review_title", StringType(), True),
    StructField("review_rating", IntegerType(), True),
    StructField("review", StringType(), True),
])

# Dedicated variable for the reviews file. Reassigning raw_data_path here would
# silently repoint the movie-list read cell at the reviews CSV whenever that cell
# is re-run after this one.
reviews_data_path = "/Volumes/movies/source_data/raw_data/imdb_reviews.csv"

# NOTE(review): free-text reviews may contain embedded newlines or quotes; if
# rows look misaligned in the preview, check the multiLine / quote / escape
# reader options against the actual file.
df_reviews_raw = (
    spark.read
    .option("header", "true")
    .option("delimiter", ",")
    .schema(imdb_reviews_schema)
    .csv(reviews_data_path)
)

# Attach lineage: the file each row was read from and the time this ingestion ran.
df_reviews = (
    df_reviews_raw
    .withColumn("_source_file", F.col("_metadata.file_path"))
    .withColumn("ingested_at", F.current_timestamp())
)

display(df_reviews.limit(5))

In [0]:
# Save the reviews as a Delta table; "overwrite" replaces whatever the table
# held before.
reviews_table = f'{catalog_name}.bronze.bronze_imdb_reviews'
(
    df_reviews.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(reviews_table)
)