In [0]:
# Remove the previous bronze tags table: the old table's schema no longer
# matches, so it is dropped here to clear the schema mismatch.
spark.sql(
    "DROP TABLE IF EXISTS workspace.bronze.tags"
)

DataFrame[]

In [0]:
from pyspark.sql.functions import current_timestamp, lit

# Configuration: directory that holds the raw MovieLens CSV files.
raw_path = "/Volumes/workspace/default/movielens_raw"


def _ingest_csv_to_bronze(file_name: str, table_name: str, label: str):
    """Load one raw CSV file into a bronze Delta table and print its row count.

    The file is read with a header row and inferred column types. Two lineage
    columns are appended before writing: ``ingestion_timestamp`` (from
    ``current_timestamp()``) and ``source`` (the constant ``"movielens_csv"``).

    Args:
        file_name: CSV file name, relative to ``raw_path``.
        table_name: Fully qualified name of the target table.
        label: Human-readable name used in the progress message.

    Returns:
        The DataFrame that was written to ``table_name``.
    """
    df = (
        spark.read
        .option("header", "true")
        .option("inferSchema", "true")
        # RFC-4180 style quoting: fields may be wrapped in double quotes and a
        # literal quote inside a field is written as two double quotes. Applied
        # to every file so all quoted text columns are parsed the same way; it
        # has no effect on files without quoted fields.
        .option("quote", '"')
        .option("escape", '"')
        .csv(f"{raw_path}/{file_name}")
        .withColumn("ingestion_timestamp", current_timestamp())
        .withColumn("source", lit("movielens_csv"))
    )

    (
        df.write
        .format("delta")
        .mode("overwrite")
        # Let the overwrite also replace the table schema, so a re-run does not
        # fail when the inferred schema differs from the existing table's.
        .option("overwriteSchema", "true")
        .saveAsTable(table_name)
    )

    # Count from the saved table (not from df) to confirm what actually landed.
    print(f"✅ {label}: {spark.table(table_name).count()} rows")
    return df


# Ingest each file in turn; the DataFrames stay available under their usual names.
df_ratings = _ingest_csv_to_bronze("ratings.csv", "workspace.bronze.ratings", "Ratings")
df_movies = _ingest_csv_to_bronze("movies.csv", "workspace.bronze.movies", "Movies")
df_links = _ingest_csv_to_bronze("links.csv", "workspace.bronze.links", "Links")
df_tags = _ingest_csv_to_bronze("tags.csv", "workspace.bronze.tags", "Tags")

✅ Ratings: 32000204 rows
✅ Movies: 87585 rows
✅ Links: 87585 rows
✅ Tags: 2000072 rows


In [0]:
%sql
-- Sanity check: row count of each bronze table, one result row per table.
SELECT 'ratings' AS table_name, COUNT(*) AS row_count
FROM workspace.bronze.ratings
UNION ALL
SELECT 'movies' AS table_name, COUNT(*) AS row_count
FROM workspace.bronze.movies
UNION ALL
SELECT 'links' AS table_name, COUNT(*) AS row_count
FROM workspace.bronze.links
UNION ALL
SELECT 'tags' AS table_name, COUNT(*) AS row_count
FROM workspace.bronze.tags

table_name,row_count
ratings,32000204
movies,87585
links,87585
tags,2000072
