In [None]:
import org.apache.spark.sql.{SparkSession, functions => F}
import org.apache.spark.sql.types._

// Step 2: Explicit schema applied when reading rating.csv.
// Every column is declared nullable.
// NOTE(review): "timestamp" is read as a Long and later handed to
// from_unixtime, so it is assumed to hold epoch seconds - confirm against the file.
val schema = new StructType()
  .add("userId", IntegerType, nullable = true)
  .add("movieId", IntegerType, nullable = true)
  .add("rating", DoubleType, nullable = true)
  .add("timestamp", LongType, nullable = true)

Intitializing Scala interpreter ...

In [None]:
// Step 3: Read rating.csv from GCP Cloud Storage into a DataFrame.
// The first line of the file is treated as a header row and the explicit
// `schema` defined earlier is applied to the columns.
val ratingsDF = {
  val csvReader = spark.read.schema(schema).option("header", "true")
  csvReader.csv("gs://priyanshi-spark-bucket-2/rating.csv")
}

In [None]:
// Step 4: Transformation - derive a "year" column from the "timestamp" column.
val ratingsWithYearDF = {
  // from_unixtime interprets the value as seconds since the epoch.
  val ratingTime = F.from_unixtime(F.col("timestamp"))
  ratingsDF.withColumn("year", F.year(ratingTime))
}

// Only the first 1000 rows are carried into the following steps.
val ratingsWithYearDF1 = ratingsWithYearDF.limit(1000)

In [None]:
// Step 5: Switch to the RDD API and key every row by its "year" value,
// producing (year, row) pairs.
val ratingsByYearRDD = ratingsWithYearDF1.rdd.keyBy(_.getAs[Int]("year"))

In [None]:
import org.apache.spark.sql.Row

// Step 6: Write one Parquet dataset per year to hdfs:///ratings/<year>/rating.parquet.
//
// The per-year writes are driven from the driver. The SparkSession and
// SparkContext must not be used inside an RDD action such as foreach: that
// closure runs on the executors, where creating RDDs/DataFrames and
// triggering writes is not supported.
//
// The input is persisted first because the year list and every per-year
// filter re-evaluate it, and limit() without an ordering is not guaranteed to
// return the same rows on each evaluation.
val yearSourceDF = ratingsWithYearDF1.persist()
try {
  // The set of distinct years is small, so collecting it to the driver is cheap.
  val distinctYears = yearSourceDF
    .select("year")
    .where(F.col("year").isNotNull) // rows without a year have no target directory and are skipped
    .distinct()
    .collect()
    .map(_.getInt(0))

  distinctYears.foreach { year =>
    // Each write is an ordinary distributed job over the rows of that year.
    yearSourceDF
      .where(F.col("year") === year)
      .write
      .mode("overwrite")
      .parquet(s"hdfs:///ratings/$year/rating.parquet")
  }
} finally {
  // Release the cached data whether or not the writes succeeded.
  yearSourceDF.unpersist()
}

In [None]:
// Step 7: Verification (Optional)
// Read back the Parquet output of a single year and print its row count.
// NOTE(review): the read presumably fails if nothing was written for this
// year - pick a year that is present in the data.
val year = 2020
val specificYearDF = spark.read.parquet(s"hdfs:///ratings/$year/rating.parquet")
val recordCount = specificYearDF.count()
println(s"Record count for year $year: $recordCount")