In [1]:
import org.apache.spark.sql.{SparkSession, functions => F}


// Reading the files from GCP
// Reuse the active SparkSession if there is one, otherwise create it.
val spark = SparkSession
  .builder()
  .appName("Genre-Specific Data Aggregation Pipeline")
  .getOrCreate()

// Input locations in the GCS bucket.
val moviesPath = "gs://priyanshi-spark-bucket/movie.csv"
val ratingsPath = "gs://priyanshi-spark-bucket/rating.csv"

// Movies CSV: the first line is a header and column types are inferred from the data.
val moviesDF = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv(moviesPath)

// Ratings CSV, read with the same header/inferSchema settings.
val ratingsDF = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv(ratingsPath)

// Print the first rows of each DataFrame as a quick sanity check.
moviesDF.show()
ratingsDF.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

spark = org.apache.spark.sql.SparkSession@127cdcf0
moviesPath = gs://priyanshi-spark-bucket/movie.csv
moviesDF = [movieId: int, title: string ... 1 more field]
ratingsPath = gs://priyanshi-spark-bucket/rating.csv
ratingsDF = [userId: int, movieId: int ... 2 more fields]


import org.apache.spark.sql.{SparkSession, functions=>F}


[userId: int, movieId: int ... 2 more fields]

In [2]:
// Produce one row per (movie, genre): the pipe-delimited "genres" string is split
// and exploded into a new "genre" column, then the original column is removed.
val explodedGenresDF = {
  val genreTokens = F.split(F.col("genres"), "\\|")
  moviesDF
    .withColumn("genre", F.explode(genreTokens))
    .drop("genres")
}
explodedGenresDF.show()

+-------+--------------------+---------+
|movieId|               title|    genre|
+-------+--------------------+---------+
|      1|    Toy Story (1995)|Adventure|
|      1|    Toy Story (1995)|Animation|
|      1|    Toy Story (1995)| Children|
|      1|    Toy Story (1995)|   Comedy|
|      1|    Toy Story (1995)|  Fantasy|
|      2|      Jumanji (1995)|Adventure|
|      2|      Jumanji (1995)| Children|
|      2|      Jumanji (1995)|  Fantasy|
|      3|Grumpier Old Men ...|   Comedy|
|      3|Grumpier Old Men ...|  Romance|
|      4|Waiting to Exhale...|   Comedy|
|      4|Waiting to Exhale...|    Drama|
|      4|Waiting to Exhale...|  Romance|
|      5|Father of the Bri...|   Comedy|
|      6|         Heat (1995)|   Action|
|      6|         Heat (1995)|    Crime|
|      6|         Heat (1995)| Thriller|
|      7|      Sabrina (1995)|   Comedy|
|      7|      Sabrina (1995)|  Romance|
|      8| Tom and Huck (1995)|Adventure|
+-------+--------------------+---------+
only showing top

explodedGenresDF = [movieId: int, title: string ... 1 more field]


[movieId: int, title: string ... 1 more field]

In [3]:
// Convert the exploded rows to an RDD of (movieId, title, genre) tuples,
// relabelling the "Sci-Fi" genre as "Science Fiction" and leaving all others as-is.
val genreRDD = explodedGenresDF
  .select("movieId", "title", "genre")
  .rdd
  .map { row =>
    val id = row.getInt(0)
    val movieTitle = row.getString(1)
    val rawGenre = row.getString(2)
    val normalizedGenre = if (rawGenre == "Sci-Fi") "Science Fiction" else rawGenre
    (id, movieTitle, normalizedGenre)
  }

genreRDD = MapPartitionsRDD[39] at map at <console>:28


MapPartitionsRDD[39] at map at <console>:28

In [4]:
// Rebuild a DataFrame from the (movieId, title, genre) tuples held in genreRDD.
val normalizedGenresDF = {
  val columnNames = Seq("movieId", "title", "genre")
  genreRDD.toDF(columnNames: _*)
}

// Equi-join on movieId: each rating is paired with every genre row of the rated movie.
val joinedDF = ratingsDF.join(normalizedGenresDF, Seq("movieId"))
joinedDF.show()


normalizedGenresDF = [movieId: int, title: string ... 1 more field]
joinedDF = [movieId: int, userId: int ... 4 more fields]


+-------+------+------+-------------------+--------------------+---------------+
|movieId|userId|rating|          timestamp|               title|          genre|
+-------+------+------+-------------------+--------------------+---------------+
|   3997|     1|   3.5|2005-04-02 23:56:32|Dungeons & Dragon...|        Fantasy|
|   3997|     1|   3.5|2005-04-02 23:56:32|Dungeons & Dragon...|      Adventure|
|   1580|     2|   4.0|2000-11-21 15:32:28|Men in Black (a.k...|Science Fiction|
|   1580|     2|   4.0|2000-11-21 15:32:28|Men in Black (a.k...|         Comedy|
|   1580|     2|   4.0|2000-11-21 15:32:28|Men in Black (a.k...|         Action|
|   3918|     2|   3.0|2000-11-21 15:35:43|Hellbound: Hellra...|         Horror|
|   2366|     3|   4.0|1999-12-11 13:18:30|    King Kong (1933)|         Horror|
|   2366|     3|   4.0|1999-12-11 13:18:30|    King Kong (1933)|        Fantasy|
|   2366|     3|   4.0|1999-12-11 13:18:30|    King Kong (1933)|      Adventure|
|   2366|     3|   4.0|1999-

[movieId: int, userId: int ... 4 more fields]

In [5]:
// Extract (genre, rating) pairs from the joined data.
// Rows with a null genre or a null/NaN rating are dropped before leaving the DataFrame
// API: Row.getDouble throws on a null cell, so a single blank rating in the source CSV
// would otherwise abort the whole job, and a NaN would poison that genre's average.
val genreRatingsRDD = joinedDF.select("genre", "rating")
  .na.drop(Seq("genre", "rating"))
  .rdd
  .map(row => (row.getString(0), row.getDouble(1)))

// Aggregate to calculate the average rating for each genre.
// The accumulator is (sum of ratings, number of ratings). The count is a Long because
// an Int counter wraps around silently past Int.MaxValue (~2.1 billion) ratings for a
// single genre, which would yield a wrong (even negative) average.
val genreAverageRatingsRDD = genreRatingsRDD
  .aggregateByKey((0.0, 0L))(
    // Within a partition: fold one rating into the running (sum, count).
    (acc, rating) => (acc._1 + rating, acc._2 + 1L),
    // Across partitions: merge two partial (sum, count) accumulators.
    (acc1, acc2) => (acc1._1 + acc2._1, acc1._2 + acc2._2)
  )
  .mapValues { case (sum, count) => sum / count }


genreRatingsRDD = MapPartitionsRDD[66] at map at <console>:27
genreAverageRatingsRDD = MapPartitionsRDD[68] at mapValues at <console>:35


MapPartitionsRDD[68] at mapValues at <console>:35

In [6]:
// Name the tuple fields so the result has a (genre, average_rating) schema.
val genreAverageRatingsDF = genreAverageRatingsRDD.toDF("genre", "average_rating")

// Write the averages as Parquet; existing output at this location is overwritten.
val outputPath = "gs://priyanshi-spark-bucket/genre_average_ratings"
genreAverageRatingsDF.write
  .format("parquet")
  .mode("overwrite")
  .save(outputPath)

// Report the output location once the write call has returned.
println(s"Results saved to $outputPath")


Results saved to gs://priyanshi-spark-bucket/genre_average_ratings


genreAverageRatingsDF = [genre: string, average_rating: double]
outputPath = gs://priyanshi-spark-bucket/genre_average_ratings


gs://priyanshi-spark-bucket/genre_average_ratings