In [1]:
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.functions._

Initializing Scala interpreter ...

Spark Web UI available at http://cluster-bd33-m:8088/proxy/application_1732719982311_0008
SparkContext available as 'sc' (version = 3.1.3, master = yarn, app id = application_1732719982311_0008)
SparkSession available as 'spark'


import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.functions._


In [2]:
// Read the ratings CSV from Google Cloud Storage, treating the first line as a header row.
// No schema is supplied, so the columns are loaded as strings.
val ratingsPath = "gs://priyanshi-spark-bucket-2/rating.csv"
val ratingsDF = spark.read
  .option("header", "true")
  .csv(ratingsPath)

// Preview the first five rows.
ratingsDF.show(5)

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|      2|   3.5|2005-04-02 23:53:47|
|     1|     29|   3.5|2005-04-02 23:31:16|
|     1|     32|   3.5|2005-04-02 23:33:39|
|     1|     47|   3.5|2005-04-02 23:32:07|
|     1|     50|   3.5|2005-04-02 23:29:40|
+------+-------+------+-------------------+
only showing top 5 rows



ratingsPath: String = gs://priyanshi-spark-bucket-2/rating.csv
ratingsDF: org.apache.spark.sql.DataFrame = [userId: string, movieId: string ... 2 more fields]


In [3]:
// Filter out invalid records: keep only rows in which userId, movieId and rating are all present.
// movieId is checked explicitly because the later RDD conversion reads it from every row.
val validRatingsDF = ratingsDF.filter(
  col("userId").isNotNull && col("movieId").isNotNull && col("rating").isNotNull
)

validRatingsDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [userId: string, movieId: string ... 2 more fields]


In [4]:
// Take the first 100000 valid rows and reshape each one into a pair keyed by user:
//   userId -> (movieId, rating)
// All columns arrive as strings; only the rating is parsed to a Double.
val ratingsRDD = validRatingsDF
  .limit(100000)
  .rdd
  .map { record =>
    val user = record.getAs[String]("userId")
    val movie = record.getAs[String]("movieId")
    val score = record.getAs[String]("rating").toDouble
    user -> (movie -> score)
  }

// Print a small sample to sanity-check the shape of the pairs.
for (entry <- ratingsRDD.take(5)) println(entry)

(1,(2,3.5))
(1,(29,3.5))
(1,(32,3.5))
(1,(47,3.5))
(1,(50,3.5))


ratingsRDD: org.apache.spark.rdd.RDD[(String, (String, Double))] = MapPartitionsRDD[22] at map at <console>:29


In [5]:
// Collect every (movieId, rating) pair under its userId key.
val groupedByUserRDD: org.apache.spark.rdd.RDD[(String, Iterable[(String, Double)])] =
  ratingsRDD.groupByKey()

groupedByUserRDD: org.apache.spark.rdd.RDD[(String, Iterable[(String, Double)])] = ShuffledRDD[23] at groupByKey at <console>:29


In [6]:
import org.apache.spark.rdd.RDD
import org.apache.hadoop.fs.{FileSystem, Path}
import java.io.{BufferedWriter, OutputStreamWriter}

// Pull ten (userId, ratings) groups back to the driver; everything below runs on the driver.
val first10Users = groupedByUserRDD.take(10)

// For each of those users, write hdfs:///user/priyanshi/user-data/<userId>/ratings.csv
// containing one "movieId, rating" line per rating.
locally {
  // The filesystem handle does not depend on the user, so obtain it once rather than per iteration.
  // It is intentionally not closed here.
  // NOTE(review): FileSystem.get may hand back a cached instance shared within the JVM - confirm
  // before adding an fs.close().
  val fs = FileSystem.get(new java.net.URI("hdfs:///"), new org.apache.hadoop.conf.Configuration())

  first10Users.foreach { case (userId, ratingsList) =>
    val path = new Path(s"hdfs:///user/priyanshi/user-data/${userId}/ratings.csv")

    // Make sure the per-user directory exists before creating the file inside it.
    val userDir = path.getParent
    if (!fs.exists(userDir)) {
      fs.mkdirs(userDir)
    }

    val ratingsText = ratingsList
      .map { case (movieId, rating) => s"${movieId}, ${rating}" }
      .mkString("\n")

    val outputStream = fs.create(path)
    try {
      // Encode explicitly as UTF-8 instead of relying on the JVM's platform default charset.
      val writer = new BufferedWriter(
        new OutputStreamWriter(outputStream, java.nio.charset.StandardCharsets.UTF_8)
      )
      writer.write(ratingsText)
      // Push buffered characters through to the stream before it is closed below.
      writer.flush()
    } finally {
      // Always release the HDFS stream, even when the write fails part-way.
      outputStream.close()
    }
  }
}

import org.apache.spark.rdd.RDD
import org.apache.hadoop.fs.{FileSystem, Path}
import java.io.{BufferedWriter, OutputStreamWriter}
first10Users: Array[(String, Iterable[(String, Double)])] = Array((273,CompactBuffer((50,4.0), (104,4.0), (247,4.0), (296,3.0), (305,2.0), (356,3.0), (535,5.0), (593,3.0), (608,4.0), (762,1.0), (904,3.0), (912,4.0), (919,3.0), (1078,3.0), (1193,4.0), (1197,2.0), (1203,3.0), (1204,4.0), (1206,4.0), (1213,3.0), (1221,4.0), (1230,5.0), (1233,3.0), (1247,3.0), (1270,3.0), (1300,4.0), (1617,2.0), (1635,4.0), (1734,4.0), (2289,4.0), (2351,4.0), (2590,4.0), (2692,3.0), (2858,4.0), (2920,4.0), (2978,3.0), (3019,3.0), (3185,3.0), (3198,4.0), (3298,2.0), (3317,3.0), (3408,4.0), (3468,3.0), (3481,4.0), (3538,3.0), (3556,3.0), (3567,4.0), (3594,4.0), (3751,2.0), (3753,1...
