In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.hadoop.fs.{FileSystem, Path}

// Build the SparkSession used by every later cell in this notebook.
val spark = {
    val applicationName = "Partitioning Ratings Data by Year"
    SparkSession.builder().appName(applicationName).getOrCreate()
}

spark = org.apache.spark.sql.SparkSession@104b1be6


org.apache.spark.sql.SparkSession@104b1be6

In [2]:
// Location of the raw ratings CSV in Google Cloud Storage.
val ratingsPath = "gs://task-dataset-bucket/Day_16_17/rating.csv"

// Load the ratings with an explicit schema instead of `inferSchema`:
// inference makes Spark read the whole file an extra time just to guess the
// column types, and the guessed types can silently change if the data does.
// The types below are the ones inference reported for this file
// (userId/movieId: integer, rating: double, timestamp: timestamp).
// `header = true` is kept so the first line is treated as column names,
// not as a data row.
val ratingsDF = spark.read
    .option("header", "true")
    .schema("userId INT, movieId INT, rating DOUBLE, `timestamp` TIMESTAMP")
    .csv(ratingsPath)

// Inspect Schema
ratingsDF.printSchema()
ratingsDF.show(10) // Display first 10 rows for verification


root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|      2|   3.5|2005-04-02 23:53:47|
|     1|     29|   3.5|2005-04-02 23:31:16|
|     1|     32|   3.5|2005-04-02 23:33:39|
|     1|     47|   3.5|2005-04-02 23:32:07|
|     1|     50|   3.5|2005-04-02 23:29:40|
|     1|    112|   3.5|2004-09-10 03:09:00|
|     1|    151|   4.0|2004-09-10 03:08:54|
|     1|    223|   4.0|2005-04-02 23:46:13|
|     1|    253|   4.0|2005-04-02 23:35:40|
|     1|    260|   4.0|2005-04-02 23:33:46|
+------+-------+------+-------------------+
only showing top 10 rows



ratingsPath = gs://task-dataset-bucket/Day_16_17/rating.csv
ratingsDF = [userId: int, movieId: int ... 2 more fields]


[userId: int, movieId: int ... 2 more fields]

In [3]:
// Append a "year" column holding the calendar year of each rating's timestamp.
val enhancedRatingsDF = ratingsDF.withColumn(
    "year",
    year(ratingsDF("timestamp"))
)

// Preview the first 10 rows, now including the derived column.
enhancedRatingsDF.show(10)


+------+-------+------+-------------------+----+
|userId|movieId|rating|          timestamp|year|
+------+-------+------+-------------------+----+
|     1|      2|   3.5|2005-04-02 23:53:47|2005|
|     1|     29|   3.5|2005-04-02 23:31:16|2005|
|     1|     32|   3.5|2005-04-02 23:33:39|2005|
|     1|     47|   3.5|2005-04-02 23:32:07|2005|
|     1|     50|   3.5|2005-04-02 23:29:40|2005|
|     1|    112|   3.5|2004-09-10 03:09:00|2004|
|     1|    151|   4.0|2004-09-10 03:08:54|2004|
|     1|    223|   4.0|2005-04-02 23:46:13|2005|
|     1|    253|   4.0|2005-04-02 23:35:40|2005|
|     1|    260|   4.0|2005-04-02 23:33:46|2005|
+------+-------+------+-------------------+----+
only showing top 10 rows



enhancedRatingsDF = [userId: int, movieId: int ... 3 more fields]


[userId: int, movieId: int ... 3 more fields]

In [4]:
// Keep at most 100000 rows so the partitioned write below stays small.
val limitedRatingsDF = enhancedRatingsDF.limit(100000)

// Sanity check: report how many rows the limited dataset actually holds.
println(
    s"Number of records in limited dataset: ${limitedRatingsDF.count()}"
)


Number of records in limited dataset: 100000


limitedRatingsDF = [userId: int, movieId: int ... 3 more fields]


[userId: int, movieId: int ... 3 more fields]

In [5]:
// Root HDFS directory that receives one `year=<value>` sub-directory per year.
val partitionedOutputPath = "hdfs:///user/day_16_17/partitioned_ratings"

// Collapse to a single Spark partition first, then write Parquet files
// partitioned on the "year" column, replacing any previous output at the path.
limitedRatingsDF
    .coalesce(1)
    .write
    .mode("overwrite")
    .partitionBy("year")
    .parquet(partitionedOutputPath)

println(s"Data saved to: $partitionedOutputPath")


Data saved to: hdfs:///user/day_16_17/partitioned_ratings


partitionedOutputPath = hdfs:///user/day_16_17/partitioned_ratings


hdfs:///user/day_16_17/partitioned_ratings

In [6]:
// Directory of the single year partition to verify. It is derived from
// `partitionedOutputPath` rather than re-typed, so the read location cannot
// drift from the location the data was written to.
val partitionPath = s"$partitionedOutputPath/year=1996"

// Read only that partition. Setting `basePath` to the dataset root lets Spark
// still recover the `year` partition column from the directory name, even
// though only one sub-directory is loaded.
val partitionedData = spark.read
    .option("basePath", partitionedOutputPath)
    .parquet(partitionPath)

// Show the distinct years present in the loaded data; because a single
// partition directory was read, only that partition's year can appear.
partitionedData.select("year").distinct().show()


+----+
|year|
+----+
|1996|
+----+



partitionPath = hdfs:///user/day_16_17/partitioned_ratings/year=1996
partitionedData = [userId: int, movieId: int ... 3 more fields]


[userId: int, movieId: int ... 3 more fields]

In [7]:
// Shut down the SparkSession now that the notebook's work is finished.
spark.stop()
