In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.Row
import org.apache.spark.rdd.RDD
import org.apache.hadoop.fs.{FileSystem, Path}
import scala.util.matching.Regex
import org.apache.spark.sql.types._

// Build the SparkSession used by every cell below, or pick up the one that already exists.
val spark = SparkSession
    .builder()
    .appName("CaseStudy3 - Handling Movie Metadata")
    .getOrCreate()

spark = org.apache.spark.sql.SparkSession@25801240


org.apache.spark.sql.SparkSession@25801240

In [2]:
// Location of the raw movie catalogue (CSV file with a header row).
val moviesPath = "gs://task-dataset-bucket/Day_16_17/movie.csv"

// Load the CSV; column names come from the header and column types are inferred from the data.
val moviesDF = spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(moviesPath)

// Flatten each Row into a plain (movieId, title, genres) tuple for RDD-level processing.
val moviesRDD = moviesDF.rdd.map(record =>
    (
        record.getAs[Int]("movieId"),
        record.getAs[String]("title"),
        record.getAs[String]("genres")
    )
)

moviesPath = gs://task-dataset-bucket/Day_16_17/movie.csv
moviesDF = [movieId: int, title: string ... 1 more field]
moviesRDD = MapPartitionsRDD[15] at map at <console>:38


MapPartitionsRDD[15] at map at <console>:38

In [3]:
// Sentinel release year assigned to titles from which no year can be parsed.
val defaultYear = 9999

/**
 * Extracts the release year from a movie title of the form "Name (YYYY)".
 *
 * The text between the last "(" and the last ")" is taken as the candidate
 * year; it is accepted only when it is exactly four digits.
 *
 * @param title movie title, possibly null (e.g. an empty CSV cell)
 * @return the parsed year, or `defaultYear` when the title is null, has no
 *         parenthesised group, has its last ")" before its last "(", or the
 *         group is not a four-digit number
 */
def extractYear(title: String): Int = {
    Option(title).flatMap { t =>
        val startIdx = t.lastIndexOf("(")
        val endIdx = t.lastIndexOf(")")
        // Require ")" to come after "(": otherwise substring would throw and
        // a single malformed title would fail the whole Spark task.
        if (startIdx != -1 && endIdx > startIdx) Some(t.substring(startIdx + 1, endIdx))
        else None
    }.filter(yearStr => yearStr.length == 4 && yearStr.forall(_.isDigit))
        .map(_.toInt)
        .getOrElse(defaultYear)
}

// Pair every movie id with the release year parsed from its title; genres are not needed here.
val metadataRDD: RDD[(Int, Int)] =
    moviesRDD.map { case (id, name, _) => (id, extractYear(name)) }


defaultYear = 9999
metadataRDD = MapPartitionsRDD[16] at map at <console>:46


extractYear: (title: String)Int


MapPartitionsRDD[16] at map at <console>:46

In [4]:
import spark.implicits._

// Expose the (movieId, year) pairs as a two-column DataFrame.
val metadataDF = metadataRDD.toDF("movieId", "releaseYear")

// Write the metadata as JSON from a single partition, replacing any earlier output.
metadataDF
    .coalesce(1)
    .write
    .mode("overwrite")
    .format("json")
    .save("gs://task-dataset-bucket/Day_16_17/metadata")


metadataDF = [movieId: int, releaseYear: int]


[movieId: int, releaseYear: int]

In [5]:
// Read back the JSON metadata written in the previous step.
val metadataJsonDF = spark.read
    .format("json")
    .load("gs://task-dataset-bucket/Day_16_17/metadata")

// Sanity check: list any rows that came back without a release year.
metadataJsonDF.where(col("releaseYear").isNull).show()


+-------+-----------+
|movieId|releaseYear|
+-------+-----------+
+-------+-----------+



metadataJsonDF = [movieId: bigint, releaseYear: bigint]


[movieId: bigint, releaseYear: bigint]

In [6]:
// The JSON reader yields both columns as bigint, so read them as Long and
// narrow to Int to line up with the Int movie ids in moviesRDD.
val metadataJsonRDD = metadataJsonDF.rdd.map(record =>
    (record.getAs[Long]("movieId").toInt, record.getAs[Long]("releaseYear").toInt)
)

// Inner-join movies with their metadata on movieId.
// Result shape: (movieId, ((title, genres), releaseYear)).
val joinedRDD = {
    val moviesById = moviesRDD.map(movie => (movie._1, (movie._2, movie._3)))
    moviesById.join(metadataJsonRDD)
}


metadataJsonRDD = MapPartitionsRDD[34] at map at <console>:40
joinedRDD = MapPartitionsRDD[38] at join at <console>:48


MapPartitionsRDD[38] at join at <console>:48

In [7]:
// Flatten the joined records into rows. When the year is the defaultYear
// sentinel, the sentinel is appended to the title in parentheses.
val finalDF = joinedRDD.map { case (movieId, ((title, genres), releaseYear)) =>
    val displayTitle =
        if (releaseYear == defaultYear) s"$title ($defaultYear)"
        else title
    (movieId, displayTitle, genres, releaseYear)
}.toDF("movieId", "title", "genre", "releaseYear")


finalDF = [movieId: int, title: string ... 2 more fields]


[movieId: int, title: string ... 2 more fields]

In [8]:
// Destination directory for the enriched movie table.
val outputPath = "hdfs:///user/day_16_17/case_study_3"

// Store the result as Parquet, replacing whatever is already at the destination.
finalDF.write
    .mode("overwrite")
    .format("parquet")
    .save(outputPath)


outputPath = hdfs:///user/day_16_17/case_study_3


hdfs:///user/day_16_17/case_study_3

In [9]:
// Preview the first 20 rows of the persisted Parquet output.
spark.read.parquet(outputPath).show(20)

// Preview up to 20 rows whose release year is the defaultYear sentinel.
spark.read
    .parquet(outputPath)
    .where(col("releaseYear") === defaultYear)
    .show(20)


+-------+--------------------+--------------------+-----------+
|movieId|               title|               genre|releaseYear|
+-------+--------------------+--------------------+-----------+
| 113843|Killing Us Softly...|         Documentary|       2010|
| 103301|   Liz & Dick (2012)|               Drama|       2012|
| 110163|  Aujourd'hui (2012)|               Drama|       2012|
|  91902|        Elena (2011)|               Drama|       2011|
|  68522|        Earth (2007)|         Documentary|       2007|
| 111517|10.000 Km (Long D...|       Drama|Romance|       2014|
| 100306|       Angst  (1983)|        Drama|Horror|       1983|
|   5354|Cactus Flower (1969)|              Comedy|       1969|
|   4926|Everybody's Famou...|Comedy|Drama|Musical|       2000|
| 100494| Incir Reçeli (2011)|       Drama|Romance|       2011|
| 117509| City Slacker (2012)|             Romance|       2012|
|   4992|Kate & Leopold (2...|      Comedy|Romance|       2001|
|  92477|Yes: 9012 Live (1...| Documenta

In [10]:
// Stop the SparkSession created in the first cell now that all work is done.
spark.stop()