In [5]:
from pyspark.sql import SparkSession, Row, Window
from pyspark.sql.functions import *
from pyspark import SparkConf
from pyspark.sql.types import IntegerType


# Spark configuration: standalone master URL, application name and the
# driver/executor resource settings used by this notebook.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("GCSExample")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Register the GCS connector classes in the Hadoop configuration so that
# paths using the gs:// scheme can be resolved.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Cloud Storage bucket the BigQuery connector uses for temporary export data.
bucket = "temp_de2023_2065718"
spark.conf.set("temporaryGcsBucket", bucket)

# Root of the Cloud Storage bucket holding the input data
# (replace with your own GCP bucket name).
gsc_file_path = "gs://data_de2023_2065718"

# Load the movies CSV into a DataFrame, using the first line as the header.
movie_df = spark.read.option("header", "true").csv(
    f"{gsc_file_path}/rotten_tomatoes_movies.csv"
)

# Inspect the schema and preview the loaded rows.
movie_df.printSchema()
movie_df.show()

Unnamed: 0,id,title,audienceScore,tomatoMeter,rating,ratingContents,releaseDateTheaters,releaseDateStreaming,runtimeMinutes,genre,originalLanguage,director,writer,boxOffice,distributor,soundMix
0,space-zombie-bingo,Space Zombie Bingo!,50.0,,,,,2018-08-25,75.0,"Comedy, Horror, Sci-fi",English,George Ormrod,"George Ormrod,John Sabotta",,,
1,the_green_grass,The Green Grass,,,,,,2020-02-11,114.0,Drama,English,Tiffany Edwards,Tiffany Edwards,,,
2,love_lies,"Love, Lies",43.0,,,,,,120.0,Drama,Korean,"Park Heung-Sik,Heung-Sik Park","Ha Young-Joon,Jeon Yun-su,Song Hye-jin",,,
3,the_sore_losers_1997,Sore Losers,60.0,,,,,2020-10-23,90.0,"Action, Mystery & thriller",English,John Michael McCarthy,John Michael McCarthy,,,
4,dinosaur_island_2002,Dinosaur Island,70.0,,,,,2017-03-27,80.0,"Fantasy, Adventure, Animation",English,Will Meugniot,John Loy,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143253,nadia_the_secret_of_blue_water_the_motion_pict...,Nadia: The Secret of Blue Water: The Motion Pi...,14.0,,,,2002-08-27,,90.0,"Action, Adventure, Anime",Japanese,Sho Aono,Kaoru Umeno,,ADV Films,
143254,everyone_i_knew_and_loved,Everyone I Knew and Loved,,,,,,,99.0,Drama,English,Andrew Behringer,Erika Heidewald,,,
143255,the-human-body,The Human Body,71.0,89.0,,,,,43.0,Documentary,English,Peter Georgi,Richard Dale,,,
143256,flying_fists,Flying Fists,,,,,,2006-11-21,63.0,Drama,English,Robert F. Hill,"Robert F. Hill,Basil Dickey",,,


In [3]:
# Clean the loaded movie data: fill defaults, drop unused columns, fix column
# types, discard incomplete rows and derive release-date components.

# Handling missing values: a missing rating becomes age class "G", missing
# rating contents become "None" and a missing distributor becomes "independent".
movie_df = (
    movie_df.withColumn("age_class", coalesce(col("rating"), lit("G")))
    .withColumn("ratingContents", coalesce(col("ratingContents"), lit("None")))
    .withColumn("distributor", coalesce(col("distributor"), lit("independent")))
)

# Dropping columns ("rating" has been copied into "age_class" above)
columns_to_drop = ["soundMix", "boxOffice", "rating"]
movie_df = movie_df.drop(*columns_to_drop)

# audienceScore to integer
movie_df = movie_df.withColumn(
    "audienceScore", movie_df["audienceScore"].cast(IntegerType())
)

# Converting releaseDateTheaters to a date.
# This runs BEFORE the null filter below (like the audienceScore cast) so that
# any value to_date leaves as null is removed together with the rows that were
# missing a date, instead of surviving with null releaseDay/Month/Year/Weekday.
movie_df = movie_df.withColumn(
    "releaseDateTheaters", to_date(col("releaseDateTheaters"), "y-M-d")
)  # our dataframe with date column

# Dropping rows with missing values in specific columns
columns_to_check_for_null = [
    "audienceScore",
    "tomatoMeter",
    "releaseDateTheaters",
    "runtimeMinutes",
    "genre",
    "director",
    "originalLanguage",
]
movie_df = movie_df.dropna(subset=columns_to_check_for_null)

# Extracting date components
movie_df = (
    movie_df.withColumn("releaseDay", day(col("releaseDateTheaters")))
    .withColumn("releaseMonth", month(col("releaseDateTheaters")))
    .withColumn("releaseYear", year(col("releaseDateTheaters")))
    .withColumn("releaseWeekday", weekday(col("releaseDateTheaters")))
)

# Show the resulting PySpark DataFrame
movie_df.show()

In [None]:
# Rank movies by audience score within each (genre, releaseYear) group.
# The ordering is ascending, so rank 1 is the lowest audience score in its
# group; dense_rank is used for the ranking.
windowasc = Window.partitionBy("genre", "releaseYear").orderBy(
    col("audienceScore").asc()
)

movie_df_window = movie_df.withColumn("rank", dense_rank().over(windowasc))

# Projection that is both exported to BigQuery and previewed below.
movie_rank_df = movie_df_window.select(
    "title", "audienceScore", "genre", "releaseYear", "rank"
)

# Overwrite the BigQuery table with the ranked movies.
(
    movie_rank_df.write.format("bigquery")
    .option("table", "de23-398309.assignment2dataset.movierank")
    .mode("overwrite")
    .save()
)

# Preview up to 100 of the ranked rows.
movie_rank_df.show(100)

In [None]:
# Average audience score per age class.
age_performance_df = movie_df.groupBy("age_class").agg(
    avg("audienceScore").alias("avg_audience_score")
)

# Overwrite the BigQuery table with the aggregate.
(
    age_performance_df.write.format("bigquery")
    .option("table", "de23-398309.assignment2dataset.ageperformance")
    .mode("overwrite")
    .save()
)

# Display the same aggregate in the notebook.
age_performance_df.show()

In [None]:
# Stop the SparkSession created in the first cell; run this last, after the
# BigQuery writes above have finished.
spark.stop()