In [0]:
from pyspark.sql import SparkSession

# Obtain the Spark session explicitly rather than relying on a `spark` global
# injected by the hosting environment. getOrCreate() returns the already
# active session when one exists, so nothing changes where `spark` was
# predefined, and the cell no longer fails with NameError on a plain kernel.
spark = SparkSession.builder.getOrCreate()

# Load the JSON file into a DataFrame (no explicit schema is passed to the reader)
movies_df = spark.read.json("/FileStore/movies.json")

# Display the first 10 records
movies_df.show(10)

# Print the schema of the DataFrame
movies_df.printSchema()


+-----------------+--------------------+-------------+
|       actor_name|         movie_title|produced_year|
+-----------------+--------------------+-------------+
|McClure, Marc (I)|        Coach Carter|         2005|
|McClure, Marc (I)|         Superman II|         1980|
|McClure, Marc (I)|           Apollo 13|         1995|
|McClure, Marc (I)|            Superman|         1978|
|McClure, Marc (I)|  Back to the Future|         1985|
|McClure, Marc (I)|Back to the Futur...|         1990|
|Cooper, Chris (I)|  Me, Myself & Irene|         2000|
|Cooper, Chris (I)|         October Sky|         1999|
|Cooper, Chris (I)|              Capote|         2005|
|Cooper, Chris (I)|The Bourne Supremacy|         2004|
+-----------------+--------------------+-------------+
only showing top 10 rows

root
 |-- actor_name: string (nullable = true)
 |-- movie_title: string (nullable = true)
 |-- produced_year: long (nullable = true)



In [0]:
from pyspark.sql.functions import col, floor

# Derive a 'decade' column: divide the year by 10, floor it, and scale back up
# (e.g. 1995 -> 1990)
movies_with_decade_df = movies_df.withColumn(
    "decade",
    floor(col("produced_year") / 10) * 10,
)

# Preview ten rows to confirm the column was added
movies_with_decade_df.show(10)

+-----------------+--------------------+-------------+------+
|       actor_name|         movie_title|produced_year|decade|
+-----------------+--------------------+-------------+------+
|McClure, Marc (I)|        Coach Carter|         2005|  2000|
|McClure, Marc (I)|         Superman II|         1980|  1980|
|McClure, Marc (I)|           Apollo 13|         1995|  1990|
|McClure, Marc (I)|            Superman|         1978|  1970|
|McClure, Marc (I)|  Back to the Future|         1985|  1980|
|McClure, Marc (I)|Back to the Futur...|         1990|  1990|
|Cooper, Chris (I)|  Me, Myself & Irene|         2000|  2000|
|Cooper, Chris (I)|         October Sky|         1999|  1990|
|Cooper, Chris (I)|              Capote|         2005|  2000|
|Cooper, Chris (I)|The Bourne Supremacy|         2004|  2000|
+-----------------+--------------------+-------------+------+
only showing top 10 rows



In [0]:
# Give the actor and title columns shorter names; the other columns are untouched
renamed_movies_df = (
    movies_with_decade_df
    .withColumnRenamed("actor_name", "actor")
    .withColumnRenamed("movie_title", "title")
)

# Preview ten rows to confirm the new column names
renamed_movies_df.show(10)


+-----------------+--------------------+-------------+------+
|            actor|               title|produced_year|decade|
+-----------------+--------------------+-------------+------+
|McClure, Marc (I)|        Coach Carter|         2005|  2000|
|McClure, Marc (I)|         Superman II|         1980|  1980|
|McClure, Marc (I)|           Apollo 13|         1995|  1990|
|McClure, Marc (I)|            Superman|         1978|  1970|
|McClure, Marc (I)|  Back to the Future|         1985|  1980|
|McClure, Marc (I)|Back to the Futur...|         1990|  1990|
|Cooper, Chris (I)|  Me, Myself & Irene|         2000|  2000|
|Cooper, Chris (I)|         October Sky|         1999|  1990|
|Cooper, Chris (I)|              Capote|         2005|  2000|
|Cooper, Chris (I)|The Bourne Supremacy|         2004|  2000|
+-----------------+--------------------+-------------+------+
only showing top 10 rows



In [0]:
# Tally how many movies fall into each decade
decade_counts_df = renamed_movies_df.groupBy("decade").count()

# Sort with the largest tally first and print only the top row,
# i.e. the decade with the most movies
decade_counts_df.orderBy("count", ascending=False).show(1)


+------+-----+
|decade|count|
+------+-----+
|  2000|18622|
+------+-----+
only showing top 1 row



In [0]:
# Tally the number of movies credited to each actor
actor_counts_df = renamed_movies_df.groupBy("actor").count()

# Print the default 20 rows, most-credited actors first.
# NOTE(review): there is no secondary sort key, so the relative order of
# actors with equal counts is not pinned by this query.
actor_counts_df.sort(col("count").desc()).show()


+-------------------+-----+
|              actor|count|
+-------------------+-----+
|   Tatasciore, Fred|   38|
|      Welker, Frank|   38|
| Jackson, Samuel L.|   32|
|      Harnell, Jess|   31|
|        Damon, Matt|   27|
|      Willis, Bruce|   27|
|  Cummings, Jim (I)|   26|
|         Hanks, Tom|   25|
|   Lynn, Sherry (I)|   25|
|    McGowan, Mickie|   25|
|    Bergen, Bob (I)|   25|
|      Proctor, Phil|   24|
|   Wilson, Owen (I)|   23|
|        Cruise, Tom|   23|
|         Pitt, Brad|   23|
|Freeman, Morgan (I)|   22|
|Williams, Robin (I)|   22|
|       Depp, Johnny|   22|
|     Morrison, Rana|   22|
|      Diaz, Cameron|   21|
+-------------------+-----+
only showing top 20 rows

