In [5]:
   !pip install pyspark

Defaulting to user installation because normal site-packages is not writeable
Looking in links: /usr/share/pip-wheels


In [None]:
import datetime

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType

# Start a Spark session for this notebook, or reuse the one that is already active
spark = (
    SparkSession.builder
    .appName("PixarFilmAnalysis")
    .getOrCreate()
)

# Schema of the Pixar films dataset: one nullable field per (column name, Spark type) pair
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("Title", StringType()),
        ("Release_Year", IntegerType()),
        ("Director", StringType()),
        ("IMDB_Score", FloatType()),
        ("Runtime_Minutes", IntegerType()),
        ("Box_Office_Millions", FloatType()),
        ("Release_Date", DateType()),
    )
])

# Hand-entered sample rows; each tuple lists its values in the same order as the schema fields
data = [
    ("Toy Story", 1995, "John Lasseter", 8.3, 81, 373.6, datetime.date(1995, 11, 22)),
    ("A Bug's Life", 1998, "John Lasseter", 7.2, 95, 363.3, datetime.date(1998, 11, 25)),
    ("Toy Story 2", 1999, "John Lasseter", 7.9, 92, 497.4, datetime.date(1999, 11, 24)),
    ("Monsters, Inc.", 2001, "Pete Docter", 8.1, 92, 577.4, datetime.date(2001, 11, 2)),
    ("Finding Nemo", 2003, "Andrew Stanton", 8.1, 100, 940.3, datetime.date(2003, 5, 30)),
    ("The Incredibles", 2004, "Brad Bird", 8.0, 115, 631.4, datetime.date(2004, 11, 5)),
    ("Cars", 2006, "John Lasseter", 7.1, 117, 462.2, datetime.date(2006, 6, 9)),
    ("Ratatouille", 2007, "Brad Bird", 8.0, 111, 623.7, datetime.date(2007, 6, 29)),
    ("WALL-E", 2008, "Andrew Stanton", 8.4, 98, 521.3, datetime.date(2008, 6, 27)),
    ("Up", 2009, "Pete Docter", 8.2, 96, 735.1, datetime.date(2009, 5, 29)),
    ("Toy Story 3", 2010, "Lee Unkrich", 8.3, 103, 1067.0, datetime.date(2010, 6, 18)),
    ("Cars 2", 2011, "John Lasseter", 6.1, 106, 559.9, datetime.date(2011, 6, 24)),
    ("Brave", 2012, "Mark Andrews", 7.1, 93, 538.8, datetime.date(2012, 6, 22)),
    ("Monsters University", 2013, "Dan Scanlon", 7.3, 104, 743.6, datetime.date(2013, 6, 21)),
    ("Inside Out", 2015, "Pete Docter", 8.2, 95, 858.8, datetime.date(2015, 6, 19)),
    ("The Good Dinosaur", 2015, "Peter Sohn", 6.7, 93, 332.2, datetime.date(2015, 11, 25)),
    ("Finding Dory", 2016, "Andrew Stanton", 7.3, 97, 1028.6, datetime.date(2016, 6, 17)),
    ("Cars 3", 2017, "Brian Fee", 6.7, 102, 383.9, datetime.date(2017, 6, 16)),
    ("Coco", 2017, "Lee Unkrich", 8.4, 105, 807.1, datetime.date(2017, 11, 22)),
    ("Incredibles 2", 2018, "Brad Bird", 7.6, 118, 1242.8, datetime.date(2018, 6, 15)),
    ("Toy Story 4", 2019, "Josh Cooley", 7.7, 100, 1073.4, datetime.date(2019, 6, 21)),
    ("Onward", 2020, "Dan Scanlon", 7.4, 102, 141.9, datetime.date(2020, 3, 6)),
    ("Soul", 2020, "Pete Docter", 8.1, 100, 121.0, datetime.date(2020, 12, 25)),
    ("Luca", 2021, "Enrico Casarosa", 7.5, 95, 49.8, datetime.date(2021, 6, 18)),
    ("Turning Red", 2022, "Domee Shi", 7.0, 100, 20.1, datetime.date(2022, 3, 11)),
    ("Lightyear", 2022, "Angus MacLane", 6.1, 105, 226.4, datetime.date(2022, 6, 17)),
    ("Elemental", 2023, "Peter Sohn", 7.0, 109, 496.0, datetime.date(2023, 6, 16))
]

# Materialise the rows as a DataFrame typed by the explicit schema above
df = spark.createDataFrame(data=data, schema=schema)

# Show the first few rows
print("Sample of Pixar films:")
df.show(5)

# See the structure of the data
print("\nDataset schema:")
df.printSchema()

# Count the films once; the value is reused below to size the per-year listing
total_films = df.count()
print("\nTotal number of films:", total_films)

# Find average IMDB rating.
# Spark SQL functions are reached through the `F` namespace instead of being imported
# by bare name, so Spark's `round` no longer shadows Python's built-in `round` for the
# rest of the kernel session.
avg_rating = (
    df.select(F.round(F.avg("IMDB_Score"), 2).alias("Average IMDB Rating"))
    .first()[0]  # a global aggregate yields exactly one row
)
print(f"\nAverage IMDB Rating: {avg_rating}")

# Group by IMDB Score (rounded to the nearest whole number) and count
print("\nFilms grouped by IMDB Score (rounded):")
(
    df.groupBy(F.round("IMDB_Score").alias("IMDB_Score_Rounded"))
    .count()
    .orderBy("IMDB_Score_Rounded")
    .show()
)

# Films by director, most prolific first.
# The secondary sort on Director makes the order of tied counts deterministic across runs.
print("\nNumber of films by director:")
(
    df.groupBy("Director")
    .count()
    .orderBy(F.desc("count"), F.asc("Director"))
    .show()
)

# Box office performance over time.
# There can never be more distinct years than films, so showing `total_films` rows
# always prints every year without relying on a hard-coded row limit.
print("\nAverage box office by year:")
(
    df.groupBy("Release_Year")
    .agg(F.round(F.avg("Box_Office_Millions"), 2).alias("Avg_Box_Office_Millions"))
    .orderBy("Release_Year")
    .show(total_films)
)

Sample of Pixar films:


                                                                                

+--------------+------------+--------------+----------+---------------+-------------------+------------+
|         Title|Release_Year|      Director|IMDB_Score|Runtime_Minutes|Box_Office_Millions|Release_Date|
+--------------+------------+--------------+----------+---------------+-------------------+------------+
|     Toy Story|        1995| John Lasseter|       8.3|             81|              373.6|  1995-11-22|
|  A Bug's Life|        1998| John Lasseter|       7.2|             95|              363.3|  1998-11-25|
|   Toy Story 2|        1999| John Lasseter|       7.9|             92|              497.4|  1999-11-24|
|Monsters, Inc.|        2001|   Pete Docter|       8.1|             92|              577.4|  2001-11-02|
|  Finding Nemo|        2003|Andrew Stanton|       8.1|            100|              940.3|  2003-05-30|
+--------------+------------+--------------+----------+---------------+-------------------+------------+
only showing top 5 rows


Dataset schema:
root
 |-- Tit

