In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, LongType

In [2]:
# Obtain the SparkSession for this notebook, registered under the app name "PopularMovies".
spark = (
    SparkSession.builder
    .appName("PopularMovies")
    .getOrCreate()
)

Define the schema to apply when reading u.data

In [3]:
# Explicit column layout for u.data: four nullable numeric columns, in file order.
schema = (
    StructType()
    .add("userID", IntegerType(), True)
    .add("movieID", IntegerType(), True)
    .add("rating", IntegerType(), True)
    .add("timestamp", LongType(), True)
)

Load the movie ratings data as a DataFrame

In [4]:
# Read the tab-separated ratings file, applying the explicit schema defined above
# rather than letting Spark infer column names and types.
moviesDF = (
    spark.read
    .schema(schema)
    .option("sep", "\t")
    .csv("ml-100k/u.data")
)

Some SQL-style magic: count the ratings for each movie and sort all movies by that count (their popularity), most-rated first, in one line!

In [5]:
# Count the rows for each movieID, then rank from the largest count to the smallest.
topMovieIDs = (
    moviesDF
    .groupBy("movieID")
    .count()
    .orderBy(func.col("count").desc())
)

Show the 10 most-rated movies

In [6]:
# Print the first 10 rows of the ranked result.
topMovieIDs.show(n=10)

+-------+-----+
|movieID|count|
+-------+-----+
|     50|  583|
|    258|  509|
|    100|  508|
|    181|  507|
|    294|  485|
|    286|  481|
|    288|  478|
|      1|  452|
|    300|  431|
|    121|  429|
+-------+-----+
only showing top 10 rows



Stop the session

In [7]:
# Shut down the SparkSession now that the analysis is finished.
spark.stop()