Set up Spark:

In [1]:
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=f0574f70cb082391c1a78e7716c051f17cfb723cecaab0b491f1d574aaaede28
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0
The following additional packages will be installed:
  libxtst6 openjdk-8-jre-headless
Suggested packages:
  openjdk-8-demo openjdk-8-source libnss-mdns fonts-dejavu-extra fonts-nanum fonts-ipafont-gothic
  fonts-ipafont-mincho fonts-wqy-microhei fonts-wqy-zenhei fonts-indic

In [2]:
import os
# Point PySpark at the Java 8 JDK installed by the setup cell.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Run Spark locally inside this runtime.
# NOTE(review): the app name "FriendsByAge" looks like a leftover from another
# tutorial; it only affects the label shown in the Spark UI/logs.
conf = SparkConf().setMaster("local").setAppName("FriendsByAge")
# getOrCreate() reuses an already-running context, so re-running this cell does
# not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession.builder.getOrCreate()

Read data:

In [3]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, LongType, StringType

In [4]:
# Explicit column layout for the ratings file: three integer columns followed
# by a long timestamp (the only column declared nullable).
schema = (
    StructType()
    .add('userID', IntegerType(), nullable=False)
    .add('movieID', IntegerType(), nullable=False)
    .add('rating', IntegerType(), nullable=False)
    .add('timestamp', LongType(), nullable=True)
)

Read the movie names into a dictionary (key: movie ID, value: movie name). The dictionary is built once on the driver and then broadcast to all the worker nodes: it is small, so shipping a single read-only copy to each worker is cheap and saves the workers from having to load the file themselves every time they need a name.

In [5]:
import codecs

def load_movie_names(path='/content/drive/MyDrive/spark_tutorials/spark_datasets/ml-100k/u.item'):
  """Build a ``{movie ID: movie name}`` dictionary from a pipe-delimited item file.

  Each non-blank line is split on ``'|'``; the first field is parsed as the
  integer movie ID and the second field is stored as the movie name.

  Args:
    path: Location of the item file. Defaults to the ml-100k ``u.item`` file on
      the mounted Drive, so existing ``load_movie_names()`` calls are unchanged.

  Returns:
    dict mapping ``int`` movie ID to ``str`` movie name.

  Raises:
    OSError: if the file cannot be opened.
    ValueError: if the first field of a non-blank line is not an integer.
    IndexError: if a non-blank line contains no ``'|'`` separator.
  """
  names_dict = {}
  # Built-in open() ends a line only at \n, \r or \r\n. codecs.open() iterates
  # with str.splitlines(), which also breaks on characters such as U+0085 (what
  # ISO-8859-1 decodes byte 0x85 to) and would cut such a record in two.
  with open(path, 'r', encoding='ISO-8859-1', errors='ignore') as f:
    for line in f:
      if not line.strip():
        continue  # blank line (e.g. at end of file): nothing to parse
      fields = line.split('|')
      names_dict[int(fields[0])] = fields[1]

  return names_dict

# Load the ID -> name dictionary once and wrap it in a Spark broadcast
# variable; readers access the dictionary through `movie_names_dict.value`.
movie_names_dict = sc.broadcast(value=load_movie_names())

In [6]:
# Read the tab-separated u.data file into a DataFrame using the explicit schema
# defined earlier in the notebook.
movies_df = spark.read.csv(
    '/content/drive/MyDrive/spark_tutorials/spark_datasets/ml-100k/u.data',
    schema=schema,
    sep='\t',
)

In [8]:
# Count the rows for each movieID and list the movies with the most rows first.
top_movies = (
    movies_df
    .groupBy('movieID')
    .agg(F.count('*').alias('count'))
    .orderBy(F.col('count').desc())
)
top_movies.show(n=10)

+-------+-----+
|movieID|count|
+-------+-----+
|     50|  583|
|    258|  509|
|    100|  508|
|    181|  507|
|    294|  485|
|    286|  481|
|    288|  478|
|      1|  452|
|    300|  431|
|    121|  429|
+-------+-----+
only showing top 10 rows



Look up the movie names by ID:

In [19]:
def lookup_name(id):
  """Return the movie name stored for ``id`` in the broadcast dictionary.

  Args:
    id: Movie ID to look up in ``movie_names_dict.value``.

  Returns:
    The movie name, or ``None`` when the ID is not in the dictionary.
  """
  # .get() instead of [] so an ID that is missing from the dictionary produces
  # a null in the result column rather than a KeyError that fails the whole job.
  return movie_names_dict.value.get(id)

# Expose the Python lookup as a Spark UDF. StringType is already F.udf's
# default return type; it is spelled out here for the reader.
lookup_name_udf = F.udf(lookup_name, StringType())

# Per movie: number of ratings and the mean rating rounded to two decimals,
# ordered by number of ratings, highest first.
top_movies = (
    movies_df
    .groupBy('movieID')
    .agg(
        F.count('*').alias('ratings_count'),
        F.round(F.avg('rating'), 2).alias('average_rating'),
    )
    .orderBy(F.col('ratings_count').desc())
)
# Attach the movie name for each movieID via the UDF.
top_movies_named_df = top_movies.withColumn('movie_names', lookup_name_udf(F.col('movieID')))
top_movies_named_df.show(n=10)

+-------+-------------+--------------+--------------------+
|movieID|ratings_count|average_rating|         movie_names|
+-------+-------------+--------------+--------------------+
|     50|          583|          4.36|    Star Wars (1977)|
|    258|          509|           3.8|      Contact (1997)|
|    100|          508|          4.16|        Fargo (1996)|
|    181|          507|          4.01|Return of the Jed...|
|    294|          485|          3.16|    Liar Liar (1997)|
|    286|          481|          3.66|English Patient, ...|
|    288|          478|          3.44|       Scream (1996)|
|      1|          452|          3.88|    Toy Story (1995)|
|    300|          431|          3.63|Air Force One (1997)|
|    121|          429|          3.44|Independence Day ...|
+-------+-------------+--------------+--------------------+
only showing top 10 rows

