In [58]:
import csv
spark = SparkSession.builder.getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

# load the data
movies_file = sc.textFile("dataset/movies.csv")
genres_file = sc.textFile("dataset/genres.csv")
actors_file = sc.textFile("dataset/actors.csv")
tagNames_file = sc.textFile("dataset/tag_names.csv")
tags_file = sc.textFile("dataset/tags.csv")

# we separate the fields for each table in csv format
data_movies = movies_file.map(lambda row: next(csv.reader(row.splitlines(), skipinitialspace=True)))
data_genres = genres_file.map(lambda row: next(csv.reader(row.splitlines(), skipinitialspace=True)))
data_actors = actors_file.map(lambda row: next(csv.reader(row.splitlines(), skipinitialspace=True)))
data_tagNames = tagNames_file.map(lambda row: next(csv.reader(row.splitlines(), skipinitialspace=True)))
data_tags = tags_file.map(lambda row: next(csv.reader(row.splitlines(), skipinitialspace=True)))

# we create the dataframe for each data generated
table_movies = spark.createDataFrame(data_movies, ['mid', 'title','year','rating','num_ratings'])
table_genres = spark.createDataFrame(data_genres, ['mid', 'genre'])
table_actors = spark.createDataFrame(data_actors, ['mid', 'name', 'cast_position'])
table_tagNames = spark.createDataFrame(data_tagNames, ['tid', 'tag'])
table_tags = spark.createDataFrame(data_tags, ['mid', 'tid'])

# create an alias for each table
movies = table_movies.alias('movies')
genres = table_genres.alias('genres')
actors = table_actors.alias('actors')
tagNames = table_tagNames.alias('tagNames')
tags = table_tags.alias('tags')

# we mantain the tables in cache

genres.persist()
actors.persist()
tagNames.persist()
tags.persist()
movies.persist()

mid,title,year,rating,num_ratings
1,Toy story,1995,3.7,102338
2,Jumanji,1995,3.2,44587
3,Grumpy Old Men,1993,3.2,10489
4,Waiting to Exhale,1995,3.3,5666
5,Father of the Bri...,1995,3.0,13761
6,Heat,1995,3.9,42785
7,Sabrina,1954,3.8,12812
8,Tom and Huck,1995,2.7,2649
9,Sudden Death,1995,2.6,3626
10,GoldenEye,1995,3.4,28260


### 1. Print all movie titles starring ‘Daniel Craig’, sorted in an ascending alphabetical order.

In [63]:
movies_with_actors = movies.join(actors, movies.mid == actors.mid)

movies_title_actors = movies_with_actors.select('title','name')
movies_with_Craig = movies_title_actors.filter(movies_title_actors.name == 'Daniel Craig').sort(movies_title_actors.title.asc())

# display the result: we use count to display all the rows
#movies_with_Craig.show(movies_with_Craig.count(), truncate=False)
movies_with_Craig

title,name
A Kid in King Art...,Daniel Craig
Archangel,Daniel Craig
Casino Royale,Daniel Craig
Casino Royale,Daniel Craig
Elizabeth,Daniel Craig
Enduring Love,Daniel Craig
Infamous,Daniel Craig
Lara Croft: Tomb ...,Daniel Craig
Layer Cake,Daniel Craig
Munich,Daniel Craig


### 2. Print names of the cast of the movie ‘The Dark Knight’ in an ascending alphabetical order.

In [64]:
movies_with_actors = movies.join(actors, movies.mid == actors.mid)
movies_title_actors = movies_with_actors.select('title','name')

cast_TheDarkKnight = movies_title_actors.filter(movies_title_actors.title == 'The Dark Knight').sort(movies_title_actors.name.asc())
only_cast = cast_TheDarkKnight.select('name')
#only_cast.show(only_cast.count(), truncate=False)
only_cast

name
Aaron Eckhart
Adam Kalesperis
Aidan Feore
Andrew Bicknell
Andy Luther
Anthony Michael Hall
Ariyon Bakare
Beatrice Rosen
Bill Smille
Brandon Lambdin


### 3. Print the distinct genres in the database and their corresponding number of movies N where N is greater than 1000, sorted in the ascending order of N

In [72]:
reduced_genres = genres.rdd.map(lambda x: (x.mid,1)).reduceByKey(lambda a,b: a+b).filter(lambda x: (int(x[0]) > 1000))
reduced_genres.takeOrdered(20, key = lambda x: -x[1])

[('51709', 8),
 ('2987', 7),
 ('46948', 7),
 ('56152', 7),
 ('2701', 6),
 ('3535', 6),
 ('4306', 6),
 ('7235', 6),
 ('7835', 6),
 ('8481', 6),
 ('26326', 6),
 ('26504', 6),
 ('27773', 6),
 ('31367', 6),
 ('31804', 6),
 ('32031', 6),
 ('36397', 6),
 ('51939', 6),
 ('54278', 6),
 ('62999', 6)]