In [1]:
import csv
from pyspark.sql.functions import min, max, col, when, lit
from pyspark.sql.types import StructType,StructField, StringType
from pyspark.sql import Row

spark = SparkSession.builder.getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

# load the data
movies_file = sc.textFile("dataset/movies.csv")
genres_file = sc.textFile("dataset/genres.csv")
actors_file = sc.textFile("dataset/actors.csv")
tagNames_file = sc.textFile("dataset/tag_names.csv")
tags_file = sc.textFile("dataset/tags.csv")

# we separate the fields for each table in csv format
data_movies = movies_file.map(lambda row: next(csv.reader(row.splitlines(), skipinitialspace=True)))
data_genres = genres_file.map(lambda row: next(csv.reader(row.splitlines(), skipinitialspace=True)))
data_actors = actors_file.map(lambda row: next(csv.reader(row.splitlines(), skipinitialspace=True)))
data_tagNames = tagNames_file.map(lambda row: next(csv.reader(row.splitlines(), skipinitialspace=True)))
data_tags = tags_file.map(lambda row: next(csv.reader(row.splitlines(), skipinitialspace=True)))

# we create the dataframe for each data generated
table_movies = spark.createDataFrame(data_movies, ['mid', 'title','year','rating','num_ratings'])
table_genres = spark.createDataFrame(data_genres, ['mid', 'genre'])
table_actors = spark.createDataFrame(data_actors, ['mid', 'name', 'cast_position'])
table_tagNames = spark.createDataFrame(data_tagNames, ['tid', 'tag'])
table_tags = spark.createDataFrame(data_tags, ['mid', 'tid'])

# create an alias for each table
movies = table_movies.alias('movies')
genres = table_genres.alias('genres')
actors = table_actors.alias('actors')
tagNames = table_tagNames.alias('tagNames')
tags = table_tags.alias('tags')

# we mantain the tables in cache

genres.persist()
actors.persist()
tagNames.persist()
tags.persist()
movies.persist()

mid,title,year,rating,num_ratings
1,Toy story,1995,3.7,102338
2,Jumanji,1995,3.2,44587
3,Grumpy Old Men,1993,3.2,10489
4,Waiting to Exhale,1995,3.3,5666
5,Father of the Bri...,1995,3.0,13761
6,Heat,1995,3.9,42785
7,Sabrina,1954,3.8,12812
8,Tom and Huck,1995,2.7,2649
9,Sudden Death,1995,2.6,3626
10,GoldenEye,1995,3.4,28260


### 1. Print all movie titles starring ‘Daniel Craig’, sorted in an ascending alphabetical order.

In [121]:
movies_with_actors = movies.join(actors, movies.mid == actors.mid)

movies_title_actors = movies_with_actors.select('title','name')
movies_with_Craig = movies_title_actors.filter(movies_title_actors.name == 'Daniel Craig').sort(movies_title_actors.title.asc())

# display the result: we use count to display all the rows
#movies_with_Craig.show(movies_with_Craig.count(), truncate=False)
movies_with_Craig

title,name
A Kid in King Art...,Daniel Craig
Archangel,Daniel Craig
Casino Royale,Daniel Craig
Casino Royale,Daniel Craig
Elizabeth,Daniel Craig
Enduring Love,Daniel Craig
Infamous,Daniel Craig
Lara Croft: Tomb ...,Daniel Craig
Layer Cake,Daniel Craig
Munich,Daniel Craig


### 2. Print names of the cast of the movie ‘The Dark Knight’ in an ascending alphabetical order.

In [110]:
movies_with_actors = movies.join(actors, movies.mid == actors.mid)
movies_title_actors = movies_with_actors.select('title','name')

cast_TheDarkKnight = movies_title_actors.filter(movies_title_actors.title == 'The Dark Knight').sort(movies_title_actors.name.asc())
only_cast = cast_TheDarkKnight.select('name')
#only_cast.show(only_cast.count(), truncate=False)
only_cast

name
Aaron Eckhart
Adam Kalesperis
Aidan Feore
Andrew Bicknell
Andy Luther
Anthony Michael Hall
Ariyon Bakare
Beatrice Rosen
Bill Smille
Brandon Lambdin


### 3. Print the distinct genres in the database and their corresponding number of movies N where N is greater than 1000, sorted in the ascending order of N

In [28]:
reduced_genres = genres.rdd.map(lambda x: (x.genre,1)).reduceByKey(lambda a,b: a+b).filter(lambda x: (int(x[1]) > 1000))
#reduced_genres.takeOrdered(20, key = lambda x: -x[1])
reduced_genresdf = reduced_genres.toDF(['genre', 'countMovies']).sort('countMovies')
reduced_genresdf

genre,countMovies
Adventure,1003
Crime,1086
Action,1445
Romance,1644
Thriller,1664
Comedy,3566
Drama,5076


### 4. For each year, print the movie title, year, and rating, sorted in the ascending order of year and the descending order of movie rating

In [29]:
movies_sorted= movies.sort(movies.year.asc(), movies.rating.desc())
movies_sortedFiltered = movies_sorted.select('title', 'year', 'rating')
movies_sortedFiltered

title,year,rating
The Great Train R...,1903,0.0
The Birth of a Na...,1915,3.3
Intolerance: Love...,1916,3.8
The Immigrant,1917,0.0
Otets Sergiy,1917,0.0
A Dog's Life,1918,0.0
Broken Blossoms o...,1919,3.7
"Die Spinnen, 1. T...",1919,0.0
Male and Female,1919,0.0
Das Cabinet des D...,1920,4.1


### 5. Critiques say that some words used in tags to convey emotions are very recurrent. To convey positive and negative emotions, the words ‘good’ and ‘bad’, respectively, are used predominantly in tags. Print all movie titles whose audience opinion is split (i.e., has at least one audience who expresses positive emotion and at least one who expresses negative emotion).

In [76]:
movies_with_tags = movies.join(tags, ['mid']).join(tagNames, ['tid'])
movies_with_tags_onlyGood = movies_with_tags.filter(movies_with_tags.tag.contains( 'good'))
movies_with_tags_onlyBad = movies_with_tags.filter(movies_with_tags.tag.contains( 'bad'))

movies_with_tags_onlyGoodBad = (movies_with_tags_onlyGood.select('mid','title')).intersect(movies_with_tags_onlyBad.select('mid','title'))
result = movies_with_tags_onlyGoodBad.select('title')
result


title
Twilight
The Forgotten
Starship Troopers
Bridget Jones's D...
Howard the Duck
Hercules in New York
C.H.U.D.
The Wicker Man
Ocean's Eleven
Return of the Kil...


### 6. One would expect that the movie with the highest number of user ratings is either the highest rated movie or perhaps the lowest rated movie. Let’s find out if this is the case here:

#### 6.1 : Print all information (mid, title, year, num ratings, rating) for the movie(s) with the highest number of ratings

In [163]:
num_ratings = movies.filter(movies.num_ratings != '\\N').withColumn('num_ratings', col('num_ratings').cast('int'))
max_num_rating = num_ratings.select(max('num_ratings'))

movies_with_max_num_ratings = movies.filter(movies.num_ratings == max_num_rating.collect()[0][0])
movies_with_max_num_ratings


mid,title,year,rating,num_ratings
4201,Pirates of the Ca...,2007,3.8,1768593
53125,Pirates of the Ca...,2007,3.8,1768593


#### 6.2: Print all information (mid, title, year, num ratings, rating) for the movie(s) with the highest rating (include tuples that tie), sorted by the ascending order of movie id.

In [162]:
num_ratings = movies.filter(movies.rating != '\\N').withColumn('rating', col('rating').cast('double'))
max_rating = num_ratings.select(max('rating'))
movies_with_max_ratings = movies.filter(movies.rating == max_rating.collect()[0][0])
movies_with_max_ratings


mid,title,year,rating,num_ratings
4311,1732 Høtten,1998,5,5


#### 6.3: Is (Are) the movie(s) with the most number of user ratings among these highest rated movies? Print the output of the query that will check our conjecture (i.e., your query will print the movie(s) that has (have) the highest number of ratings as well as the highest rating)

In [164]:
result = movies_with_max_num_ratings.intersect(movies_with_max_ratings)
result

mid,title,year,rating,num_ratings


#### 6.4: Print all information (mid, title, year, num ratings, rating) for the movie(s) with the lowest rating (include tuples that tie), sorted by the ascending order of movie id.

In [172]:
num_ratings = movies.filter(movies.rating != '\\N').withColumn('rating', col('rating').cast('double'))
min_rating = num_ratings.select(min('rating'))
movies_with_min_ratings = movies.filter(movies.rating == min_rating.collect()[0][0])
movies_with_min_ratings




mid,title,year,rating,num_ratings
32,Twelve Monkeys,1995,0,0
33,Wings of Courage,1995,0,0
39,Clueless,1995,0,0
56,Kids of the Round...,1997,0,0
59,Le confessionnal,1995,0,0
61,Eye for an Eye,1996,0,0
63,Don't Be a Menace...,1996,0,0
69,Friday,1995,0,0
115,Le bonheur est da...,1995,0,0
126,The Neverending S...,1994,0,0


#### 6.5: Is (Are) the movie(s) with the most number of user ratings among these lowest rated movies? Print the output of the query that will check our conjecture (i.e., your query will print the movie(s) that has (have) the highest number of ratings as well as the lowest rating)

In [173]:
result = movies_with_max_num_ratings.intersect(movies_with_min_ratings)
result

mid,title,year,rating,num_ratings


#### 6.6: In conclusion, is our hypothesis or conjecture true for the MovieLens database?

No, the results of query 6.3 and 6.5 show that both the intersection of max_num_ratings and max_rating/min_rating are empty! 

#### 7. Print the movie title, year, and rating of the lowest and highest movies for each year in 2005 – 2011, inclusive, in the ascending order of year. In case of a tie, print the records in the ascending order of title.

In [2]:
movies_2005_2011 = movies.filter((movies.year >= 2005) & (movies.year <= 2011))
movies_list = []

for year in range(2005,2012):
    ratings = movies_2005_2011.filter((movies_2005_2011.year == year) & (movies_2005_2011.num_ratings != '\\N') & (movies_2005_2011.num_ratings > 0)).withColumn('rating', col('rating').cast('double'))
    min_rating = ratings.select(min('rating'))
    max_rating = ratings.select(max('rating'))
    result_min = ratings.filter(ratings.rating == min_rating.collect()[0][0]).sort(ratings.title)
    result_max = ratings.filter(ratings.rating == max_rating.collect()[0][0]).sort(ratings.title)
    final_result = result_min.union(result_max).select('title','year', 'rating')
    movies_list.extend(final_result.collect())

rdd = sc.parallelize(movies_list)
result = rdd.toDF()
result

title,year,rating
Alone in the Dark,2005,2.1
Alone in the Dark,2005,2.1
Alone in the Dark,2005,2.1
Son of the Mask,2005,2.1
No Direction Home...,2005,4.3
Basic Instinct 2,2006,2.5
Basic Instinct 2,2006,2.5
Bug,2006,2.5
Bug,2006,2.5
Doogal,2006,2.5


#### 8. Let us find out who are the ‘no flop’ actors. A ‘no flop’ actor can be defined as one who has played only in movies which have a rating greater than or equal to 4. We split this problem into the following steps:

##### 8.1 Create a view called high ratings which contains the distinct names of all actors who have played in movies with a rating greater than or equal to 4. Similarly, create a view called low ratings which contains the distinct names of all actors who have played in movies with a rating less than 4. Print the number of rows in each view.

In [52]:
movies_with_actors = movies.join(actors,['mid']).withColumn('rating', col('rating').cast('double'))

high_ratings = movies_with_actors.filter(movies_with_actors.rating >= 4.0).select('name').distinct()
low_ratings = movies_with_actors.filter(movies_with_actors.rating < 4.0).select('name').distinct()
high_ratings.count()
low_ratings.count()

87032

##### 8.2 Use the above views to print the number of ‘no flop’ actors in the database.

In [53]:
result = high_ratings.subtract(low_ratings)
result.count()

7015

###### 8.3 For each ‘no flop’ actor, print the name of the actor and the number of movies N that he/she played in, sorted in descending order of N. Finally, print the top10 only.

In [54]:
noFlopactors_with_mid = actors.join(result,['name'])

reduced_actors = noFlopactors_with_mid.rdd.map(lambda x: (x.name,1)).reduceByKey(lambda a,b: a+b)
reduced_actors_df = reduced_actors.toDF(['name', 'countMovies'])
final_result = reduced_actors_df.withColumn('countMovies', col('countMovies').cast('int')).sort(reduced_actors_df.countMovies.desc())
final_result

name,countMovies
Nikolai Grinko,8
Paul Frankeur,7
John Cazale,7
Tsutomu Yamazaki,6
Gunnel Lindblom,6
Allan Garcia,6
Kuniko Miyake,6
Anatoli Solonitsin,5
Timothy T. Mitchum,5
Megan Gallagher,5


###### 9. Let us find out who is the actor with the highest ‘longevity.’ Print the name of the actor/actress who has been playing in movies for the longest period of time (i.e., the time interval between their first movie and their last movie is the greatest).

In [2]:
movies_with_actors = movies.join(actors,['mid']).withColumn('year', col('year').cast('int'))
partial_result = movies_with_actors.select('name','year').distinct()

result_max = partial_result.groupBy('name').max('year').withColumnRenamed('max(year)', 'recentYear')
result_min = partial_result.groupBy('name').min('year').withColumnRenamed('min(year)', 'firstYear')
result_join = result_min.join(result_max,['name'])
result = result_join.select('name',result_join.recentYear-result_join.firstYear).withColumnRenamed('(recentYear - firstYear)', 'difference')
max_difference = result.select(max('difference'))
final_result = result.filter(result.difference == max_difference.collect()[0][0])
final_result


name,difference
Morgan Jones,102


###### 10. Let us find the close friends of Annette Nicole. Print the names of all actors who have starred in (at least) all movies in which Annette Nicole has starred in. Note that it is OK if these actors have starred in more movies than Annette Nicole has played in.

###### 10.1 First, create a view called co_actors, which returns the distinct names of actors who played in at least one movie with Annette Nicole. Print the number of rows in this view.

In [3]:
movies_with_actors = movies.join(actors,['mid'])
movies_with_Annette = movies_with_actors.filter(movies_with_actors.name == 'Annette Nicole')
movies_with_Annette_mid = movies_with_Annette.select('mid')
movies_with_cast_and_Annette = actors.join(movies_with_Annette_mid,['mid'])
co_actors = movies_with_cast_and_Annette.select('name').distinct()
co_actors.count()

179

###### 10.2 Second, create a view called all_combinations which returns all possible combinations of co_actors and the movie ids in which Annette Nicole played. Print the number of rows in this view. Note how that this view contains fake (co_actor, mid) combinations!

In [4]:
partial_all_combinations = movies_with_Annette.withColumnRenamed('name', 'nameAnnette').crossJoin(co_actors)
all_combinations = partial_all_combinations.select('mid','name')
all_combinations.count()

537

###### 10.3 Third, create a view called non_existent from the view all_combinations by removing all legitimate (co_actor,mid) pairs (i.e., pairs that exist in the actors table). Print the number of rows in this view

In [5]:
actors_real = actors.select('mid', 'name')

non_existent = all_combinations.subtract(actors_real)
#non_existent.select('name').distinct().count()
non_existent.count()

239

###### 10.4 Finally, from the view co_actors, eliminate the distinct actors that appear in the view non_extistent. Print the names of all co_actors except Annette Nicole.

In [6]:
final = co_actors.subtract(non_existent.select('name'))
final.filter(final.name != 'Annette Nicole')

name
Kristen Connolly
Christian Perry


##### 11. Let us find out who is the most social actor. A social actor is the one with the highest number of distinct co-actors. We will break this into two sub-tasks:

###### 11.1 For the actor Tom Cruise, print his name and the number of distinct co-actors.

In [2]:
movies_with_Tom = actors.filter(actors.name == 'Tom Cruise').select('mid','name') 
movies_with_coActors = movies_with_Tom.join(actors.withColumnRenamed('name', 'nameCoActors'),['mid'])
final_coActors_Tom = movies_with_coActors.filter(movies_with_coActors.nameCoActors != 'Tom Cruise').select('name', 'nameCoActors').distinct().groupBy('name').count().withColumnRenamed('count', 'countCoActors')
final_coActors_Tom


name,countCoActors
Tom Cruise,1238


###### 11.2 For each actor, compute the number of distinct co-actors. For the highest such number, print the name of the actor and the number of distinct co-actors. In case of a tie, print the records sorted in alphabetical order by name.

In [8]:
most_social_actor = []
maxCoActors = 0
actors_list = actors.select('name').distinct().collect()

for actor in actors_list:
    movies_with_specific_actor = actors.filter(actors.name == actor[0]).select('mid','name') 
    movies_with_coActors = movies_with_specific_actor.join(actors.withColumnRenamed('name', 'nameCoActors'),['mid'])
    final_coActors_specific_actor = movies_with_coActors.filter(movies_with_coActors.nameCoActors != actor[0]).select('name', 'nameCoActors').distinct().groupBy('name').count().withColumnRenamed('count', 'countCoActors')
    most_social_actor.append((actor[0], final_coActors_specific_actor.collect()[0][1]))
    
rdd = sc.parallelize(most_social_actor)
result = rdd.toDF()
max_coActors = result.select(max('_2'))

final_result = result.filter(result._2 == max_coActors.collect()[0][0]) 
final_result

_1,_2
Oliver Platt,831


In [95]:
reduced_actors = actors.rdd.map(lambda x: (x.mid,1)).reduceByKey(lambda a,b: a+b)
result_countActors = reduced_actors.toDF().withColumnRenamed('_1','mid').withColumnRenamed('_2','countActors')
movies_with_countActors = result_countActors.join(actors,['mid']).select('mid','countActors','name')

movies_with_countActors_reduced = movies_with_countActors.rdd.map(lambda x: (x.name,x.countActors-1)).reduceByKey(lambda a,b: a+b)
result_countActors = movies_with_countActors_reduced.toDF().withColumnRenamed('_1','name').withColumnRenamed('_2','countCoActors')


couples_coActors = actors.join(actors.withColumnRenamed('name','coActor'),['mid']).groupBy('name','coActor').count().withColumnRenamed('count', 'countCoActors')
result_partial_couples = couples_coActors.filter(couples_coActors.name != couples_coActors.coActor).select('name', 'coActor',couples_coActors.countCoActors-1).withColumnRenamed('(countCoActors - 1)', 'duplicates').withColumn('duplicates', col('duplicates').cast('int'))
result_couples = result_partial_couples.groupBy('name').sum('duplicates').withColumnRenamed('sum(duplicates)', 'duplicates')

r= result_countActors.join(result_couples,['name'])
final_r = r.select('name', r.countCoActors - r.duplicates).withColumnRenamed('(countCoActors - duplicates)','countCoActors')
max_coActors = final_r.select(max('countCoActors'))
#final_r.filter(final_r.name == 'Tom Cruise') --> 1238 it's correct!
final_result = final_r.filter(final_r.countCoActors == max_coActors.collect()[0][0]) 
final_result

name,countCoActors
Samuel L. Jackson,1824


##### 12. We will now write some queries for a Content-Based Movie Recommendation System such as NetFlix. However, in this project we shall deploy a simple algorithm that may or may not produce optimal recommendations. Content-based recommendations focus on the properties of items, in our case movies. The similarity of two movies is determined by measuring the similarity of their properties. For a movie item, we shall consider the following five properties: actors, tags, genres, year, and rating. 

##### Given two movies X and Y, the similarity of Y to X, sim(X,Y), can be computed as: (fraction of common actors + fraction of common tags + fraction of common genres + age gap + rating gap) /5 where fraction is the number of common elements between X and Y divided by the number of elements of X, age gap is the normalized difference between the production years of X and Y, and rating gap is the normalized difference between the ratings of X and Y. Intuitively, the smaller the gaps are, the better (since movies of the same decade and rating are more likely to be similar). Moreover, note that we divide by five because each property is given an equal weight of 1. 

##### Given a user who is known to like the movie ‘Mr. & Mrs. Smith’, write a query that prints the movie title, rating, and similarity percentage (i.e., similarity * 100) for the top 10 movies that are most similar to the ‘Mr. & Mrs. Smith’ movie, ordered by the similarity percentage.

In [5]:
minYear = int(movies.select(min(movies.year)).collect()[0][0])
maxYear = int(movies.select(max(movies.year)).collect()[0][0])
maxRating = float(movies.filter(movies.rating != '\\N').select(max(movies.rating)).collect()[0][0])

def get_mid(movie):
    return movies.filter(movies.title == movie).select('mid').collect()[0][0]

def get_num_actor(mid_film):
    return len(actors.filter(actors.mid == mid_film).select('name').collect())

def get_num_tags(mid_film):
    return len(tags.filter(tags.mid == mid_film).select('tid').distinct().collect())

def get_num_genres(mid_film):
    return len(genres.filter(genres.mid == mid_film).select('genre').distinct().collect())

def intersection_actors(mid_movie1, mid_movie2):
    actorsMovie1 = actors.filter(actors.mid == mid_movie1).select('name')
    actorsMovie2 = actors.filter(actors.mid == mid_movie2).select('name')
    return len(actorsMovie1.intersect(actorsMovie2).collect())

def intersection_tags(mid_movie1, mid_movie2):
    tagsMovie1 = tags.filter(tags.mid == mid_movie1).select('tid')
    tagsMovie2 = tags.filter(tags.mid == mid_movie2).select('tid')
    return len(tagsMovie1.intersect(tagsMovie2).collect())

def intersection_genres(mid_movie1, mid_movie2):
    genreMovie1 = genres.filter(genres.mid == mid_movie1).select('genre')
    genreMovie2 = genres.filter(genres.mid == mid_movie2).select('genre')
    return len(genreMovie1.intersect(genreMovie2).collect())
    
def common_actors(movie1, movie2):
    return intersection_actors(movie1, movie2)/get_num_actor(movie1)

def common_tags(movie1, movie2):
    return intersection_tags(movie1, movie2)/get_num_tags(movie1)

def common_genres(movie1, movie2):
    return intersection_genres(movie1, movie2)/get_num_genres(movie1)

def age_gap(mid_movie1, mid_movie2):
    year_movie1 = int(movies.filter(movies.mid == mid_movie1).select('year').collect()[0][0])
    year_movie2 = int(movies.filter(movies.mid == mid_movie2).select('year').collect()[0][0])
    difference = abs(year_movie1 - year_movie2) #check this site https://stats.stackexchange.com/questions/79706/normalizing-difference-between-two-real-values-to-0-1-interval 
    return 1 - (difference/maxYear)

def rating_gap(mid_movie1, mid_movie2):
    rating_movie1 = float(movies.filter(movies.mid == mid_movie1).select('rating').collect()[0][0])
    rating_movie2 = float(movies.filter(movies.mid == mid_movie2).select('rating').collect()[0][0])
    if rating_movie1 == '\\N' or rating_movie2 == '\\N':
        difference = 0
    else:
        difference = abs(rating_movie1 - rating_movie2) #check this site https://stats.stackexchange.com/questions/79706/normalizing-difference-between-two-real-values-to-0-1-interval 
    return 1 - (difference/maxRating)

def similarity_old(mid_movie1, mid_movie2):
    return (common_actors(mid_movie1, mid_movie2)+common_tags(mid_movie1, mid_movie2)+common_genres(mid_movie1, mid_movie2) + age_gap(mid_movie1, mid_movie2) + rating_gap(mid_movie1, mid_movie2))/5


#print('Similarity(Toy story, Toy story) =', float("{:.2f}".format(similarity(get_mid('Toy story'),get_mid('Toy story')) *100)) , '%')
#print('Similarity(Toy story, Mr. & Mrs. Smith) = ',float("{:.2f}".format( similarity(get_mid('Toy story'),get_mid( 'Mr. & Mrs. Smith')) *100)) , '%')


# start calculating top 10 similar movies 
# cross join of movies
cross_join_movies = movies.withColumnRenamed('mid','mid1').crossJoin(movies.withColumnRenamed('mid','mid2')).select('mid1','mid2')
#remove duplicates
cross_join_without_duplicates = cross_join_movies.filter(cross_join_movies.mid1 != cross_join_movies.mid2)
#map
#map_movies = cross_join_without_duplicates.limit(10).rdd.map(lambda x: ((x.mid1, x.mid2),similarity(x.mid1,x.mid2)))
#map_movies.take(5)

#mid_movieMrSmith = get_mid('Mr. & Mrs. Smith')
#movies_without_MrSmith = movies.filter(movies.title != 'Mr. & Mrs. Smith').collect()

#for mid in movies_without_MrSmith[0][0]:
    

In [100]:
import time
minYear = int(movies.select(min(movies.year)).collect()[0][0])
maxYear = int(movies.select(max(movies.year)).collect()[0][0])
maxRating = float(movies.filter(movies.rating != '\\N').select(max(movies.rating)).collect()[0][0])

def get_mid(movie):
    return movies.filter(movies.title == movie).select('mid').collect()[0][0]


def similarity(mid_movie1, mid_movie2):
    
    get_num_actor = len(actors.filter(actors.mid == mid_movie1).select('name').collect())
    get_num_tags = len(tags.filter(tags.mid == mid_movie1).select('tid').distinct().collect())
    get_num_genres = len(genres.filter(genres.mid == mid_movie1).select('genre').distinct().collect())
    
    actorsMovie1 = actors.filter(actors.mid == mid_movie1).select('name')
    actorsMovie2 = actors.filter(actors.mid == mid_movie2).select('name')
    intersect_actor =  len(actorsMovie1.intersect(actorsMovie2).collect())/get_num_actor
    
    tagsMovie1 = tags.filter(tags.mid == mid_movie1).select('tid')
    tagsMovie2 = tags.filter(tags.mid == mid_movie2).select('tid')
    intersect_tag = len(tagsMovie1.intersect(tagsMovie2).collect())/get_num_tags
    
    genreMovie1 = genres.filter(genres.mid == mid_movie1).select('genre')
    genreMovie2 = genres.filter(genres.mid == mid_movie2).select('genre')
    intersect_genre =  len(genreMovie1.intersect(genreMovie2).collect())/get_num_genres
    
    year_movie1 = int(movies.filter(movies.mid == mid_movie1).select('year').collect()[0][0])
    year_movie2 = int(movies.filter(movies.mid == mid_movie2).select('year').collect()[0][0])
    difference = abs(year_movie1 - year_movie2)
    age_gap =  1 - (difference/maxYear)
    
    
    rating_movie1 = float(movies.filter(movies.mid == mid_movie1).select('rating').collect()[0][0])
    rating_movie2 = float(movies.filter(movies.mid == mid_movie2).select('rating').collect()[0][0])
    difference_gap = abs(rating_movie1 - rating_movie2) 
    rating_gap = 1 - (difference_gap/maxRating)
    similarity = (intersect_actor+intersect_tag+intersect_genre + age_gap + rating_gap)/5
    
    myFloatRdd = sc.parallelize([similarity])
    row = Row("similarity") 
    result = myFloatRdd.map(row).toDF()

    
    return result.select('similarity')


mid_movieMrSmith = get_mid('Mr. & Mrs. Smith')
movies_without_MrSmith = movies.filter(movies.mid != mid_movieMrSmith).filter(movies.rating != '\\N').select('mid','rating')
similarity_for_movies = []

'''
for mid in movies_without_MrSmith.collect():
    similarity_for_movies.append((mid_movieMrSmith,mid[0], similarity(mid_movieMrSmith,mid[0])))

print(similarity_for_movies)

'''

#map_film = movies_without_MrSmith.rdd.flatMap(lambda x: ((mid_movieMrSmith,x.mid), similarity(mid_movieMrSmith, x.collect()[0][0])))
map_film = movies_without_MrSmith.rdd.flatMap(lambda x: ((mid_movieMrSmith,x.mid), lambda x: (len(actors.filter(actors.mid == x.mid).select('name').collect()[0][0]))))
#movies_without_MrSmith.withColumn('rating', similarity(mid_movieMrSmith,movies_without_MrSmith.collect()[0][0])[0])
map_film.take(5)


Traceback (most recent call last):
  File "/home/luca/Scrivania/BigData-project/spark-3.0.2-bin-hadoop2.7/python/pyspark/serializers.py", line 468, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
  File "/home/luca/Scrivania/BigData-project/spark-3.0.2-bin-hadoop2.7/python/pyspark/cloudpickle.py", line 1097, in dumps
    cp.dump(obj)
  File "/home/luca/Scrivania/BigData-project/spark-3.0.2-bin-hadoop2.7/python/pyspark/cloudpickle.py", line 357, in dump
    return Pickler.dump(self, obj)
  File "/home/luca/anaconda3/lib/python3.7/pickle.py", line 437, in dump
    self.save(obj)
  File "/home/luca/anaconda3/lib/python3.7/pickle.py", line 504, in save
    f(self, obj) # Call unbound method with explicit self
  File "/home/luca/anaconda3/lib/python3.7/pickle.py", line 789, in save_tuple
    save(element)
  File "/home/luca/anaconda3/lib/python3.7/pickle.py", line 504, in save
    f(self, obj) # Call unbound method with explicit self
  File "/home/luca/Scrivania/BigData-project/

PicklingError: Could not serialize object: TypeError: can't pickle _thread.RLock objects