# **Code for Google Colab**
The code below mounts Google Drive, so that the files stored there can be accessed and used.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os
path="/content/drive/MyDrive/ColabNotebooks/" # base folder of the csv and prolog files read below; change it to wherever the files are located in the drive

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%%capture
# install swi-prolog
!sudo apt-get install software-properties-common
!sudo apt-add-repository ppa:swi-prolog
!sudo apt-get update
!sudo apt-get install swi-prolog
# install pyswip
!pip install pyswip


# **Part 1: Creating a base of knowledge from the csv files** 


In [3]:
import pandas as pd
from pyswip import Prolog

In [4]:
import pandas as pd 
 
# Read the movie metadata; every null value is replaced with the "UNK" placeholder
data = pd.read_csv(path + "movies_metadata.csv").fillna("UNK")
# Preview the first 5 lines of the loaded data
data.head()


Unnamed: 0.1,Unnamed: 0,budget,genres,homepage,id,plot_keywords,language,original_title,overview,popularity,production_companies,production_countries,release_date,gross,duration,spoken_languages,status,tagline,movie_title,vote_average,num_voted_users,title_year,country,director_name,actor_1_name,actor_2_name,actor_3_name
0,0,237000000,Action|Adventure|Fantasy|Science Fiction,http://www.avatarmovie.com/,19995,culture clash|future|space war|space colony|so...,English,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{'name': 'Ingenious Film Partners', 'id': 289...","[{'iso_3166_1': 'US', 'name': 'United States o...",2009-12-10,2787965087,162,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,2009,United States of America,James Cameron,Zoe Saldana,Sigourney Weaver,Stephen Lang
1,1,300000000,Adventure|Fantasy|Action,http://disney.go.com/disneypictures/pirates/,285,ocean|drug abuse|exotic island|east india trad...,English,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{'name': 'Walt Disney Pictures', 'id': 2}, {'...","[{'iso_3166_1': 'US', 'name': 'United States o...",2007-05-19,961000000,169,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,2007,United States of America,Gore Verbinski,Orlando Bloom,Keira Knightley,Stellan Skarsgård
2,2,245000000,Action|Adventure|Crime,http://www.sonypictures.com/movies/spectre/,206647,spy|based on novel|secret agent|sequel|mi6|bri...,Français,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{'name': 'Columbia Pictures', 'id': 5}, {'nam...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",2015-10-26,880674609,148,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,A Plan No One Escapes,Spectre,6.3,4466,2015,United Kingdom,Sam Mendes,Christoph Waltz,Léa Seydoux,Ralph Fiennes
3,3,250000000,Action|Crime|Drama|Thriller,http://www.thedarkknightrises.com/,49026,dc comics|crime fighter|terrorist|secret ident...,English,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{'name': 'Legendary Pictures', 'id': 923}, {'...","[{'iso_3166_1': 'US', 'name': 'United States o...",2012-07-16,1084939099,165,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,2012,United States of America,Christopher Nolan,Michael Caine,Gary Oldman,Anne Hathaway
4,4,260000000,Action|Adventure|Science Fiction,http://movies.disney.com/john-carter,49529,based on novel|mars|medallion|space travel|pri...,English,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{'name': 'Walt Disney Pictures', 'id': 2}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2012-03-07,284139100,132,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,2012,United States of America,Andrew Stanton,Lynn Collins,Samantha Morton,Willem Dafoe


In [5]:
# strips the characters that cause trouble when the text is placed inside a quoted Prolog atom
def clean_text(text):
  """Return `text` with every non-breaking space (U+00A0) and every apostrophe (') removed."""
  for unwanted in (u'\xa0', u"'"):
    text = text.replace(unwanted, u'')
  return text

In [6]:
import ast
#create World
prolog = Prolog()


def _fact(predicate, title, value):
  """Build the text of one Prolog fact: predicate('title','value').

  Both arguments are written as single-quoted atoms, so neither may contain an
  apostrophe. `value` goes through str() so numbers can be passed as they are.
  """
  return predicate + "('" + title + "','" + str(value) + "')"


def _parse_records(cell):
  """Parse a cell holding the repr of a list of dicts; a missing ('UNK') cell yields no records."""
  if cell == 'UNK':
    return []
  return ast.literal_eval(cell)


# for each row of the dataframe, we create some facts that will be added to prolog later on
literals = []  # the facts will be saved in this list
movie_score = {}  # not used in this cell; Part 3 rebinds the name before reading it
for row in data.itertuples(index=True, name='Pandas'):
  movie_title = clean_text(row.movie_title)  # cleaning the movie title

  # multi-valued columns separated by "|": one fact per value, e.g. genre(movie_title, movie_genre)
  for genre in row.genres.split("|"):
    if genre != 'UNK':
      literals.append(_fact('genre', movie_title, genre))

  for plot_key in clean_text(row.plot_keywords).split("|"):
    if plot_key != 'UNK':
      literals.append(_fact('plot_keyword', movie_title, plot_key))

  # single-valued columns stored as they are, under a predicate named after the column
  for column in ('budget', 'language', 'popularity', 'duration', 'status', 'vote_average', 'country'):
    value = getattr(row, column)
    if value != 'UNK':
      literals.append(_fact(column, movie_title, value))

  # columns holding a list of dicts; the missing-value check uses the same
  # 'UNK' placeholder as every other column (it used to be misspelled 'UKN',
  # which sent the placeholder to ast.literal_eval and raised)
  for company in _parse_records(row.production_companies):
    literals.append(_fact('production_companies', movie_title, clean_text(company['name'])))

  for country_record in _parse_records(row.production_countries):
    literals.append(_fact('production_country', movie_title, country_record['iso_3166_1']))

  for language_record in _parse_records(row.spoken_languages):
    literals.append(_fact('spoken_language', movie_title, language_record['iso_639_1']))

  # dates look like 2009-12-10; the 'UNK' placeholder contains no "-" and splits into a single part
  release_date = row.release_date.split("-")
  if len(release_date) != 1:
    literals.append(_fact('year', movie_title, release_date[0]))
    literals.append(_fact('month', movie_title, release_date[1]))
    literals.append(_fact('day', movie_title, release_date[2]))

  director_name = clean_text(row.director_name)
  if director_name != 'UNK':
    literals.append(_fact('director_name', movie_title, director_name))

  # the three actor columns all feed the same actor_name predicate
  for actor_column in ('actor_1_name', 'actor_2_name', 'actor_3_name'):
    actor_name = clean_text(getattr(row, actor_column))
    if actor_name != 'UNK':
      literals.append(_fact('actor_name', movie_title, actor_name))

# sorting puts the facts of the same predicate next to each other before they are asserted
literals.sort()
for literal in literals:
  prolog.assertz(literal)

# After creating our facts, we also "import" rules from another prolog file
prolog.consult(path + "db.pl")

In [7]:
# quick check that a rule works: print the movies that have the same director as The Avengers
q = prolog.query("common_director('The Avengers', Y)")
s = set()  # titles already printed, so each one is shown once
for soln in q:
  m = soln['Y']
  if m in s:
    continue
  s.add(m)
  print(m)

Avengers: Age of Ultron
Serenity


# **Part 2: Recommendation System based on movie attributes**
Based on the knowledge base created above, we define rules named "find_sim" that find related movies. The degree of relatedness is graded: for example, given a movie X, find_sim_3 returns movies that are more closely related to X than the ones returned by find_sim_2.


In [8]:
# collect the distinct movies that find_sim_5 relates to The Matrix
q = prolog.query("find_sim_5('The Matrix', Y)")
s = set()
for soln in q:
  m = soln['Y']
  s.add(m)  # adding a title twice leaves the set unchanged, so no membership test is needed
print(s)

{'Æon Flux', 'Babe', 'Whats Love Got to Do with It', 'Jupiter Ascending', 'Dune', 'Terminator 3: Rise of the Machines', 'The Matrix Revolutions', 'The Matrix Reloaded', 'Terminator Salvation', 'Speed Racer'}


In [10]:
def simple_recommender(movie):
  '''
  Find the movies related to `movie` with the find_sim_1 ... find_sim_6 prolog rules.

  The rules are queried from find_sim_6 down to find_sim_1. A title is stored
  the first time any rule returns it, together with the number of that rule,
  so every title keeps the highest level at which it was found.

  Args:
    movie: title of the reference movie. It is passed through clean_text, the
      same cleaning applied to the titles stored in the facts, so a title
      containing an apostrophe cannot break the quoted prolog atom.

  Returns:
    A dictionary whose keys are the related titles and whose values are
    integers between 1 and 6. Titles are inserted from level 6 to level 1, so
    dict.keys() lists them in descending level order.
  '''
  movie = clean_text(movie)
  answers = {}

  for level in range(6, 0, -1):
    query = prolog.query("find_sim_" + str(level) + "('" + movie + "',M)")
    try:
      for soln in query:
        # setdefault leaves the level of an already recommended title untouched
        answers.setdefault(soln['M'], level)
    finally:
      # close the query even when iterating over it fails
      query.close()

  return answers   # return the dictionary with all the related movies


In [11]:
# iterating the returned dict yields its titles in insertion order (highest level first)
similar_movies = list(simple_recommender('The Avengers'))
similar_movies[:5]   # shows the first 5 related movies

['Avengers: Age of Ultron',
 'Captain America: Civil War',
 'Captain America: The First Avenger',
 'Captain America: The Winter Soldier',
 'Iron Man 2']

# **Part 3: Recommendation System based on the preferences (ratings) of the user - Training and Evaluation of predictions**
In this part, we decide which movies should be recommended based on the ratings given by the user. Given a movie and the rating the user gave it, we first use simple_recommender to find similar movies. Then, we multiply the relatedness score of each similar movie by a weight that represents the user's rating, and we sum these products per movie to get its final_score. If the final_score is higher than 0, we choose to recommend the movie.\
Finally, we test the above system to get some metrics.




In [12]:
from tqdm.notebook import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
import random

# a weight in [-1,1] for every rating score 0..5
rating_weights = dict(zip(range(6), (-1, -0.5, 0, 0, 0.5, 1)))
# key i (a find_sim level of part 2, minus one) is mapped to the weight i + 1
score_weights = dict(enumerate(range(1, 7)))

def train_recommender(ratings, rating_weights, score_weights, number_of_movies = 10, seed = None):
    """
    Score the movies related to a random sample of the rated movies.

    For every sampled row, simple_recommender finds the related movies, and each
    of them receives rating_weight * score_weight, accumulated over all rows.

    Args:
        ratings: DataFrame with a 'movie_title' and a 'rating' column.
        rating_weights: mapping from int(rating) to the weight of that rating.
        score_weights: mapping from (level - 1) to the weight of that level,
            where level is the value returned by simple_recommender.
        number_of_movies: how many rows of `ratings` to sample. Values larger
            than len(ratings) are clamped to it. A negative value (-1 by
            convention) uses every row, in the original order.
        seed: optional seed for the sampling, to make a run repeatable. With
            the default None the global `random` state is used.

    Returns:
        A dictionary with movie titles as keys and their final_score as values.
    """
    # picking <number_of_movies> random movies from ratings
    if number_of_movies >= 0:
        sample_size = min(number_of_movies, len(ratings))
        sampler = random if seed is None else random.Random(seed)
        indexes = sampler.sample(range(len(ratings)), sample_size)
        ratings = ratings.iloc[indexes]

    movie_score = {}  # dictionary, with movie titles as keys, and their final_score as values
    # total is passed explicitly because the itertuples iterator has no length
    for row in tqdm(ratings.itertuples(index=True, name='Pandas'), total=len(ratings)):
        movie = clean_text(row.movie_title)
        rating = row.rating

        # for each of the chosen movies, find similar ones (title -> level)
        similar_movies = simple_recommender(movie)

        # for each similar movie, add this row's contribution to its final_score
        for similar_movie, level in similar_movies.items():
            contribution = rating_weights[int(rating)] * score_weights[level - 1]
            movie_score[similar_movie] = movie_score.get(similar_movie, 0) + contribution
    return movie_score


# a movie is expected to be recommended when the user rated it higher than 3
def predict_example(ratings, movie_score):
    """
    Build the expected and the predicted labels for every row of `ratings`.

    real[i] is 1 when the rating of row i is higher than 3, else 0.
    pred[i] is 1 when the movie of row i has a final_score higher than 0 in
    `movie_score`, else 0; a movie missing from `movie_score` is predicted 0.

    Returns the two lists as the tuple (real, pred).
    """
    real, pred = [], []
    for row in ratings.itertuples(index=True, name='Pandas'):
        movie = clean_text(row.movie_title)

        # a movie that was never scored falls back to 0, which is not > 0
        pred.append(int(movie_score.get(movie, 0) > 0))
        real.append(int(row.rating > 3))

    return real, pred

# computes the evaluation metrics of our recommender
def get_metrics(real, pred):
    """Return a dict with the precision, recall and f1 of `pred` measured against `real`."""
    return {
        "precision": precision_score(real, pred),
        "recall": recall_score(real, pred),
        "f1": f1_score(real, pred),
    }

The code above tests the recommender based on a random training set. Thus, in order to have a more general view, we run the test 10 times for each specific number of training movies, and report the average of the resulting metrics.

In [None]:
# reading the training and testing data from the same folder as the movie metadata
train_ratings, test_ratings = (
    pd.read_csv(path + csv_name)
    for csv_name in ("train_ratings.csv", "test_ratings.csv")
)

In [None]:
# for number of training_movies = 10
metrics = []  # one dict of metrics per run
for i in range(10):  # each run draws its own random training sample
    movie_score = train_recommender(train_ratings, rating_weights, score_weights, 10)
    real, pred = predict_example(test_ratings, movie_score)
    metrics.append(get_metrics(real, pred))

# average of every metric over the runs
for metric in metrics[0]:
    print(f"{metric}: {np.mean([run[metric] for run in metrics])}")

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

precision: 0.523069199329463
recall: 0.9597222222222221
f1: 0.6764093771296408


In [None]:
# for number of training_movies = 30
metrics = []  # one dict of metrics per run
for i in range(10):  # each run draws its own random training sample
    movie_score = train_recommender(train_ratings, rating_weights, score_weights, 30)
    real, pred = predict_example(test_ratings, movie_score)
    metrics.append(get_metrics(real, pred))

# average of every metric over the runs
for metric in metrics[0]:
    print(f"{metric}: {np.mean([run[metric] for run in metrics])}")

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

precision: 0.5150931677712605
recall: 0.9958333333333333
f1: 0.6789742079048633


In [None]:
# for number of training_movies = 50
metrics = []  # one dict of metrics per run
for i in range(10):  # each run draws its own random training sample
    movie_score = train_recommender(train_ratings, rating_weights, score_weights, 50)
    real, pred = predict_example(test_ratings, movie_score)
    metrics.append(get_metrics(real, pred))

# average of every metric over the runs
for metric in metrics[0]:
    print(f"{metric}: {np.mean([run[metric] for run in metrics])}")

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

precision: 0.5166381907184239
recall: 0.9944444444444445
f1: 0.6799707976222504


In [None]:
# for number of training_movies = 100
metrics = []  # one dict of metrics per run
for i in range(10):  # each run draws its own random training sample
    movie_score = train_recommender(train_ratings, rating_weights, score_weights, 100)
    real, pred = predict_example(test_ratings, movie_score)
    metrics.append(get_metrics(real, pred))

# average of every metric over the runs
for metric in metrics[0]:
    print(f"{metric}: {np.mean([run[metric] for run in metrics])}")

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

precision: 0.5142857142857142
recall: 1.0
f1: 0.6792452830188679
