In [91]:
import pandas as pd
import json
import math
import operator

## Business Understanding

**Objective**:

## Data Understanding

In [95]:
file_path = '../data/dataset.json'

def load_json(file_path):
    with open(file_path) as file:
        return json.load(file)

In [96]:
data = load_json(file_path)

In [98]:
def get_common_movies(criticA, criticB):
    return [movie for movie in data[criticA] if movie in data[criticB]]

In [116]:
list(data.keys())

['Marlon Brando',
 'Stephen King',
 'Steven Spielberg',
 'George Lucas',
 'Al Pacino',
 'Robert DeNiro',
 'Robert Duvall',
 'Jack Nicholson',
 'Morgan Freeman',
 'Harrison Ford',
 'Tom Hanks',
 'Francis Ford Coppola',
 'Martin Scorsese',
 'Diane Keaton',
 'Richard Dreyfuss',
 'Joe Pesci']

In [100]:
data["Marlon Brando"]

{'The Godfather': 5.0,
 'The Godfather Part II': 4.29,
 'Apocalypse Now': 5.0,
 'Jaws': 1.72}

In [101]:
get_common_movies("Marlon Brando", "Robert DeNiro")

['The Godfather', 'The Godfather Part II']

In [117]:
def get_review(criticA, criticB):
    common_movies = get_common_movies(criticA, criticB)
    return [(data[criticA][movie], data[criticB][movie]) for movie in common_movies]

In [118]:
get_review("Marlon Brando", "Robert DeNiro")

[(5.0, 3.07), (4.29, 4.29)]

In [119]:
def euclidean_distance(points):
    squered_diffs = [(point[0] - point[1]) ** 2 for point in points]
    summed_squered_diffs = sum(squered_diffs)
    distance = math.sqrt(summed_squered_diffs)
    return distance

In [120]:
def similarity(reviews):
    return 1 / (1 + euclidean_distance(reviews))

In [124]:
round(similarity([(5.0, 3.07), (4.29, 4.29)]), 2)

0.34

In [127]:
def get_critic_similarity(criticA, criticB):
    review = get_review(criticA, criticB)
    return similarity(review)

In [129]:
round(get_critic_similarity("Marlon Brando", "Robert DeNiro"), 2)

0.34

In [151]:
def recommend_movies(critic, num_suggestions):
    similarity_scores = [(get_critic_similarity(critic, other), other) for other in data if other != critic]

    recommendations = {}
    for similarity, other in similarity_scores:
        reviewed = data[other]
        for movie in reviewed:
            if movie not in data[critic]:
                weight = similarity * reviewed[movie]
                if movie in recommendations:
                    sim, weights = recommendations[movie]
                    recommendations[movie] = (sim + similarity, weights + [weight])
                else:
                    recommendations[movie] = (similarity, [weight])

    for recommendation in recommendations:
        similarity, movie = recommendations[recommendation]
        recommendations[recommendation] = round(sum(movie) / similarity, 2)

    sorted_recommendations = dict(sorted(recommendations.items(), key = operator.itemgetter(1), reverse = True)[:num_suggestions])

    return sorted_recommendations

In [152]:
recommend_movies("Marlon Brando", 4)

{'Raiders of the Lost Ark': 5.0,
 'Raging Bull': 4.94,
 'Goodfellas': 4.94,
 'The Shawshank Redemption': 4.93}