<a href="https://colab.research.google.com/github/Harry-Turner/Python/blob/main/ML_MovieRecommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas

movies_dataframe = pandas.read_csv("movies.csv")

# Gets rid of original 'index' colum and replaced with movieId
movies_dataframe.set_index("movieId", inplace = True)

ratings_dataframe = pandas.read_csv("ratings.csv")
ratings_dataframe

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [2]:
# Get count of number of times a movie was rated
total_counts = ratings_dataframe["movieId"].value_counts()
total_counts

356       329
318       317
296       307
593       279
2571      278
         ... 
86279       1
86922       1
5962        1
87660       1
163981      1
Name: movieId, Length: 9724, dtype: int64

In [3]:
movies_dataframe["ratingsCount"] = total_counts
movies_dataframe

Unnamed: 0_level_0,title,genres,ratingsCount
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0
2,Jumanji (1995),Adventure|Children|Fantasy,110.0
3,Grumpier Old Men (1995),Comedy|Romance,52.0
4,Waiting to Exhale (1995),Comedy|Drama|Romance,7.0
5,Father of the Bride Part II (1995),Comedy,49.0
...,...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,1.0
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,1.0
193585,Flint (2017),Drama,1.0
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,1.0


In [4]:
movies_dataframe.sort_values("ratingsCount", ascending = False).head(10)

Unnamed: 0_level_0,title,genres,ratingsCount
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
356,Forrest Gump (1994),Comedy|Drama|Romance|War,329.0
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307.0
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279.0
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278.0
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,251.0
480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,238.0
110,Braveheart (1995),Action|Drama|War,237.0
589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,224.0
527,Schindler's List (1993),Drama|War,220.0


In [5]:
average_ratings = ratings_dataframe.groupby("movieId").mean()["rating"]
average_ratings

movieId
1         3.920930
2         3.431818
3         3.259615
4         2.357143
5         3.071429
            ...   
193581    4.000000
193583    3.500000
193585    3.500000
193587    3.500000
193609    4.000000
Name: rating, Length: 9724, dtype: float64

In [6]:
movies_dataframe["averageRating"] = average_ratings
movies_dataframe

Unnamed: 0_level_0,title,genres,ratingsCount,averageRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0,3.920930
2,Jumanji (1995),Adventure|Children|Fantasy,110.0,3.431818
3,Grumpier Old Men (1995),Comedy|Romance,52.0,3.259615
4,Waiting to Exhale (1995),Comedy|Drama|Romance,7.0,2.357143
5,Father of the Bride Part II (1995),Comedy,49.0,3.071429
...,...,...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,1.0,4.000000
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,1.0,3.500000
193585,Flint (2017),Drama,1.0,3.500000
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,1.0,3.500000


In [7]:
movies_dataframe.sort_values(["ratingsCount", "averageRating"], ascending = False).head(10)

Unnamed: 0_level_0,title,genres,ratingsCount,averageRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
356,Forrest Gump (1994),Comedy|Drama|Romance|War,329.0,4.164134
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307.0,4.197068
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279.0,4.16129
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278.0,4.192446
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,251.0,4.231076
480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,238.0,3.75
110,Braveheart (1995),Action|Drama|War,237.0,4.031646
589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,224.0,3.970982
527,Schindler's List (1993),Drama|War,220.0,4.225


In [8]:
MINIMUM_RATINGS_COUNT = 100
minimum_ratings_subset = movies_dataframe.query(f"ratingsCount >= {MINIMUM_RATINGS_COUNT}").sort_values("ratingsCount")
minimum_ratings_subset

Unnamed: 0_level_0,title,genres,ratingsCount,averageRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
912,Casablanca (1942),Drama|Romance,100.0,4.240000
4022,Cast Away (2000),Drama,100.0,3.700000
1517,Austin Powers: International Man of Mystery (1...,Action|Adventure|Comedy,100.0,3.535000
44191,V for Vendetta (2006),Action|Sci-Fi|Thriller|IMAX,100.0,3.885000
454,"Firm, The (1993)",Drama|Thriller,101.0,3.534653
...,...,...,...,...
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278.0,4.192446
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279.0,4.161290
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307.0,4.197068
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022


In [9]:
import numpy

USER_1_POS_X = 3
USER_1_POS_Y = 7.5

pos_user_1 = numpy.array([USER_1_POS_X, USER_1_POS_Y])

USER_2_POS_X = 7
USER_2_POS_Y = 8

pos_user_2 = numpy.array([USER_2_POS_X, USER_2_POS_Y])

def find_distance_between_users(pos_user_1, pos_user_2):

  (delta_x, delta_y) = pos_user_1 - pos_user_2

  return (delta_x**2 + delta_y**2) ** (1/2)

find_distance_between_users(pos_user_1, pos_user_2)

4.031128874149275

In [10]:
# Find all ratings for a given user
def find_user_ratings(userId):

  user_ratings = ratings_dataframe.query(f"userId == {userId}")

  return user_ratings[["movieId", "rating"]].set_index("movieId")

ID_OF_USER_1 = 1
ID_OF_USER_2 = 610

user_1_ratings = find_user_ratings(ID_OF_USER_1)

user_2_ratings = find_user_ratings(ID_OF_USER_2)

In [11]:
# Create table to show ratings given by 2 users for movies they both rated
ratings_comparison = user_1_ratings.join(user_2_ratings, lsuffix="_user1", rsuffix="_user2").dropna()
ratings_comparison

Unnamed: 0_level_0,rating_user1,rating_user2
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.0,5.0
6,4.0,5.0
47,5.0,5.0
50,5.0,4.0
70,3.0,4.0
...,...,...
3671,5.0,5.0
3703,5.0,4.5
3740,4.0,4.5
3744,4.0,3.0


In [12]:
# Take values from above table and store in each respective variable
user1_compared = ratings_comparison["rating_user1"]
user2_compared = ratings_comparison["rating_user2"]

# Find distance between users
numpy.linalg.norm(user1_compared - user2_compared)

8.031189202104505

In [13]:
# This carries out above 3 code blocks in 1 function
def find_distance_between_real_users(userId_1, userId_2):
  ratings_user1 = find_user_ratings(userId_1)

  ratings_user2 = find_user_ratings(userId_2)

  ratings_comparison = ratings_user1.join(ratings_user2, lsuffix="_user1", rsuffix="_user2").dropna()

  user1_compared = ratings_comparison["rating_user1"]
  user2_compared = ratings_comparison["rating_user2"]

  distance_between_users = numpy.linalg.norm(user1_compared - user2_compared)

  return [userId_1, userId_2, distance_between_users]

# Test if working
find_distance_between_real_users(1,3)

[1, 3, 8.200609733428363]

In [14]:
def find_relative_distances(userId):

  users = ratings_dataframe["userId"].unique() # Get list of unique users

  users = users[users != userId] # Remove userId of itself

  # Call function above and loop for every user id compared to itself
  distances = [find_distance_between_real_users(userId, every_id) for every_id in users]

  #Return table showing distance of given userId against every other
  return pandas.DataFrame(distances, columns = ["GivenUserId", "userId", "distance"])

example_distances = find_relative_distances(7)
example_distances

Unnamed: 0,GivenUserId,userId,distance
0,7,1,7.106335
1,7,2,4.031129
2,7,3,0.000000
3,7,4,5.634714
4,7,5,3.354102
...,...,...,...
604,7,606,12.579746
605,7,607,5.244044
606,7,608,14.924812
607,7,609,3.122499


In [15]:
# Function to find closest users to given id
def find_top_similar_users(userId):

  distances_to_user = find_relative_distances(userId)

  distances_to_user = distances_to_user.sort_values("distance")

  distances_to_user = distances_to_user.set_index("userId")

  return distances_to_user

SAMPLE_USER_ID = 9

find_top_similar_users(SAMPLE_USER_ID).head(20)

Unnamed: 0_level_0,GivenUserId,distance
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
470,9,0.0
329,9,0.0
150,9,0.0
151,9,0.0
444,9,0.0
258,9,0.0
154,9,0.0
155,9,0.0
442,9,0.0
441,9,0.0


In [16]:
# Function to show movies that have been unwatched by given user that would be reommended
def make_movie_recommendation(userId):

  user_ratings = find_user_ratings(userId) # Find all ratings by given user

  top_similar_users = find_top_similar_users(userId) # Find users who have rated similarly

  MOST_SIMILAR = 0 # Define distance it should be between user and client

  most_similar_user = top_similar_users.iloc[MOST_SIMILAR] # Find most similar user

  most_similar_user_ratings = find_user_ratings(most_similar_user.name) # Get ratings of movies given by another user

  # Get movies that the user has not watched but similar client has
  unwatched_movies = most_similar_user_ratings.drop(user_ratings.index, errors = "ignore")

  # Sort movie list by rating highest first
  unwatched_movies = unwatched_movies.sort_values("rating", ascending = False)

  # Join back to table with names
  recommended_movies = unwatched_movies.join(movies_dataframe)

  return recommended_movies

make_movie_recommendation(SAMPLE_USER_ID)

Unnamed: 0_level_0,rating,title,genres,ratingsCount,averageRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
292,5.0,Outbreak (1995),Action|Drama|Sci-Fi|Thriller,101.0,3.425743
589,5.0,Terminator 2: Judgment Day (1991),Action|Sci-Fi,224.0,3.970982
587,5.0,Ghost (1990),Comedy|Drama|Fantasy|Romance|Thriller,115.0,3.434783
527,5.0,Schindler's List (1993),Drama|War,220.0,4.225000
356,5.0,Forrest Gump (1994),Comedy|Drama|Romance|War,329.0,4.164134
...,...,...,...,...,...
420,3.0,Beverly Hills Cop III (1994),Action|Comedy|Crime|Thriller,59.0,2.720339
434,3.0,Cliffhanger (1993),Action|Adventure|Thriller,101.0,3.034653
454,3.0,"Firm, The (1993)",Drama|Thriller,101.0,3.534653
253,3.0,Interview with the Vampire: The Vampire Chroni...,Drama|Horror,109.0,3.458716


In [17]:
NUMBER_OF_NEIGHBORS = 5
def find_k_nearest_neighbors(userId, k=NUMBER_OF_NEIGHBORS):

  distances_to_user = find_relative_distances(userId)

  distances_to_user = distances_to_user.sort_values("distance")

  distances_to_user = distances_to_user.set_index("userId")

  return distances_to_user.head(k)

find_k_nearest_neighbors(SAMPLE_USER_ID)

Unnamed: 0_level_0,GivenUserId,distance
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
470,9,0.0
329,9,0.0
150,9,0.0
151,9,0.0
444,9,0.0


In [18]:
def make_recommendation_with_knn(userId):

 top_k_neighbors = find_k_nearest_neighbors(userId)

 ratings_by_index = ratings_dataframe.set_index("userId")

 top_similar_ratings = ratings_by_index.loc[top_k_neighbors.index]

 top_similar_rating_avg = top_similar_ratings.groupby("movieId").mean()[["rating"]]

 recommended_movie = top_similar_rating_avg.sort_values("rating", ascending=False)

 return recommended_movie.join(movies_dataframe)

make_recommendation_with_knn(SAMPLE_USER_ID)

Unnamed: 0_level_0,rating,title,genres,ratingsCount,averageRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
136850,5.0,Villain (1971),Crime|Drama|Thriller,1.0,5.000000
802,5.0,Phenomenon (1996),Drama|Romance,46.0,3.336957
111,5.0,Taxi Driver (1976),Crime|Drama|Thriller,104.0,4.105769
235,5.0,Ed Wood (1994),Comedy|Drama,70.0,3.678571
292,5.0,Outbreak (1995),Action|Drama|Sci-Fi|Thriller,101.0,3.425743
...,...,...,...,...,...
1732,0.5,"Big Lebowski, The (1998)",Comedy|Crime,106.0,3.924528
1080,0.5,Monty Python's Life of Brian (1979),Comedy,89.0,3.926966
4262,0.5,Scarface (1983),Action|Crime|Drama,67.0,3.932836
1206,0.5,"Clockwork Orange, A (1971)",Crime|Drama|Sci-Fi|Thriller,120.0,3.995833


In [20]:
from sqlalchemy.sql.selectable import TextualSelect
NUMBER_OF_MOVIES = 14

MINIMUM_NUMBER = 1

ROWS_INDEX = 0

MAXIMUM_NUMEBR = movies_dataframe.shape[ROWS_INDEX]

import random

test_user_watched_movies = []

for i in range(0, NUMBER_OF_MOVIES):

  random_movie_index = random.randint(MINIMUM_NUMBER, MAXIMUM_NUMEBR)

  test_user_watched_movies.append(random_movie_index)

test_user_watched_movies



[8517,
 7763,
 7568,
 5981,
 3297,
 3600,
 5609,
 7960,
 2437,
 3727,
 8099,
 9181,
 308,
 6970]

In [21]:
MINIMUM_RATING = 0
MAXIMUM_RATING = 5

test_user_ratings = []

for index in range(0, NUMBER_OF_MOVIES):

  random_rating = random.randint(MINIMUM_RATING, MAXIMUM_RATING + 1)

  test_user_ratings.append(random_rating)

test_user_ratings

[3, 3, 3, 1, 6, 3, 4, 5, 0, 5, 5, 2, 0, 5]

In [25]:
user_data = [list(index) for index in zip(test_user_watched_movies, test_user_ratings)]
user_data

def create_new_user(user_data):

  new_user_id = ratings_dataframe["userId"].max() + 1

  new_user_dataframe = pandas.DataFrame(user_data, columns = ["movieId", "rating"])

  new_user_dataframe["userId"] = new_user_id

  return pandas.concat([ratings_dataframe, new_user_dataframe])

create_new_user(user_data)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703.0
1,1,3,4.0,964981247.0
2,1,6,4.0,964982224.0
3,1,47,5.0,964983815.0
4,1,50,5.0,964982931.0
...,...,...,...,...
9,611,3727,5.0,
10,611,8099,5.0,
11,611,9181,2.0,
12,611,308,0.0,


In [26]:
NEW_USER_ID = 611

make_recommendation_with_knn(NEW_USER_ID)

Unnamed: 0_level_0,rating,title,genres,ratingsCount,averageRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
940,5.0,"Adventures of Robin Hood, The (1938)",Action|Adventure|Romance,8.0,4.000000
1196,5.0,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,211.0,4.215640
1198,5.0,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,200.0,4.207500
3441,5.0,Red Dawn (1984),Action|Drama|War,14.0,3.428571
2899,5.0,Gulliver's Travels (1939),Adventure|Animation|Children,3.0,3.000000
...,...,...,...,...,...
1219,2.0,Psycho (1960),Crime|Horror,83.0,4.036145
2724,2.0,Runaway Bride (1999),Comedy|Romance,42.0,2.833333
531,1.0,"Secret Garden, The (1993)",Children|Drama,24.0,3.250000
3176,1.0,"Talented Mr. Ripley, The (1999)",Drama|Mystery|Thriller,49.0,3.336735
