<a href="https://colab.research.google.com/github/Harry-Turner/Python/blob/main/ML_MovieRecommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas

# Load the movie catalogue and replace the default positional index with
# the movieId column.
movies_dataframe = pandas.read_csv("movies.csv").set_index("movieId")

# Load the ratings table and display it.
ratings_dataframe = pandas.read_csv("ratings.csv")
ratings_dataframe

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [3]:
# Get count of number of times a movie was rated
# Number of ratings each movie received: a Series indexed by movieId,
# ordered from the most-rated movie to the least-rated one.
total_counts = ratings_dataframe.loc[:, "movieId"].value_counts()
total_counts

356       329
318       317
296       307
593       279
2571      278
         ... 
86279       1
86922       1
5962        1
87660       1
163981      1
Name: movieId, Length: 9724, dtype: int64

In [4]:
# Attach the per-movie rating counts as a new column. Values are aligned on
# the movieId index, so a movie that never appears in the ratings gets NaN.
movies_dataframe = movies_dataframe.assign(ratingsCount=total_counts)
movies_dataframe

Unnamed: 0_level_0,title,genres,ratingsCount
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0
2,Jumanji (1995),Adventure|Children|Fantasy,110.0
3,Grumpier Old Men (1995),Comedy|Romance,52.0
4,Waiting to Exhale (1995),Comedy|Drama|Romance,7.0
5,Father of the Bride Part II (1995),Comedy,49.0
...,...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,1.0
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,1.0
193585,Flint (2017),Drama,1.0
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,1.0


In [7]:
# The ten movies with the largest number of ratings.
(
    movies_dataframe
    .sort_values(by="ratingsCount", ascending=False)
    .head(10)
)

Unnamed: 0_level_0,title,genres,ratingsCount
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
356,Forrest Gump (1994),Comedy|Drama|Romance|War,329.0
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307.0
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279.0
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278.0
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,251.0
480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,238.0
110,Braveheart (1995),Action|Drama|War,237.0
589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,224.0
527,Schindler's List (1993),Drama|War,220.0


In [9]:
# Mean rating per movie, as a Series indexed by movieId.
# The "rating" column is selected *before* aggregating so that pandas does not
# average unrelated columns (userId, timestamp, ...) only to discard them.
average_ratings = ratings_dataframe.groupby("movieId")["rating"].mean()
average_ratings

movieId
1         3.920930
2         3.431818
3         3.259615
4         2.357143
5         3.071429
            ...   
193581    4.000000
193583    3.500000
193585    3.500000
193587    3.500000
193609    4.000000
Name: rating, Length: 9724, dtype: float64

In [11]:
# Attach the per-movie mean rating as a new column, aligned on the movieId
# index.
movies_dataframe = movies_dataframe.assign(averageRating=average_ratings)
movies_dataframe

Unnamed: 0_level_0,title,genres,ratingsCount,averageRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0,3.920930
2,Jumanji (1995),Adventure|Children|Fantasy,110.0,3.431818
3,Grumpier Old Men (1995),Comedy|Romance,52.0,3.259615
4,Waiting to Exhale (1995),Comedy|Drama|Romance,7.0,2.357143
5,Father of the Bride Part II (1995),Comedy,49.0,3.071429
...,...,...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,1.0,4.000000
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,1.0,3.500000
193585,Flint (2017),Drama,1.0,3.500000
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,1.0,3.500000


In [15]:
# Ten most-rated movies; ties on ratingsCount are broken by the higher
# averageRating.
(
    movies_dataframe
    .sort_values(by=["ratingsCount", "averageRating"], ascending=False)
    .head(10)
)

Unnamed: 0_level_0,title,genres,ratingsCount,averageRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
356,Forrest Gump (1994),Comedy|Drama|Romance|War,329.0,4.164134
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307.0,4.197068
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279.0,4.16129
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278.0,4.192446
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,251.0,4.231076
480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,238.0,3.75
110,Braveheart (1995),Action|Drama|War,237.0,4.031646
589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,224.0,3.970982
527,Schindler's List (1993),Drama|War,220.0,4.225


In [18]:
# Movies with fewer ratings than this are excluded from the subset below.
MINIMUM_RATINGS_COUNT = 100

# Keep only the movies that meet the threshold, ordered from the
# least-rated to the most-rated.
minimum_ratings_subset = (
    movies_dataframe
    .query(f"ratingsCount >= {MINIMUM_RATINGS_COUNT}")
    .sort_values(by="ratingsCount")
)
minimum_ratings_subset

Unnamed: 0_level_0,title,genres,ratingsCount,averageRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
912,Casablanca (1942),Drama|Romance,100.0,4.240000
4022,Cast Away (2000),Drama,100.0,3.700000
1517,Austin Powers: International Man of Mystery (1...,Action|Adventure|Comedy,100.0,3.535000
44191,V for Vendetta (2006),Action|Sci-Fi|Thriller|IMAX,100.0,3.885000
454,"Firm, The (1993)",Drama|Thriller,101.0,3.534653
...,...,...,...,...
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278.0,4.192446
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279.0,4.161290
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307.0,4.197068
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022


In [19]:
import numpy

# Example coordinates for two users, used to illustrate the distance
# computation.
USER_1_POS_X, USER_1_POS_Y = 3, 7.5
USER_2_POS_X, USER_2_POS_Y = 7, 8

# Each position is a 2-element vector (x, y).
pos_user_1 = numpy.array((USER_1_POS_X, USER_1_POS_Y))
pos_user_2 = numpy.array((USER_2_POS_X, USER_2_POS_Y))

def find_distance_between_users(pos_user_1, pos_user_2):
  """Return the Euclidean distance between two users' positions.

  Works for any number of coordinates, not just two, and accepts any
  array-like input (lists, tuples, numpy arrays).

  Args:
    pos_user_1: Coordinates of the first user.
    pos_user_2: Coordinates of the second user, compatible in shape with
      ``pos_user_1``.

  Returns:
    The distance as a numpy float for 1-D positions. If the inputs have
    extra trailing axes, the distance is computed along the first axis
    and an array is returned.

  Raises:
    ValueError: If numpy cannot subtract the two positions because their
      shapes are incompatible.
  """
  delta = numpy.asarray(pos_user_1) - numpy.asarray(pos_user_2)

  # Summing the squared differences along the first axis extends the 2-D
  # formula sqrt(dx**2 + dy**2) to any number of coordinates.
  return numpy.sum(delta ** 2, axis=0) ** (1/2)

# Distance between the two example positions pos_user_1 and pos_user_2.
find_distance_between_users(pos_user_1, pos_user_2)

4.031128874149275

In [24]:
def find_user_ratings(userId, ratings=None):
  """Return the ratings given by one user, indexed by movieId.

  Args:
    userId: Value to match against the ``userId`` column.
    ratings: Optional DataFrame with ``userId``, ``movieId`` and ``rating``
      columns. When omitted, the notebook-level ``ratings_dataframe`` is
      used, so existing one-argument calls keep working.

  Returns:
    A DataFrame with a single ``rating`` column, indexed by ``movieId``.
    It is empty when no row matches ``userId``.
  """
  if ratings is None:
    ratings = ratings_dataframe

  # A boolean mask compares the id as a value instead of splicing it into a
  # query expression string, so it does not depend on how the id is rendered
  # as text.
  is_requested_user = ratings["userId"] == userId

  return ratings.loc[is_requested_user, ["movieId", "rating"]].set_index("movieId")

# Ids of the two users whose ratings are compared.
ID_OF_USER_1, ID_OF_USER_2 = 1, 610

# One frame per user: that user's ratings, indexed by movieId.
user_1_ratings = find_user_ratings(userId=ID_OF_USER_1)
user_2_ratings = find_user_ratings(userId=ID_OF_USER_2)

In [26]:
# Line up both users' ratings on movieId, then drop every row with a missing
# rating so that only movies rated by both users remain.
ratings_comparison = (
    user_1_ratings
    .join(user_2_ratings, lsuffix="_user1", rsuffix="_user2")
    .dropna()
)
ratings_comparison

Unnamed: 0_level_0,rating_user1,rating_user2
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.0,5.0
6,4.0,5.0
47,5.0,5.0
50,5.0,4.0
70,3.0,4.0
...,...,...
3671,5.0,5.0
3703,5.0,4.5
3740,4.0,4.5
3744,4.0,3.0


In [27]:
# Ratings of each user for the movies present in ratings_comparison.
user1_compared = ratings_comparison.loc[:, "rating_user1"]
user2_compared = ratings_comparison.loc[:, "rating_user2"]

# Euclidean (L2) norm of the element-wise rating differences.
numpy.linalg.norm(user1_compared.sub(user2_compared))

8.031189202104505