In [14]:
import pandas as pd
from sklearn.decomposition import PCA
import sklearn.neighbors

In [15]:
# Column names are passed through `names`, so pandas reads the first line of
# each file as data rather than as a header row.
ratings_columns = ["UserID", "MovieID", "Ratings", "Timestamp"]
reviews = pd.read_csv("../dataset/ratings.csv", names=ratings_columns)
# The timestamp is not used anywhere below, so it is dropped immediately.
reviews = reviews.drop(columns=["Timestamp"])

# The movie catalogue is read as latin1, and the pipe-separated genre string
# is turned into a list of genre names per movie.
movies = pd.read_csv(
    "../dataset/movies.csv",
    encoding="latin1",
    names=["MovieID", "Title", "Genre"],
).assign(Genre=lambda catalogue: catalogue["Genre"].str.split("|"))

movies

Unnamed: 0,MovieID,Title,Genre
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),"[Action, Animation, Comedy, Fantasy]"
9738,193583,No Game No Life: Zero (2017),"[Animation, Comedy, Fantasy]"
9739,193585,Flint (2017),[Drama]
9740,193587,Bungo Stray Dogs: Dead Apple (2018),"[Action, Animation]"


In [16]:
# Attach title and genre list to every rating. The join uses pandas' default
# inner mode on MovieID, so only movies present in both tables are kept.
movie_review_df = reviews.merge(movies, on="MovieID")
movie_review_df

Unnamed: 0,UserID,MovieID,Ratings,Title,Genre
0,1,1,4.0,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,1,3,4.0,Grumpier Old Men (1995),"[Comedy, Romance]"
2,1,6,4.0,Heat (1995),"[Action, Crime, Thriller]"
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),"[Mystery, Thriller]"
4,1,50,5.0,"Usual Suspects, The (1995)","[Crime, Mystery, Thriller]"
...,...,...,...,...,...
100831,610,166534,4.0,Split (2017),"[Drama, Horror, Thriller]"
100832,610,168248,5.0,John Wick: Chapter Two (2017),"[Action, Crime, Thriller]"
100833,610,168250,5.0,Get Out (2017),[Horror]
100834,610,168252,5.0,Logan (2017),"[Action, Sci-Fi]"


In [17]:
# Movie x user rating table: one row per MovieID, one column per UserID,
# with NaN wherever that user has not rated that movie.
ratings_matrix = movie_review_df.pivot_table(
    index="MovieID",
    columns="UserID",
    values="Ratings"
)

# Fill each missing rating with the mean rating of that movie (row mean).
# This is done in one vectorised pass instead of a Python-level lambda per
# movie row: `where` keeps the observed ratings and, with axis=0, broadcasts
# the per-movie mean (aligned on MovieID) across the user columns.
movie_mean_rating = ratings_matrix.mean(axis=1)
movie_review_matrix = ratings_matrix.where(
    ratings_matrix.notna(), movie_mean_rating, axis=0
)

In [18]:
# Reduce every movie's mean-filled rating vector to 50 principal components.
# random_state is pinned because, depending on the scikit-learn version, the
# automatic solver choice for a matrix of this size can be the randomized SVD,
# which would otherwise give slightly different components on each run.
pca = PCA(n_components=50, random_state=42)
ratings_pca = pca.fit_transform(movie_review_matrix)

# Nearest-neighbour index over the reduced movie vectors (library defaults).
# Row i of ratings_pca corresponds to movie_review_matrix.index[i].
model_knn = sklearn.neighbors.NearestNeighbors()
model_knn.fit(ratings_pca)

# MovieID -> Title lookup table.
movie_id_to_title = movies.set_index('MovieID')['Title'].to_dict()

input_title = "Star Wars: Episode IV - A New Hope (1977)"  # Example input title

In [19]:
recList = []

# Find the catalogue row of the requested title. The first match is used if
# the title occurs more than once.
input_movie = movies[movies['Title'] == input_title]
if input_movie.empty:
    # Without this guard the failure is an opaque IndexError from `.values[0]`.
    raise ValueError(f"No movie titled {input_title!r} was found in the catalogue.")
input_id = input_movie['MovieID'].values[0]
input_genres = input_movie["Genre"].values[0]

# The rating matrix only contains movies that appear in the ratings table, so
# a catalogued movie without any rating has no row (and no PCA vector) here.
if input_id not in movie_review_matrix.index:
    raise ValueError(
        f"{input_title!r} has no ratings, so no neighbours can be computed for it."
    )

# Positional row of the input movie, then its 100 nearest rows in PCA space.
# `indices` has shape (1, 100) and holds row positions, not MovieIDs.
movie_idx = movie_review_matrix.index.get_loc(input_id)
_, indices = model_knn.kneighbors(
    ratings_pca[movie_idx].reshape(1, -1), n_neighbors=100
)

In [20]:
# Rebuild the list from scratch so that re-running this cell does not append
# a second copy of the recommendations to the results of an earlier run.
recList = []
max_recommendations = 5

# Walk the neighbours from nearest to farthest. `indices[0]` holds row
# positions in movie_review_matrix; the loop is bounded by however many
# neighbours were actually returned.
for neighbor_pos in indices[0]:
    if len(recList) >= max_recommendations:
        break

    candidate_id = movie_review_matrix.index[neighbor_pos]

    # Skip the query movie by ID rather than by assuming it sits at position
    # 0: movies with identical vectors tie at distance 0, so the query is not
    # guaranteed to be returned first.
    if candidate_id == input_id:
        continue

    rec_movie = movies[movies["MovieID"] == candidate_id]
    candidate_genres = rec_movie["Genre"].values[0]

    # Crude genre filter: a neighbour is kept only when it shares at least one
    # genre with the input movie. TODO: allow the user to choose the genres to
    # match instead of always using the input movie's genres.
    if any(genre in input_genres for genre in candidate_genres):
        recList.append(rec_movie["Title"].values[0])

print(recList)

['Star Wars: Episode V - The Empire Strikes Back (1980)', 'Star Wars: Episode VI - Return of the Jedi (1983)', 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)', 'Indiana Jones and the Last Crusade (1989)', 'Yojimbo (1961)']


# Todo
- Weight the movies by the number of reviews each one has received.
- Recommend movies based on genre as well as rating score (genre is currently only a filter applied to the nearest neighbours).
- Then possibly take the director into account as well.