In [6]:
import pandas as pd


In [7]:
# Load the two input tables from CSV files under dataset/small_dataset/.
# Later cells join them on a shared "movieId" column.
movies = pd.read_csv("dataset/small_dataset/movies_full_2.csv")
ratings = pd.read_csv("dataset/small_dataset/ratings.csv")

In [8]:
def create_weighted_rating_df(movies_df, ratings_df, prior_weight=500):
    """Build a per-movie table ranked by a Bayesian (shrunken) average rating.

    Each movie's mean rating is pulled toward the global mean rating ``C``:

        weighted = n / (n + m) * mean + m / (n + m) * C

    where ``n`` is the movie's number of ratings and ``m`` is ``prior_weight``.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Must contain the columns ``movieId``, ``title``, ``genres``, ``year``
        and ``url``. ``genres`` is expected to be a ``'|'``-separated string.
    ratings_df : pandas.DataFrame
        Must contain the columns ``movieId`` and ``rating``.
    prior_weight : int or float, default 500
        The ``m`` term of the formula: how many "virtual" ratings at the
        global mean are mixed into every movie. Larger values shrink scores
        harder toward ``C``; ``0`` disables shrinkage. Pick a value on the
        scale of typical rating counts in the dataset.

    Returns
    -------
    pandas.DataFrame
        One row per rated movie with columns ``movieId``, ``title``,
        ``genres`` (list of str), ``year``, ``url``, ``count`` and
        ``weighted_rating``, sorted by ``weighted_rating`` descending and
        carrying a fresh 0..N-1 index.

    Raises
    ------
    ValueError
        If ``prior_weight`` is negative.

    Notes
    -----
    * Movies without any rating are excluded (inner join).
    * ``groupby`` drops groups whose key contains NaN by default, so movies
      with a missing ``year`` or ``url`` do not appear in the result.
    * The per-movie mean is rounded to one decimal and ``C`` to two decimals
      before they are combined; this is kept so existing results stay stable.
    """
    if prior_weight < 0:
        raise ValueError("prior_weight must be non-negative")

    group_keys = ['movieId', 'title', 'genres', 'year', 'url']

    merged = pd.merge(movies_df, ratings_df, on="movieId", how="inner")

    # Per-movie rating count and mean; the metadata columns ride along as
    # group keys so they survive the aggregation.
    stats = (
        merged[group_keys + ['rating']]
        .groupby(group_keys)['rating']
        .agg(['count', 'mean'])
        .round(1)
        .sort_values('count', ascending=False)
    )

    # Global mean rating: the value every movie is shrunk toward.
    C = round(ratings_df['rating'].mean(), 2)
    m = prior_weight
    num_ratings = stats['count']
    weighted = (num_ratings / (num_ratings + m)) * stats['mean'] + (m / (num_ratings + m)) * C

    result = (
        stats.drop(columns='mean')
        .assign(weighted_rating=weighted)
        .sort_values(by='weighted_rating', ascending=False)
        .reset_index()
    )
    # Turn "A|B|C" into ['A', 'B', 'C'] so callers can do set operations.
    result['genres'] = result['genres'].str.split('|')

    return result




In [9]:
# Build the per-movie summary (rating count + weighted rating) from the two
# tables loaded above.
movies_rating_df = create_weighted_rating_df(movies, ratings)
# Preview the first rows; the function returns them sorted by weighted_rating
# in descending order.
movies_rating_df.head(15)

Unnamed: 0,movieId,title,genres,year,url,count,weighted_rating
0,318,"Shawshank Redemption, The","[Crime, Drama]",1994.0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,317,3.849204
1,356,Forrest Gump,"[Comedy, Drama, Romance, War]",1994.0,https://m.media-amazon.com/images/M/MV5BNWIwOD...,329,3.777805
2,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994.0,https://m.media-amazon.com/images/M/MV5BNGNhMD...,307,3.766295
3,593,"Silence of the Lambs, The","[Crime, Horror, Thriller]",1991.0,https://m.media-amazon.com/images/M/MV5BNjNhZT...,279,3.750706
4,2571,"Matrix, The","[Action, Sci-Fi, Thriller]",1999.0,https://m.media-amazon.com/images/M/MV5BNzQzOT...,278,3.750129
5,2959,Fight Club,"[Action, Crime, Drama, Thriller]",1999.0,https://m.media-amazon.com/images/M/MV5BMmEzNT...,218,3.742897
6,260,Star Wars: Episode IV - A New Hope,"[Action, Adventure, Sci-Fi]",1977.0,https://m.media-amazon.com/images/M/MV5BNzVlY2...,251,3.733955
7,858,"Godfather, The","[Crime, Drama]",1972.0,https://m.media-amazon.com/images/M/MV5BM2MyNj...,192,3.721965
8,527,Schindler's List,"[Drama, War]",1993.0,https://m.media-amazon.com/images/M/MV5BNDE4OT...,220,3.713889
9,1196,Star Wars: Episode V - The Empire Strikes Back,"[Action, Adventure, Sci-Fi]",1980.0,https://m.media-amazon.com/images/M/MV5BYmU1ND...,211,3.707736


In [17]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
    
    
    
def find_movie_indices(df, title, n_candidates=19):
    """Return index labels of the movies whose genres are most similar to ``title``.

    Genres are turned into bag-of-words vectors with ``CountVectorizer`` and
    compared to the queried movie with cosine similarity.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``title`` column and a ``genres`` column holding a
        list of genre strings per row.
    title : str
        Exact title to look up. If several rows share the title, the first
        one (in row order) is used as the query.
    n_candidates : int, default 19
        Maximum number of similar movies to return.

    Returns
    -------
    list
        Index labels of ``df`` (usable with ``df.loc``), most similar first.
        Ties keep the row order of ``df``. The queried row itself is never
        included.

    Raises
    ------
    IndexError
        If no row has the given title.
    """
    # Work with row positions for the similarity matrix and translate back
    # to index labels at the end, so this is correct for any index, not only
    # a default RangeIndex.
    matches = (df['title'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"No movie titled {title!r} found in the DataFrame")
    query_pos = int(matches[0])

    # NOTE: CountVectorizer's default tokenizer splits on non-word characters,
    # so a hyphenated genre such as "Sci-Fi" contributes two tokens.
    genres_str = df['genres'].apply(' '.join)
    genre_matrix = CountVectorizer().fit_transform(genres_str)

    # Only the query row of the similarity matrix is needed; computing the
    # full N x N matrix would cost O(N^2) memory for no benefit.
    sim_row = cosine_similarity(
        genre_matrix[query_pos:query_pos + 1], genre_matrix
    ).ravel()

    # Exclude the queried movie explicitly. Dropping "the first result" is
    # not enough: movies with an identical genre set tie at the top score
    # and may sort ahead of the query.
    ranked = sorted(
        (pos for pos in range(len(sim_row)) if pos != query_pos),
        key=lambda pos: sim_row[pos],
        reverse=True,
    )
    return df.index[ranked[:n_candidates]].tolist()

def recommend_movies(df, movie_indices, preferred_genres=None, disliked_genres=None, max_results=5):
    """Filter candidate movies by genre preferences and return the top few.

    Candidates are visited in the order given. A candidate is kept when it
    shares no genre with ``disliked_genres`` and, if ``preferred_genres`` is
    given, shares at least one genre with it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``genres`` (iterable of str per row),
        ``title``, ``year``, ``url``, ``count`` and ``weighted_rating``.
    movie_indices : iterable
        Index labels of ``df`` to consider, best candidate first.
    preferred_genres : iterable of str, optional
        If non-empty, only movies with at least one of these genres are kept.
    disliked_genres : iterable of str, optional
        If non-empty, movies with any of these genres are skipped.
    max_results : int, default 5
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``df`` restricted to the columns ``title``,
        ``year``, ``url``, ``count`` and ``weighted_rating``, in candidate
        order. Empty if nothing passes the filters.
    """
    # Build the filter sets once instead of on every loop iteration.
    preferred = set(preferred_genres) if preferred_genres else set()
    disliked = set(disliked_genres) if disliked_genres else set()

    selected = []
    for label in movie_indices:
        # Stop as soon as the requested number of movies has been collected.
        if len(selected) >= max_results:
            break

        movie_genres = set(df.loc[label, 'genres'])

        # A single disliked genre is enough to reject the movie.
        if disliked and not movie_genres.isdisjoint(disliked):
            continue

        # When preferences are given, require at least one match.
        if preferred and movie_genres.isdisjoint(preferred):
            continue

        selected.append(label)

    return df.loc[selected, ['title', 'year', 'url', 'count', 'weighted_rating']]
    
 
# Example query: genre filters applied on top of the similarity ranking.
preferred_genres = ['Animation', 'Drama', 'Mystery']
disliked_genres = ['Action']

# Step 1: rank candidates by genre similarity to 'Toy Story'.
movie_indices_list = find_movie_indices(movies_rating_df, 'Toy Story')
# Step 2: keep only candidates that pass the genre filters.
recommended_movies = recommend_movies(movies_rating_df, movie_indices_list, preferred_genres, disliked_genres)
print(recommended_movies)

                                                 title    year  \
76                                      Monsters, Inc.  2001.0   
110                                        Toy Story 2  1999.0   
596                          Emperor's New Groove, The  2000.0   
1625  Asterix and the Vikings (Astérix et les Vikings)  2006.0   
4725                           Tale of Despereaux, The  2008.0   

                                                    url  count  \
76    https://m.media-amazon.com/images/M/MV5BMTY1NT...    132   
110   https://m.media-amazon.com/images/M/MV5BMWM5ZD...     97   
596   https://m.media-amazon.com/images/M/MV5BZGQwNm...     37   
1625  https://m.media-amazon.com/images/M/MV5BYTU3M2...      1   
4725  https://m.media-amazon.com/images/M/MV5BMTg0MT...      1   

      weighted_rating  
76           3.583544  
110          3.564992  
596          3.513780  
1625         3.502994  
4725         3.499002  
