In [255]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Concatenate, Dense, Flatten
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder


In [256]:
movies = pd.read_csv("dataset/small_dataset/movies_full_2.csv")
ratings = pd.read_csv("dataset/small_dataset/ratings.csv")
tags = pd.read_csv("dataset/small_dataset/tags.csv")
tags.drop(columns=['userId', 'timestamp'], inplace=True)

In [257]:
def create_weighted_rating_tags_df(movies_df, ratings_df, tags_df):
    movies_rating_user_df = pd.merge(movies_df, ratings_df, on="movieId", how="inner")

    movies_rating_df = movies_rating_user_df[['movieId', 'title', 'rating', 'genres', 'year', 'url']].groupby(['movieId', 'title', 'genres', 'year', 'url'])['rating'].agg(['count', 'mean']).round(1)
    movies_rating_df.sort_values('count', ascending=False, inplace=True)
    movies_rating_df.rename(columns={'count' : 'Num_ratings', 'mean': 'Average_rating'}, inplace=True)

    C = round(ratings_df['rating'].mean(), 2)
    m = 500
    movies_rating_df['Bayesian_rating'] = (movies_rating_df['Num_ratings'] / (movies_rating_df['Num_ratings'] + m)) * movies_rating_df['Average_rating'] + (m / (movies_rating_df['Num_ratings'] + m)) * C
    movies_rating_df.drop(columns='Average_rating', inplace=True)
    movies_rating_df.rename(columns={'Num_ratings' : 'count', 'Bayesian_rating' : 'weighted_rating'}, inplace=True)
    movies_rating_df.reset_index(inplace=True)
    

    movies_rating_tags_df = pd.merge(movies_rating_df, tags_df, how='left', on='movieId')
    movies_rating_tags_df['tag'] = movies_rating_tags_df['tag'].fillna(value='')
    movies_rating_tags_df = movies_rating_tags_df.groupby(['movieId', 'title', 'genres', 'year', 'url', 'count', 'weighted_rating'])['tag'].apply(list).reset_index()
    movies_rating_tags_df['genres'] = movies_rating_tags_df['genres'].str.split('|')
    movies_rating_tags_df['tag'] = movies_rating_tags_df['tag'].apply(lambda x: [] if x == [float('nan')] else x)
    movies_rating_tags_df.sort_values(by='weighted_rating', ascending=False, inplace=True)
    return movies_rating_tags_df


movies_rating_tags_df = create_weighted_rating_tags_df(movies, ratings, tags)
movies_rating_tags_df.head(20)

Unnamed: 0,movieId,title,genres,year,url,count,weighted_rating,tag
276,318,"Shawshank Redemption, The","[Crime, Drama]",1994.0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,317,3.849204,"[prison, Stephen King, wrongful imprisonment, ..."
313,356,Forrest Gump,"[Comedy, Drama, Romance, War]",1994.0,https://m.media-amazon.com/images/M/MV5BNWIwOD...,329,3.777805,"[shrimp, Vietnam, bubba gump shrimp, lieutenan..."
256,296,Pulp Fiction,"[Comedy, Crime, Drama, Thriller]",1994.0,https://m.media-amazon.com/images/M/MV5BNGNhMD...,307,3.766295,"[good dialogue, great soundtrack, non-linear, ..."
509,593,"Silence of the Lambs, The","[Crime, Horror, Thriller]",1991.0,https://m.media-amazon.com/images/M/MV5BNjNhZT...,279,3.750706,"[Hannibal Lector, disturbing, drama, gothic, p..."
1932,2571,"Matrix, The","[Action, Sci-Fi, Thriller]",1999.0,https://m.media-amazon.com/images/M/MV5BNzQzOT...,278,3.750129,"[martial arts, sci-fi, alternate universe, phi..."
2217,2959,Fight Club,"[Action, Crime, Drama, Thriller]",1999.0,https://m.media-amazon.com/images/M/MV5BMmEzNT...,218,3.742897,"[dark comedy, psychology, thought-provoking, t..."
224,260,Star Wars: Episode IV - A New Hope,"[Action, Adventure, Sci-Fi]",1977.0,https://m.media-amazon.com/images/M/MV5BNzVlY2...,251,3.733955,"[classic, space action, action, sci-fi, EPIC, ..."
657,858,"Godfather, The","[Crime, Drama]",1972.0,https://m.media-amazon.com/images/M/MV5BM2MyNj...,192,3.721965,[Mafia]
460,527,Schindler's List,"[Drama, War]",1993.0,https://m.media-amazon.com/images/M/MV5BNDE4OT...,220,3.713889,"[moving, thought-provoking, Holocaust, based o..."
892,1196,Star Wars: Episode V - The Empire Strikes Back,"[Action, Adventure, Sci-Fi]",1980.0,https://m.media-amazon.com/images/M/MV5BYmU1ND...,211,3.707736,"[I am your father, space, space opera, classic..."


In [258]:
print(movies_rating_tags_df.iloc[2137])

movieId                                                       157110
title                           00 Schneider - Jagd auf Nihil Baxter
genres                                               [Comedy, Crime]
year                                                          1994.0
url                https://m.media-amazon.com/images/M/MV5BOTQzMz...
count                                                              1
weighted_rating                                             3.501996
tag                                                               []
Name: 8954, dtype: object


# Knowledge-based

In [259]:
def knowledge_based_recommendation(df, user_preferences):
    recommended_movies = df[~df['genres'].apply(lambda x: any(genre in user_preferences['disliked_genres'] for genre in x))]

    # Filter movies based on user's preferred genres
    recommended_movies = recommended_movies[recommended_movies['genres'].apply(lambda x: any(genre in user_preferences['preferred_genres'] for genre in x))]
    
   # Remove movies with disliked genres

    #print(recommended_movies)
  
    # recommended_movies = recommended_movies[~recommended_movies['genres'].apply(lambda x: any(genre in user_preferences['disliked_genres'] for genre in x))]



    # Return only the movie IDs of the top 3 recommendations
    return recommended_movies.head(5)['movieId'].tolist()

user_preferences = {'preferred_genres': ['Fantasy', 'Action', 'Thriller'], 'disliked_genres': ['Drama']}
k_recommendations_ids = knowledge_based_recommendation(movies_rating_tags_df, user_preferences)
print(k_recommendations_ids)



[593, 2571, 260, 1196, 50]


In [260]:
print(movies_rating_tags_df[movies_rating_tags_df['movieId'].isin(k_recommendations_ids)])


      movieId                                           title  \
509       593                       Silence of the Lambs, The   
1932     2571                                     Matrix, The   
224       260              Star Wars: Episode IV - A New Hope   
892      1196  Star Wars: Episode V - The Empire Strikes Back   
46         50                             Usual Suspects, The   

                           genres    year  \
509     [Crime, Horror, Thriller]  1991.0   
1932   [Action, Sci-Fi, Thriller]  1999.0   
224   [Action, Adventure, Sci-Fi]  1977.0   
892   [Action, Adventure, Sci-Fi]  1980.0   
46     [Crime, Mystery, Thriller]  1995.0   

                                                    url  count  \
509   https://m.media-amazon.com/images/M/MV5BNjNhZT...    279   
1932  https://m.media-amazon.com/images/M/MV5BNzQzOT...    278   
224   https://m.media-amazon.com/images/M/MV5BNzVlY2...    251   
892   https://m.media-amazon.com/images/M/MV5BYmU1ND...    211   
46    ht

# Content-based

In [261]:

def content_based_recommendation(df, last_seen_movies, user_preferences):

    # Combine relevant features
    df['features'] = df['genres'] + df['tag'] + df['title'].apply(lambda x: [x])  # Add movie titles to features
    
    df['features'] = df['features'].apply(lambda x: ' '.join(x))  # Ensure 'features' is a string
    
    # TF-IDF Vectorization
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['features'])
    
    # Calculate cosine similarity
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    
    # Get indices of movies similar to the last seen movies
    similar_movies_indices = []
    for movie_title in last_seen_movies['liked_movies']:
        movie_indices = df.index[df['title'] == movie_title]
        if len(movie_indices) > 0:
            idx = movie_indices[0]
            similar_movies_indices.extend(cosine_sim[idx].argsort()[-21:-1])  # Get top 20 similar movies

    #print(similar_movies_indices)
    
    # Remove duplicates and movies already seen
    similar_movies_indices = list(set(similar_movies_indices) - set(df.index[df['title'].isin(last_seen_movies)]))

    #print(similar_movies_indices)
    
    # Filter movies based on user's disliked genres
    
    if user_preferences['disliked_genres']:
        disliked_genres = user_preferences['disliked_genres']
        similar_movies_indices = [idx for idx in similar_movies_indices if not any(genre in disliked_genres for genre in df.loc[idx, 'genres'])]
    
    # Return recommended movie titles
    # return similar_movies_indices[:5]
    return df.loc[similar_movies_indices[:5], 'movieId'].tolist()

# Example usage:
user_preferences = {'preferred_genres': ['Fantasy', 'Action', 'Thriller'], 'disliked_genres': ['Drama']}
user_history = {'liked_movies': ['Zack and Miri Make a Porno', 'Casino', 'Cutthroat Island']}
content_based_recommendations_ids = content_based_recommendation(movies_rating_tags_df, user_history, user_preferences)
print(content_based_recommendations_ids)


[140110, 94833, 128097, 7832, 8]


In [262]:
#content_based_recomendations_df = movies_rating_tags_df.loc[content_based_recommendations_ids]

content_based_recomendations_df = movies_rating_tags_df[movies_rating_tags_df['movieId'].isin(content_based_recommendations_ids)]
content_based_recomendations_df.head()

Unnamed: 0,movieId,title,genres,year,url,count,weighted_rating,tag,features
8704,140110,The Intern,[Comedy],2015.0,https://m.media-amazon.com/images/M/MV5BMTUyNj...,13,3.505068,[],Comedy The Intern
8453,128097,Jim Norton: American Degenerate,[Comedy],2013.0,https://m.media-amazon.com/images/M/MV5BMTU3NT...,1,3.500998,[],Comedy Jim Norton: American Degenerate
4998,7832,"Thin Man Goes Home, The","[Comedy, Crime, Mystery]",1945.0,https://m.media-amazon.com/images/M/MV5BOWQ2ND...,1,3.5,[Nick and Nora Charles],Comedy Crime Mystery Nick and Nora Charles Thi...
7683,94833,"Pirates! Band of Misfits, The","[Adventure, Animation, Children, Comedy]",2012.0,https://m.media-amazon.com/images/M/MV5BNDhkOG...,3,3.498807,[],Adventure Animation Children Comedy Pirates! ...
7,8,Tom and Huck,"[Adventure, Children]",1995.0,https://m.media-amazon.com/images/M/MV5BN2ZkZT...,8,3.490551,[],Adventure Children Tom and Huck


# collaborative-filtering

In [263]:
# Function to create the utility matrix
def create_utility_matrix(df):
    utility_matrix = df.pivot(index='userId', columns='movieId', values='rating').fillna(0)
    return utility_matrix

# Function for collaborative filtering recommendation using kNN
def collaborative_filtering_recommendation(df, user_id, k, num_recommendations, user_preferences):
    # Create the utility matrix
    utility_matrix = create_utility_matrix(df)
    
    # Map user IDs to indices
    user_indices = {user_id: idx for idx, user_id in enumerate(utility_matrix.index)}
    
    # Check if the given user ID exists
    if user_id not in user_indices:
        print("User ID does not exist.")
        return []
    
    # Calculate cosine similarity between users
    user_similarity = cosine_similarity(utility_matrix)
    
    # Find k nearest neighbors for the target user
    knn = NearestNeighbors(n_neighbors=k, metric='cosine')
    knn.fit(user_similarity)
    _, indices = knn.kneighbors([user_similarity[user_indices[user_id]]])
    
    # Get the ratings of the nearest neighbors
    neighbor_ratings = utility_matrix.iloc[indices[0]]
    
    # Calculate the average rating for each item
    item_ratings = neighbor_ratings.mean(axis=0)
    
    # Filter out items already rated by the target user
    user_ratings = utility_matrix.loc[user_id]
    recommended_items = item_ratings[user_ratings == 0].sort_values(ascending=False).index.tolist()[:num_recommendations]
    
    # # Filter out recommended items based on user preferences
    # if user_preferences['disliked_genres']:
    #     df_filtered = movies_rating_tags_df[~movies_rating_tags_df['genres'].apply(lambda x: any(genre in user_preferences['disliked_genres'] for genre in x))]
    #     recommended_items = [item for item in recommended_items if item in df_filtered['movieId']]
    
    return movies_rating_tags_df.loc[recommended_items, 'movieId'].tolist()

# Example usage:
user_id = 200010
k_neighbors = 5
num_recommendations = 5
user_preferences = {'preferred_genres': ['Fantasy', 'Action', 'Thriller'], 'disliked_genres': ['Drama']}
collaborative_filtering_recommendations = collaborative_filtering_recommendation(ratings, user_id, k_neighbors, num_recommendations, user_preferences)


# Get collaborative filtering recommendations using kNN
collaborative_filtering_recommendations_ids = collaborative_filtering_recommendation(ratings, user_id, k_neighbors, num_recommendations, user_preferences)

# Display recommended movie IDs
print("Collaborative Filtering Recommendations:", collaborative_filtering_recommendations_ids)

# Filter the movies DataFrame based on recommended movie IDs
collaborative_filtering_recommendations_df = movies_rating_tags_df[movies_rating_tags_df['movieId'].isin(collaborative_filtering_recommendations_ids)]
#collaborative_filtering_recommendations_df = movies_rating_tags_df.loc[collaborative_filtering_recommendations_ids]

# Display recommended movies
print("Collaborative Filtering Recommendations:")
print(collaborative_filtering_recommendations_df)

Collaborative Filtering Recommendations: [1679, 5673, 4, 741, 132]
Collaborative Filtering Recommendations:
      movieId                                title                    genres  \
594       741  Ghost in the Shell (Kôkaku kidôtai)       [Animation, Sci-Fi]   
3996     5673                     Punch-Drunk Love  [Comedy, Drama, Romance]   
1258     1679                Chairman of the Board                  [Comedy]   
112       132                                 Jade                [Thriller]   
3           4                    Waiting to Exhale  [Comedy, Drama, Romance]   

        year                                                url  count  \
594   1995.0  https://m.media-amazon.com/images/M/MV5BYWRiYj...     27   
3996  2002.0  https://m.media-amazon.com/images/M/MV5BYmE1OT...     33   
1258  1998.0  https://m.media-amazon.com/images/M/MV5BMDhmYz...      1   
112   1995.0  https://m.media-amazon.com/images/M/MV5BYTcwZD...      6   
3     1995.0  https://m.media-amazon.com/

# Hybrid-filtering

In [264]:
def hybrid_based_recommendation(knowledge_recommendations_ids, content_recommendations_ids, collaborative_recommendations_ids):

    knowledge_weight = 1
    content_weight = 0.5
    collaborative_weight = 0.8

     # Calculate the number of movies to be recommended from each technique
    num_movies_per_technique = 5
    
    # Apply weights to each technique's recommendations
    weighted_knowledge_recommendations = knowledge_recommendations_ids[:num_movies_per_technique] * knowledge_weight
    weighted_content_recommendations = content_recommendations_ids[:num_movies_per_technique] * content_weight
    weighted_collaborative_recommendations = collaborative_recommendations_ids[:num_movies_per_technique] * collaborative_weight
    
    # Combine recommendations from all techniques
    combined_recommendations = weighted_knowledge_recommendations + weighted_content_recommendations + weighted_collaborative_recommendations

    combined_recommendations = list(set(combined_recommendations))

    
    # # Ensure the total number of recommendations is not more than 10
    combined_recommendations = combined_recommendations[:10]
    
    return combined_recommendations




user_preferences = {'preferred_genres' : ['Fantasy', 'Action', 'Thriller'], 'disliked_genres' : ['Drama']}
user_history = {'liked_movies': ['Zack and Miri Make a Porno', 'Cutthroat Island']}
user_id = 200010
k_neighbors = 5
num_recommendations = 5

k_recommendations_ids = knowledge_based_recommendation(movies_rating_tags_df, user_preferences)
content_based_recommendations_ids = content_based_recommendation(movies_rating_tags_df, user_history, user_preferences)
collaborative_filtering_recommendations_ids = collaborative_filtering_recommendation(ratings, user_id, k_neighbors, num_recommendations, user_preferences)
    
recommendations_hybrid = hybrid_based_recommendation(k_recommendations_ids,content_based_recommendations_ids, collaborative_filtering_recommendations_ids)
print(recommendations_hybrid)


[260, 4, 741, 7, 132, 5673, 10, 2571, 1196, 140110]


In [265]:
movies_recomend = movies_rating_tags_df[movies_rating_tags_df['movieId'].isin(recommendations_hybrid)]
print(movies_recomend)

disliked_genre = 'Drama'

# Filter out movies with the disliked genre
filtered_recommendations = movies_recomend[~movies_recomend['genres'].str.contains(disliked_genre, regex=False)]

# Print or use filtered_recommendations as needed
filtered_recommendations.head(10)

      movieId                                           title  \
1932     2571                                     Matrix, The   
224       260              Star Wars: Episode IV - A New Hope   
892      1196  Star Wars: Episode V - The Empire Strikes Back   
594       741             Ghost in the Shell (Kôkaku kidôtai)   
3996     5673                                Punch-Drunk Love   
8704   140110                                      The Intern   
9          10                                       GoldenEye   
112       132                                            Jade   
3           4                               Waiting to Exhale   
6           7                                         Sabrina   

                             genres    year  \
1932     [Action, Sci-Fi, Thriller]  1999.0   
224     [Action, Adventure, Sci-Fi]  1977.0   
892     [Action, Adventure, Sci-Fi]  1980.0   
594             [Animation, Sci-Fi]  1995.0   
3996       [Comedy, Drama, Romance]  2002.0   
87

Unnamed: 0,movieId,title,genres,year,url,count,weighted_rating,tag,features
1932,2571,"Matrix, The","[Action, Sci-Fi, Thriller]",1999.0,https://m.media-amazon.com/images/M/MV5BNzQzOT...,278,3.750129,"[martial arts, sci-fi, alternate universe, phi...",Action Sci-Fi Thriller martial arts sci-fi alt...
224,260,Star Wars: Episode IV - A New Hope,"[Action, Adventure, Sci-Fi]",1977.0,https://m.media-amazon.com/images/M/MV5BNzVlY2...,251,3.733955,"[classic, space action, action, sci-fi, EPIC, ...",Action Adventure Sci-Fi classic space action a...
892,1196,Star Wars: Episode V - The Empire Strikes Back,"[Action, Adventure, Sci-Fi]",1980.0,https://m.media-amazon.com/images/M/MV5BYmU1ND...,211,3.707736,"[I am your father, space, space opera, classic...",Action Adventure Sci-Fi I am your father space...
594,741,Ghost in the Shell (Kôkaku kidôtai),"[Animation, Sci-Fi]",1995.0,https://m.media-amazon.com/images/M/MV5BYWRiYj...,27,3.53074,[],Animation Sci-Fi Ghost in the Shell (Kôkaku k...
8704,140110,The Intern,[Comedy],2015.0,https://m.media-amazon.com/images/M/MV5BMTUyNj...,13,3.505068,[],Comedy The Intern
9,10,GoldenEye,"[Action, Adventure, Thriller]",1995.0,https://m.media-amazon.com/images/M/MV5BMzk2OT...,132,3.5,[],Action Adventure Thriller GoldenEye
112,132,Jade,[Thriller],1995.0,https://m.media-amazon.com/images/M/MV5BYTcwZD...,6,3.4917,[],Thriller Jade
6,7,Sabrina,"[Comedy, Romance]",1995.0,https://m.media-amazon.com/images/M/MV5BYjQ5Zj...,54,3.470758,[remake],Comedy Romance remake Sabrina


# Debugging


In [266]:
# user_id = 5

# # Filter ratings for the specified user
# ratings_user = ratings[ratings['userId'] == user_id]

# # Get the IDs of the last three movies seen by the user
# titles_ids = ratings_user['movieId'].tail(3).tolist()

# # Filter the movie titles based on the IDs
# title_list = movies_rating_tags_df[movies_rating_tags_df['movieId'].isin(titles_ids)]['title'].tolist()

# # Print the titles
# print(title_list)

user_preferences = {'disliked_genres' : ['Drama']}
print(movies_rating_tags_df[~movies_rating_tags_df['genres'].apply(lambda x: any(genre in user_preferences['disliked_genres'] for genre in x))])


      movieId                                           title  \
509       593                       Silence of the Lambs, The   
1932     2571                                     Matrix, The   
224       260              Star Wars: Episode IV - A New Hope   
892      1196  Star Wars: Episode V - The Empire Strikes Back   
46         50                             Usual Suspects, The   
...       ...                                             ...   
18         19                  Ace Ventura: When Nature Calls   
301       344                      Ace Ventura: Pet Detective   
378       435                                       Coneheads   
2022     2701                                  Wild Wild West   
126       153                                  Batman Forever   

                                  genres    year  \
509            [Crime, Horror, Thriller]  1991.0   
1932          [Action, Sci-Fi, Thriller]  1999.0   
224          [Action, Adventure, Sci-Fi]  1977.0   
892        