# MOVIE RECOMMENDER SYSTEM

# Objective 1:

Create a popularity-based recommender system at a genre level. The user will input a 
genre (g), minimum rating threshold (t) for a movie, and no. of
recommendations(N) for which it should be recommended top N movies which are 
most popular within that genre (g) ordered by ratings in descending order where each 
movie has at least (t) reviews.

In [4]:
import pandas as pd

In [5]:
# Load your movie dataset
movies_data = pd.read_csv('movies.csv') 
ratings_data = pd.read_csv('ratings.csv') 

In [6]:
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [8]:
# Merge based on 'movieId'
movie_info = pd.merge(movies_data, ratings_data, on='movieId', how='inner')
movie_info.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,5.0,859046895
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,1303501039
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,5.0,858610933
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.0,850815810
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,4.0,851766286


In [9]:
#movie_info[movie_info['title'] == 'Cosmic Scrat-tastrophe (2015)']

In [10]:
#finding the total number of individual rating of each movie
num_reviews = movie_info.groupby('title')['rating'].count().reset_index(name='num_reviews')

num_reviews = num_reviews.nlargest(20,'num_reviews') # Display the resulting DataFrame with the top 20 rows
num_reviews.head()

Unnamed: 0,title,num_reviews
7323,Pulp Fiction (1994),325
3349,Forrest Gump (1994),311
8136,"Shawshank Redemption, The (1994)",308
4934,Jurassic Park (1993),294
8228,"Silence of the Lambs, The (1991)",290


In [11]:
# Merge based on 'movieId'
merge_movie_info = pd.merge(movie_info, num_reviews, on='title', how='inner')
merge_movie_info.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,num_reviews
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2,5.0,859046895,232
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,1303501039,232
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,5.0,858610933,232
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.0,850815810,232
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,14,4.0,851766286,232


### Receiving User Input

In [13]:
print("Fill in all the neccessary fields")
g = input("Enter the genre: ")
t = float(input("Enter the minimum rating threshold: "))
N = int(input("Enter the number of recommendations: "))

Fill in all the neccessary fields


Enter the genre:  romance
Enter the minimum rating threshold:  4
Enter the number of recommendations:  4


In [14]:
genre_filtered = merge_movie_info[merge_movie_info['genres'].str.contains(g, case=False, na=False)]

In [15]:
genre_filtered.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,num_reviews
1614,356,Forrest Gump (1994),Comedy|Drama|Romance|War,1,3.0,1217896231,311
1615,356,Forrest Gump (1994),Comedy|Drama|Romance|War,3,3.0,841483545,311
1616,356,Forrest Gump (1994),Comedy|Drama|Romance|War,7,5.0,1322062754,311
1617,356,Forrest Gump (1994),Comedy|Drama|Romance|War,9,3.0,842686293,311
1618,356,Forrest Gump (1994),Comedy|Drama|Romance|War,10,4.0,1318897003,311


In [16]:
idx_max_rating = genre_filtered.groupby('title')['rating'].idxmax()

In [17]:
filtered_df = genre_filtered.loc[idx_max_rating]

In [18]:
high_rated_filtered = filtered_df[filtered_df['rating'] >= t]
high_rated_filtered.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,num_reviews
4830,2858,American Beauty (1999),Drama|Romance,6,5.0,1348693734,216
1616,356,Forrest Gump (1994),Comedy|Drama|Romance|War,7,5.0,1322062754,311


In [19]:
high_rated_filtered.shape

(2, 7)

In [20]:
top_N_recommendations = high_rated_filtered.sort_values(by=['rating'], ascending=False).head(N)
top_N_recommendations

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,num_reviews
4830,2858,American Beauty (1999),Drama|Romance,6,5.0,1348693734,216
1616,356,Forrest Gump (1994),Comedy|Drama|Romance|War,7,5.0,1322062754,311


In [21]:
print(f"Top {N} recommendations in the genre '{g}' with minimum rating {t}:\n")
#displaying dataframe for the filtered result
top_N_recommendations[['title', 'rating', 'num_reviews']]

Top 4 recommendations in the genre 'romance' with minimum rating 4.0:



Unnamed: 0,title,rating,num_reviews
4830,American Beauty (1999),5.0,216
1616,Forrest Gump (1994),5.0,311


## Objective 2:
Create a content-based recommender system that recommends top N movies based on 
similar movie(m) genres.

In [23]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [24]:
# Load MovieLens dataset
movies = pd.read_csv('movies.csv')

# Preprocess genres (convert to lowercase and remove spaces)
movies['genres'] = movies['genres'].str.lower().str.replace(' ', '')

# Create a TF-IDF vectorizer to convert genres into numerical features
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['genres'])
tfidf_matrix

<10329x22 sparse matrix of type '<class 'numpy.float64'>'
	with 24169 stored elements in Compressed Sparse Row format>

In [25]:
# Calculate cosine similarity between movies based on genres
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1.        , 0.79977247, 0.1589222 , ..., 0.2638368 , 0.        ,
        0.        ],
       [0.79977247, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.1589222 , 0.        , 1.        , ..., 0.60235038, 0.        ,
        0.        ],
       ...,
       [0.2638368 , 0.        , 0.60235038, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [68]:
# Function to get top N movie recommendations based on genres
def get_recommendations(movie_title, N=5):
    idx = movies.index[movies['title'] == movie_title].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:N+1]  # Exclude the movie itself (index 0) and get top N
    movie_indices = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices]

In [78]:
movie_title = input("Enter the moviet title: ")
N = int(input("Enter the number of recommendations: "))

Enter the moviet title:  Wild, The (2006)
Enter the number of recommendations:  4


In [80]:
# Example: Get top 5 recommendations for a movie with a given title
top_recommendations = get_recommendations(movie_title, N)

# Display the recommendations
print(f"Top {N} recommendations for '{movie_title}':")
top_recommendations

Top 4 recommendations for 'Wild, The (2006)':


1815                                       Antz (1998)
2496                                Toy Story 2 (1999)
2967    Adventures of Rocky and Bullwinkle, The (2000)
3166                  Emperor's New Groove, The (2000)
Name: title, dtype: object

## Objective 3
Create a collaborative based recommender system which recommends top N movies 
based on “K” similar users for a target user “u”

In [29]:
from surprise import Dataset, Reader
from surprise import KNNBasic
from surprise.model_selection import train_test_split, cross_validate

In [30]:
# Load MovieLens dataset (assuming 'ratings.csv' with columns 'userId', 'movieId', 'rating')
ratings = pd.read_csv('ratings.csv')

# Surprise requires a 'Reader' object to parse the dataframe
reader = Reader(rating_scale=(1, 5))

In [31]:
# Load the dataset
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split the dataset into training and test sets
trainset, testset = train_test_split(data, test_size=0.25)

In [32]:
# Use k-NN collaborative filtering algorithm
sim_options = {'name': 'cosine', 'user_based': True}
knn_model = KNNBasic(sim_options=sim_options)

# Train the model
knn_model.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f4a7b7bd790>

In [33]:
# Get top N movie recommendations for a target user
target_user_id = int(input("Enter the target your ID: "))
N = int(input("Enter the number of recommendations: "))

Enter the target your ID:  8
Enter the number of recommendations:  7


In [34]:
# Get the list of all movie IDs
all_movie_ids = ratings['movieId'].unique()
# Exclude movies that the target user has already rated
target_user_rated_movies = ratings[ratings['userId'] == target_user_id]['movieId'].tolist()
movies_to_predict = list(set(all_movie_ids) - set(target_user_rated_movies))

# Predict ratings for movies not yet rated by the target user
predictions = [knn_model.predict(target_user_id, movie_id) for movie_id in movies_to_predict]

In [35]:
# Sort the predictions by predicted rating in descending order
sorted_predictions = sorted(predictions, key=lambda x: x.est, reverse=True)

# Get top N recommended movies
top_recommendations = [(pred.iid, pred.est) for pred in sorted_predictions[:N]]

In [84]:
top_recommendations

1815                                       Antz (1998)
2496                                Toy Story 2 (1999)
2967    Adventures of Rocky and Bullwinkle, The (2000)
3166                  Emperor's New Groove, The (2000)
Name: title, dtype: object

In [36]:
# Display the recommendations
print(f"Top {N} recommendations for user {target_user_id}:")
for movie_id, predicted_rating in top_recommendations:
    movie_title = movies.loc[movies['movieId'] == movie_id, 'title'].values[0]
    print(f"Movie: {movie_title}, Predicted Rating: {predicted_rating}")

Top 7 recommendations for user 8:
Movie: Star Maker, The (Uomo delle stelle, L') (1995), Predicted Rating: 5
Movie: Fluke (1995), Predicted Rating: 5
Movie: 8 Seconds (1994), Predicted Rating: 5
Movie: Being Human (1993), Predicted Rating: 5
Movie: Heaven & Earth (1993), Predicted Rating: 5
Movie: Cold Fever (Á köldum klaka) (1995), Predicted Rating: 5
Movie: Captives (1994), Predicted Rating: 5


In [88]:
import pickle

In [107]:
pickle.dump(merge_movie_info, open('merge_movie_info.pkl', 'wb'))
pickle.dump(movies, open('movies.pkl', 'wb'))
pickle.dump(ratings, open('ratings.pkl', 'wb'))

In [117]:
pickle.load(open('ratings.pkl', 'rb'))

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523
...,...,...,...,...
105334,668,142488,4.0,1451535844
105335,668,142507,3.5,1451535889
105336,668,143385,4.0,1446388585
105337,668,144976,2.5,1448656898
