In [None]:
pip install fuzzywuzzy



In [None]:
pip install surprise



In [None]:
import numpy as np
import numpy.ma as ma
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process



# Item based

In [None]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [None]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
movie_dataset = movies
merged_dataset = pd.merge(ratings, movie_dataset, how='inner', on='movieId')
merged_dataset.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [None]:
refined_dataset = merged_dataset.groupby(by=['userId','title'], as_index=False).agg({"rating":"mean"})
refined_dataset.head()

Unnamed: 0,userId,title,rating
0,1,"13th Warrior, The (1999)",4.0
1,1,20 Dates (1998),4.0
2,1,"Abyss, The (1989)",4.0
3,1,"Adventures of Robin Hood, The (1938)",5.0
4,1,Alice in Wonderland (1951),5.0


In [None]:
# get rating frequency
movies_count_df = pd.DataFrame(refined_dataset.groupby('title').size(), columns=['count'])
movies_count_df.head()

Unnamed: 0_level_0,count
title,Unnamed: 1_level_1
'71 (2014),1
'Hellboy': The Seeds of Creation (2004),1
'Round Midnight (1986),2
'Salem's Lot (2004),1
'Til There Was You (1997),2


In [None]:
# pivot and create movie-user matrix
movie_to_user_df = refined_dataset.pivot(
     index='title',
   columns='userId',
      values='rating').fillna(0)

movie_to_user_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# transform matrix to scipy sparse matrix
movie_to_user_sparse_df = csr_matrix(movie_to_user_df.values)
movie_to_user_sparse_df

<9719x610 sparse matrix of type '<class 'numpy.float64'>'
	with 100832 stored elements in Compressed Sparse Row format>

In [None]:
#Create dictionary with movie name as key
movies_list = list(movie_to_user_df.index)
movie_dict = {movie : index for index, movie in enumerate(movies_list)}
case_insensitive_movies_list = [i.lower() for i in movies_list]

In [None]:
knn_movie_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_movie_model.fit(movie_to_user_sparse_df)

In [None]:
def get_similar_movies(movie, n = 10):
  index = movie_dict[movie]
  knn_input = np.asarray([movie_to_user_df.values[index]])
  n = min(len(movies_list)-1,n)
  distances, indices = knn_movie_model.kneighbors(knn_input, n_neighbors=n+1)

  print("Top",n,"movies which are very much similar to the Movie-",movie, "are: ")
  print(" ")
  for i in range(1,len(distances[0])):
    print(movies_list[indices[0][i]])



In [None]:
from pprint import pprint
movie_name = '101 Dalmatians (1996)'
get_similar_movies(movie_name)

Top 10 movies which are very much similar to the Movie- 101 Dalmatians (1996) are: 
 
Matilda (1996)
Jack (1996)
Dumbo (1941)
George of the Jungle (1997)
Hunchback of Notre Dame, The (1996)
Space Jam (1996)
Peter Pan (1953)
Willy Wonka & the Chocolate Factory (1971)
James and the Giant Peach (1996)
How the Grinch Stole Christmas (a.k.a. The Grinch) (2000)


In [None]:
movie_name = 'Space Jam (1996)'
get_similar_movies(movie_name)

Top 10 movies which are very much similar to the Movie- Space Jam (1996) are: 
 
Hook (1991)
Matilda (1996)
Jingle All the Way (1996)
Dracula: Dead and Loving It (1995)
Panic Room (2002)
101 Dalmatians (1996)
Alice in Wonderland (1951)
Happy Gilmore (1996)
Rundown, The (2003)
Small Soldiers (1998)


In [None]:
from sklearn.metrics import mean_squared_error

def evaluate_movie_model(model, data):
    distances, indices = model.kneighbors(data, n_neighbors=10)
    mse_scores = []

    for i in range(data.shape[0]):
        # Extract the true ratings
        true_ratings = data[i, :].toarray().flatten()

        # Extract the predicted ratings
        predicted_ratings = data[indices[i], :].toarray()

        # Flatten the arrays
        true_ratings_flat = true_ratings.flatten()
        predicted_ratings_flat = predicted_ratings.flatten()

        # Keep non-zero true ratings
        non_zero_mask = true_ratings_flat.nonzero()
        true_ratings_nonzero = true_ratings_flat[non_zero_mask]
        predicted_ratings_nonzero = predicted_ratings_flat[non_zero_mask]

        # Calculate MSE
        mse_fold = mean_squared_error(true_ratings_nonzero, predicted_ratings_nonzero)
        mse_scores.append(mse_fold)

    # Calculate the overall MSE and RMSE
    overall_mse = np.mean(mse_scores)
    overall_rmse = np.sqrt(overall_mse)
    return overall_rmse

rmse_movie_model = evaluate_movie_model(knn_movie_model, movie_to_user_sparse_df)
print(f'Root Mean Squared Error for Movie Model: {rmse_movie_model}')


Root Mean Squared Error for Movie Model: 0.6598365697626742


In [None]:
movie_name = 'Matilda (1996)'
get_similar_movies(movie_name,15)

Top 15 movies which are very much similar to the Movie- Matilda (1996) are: 
 
Homeward Bound: The Incredible Journey (1993)
Space Jam (1996)
101 Dalmatians (1996)
Peter Pan (1953)
Homeward Bound II: Lost in San Francisco (1996)
James and the Giant Peach (1996)
Hercules (1997)
George of the Jungle (1997)
How the Grinch Stole Christmas (a.k.a. The Grinch) (2000)
Jungle Book, The (1967)
Robin Hood (1973)
Holes (2003)
Alice in Wonderland (1951)
Harriet the Spy (1996)
Oliver & Company (1988)


# User based

In [None]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
merged_dataset = pd.merge(ratings, movies, how='inner', on='movieId')
merged_dataset = merged_dataset.drop(columns = 'genres')
merged_dataset.head()


Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,1,4.0,964982703,Toy Story (1995)
1,5,1,4.0,847434962,Toy Story (1995)
2,7,1,4.5,1106635946,Toy Story (1995)
3,15,1,2.5,1510577970,Toy Story (1995)
4,17,1,4.5,1305696483,Toy Story (1995)


In [None]:

refined_dataset = merged_dataset.groupby(by=['userId','title'], as_index=False).agg({"rating":"mean"})
refined_dataset.head()

Unnamed: 0,userId,title,rating
0,1,"13th Warrior, The (1999)",4.0
1,1,20 Dates (1998),4.0
2,1,"Abyss, The (1989)",4.0
3,1,"Adventures of Robin Hood, The (1938)",5.0
4,1,Alice in Wonderland (1951),5.0


In [None]:
user_to_movie_df = refined_dataset.pivot(
    index='userId',
     columns='title',
      values='rating').fillna(0)

user_to_movie_df.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# transform matrix to scipy sparse matrix
user_to_movie_sparse_df = csr_matrix(user_to_movie_df.values)
user_to_movie_sparse_df


<610x9719 sparse matrix of type '<class 'numpy.float64'>'
	with 100832 stored elements in Compressed Sparse Row format>

In [None]:
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_to_movie_sparse_df)

In [None]:
from pprint import pprint

def recommender_system(user_id, n_similar_users, n_movies):

  def get_similar_users(user, n = 5):
    knn_input = np.asarray([user_to_movie_df.values[user-1]])
    distances, indices = knn_model.kneighbors(knn_input, n_neighbors=n+1)

    print("Top",n,"users who are very much similar to the User-",user, "are: ")
    print(" ")

    for i in range(1,len(distances[0])):
      print(i,". User:", indices[0][i]+1, "separated by distance of",distances[0][i])
    print("")
    return indices.flatten()[1:] + 1, distances.flatten()[1:]


  def filtered_movie_recommendations(n = 10):
    first_zero_index = np.where(mean_rating_list == 0)[0][-1]
    sortd_index = np.argsort(mean_rating_list)[::-1]
    sortd_index = sortd_index[:list(sortd_index).index(first_zero_index)]
    n = min(len(sortd_index),n)
    movies_watched = list(refined_dataset[refined_dataset['userId'] == user_id]['title'])
    filtered_movie_list = list(movies_list[sortd_index])
    count = 0
    final_movie_list = []
    for i in filtered_movie_list:
      if i not in movies_watched:
        count+=1
        final_movie_list.append(i)
      if count == n:
        break
    if count == 0:
      print("There are no movies left which are not seen by the input users and seen by similar users. May be increasing the number of similar users who are to be considered may give a chance of suggesting an unseen good movie.")
    else:
      pprint(final_movie_list)

  similar_user_list, distance_list = get_similar_users(user_id,n_similar_users)
  weightage_list = distance_list/np.sum(distance_list)
  mov_rtngs_sim_users = user_to_movie_df.values[similar_user_list]
  movies_list = user_to_movie_df.columns
  weightage_list = weightage_list[:,np.newaxis] + np.zeros(len(movies_list))
  new_rating_matrix = weightage_list*mov_rtngs_sim_users
  mean_rating_list = new_rating_matrix.sum(axis =0)
  print("")
  print("Movies recommended based on similar users are: ")
  print("")
  filtered_movie_recommendations(n_movies)

In [None]:
print("Enter user id")
user_id= int(input())
print("number of similar users to be considered")
sim_users = int(input())
print("Enter number of movies to be recommended:")
n_movies = int(input())
recommender_system(user_id,sim_users,n_movies)

Enter user id
67
number of similar users to be considered
10
Enter number of movies to be recommended:
10
Top 10 users who are very much similar to the User- 67 are: 
 
1 . User: 549 separated by distance of 0.5993372783144303
2 . User: 209 separated by distance of 0.6283838658335951
3 . User: 399 separated by distance of 0.6454884428968559
4 . User: 407 separated by distance of 0.6639150442082619
5 . User: 247 separated by distance of 0.6685795243057455
6 . User: 439 separated by distance of 0.6688514740625997
7 . User: 15 separated by distance of 0.671875234693037
8 . User: 471 separated by distance of 0.6792052357756951
9 . User: 434 separated by distance of 0.6836599209833294
10 . User: 254 separated by distance of 0.6893686606431346


Movies recommended based on similar users are: 

['Dark Knight, The (2008)',
 'Star Wars: Episode VI - Return of the Jedi (1983)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Raiders of th

### KNN using surprise library

In [None]:
pip install surprise



## Item Based



In [None]:
from surprise import Dataset, Reader
from surprise import KNNBasic
from surprise.model_selection import cross_validate

reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Train-test split
trainset = data.build_full_trainset()
testset = trainset.build_anti_testset()

# Train
knn_model = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
knn_model.fit(trainset)

# Evaluate the model using cross-validation
cross_validate(knn_model, data, measures=['RMSE'], cv=5, verbose=True)

# Predict similar movies
def get_surprise_similar_movies(movie_name, model, dataset, n=10):
    movie_id = dataset[dataset['title'] == movie_name]['movieId'].values[0]

    # Get top N recommendations for the movie
    top_n = model.get_neighbors(movie_id, k=n)

    print(f"Top {n} movies similar to {movie_name}:")
    for movie_id in top_n:
        movie_title = dataset[dataset['movieId'] == movie_id]['title'].values[0]
        print(movie_title)

# Example
movie_name = '101 Dalmatians (1996)'
get_surprise_similar_movies(movie_name, knn_model, movie_dataset, n=10)


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9721  0.9749  0.9720  0.9778  0.9815  0.9757  0.0036  
Fit time          20.35   8.43    8.43    7.95    8.24    10.68   4.84    
Test time         9.12    8.20    8.96    7.96    8.71    8.59    0.44    
Top 10 movies similar to 101 Dalmatians (1996):
Grumpier Old Men (1995)
Father of the Bride Part II (1995)
Tom and Huck (1995)
Dracula: Dead and Loving It (1995)
Cutthroat Island (1995)
Casi

## User Based

In [None]:
from surprise import Dataset, Reader
from surprise import KNNBasic
from surprise.model_selection import cross_validate

# Load data
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Train-test
trainset = data.build_full_trainset()
testset = trainset.build_anti_testset()

knn_model = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
knn_model.fit(trainset)

# Evaluate the model using cross-validation
cross_validate(knn_model, data, measures=['RMSE'], cv=5, verbose=True)

# Predict movies
def get_movies(movie_name, model, dataset, n=10):
    user_id_col = 'userId'
    movie_title_col = 'movieId'
    movie_id = dataset[dataset[movie_title_col] == movie_name][movie_title_col].values[0]

    # Get top N recommendations for the user
    user_id = dataset[dataset[movie_title_col] == movie_id][user_id_col].values[0]
    top_n = model.get_neighbors(user_id, k=n)

    print(f"Top {n} movies similar to {movie_name}:")
    for neighbor_id in top_n:
        neighbor_title = dataset[dataset[user_id_col] == neighbor_id][movie_title_col].values[0]
        print(neighbor_title)

movie_name = 1367
get_movies(movie_name, knn_model, ratings, n=10)


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9805  0.9740  0.9677  0.9708  0.9701  0.9726  0.0044  
Fit time          0.19    0.18    0.28    0.28    0.28    0.24    0.05    
Test time         2.77    1.51    1.85    1.73    1.90    1.95    0.43    
Top 10 movies similar to 1367:
107
10
150
10
1
107
1
1
1
1
