In [4]:
!pip install fuzzywuzzy
!pip install python-Levenshtein



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process

In [2]:
user_ratings_df = pd.read_csv("ratings.csv")
user_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425942000.0
1,1,147,4.5,1425942000.0
2,1,858,5.0,1425942000.0
3,1,1221,5.0,1425942000.0
4,1,1246,5.0,1425942000.0


In [3]:
movie_metadata = pd.read_csv("movies_metadata.csv", low_memory=False)
movie_metadata = movie_metadata[['id', 'title', 'genres']]
movie_metadata['id'] = pd.to_numeric(movie_metadata['id'], errors='coerce')
movie_metadata.dropna(subset=['id', 'title'], inplace=True)
movie_metadata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45460 entries, 0 to 45465
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      45460 non-null  float64
 1   title   45460 non-null  object 
 2   genres  45460 non-null  object 
dtypes: float64(1), object(2)
memory usage: 1.4+ MB


In [4]:
merged_df = user_ratings_df.merge(movie_metadata, left_on='movieId', right_on='id')
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1315388 entries, 0 to 1315387
Data columns (total 7 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   userId     1315388 non-null  int64  
 1   movieId    1315388 non-null  int64  
 2   rating     1315387 non-null  float64
 3   timestamp  1315387 non-null  float64
 4   id         1315388 non-null  float64
 5   title      1315388 non-null  object 
 6   genres     1315388 non-null  object 
dtypes: float64(3), int64(2), object(2)
memory usage: 70.2+ MB


In [5]:
user_item_matrix = merged_df.pivot_table(index='userId', columns='title', values='rating').fillna(0)
user_item_matrix

title,!Women Art Revolution,$5 a Day,'Gator Bait,'R Xmas,'Twas the Night Before Christmas,(A)Sexual,...And God Created Woman,...And the Pursuit of Happiness,00 Schneider - Jagd auf Nihil Baxter,10 Items or Less,...,eXistenZ,xXx,¡A volar joven!,¡Three Amigos!,À nos amours,Åke and His World,Æon Flux,Ödipussi,Şaban Oğlu Şaban,Šíleně smutná princezna
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
cf_knn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10, n_jobs=-1)
cf_knn_model.fit(user_item_matrix.T)

In [7]:
def movie_recommender_engine(movie_name, matrix, cf_model, n_recs):
    # Extract input movie title
    movie_title = process.extractOne(movie_name, matrix.columns)[0]

    if pd.isna(movie_title):
        raise ValueError("Movie not found in the dataset.")

    # Check if movie_title is in the user-item matrix columns
    if movie_title not in matrix.columns:
        print(f"Movie Title '{movie_title}' not found in the user-item matrix.")
        print(f"Available movie titles in the matrix: {list(matrix.columns)}")
        raise ValueError(f"Movie Title '{movie_title}' not found in the user-item matrix.")

    # Get the index of the movie_title
    movie_index = matrix.columns.get_loc(movie_title)

    # Calculate neighbour distances
    distances, indices = cf_model.kneighbors(matrix[movie_title].values.reshape(1, -1), n_neighbors=n_recs+1)

    # Create a list of movie recommendations
    movie_rec_ids = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[1:]

    # List to store recommendations
    cf_recs = []
    for i in movie_rec_ids:
        rec_movie_title = matrix.columns[i[0]]
        cf_recs.append({'Title': rec_movie_title, 'Distance': i[1]})

    # Select top number of recommendations needed
    df = pd.DataFrame(cf_recs, index=range(1, n_recs+1))

    return df

In [8]:
movie_names = pd.DataFrame({
    'id': movie_metadata['id'],
    'title': movie_metadata['title']
})

In [9]:
n_recs = 10
try:
    recommendations = movie_recommender_engine('Batman', user_item_matrix, cf_knn_model, n_recs)
    print(recommendations)
except ValueError as e:
    print(e)

                                          Title  Distance
1                       Airplane II: The Sequel  0.663322
2                       The Blair Witch Project  0.688146
3                                          Rize  0.735964
4                              30 Days of Night  0.765798
5                                Be Kind Rewind  0.766829
6                The Bible: In the Beginning...  0.768532
7                                   Les Cousins  0.784461
8   The Strange Case of Dr. Jekyll and Mr. Hyde  0.788522
9                                     Red Beard  0.793102
10                                     The Game  0.793144
