In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import metrics,preprocessing
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import tensorflow.compat.v1 as tf
import keras

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Activation, Dense, Dropout
from tensorflow.keras.layers import BatchNormalization, Input, Lambda
from tensorflow.keras.layers import Embedding, Flatten, dot
from tensorflow.keras import regularizers
from tensorflow.keras.losses import mse, binary_crossentropy
from tensorflow.keras import models, layers, utils




In [2]:
df = pd.read_csv('ratings.csv')

In [3]:
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [4]:
movies = pd.read_csv('movies.csv')
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
tags = pd.read_csv('tags.csv')
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


# FIXING MOVIE ID

In [6]:
oldmovieid = movies['movieId']
oldmovieid

0            1
1            2
2            3
3            4
4            5
         ...  
9737    193581
9738    193583
9739    193585
9740    193587
9741    193609
Name: movieId, Length: 9742, dtype: int64

In [7]:
for i in range(9742):
 df['movieId'].replace(oldmovieid[i],i,inplace=True)
 tags['movieId'].replace(oldmovieid[i],i,inplace=True)

In [8]:
for i in range(9742):
 movies.iloc[i,0]=i

In [9]:
movies

Unnamed: 0,movieId,title,genres
0,0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,Jumanji (1995),Adventure|Children|Fantasy
2,2,Grumpier Old Men (1995),Comedy|Romance
3,3,Waiting to Exhale (1995),Comedy|Drama|Romance
4,4,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,9737,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,9738,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,9739,Flint (2017),Drama
9740,9740,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [10]:
df

Unnamed: 0,userId,movieId,rating
0,1,0,4.0
1,1,2,4.0
2,1,5,4.0
3,1,43,5.0
4,1,46,5.0
...,...,...,...
100831,610,9434,4.0
100832,610,9461,5.0
100833,610,9462,5.0
100834,610,9463,5.0


In [11]:
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,6801,funny,1445714994
1,2,6801,Highly quotable,1445714996
2,2,6801,will ferrell,1445714992
3,2,7697,Boxing story,1445715207
4,2,7697,MMA,1445715200
...,...,...,...,...
3678,606,4925,for katie,1171234019
3679,606,5062,austere,1173392334
3680,610,2452,gun fu,1493843984
3681,610,2452,heroic bloodshed,1493843978


# Collaborative filtering with SVD

In [12]:
#SVD
def create_sparse_matrix(df):
    user_ids = df['userId'].values
    movie_ids = df['movieId'].values
    ratings = df['rating'].values
    return csr_matrix((ratings, (user_ids, movie_ids)), shape=(user_ids.max()+1 , movie_ids.max() + 1))

def apply_svd_and_reconstruct(matrix, k):
    U, sigma, Vt = svds(matrix.astype(float), k=k)
    sigma = np.diag(sigma)
    return np.dot(np.dot(U, sigma), Vt)

def find_top_n_ratings(matrix, n=3, num_users=5):
    df = pd.DataFrame(matrix, columns=[f"Movie {i}" for i in range(matrix.shape[1])])
    return df.head(num_users).apply(lambda row: row.nlargest(n).index.tolist(), axis=1)



In [13]:
sparse_matrix = create_sparse_matrix(df)
fullratings = apply_svd_and_reconstruct(sparse_matrix, k=5)
top_ratings = find_top_n_ratings(fullratings)

print(top_ratings)

0    [Movie 2248, Movie 2636, Movie 2193]
1       [Movie 224, Movie 898, Movie 507]
2    [Movie 2226, Movie 4800, Movie 7372]
3       [Movie 902, Movie 898, Movie 939]
4      [Movie 257, Movie 520, Movie 2145]
dtype: object


In [14]:
def recommend_items_svd(user_id, original_df, reconstructed_df, num_recommendations=5):
    # Identify items not rated by the user
    unrated_items = original_df.columns[original_df.loc[user_id] == 0]


    # Predict ratings for unrated items
    predicted_ratings = reconstructed_df.loc[user_id, unrated_items]

    # Recommend items with highest predicted ratings
    recommended_items = predicted_ratings.nlargest(num_recommendations).index.tolist()
    for i in range(9742):
        recommended_items = [movies.iloc[i,1] if item == i else item for item in recommended_items]
    return recommended_items


In [15]:
original_ratings =  sparse_matrix.todense()
original_ratings = pd.DataFrame(original_ratings, index=range(0,611), columns=range(0,9742))
original_ratings.head()

fullratings = pd.DataFrame(fullratings, index=range(0,611), columns=range(0,9742))
fullratings.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
0,-2.192244e-15,-1.001836e-15,-2.177073e-16,-2.579358e-16,-1.030332e-15,1.630246e-16,-1.49637e-15,-6.003771e-17,2.362449e-17,-2.923507e-16,...,1.3319960000000001e-17,1.141711e-17,1.522281e-17,1.522281e-17,1.3319960000000001e-17,1.522281e-17,1.3319960000000001e-17,1.3319960000000001e-17,1.3319960000000001e-17,-2.044485e-17
1,2.474364,1.016143,0.8699298,0.01975528,0.2147739,2.269497,0.294799,0.07672353,0.1999519,2.212759,...,-0.004787858,-0.004103878,-0.005471838,-0.005471838,-0.004787858,-0.005471838,-0.004787858,-0.004787858,-0.004787858,-0.02029045
2,0.180673,0.04978383,-0.1214531,-0.009824506,-0.04439136,0.04788825,-0.09038569,-0.02712808,-0.02060954,-0.0356536,...,0.002659166,0.002279285,0.003039047,0.003039047,0.002659166,0.003039047,0.002659166,0.002659166,0.002659166,0.008376285
3,0.02392089,0.004451627,0.03012017,-0.004261649,-0.01310024,0.07470179,-0.01549075,0.002610902,0.007413554,0.06074872,...,-6.630361e-05,-5.683167e-05,-7.577556e-05,-7.577556e-05,-6.630361e-05,-7.577556e-05,-6.630361e-05,-6.630361e-05,-6.630361e-05,-0.001580653
4,1.088776,0.04771032,0.2336066,0.02880778,0.0707791,0.8268457,0.3007374,-0.0004117944,0.009759105,0.2705673,...,-0.00580153,-0.00497274,-0.00663032,-0.00663032,-0.00580153,-0.00663032,-0.00580153,-0.00580153,-0.00580153,-0.004923501


In [16]:
i=1 #userid
recommendations_svd = recommend_items_svd(i, original_ratings, fullratings)
print(recommendations_svd)


['Terminator 2: Judgment Day (1991)', 'Aliens (1986)', 'Godfather, The (1972)', 'Die Hard (1988)', 'Blade Runner (1982)']


# Content based filtering (Using genre)

In [17]:
allgenres=['Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir',
           'Horror','IMAX','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western','(no genres listed)']

In [18]:
a = np.zeros((9742,20))


In [19]:
for i in range(9742):
 for j in  range(20) :
  if allgenres[j] in movies.iloc[i,2]:
     a[i,j]=1
  else:
     a[i,j]=0

In [20]:
genre_matrix = pd.DataFrame(a,columns=['Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir',
           'Horror','IMAX','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western','(no genres listed)'])

In [21]:
#1 if the movie belongs to the genre
genre_matrix

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9738,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9740,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Compute the cosine similarity matrix
similarity = cosine_similarity(genre_matrix)
similarity

array([[1.        , 0.77459667, 0.31622777, ..., 0.        , 0.31622777,
        0.4472136 ],
       [0.77459667, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 1.        , ..., 0.        , 0.        ,
        0.70710678],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.4472136 , 0.        , 0.70710678, ..., 0.        , 0.        ,
        1.        ]])

In [23]:
# Function to get the recommended movies
def get_recommendations(title, top_n=5):
    # Find the index of the movie with the given title
    idx = movies[movies['title'] == title].index[0]
    
    # Get the cosine similarity scores for the movie
    similarity_scores = list(enumerate(similarity[idx]))
    
    # Sort the similarity scores in descending order
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Get the top_n movie indices
    movie_indices = [i[0] for i in similarity_scores[1:top_n+1]]
    
    # Return the top_n most similar movies
    return movies['title'].iloc[movie_indices]
     

In [24]:
# Ask the user for the movie name
title = input("Enter the title of your favorite movie: ")


# Get the recommended movies
print("Top 5 similar movies:")
print(get_recommendations(title))

Enter the title of your favorite movie: Jumanji (1995)
Top 5 similar movies:
53             Indian in the Cupboard, The (1995)
109             NeverEnding Story III, The (1994)
767               Escape to Witch Mountain (1975)
1514    Darby O'Gill and the Little People (1959)
1556                          Return to Oz (1985)
Name: title, dtype: object


# Testing and comparing

In [25]:
# Ask the user for the movie name
title = input("Enter the title of your favorite movie: ")


# Get the recommended movies
print("Top 5 similar movies:")
print(get_recommendations(title))


Enter the title of your favorite movie: Toy Story (1995)
Top 5 similar movies:
1706                                       Antz (1998)
2355                                Toy Story 2 (1999)
2809    Adventures of Rocky and Bullwinkle, The (2000)
3000                  Emperor's New Groove, The (2000)
3568                             Monsters, Inc. (2001)
Name: title, dtype: object


Top 5 according to imdb (that are also in this database):Toy story 3, Toy story 2,Monsters Inc,Up,Finding Nemo (2/5 hits)

In [26]:
# Ask the user for the movie name
title = input("Enter the title of your favorite movie: ")


# Get the recommended movies
print("Top 5 similar movies:")
print(get_recommendations(title))


Enter the title of your favorite movie: Jingle All the Way (1996)
Top 5 similar movies:
49                   Big Green, The (1995)
78                Dunston Checks In (1996)
214    Heavyweights (Heavy Weights) (1995)
332                     Richie Rich (1994)
497             Little Rascals, The (1994)
Name: title, dtype: object


Top 5 according to imdb (that are also in this database):Christmas with the kranks,Jack Frost,Elf,Home Alone 2: Lost in New York,Miracle on 34th Street (0/5 hits)

# Content based filtering (Using tag and genre)

In [27]:
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,6801,funny,1445714994
1,2,6801,Highly quotable,1445714996
2,2,6801,will ferrell,1445714992
3,2,7697,Boxing story,1445715207
4,2,7697,MMA,1445715200
...,...,...,...,...
3678,606,4925,for katie,1171234019
3679,606,5062,austere,1173392334
3680,610,2452,gun fu,1493843984
3681,610,2452,heroic bloodshed,1493843978


In [28]:
# Group the tags dataframe by `movieId` and join the `tag` values in the `tag` column separated by a comma
grouped_tags = tags.groupby('movieId')['tag'].apply(lambda x: ', '.join(x)).reset_index()

# Merge the `movies` dataframe with the `grouped_tags` dataframe
final_df = movies.merge(grouped_tags, on='movieId', how='left')

# Select only the desired columns in the final dataframe
final_df = final_df[['movieId', 'title', 'genres', 'tag']]
final_df

Unnamed: 0,movieId,title,genres,tag
0,0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"pixar, pixar, fun"
1,1,Jumanji (1995),Adventure|Children|Fantasy,"fantasy, magic board game, Robin Williams, game"
2,2,Grumpier Old Men (1995),Comedy|Romance,"moldy, old"
3,3,Waiting to Exhale (1995),Comedy|Drama|Romance,
4,4,Father of the Bride Part II (1995),Comedy,"pregnancy, remake"
...,...,...,...,...
9737,9737,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,
9738,9738,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,
9739,9739,Flint (2017),Drama,
9740,9740,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,


In [29]:
def add_genres_to_tag(row):
    if pd.isnull(row['tag']):
        return row['genres'].replace("|", ",")
    else:
        return row['tag'] + "," + row['genres'].replace("|", ",")

final_df['tag'] = final_df.apply(lambda row: add_genres_to_tag(row), axis=1)
     

In [30]:
final_df

Unnamed: 0,movieId,title,genres,tag
0,0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"pixar, pixar, fun,Adventure,Animation,Children..."
1,1,Jumanji (1995),Adventure|Children|Fantasy,"fantasy, magic board game, Robin Williams, gam..."
2,2,Grumpier Old Men (1995),Comedy|Romance,"moldy, old,Comedy,Romance"
3,3,Waiting to Exhale (1995),Comedy|Drama|Romance,"Comedy,Drama,Romance"
4,4,Father of the Bride Part II (1995),Comedy,"pregnancy, remake,Comedy"
...,...,...,...,...
9737,9737,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,"Action,Animation,Comedy,Fantasy"
9738,9738,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,"Animation,Comedy,Fantasy"
9739,9739,Flint (2017),Drama,Drama
9740,9740,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,"Action,Animation"


In [31]:
# Extract the movie titles and tags into separate lists
titles = final_df['title'].tolist()
alltags = final_df['tag'].str.strip().str.split(",").tolist()


# Create a bag of words representation of the movie tags
def create_bow(tag_list):
    bow = {}
    if not isinstance(tag_list, float):
        for tag in tag_list:
            bow[tag] = 1
    return bow
     


In [32]:
alltags

[['pixar',
  ' pixar',
  ' fun',
  'Adventure',
  'Animation',
  'Children',
  'Comedy',
  'Fantasy'],
 ['fantasy',
  ' magic board game',
  ' Robin Williams',
  ' game',
  'Adventure',
  'Children',
  'Fantasy'],
 ['moldy', ' old', 'Comedy', 'Romance'],
 ['Comedy', 'Drama', 'Romance'],
 ['pregnancy', ' remake', 'Comedy'],
 ['Action', 'Crime', 'Thriller'],
 ['remake', 'Comedy', 'Romance'],
 ['Adventure', 'Children'],
 ['Action'],
 ['Action', 'Adventure', 'Thriller'],
 ['politics', ' president', 'Comedy', 'Drama', 'Romance'],
 ['Comedy', 'Horror'],
 ['Adventure', 'Animation', 'Children'],
 ['politics', ' president', 'Drama'],
 ['Action', 'Adventure', 'Romance'],
 ['Mafia', 'Crime', 'Drama'],
 ['Jane Austen', 'Drama', 'Romance'],
 ['Comedy'],
 ['Comedy'],
 ['Action', 'Comedy', 'Crime', 'Drama', 'Thriller'],
 ['Hollywood', 'Comedy', 'Crime', 'Thriller'],
 ['serial killer', 'Crime', 'Drama', 'Horror', 'Mystery', 'Thriller'],
 ['Action', 'Crime', 'Thriller'],
 ['Drama', 'Sci-Fi'],
 ['alcoho

In [33]:
# Create a list of bags of words representations of the movie tags
bags_of_words = [create_bow(movie_tags) for movie_tags in alltags]

In [34]:
# Create a dataframe to store the bags of words representation of the movie tags
tag_df = pd.DataFrame(bags_of_words, index=titles).fillna(0)

In [35]:
tag_df

Unnamed: 0,pixar,pixar.1,fun,Adventure,Animation,Children,Comedy,Fantasy,fantasy,magic board game,...,Dwayne Johnson,bad music,Rachel McAdams,Alicia Vikander,video game adaptation,Josh Brolin,Emilia Clarke,star wars,gintama,remaster
Toy Story (1995),1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jumanji (1995),0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Waiting to Exhale (1995),0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Black Butler: Book of the Atlantic (2017),0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
No Game No Life: Zero (2017),0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Flint (2017),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bungo Stray Dogs: Dead Apple (2018),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Calculate the cosine similarity matrix between the movies
cosine_similarity = cosine_similarity(tag_df)

# Create a dataframe with the cosine similarity scores
similarity_df = pd.DataFrame(cosine_similarity, index=tag_df.index, columns=tag_df.index)

In [37]:
# Ask the user for a movie they like
movie = input('Enter a movie you like: ')

# Find the index of the movie in the similarity dataframe
movie_index = similarity_df.index.get_loc(movie)

# Get the top 5 most similar movies to the movie
top_5 = similarity_df.iloc[movie_index].sort_values(ascending=False)[1:6]

# Print the top 5 most similar movies to the movie
print(f'\nTop 5 similar movies to {movie}:')
print(top_5)
     

Enter a movie you like: Jumanji (1995)

Top 5 similar movies to Jumanji (1995):
Santa Claus: The Movie (1985)                         0.654654
Indian in the Cupboard, The (1995)                    0.654654
NeverEnding Story II: The Next Chapter, The (1990)    0.654654
Percy Jackson: Sea of Monsters (2013)                 0.654654
Alice in Wonderland (1933)                            0.654654
Name: Jumanji (1995), dtype: float64


# Testing and comparing

In [38]:
# Ask the user for a movie they like
movie = input('Enter a movie you like: ')

# Find the index of the movie in the similarity dataframe
movie_index = similarity_df.index.get_loc(movie)

# Get the top 5 most similar movies to the movie
top_5 = similarity_df.iloc[movie_index].sort_values(ascending=False)[1:6]

# Print the top 5 most similar movies to the movie
print(f'\nTop 5 similar movies to {movie}:')
print(top_5)
     

Enter a movie you like: Toy Story (1995)

Top 5 similar movies to Toy Story (1995):
Wild, The (2006)                                  0.790569
Antz (1998)                                       0.790569
Adventures of Rocky and Bullwinkle, The (2000)    0.790569
Monsters, Inc. (2001)                             0.790569
Moana (2016)                                      0.790569
Name: Toy Story (1995), dtype: float64


Top 5 according to imdb (that are also in this database):Toy story 3, Toy story 2,Monsters Inc,Up,Finding Nemo (1/5 hits)

In [39]:
# Ask the user for a movie they like
movie = input('Enter a movie you like: ')

# Find the index of the movie in the similarity dataframe
movie_index = similarity_df.index.get_loc(movie)

# Get the top 5 most similar movies to the movie
top_5 = similarity_df.iloc[movie_index].sort_values(ascending=False)[1:6]

# Print the top 5 most similar movies to the movie
print(f'\nTop 5 similar movies to {movie}:')
print(top_5)
     

Enter a movie you like: Jingle All the Way (1996)

Top 5 similar movies to Jingle All the Way (1996):
Big Green, The (1995)               1.0
Christmas with the Kranks (2004)    1.0
Home Alone 3 (1997)                 1.0
Mouse Hunt (1997)                   1.0
Problem Child (1990)                1.0
Name: Jingle All the Way (1996), dtype: float64


Top 5 according to imdb (that are also in this database):Christmas with the kranks,Jack Frost,Elf,Home Alone 2: Lost in New York,Miracle on 34th Street (1/5 hits)

# Neural network

In [40]:
ratings_matrix = df.pivot_table(index='userId',columns='movieId',values='rating')
missing_cols = list(set(movies.index) - set(ratings_matrix.columns))
for col in missing_cols:
    ratings_matrix[col] = np.nan
ratings_matrix = ratings_matrix[sorted(ratings_matrix.columns)]
ratings_matrix

movieId,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [41]:
full_df = ratings_matrix.stack(dropna=False).reset_index().rename(columns={0:"rating"})
full_df

Unnamed: 0,userId,movieId,rating
0,1,0,4.0
1,1,1,
2,1,2,4.0
3,1,3,
4,1,4,
...,...,...,...
5942615,610,9737,
5942616,610,9738,
5942617,610,9739,
5942618,610,9740,


In [42]:
genre_matrix


Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9738,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9740,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
id = 1

In [44]:
genre_ratings = genre_matrix.copy()

In [45]:
genre_ratings['rating'] = full_df[full_df['userId']==id]['rating']


In [46]:
genre_ratings

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),rating
0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
9738,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
9739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
9740,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [47]:
train = genre_ratings[pd.notnull(genre_ratings['rating'])]

In [48]:
train

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),rating
0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0
5,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4.0
43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0
46,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2802,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4.0
2836,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0
2847,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2991,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0


In [49]:
test = genre_ratings[pd.isnull(genre_ratings['rating'])]
test

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),rating
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,
7,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
9738,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
9739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
9740,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [50]:
train_features = train.copy()
test_features = test.copy()

train_labels = train_features.pop('rating')
test_labels = test_features.pop('rating')

In [51]:
train_features

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
46,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2802,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2836,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2847,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2991,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [52]:
train_labels

0       4.0
2       4.0
5       4.0
43      5.0
46      5.0
       ... 
2802    4.0
2836    5.0
2847    4.0
2991    4.0
3673    5.0
Name: rating, Length: 232, dtype: float64

In [53]:
normalizer = tf.keras.layers.Normalization(axis=-1)




In [54]:
normalizer.adapt(np.array(train_features))

In [55]:
def build_and_compile_model(norm):
  model = keras.Sequential([
      norm,
      layers.Dense(64, activation='relu'),
      layers.Dense(64, activation='relu'),
      layers.Dense(1)
  ])

  model.compile(loss='mean_absolute_error',
                optimizer=tf.keras.optimizers.Adam(0.001))
  return model

In [56]:
dnn_model = build_and_compile_model(normalizer)
dnn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization (Normalizati  (None, 20)                41        
 on)                                                             
                                                                 
 dense (Dense)               (None, 64)                1344      
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 5610 (21.92 KB)
Trainable params: 5569 (21.75 KB)
Non-trainable params: 41 (168.00 Byte)
_________________________________________________________________


In [57]:
dnn_model.fit(
    train_features,
    train_labels,
    validation_split=0.2,
    verbose=0, epochs=100)




<keras.src.callbacks.History at 0x1cf679a7e90>

In [58]:
test_predictions = dnn_model.predict(test_features).flatten()
test_predictions



array([3.5824573, 4.393328 , 4.6054077, ..., 5.068549 , 5.0694   ,
       4.6054077], dtype=float32)

In [59]:
test['predictions'] = test_predictions
test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['predictions'] = test_predictions


Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed),rating,predictions
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,3.582457
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,4.393328
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,4.605408
6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,4.099924
7,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,4.573824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,4.585506
9738,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,3.890144
9739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,5.068549
9740,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,5.069400


In [60]:
a = test.sort_values("predictions", ascending=False)[:5].index

In [61]:
recommendations =  movies['title'].iloc[a]

In [62]:
print("Top 5 recommendations for User ",id,":")
print(recommendations)

Top 5 recommendations for User  1 :
8836               Let It Be Me (1995)
9248            Noin 7 veljestä (1968)
9033                  Guardians (2016)
9053                 Green Room (2015)
9070    The Brand New Testament (2015)
Name: title, dtype: object
