## Movie Recommendation System

In [47]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster  import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [48]:
# loading the dataset
ratings = pd.read_csv("ratings.csv")
ratings = ratings.head(1500)
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
1495,9,434,5.0,859383142
1496,9,435,3.0,859384480
1497,9,442,3.0,859383979
1498,9,445,4.0,859384328


In [49]:
# creating the user interaction matrix
user_item_matrix = ratings.pivot_table(index="userId", columns= "movieId", fill_value=0)
user_item_matrix

Unnamed: 0_level_0,rating,rating,rating,rating,rating,rating,rating,rating,rating,rating,...,timestamp,timestamp,timestamp,timestamp,timestamp,timestamp,timestamp,timestamp,timestamp,timestamp
movieId,1,2,3,6,7,10,11,16,17,18,...,182715,182823,187541,187593,189333,195159,200818,200838,203375,203519
userId,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1566090000.0,1566091000.0,1566090000.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1573943000.0,0.0,1573945000.0,1573938000.0,1573941000.0,1573937000.0,1573939000.0,1573940000.0,1573939000.0,1573945000.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,4.0,0.0,4.0,3.0,1.0,4.0,3.0,4.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,5.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# standardardisng the user_interaction_matrix
scaler = StandardScaler()
user_item_matrix_standardized = scaler.fit_transform(user_item_matrix)
user_item_matrix_standardized

array([[-1.10361747, -0.35355339, -0.35355339, ..., -0.35355339,
        -0.35355339, -0.35355339],
       [ 0.77551498, -0.35355339, -0.35355339, ..., -0.35355339,
        -0.35355339, -0.35355339],
       [ 1.04396248, -0.35355339, -0.35355339, ..., -0.35355339,
        -0.35355339, -0.35355339],
       ...,
       [-1.10361747, -0.35355339, -0.35355339, ..., -0.35355339,
        -0.35355339, -0.35355339],
       [ 1.04396248, -0.35355339,  2.82842712, ..., -0.35355339,
        -0.35355339, -0.35355339],
       [-1.10361747,  2.82842712, -0.35355339, ..., -0.35355339,
        -0.35355339, -0.35355339]])

In [51]:
# performing k means clustering
kmeans = KMeans(n_clusters=5, random_state=42, n_init= 10)
user_clusters = kmeans.fit_predict(user_item_matrix_standardized)
user_clusters

array([1, 2, 3, 0, 1, 1, 1, 4, 1], dtype=int32)

In [52]:
# user Clusters
user_clusters[1-1]

1

In [54]:
# finding the similar users 
user_cluster = user_clusters[0-1]
similar_users = np.where(user_clusters==user_cluster)[0]
similar_users

# calcluating the similarities  using the function cosine similarity
cluster_centers = kmeans.cluster_centers_
user_similarity = cosine_similarity(cluster_centers)


In [56]:
# function to recommend  a movie for a given user based on the cluster similarity
def recommend_movies_cluster(user_id,user_similarity,user_clusters,n=5):
    user_cluster= user_clusters[user_id -1]
    similar_users =np.where(user_clusters == user_cluster)[0]

    #exclude the user itself from recommendations
    similar_users = similar_users[similar_users != (user_id -1)]
    print("calclulating the similar users by avoiding the same user itself", similar_users)

    # calculating the mean ratings of the similar users 
    mean_ratings =  user_item_matrix.iloc[similar_users].mean(axis=0)
    print("calculating the mean ratings of the similar users", mean_ratings)

    # finding the unrated movies by the user 
    unrated_movies = (user_item_matrix.iloc[user_id -1] ==0)

    # recommended movies based on movie ratings
    recommendations =  mean_ratings[unrated_movies].sort_values(ascending= False).index[:n]
    return recommendations




In [57]:
# for example recommending a  movie to a user 1 based on the K-Means clustering
user_id_to_recommend = 1
recommendations = recommend_movies_cluster(user_id_to_recommend,user_similarity, user_clusters,n=10)
print("top 5 movie recommendations for user:{user_id_to_recommend} based on k means clustering",recommendations)




calclulating the similar users by avoiding the same user itself [4 5 6 8]
calculating the mean ratings of the similar users            movieId
rating     1          1.00
           2          1.25
           3          0.00
           6          0.00
           7          0.00
                      ... 
timestamp  195159     0.00
           200818     0.00
           200838     0.00
           203375     0.00
           203519     0.00
Length: 2254, dtype: float64
top 5 movie recommendations for user:{user_id_to_recommend} based on k means clustering MultiIndex([('timestamp',  260),
            ('timestamp',  593),
            ('timestamp',  150),
            ('timestamp',  161),
            ('timestamp', 1097),
            ('timestamp', 1196),
            ('timestamp',  608),
            ('timestamp',  527),
            ('timestamp',   39),
            ('timestamp',  277)],
           names=[None, 'movieId'])


## Content Based Recommendations


In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

movies = pd.read_csv("movies.csv")
movies = movies.head(1500)
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
1495,1550,Trial and Error (1997),Comedy|Romance
1496,1551,Buddy (1997),Adventure|Children|Drama
1497,1552,Con Air (1997),Action|Adventure|Thriller
1498,1553,Late Bloomers (1996),Comedy


In [59]:
# Create  a TF-IDF  vectorizer for a movie genre
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [60]:
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['genres'])
#performing k-means clustering on the tfidf_matrix
kmeans = KMeans(n_clusters=5 ,random_state= 42,n_init=10)
movie_clusters = kmeans.fit_predict(tfidf_matrix)
movie_clusters

array([4, 4, 1, ..., 4, 0, 1], dtype=int32)

In [61]:
# calculating the cosine similarity based on the cluster centroids 
cluster_centers = kmeans.cluster_centers_
movie_similarity = cosine_similarity(cluster_centers)
movie_similarity

array([[1.        , 0.39241473, 0.22606414, 0.01860923, 0.23312016],
       [0.39241473, 1.        , 0.32810223, 0.01937624, 0.245089  ],
       [0.22606414, 0.32810223, 1.        , 0.01920008, 0.32694568],
       [0.01860923, 0.01937624, 0.01920008, 1.        , 0.01281024],
       [0.23312016, 0.245089  , 0.32694568, 0.01281024, 1.        ]])

In [62]:
# print the cluster labels for all movies 
print("movie clusters :",movie_clusters)

# print the size of the movie cluster array
print("size of the movie cluster array :",movie_clusters.shape)

# to know the similar movies to the recommend the movie 
movie_cluster = movie_clusters[1495]
similar_movies = np.where(movie_clusters == movie_cluster)[0]
similar_movies

movie clusters : [4 4 1 ... 4 0 1]
size of the movie cluster array : (1500,)


array([   2,    3,    6,   10,   16,   24,   27,   34,   38,   45,   48,
         51,   57,   63,   67,   73,   80,   82,   84,   91,   92,   99,
        103,  116,  120,  121,  127,  129,  138,  149,  166,  177,  178,
        180,  184,  189,  193,  197,  200,  205,  213,  219,  221,  229,
        233,  234,  236,  246,  249,  262,  263,  267,  273,  278,  284,
        286,  290,  291,  297,  298,  334,  342,  346,  351,  352,  354,
        356,  367,  373,  376,  384,  390,  412,  420,  435,  439,  441,
        442,  446,  448,  456,  463,  464,  483,  491,  492,  493,  494,
        501,  504,  508,  510,  529,  534,  537,  538,  544,  546,  556,
        559,  564,  576,  579,  589,  597,  605,  610,  614,  620,  628,
        643,  655,  673,  675,  676,  682,  693,  700,  737,  748,  763,
        786,  787,  788,  791,  803,  804,  807,  820,  834,  868,  871,
        873,  876,  877,  878,  879,  881,  884,  886,  888,  891,  893,
        894,  895,  899,  901,  904,  911,  915,  9

In [74]:
# creating a function to recommend the movie
def recommend_movies_content(movie_title,movie_similarity,movie_clusters,movies_df, n=5):
    # finding the movie in the dataset
    movie_row = movies_df[movies_df['title']==movie_title ]

    if not movie_row.empty:
        movie_idx = movie_row.index[0]
        print("Movie Index :",movie_idx)
        # checking that if the movie is within the valid range or not
        if(movie_idx<len(movie_similarity)):
            movie_cluster = movie_clusters[movie_idx]
            print("Movie clusters :",movie_cluster)

            # finding the movie present in the same cluster or not
            similar_movies = np.where(movie_clusters= movie_cluster)[0]

            # removing the same movie from the recommendation list (excluding the watched movie)
            similar_movies = similar_movies[similar_movies!=movie_idx]
            print("movie after removing from the recommendation list :",similar_movies)

            # recommending the movies based on the mean similarity
            recommendations_idx= np.sort(similar_movies)[:5]
            recommendations - movies['title'].iloc[recommendations_idx].tolist()
            return recommendations
        else:
            print(f"movie index '{movie_idx} is out of bounds")
            return []
    else:
        print(f"Movie '{movie_title}' not found in the recommendation list")
        return[]


In [76]:
# recommending a movie to the customer based on the index of the movie

'''for example recomending a movie similar to 'Trial and Error' (1997) based on content'''

movie_title = "Toy Story (1995)"
content_recommendations = recommend_movies_content(movie_title, movie_similarity, movie_clusters, movies, n=5 )
print(f"Top 5 content based recommendations for '{movie_title}' :")
for movie in content_recommendations:
    print("recommended movies are:",movie)

Movie Index : 0
Movie clusters : 4


TypeError: where() got an unexpected keyword argument 'movie_clusters'