<h1>Movie Recommendation

In [46]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [47]:
movies_df = pd.read_csv('movies.dat', sep='::', engine='python', header=None, names=['MovieID', 'Title', 'Genres'], encoding="ISO-8859-1")
movies_df.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [48]:
ratings_df = pd.read_csv('ratings.dat', sep='::', engine='python', header=None, names=["UserID", "MovieID", "Rating", "Timestamp"])
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [49]:
users_df = pd.read_csv('users.dat', sep='::', engine='python', header=None, names=["UserID", "Gender", "Age", "Occupation", "Zip-code"])
users_df.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [50]:
movies_df.shape, ratings_df.shape, users_df.shape

((3883, 3), (1000209, 4), (6040, 5))

In [51]:
movies_df.isna().sum(), ratings_df.isna().sum(), users_df.isna().sum()

(MovieID    0
 Title      0
 Genres     0
 dtype: int64,
 UserID       0
 MovieID      0
 Rating       0
 Timestamp    0
 dtype: int64,
 UserID        0
 Gender        0
 Age           0
 Occupation    0
 Zip-code      0
 dtype: int64)

NO null values

In [52]:
merged_df = pd.merge(pd.merge(ratings_df, movies_df), users_df)

In [53]:
merged_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Gender,Age,Occupation,Zip-code
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067


In [54]:
merged_df['Genres'] = merged_df['Genres'].str.replace("|", " ")
merged_df.head()

  merged_df['Genres'] = merged_df['Genres'].str.replace("|", " ")


Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Gender,Age,Occupation,Zip-code
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation Children's Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation Children's Comedy,F,1,10,48067


In [55]:
merged_df["features"] = merged_df['Rating'].astype(str) + " " + merged_df['Genres'].astype(str) + " " + merged_df['Gender'].astype(str) + " " + merged_df['Age'].astype(str)

merged_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Gender,Age,Occupation,Zip-code,features
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067,5 Drama F 1
1,1,661,3,978302109,James and the Giant Peach (1996),Animation Children's Musical,F,1,10,48067,3 Animation Children's Musical F 1
2,1,914,3,978301968,My Fair Lady (1964),Musical Romance,F,1,10,48067,3 Musical Romance F 1
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067,4 Drama F 1
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation Children's Comedy,F,1,10,48067,5 Animation Children's Comedy F 1


Shuffeling the dataset and selecting first 20000 rows because the dataset is too big.

In [56]:
sample_df = merged_df.sample(n=20000, random_state=42)

# Reset the index 
sample_df = sample_df.reset_index(drop=True)

sample_df.head()


Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Gender,Age,Occupation,Zip-code,features
0,3466,1968,5,967161282,"Breakfast Club, The (1985)",Comedy Drama,M,25,14,94306,5 Comedy Drama M 25
1,5437,1610,4,961366105,"Hunt for Red October, The (1990)",Action Thriller,M,25,11,55426,4 Action Thriller M 25
2,770,445,3,976182730,Fatal Instinct (1993),Comedy,M,18,4,98499,3 Comedy M 18
3,889,2696,2,975250320,"Dinner Game, The (Le Dîner de cons) (1998)",Comedy,M,45,20,10024,2 Comedy M 45
4,2203,2013,5,974604523,"Poseidon Adventure, The (1972)",Action Adventure,M,45,12,53718,5 Action Adventure M 45


Creating Vector

In [57]:
vectorizer = CountVectorizer()
movie_vectors = vectorizer.fit_transform(sample_df['features'])
movie_vectors.toarray()

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=int64)

Preparing cosine similarity matrix

In [58]:
cos_sim = cosine_similarity(movie_vectors)

In [59]:
movie_name = "Toy Story (1995)"

movie_index = sample_df[sample_df.Title == movie_name].index[0]

movie_index

1128

In [60]:
similar_movies = list(enumerate(cos_sim[movie_index]))


Sorting similar movies in descending order

In [61]:
sorted_sim_mov = sorted(similar_movies, key = lambda x:x[1], reverse = True)

printing top 10 recommended/similar movies

In [62]:
movie_set = set()
for movie in sorted_sim_mov:
    movie_set.add(sample_df[sample_df.index == movie[0]]["Title"].values[0])
    
    if len(movie_set) >= 10:
        break

list(movie_set)

for movie in movie_set:
    print(movie)

American Tail, An (1986)
Toy Story 2 (1999)
Chicken Run (2000)
American Tail: Fievel Goes West, An (1991)
Aladdin (1992)
Aladdin and the King of Thieves (1996)
Rugrats Movie, The (1998)
Adventures of Rocky and Bullwinkle, The (2000)
Toy Story (1995)
Bug's Life, A (1998)
