<h1>Movie Recommendation

In [20]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies_df = pd.read_csv('movies.dat', sep='::', engine='python', header=None, names=['MovieID', 'Title', 'Genres'], encoding="ISO-8859-1")
movies_df.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings_df = pd.read_csv('ratings.dat', sep='::', engine='python', header=None, names=["UserID", "MovieID", "Rating", "Timestamp"])
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
users_df = pd.read_csv('users.dat', sep='::', engine='python', header=None, names=["UserID", "Gender", "Age", "Occupation", "Zip-code"])
users_df.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [5]:
movies_df.shape, ratings_df.shape, users_df.shape

((3883, 3), (1000209, 4), (6040, 5))

In [6]:
movies_df.isna().sum(), ratings_df.isna().sum(), users_df.isna().sum()

(MovieID    0
 Title      0
 Genres     0
 dtype: int64,
 UserID       0
 MovieID      0
 Rating       0
 Timestamp    0
 dtype: int64,
 UserID        0
 Gender        0
 Age           0
 Occupation    0
 Zip-code      0
 dtype: int64)

NO null values

In [10]:
ratings_df.sort_values(by="")

Unnamed: 0,UserID,MovieID,Rating,Timestamp
427702,2599,1,4,973796689
1966,18,1,4,978154768
683688,4089,1,5,965428947
596207,3626,1,4,966594018
465902,2873,1,5,972784317
...,...,...,...,...
84701,551,3952,4,976067330
253845,1544,3952,4,974742620
180689,1130,3952,3,975593522
35180,238,3952,4,976760112


In [11]:
merged_df = pd.merge(pd.merge(ratings_df, movies_df), users_df)

In [12]:
merged_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Gender,Age,Occupation,Zip-code
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067


In [21]:
merged_df['Genres'] = merged_df['Genres'].str.replace("|", " ")
merged_df.head()

  merged_df['Genres'] = merged_df['Genres'].str.replace("|", " ")


Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Gender,Age,Occupation,Zip-code
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation Children's Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation Children's Comedy,F,1,10,48067


In [25]:
merged_df["features"] = merged_df['Rating'].astype(str) + " " + merged_df['Genres'].astype(str) + " " + merged_df['Gender'].astype(str) + " " + merged_df['Age'].astype(str)

merged_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Gender,Age,Occupation,Zip-code,features
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067,5 Drama F 1
1,1,661,3,978302109,James and the Giant Peach (1996),Animation Children's Musical,F,1,10,48067,3 Animation Children's Musical F 1
2,1,914,3,978301968,My Fair Lady (1964),Musical Romance,F,1,10,48067,3 Musical Romance F 1
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067,4 Drama F 1
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation Children's Comedy,F,1,10,48067,5 Animation Children's Comedy F 1


Creating Vector

In [27]:
vectorizer = CountVectorizer()
movie_vectors = vectorizer.fit_transform(merged_df['features'])
movie_vectors.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

Preparing cosine similarity matrix

In [None]:
len(movie_vectors)