In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('movies.csv')
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
df.shape

(9742, 3)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [6]:
#features = ['movieId', 'title', 'genres']
#for feature in features:
#	df[feature] = df[feature].fillna('')

    
def combined_features(row):
    try:
        return row['title']+' '+row['genres']
    except:
            print(row)
        
df['combined_features']= df.apply(combined_features, axis=1)
df['combined_features'].head()

0    Toy Story (1995) Adventure|Animation|Children|...
1            Jumanji (1995) Adventure|Children|Fantasy
2               Grumpier Old Men (1995) Comedy|Romance
3        Waiting to Exhale (1995) Comedy|Drama|Romance
4            Father of the Bride Part II (1995) Comedy
Name: combined_features, dtype: object

In [7]:
df.head()

Unnamed: 0,movieId,title,genres,combined_features
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995) Adventure|Animation|Children|...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji (1995) Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men (1995) Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale (1995) Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995) Comedy


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   movieId            9742 non-null   int64 
 1   title              9742 non-null   object
 2   genres             9742 non-null   object
 3   combined_features  9742 non-null   object
dtypes: int64(1), object(3)
memory usage: 304.6+ KB


In [9]:
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
vectorizer = CountVectorizer()

count_matrix = vectorizer.fit_transform(df['combined_features'])

##Step 5: Compute the Cosine Similarity based on the count_matrix
similarity = cosine_similarity(count_matrix)

In [10]:
similarity

array([[1.        , 0.63245553, 0.28867513, ..., 0.        , 0.125     ,
        0.11785113],
       [0.63245553, 1.        , 0.18257419, ..., 0.        , 0.        ,
        0.        ],
       [0.28867513, 0.18257419, 1.        , ..., 0.        , 0.        ,
        0.13608276],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.125     , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.11785113, 0.        , 0.13608276, ..., 0.        , 0.        ,
        1.        ]])

In [11]:
df.reset_index(inplace=True)

In [12]:
df.head()

Unnamed: 0,index,movieId,title,genres,combined_features
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995) Adventure|Animation|Children|...
1,1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji (1995) Adventure|Children|Fantasy
2,2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men (1995) Comedy|Romance
3,3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale (1995) Comedy|Drama|Romance
4,4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995) Comedy


In [13]:
def get_title_from_index(index):
	return df[df.index == index]["title"].values[0]

def get_index_from_title(title):
	return df[df.title == title]["index"].values[0]

In [14]:
movie_user_likes = "Jumanji (1995)"

## Step 6: Get index of this movie from its title
movie_index = get_index_from_title(movie_user_likes)
similar_movie = list(enumerate (similarity[movie_index]))

In [15]:
similar_movie

[(0, 0.6324555320336758),
 (1, 0.9999999999999999),
 (2, 0.18257418583505539),
 (3, 0.1690308509457033),
 (4, 0.15811388300841894),
 (5, 0.19999999999999998),
 (6, 0.22360679774997896),
 (7, 0.5477225575051662),
 (8, 0.22360679774997896),
 (9, 0.39999999999999997),
 (10, 0.1690308509457033),
 (11, 0.15811388300841894),
 (12, 0.6),
 (13, 0.25819888974716115),
 (14, 0.36514837167011077),
 (15, 0.22360679774997896),
 (16, 0.18257418583505539),
 (17, 0.22360679774997896),
 (18, 0.1690308509457033),
 (19, 0.15811388300841894),
 (20, 0.18257418583505539),
 (21, 0.1690308509457033),
 (22, 0.19999999999999998),
 (23, 0.19999999999999998),
 (24, 0.18257418583505539),
 (25, 0.25819888974716115),
 (26, 0.36514837167011077),
 (27, 0.22360679774997896),
 (28, 0.4338609156373123),
 (29, 0.10846522890932808),
 (30, 0.22360679774997896),
 (31, 0.13483997249264842),
 (32, 0.4472135954999579),
 (33, 0.18257418583505539),
 (34, 0.36514837167011077),
 (35, 0.22360679774997896),
 (36, 0.18257418583505539),

In [16]:
sorted_movie = sorted(similar_movie, key=lambda x:x[1], reverse=True)

In [17]:
df['title'][1]

'Jumanji (1995)'

In [18]:
sorted_movie

[(1, 0.9999999999999999),
 (1565, 0.6761234037828132),
 (131, 0.6708203932499369),
 (0, 0.6324555320336758),
 (12, 0.6),
 (209, 0.6),
 (8800, 0.6),
 (119, 0.5962847939999438),
 (7, 0.5477225575051662),
 (40, 0.5477225575051662),
 (6014, 0.5477225575051662),
 (6065, 0.5477225575051662),
 (7108, 0.5477225575051662),
 (8641, 0.5477225575051662),
 (8715, 0.5477225575051662),
 (9336, 0.5477225575051662),
 (9565, 0.5477225575051662),
 (53, 0.5393598899705937),
 (141, 0.5393598899705937),
 (222, 0.5393598899705937),
 (1357, 0.50709255283711),
 (1556, 0.50709255283711),
 (1617, 0.50709255283711),
 (1706, 0.50709255283711),
 (4809, 0.50709255283711),
 (6389, 0.50709255283711),
 (6629, 0.50709255283711),
 (7426, 0.50709255283711),
 (8219, 0.50709255283711),
 (8716, 0.50709255283711),
 (9430, 0.50709255283711),
 (6011, 0.47809144373375745),
 (109, 0.47434164902525683),
 (488, 0.47434164902525683),
 (767, 0.47434164902525683),
 (1343, 0.47434164902525683),
 (1480, 0.47434164902525683),
 (1505, 0.4

In [19]:
i=0
print("Top 10 similar movies to "+movie_user_likes+" are:\n")
for movie in sorted_movie:
	print(get_title_from_index(movie[0]))
	i+=1
	if (i>10):
		break

Top 10 similar movies to Jumanji (1995) are:

Jumanji (1995)
Tall Tale (1995)
Casper (1995)
Toy Story (1995)
Balto (1995)
Gordy (1995)
Pan (2015)
Amazing Panda Adventure, The (1995)
Tom and Huck (1995)
Mortal Kombat (1995)
MirrorMask (2005)


In [20]:
print(df["genres"].unique())

average_sorted_movie = sorted(similar_movie, key=lambda x:df["genres"][x[0]], reverse=True)
print(average_sorted_movie)

['Adventure|Animation|Children|Comedy|Fantasy'
 'Adventure|Children|Fantasy' 'Comedy|Romance' 'Comedy|Drama|Romance'
 'Comedy' 'Action|Crime|Thriller' 'Adventure|Children' 'Action'
 'Action|Adventure|Thriller' 'Comedy|Horror'
 'Adventure|Animation|Children' 'Drama' 'Action|Adventure|Romance'
 'Crime|Drama' 'Drama|Romance' 'Action|Comedy|Crime|Drama|Thriller'
 'Comedy|Crime|Thriller' 'Crime|Drama|Horror|Mystery|Thriller'
 'Drama|Sci-Fi' 'Children|Drama' 'Adventure|Drama|Fantasy|Mystery|Sci-Fi'
 'Mystery|Sci-Fi|Thriller' 'Children|Comedy' 'Drama|War'
 'Action|Crime|Drama' 'Action|Adventure|Fantasy' 'Comedy|Drama|Thriller'
 'Mystery|Thriller' 'Animation|Children|Drama|Musical|Romance'
 'Crime|Mystery|Thriller' 'Adventure|Drama' 'Drama|Thriller'
 'Comedy|Crime' 'Action|Sci-Fi|Thriller' 'Action|Comedy|Horror|Thriller'
 'Comedy|Drama' 'Documentary' 'Action|Crime|Drama|Thriller'
 'Crime|Drama|Romance' 'Action|Adventure|Drama' 'Action|Thriller'
 'Drama|Horror|Thriller' 'Comedy|Horror|Romance'


In [21]:
i=0
print("Top 10 movie genres to "+movie_user_likes+" are:\n")
for movie in average_sorted_movie:
	print(get_title_from_index(movie[0]))
	i+=1
	if (i>10):
		break

Top 10 movie genres to Jumanji (1995) are:

Wild Bill (1995)
Wyatt Earp (1994)
Bad Girls (1994)
Pale Rider (1985)
Alvarez Kelly (1966)
High Plains Drifter (1973)
Little Big Man (1970)
Jeremiah Johnson (1972)
Mackenna's Gold (1969)
Rio Bravo (1959)
One-Eyed Jacks (1961)
