In [13]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Load the movie catalogue and the tag table from CSV files located in the
# current working directory.
movies, tags = pd.read_csv("movies.csv"), pd.read_csv("tags.csv")

In [15]:
# Preview the first five rows of the movie table.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [16]:
# Attach every tag row to its movie. The join key is spelled out so the merge
# cannot silently switch to a different key if the two files ever come to share
# another column name; an inner join keeps only movies that have at least one
# tag.
movies_tags = pd.merge(movies, tags, on="movieId", how="inner")

# (rows, columns) of the merged table.
movies_tags.shape

(3683, 6)

In [17]:
# Combine all tags applied to a movie into one space-separated string, giving
# one row per (movieId, title, genres) combination.
movies_tags_combined = (
    movies_tags
    # str.join raises TypeError on missing (NaN) tags, so drop those rows first.
    .dropna(subset=["tag"])
    # Guard against non-string tag values that str.join would also reject.
    .assign(tag=lambda frame: frame["tag"].astype(str))
    .groupby(["movieId", "title", "genres"])["tag"]
    .agg(" ".join)
    .reset_index()
)
movies_tags_combined.head()

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar pixar fun
1,2,Jumanji (1995),Adventure|Children|Fantasy,fantasy magic board game Robin Williams game
2,3,Grumpier Old Men (1995),Comedy|Romance,moldy old
3,5,Father of the Bride Part II (1995),Comedy,pregnancy remake
4,7,Sabrina (1995),Comedy|Romance,remake


In [22]:
def combine_genres_tags(row):
    """Build the text feature for a single movie row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide string values under the keys ``"genres"``
        (pipe-separated genre names) and ``"tag"``.

    Returns
    -------
    str
        The genres with every ``|`` turned into a space, followed by one
        space and the tag text.
    """
    genre_words = " ".join(row["genres"].split("|"))
    return " ".join([genre_words, row["tag"]])
# Build one combined genres-plus-tags string per movie and store it in a new
# "features" column, then preview the first ten rows.
combined_text = movies_tags_combined.apply(combine_genres_tags, axis="columns")
movies_tags_combined["features"] = combined_text
movies_tags_combined.head(n=10)

Unnamed: 0,movieId,title,genres,tag,features
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar pixar fun,Adventure Animation Children Comedy Fantasy pi...
1,2,Jumanji (1995),Adventure|Children|Fantasy,fantasy magic board game Robin Williams game,Adventure Children Fantasy fantasy magic board...
2,3,Grumpier Old Men (1995),Comedy|Romance,moldy old,Comedy Romance moldy old
3,5,Father of the Bride Part II (1995),Comedy,pregnancy remake,Comedy pregnancy remake
4,7,Sabrina (1995),Comedy|Romance,remake,Comedy Romance remake
5,11,"American President, The (1995)",Comedy|Drama|Romance,politics president,Comedy Drama Romance politics president
6,14,Nixon (1995),Drama,politics president,Drama politics president
7,16,Casino (1995),Crime|Drama,Mafia,Crime Drama Mafia
8,17,Sense and Sensibility (1995),Drama|Romance,Jane Austen,Drama Romance Jane Austen
9,21,Get Shorty (1995),Comedy|Crime|Thriller,Hollywood,Comedy Crime Thriller Hollywood


In [19]:
# Turn each movie's feature text into a TF-IDF vector, with scikit-learn's
# built-in English stop-word list filtered out. Row i of `matrix` corresponds
# to row i of `movies_tags_combined`.
corpus = movies_tags_combined["features"]
vectorizer = TfidfVectorizer(stop_words="english")
matrix = vectorizer.fit_transform(corpus)

In [23]:
# Pairwise cosine similarity between the TF-IDF vectors of all movies.
similarity = cosine_similarity(matrix)

# Row position (in `movies_tags_combined`) of the movie to find neighbours
# for; position 6 is "Nixon (1995)" in the table displayed above.
query_index = 6

# (row position, similarity to the query movie) for every movie.
similar_movies = list(enumerate(similarity[query_index]))

# Rank candidates from most to least similar. The query movie is removed by
# position rather than by dropping the first sorted entry: another movie with
# identical features ties at similarity 1.0 and may sort ahead of it, and a
# movie whose vector is all zeros does not rank first against itself.
sorted_list = sorted(
    (pair for pair in similar_movies if pair[0] != query_index),
    key=lambda m: m[1],
    reverse=True,
)

In [24]:
def index_to_title(index, frame=None):
    """Return the title of the movie stored at row position ``index``.

    The lookup is positional because callers pass positions produced by
    enumerating a similarity row, and those line up with row order rather than
    with index labels. For a frame with the default 0..n-1 index the two are
    the same thing. A negative ``index`` counts from the end of the frame.

    Parameters
    ----------
    index : int
        Zero-based row position of the movie.
    frame : pandas.DataFrame, optional
        Table with a ``"title"`` column. Defaults to the notebook-level
        ``movies_tags_combined`` frame.

    Returns
    -------
    str
        The value of the ``"title"`` column in that row.

    Raises
    ------
    IndexError
        If ``index`` is outside the frame.
    """
    if frame is None:
        frame = movies_tags_combined
    # Direct positional access avoids building a boolean mask over every row
    # just to fetch a single cell.
    return frame["title"].iloc[index]
# Print the titles of the five most similar movies, best match first.
# Iterating over a slice (instead of indexing positions 0..4) also works when
# fewer than five candidates are available.
for movie_index, _score in sorted_list[:5]:
    print(index_to_title(movie_index))

American President, The (1995)
JFK (1991)
Dave (1993)
Air Force One (1997)
Mr. Smith Goes to Washington (1939)
