In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Directory holding the dataset CSV files; later cells build file paths by
# string-concatenating a file name onto this prefix, so the trailing slash matters.
path = "./movies/"

In [2]:
# Load the movie metadata table. low_memory=False makes pandas parse the file in a
# single pass instead of in chunks, so each column's dtype is inferred from all rows.
data = pd.read_csv(path + 'movies_metadata.csv', low_memory=False)
print(data.shape)  # (rows, columns) sanity check
data.head()

(45466, 24)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
# List every column name, since the head() preview elides the middle columns.
data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [4]:
# Remove rows whose overview is missing (TF-IDF cannot vectorize NaN), then
# renumber the index 0..n-1 so row labels match positional indices again.
data = data.dropna(subset=["overview"]).reset_index(drop=True)
data.shape

(44512, 24)

In [5]:
# stop_words='english' tells scikit-learn to drop the words in its built-in
# English stop-word list before computing TF-IDF weights.
tfidf = TfidfVectorizer(stop_words='english')

In [6]:
# Fit the vectorizer on the overviews and transform them in one step.
# The result has one row per movie and one column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(data['overview'])
tfidf_matrix.shape

(44512, 75827)

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between every pair of overview vectors.
# NOTE(review): with a single argument and default options this returns a dense
# (n_movies, n_movies) array; for 44,512 movies that is roughly 16 GB assuming
# float64 — confirm the machine has enough RAM before re-running this cell.
cosine_matrix  = cosine_similarity(tfidf_matrix)

In [8]:
# Preview the similarity matrix rounded to 4 decimals. np.round returns a new
# array, so cosine_matrix itself keeps full precision.
np.round(cosine_matrix, 4)

array([[1.    , 0.015 , 0.    , ..., 0.    , 0.0059, 0.    ],
       [0.015 , 1.    , 0.0468, ..., 0.    , 0.022 , 0.0092],
       [0.    , 0.0468, 1.    , ..., 0.    , 0.014 , 0.    ],
       ...,
       [0.    , 0.    , 0.    , ..., 1.    , 0.    , 0.    ],
       [0.0059, 0.022 , 0.014 , ..., 0.    , 1.    , 0.    ],
       [0.    , 0.0092, 0.    , ..., 0.    , 0.    , 1.    ]])

In [11]:
# Lookup tables between movie titles and their row positions in `data`.
# If a title occurs more than once, movie2id keeps the position of the last
# occurrence (later assignments overwrite earlier ones); id2movie keeps every row.
movie2id = {name: pos for pos, name in enumerate(data["title"])}
id2movie = dict(enumerate(data["title"]))



In [22]:
# Choose the query movie and find its row position in the similarity matrix.
target = "Toy Story"
target_idx = movie2id[target]

# Collect (row position, similarity) pairs for every movie except the query itself.
sim_scores = [
    pair
    for pair in enumerate(cosine_matrix[target_idx, :])
    if pair[0] != target_idx
]


In [24]:
# Order the (row position, similarity) pairs from most to least similar.
sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)

In [29]:
# Print a header, then the title and raw score of the first ten entries of
# sim_scores (the most similar movies once the list has been sorted descending).
print(f"{target}'s similarity'")
for movie_idx, score in sim_scores[:10]:
    print(id2movie[movie_idx], score)

Toy Story's similarity'
Toy Story 3 0.5321733978946077
Toy Story 2 0.47214559370670484
The 40 Year Old Virgin 0.274962516260823
Small Fry 0.27322653023092314
Andy Hardy's Blonde Trouble 0.23543946958082806
Hot Splash 0.22397858775140161
Andy Kaufman Plays Carnegie Hall 0.21761842522811847
Superstar: The Life and Times of Andy Warhol 0.2159367770908928
Andy Peters: Exclamation Mark Question Point 0.20190977282766223
The Champ 0.19868494439439036


In [53]:
# Class wrapping the "most similar movies" lookup shown in the cells above.

class recommend_ten_movie:
    """Print the movies most similar to a given title.

    Similarity scores are read from a square matrix whose row/column ``i``
    corresponds to the ``i``-th title. By default the notebook globals
    ``data["title"]`` and ``cosine_matrix`` are used, so ``recommend_ten_movie()``
    keeps working exactly as before; both can be injected explicitly to reuse
    the class on other data or to test it in isolation.

    Attributes:
        movie2id: maps title -> row position. If a title occurs more than once,
            the position of its last occurrence wins.
        id2movie: maps row position -> title.
    """

    def __init__(self, titles=None, similarity=None):
        """Build the title <-> row-position lookup tables.

        Args:
            titles: iterable of titles in matrix row order. Defaults to the
                notebook global ``data["title"]``.
            similarity: square, array-like similarity matrix indexable as
                ``similarity[i]``. When omitted, the notebook global
                ``cosine_matrix`` is looked up each time a query is made.
        """
        if titles is None:
            titles = data["title"]
        # Keep None as-is so the global is resolved lazily at query time,
        # matching the original behaviour if cosine_matrix is recomputed later.
        self._similarity = similarity
        self.movie2id = {}
        self.id2movie = {}
        for idx, title in enumerate(titles):
            self.movie2id[title] = idx
            self.id2movie[idx] = title

    def top_similar(self, target: str, top_n: int = 10):
        """Return the ``top_n`` most similar movies as ``(title, score)`` tuples.

        The target itself is excluded. Results are ordered by descending score;
        ties keep ascending row order.

        Raises:
            KeyError: if ``target`` is not a known title.
            ValueError: if ``top_n`` is negative.
        """
        if top_n < 0:
            raise ValueError(f"top_n must be non-negative, got {top_n}")
        if target not in self.movie2id:
            raise KeyError(f"unknown movie title: {target!r}")

        target_idx = self.movie2id[target]
        matrix = cosine_matrix if self._similarity is None else self._similarity
        scores = np.asarray(matrix[target_idx])

        # A stable sort on the negated scores yields descending order while keeping
        # equal scores in ascending row order, the same order a stable
        # list.sort(reverse=True) over enumerate() would produce.
        order = np.argsort(-scores, kind="stable")
        order = order[order != target_idx][:top_n]
        return [(self.id2movie[int(i)], float(scores[i])) for i in order]

    def forward(self, target: str, top_n: int = 10):
        """Print a header followed by the ``top_n`` most similar titles and scores.

        Scores are rounded to 4 decimals for display. Returns ``None`` so that
        calling it as the last expression of a cell shows only the printed text.

        Raises:
            KeyError: if ``target`` is not a known title (nothing is printed).
        """
        ranked = self.top_similar(target, top_n)

        print(f"{target}'s similarity'")
        for title, score in ranked:
            print(title, np.round(score, 4))


In [54]:
# Query title, plus a recommender instance; constructing it builds the
# title <-> row-position lookup tables from data["title"].
target = "Toy Story"
c = recommend_ten_movie()

In [55]:
# Print the movies most similar to `target`, with scores rounded to 4 decimals.
c.forward(target)

Toy Story's similarity'
Toy Story 3 0.5322
Toy Story 2 0.4721
The 40 Year Old Virgin 0.275
Small Fry 0.2732
Andy Hardy's Blonde Trouble 0.2354
Hot Splash 0.224
Andy Kaufman Plays Carnegie Hall 0.2176
Superstar: The Life and Times of Andy Warhol 0.2159
Andy Peters: Exclamation Mark Question Point 0.2019
The Champ 0.1987


In [56]:
# Same lookup for a different title, reusing the existing instance.
c.forward("Toy Story 3")

Toy Story 3's similarity'
Toy Story 0.5322
Toy Story 2 0.4097
The 40 Year Old Virgin 0.3757
Andy Hardy's Blonde Trouble 0.353
Andy Kaufman Plays Carnegie Hall 0.3086
Superstar: The Life and Times of Andy Warhol 0.3062
Andy Peters: Exclamation Mark Question Point 0.2863
The Champ 0.2716
Andy Hardy's Double Life 0.2605
Life Begins for Andy Hardy 0.2569
