# Movie Recommendation

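Content-based movie recommendation on the TMDB 5000 movies data (`tmdb_5000_movies.csv`): each movie's keywords are vectorized (TF-IDF and raw counts) and movies are ranked by cosine similarity.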

In [1]:
import pandas as pd 
import numpy as np 
from ast import literal_eval

In [2]:
# Display settings for DataFrame output in this notebook:
# show up to 20 columns and allow 1000 characters per printed line.
pd.options.display.max_columns = 20
pd.options.display.width = 1000

In [3]:
# Load the movie table, keep only the three columns used below, and parse the
# string-encoded 'genres' / 'keywords' columns into Python objects
# (lists of dicts, as the displayed rows show).
movies = (
    pd.read_csv('./data/tmdb_5000_movies.csv')
    .loc[:, ['title', 'genres', 'keywords']]
    .assign(
        genres=lambda frame: frame['genres'].apply(literal_eval),
        keywords=lambda frame: frame['keywords'].apply(literal_eval),
    )
)

movies.head()

Unnamed: 0,title,genres,keywords
0,Avatar,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'id': 1463, 'name': 'culture clash'}, {'id':..."
1,Pirates of the Caribbean: At World's End,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 270, 'name': 'ocean'}, {'id': 726, 'na..."
2,Spectre,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'id': 470, 'name': 'spy'}, {'id': 818, 'name..."
3,The Dark Knight Rises,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","[{'id': 849, 'name': 'dc comics'}, {'id': 853,..."
4,John Carter,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","[{'id': 818, 'name': 'based on novel'}, {'id':..."


In [4]:
# Keep only the value stored under 'name' in each dict
# Method 1: explicit loop over the rows (commented out)
# NOTE(review): if this is ever re-enabled, `movies['genres'][i] = ...` is chained
# assignment, which pandas may flag with SettingWithCopyWarning - confirm and
# prefer a single .loc / .at assignment.

# i = 0
# for row in movies['genres']:
#     genres = []
#     for ele in row:
#         genres.append(ele['name'])
#     movies['genres'][i] = genres
#     i += 1

In [5]:
# Method 2
# Reduce every cell from a list of dicts to a list of the dicts' 'name' values
# (list of dicts => list of names).
# NOTE: not re-runnable as-is. After one run the cells hold lists of strings,
# and indexing a string with 'name' raises TypeError.
def extract_names(entries):
    """Return the 'name' value of every dict in ``entries``, in order."""
    return [entry['name'] for entry in entries]


movies['genres'] = movies['genres'].apply(extract_names)
movies['keywords'] = movies['keywords'].apply(extract_names)

movies.head()

Unnamed: 0,title,genres,keywords
0,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon..."
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ..."
2,Spectre,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi..."
3,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i..."
4,John Carter,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel..."


In [6]:
# Flatten each list of names into one space-separated string
# ([name, name] => "name name"), i.e. list => str.
# NOTE: multi-word names lose their boundaries here ("Science Fiction" becomes
# two separate words in the joined text).
# NOTE: not re-runnable as-is. Joining an already-joined string would put a
# space between every character.
movies['genres'] = movies['genres'].apply(' '.join)
movies['keywords'] = movies['keywords'].apply(' '.join)

movies.head()

Unnamed: 0,title,genres,keywords
0,Avatar,Action Adventure Fantasy Science Fiction,culture clash future space war space colony so...
1,Pirates of the Caribbean: At World's End,Adventure Fantasy Action,ocean drug abuse exotic island east india trad...
2,Spectre,Action Adventure Crime,spy based on novel secret agent sequel mi6 bri...
3,The Dark Knight Rises,Action Crime Drama Thriller,dc comics crime fighter terrorist secret ident...
4,John Carter,Action Adventure Science Fiction,based on novel mars medallion space travel pri...


## TF-IDF

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# ngram_range=(1, 2): use single words and pairs of consecutive words as features.
tfidf_vec = TfidfVectorizer(ngram_range=(1, 2))
tfidf_matrix = tfidf_vec.fit_transform(movies['keywords'])

# Peek at the first five (term, column index) pairs of the learned vocabulary.
print([*tfidf_vec.vocabulary_.items()][:5])

# Shape is (number of movies, number of n-gram features): (4803, 41554) in the
# recorded output, so each movie is represented as a 41554-column vector.
print(tfidf_matrix.shape)

[('culture', 9037), ('clash', 7010), ('future', 15032), ('space', 34485), ('war', 40021)]
(4803, 41554)


In [16]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
tfidf_matrix

<4803x41554 sparse matrix of type '<class 'numpy.float64'>'
	with 101584 stored elements in Compressed Sparse Row format>

In [18]:
# Similarity matrix, shape (4803, 4803)
# (1, 1) is the similarity of the 1st movie with the 1st movie
# (1, 1) / (1, 2) / ... / (1, 4803) -> similarity of the 1st movie with movies 1..4803
# (2, 1) / (2, 2) / ... / (2, 4803) -> similarity of the 2nd movie with movies 1..4803
# ...
# (4803, 1) / (4803, 2) / ... / (4803, 4803) -> similarity of the 4803rd movie with movies 1..4803
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
# Pairwise cosine similarity of every movie against every movie (4803 x 4803).
# NOTE: despite the name, this matrix is built from the keywords TF-IDF matrix.
# Self-similarity is 1 for most rows, but the printed output shows a row that is
# all zeros including its diagonal entry - presumably a movie whose keyword text
# produced no features (TODO confirm).
genres_similarity = cosine_similarity(tfidf_matrix)
print(genres_similarity)

[[1.         0.00486926 0.         ... 0.00773342 0.         0.        ]
 [0.00486926 1.         0.         ... 0.00908723 0.         0.        ]
 [0.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.00773342 0.00908723 0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [20]:
# Rank movies by similarity: negating the scores makes the ascending argsort
# return column positions from highest to lowest similarity, so row i lists
# the movies most similar to movie i first.
similar_index = (-genres_similarity).argsort()
print(similar_index)

[[   0 2403  278 ... 1996 1961 4802]
 [   1   12  199 ... 1984 1971 4802]
 [   2   11 3285 ... 1915 1880 4802]
 ...
 [4800 2674 2108 ... 1688 1697 4802]
 [   0 3205 3204 ... 1596 1594 4802]
 [4802 2000 4066 ... 1619 1635 2401]]


## Count(빈도수)

In [24]:
# Imports this cell needs, gathered at the top of the cell.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# How many entries of each similarity ranking to show.
TOP_N = 10

# Raw term counts over single words and pairs of consecutive words of the
# keyword text (same n-gram range as the TF-IDF section).
count_vec = CountVectorizer(ngram_range=(1, 2))
count_matrix = count_vec.fit_transform(movies['keywords'])

# Pairwise cosine similarity of every movie against every movie:
# entry (i, j) compares movie i with movie j.
# NOTE: despite the name, this matrix is built from the keywords column.
genres_similarity = cosine_similarity(count_matrix, count_matrix)

# Negate so the ascending argsort yields descending similarity: row i holds
# movie positions ordered from most to least similar to movie i.
similar_index = np.argsort(-genres_similarity)

# Lookup plan: find the row of the requested title, read its ranking from
# similar_index, then map those positions back to rows of `movies`.

# input_movie = input()  # switch to this line to ask for a title interactively
input_movie = 'Avatar'

# Row positions (not index labels) of the matching title, because both
# similar_index and movies.iloc below are addressed by position.
movie_index = np.flatnonzero((movies['title'] == input_movie).to_numpy())
if movie_index.size == 0:
    # Fail loudly instead of printing an empty result for a mistyped title.
    raise ValueError(
        "No movie titled {!r} found in movies['title']".format(input_movie)
    )

# One row of TOP_N positions per matching title. The ranking normally starts
# with the queried movie itself, since it is most similar to itself.
similar_movies = similar_index[movie_index, :TOP_N]

# Flatten to 1-D so it can be used as a positional indexer. If several rows
# share the title, their rankings are concatenated here.
similar_movies_index = similar_movies.reshape(-1)
print(similar_movies_index)
print(movies.iloc[similar_movies_index])

[   0 2403  278 4332  838  373 3158 1951 1354 3730]
                   title                                     genres                                           keywords
0                 Avatar   Action Adventure Fantasy Science Fiction  culture clash future space war space colony so...
2403              Aliens     Horror Action Thriller Science Fiction  android extraterrestrial technology space mari...
278   Planet of the Apes  Thriller Science Fiction Action Adventure  gorilla space marine space suit revolution chi...
4332      Silent Running            Adventure Drama Science Fiction  space marine sunlight plants space travel satu...
838               Alien³              Science Fiction Action Horror  prison android spacecraft space marine impriso...
373      Mission to Mars                            Science Fiction  mars spacecraft space travel alien long take o...
3158               Alien     Horror Action Thriller Science Fiction  android countdown space marine space suit behe