In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the movie metadata and keep only the first 20,000 rows.
data = pd.read_csv("movies_metadata.csv").head(20000)

# Clean up missing overviews: replace them with empty strings.
data['overview'] = data['overview'].fillna('')
data['overview'].isna().sum()  # not the cell's last expression, so nothing is displayed

# Compute TF-IDF over the overviews, using the built-in English stop-word list.
Tfidf = TfidfVectorizer(stop_words='english')
TfidfMatrix = Tfidf.fit_transform(data['overview'])
TfidfMatrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [3]:
# Build the movie-by-movie cosine-similarity array.
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel is already the cosine similarity.
from sklearn.metrics.pairwise import linear_kernel

cos_sim = linear_kernel(TfidfMatrix)  # Y defaults to X: every movie against every movie
pd.DataFrame(data=cos_sim)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19990,19991,19992,19993,19994,19995,19996,19997,19998,19999
0,1.000000,0.015757,0.000000,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,...,0.000000,0.009979,0.060997,0.000000,0.000000,0.0,0.024585,0.000000,0.0,0.000000
1,0.015757,1.000000,0.049073,0.0,0.000000,0.051830,0.00000,0.000000,0.106355,0.0,...,0.029242,0.000000,0.000000,0.004271,0.000000,0.0,0.000000,0.000000,0.0,0.000000
2,0.000000,0.049073,1.000000,0.0,0.025005,0.000000,0.00000,0.006500,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000
3,0.000000,0.000000,0.000000,1.0,0.000000,0.007139,0.00000,0.009396,0.000000,0.0,...,0.008305,0.000000,0.008952,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000
4,0.000000,0.000000,0.025005,0.0,1.000000,0.000000,0.03298,0.000000,0.032751,0.0,...,0.009277,0.000000,0.000000,0.025417,0.000000,0.0,0.078359,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,0.000000,0.000000,0.0,0.000000
19996,0.024585,0.000000,0.000000,0.0,0.078359,0.029114,0.00000,0.009135,0.000000,0.0,...,0.024083,0.000000,0.000000,0.018573,0.000000,0.0,1.000000,0.000000,0.0,0.000000
19997,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.000000,0.016507,0.0,...,0.000000,0.000000,0.000000,0.020476,0.000000,0.0,0.000000,1.000000,0.0,0.083758
19998,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,...,0.006947,0.011156,0.000000,0.007180,0.040233,0.0,0.000000,0.000000,1.0,0.000000


In [10]:
# Lookup table: movie title -> row number in `data`.
# Series.drop_duplicates() compares the *values* (the row numbers, which are
# already unique), so it never removed repeated titles. De-duplicate on the
# title index instead, keeping the first row for each title, so that
# indices[title] is always a single row number.
indices = pd.Series(data.index, index=data['title'])
indices = indices[~indices.index.duplicated(keep='first')]


# Recommendation function
def getRecommendation1(title, cosine_sim=cos_sim, top_n=20):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in ``indices``.
    cosine_sim : 2-D array, optional
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie. Defaults to ``cos_sim``.
    top_n : int, optional
        Maximum number of recommendations to return (default 20).

    Returns
    -------
    pandas.Series
        Titles taken from ``data``, ordered from most to least similar.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # (row number, similarity) pairs for every movie except the query itself.
    # The query is removed by row number rather than by dropping the first
    # sorted entry: when its overview is empty (all scores are 0) or another
    # movie ties with it, the query is not guaranteed to be ranked first.
    simScores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]
    # Sort by similarity, highest first (stable, so ties keep row order).
    simScores.sort(key=lambda pair: pair[1], reverse=True)
    # Row numbers of the top_n most similar movies.
    movieidx = [i for i, _ in simScores[:top_n]]
    return data['title'].iloc[movieidx]

In [9]:
# Example query: titles most similar to "The Tree of Life".
getRecommendation1("The Tree of Life")

6060                  American Heart
198                     Three Wishes
10797                       Firewall
9413            Far Side of the Moon
6239               Love the Hard Way
19102               Jack-Jack Attack
6186           It Runs in the Family
5307                   Traces of Red
1087                      Glory Daze
1473          The Designated Mourner
2551          It Conquered the World
5485                          Charly
3138                   Patriot Games
482                   Lightning Jack
13271    Jack Brooks: Monster Slayer
19758                   Jack & Diane
11098                      Blackjack
747                             Jack
3035           The Last Picture Show
7075                      Cool World
Name: title, dtype: object

In [11]:
# Recommendation function (condition: rating)
def getRecommendation2(title, cosine_sim=cos_sim, top_n=20, min_vote=6):
    """Return well-rated titles among the movies most similar to ``title``.

    The ``top_n`` most similar movies are selected first and the rating
    filter is applied afterwards, so fewer than ``top_n`` titles may be
    returned.

    Parameters
    ----------
    title : str
        Movie title to look up in ``indices``.
    cosine_sim : 2-D array, optional
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie. Defaults to ``cos_sim``.
    top_n : int, optional
        Number of most-similar movies considered before filtering (default 20).
    min_vote : float, optional
        Only movies whose ``vote_average`` is strictly greater than this
        value are kept (default 6, so a movie rated exactly 6 is excluded).

    Returns
    -------
    pandas.Series
        Titles taken from ``data``, ordered from most to least similar.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title shared by several rows makes the lookup return a Series, which
    # would turn cosine_sim[idx] into a 2-D block; use the first matching row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    # (row number, similarity) pairs for every movie except the query itself.
    # The query is removed by row number rather than by dropping the first
    # sorted entry, which is only correct when the query happens to rank first.
    simScores = [
        (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
    ]
    # Sort by similarity, highest first (stable, so ties keep row order).
    simScores.sort(key=lambda pair: pair[1], reverse=True)
    # Row numbers of the top_n most similar movies.
    movieidx = [i for i, _ in simScores[:top_n]]
    # Pull the matching rows of `data`.
    recom = data.iloc[movieidx]
    # Keep only the movies rated above the threshold.
    return recom.loc[recom['vote_average'] > min_vote, 'title']

In [12]:
# Example query: similar titles to "The Tree of Life" that also pass the rating filter.
getRecommendation2("The Tree of Life")

6060             American Heart
198                Three Wishes
9413       Far Side of the Moon
6239          Love the Hard Way
19102          Jack-Jack Attack
5307              Traces of Red
1473     The Designated Mourner
5485                     Charly
3138              Patriot Games
3035      The Last Picture Show
Name: title, dtype: object