- Numpy로 제작

In [1]:
from numpy import dot
from numpy.linalg import norm
import numpy as np
def cos_sim(A, B):
    return dot(A, B)/(norm(A)*norm(B))

In [2]:
doc1=np.array([0,1,1,1])
doc2=np.array([1,0,1,1])
doc3=np.array([2,0,2,2])

In [3]:
cos_sim(doc1, doc2), cos_sim(doc1, doc3), cos_sim(doc2, doc3)

(0.6666666666666667, 0.6666666666666667, 1.0000000000000002)

- Scikit-learn 함수 이용

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity([doc1],[doc2])

array([[0.66666667]])

In [5]:
cosine_similarity([doc2],[doc3])

array([[1.]])

# 유사도를 이용한 추천 시스템

In [6]:
from google.colab import files
uploaded = files.upload()
filename = list(uploaded.keys())[0]


Saving movies_metadata.csv to movies_metadata (1).csv


In [26]:
import pandas as pd

movie = pd.read_csv(filename, low_memory=False)
movie.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [27]:
movie.shape

(45466, 24)

In [53]:
df = movie[['title','overview']]
df.head(2)

Unnamed: 0,title,overview
0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...


In [54]:
df.overview[0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

In [55]:
df = df.head(20000)

# 데이터 전처리

In [56]:
df.isnull().sum()

title         2
overview    135
dtype: int64

In [57]:
import warnings

In [58]:
warnings.filterwarnings('ignore')

In [59]:
df.dropna(inplace=True)     # how='any'가 디폴트
df.shape

(19863, 2)

## 텍스트 전처리

In [60]:
# 구둣점 제거
import re
df['clean_doc'] = df.overview.str.replace('[^A-Za-z ]', '')
df.head(3)

Unnamed: 0,title,overview,clean_doc
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Led by Woody Andys toys live happily in his ro...
1,Jumanji,When siblings Judy and Peter discover an encha...,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,A family wedding reignites the ancient feud be...


## DTM 변환


In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

tvect = TfidfVectorizer(stop_words='english')
tfidf_matrix = tvect.fit_transform(df.overview)
tfidf_matrix.shape

(19863, 47487)

In [62]:
tfidf_clean = tvect.fit_transform(df.clean_doc)
tfidf_clean.shape

(19863, 54245)

# 영화의 타이틀과 인덱스를 가진 테이블

In [64]:
indices = pd.Series(df.index, index=df.title).drop_duplicates()
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Waiting to Exhale              3
Father of the Bride Part II    4
dtype: int64

# 코사인 유사도를 통해 유사한 영화를 찾는 함수

In [65]:
from sklearn.metrics.pairwise import linear_kernel

In [66]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [67]:
cosine_sim.shape

(19863, 19863)

In [68]:
cosine_sim[1, :5]

array([0.01575156, 1.        , 0.04906868, 0.        , 0.        ])

In [71]:
def get_recommendations(title, cosine_sim=cosine_sim):
    # 선택한 영화의 타이틀로부터 해당되는 인덱스를 받아옵니다. 이제 선택한 영화를 가지고 연산할 수 있습니다.
    idx = indices[title]

    # 모든 영화에 대해서 해당 영화와의 유사도를 구합니다.
    sim_scores = list(enumerate(cosine_sim[idx]))

    # 유사도에 따라 영화들을 정렬합니다.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # 가장 유사한 10개의 영화를 받아옵니다.
    sim_scores = sim_scores[1:11]

    # 가장 유사한 10개의 영화의 인덱스를 받아옵니다.
    movie_indices = [i[0] for i in sim_scores]

    # 가장 유사한 10개의 영화의 제목을 리턴합니다.
    return df['title'].iloc[movie_indices]

In [72]:
get_recommendations('The Dark Knight Rises')

19286                 The One Percent
4132               The Luzhin Defence
9844                  Come and Get It
2817                  The Fire Within
15225                  Father and Son
4607             An American Rhapsody
9894                  Winter Solstice
9033     Leningrad Cowboys Meet Moses
6651                            Cobra
648                   Billy's Holiday
Name: title, dtype: object