In [1]:
docs = [
    '먹고 싶은 사과', # 문서0
    '먹고 싶은 바나나', # 문서1
    '길고 노란 바나나 바나나', # 문서2
    '저는 과일이 좋아요' # 문서3
]
docs

['먹고 싶은 사과', '먹고 싶은 바나나', '길고 노란 바나나 바나나', '저는 과일이 좋아요']

In [2]:
# CountVectorizer : TF 벡터 생성
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer() # Counter Vectorizer 객체 생성
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [3]:
# 문장을 Counter Vectorizer 형태로 변형
countvect = vect.fit_transform(docs)
countvect

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [4]:
# toarray()를 통해서 문장이 Vector 형태의 값을 얻을 수 있음
# 하지만, 각 인덱스와 컬럼이 무엇을 의미하는지에 대해서는 알 수가 없음
# sparse matrix -> numpy
countvect.toarray()

array([[0, 0, 0, 1, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 2, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 1]])

In [5]:
vect.vocabulary_

{'과일이': 0,
 '길고': 1,
 '노란': 2,
 '먹고': 3,
 '바나나': 4,
 '사과': 5,
 '싶은': 6,
 '저는': 7,
 '좋아요': 8}

In [6]:
# sorted라는 함수를 통해서 단어를 정렬
sorted(vect.vocabulary_)

['과일이', '길고', '노란', '먹고', '바나나', '사과', '싶은', '저는', '좋아요']

In [7]:
import pandas as pd
countvect_df = pd.DataFrame(countvect.toarray(), columns=sorted(vect.vocabulary_))
countvect_df.index = ['문서1', '문서2', '문서3', '문서4']
countvect_df

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
문서1,0,0,0,1,0,1,1,0,0
문서2,0,0,0,1,1,0,1,0,0
문서3,0,1,1,0,2,0,0,0,0
문서4,1,0,0,0,0,0,0,1,1


In [8]:
# 위의 Data Frame 형태의 유사도를 계산
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(countvect_df, countvect_df)

array([[1.        , 0.66666667, 0.        , 0.        ],
       [0.66666667, 1.        , 0.47140452, 0.        ],
       [0.        , 0.47140452, 1.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])

In [9]:
# CountVectorizer -> TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
tfvect = vect.fit(docs)
tfvect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [10]:
tfidv_df = pd.DataFrame(tfvect.transform(docs).toarray(), columns=sorted(vect.vocabulary_))
tfidv_df.index = ['문서1', '문서2', '문서3', '문서4']
tfidv_df

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
문서1,0.0,0.0,0.0,0.526405,0.0,0.667679,0.526405,0.0,0.0
문서2,0.0,0.0,0.0,0.57735,0.57735,0.0,0.57735,0.0,0.0
문서3,0.0,0.47212,0.47212,0.0,0.74445,0.0,0.0,0.0,0.0
문서4,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735


In [11]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(tfidv_df, tfidv_df)

array([[1.        , 0.60784064, 0.        , 0.        ],
       [0.60784064, 1.        , 0.42980824, 0.        ],
       [0.        , 0.42980824, 1.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(max_features=4)
tfvect = vect.fit(docs)

In [13]:
tfidv_df = pd.DataFrame(tfvect.transform(docs).toarray(), columns=sorted(vect.vocabulary_))
tfidv_df.index = '문서1 문서2 문서3 문서4'.split()
tfidv_df

Unnamed: 0,과일이,먹고,바나나,싶은
문서1,0.0,0.707107,0.0,0.707107
문서2,0.0,0.57735,0.57735,0.57735
문서3,0.0,0.0,1.0,0.0
문서4,1.0,0.0,0.0,0.0


In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
import numpy as np

In [16]:
import os
print(os.listdir('/content/drive/MyDrive/Colab_Notebooks/dataset/recommendation/movies/'))

['movies_metadata.csv', 'ratings.csv']


In [17]:
path = '/content/drive/MyDrive/Colab_Notebooks/dataset/recommendation/movies/'

In [18]:
data = pd.read_csv(path + 'movies_metadata.csv', low_memory=False)
data.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [19]:
data['overview'].head() # 이 영화가 어떤 영화인지 설명

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [20]:
# data의 항목 추출
data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [21]:
data['overview'].notnull() # 결측치가 없으면 True, 있으면 False

0        True
1        True
2        True
3        True
4        True
         ... 
45461    True
45462    True
45463    True
45464    True
45465    True
Name: overview, Length: 45466, dtype: bool

In [22]:
# 전처리
# overview의 결측치가 있는 항목은 모두 제거
data = data[data['overview'].notnull()].reset_index(drop=True)
data.shape

(44512, 24)

In [23]:
data = data.loc[0:40000].reset_index(drop=True)

In [24]:
# 불용어 : 유의미하지 않은 단어 토큰을 제거
# htps://wikidocs.net/22530
tfidf = TfidfVectorizer(stop_words='english')

# overwrite에 대해서 tf-idf 수행
tfidf_matrix = tfidf.fit_transform(data['overview'])
print(tfidf_matrix.shape)

(40001, 71257)


In [25]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [26]:
cosine_matrix.shape

(40001, 40001)

In [27]:
np.round(cosine_matrix, 4)

array([[1.    , 0.0151, 0.    , ..., 0.0329, 0.    , 0.    ],
       [0.0151, 1.    , 0.0472, ..., 0.    , 0.0167, 0.059 ],
       [0.    , 0.0472, 1.    , ..., 0.    , 0.    , 0.    ],
       ...,
       [0.0329, 0.    , 0.    , ..., 1.    , 0.    , 0.    ],
       [0.    , 0.0167, 0.    , ..., 0.    , 1.    , 0.    ],
       [0.    , 0.059 , 0.    , ..., 0.    , 0.    , 1.    ]])

In [28]:
# cosine matrix
# 인덱스 0 -> 1 -> 2 ... -> 20001
# 0 인덱스 : 0 번째 id 영화

movie2id = {}
for i, c in enumerate(data['title']): movie2id[i] = c

id2movie = {}
for i, c in movie2id.items(): id2movie[c] = i

In [29]:
idx = id2movie['Toy Story'] # Toy Story : 0번 인덱스
idx

0

In [30]:
cosine_matrix[0]

array([1.        , 0.01511816, 0.        , ..., 0.03293361, 0.        ,
       0.        ])

In [31]:
# Toy Story의 id 호출
idx = id2movie['Toy Story'] # Toy Story : 0번 인덱스
sim_scores = [(i, c) for i, c in enumerate(cosine_matrix[idx]) if i != idx] # 자기 자신(1) 을 제외한 영화들의 유사도 및 인덱스를 추출
sim_scores

[(1, 0.015118161333979299),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.038730470991734656),
 (18, 0.0),
 (19, 0.0),
 (20, 0.009784153884570742),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.018684767445681365),
 (33, 0.0),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.006335701108570893),
 (42, 0.0),
 (43, 0.0),
 (44, 0.008937728507784504),
 (45, 0.0),
 (46, 0.0),
 (47, 0.0),
 (48, 0.01308661028572924),
 (49, 0.009287967771036805),
 (50, 0.010704384446012585),
 (51, 0.0),
 (52, 0.0),
 (53, 0.019798557304722943),
 (54, 0.0),
 (55, 0.025243182246308275),
 (56, 0.020735349318797777),
 (57, 0.0),
 (58, 0.033784327021970824),
 (59, 0.0),
 (60, 0.0),
 (61, 0.007805145165360473),
 (62, 0.0),
 (63, 0.009620196602825655),

In [32]:
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) # 유사도(x[1])가 높은 순서대로 정렬
sim_scores[0:10] # 상위 10개의 인덱스와 유사도를 추출

[(15282, 0.5302305777345263),
 (2979, 0.47040869969461174),
 (10271, 0.2761173868151379),
 (24316, 0.2681710907595272),
 (23646, 0.2367941443417645),
 (28893, 0.2216127763454472),
 (37778, 0.2168826902243731),
 (8303, 0.20024011436825886),
 (26945, 0.18281578621604597),
 (1058, 0.17997744529873472)]

Toy Story overview 기준으로 TF-IDF 정렬한 결과

In [33]:
sim_scores = [(movie2id[i], score) for i, score in sim_scores[0:10]]
sim_scores

[('Toy Story 3', 0.5302305777345263),
 ('Toy Story 2', 0.47040869969461174),
 ('The 40 Year Old Virgin', 0.2761173868151379),
 ('Small Fry', 0.2681710907595272),
 ("Andy Hardy's Blonde Trouble", 0.2367941443417645),
 ('Hot Splash', 0.2216127763454472),
 ('Superstar: The Life and Times of Andy Warhol', 0.2168826902243731),
 ('The Champ', 0.20024011436825886),
 ('Life Begins for Andy Hardy', 0.18281578621604597),
 ('Rebel Without a Cause', 0.17997744529873472)]