In [6]:
import pandas as pd

# Location of the MovieLens 10M movie metadata file; fields are separated by '::'.
movie_data_set = './dataset/ml-10M100K/movies.dat'

# Column names are supplied explicitly because read_csv is given no header row to use.
m_cols = ['movie_id', 'title', 'genre']
# A multi-character separator such as '::' requires the (slower) python parsing engine.
movies = pd.read_csv(movie_data_set, names=m_cols, sep='::', encoding='latin-1', engine='python')

# Turn the '|'-delimited genre string into a list of genre names.
# The vectorized .str accessor keeps a missing genre as NaN, whereas a
# per-row `x.split("|")` lambda would raise AttributeError on it.
movies['genre'] = movies['genre'].str.split('|')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [8]:
# Tag file location and the names of its four '::'-separated fields.
movie_tag_data_set = './dataset/ml-10M100K/tags.dat'
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']

# Read the tags and lower-case them in a single chain, so that tags that
# differ only in letter case become identical strings.
user_tagged_movies = (
    pd.read_csv(movie_tag_data_set, sep='::', names=t_cols, engine='python')
    .assign(tag=lambda frame: frame['tag'].str.lower())
)

user_tagged_movies.head()


Unnamed: 0,user_id,movie_id,tag,timestamp
0,15,4973,excellent!,1215184630
1,20,1747,politics,1188263867
2,20,1747,satire,1188263867
3,20,2424,chick flick 212,1188263835
4,20,2424,hanks,1188263835


In [9]:
# Summary of the tag data. nunique(dropna=False) counts a missing value as
# one distinct entry, which matches len(Series.unique()).
# Line 1: number of distinct tags.
print(f'태그 종류 = {user_tagged_movies["tag"].nunique(dropna=False)}')
# Line 2: total number of tag records (rows).
print(f'태그 레코드 수 = {user_tagged_movies.shape[0]}')
# Line 3: number of movies that have at least one tag record.
print(f'태그가 붙어 있는 영화 수 = {user_tagged_movies["movie_id"].nunique(dropna=False)}')

태그 종류 = 15241
태그 레코드 수 = 95580
태그가 붙어 있는 영화 수 = 7601


In [10]:
# Collect every tag applied to a movie into one list per movie_id.
# Duplicates are kept: the same tag appears once per tag record.
movie_tags = user_tagged_movies.groupby('movie_id').agg({'tag': list})

# Attach the tag lists to the movie metadata. The left join keeps movies
# that have no tag records; their 'tag' value is NaN.
# Any 'tag' column left over from an earlier run of this cell is dropped
# first, so re-running the cell does not produce 'tag_x'/'tag_y' columns.
movies = movies.drop(columns='tag', errors='ignore').merge(movie_tags, on='movie_id', how='left')

movies.head()

Unnamed: 0,movie_id,title,genre,tag
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[pixar, pixar, pixar, animation, pixar, animat..."
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]","[for children, game, animals, joe johnston, ro..."
2,3,Grumpier Old Men (1995),"[Comedy, Romance]","[funniest movies, comedinha de velhinhos engra..."
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",[girl movie]
4,5,Father of the Bride Part II (1995),[Comedy],"[steve martin, pregnancy, remake, steve martin..."


### 평갓값 데이터
MovieLens에서 가져온 평갓값 데이터는 약 1,000만 건에 이르므로, 전체를 그대로 사용하면 알고리즘에 따라 계산에 몇 시간에서 며칠까지 걸리기도 합니다.

In [11]:
# Ratings file of the MovieLens 10M dataset; fields are separated by '::'.
movie_rating = './dataset/ml-10M100K/ratings.dat'
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

# The multi-character separator requires the python parsing engine.
ratings = pd.read_csv(
    movie_rating,
    sep='::',
    names=r_cols,
    engine='python',
)

ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392


In [12]:
# Keep only the ratings of the 1000 smallest user ids so that experiments
# on this subset run quickly.
valid_user_ids = sorted(ratings['user_id'].unique())[:1000]
ratings = ratings.loc[ratings['user_id'].isin(valid_user_ids)]

# Join each rating with its movie's metadata (title, genre, tag) on movie_id.
movielens = pd.merge(ratings, movies, on='movie_id')
movielens.head()


Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag
0,1,122,5.0,838985046,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."
1,139,122,3.0,974302621,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."
2,149,122,2.5,1112342322,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."
3,182,122,3.0,943458784,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."
4,215,122,4.5,1102493547,Boomerang (1992),"[Comedy, Romance]","[dating, nudity (topless - brief), can't remem..."
