# Movielens 영화 추천 실습

## 데이터 준비와 전처리

In [1]:
import os
import pandas as pd
rating_file_path = os.getenv('HOME')+'/aiffel/recommendata_iu/data/ml-1m/ratings.dat'

In [2]:
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

In [3]:
ratings=pd.read_csv(rating_file_path, sep="::", names=ratings_cols, engine='python')

In [4]:
original_data_size = len(ratings)

In [5]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
# 3점 이상만 남김
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'original_data_size:{original_data_size}, filtered_data_size:{filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / original_data_size:.2%}')

original_data_size:1000209, filtered_data_size:836478
Ratio of Remaining Data is 83.63%


In [7]:
# rating column의 이름을 count로 변경
ratings.rename(columns={'rating':'count'}, inplace=True)

In [8]:
ratings.keys()

Index(['user_id', 'movie_id', 'count', 'timestamp'], dtype='object')

In [9]:
# 영화제목을 보기위해 메타데이터를 읽어옴
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 

In [10]:
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
ratings.head()

Unnamed: 0,user_id,movie_id,count,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


* 유니크한 영화갯수

In [12]:
ratings.movie_id.nunique()

3628

* 유니크한 사용자수

In [13]:
ratings.user_id.nunique()

6039

* 가장 인기있는 영화 30개(인기순) 둘중 어느거??

In [14]:
ratings.groupby('movie_id')['count'].sum()

movie_id
1       8475
2       1986
3       1208
4        349
5        754
        ... 
3948    2949
3949    1212
3950     186
3951     149
3952    1406
Name: count, Length: 3628, dtype: int64

In [15]:
ratings.groupby('movie_id')['user_id'].count()

movie_id
1       2000
2        551
3        339
4        102
5        214
        ... 
3948     752
3949     280
3950      47
3951      36
3952     353
Name: user_id, Length: 3628, dtype: int64

* 내가 좋아하는 영화 5개

In [16]:
movies.title

0                         Toy Story (1995)
1                           Jumanji (1995)
2                  Grumpier Old Men (1995)
3                 Waiting to Exhale (1995)
4       Father of the Bride Part II (1995)
                       ...                
3878               Meet the Parents (2000)
3879            Requiem for a Dream (2000)
3880                      Tigerland (2000)
3881               Two Family House (2000)
3882                 Contender, The (2000)
Name: title, Length: 3883, dtype: object

In [17]:
import re

In [18]:
my_movie = ['Gladiator','Die Hard','Mask','Terminator 2', 'Home Alone']

In [19]:
aa =movies.title.tolist()

In [20]:
re.findall('Gladiator',str(aa))

['Gladiator']

In [28]:
for i in my_movie:
    if re.findall(i,str(aa)):
        print(i)
    else:
        print('can\'t find')

Gladiator
Die Hard
Mask
Terminator 2
Home Alone


In [22]:
if 'Toy Story' in movies.title:
    print('find')
else:
    print('can\'t find')

can't find


In [23]:
movies['title'] = movies.title.str.lower()

* CSR MAtrix 만들기

In [25]:
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,toy story (1995),Animation|Children's|Comedy
1,2,jumanji (1995),Adventure|Children's|Fantasy
2,3,grumpier old men (1995),Comedy|Romance
3,4,waiting to exhale (1995),Comedy|Drama
4,5,father of the bride part ii (1995),Comedy


In [26]:
ratings.head()

Unnamed: 0,user_id,movie_id,count,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [33]:
movies['gladiator' in movies['title']]

KeyError: False