<a href="https://colab.research.google.com/github/JakeOh/202103_itw_pyda_wkd/blob/main/da11_movielens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 데이터 프레임 merge

In [4]:
# Toy "employee" table used to demonstrate the different merge modes.
emp = pd.DataFrame(
    data={
        'empno': np.arange(1, 6),
        'empname': list('abcde'),
        'deptno': [10, 20, 30, 10, 50],
    }
)
emp

Unnamed: 0,empno,empname,deptno
0,1,a,10
1,2,b,20
2,3,c,30
3,4,d,10
4,5,e,50


In [3]:
# Toy "department" table; deptno runs 10, 20, 30, 40.
dept = pd.DataFrame(
    data={
        'deptno': np.arange(10, 50, 10),
        'deptname': ['HR', 'IT', 'Sales', 'Account'],
    }
)
dept

Unnamed: 0,deptno,deptname
0,10,HR
1,20,IT
2,30,Sales
3,40,Account


In [5]:
# The `how` parameter accepts 'inner', 'left', 'right' or 'outer';
# when it is omitted, as here, the default is 'inner'.
emp.merge(dept)

Unnamed: 0,empno,empname,deptno,deptname
0,1,a,10,HR
1,4,d,10,HR
2,2,b,20,IT
3,3,c,30,Sales


In [6]:
# left (outer) join: every row of emp is kept
emp.merge(dept, how='left')

Unnamed: 0,empno,empname,deptno,deptname
0,1,a,10,HR
1,2,b,20,IT
2,3,c,30,Sales
3,4,d,10,HR
4,5,e,50,


In [7]:
# right (outer) join: every row of dept is kept
emp.merge(dept, how='right')

Unnamed: 0,empno,empname,deptno,deptname
0,1.0,a,10,HR
1,4.0,d,10,HR
2,2.0,b,20,IT
3,3.0,c,30,Sales
4,,,40,Account


In [8]:
# full (outer) join: rows of both emp and dept are kept
emp.merge(dept, how='outer')

Unnamed: 0,empno,empname,deptno,deptname
0,1.0,a,10,HR
1,4.0,d,10,HR
2,2.0,b,20,IT
3,3.0,c,30,Sales
4,5.0,e,50,
5,,,40,Account


# Movielens 데이터 셋

In [9]:
# Locations of the three MovieLens data files (movies, ratings, users).
movies_file = ('https://github.com/wesm/pydata-book/raw/2nd-edition'
               '/datasets/movielens/movies.dat')
ratings_file = ('https://github.com/wesm/pydata-book/raw/2nd-edition'
                '/datasets/movielens/ratings.dat')
users_file = ('https://github.com/wesm/pydata-book/raw/2nd-edition'
              '/datasets/movielens/users.dat')

In [10]:
# Column names to assign when reading each file.
movies_col = [
    'movie_id',
    'title',
    'genres',
]
ratings_col = [
    'user_id',
    'movie_id',
    'rating',
    'timestamp',
]
users_col = [
    'user_id',
    'gender',
    'age',
    'occupation',
    'zipcode',
]

In [14]:
# Read the movies file (from its URL).
#   sep='::'         -> values are separated by '::' rather than by commas
#   engine='python'  -> silences the separator-related parser warning
#   header=None      -> the file has no header row; data starts on line 1
#   names=...        -> column names to assign
#   encoding=...     -> text encoding of the file
# cp1252: text files saved on English-language MS Windows
# cp949:  text files saved on Korean-language MS Windows
movies = pd.read_csv(
    movies_file,
    sep='::',
    engine='python',
    header=None,
    names=movies_col,
    encoding='cp1252',
)
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [16]:
# Read the ratings file: '::'-separated, no header row, columns named
# from ratings_col.
ratings = pd.read_csv(
    ratings_file,
    names=ratings_col,
    header=None,
    sep='::',
    engine='python',
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [17]:
# Read the users file: '::'-separated, no header row, columns named
# from users_col.
users = pd.read_csv(
    users_file,
    names=users_col,
    header=None,
    sep='::',
    engine='python',
)
users.head()

Unnamed: 0,user_id,gender,age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


# 데이터 셋 merge

In [18]:
# Merge ratings with movies so every rating row carries the movie's
# title and genres.
# The join key is spelled out instead of relying on the default of
# "every column the two frames have in common", and validate= makes the
# merge raise pandas.errors.MergeError if movie_id is not unique in
# `movies` (a duplicate would silently multiply rating rows).
ratings_movies = pd.merge(left=ratings, right=movies,
                          on='movie_id', how='inner',
                          validate='many_to_one')
ratings_movies

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,Modulations (1998),Documentary
1000205,5675,2703,3,976029116,Broken Vessels (1998),Drama
1000206,5780,2845,1,958153068,White Boys (1999),Drama
1000207,5851,3607,5,957756608,One Little Indian (1973),Comedy|Drama|Western


In [19]:
# Merge ratings_movies with users so every rating row also carries the
# rater's gender, age, occupation and zipcode.
# The join key is spelled out instead of relying on the default of
# "every column the two frames have in common", and validate= makes the
# merge raise pandas.errors.MergeError if user_id is not unique in
# `users` (a duplicate would silently multiply rating rows).
df = pd.merge(left=ratings_movies, right=users,
              on='user_id', how='inner',
              validate='many_to_one')
df

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genres,gender,age,occupation,zipcode
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067
...,...,...,...,...,...,...,...,...,...,...
1000204,4211,3791,2,965319075,Footloose (1984),Drama,M,45,5,77662
1000205,4211,3806,3,965319138,MacKenna's Gold (1969),Western,M,45,5,77662
1000206,4211,3840,4,965319197,Pumpkinhead (1988),Horror,M,45,5,77662
1000207,4211,3766,2,965319138,Missing in Action (1984),Action|War,M,45,5,77662


In [20]:
# Print a summary of the merged frame: row count, per-column non-null
# counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000209 entries, 0 to 1000208
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   user_id     1000209 non-null  int64 
 1   movie_id    1000209 non-null  int64 
 2   rating      1000209 non-null  int64 
 3   timestamp   1000209 non-null  int64 
 4   title       1000209 non-null  object
 5   genres      1000209 non-null  object
 6   gender      1000209 non-null  object
 7   age         1000209 non-null  int64 
 8   occupation  1000209 non-null  int64 
 9   zipcode     1000209 non-null  object
dtypes: int64(6), object(4)
memory usage: 83.9+ MB


# 성별 영화 선호도 차이

In [21]:
# Mean rating for every (title, gender) combination.
df['rating'].groupby([df['title'], df['gender']]).mean()

title                                       gender
$1,000,000 Duck (1971)                      F         3.375000
                                            M         2.761905
'Night Mother (1986)                        F         3.388889
                                            M         3.352941
'Til There Was You (1997)                   F         2.675676
                                                        ...   
Zero Kelvin (Kjærlighetens kjøtere) (1995)  M         3.500000
Zeus and Roxanne (1997)                     F         2.777778
                                            M         2.357143
eXistenZ (1999)                             F         3.098592
                                            M         3.289086
Name: rating, Length: 7152, dtype: float64

In [23]:
# One row per title, one column per gender; each cell is the mean rating
# ('mean' is pivot_table's default aggregation, written out here).
rating_by_gender = pd.pivot_table(
    data=df,
    index='title',
    columns='gender',
    values='rating',
    aggfunc='mean',
)
rating_by_gender

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375000,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024
...,...,...
"Zed & Two Noughts, A (1985)",3.500000,3.380952
Zero Effect (1998),3.864407,3.723140
Zero Kelvin (Kjærlighetens kjøtere) (1995),,3.500000
Zeus and Roxanne (1997),2.777778,2.357143


In [26]:
# Ten movies with the highest mean rating among female raters.
rating_by_gender.sort_values('F', ascending=False).iloc[:10]

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Clean Slate (Coup de Torchon) (1981),5.0,3.857143
"Ballad of Narayama, The (Narayama Bushiko) (1958)",5.0,3.428571
Raw Deal (1948),5.0,3.307692
Bittersweet Motel (2000),5.0,
Skipped Parts (2000),5.0,4.0
Lamerica (1994),5.0,4.666667
"Gambler, The (A Játékos) (1997)",5.0,3.166667
"Brother, Can You Spare a Dime? (1975)",5.0,3.642857
Ayn Rand: A Sense of Life (1997),5.0,4.0
24 7: Twenty Four Seven (1997),5.0,3.75


In [27]:
# Ten movies with the highest mean rating among male raters.
rating_by_gender.sort_values('M', ascending=False).iloc[:10]

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Schlafes Bruder (Brother of Sleep) (1995),,5.0
Small Wonders (1996),3.333333,5.0
"Gate of Heavenly Peace, The (1995)",5.0,5.0
"Baby, The (1973)",,5.0
Ulysses (Ulisse) (1954),,5.0
Dangerous Game (1993),4.0,5.0
Angela (1995),3.0,5.0
"Bells, The (1926)",4.0,5.0
Smashing Time (1967),,5.0
Follow the Bitch (1998),,5.0


In [34]:
# Number of ratings each title received, most-rated title first.
title_counts = df.loc[:, 'title'].value_counts()
title_counts

American Beauty (1999)                                   3428
Star Wars: Episode IV - A New Hope (1977)                2991
Star Wars: Episode V - The Empire Strikes Back (1980)    2990
Star Wars: Episode VI - Return of the Jedi (1983)        2883
Jurassic Park (1993)                                     2672
                                                         ... 
Never Met Picasso (1996)                                    1
Tinseltown (1998)                                           1
Night Tide (1961)                                           1
Bat Whispers, The (1930)                                    1
Schlafes Bruder (Brother of Sleep) (1995)                   1
Name: title, Length: 3706, dtype: int64

남/녀 평점 평균을 계산할 때, 일정 숫자 이상의 평가를 받은 영화들을 대상으로 평균을 계산하는 것이 좋을 것 같다.

In [36]:
# Summary statistics (count, mean, std, quartiles, min/max) of the
# ratings-per-title counts.
title_counts.describe()

count    3706.000000
mean      269.889099
std       384.047838
min         1.000000
25%        33.000000
50%       123.500000
75%       350.000000
max      3428.000000
Name: title, dtype: float64

In [40]:
# Only consider movies that received at least 150 ratings:
# collect the titles whose rating count reaches that threshold.
indexer = title_counts.loc[title_counts.ge(150)].index
indexer

Index(['American Beauty (1999)', 'Star Wars: Episode IV - A New Hope (1977)',
       'Star Wars: Episode V - The Empire Strikes Back (1980)',
       'Star Wars: Episode VI - Return of the Jedi (1983)',
       'Jurassic Park (1993)', 'Saving Private Ryan (1998)',
       'Terminator 2: Judgment Day (1991)', 'Matrix, The (1999)',
       'Back to the Future (1985)', 'Silence of the Lambs, The (1991)',
       ...
       'Specialist, The (1994)', 'Twelfth Night (1996)',
       'Cowboy Way, The (1994)', 'Program, The (1993)',
       'Golden Voyage of Sinbad, The (1974)',
       'Love and Death on Long Island (1997)', 'In the Army Now (1994)',
       'Tales from the Crypt Presents: Bordello of Blood (1996)',
       'Asphalt Jungle, The (1950)',
       'Police Academy 5: Assignment: Miami Beach (1988)'],
      dtype='object', length=1683)

In [52]:
# Keep only the rows of the per-gender rating table whose title is in
# `indexer`.
rating_150 = rating_by_gender.loc[indexer, :]
rating_150

gender,F,M
American Beauty (1999),4.238901,4.347301
Star Wars: Episode IV - A New Hope (1977),4.302937,4.495307
Star Wars: Episode V - The Empire Strikes Back (1980),4.106481,4.344577
Star Wars: Episode VI - Return of the Jedi (1983),3.865237,4.069058
Jurassic Park (1993),3.579407,3.814197
...,...,...
Love and Death on Long Island (1997),3.116279,3.555556
In the Army Now (1994),2.384615,2.192000
Tales from the Crypt Presents: Bordello of Blood (1996),2.727273,2.565891
"Asphalt Jungle, The (1950)",3.571429,4.008130


In [53]:
# Ten titles in rating_150 with the highest mean female rating.
rating_150.sort_values('F', ascending=False).iloc[:10]

gender,F,M
"Close Shave, A (1995)",4.644444,4.473795
"Wrong Trousers, The (1993)",4.588235,4.478261
"General, The (1927)",4.575758,4.32948
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.57265,4.464589
Wallace & Gromit: The Best of Aardman Animation (1996),4.563107,4.385075
Schindler's List (1993),4.562602,4.491415
"Grand Illusion (Grande illusion, La) (1937)",4.560976,4.266129
"Shawshank Redemption, The (1994)",4.539075,4.560625
"Grand Day Out, A (1992)",4.537879,4.293255
To Kill a Mockingbird (1962),4.536667,4.372611


In [54]:
# Ten titles in rating_150 with the highest mean male rating.
rating_150.sort_values('M', ascending=False).iloc[:10]

gender,F,M
"Godfather, The (1972)",4.3147,4.583333
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),4.481132,4.576628
"Shawshank Redemption, The (1994)",4.539075,4.560625
Raiders of the Lost Ark (1981),4.332168,4.520597
"Usual Suspects, The (1995)",4.513317,4.518248
Star Wars: Episode IV - A New Hope (1977),4.302937,4.495307
Schindler's List (1993),4.562602,4.491415
Paths of Glory (1957),4.392857,4.485149
"Wrong Trousers, The (1993)",4.588235,4.478261
"Close Shave, A (1995)",4.644444,4.473795


In [55]:
# Add a derived column to rating_150:
#   diff = mean female rating - mean male rating
# assign() returns a new frame with the extra column, so nothing is
# written in place into a frame that came out of a .loc selection, and
# re-running this cell gives the same result.
rating_150 = rating_150.assign(diff=rating_150['F'] - rating_150['M'])
rating_150

gender,F,M,diff
American Beauty (1999),4.238901,4.347301,-0.108400
Star Wars: Episode IV - A New Hope (1977),4.302937,4.495307,-0.192371
Star Wars: Episode V - The Empire Strikes Back (1980),4.106481,4.344577,-0.238096
Star Wars: Episode VI - Return of the Jedi (1983),3.865237,4.069058,-0.203821
Jurassic Park (1993),3.579407,3.814197,-0.234791
...,...,...,...
Love and Death on Long Island (1997),3.116279,3.555556,-0.439276
In the Army Now (1994),2.384615,2.192000,0.192615
Tales from the Crypt Presents: Bordello of Blood (1996),2.727273,2.565891,0.161381
"Asphalt Jungle, The (1950)",3.571429,4.008130,-0.436702


In [57]:
# Movies with a large gender gap in ratings. Sorting `diff` in ascending
# order puts the smallest (most negative) values first, so these are the
# ten rows with the lowest diff.
rating_150.sort_values('diff').head(10)

gender,F,M,diff
Lifeforce (1985),2.25,2.994152,-0.744152
Quest for Fire (1981),2.578947,3.309677,-0.73073
"Good, The Bad and The Ugly, The (1966)",3.494949,4.2213,-0.726351
No Escape (1994),2.3,2.994048,-0.694048
"Kentucky Fried Movie, The (1977)",2.878788,3.555147,-0.676359
Tora! Tora! Tora! (1970),3.090909,3.737705,-0.646796
Up in Smoke (1978),2.944444,3.585227,-0.640783
Dumb & Dumber (1994),2.697987,3.336595,-0.638608
Friday the 13th: The Final Chapter (1984),1.636364,2.258503,-0.62214
"Longest Day, The (1962)",3.411765,4.031447,-0.619682


150회 이상 평가를 받은 영화들 중에서, 여성들의 평점이 높은 영화 50개의 장르들

In [62]:
# Titles of the 50 rows of rating_150 with the highest mean female rating.
female_top50 = rating_150.sort_values(by='F', ascending=False).index[:50]
female_top50

Index(['Close Shave, A (1995)', 'Wrong Trousers, The (1993)',
       'General, The (1927)', 'Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)',
       'Wallace & Gromit: The Best of Aardman Animation (1996)',
       'Schindler's List (1993)',
       'Grand Illusion (Grande illusion, La) (1937)',
       'Shawshank Redemption, The (1994)', 'Grand Day Out, A (1992)',
       'To Kill a Mockingbird (1962)', 'Creature Comforts (1990)',
       'Usual Suspects, The (1995)', 'It Happened One Night (1934)',
       '400 Blows, The (Les Quatre cents coups) (1959)', 'Rear Window (1954)',
       'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)',
       'Sixth Sense, The (1999)', 'Third Man, The (1949)',
       'Some Like It Hot (1959)', 'City Lights (1931)', 'Notorious (1946)',
       'Philadelphia Story, The (1940)', 'Yojimbo (1961)',
       'Life Is Beautiful (La Vita è bella) (1997)',
       'Strangers on a Train (1951)',
       'Bicycle Thief, The (Ladri di biciclette) (1948)',


In [67]:
# Look up the movie records (movie_id, genres) for those titles, keeping
# the order of female_top50.
female_movies = (
    movies
    .set_index('title')
    .loc[female_top50, :]
)
female_movies.head()

Unnamed: 0,movie_id,genres
"Close Shave, A (1995)",745,Animation|Comedy|Thriller
"Wrong Trousers, The (1993)",1148,Animation|Comedy
"General, The (1927)",3022,Comedy
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),922,Film-Noir
Wallace & Gromit: The Best of Aardman Animation (1996),720,Animation


In [70]:
# Flat list of every genre label carried by the selected movies.
# Each 'genres' value is a '|'-separated string, e.g.
# 'Animation|Comedy|Thriller' -> 'Animation', 'Comedy', 'Thriller'.
# str.split() turns each string into a list and explode() emits one
# element per list item (original order preserved), replacing the manual
# loop-and-extend.
female_genres = female_movies['genres'].str.split('|').explode().tolist()

female_genres

['Animation',
 'Comedy',
 'Thriller',
 'Animation',
 'Comedy',
 'Comedy',
 'Film-Noir',
 'Animation',
 'Drama',
 'War',
 'Drama',
 'War',
 'Drama',
 'Animation',
 'Comedy',
 'Drama',
 'Animation',
 'Comedy',
 'Crime',
 'Thriller',
 'Comedy',
 'Drama',
 'Mystery',
 'Thriller',
 'Action',
 'Drama',
 'Thriller',
 'Mystery',
 'Thriller',
 'Comedy',
 'Crime',
 'Comedy',
 'Drama',
 'Romance',
 'Film-Noir',
 'Romance',
 'Thriller',
 'Comedy',
 'Romance',
 'Comedy',
 'Drama',
 'Western',
 'Comedy',
 'Drama',
 'Film-Noir',
 'Thriller',
 'Drama',
 'Drama',
 'Documentary',
 'Drama',
 'War',
 'Drama',
 'War',
 'Sci-Fi',
 'War',
 'Drama',
 'War',
 'Drama',
 'Thriller',
 'Drama',
 'Drama',
 'Film-Noir',
 'Thriller',
 'Adventure',
 "Children's",
 'Drama',
 'Musical',
 'Comedy',
 'Romance',
 'Film-Noir',
 'Thriller',
 'Drama',
 'Drama',
 'Mystery',
 'Thriller',
 'Film-Noir',
 'Mystery',
 'Action',
 'Adventure',
 'Comedy',
 'Romance',
 'Musical',
 'Romance',
 'Comedy',
 'Drama',
 'Musical',
 'Romance',

In [71]:
# Count how often each genre occurs, most frequent first.
s = pd.Series(data=female_genres)
s.value_counts(sort=True, ascending=False)

Drama          22
Comedy         14
Thriller       11
Romance         7
Film-Noir       6
War             6
Animation       5
Mystery         4
Action          3
Adventure       3
Musical         3
Documentary     2
Crime           2
Sci-Fi          1
Western         1
Children's      1
dtype: int64

150회 이상 평가를 받은 영화들 중에서, 남성들의 평점이 높은 영화 50개의 장르들