# 영화 평점 분석 실습

In [1]:
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# 1. Load the user table.
# The '::' delimiter is longer than one character, so the Python parsing
# engine is requested explicitly (pandas warns about this otherwise).
users = pd.read_csv(
    'data/movielens/users.dat',
    sep='::',
    engine='python',
    names=['사용자아이디', '성별', '연령', '직업', '지역'],
)
users.head()

Unnamed: 0,사용자아이디,성별,연령,직업,지역
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [3]:
# 2. Load the rating table.
# Source file: data/movielens/ratings.dat
# Columns: user id, movie id, rating, timestamp (Korean labels below).
ratings = pd.read_csv(
    'data/movielens/ratings.dat',
    sep='::',
    engine='python',
    names=['사용자아이디', '영화아이디', '평점', '타임스탬프'],
)
ratings.head()

Unnamed: 0,사용자아이디,영화아이디,평점,타임스탬프
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
# 3. Load the movie table.
# Source file: data/movielens/movies.dat
# Columns: movie id, title, genres (Korean labels below).
# The file is decoded as latin-1 (presumably because some titles contain
# non-ASCII characters -- confirm against the data file).
movies = pd.read_csv(
    'data/movielens/movies.dat',
    sep='::',
    engine='python',
    encoding='latin-1',
    names=['영화아이디', '영화제목', '장르'],
)
movies.head()

Unnamed: 0,영화아이디,영화제목,장르
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Report the number of rows in each of the three tables.
print('영화수: {}'.format(len(movies)))
print('사용자수: {}'.format(len(users)))
print('평점수: {}'.format(len(ratings)))

영화수: 3883
사용자수: 6040
평점수: 1000209


In [6]:
# 4. Combine the three tables into a single DataFrame.
# First step: attach each rater's profile to their ratings. The user-id
# column is the only column the two tables share, so it is named explicitly.
data = ratings.merge(users, on='사용자아이디')

In [7]:
# Second step: add title and genre to every rating row via the movie id.
data = data.merge(movies, on='영화아이디')

In [8]:
# Preview the first ten rows of the merged table.
data.iloc[:10]

Unnamed: 0,사용자아이디,영화아이디,평점,타임스탬프,성별,연령,직업,지역,영화제목,장르
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama
5,18,1193,4,978156168,F,18,3,95825,One Flew Over the Cuckoo's Nest (1975),Drama
6,19,1193,5,982730936,M,1,10,48073,One Flew Over the Cuckoo's Nest (1975),Drama
7,24,1193,5,978136709,F,25,7,10023,One Flew Over the Cuckoo's Nest (1975),Drama
8,28,1193,3,978125194,F,25,1,14607,One Flew Over the Cuckoo's Nest (1975),Drama
9,33,1193,5,978557765,M,45,3,55421,One Flew Over the Cuckoo's Nest (1975),Drama


In [9]:
#5. 영화별 평점 구하기

In [10]:
# Group the merged rows by movie title; aggregations are applied in later cells.
영화별평점 = data.groupby(by='영화제목')

In [11]:
# Mean rating per movie title, highest first.
# The rating column is selected *before* aggregating: taking the mean of the
# whole grouped frame would also try to average text columns such as 성별 and
# 장르, which raises TypeError on recent pandas versions and wastes work on
# older ones. The resulting Series is the same either way.
영화별평점['평점'].mean().sort_values(ascending=False)

영화제목
Gate of Heavenly Peace, The (1995)                                       5.000000
Lured (1947)                                                             5.000000
Ulysses (Ulisse) (1954)                                                  5.000000
Smashing Time (1967)                                                     5.000000
Follow the Bitch (1998)                                                  5.000000
Song of Freedom (1936)                                                   5.000000
Bittersweet Motel (2000)                                                 5.000000
Baby, The (1973)                                                         5.000000
One Little Indian (1973)                                                 5.000000
Schlafes Bruder (Brother of Sleep) (1995)                                5.000000
I Am Cuba (Soy Cuba/Ya Kuba) (1964)                                      4.800000
Lamerica (1994)                                                          4.750000
Apple, The 

In [12]:
# Per-title rating statistics: number of ratings and their mean.
영화평점 = 영화별평점['평점'].aggregate(['count', 'mean'])

In [13]:
# Keep titles with at least 500 ratings, then rank them by mean rating.
영화평점.loc[영화평점['count'] >= 500].sort_values('mean', ascending=False)

Unnamed: 0_level_0,count,mean
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),628,4.560510
"Shawshank Redemption, The (1994)",2227,4.554558
"Godfather, The (1972)",2223,4.524966
"Close Shave, A (1995)",657,4.520548
"Usual Suspects, The (1995)",1783,4.517106
Schindler's List (1993),2304,4.510417
"Wrong Trousers, The (1993)",882,4.507937
Raiders of the Lost Ark (1981),2514,4.477725
Rear Window (1954),1050,4.476190
Star Wars: Episode IV - A New Hope (1977),2991,4.453694


In [15]:
# Among titles whose mean rating is at least 4.3, show the ten that have
# received the most ratings.
영화평점.loc[영화평점['mean'] >= 4.3].sort_values('count', ascending=False).head(10)

Unnamed: 0_level_0,count,mean
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
American Beauty (1999),3428,4.317386
Star Wars: Episode IV - A New Hope (1977),2991,4.453694
Saving Private Ryan (1998),2653,4.337354
"Matrix, The (1999)",2590,4.31583
"Silence of the Lambs, The (1991)",2578,4.351823
Raiders of the Lost Ark (1981),2514,4.477725
"Sixth Sense, The (1999)",2459,4.406263
"Princess Bride, The (1987)",2318,4.30371
Schindler's List (1993),2304,4.510417
"Shawshank Redemption, The (1994)",2227,4.554558


In [16]:
# Group by (movie title, gender) so statistics can be compared across genders.
영화별_성별 = data.groupby(by=['영화제목', '성별'])

In [17]:
# Rating count and mean for every (title, gender) pair; the result carries a
# two-level row index (영화제목, 성별).
영화별_성별_평점 = 영화별_성별['평점'].aggregate(['count', 'mean'])

In [18]:
# Preview the first ten (title, gender) rows.
영화별_성별_평점.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean
영화제목,성별,Unnamed: 2_level_1,Unnamed: 3_level_1
"$1,000,000 Duck (1971)",F,16,3.375
"$1,000,000 Duck (1971)",M,21,2.761905
'Night Mother (1986),F,36,3.388889
'Night Mother (1986),M,34,3.352941
'Til There Was You (1997),F,37,2.675676
'Til There Was You (1997),M,15,2.733333
"'burbs, The (1989)",F,92,2.793478
"'burbs, The (1989)",M,211,2.962085
...And Justice for All (1979),F,35,3.828571
...And Justice for All (1979),M,164,3.689024


## 계층적 색인 (다중 색인)
### 하나의 축에 2개 이상의 색인(인덱스) 단계를 지정할 수 있다.
### 차원이 높은 (고차원) 데이터를 낮은 차원의 형식으로 다룰 수 있게 해 주는 기능이다.

In [19]:
# Show only the rows for the movie "$1,000,000 Duck (1971)".
# A single label given to loc is matched against the outermost index level
# (영화제목); the result is indexed by the remaining level (성별).
영화별_성별_평점.loc["$1,000,000 Duck (1971)"]

Unnamed: 0_level_0,count,mean
성별,Unnamed: 1_level_1,Unnamed: 2_level_1
F,16,3.375
M,21,2.761905


In [20]:
# Show the rating count and mean rating given by female (F) raters to
# "$1,000,000 Duck (1971)".
# A tuple key addresses one label per index level: (영화제목, 성별).
영화별_성별_평점.loc[("$1,000,000 Duck (1971)", "F")]

count    16.000
mean      3.375
Name: ($1,000,000 Duck (1971), F), dtype: float64

In [21]:
# Show only the mean rating given by female (F) raters to
# "$1,000,000 Duck (1971)": the tuple selects the row, "mean" selects the column.
영화별_성별_평점.loc[("$1,000,000 Duck (1971)", "F"), "mean"]
# Equivalent to the line above --> 영화별_성별_평점["mean"].loc[("$1,000,000 Duck (1971)", "F")]

3.375

In [22]:
# Show the rating statistics of female (F) raters for every movie.
# A bare label passed to loc is matched against the outermost index level
# (the movie title here), so it cannot pick rows by 성별 alone.
# xs() takes a `level` argument and selects on that inner level directly;
# the selected level is dropped from the index of the result.
영화별_성별_평점.xs("F", level = "성별")

Unnamed: 0_level_0,count,mean
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",16,3.375000
'Night Mother (1986),36,3.388889
'Til There Was You (1997),37,2.675676
"'burbs, The (1989)",92,2.793478
...And Justice for All (1979),35,3.828571
1-900 (1994),1,2.000000
10 Things I Hate About You (1999),232,3.646552
101 Dalmatians (1961),187,3.791444
101 Dalmatians (1996),150,3.240000
12 Angry Men (1957),141,4.184397


In [23]:
# Swap the order of two levels of the hierarchical index.
# swaplevel(i, j, axis): i and j name the levels to exchange;
# axis defaults to 0 (the row index).
# Only the level order changes -- the rows are not re-sorted, and the result
# is displayed without being assigned back to 영화별_성별_평점.
영화별_성별_평점.swaplevel("영화제목","성별")

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean
성별,영화제목,Unnamed: 2_level_1,Unnamed: 3_level_1
F,"$1,000,000 Duck (1971)",16,3.375000
M,"$1,000,000 Duck (1971)",21,2.761905
F,'Night Mother (1986),36,3.388889
M,'Night Mother (1986),34,3.352941
F,'Til There Was You (1997),37,2.675676
M,'Til There Was You (1997),15,2.733333
F,"'burbs, The (1989)",92,2.793478
M,"'burbs, The (1989)",211,2.962085
F,...And Justice for All (1979),35,3.828571
M,...And Justice for All (1979),164,3.689024


In [24]:
# stack() and unstack()
# stack()  : moves an axis-1 (column) index level to axis 0 (rows)
# unstack(): moves an axis-0 (row) index level to axis 1 (columns)
# Here the first 20 rows are taken and the innermost row level (성별) is
# pivoted into the columns, giving count/mean columns split by F and M.
영화별_성별_평점[:20].unstack()

Unnamed: 0_level_0,count,count,mean,mean
성별,F,M,F,M
영화제목,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
"$1,000,000 Duck (1971)",16,21,3.375,2.761905
'Night Mother (1986),36,34,3.388889,3.352941
'Til There Was You (1997),37,15,2.675676,2.733333
"'burbs, The (1989)",92,211,2.793478,2.962085
...And Justice for All (1979),35,164,3.828571,3.689024
1-900 (1994),1,1,2.0,3.0
10 Things I Hate About You (1999),232,468,3.646552,3.311966
101 Dalmatians (1961),187,378,3.791444,3.5
101 Dalmatians (1996),150,214,3.24,2.911215
12 Angry Men (1957),141,475,4.184397,4.328421


In [25]:
# Opposite direction: the column labels (count, mean) of the first 20 rows
# become a third, innermost row level, so the result is a Series.
영화별_성별_평점[:20].stack()

영화제목                               성별       
$1,000,000 Duck (1971)             F   count     16.000000
                                       mean       3.375000
                                   M   count     21.000000
                                       mean       2.761905
'Night Mother (1986)               F   count     36.000000
                                       mean       3.388889
                                   M   count     34.000000
                                       mean       3.352941
'Til There Was You (1997)          F   count     37.000000
                                       mean       2.675676
                                   M   count     15.000000
                                       mean       2.733333
'burbs, The (1989)                 F   count     92.000000
                                       mean       2.793478
                                   M   count    211.000000
                                       mean       2.962085
...And Just

In [26]:
# Exercise 1: show the title and mean rating of the ten movies rated highest
# by female raters, considering only movies with at least 100 such ratings.
여성평점 = 영화별_성별_평점.xs('F', level='성별')
여성평점.loc[여성평점['count'] >= 100].sort_values('mean', ascending=False).head(10)

Unnamed: 0_level_0,count,mean
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1
"Close Shave, A (1995)",180,4.644444
"Wrong Trousers, The (1993)",238,4.588235
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),117,4.57265
Wallace & Gromit: The Best of Aardman Animation (1996),103,4.563107
Schindler's List (1993),615,4.562602
"Shawshank Redemption, The (1994)",627,4.539075
"Grand Day Out, A (1992)",132,4.537879
To Kill a Mockingbird (1962),300,4.536667
"Usual Suspects, The (1995)",413,4.513317
It Happened One Night (1934),130,4.5


In [27]:
# Exercise 2: show the ten movies on which male and female raters disagree most.
# Only (title, gender) rows with at least 100 ratings are considered, and
# disagreement is taken to be the difference between the two mean ratings.
# NOTE: this rebinds 영화별_성별_평점 to the filtered rows, so earlier cells that
# look up low-count titles will no longer find them if re-run after this one.
영화별_성별_평점 = 영화별_성별_평점.loc[영화별_성별_평점['count'].ge(100)]

In [28]:
# One row per title with the mean rating of each gender in its own column (F, M).
남녀평점차이 = 영화별_성별_평점['mean'].unstack(level='성별')

In [29]:
# Absolute gap between the female and male mean rating of each title.
남녀평점차이['diff'] = 남녀평점차이['F'].sub(남녀평점차이['M']).abs()

In [30]:
# The ten titles with the largest gap between the two mean ratings.
남녀평점차이.sort_values('diff', ascending=False).head(10)

성별,F,M,diff
영화제목,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dirty Dancing (1987),3.790378,2.959596,0.830782
Jumpin' Jack Flash (1986),3.254717,2.578358,0.676359
Dumb & Dumber (1994),2.697987,3.336595,0.638608
Grease (1978),3.975265,3.367041,0.608224
Caddyshack (1980),3.396135,3.969737,0.573602
Little Women (1994),3.870588,3.321739,0.548849
Animal House (1978),3.628906,4.167192,0.538286
Steel Magnolias (1989),3.901734,3.365957,0.535777
"Exorcist, The (1973)",3.537634,4.067239,0.529605
Anastasia (1997),3.8,3.281609,0.518391
