## 영화 평점 데이터

영화 평점 데이터를 활용하여 다음을 실시해보세요

- 1.데이터 탐색: NA 처리
- 2.영화별 성별 평점 평균을 산출하여 성별로 상위/하위 10개 영화의 목록 추출 및 도식화
- 3.영화 평점 정보가 300건 이상 있는 영화에 대하여 여성에게 인기가 높은 상위 10개 영화
- 4.남녀간 호불호가 큰 영화는?
- 5.성별에 관계없이 호불호가 큰 영화는?

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Column names for the three MovieLens tables; the .dat files are read with
# header=None, so these lists supply the headers.
unames = [
    "user_id",
    "gender",
    "age",
    "occupation",
    "zip",
]
rnames = [
    "user_id",
    "movie_id",
    "rating",
    "timestamp",
]
mnames = [
    "movie_id",
    "title",
    "genres",
]

In [3]:
# Load the three MovieLens tables. The files have no header row and use '::'
# as the field delimiter; a multi-character separator requires the Python
# parsing engine.
# The encoding is given explicitly because movies.dat contains non-ASCII
# titles that are not valid UTF-8 (the ML-1M files are distributed as
# ISO-8859-1); without it those titles come out garbled or fail to decode.
movie_df = pd.read_csv('./data/movielens/movies.dat', header=None, sep='::', names=mnames, engine='python', encoding='latin-1')
rating_df = pd.read_csv('./data/movielens/ratings.dat', header=None, sep='::', names=rnames, engine='python', encoding='latin-1')
user_df = pd.read_csv('./data/movielens/users.dat', header=None, sep='::', names=unames, engine='python', encoding='latin-1')

In [4]:
# Preview the first five movie rows.
movie_df.head(n=5)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Preview the first five rating rows.
rating_df.head(n=5)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [6]:
# Preview the first five user rows.
user_df.head(n=5)

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


## 1.데이터 탐색: NA 처리

In [7]:
# Print, one line per column, whether movie_df contains any missing value.
for has_missing in movie_df.isnull().any():
    print(has_missing)

False
False
False


In [8]:
# Print, one line per column, whether rating_df contains any missing value.
for has_missing in rating_df.isnull().any():
    print(has_missing)

False
False
False
False


In [9]:
# Print, one line per column, whether user_df contains any missing value.
for has_missing in user_df.isnull().any():
    print(has_missing)

False
False
False
False
False


In [10]:
# Attach each rating to the attributes of the user who gave it. 'user_id' is
# the only column the two frames share, so it is named explicitly as the key.
user_rating = rating_df.merge(user_df, how='outer', on='user_id')

In [11]:
# Add title/genres to every rating row. 'movie_id' is the only shared column.
# The outer join also keeps movies that have no ratings; those rows carry NaN
# in the rating and user columns.
movie_user_rating = user_rating.merge(movie_df, how='outer', on='movie_id')

In [12]:
# Print, one line per column, whether the merged frame contains any missing
# value.
for has_missing in movie_user_rating.isnull().any():
    print(has_missing)

True
False
True
True
True
True
True
True
False
False


In [13]:
# Number of non-missing values in each column of the merged frame.
movie_user_rating.notna().sum()

user_id       1000209
movie_id      1000386
rating        1000209
timestamp     1000209
gender        1000209
age           1000209
occupation    1000209
zip           1000209
title         1000386
genres        1000386
dtype: int64

In [14]:
# Discard every row that has a missing value in any column.
movie_user_rating = movie_user_rating.dropna(axis='index')

In [15]:
# Re-check the per-column non-missing counts after the rows were dropped.
movie_user_rating.notna().sum()

user_id       1000209
movie_id      1000209
rating        1000209
timestamp     1000209
gender        1000209
age           1000209
occupation    1000209
zip           1000209
title         1000209
genres        1000209
dtype: int64

In [16]:
# Preview the first five rows of the merged frame.
movie_user_rating.head(n=5)

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1.0,1193,5.0,978300760.0,F,1.0,10.0,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2.0,1193,5.0,978298413.0,M,56.0,16.0,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12.0,1193,4.0,978220179.0,M,25.0,12.0,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15.0,1193,4.0,978199279.0,M,25.0,7.0,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17.0,1193,5.0,978158471.0,M,50.0,1.0,95350,One Flew Over the Cuckoo's Nest (1975),Drama


In [17]:
# Mean rating per title (first five titles only).
movie_user_rating.groupby('title')['rating'].mean().head(n=5)

title
$1,000,000 Duck (1971)           3.027027
'Night Mother (1986)             3.371429
'Til There Was You (1997)        2.692308
'burbs, The (1989)               2.910891
...And Justice for All (1979)    3.713568
Name: rating, dtype: float64

In [18]:
# Mean rating per title, with one column per gender value.
movie_pivot = movie_user_rating.pivot_table(
    index='title',
    columns='gender',
    values='rating',
    aggfunc='mean',
)

## 2.영화별 성별 평점 평균을 산출하여 성별로 상위/하위 10개 영화의 목록 추출 및 도식화

In [19]:
# Drop titles that lack a mean rating for at least one gender column.
movie_pivot = movie_pivot.dropna(axis='index')

In [20]:
# Ten titles with the highest mean rating in the 'M' column.
movie_pivot.sort_values('M', ascending=False).iloc[:10]

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Bells, The (1926)",4.0,5.0
Dangerous Game (1993),4.0,5.0
Angela (1995),3.0,5.0
"Gate of Heavenly Peace, The (1995)",5.0,5.0
Small Wonders (1996),3.333333,5.0
Time of the Gypsies (Dom za vesanje) (1989),3.5,4.833333
I Am Cuba (Soy Cuba/Ya Kuba) (1964),5.0,4.75
Window to Paris (1994),4.0,4.666667
Lamerica (1994),5.0,4.666667
Sanjuro (1962),4.375,4.639344


In [21]:
# Ten titles with the lowest mean rating in the 'M' column.
movie_pivot.sort_values('M', ascending=True).iloc[:10]

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Spiders, The (Die Spinnen, 1. Teil: Der Goldene See) (1919)",4.0,1.0
Theodore Rex (1995),3.0,1.0
Loser (1991),2.5,1.0
Brenda Starr (1989),1.5,1.0
Wirey Spindell (1999),1.0,1.0
Santa with Muscles (1996),1.0,1.0
"James Dean Story, The (1957)",4.0,1.0
Torso (Corpi Presentano Tracce di Violenza Carnale) (1973),1.0,1.0
Spring Fever USA (a.k.a. Lauderdale) (1989),1.0,1.0
"Love, etc. (1996)",2.5,1.0


In [22]:
# Ten titles with the highest mean rating in the 'F' column.
movie_pivot.sort_values('F', ascending=False).iloc[:10]

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Coldblooded (1995),5.0,3.588235
"Gate of Heavenly Peace, The (1995)",5.0,5.0
"Other Side of Sunday, The (Søndagsengler) (1996)",5.0,2.928571
"Brother, Can You Spare a Dime? (1975)",5.0,3.642857
"Big Combo, The (1955)",5.0,3.6
Country Life (1994),5.0,2.0
Lamerica (1994),5.0,4.666667
24 7: Twenty Four Seven (1997),5.0,3.75
Raw Deal (1948),5.0,3.307692
Belly (1998),5.0,3.0


In [23]:
# Ten titles with the lowest mean rating in the 'F' column.
movie_pivot.sort_values('F', ascending=True).iloc[:10]

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Destiny Turns on the Radio (1995),1.0,2.25
Carnosaur 3: Primal Species (1996),1.0,1.066667
Frogs for Snakes (1998),1.0,2.25
Jamaica Inn (1939),1.0,3.142857
"Alan Smithee Film: Burn Hollywood Burn, An (1997)",1.0,2.0625
Diamonds (1999),1.0,2.625
Big Bully (1996),1.0,2.0
"Neon Bible, The (1995)",1.0,4.0
My Life and Times With Antonin Artaud (En compagnie d'Antonin Artaud) (1993),1.0,2.777778
Retro Puppetmaster (1999),1.0,1.8


In [24]:
# Number of ratings recorded for each title.
rating_count = movie_user_rating.groupby('title')['rating'].count()

In [25]:
# Keep only titles with at least 300 ratings. The task statement asks for
# "300 or more", so the bound is inclusive (>=), not strict (>).
rating_count_300 = rating_count[rating_count >= 300]

## 3.영화 평점 정보가 300건 이상 있는 영화에 대하여 여성에게 인기가 높은 상위 10개 영화

In [26]:
# Among the titles listed in rating_count_300, show the ten with the highest
# mean rating in the 'F' column.
(
    movie_pivot
    .loc[movie_pivot.index.isin(rating_count_300.index)]
    .sort_values('F', ascending=False)
    .iloc[:10]
)

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Close Shave, A (1995)",4.644444,4.473795
"Wrong Trousers, The (1993)",4.588235,4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.57265,4.464589
Wallace & Gromit: The Best of Aardman Animation (1996),4.563107,4.385075
Schindler's List (1993),4.562602,4.491415
"Shawshank Redemption, The (1994)",4.539075,4.560625
"Grand Day Out, A (1992)",4.537879,4.293255
To Kill a Mockingbird (1962),4.536667,4.372611
"Usual Suspects, The (1995)",4.513317,4.518248
It Happened One Night (1934),4.5,4.163934


## 4.남녀간 호불호가 큰 영화는?

In [27]:
# 'F' column of the pivot, restricted to titles listed in rating_count_300.
movie_rating300_F = movie_pivot.loc[movie_pivot.index.isin(rating_count_300.index), 'F']

In [28]:
# 'M' column of the pivot, restricted to titles listed in rating_count_300.
movie_rating300_M = movie_pivot.loc[movie_pivot.index.isin(rating_count_300.index), 'M']

In [29]:
# Ten titles with the largest absolute difference between the two per-gender
# mean ratings.
(movie_rating300_F - movie_rating300_M).abs().sort_values(ascending=False).head(n=10)

title
Dirty Dancing (1987)                      0.830782
Good, The Bad and The Ugly, The (1966)    0.726351
Kentucky Fried Movie, The (1977)          0.676359
Jumpin' Jack Flash (1986)                 0.676359
Dumb & Dumber (1994)                      0.638608
Longest Day, The (1962)                   0.619682
Cable Guy, The (1996)                     0.613787
Evil Dead II (Dead By Dawn) (1987)        0.611985
Grease (1978)                             0.608224
Rocky III (1982)                          0.581801
dtype: float64

In [30]:
# Standard deviation of ratings per title and gender (first five titles).
movie_user_rating.pivot_table(
    index='title',
    columns='gender',
    values='rating',
    aggfunc='std',
).head(n=5)

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",0.957427,1.135991
'Night Mother (1986),1.021981,1.228015
'Til There Was You (1997),0.973369,1.162919
"'burbs, The (1989)",1.124532,1.099127
...And Justice for All (1979),0.954424,0.862086


## 5.성별에 관계없이 호불호가 큰 영화는?

In [31]:
# Rating rows whose title is listed in rating_count_300.
movie_user_rating_300 = movie_user_rating.loc[
    movie_user_rating['title'].isin(rating_count_300.index)
]

In [32]:
# Ten titles whose ratings have the largest standard deviation, i.e. the most
# divisive titles regardless of gender.
# Group by the filtered frame's own 'title' column. The previous form passed
# the title Series of the unfiltered movie_user_rating as the grouping key,
# which only worked because pandas aligns the key to the filtered index.
movie_user_rating_300.groupby('title')['rating'].std().sort_values(ascending=False).head(10)

title
Dumb & Dumber (1994)                           1.321333
Blair Witch Project, The (1999)                1.316368
Natural Born Killers (1994)                    1.307198
Tank Girl (1995)                               1.277695
Rocky Horror Picture Show, The (1975)          1.260177
Eyes Wide Shut (1999)                          1.259624
Billy Madison (1995)                           1.249970
Bicentennial Man (1999)                        1.245533
Babe: Pig in the City (1998)                   1.239379
South Park: Bigger, Longer and Uncut (1999)    1.235380
Name: rating, dtype: float64