# 电影数据分析 

## 年龄字段说明 

- Age is chosen from the following ranges:

	*  1:  "Under 18"
	* 18:  "18-24"
	* 25:  "25-34"
	* 35:  "35-44"
	* 45:  "45-49"
	* 50:  "50-55"
	* 56:  "56+"

## 职业字段说明 

- Occupation is chosen from the following choices:

	*  0:  "other" or not specified
	*  1:  "academic/educator"
	*  2:  "artist"
	*  3:  "clerical/admin"
	*  4:  "college/grad student"
	*  5:  "customer service"
	*  6:  "doctor/health care"
	*  7:  "executive/managerial"
	*  8:  "farmer"
	*  9:  "homemaker"
	* 10:  "K-12 student"
	* 11:  "lawyer"
	* 12:  "programmer"
	* 13:  "retired"
	* 14:  "sales/marketing"
	* 15:  "scientist"
	* 16:  "self-employed"
	* 17:  "technician/engineer"
	* 18:  "tradesman/craftsman"
	* 19:  "unemployed"
	* 20:  "writer"

## 读取数据 

In [1]:
import pandas as pd

In [6]:
# Relative path of the directory that holds the MovieLens data files
# (users.dat, ratings.dat, movies.dat) read by the cells below.
movie_data_root_path = '../../../git_data_book/pydata-book/datasets/movielens/'

In [7]:
!ls ../../../git_data_book/pydata-book/datasets/movielens/

movies.dat  ratings.dat  README  users.dat


In [16]:
# Column names to assign to users.dat; the file is read with header=None.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
# '::' is a multi-character separator, which the C parser does not support.
# Request the Python engine explicitly rather than relying on pandas'
# automatic fallback, which emits a ParserWarning.
users = pd.read_table(
    movie_data_root_path + 'users.dat',
    sep='::',
    header=None,
    names=unames,
    engine='python',
)

  


In [17]:
# Preview the first three user rows.
users.iloc[:3]

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117


In [18]:
# Column names to assign to ratings.dat; the file is read with header=None.
rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
# '::' is a multi-character separator, which the C parser does not support.
# Request the Python engine explicitly rather than relying on pandas'
# automatic fallback, which emits a ParserWarning.
ratings = pd.read_table(
    movie_data_root_path + 'ratings.dat',
    sep='::',
    header=None,
    names=rnames,
    engine='python',
)

  


In [19]:
# Preview the first three rating rows.
ratings.iloc[:3]

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968


In [20]:
# Column names to assign to movies.dat; the file is read with header=None.
mnames = ['movie_id', 'title', 'genres']
# '::' is a multi-character separator, which the C parser does not support.
# Request the Python engine explicitly rather than relying on pandas'
# automatic fallback, which emits a ParserWarning.
movies = pd.read_table(
    movie_data_root_path + 'movies.dat',
    sep='::',
    header=None,
    names=mnames,
    engine='python',
)

  


In [21]:
# Preview the first three movie rows.
movies.iloc[:3]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


## 性别和年龄分析电影评分 

####  将包含性别和年龄以及电影rank的表组成到一起 -- pandas的merge适合此处

In [22]:
# Step 1: join users with ratings on the shared 'user_id' column.
users_ratings = pd.merge(users, ratings, on='user_id')

In [23]:
# Preview the first three rows of the users/ratings join.
users_ratings.head(3)

Unnamed: 0,user_id,gender,age,occupation,zip,movie_id,rating,timestamp
0,1,F,1,10,48067,1193,5,978300760
1,1,F,1,10,48067,661,3,978302109
2,1,F,1,10,48067,914,3,978301968


In [25]:
# Step 2: join the users/ratings table with movies on the shared 'movie_id' column.
users_ratings_movies = pd.merge(users_ratings, movies, on='movie_id')

In [28]:
# Preview the first ten rows of the fully merged table.
users_ratings_movies.head(10)

Unnamed: 0,user_id,gender,age,occupation,zip,movie_id,rating,timestamp,title,genres
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama
5,18,F,18,3,95825,1193,4,978156168,One Flew Over the Cuckoo's Nest (1975),Drama
6,19,M,1,10,48073,1193,5,982730936,One Flew Over the Cuckoo's Nest (1975),Drama
7,24,F,25,7,10023,1193,5,978136709,One Flew Over the Cuckoo's Nest (1975),Drama
8,28,F,25,1,14607,1193,3,978125194,One Flew Over the Cuckoo's Nest (1975),Drama
9,33,M,45,3,55421,1193,5,978557765,One Flew Over the Cuckoo's Nest (1975),Drama


#### 去掉少于250条评分的电影 

In [42]:
# Count the rating rows per title, then keep the titles whose count is
# strictly greater than 250. The result is an Index of titles used for
# row selection in the following cells.
size_by_title = users_ratings_movies.groupby('title').size()
size_by_title_250 = size_by_title[size_by_title > 250].index

#### 分析按照性别和年龄的rank平均分 -- pandas的pivot_table适合此处 

In [44]:
# Mean rating per title, with one column per gender.
mean_ratings_gender = users_ratings_movies.pivot_table(
    values='rating', index='title', columns='gender', aggfunc='mean')
# Restrict to the titles selected in size_by_title_250. `.loc` is the
# label-based replacement for the deprecated `.ix` indexer, which was
# removed in pandas 1.0.
mean_ratings_gender = mean_ratings_gender.loc[size_by_title_250]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  This is separate from the ipykernel package so we can avoid doing imports until


In [45]:
# Preview the first three rows of the per-gender mean ratings.
mean_ratings_gender.head(3)

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"'burbs, The (1989)",2.793478,2.962085
10 Things I Hate About You (1999),3.646552,3.311966
101 Dalmatians (1961),3.791444,3.5


In [46]:
# Mean rating per title, with one column per age code.
mean_ratings_age = users_ratings_movies.pivot_table(
    values='rating', index='title', columns='age', aggfunc='mean')
# Restrict to the titles selected in size_by_title_250. `.loc` is the
# label-based replacement for the deprecated `.ix` indexer, which was
# removed in pandas 1.0.
mean_ratings_age = mean_ratings_age.loc[size_by_title_250]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  This is separate from the ipykernel package so we can avoid doing imports until


In [47]:
# Preview the first three rows of the per-age mean ratings.
mean_ratings_age.head(3)

age,1,18,25,35,45,50,56
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"'burbs, The (1989)",4.5,3.244444,2.652174,2.818182,2.545455,3.208333,2.666667
10 Things I Hate About You (1999),3.745455,3.41502,3.43295,3.102941,3.258065,3.62963,4.0
101 Dalmatians (1961),3.514286,3.295082,3.613757,3.826087,3.976744,3.65,3.190476


#### 分析女性对电影偏好 -- 按列的值排序，pandas的sort_values适合此处 

In [53]:
# Top three titles by mean female rating. Sorting by a column's values is
# done with sort_values; sort_index(by=...) is deprecated and was removed
# in pandas 1.0.
mean_ratings_gender.sort_values(by='F', ascending=False)[:3]

  """Entry point for launching an IPython kernel.


gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Close Shave, A (1995)",4.644444,4.473795
"Wrong Trousers, The (1993)",4.588235,4.478261
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.57265,4.464589


#### 男性女性评分相差最大的电影 

In [58]:
# Add a 'diff' column holding the female-minus-male mean rating per title;
# negative values mean men rated the title higher.
mean_ratings_gender['diff'] = mean_ratings_gender['F'].sub(mean_ratings_gender['M'])

In [67]:
# Three titles with the smallest 'diff' values (ascending sort), i.e. the
# largest gaps in favour of male raters when 'diff' is F minus M.
# sort_values replaces sort_index(by=...), which was removed in pandas 1.0.
mean_ratings_gender.sort_values(by='diff')[:3]

  """Entry point for launching an IPython kernel.


gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Good, The Bad and The Ugly, The (1966)",3.494949,4.2213,-0.726351
"Kentucky Fried Movie, The (1977)",2.878788,3.555147,-0.676359
Dumb & Dumber (1994),2.697987,3.336595,-0.638608


In [68]:
# Three titles with the largest 'diff' values, i.e. the largest gaps in
# favour of female raters when 'diff' is F minus M.
# sort_values replaces sort_index(by=...), which was removed in pandas 1.0.
# Sorting descending directly (instead of reversing an ascending sort) also
# keeps any NaN rows at the end rather than at the top.
mean_ratings_gender.sort_values(by='diff', ascending=False)[:3]

  """Entry point for launching an IPython kernel.


gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dirty Dancing (1987),3.790378,2.959596,0.830782
Jumpin' Jack Flash (1986),3.254717,2.578358,0.676359
Grease (1978),3.975265,3.367041,0.608224


In [70]:
# Rank titles by the size of the gender disagreement regardless of its
# direction, using the squared F-minus-M gap (a squared difference, not a
# standard deviation).
# The column is recomputed from 'F' and 'M' rather than squaring the existing
# 'diff' in place, so re-running this cell does not square the values again.
mean_ratings_gender['diff'] = (mean_ratings_gender['F'] - mean_ratings_gender['M']) ** 2
# sort_values replaces sort_index(by=...), which was removed in pandas 1.0.
mean_ratings_gender.sort_values(by='diff', ascending=False)[:3]

  


gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dirty Dancing (1987),3.790378,2.959596,0.476374
"Good, The Bad and The Ugly, The (1966)",3.494949,4.2213,0.278346
"Kentucky Fried Movie, The (1977)",2.878788,3.555147,0.209271
