# Date Night Movie

In this assignment we are going to use pandas to figure out - What's the best **date-night movie**?

This assignment is going to use
- Joining
- Groupby
- Sorting


In [3]:
import os
import pandas as pd

##### Read in the movie data: `pd.read_table`

In [4]:
def get_movie_data():
    
    unames = ['user_id','gender','age','occupation','zip']
    users = pd.read_table(os.path.join('..\data','users.dat'), 
                          sep='::', header=None, names=unames, encoding='latin-1')
    
    rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
    ratings = pd.read_table(os.path.join('..\data', 'ratings.dat'), 
                            sep='::', header=None, names=rnames, encoding='latin-1')
    
    mnames = ['movie_id', 'title','genres']
    movies = pd.read_table(os.path.join('..\data', 'movies.dat'), 
                           sep='::', header=None, names=mnames, encoding='latin-1')

    return users, ratings, movies

In [5]:
users, ratings, movies = get_movie_data()

  users = pd.read_table(os.path.join('..\data','users.dat'),
  ratings = pd.read_table(os.path.join('..\data', 'ratings.dat'),
  movies = pd.read_table(os.path.join('..\data', 'movies.dat'),


In [6]:
print(users.head())

   user_id gender  age  occupation    zip
0        1      F    1          10  48067
1        2      M   56          16  70072
2        3      M   25          15  55117
3        4      M   45           7  02460
4        5      M   25          20  55455


In [7]:
print(ratings.head())

   user_id  movie_id  rating  timestamp
0        1      1193       5  978300760
1        1       661       3  978302109
2        1       914       3  978301968
3        1      3408       4  978300275
4        1      2355       5  978824291


In [8]:
print(movies.head())

   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy


##### Clean up the `movies`

- Get the `year`
- Shorten the `title`


In [9]:
tmp = movies.title.str.extract('(.*) \(([0-9]+)\)')
tmp.apply(lambda x:x[0] if len(x) > 0 else None)
tmp.apply(lambda x: x[0][:40] if len(x) > 0 else None)

0    Toy Story
1         1995
dtype: object

In [10]:
movies['year'] = tmp[1]
movies['short_title'] = tmp[0]

In [11]:
print(movies.head())

   movie_id                               title                        genres  \
0         1                    Toy Story (1995)   Animation|Children's|Comedy   
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy   
2         3             Grumpier Old Men (1995)                Comedy|Romance   
3         4            Waiting to Exhale (1995)                  Comedy|Drama   
4         5  Father of the Bride Part II (1995)                        Comedy   

   year                  short_title  
0  1995                    Toy Story  
1  1995                      Jumanji  
2  1995             Grumpier Old Men  
3  1995            Waiting to Exhale  
4  1995  Father of the Bride Part II  


##### Join the tables with `pd.merge` (20 pts)

In [12]:
a = pd.merge(left=movies,right=ratings,how='inner',on='movie_id')

In [13]:
b = pd.merge(left=users,right=a,how='inner',on='user_id')

In [14]:
b

Unnamed: 0,user_id,gender,age,occupation,zip,movie_id,title,genres,year,short_title,rating,timestamp
0,1,F,1,10,48067,1,Toy Story (1995),Animation|Children's|Comedy,1995,Toy Story,5,978824268
1,1,F,1,10,48067,48,Pocahontas (1995),Animation|Children's|Musical|Romance,1995,Pocahontas,5,978824351
2,1,F,1,10,48067,150,Apollo 13 (1995),Drama,1995,Apollo 13,5,978301777
3,1,F,1,10,48067,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi,1977,Star Wars: Episode IV - A New Hope,4,978300760
4,1,F,1,10,48067,527,Schindler's List (1993),Drama|War,1993,Schindler's List,5,978824195
...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,6040,M,25,6,11106,3683,Blood Simple (1984),Drama|Film-Noir,1984,Blood Simple,4,960971696
1000205,6040,M,25,6,11106,3703,Mad Max 2 (a.k.a. The Road Warrior) (1981),Action|Sci-Fi,1981,Mad Max 2 (a.k.a. The Road Warrior),4,964828575
1000206,6040,M,25,6,11106,3735,Serpico (1973),Crime|Drama,1973,Serpico,4,960971654
1000207,6040,M,25,6,11106,3751,Chicken Run (2000),Animation|Children's|Comedy,2000,Chicken Run,4,964828782


##### What's the highest rated movie? (20 pts))

In [15]:
b[b.rating==b.rating.max()]

Unnamed: 0,user_id,gender,age,occupation,zip,movie_id,title,genres,year,short_title,rating,timestamp
0,1,F,1,10,48067,1,Toy Story (1995),Animation|Children's|Comedy,1995,Toy Story,5,978824268
1,1,F,1,10,48067,48,Pocahontas (1995),Animation|Children's|Musical|Romance,1995,Pocahontas,5,978824351
2,1,F,1,10,48067,150,Apollo 13 (1995),Drama,1995,Apollo 13,5,978301777
4,1,F,1,10,48067,527,Schindler's List (1993),Drama|War,1993,Schindler's List,5,978824195
8,1,F,1,10,48067,595,Beauty and the Beast (1991),Animation|Children's|Musical,1991,Beauty and the Beast,5,978824268
...,...,...,...,...,...,...,...,...,...,...,...,...
1000181,6040,M,25,6,11106,3224,Woman in the Dunes (Suna no onna) (1964),Drama,1964,Woman in the Dunes (Suna no onna),5,956716750
1000183,6040,M,25,6,11106,3289,Not One Less (Yi ge dou bu neng shao) (1999),Drama,1999,Not One Less (Yi ge dou bu neng shao),5,956704305
1000184,6040,M,25,6,11106,3334,Key Largo (1948),Crime|Drama|Film-Noir|Thriller,1948,Key Largo,5,960971875
1000198,6040,M,25,6,11106,3521,Mystery Train (1989),Comedy|Crime|Drama,1989,Mystery Train,5,956715569


In [38]:
rt = int(b.rating.max())
k= b[b.rating==rt].groupby('gender')[['title','gender','title']].apply(lambda x: x)
k

Unnamed: 0,title,gender,title.1
0,Toy Story (1995),F,Toy Story (1995)
1,Pocahontas (1995),F,Pocahontas (1995)
2,Apollo 13 (1995),F,Apollo 13 (1995)
4,Schindler's List (1993),F,Schindler's List (1993)
8,Beauty and the Beast (1991),F,Beauty and the Beast (1991)
...,...,...,...
1000181,Woman in the Dunes (Suna no onna) (1964),M,Woman in the Dunes (Suna no onna) (1964)
1000183,Not One Less (Yi ge dou bu neng shao) (1999),M,Not One Less (Yi ge dou bu neng shao) (1999)
1000184,Key Largo (1948),M,Key Largo (1948)
1000198,Mystery Train (1989),M,Mystery Train (1989)


###### What is a good rated movie for date night? (60 pts)

- Hint - highly rated movie by 
    - both partners (might be the same gender or not),
    - based on genre preferences,
    - age group can also be combined

In [16]:
b.age.value_counts()

25    395556
35    199003
18    183536
45     83633
50     72490
56     38780
1      27211
Name: age, dtype: int64

In [17]:
b[(b.gender=='M')&(b.rating==b.rating.max())]

Unnamed: 0,user_id,gender,age,occupation,zip,movie_id,title,genres,year,short_title,rating,timestamp
55,2,M,56,16,70072,110,Braveheart (1995),Action|Drama|War,1995,Braveheart,5,978298625
61,2,M,56,16,70072,318,"Shawshank Redemption, The (1994)",Drama,1994,"Shawshank Redemption, The",5,978298413
63,2,M,56,16,70072,356,Forrest Gump (1994),Comedy|Romance|War,1994,Forrest Gump,5,978299686
65,2,M,56,16,70072,380,True Lies (1994),Action|Adventure|Comedy|Romance,1994,True Lies,5,978299809
70,2,M,56,16,70072,480,Jurassic Park (1993),Action|Adventure|Sci-Fi,1993,Jurassic Park,5,978299809
...,...,...,...,...,...,...,...,...,...,...,...,...
1000181,6040,M,25,6,11106,3224,Woman in the Dunes (Suna no onna) (1964),Drama,1964,Woman in the Dunes (Suna no onna),5,956716750
1000183,6040,M,25,6,11106,3289,Not One Less (Yi ge dou bu neng shao) (1999),Drama,1999,Not One Less (Yi ge dou bu neng shao),5,956704305
1000184,6040,M,25,6,11106,3334,Key Largo (1948),Crime|Drama|Film-Noir|Thriller,1948,Key Largo,5,960971875
1000198,6040,M,25,6,11106,3521,Mystery Train (1989),Comedy|Crime|Drama,1989,Mystery Train,5,956715569


In [37]:
b[(b.gender=='F')&(b.rating==b.rating.max())]

Unnamed: 0,user_id,gender,age,occupation,zip,movie_id,title,genres,year,short_title,rating,timestamp,age_cat
0,1,F,1,10,48067,1,Toy Story (1995),Animation|Children's|Comedy,1995,Toy Story,5,978824268,child
1,1,F,1,10,48067,48,Pocahontas (1995),Animation|Children's|Musical|Romance,1995,Pocahontas,5,978824351,child
2,1,F,1,10,48067,150,Apollo 13 (1995),Drama,1995,Apollo 13,5,978301777,child
4,1,F,1,10,48067,527,Schindler's List (1993),Drama|War,1993,Schindler's List,5,978824195,child
8,1,F,1,10,48067,595,Beauty and the Beast (1991),Animation|Children's|Musical,1991,Beauty and the Beast,5,978824268,child
...,...,...,...,...,...,...,...,...,...,...,...,...,...
999841,6039,F,45,0,01060,2300,"Producers, The (1968)",Comedy|Musical,1968,"Producers, The",5,956705395,old
999842,6039,F,45,0,01060,2396,Shakespeare in Love (1998),Comedy|Romance,1998,Shakespeare in Love,5,956705395,old
999856,6039,F,45,0,01060,3037,Little Big Man (1970),Comedy|Drama|Western,1970,Little Big Man,5,956705679,old
999858,6039,F,45,0,01060,3088,Harvey (1950),Comedy,1950,Harvey,5,956705281,old


In [19]:
z = pd.cut(b.age, [0,18,35,57], labels=['child','middle-aged','old'])
z

0                child
1                child
2                child
3                child
4                child
              ...     
1000204    middle-aged
1000205    middle-aged
1000206    middle-aged
1000207    middle-aged
1000208    middle-aged
Name: age, Length: 1000209, dtype: category
Categories (3, object): ['child' < 'middle-aged' < 'old']

In [20]:
b['age_cat'] = z
b

Unnamed: 0,user_id,gender,age,occupation,zip,movie_id,title,genres,year,short_title,rating,timestamp,age_cat
0,1,F,1,10,48067,1,Toy Story (1995),Animation|Children's|Comedy,1995,Toy Story,5,978824268,child
1,1,F,1,10,48067,48,Pocahontas (1995),Animation|Children's|Musical|Romance,1995,Pocahontas,5,978824351,child
2,1,F,1,10,48067,150,Apollo 13 (1995),Drama,1995,Apollo 13,5,978301777,child
3,1,F,1,10,48067,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi,1977,Star Wars: Episode IV - A New Hope,4,978300760,child
4,1,F,1,10,48067,527,Schindler's List (1993),Drama|War,1993,Schindler's List,5,978824195,child
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,6040,M,25,6,11106,3683,Blood Simple (1984),Drama|Film-Noir,1984,Blood Simple,4,960971696,middle-aged
1000205,6040,M,25,6,11106,3703,Mad Max 2 (a.k.a. The Road Warrior) (1981),Action|Sci-Fi,1981,Mad Max 2 (a.k.a. The Road Warrior),4,964828575,middle-aged
1000206,6040,M,25,6,11106,3735,Serpico (1973),Crime|Drama,1973,Serpico,4,960971654,middle-aged
1000207,6040,M,25,6,11106,3751,Chicken Run (2000),Animation|Children's|Comedy,2000,Chicken Run,4,964828782,middle-aged
