In [1]:
import pandas as pd
import numpy as np
import re

## Load the main Disney movies dataset that we want to use

In [2]:
# Disney box-office table: one row per release, with the gross figures still stored
# as "$1,234"-style strings at this point.
disney_df = pd.read_csv(filepath_or_buffer='../resource/disney/disney_movies_total_gross.csv')
disney_df.head(n=5)

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross
0,Snow White and the Seven Dwarfs,"Dec 21, 1937",Musical,G,"$184,925,485","$5,228,953,251"
1,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052"
2,Fantasia,"Nov 13, 1940",Musical,G,"$83,320,000","$2,187,090,808"
3,Song of the South,"Nov 12, 1946",Adventure,G,"$65,000,000","$1,078,510,579"
4,Cinderella,"Feb 15, 1950",Drama,G,"$85,000,000","$920,608,730"


In [3]:
# Non-null count per column; a column whose count is below the others has missing values.
disney_df.count()

movie_title                 579
release_date                579
genre                       562
MPAA_rating                 523
total_gross                 579
inflation_adjusted_gross    579
dtype: int64

In [4]:
# NOTE(review): dead code -- `disney_list_df` is not referenced by any cell shown in this notebook.
#disney_list_df = disney_df['movie_title']

## Load MovieLens movies dataset

In [5]:
# MovieLens (ml-25m) movie catalogue: movieId, title (with the release year in
# parentheses) and pipe-separated genres.
movies_df = pd.read_csv(filepath_or_buffer='../resource/ml-25m/movies.csv')
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Force `title` to plain Python strings so the `.str` accessor can be used on it below.
movies_df = movies_df.astype({'title': str})

In [7]:
# Confirm the column dtypes; string columns show up as `object`.
movies_df.dtypes

movieId     int64
title      object
genres     object
dtype: object

In [8]:
# Derive a year-free `movie_title` so MovieLens titles can be joined to the Disney
# titles: keep only the text before the first " (", e.g. "Toy Story (1995)" -> "Toy Story".
# The pattern is a raw string (a bare '\(' is an invalid string escape) and `n` is
# passed by keyword because pandas >= 2.0 only accepts `pat` positionally.
# NOTE(review): a title with an extra parenthetical before the year is also cut at
# that first " (" -- confirm this is acceptable for matching.
movies_df = (
    movies_df
    .assign(movie_title=movies_df['title'].str.split(r' \(', n=1).str[0])
    .drop(columns='title')
)
movies_df.head()

Unnamed: 0,movieId,genres,movie_title
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story
1,2,Adventure|Children|Fantasy,Jumanji
2,3,Comedy|Romance,Grumpier Old Men
3,4,Comedy|Drama|Romance,Waiting to Exhale
4,5,Comedy,Father of the Bride Part II


## Merge the MovieLens dataset into the Disney dataset, keeping the `movieId` and `genres` columns from MovieLens

In [9]:
# Inner join on the exact title string: Disney rows with no MovieLens title match are
# dropped, and a Disney row is repeated once per MovieLens entry sharing its title.
merged_disney_df = disney_df.merge(movies_df, how="inner", on="movie_title")
merged_disney_df.head(n=5)

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,movieId,genres
0,Snow White and the Seven Dwarfs,"Dec 21, 1937",Musical,G,"$184,925,485","$5,228,953,251",594,Animation|Children|Drama|Fantasy|Musical
1,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052",596,Animation|Children|Fantasy|Musical
2,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052",5990,Children|Comedy|Fantasy
3,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052",90264,Children|Fantasy|Musical
4,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052",126693,Animation|Children


In [10]:
# Non-null counts after the merge; one title can match several MovieLens entries
# (see the repeated Pinocchio rows above), so there can be more rows than Disney releases.
merged_disney_df.count()

movie_title                 596
release_date                596
genre                       582
MPAA_rating                 532
total_gross                 596
inflation_adjusted_gross    596
movieId                     596
genres                      596
dtype: int64

## Remove the `$` and `,` characters from `total_gross` and `inflation_adjusted_gross`, then convert both columns to integer type

In [86]:
# Strip the thousands separators and the currency sign from "$184,925,485"-style strings.
# `regex=False` makes both patterns literal: as a regular expression, '$' is the
# end-of-string anchor rather than a dollar sign, so the intent must be explicit.
merged_disney_df['total_gross'] = (
    merged_disney_df['total_gross']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
)

  


In [87]:
# Same clean-up for the inflation-adjusted figures: drop ',' and '$' as literal
# characters (`regex=False`), because '$' as a regular expression matches the end of
# the string rather than a dollar sign.
merged_disney_df['inflation_adjusted_gross'] = (
    merged_disney_df['inflation_adjusted_gross']
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
)

  


In [88]:
# Cast both cleaned gross columns to a fixed 64-bit integer. Plain `int` can resolve to
# a 32-bit type on some platforms, which cannot hold values such as 5,228,953,251.
merged_disney_df = merged_disney_df.astype(
    {'total_gross': 'int64', 'inflation_adjusted_gross': 'int64'}
)

## Drop the Disney `genre` column; we keep the MovieLens `genres` column, which carries more information

In [89]:
# Drop the single-valued Disney `genre` column in favour of the MovieLens `genres` column.
# The saved output of this cell and every later cell shows the frame without `genre`,
# so the drop has to run for a fresh Restart & Run All to reproduce those results.
# `errors='ignore'` keeps the cell safe to re-run once the column is already gone.
merged_disney_df = merged_disney_df.drop(columns='genre', errors='ignore')
merged_disney_df.head()

Unnamed: 0,movie_title,release_date,MPAA_rating,total_gross,inflation_adjusted_gross,movieId,genres
0,Snow White and the Seven Dwarfs,"Dec 21, 1937",G,184925485,5228953251,594,Animation|Children|Drama|Fantasy|Musical
1,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,596,Animation|Children|Fantasy|Musical
2,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,5990,Children|Comedy|Fantasy
3,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,90264,Children|Fantasy|Musical
4,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,126693,Animation|Children


## Keep only movies where total_gross != 0 from dataset

In [90]:
# Discard rows whose total gross is zero. `.copy()` detaches the result from the
# original frame so that later column assignments do not raise SettingWithCopyWarning.
merged_disney_df = merged_disney_df.loc[merged_disney_df['total_gross'] != 0].copy()

In [91]:
# Non-null counts after removing the zero-gross rows.
merged_disney_df.count()

movie_title                 593
release_date                593
MPAA_rating                 532
total_gross                 593
inflation_adjusted_gross    593
movieId                     593
genres                      593
dtype: int64

In [93]:
# Preview the cleaned frame: the gross columns are now integers.
merged_disney_df.head()

Unnamed: 0,movie_title,release_date,MPAA_rating,total_gross,inflation_adjusted_gross,movieId,genres
0,Snow White and the Seven Dwarfs,"Dec 21, 1937",G,184925485,5228953251,594,Animation|Children|Drama|Fantasy|Musical
1,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,596,Animation|Children|Fantasy|Musical
2,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,5990,Children|Comedy|Fantasy
3,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,90264,Children|Fantasy|Musical
4,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,126693,Animation|Children


## Save this merged Disney movies dataset as csv

In [19]:
# Persist the merged frame as CSV: header row included, row index omitted.
merged_disney_csv = r'output/MERGED_disney_movies_total_gross.csv'
merged_disney_df.to_csv(merged_disney_csv, header=True, index=None)

## Load MovieLens ratings dataset

In [20]:
# MovieLens (ml-25m) ratings: one row per (userId, movieId) rating with its timestamp.
ratings_df = pd.read_csv(filepath_or_buffer='../resource/ml-25m/ratings.csv')
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [21]:
# Attach the Disney metadata to every rating of a matched movie (inner join on movieId,
# so ratings for movies without a Disney match are left out).
merged_disney_ratings = ratings_df.merge(merged_disney_df, how="inner", on="movieId")
merged_disney_ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,movie_title,release_date,MPAA_rating,total_gross,inflation_adjusted_gross,genres
0,1,6377,4.0,1147868469,Finding Nemo,"May 30, 2003",G,380529370,518148559,Adventure|Animation|Children|Comedy
1,3,6377,4.0,1439472644,Finding Nemo,"May 30, 2003",G,380529370,518148559,Adventure|Animation|Children|Comedy
2,4,6377,3.0,1573939584,Finding Nemo,"May 30, 2003",G,380529370,518148559,Adventure|Animation|Children|Comedy
3,12,6377,4.0,1167582434,Finding Nemo,"May 30, 2003",G,380529370,518148559,Adventure|Animation|Children|Comedy
4,13,6377,4.0,1238029123,Finding Nemo,"May 30, 2003",G,380529370,518148559,Adventure|Animation|Children|Comedy


### Remove the duplicated MERGED movies

In [95]:
# Starting point for de-duplication: the same Disney title still appears once per
# matching MovieLens entry (e.g. the repeated Pinocchio rows).
merged_disney_df.head()

Unnamed: 0,movie_title,release_date,MPAA_rating,total_gross,inflation_adjusted_gross,movieId,genres
0,Snow White and the Seven Dwarfs,"Dec 21, 1937",G,184925485,5228953251,594,Animation|Children|Drama|Fantasy|Musical
1,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,596,Animation|Children|Fantasy|Musical
2,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,5990,Children|Comedy|Fantasy
3,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,90264,Children|Fantasy|Musical
4,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,126693,Animation|Children


## JUDAT -- please ignore the code below; it is work in progress for removing duplicate movies

### For each movie title, keep only the row whose `genres` string is the longest

In [96]:
# Character length of the `genres` string, used below as a proxy for how much genre
# information a row carries. `assign` returns a new frame, which avoids assigning a
# column onto the filtered frame produced earlier (SettingWithCopyWarning).
# NOTE(review): this is the string length, not the number of genres -- confirm that
# this is the intended ranking.
merged_disney_df = merged_disney_df.assign(count=merged_disney_df['genres'].str.len())
merged_disney_df.head()

Unnamed: 0,movie_title,release_date,MPAA_rating,total_gross,inflation_adjusted_gross,movieId,genres,count
0,Snow White and the Seven Dwarfs,"Dec 21, 1937",G,184925485,5228953251,594,Animation|Children|Drama|Fantasy|Musical,40
1,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,596,Animation|Children|Fantasy|Musical,34
2,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,5990,Children|Comedy|Fantasy,23
3,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,90264,Children|Fantasy|Musical,24
4,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,126693,Animation|Children,18


In [97]:
# Longest `genres` string length seen for each title, as a flat two-column frame
# (movie_title, count).
merged_disney_df_g = (
    merged_disney_df
    .groupby('movie_title')['count']
    .max()
    .reset_index()
)

In [98]:
# Attach each title's maximum genres length and keep only the rows that reach it.
merged_disney_df_g = merged_disney_df_g.rename(columns={'count': 'count_max'})

merged_disney_df = merged_disney_df.merge(merged_disney_df_g, how='left', on='movie_title')

# Two MovieLens entries with the same title can tie on the maximum length, which would
# leave the duplicate in place. Break ties by keeping the first match per Disney release;
# a release is identified by title + release date, so distinct releases sharing a title
# are not collapsed.
merged_disney_df = (
    merged_disney_df
    .loc[merged_disney_df['count'] == merged_disney_df['count_max']]
    .drop_duplicates(subset=['movie_title', 'release_date'], keep='first')
)

In [101]:
# Remove the two helper columns used for de-duplication. The frame is rebound rather
# than modified with `inplace=True`, because it was produced by a boolean filter and an
# in-place change on such a frame can emit SettingWithCopyWarning.
merged_disney_df = merged_disney_df.drop(columns=['count', 'count_max'])

In [102]:
# Preview after de-duplication; the index has gaps where duplicate rows were removed.
merged_disney_df.head()

Unnamed: 0,movie_title,release_date,MPAA_rating,total_gross,inflation_adjusted_gross,movieId,genres
0,Snow White and the Seven Dwarfs,"Dec 21, 1937",G,184925485,5228953251,594,Animation|Children|Drama|Fantasy|Musical
1,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,596,Animation|Children|Fantasy|Musical
6,Fantasia,"Nov 13, 1940",G,83320000,2187090808,1282,Animation|Children|Fantasy|Musical
7,Song of the South,"Nov 12, 1946",G,65000000,1078510579,2099,Adventure|Animation|Children|Musical
8,Cinderella,"Feb 15, 1950",G,85000000,920608730,1022,Animation|Children|Fantasy|Musical|Romance


In [103]:
# Spot check: Pinocchio matched several MovieLens entries before de-duplication, so
# exactly one row (the one with the longest genres string) should remain here.
is_pinocchio = merged_disney_df['movie_title'].eq('Pinocchio')
merged_disney_df[is_pinocchio]

Unnamed: 0,movie_title,release_date,MPAA_rating,total_gross,inflation_adjusted_gross,movieId,genres
1,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,596,Animation|Children|Fantasy|Musical


In [104]:
# Rename `movie_title` to `title`, the column name used by the MovieLens movies file.
# The frame is rebound rather than renamed with `inplace=True`, which keeps the step
# chainable and avoids an in-place change on a frame derived from a filter.
merged_disney_df = merged_disney_df.rename(columns={'movie_title': 'title'})
merged_disney_df.head()

Unnamed: 0,title,release_date,MPAA_rating,total_gross,inflation_adjusted_gross,movieId,genres
0,Snow White and the Seven Dwarfs,"Dec 21, 1937",G,184925485,5228953251,594,Animation|Children|Drama|Fantasy|Musical
1,Pinocchio,"Feb 9, 1940",G,84300000,2188229052,596,Animation|Children|Fantasy|Musical
6,Fantasia,"Nov 13, 1940",G,83320000,2187090808,1282,Animation|Children|Fantasy|Musical
7,Song of the South,"Nov 12, 1946",G,65000000,1078510579,2099,Adventure|Animation|Children|Musical
8,Cinderella,"Feb 15, 1950",G,85000000,920608730,1022,Animation|Children|Fantasy|Musical|Romance


In [113]:
# Three-column lookup table with the same column names as the MovieLens movies file
# (movieId, title, genres), written without the row index.
disney_movie_columns = ['movieId', 'title', 'genres']
disney_movies = merged_disney_df.loc[:, disney_movie_columns].copy()
disney_movies.to_csv(r'output/test_disney_movies.csv', header=True, index=None)

In [114]:
# Write the full de-duplicated frame to its own file. This previously targeted
# 'output/test_disney_movies.csv', the same path the `disney_movies` table was just
# written to, so it silently overwrote that file.
# NOTE(review): the new file name is a suggestion -- confirm what downstream readers expect.
merged_disney_df.to_csv(r'output/test_merged_disney_movies.csv', index=False, header=True)