In [1]:
import pandas as pd
import numpy as np
import re

## Load the main Disney movies dataset that we want to use

Start by loading the main Disney dataset

In [76]:
disney_df = pd.read_csv('../resource/disney/disney_movies_total_gross.csv')
disney_df.head()

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross
0,Snow White and the Seven Dwarfs,"Dec 21, 1937",Musical,G,"$184,925,485","$5,228,953,251"
1,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052"
2,Fantasia,"Nov 13, 1940",Musical,G,"$83,320,000","$2,187,090,808"
3,Song of the South,"Nov 12, 1946",Adventure,G,"$65,000,000","$1,078,510,579"
4,Cinderella,"Feb 15, 1950",Drama,G,"$85,000,000","$920,608,730"


In [78]:
#Get the year from release_date

disney_df['year'] = pd.DatetimeIndex(disney_df['release_date']).year

disney_df.head()

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year
0,Snow White and the Seven Dwarfs,"Dec 21, 1937",Musical,G,"$184,925,485","$5,228,953,251",1937
1,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052",1940
2,Fantasia,"Nov 13, 1940",Musical,G,"$83,320,000","$2,187,090,808",1940
3,Song of the South,"Nov 12, 1946",Adventure,G,"$65,000,000","$1,078,510,579",1946
4,Cinderella,"Feb 15, 1950",Drama,G,"$85,000,000","$920,608,730",1950


In [79]:
disney_df.count()

movie_title                 579
release_date                579
genre                       562
MPAA_rating                 523
total_gross                 579
inflation_adjusted_gross    579
year                        579
dtype: int64

In [81]:
#cast the movie_title as string type
disney_df['movie_title'] = disney_df['movie_title'].astype(str)

disney_df.dtypes

movie_title                 object
release_date                object
genre                       object
MPAA_rating                 object
total_gross                 object
inflation_adjusted_gross    object
year                         int64
dtype: object

In [7]:
#disney_list_df = disney_df['movie_title']

## Load MovieLens movies dataset

In [82]:
#movie lens database
movielens_df = pd.read_csv('../resource/ml-25m/movies.csv')
movielens_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [84]:
#cast the title as string type
movielens_df['title'] = movielens_df['title'].astype(str)
movielens_df.dtypes

movieId     int64
title      object
genres     object
dtype: object

## Split title into movie_title and year

In [85]:
#cleanup movie title to only show title without year
#extract the 4 digit of the years inside parentheses () and create a new column 'year'

movielens_df['movie_title'] = movielens_df['title'].str.split(' \(\d', 1).str[0]
movielens_df['year'] = movielens_df['title'].str.extract('(\(\d{4}\)$)')
movielens_df['year'] = movielens_df['year'].str.replace('\)','')
movielens_df['year'] = movielens_df['year'].str.replace('\(','')

#movies_df.drop('title', axis=1, inplace=True)
movielens_df.head()

  
  import sys


Unnamed: 0,movieId,title,genres,movie_title,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995


In [86]:
#Can't parse year to int as there are some null values
movielens_df[movielens_df['year'].isnull()]

Unnamed: 0,movieId,title,genres,movie_title,year
8463,25936,"Babe Ruth Story, The (1948)",Drama,"Babe Ruth Story, The",
8602,26137,"Heroes of Telemark, The (1965)",Action|Drama|War,"Heroes of Telemark, The",
8910,26628,Jack's Back (1988),Crime|Horror|Mystery|Thriller,Jack's Back,
9120,27189,After the Rain (Ame agaru) (1999),Action|Drama,After the Rain (Ame agaru),
9763,32497,Love Letter (1995),Drama|Romance,Love Letter,
...,...,...,...,...,...
62071,207714,Tales of Found Footage,(no genres listed),Tales of Found Footage,
62104,207884,Enduring Destiny,(no genres listed),Enduring Destiny,
62285,208597,Punk the Capital: Building a Sound Movement,Documentary,Punk the Capital: Building a Sound Movement,
62326,208763,Yosemite: The Fate of Heaven,(no genres listed),Yosemite: The Fate of Heaven,


In [15]:
#Remove year is null rows
#movies_df = movies_df[~movies_df['year'].isnull()]

In [16]:
#Parse year as int
#movies_df['year'] = movies_df['year'].astype(int)

In [17]:
movielens_df.count()

movieId        62423
title          62423
genres         62423
movie_title    62423
year           61857
dtype: int64

In [18]:
movielens_df.dtypes

movieId         int64
title          object
genres         object
movie_title    object
year           object
dtype: object

## Merge the MovieLens dataset to Disney dataset. We want to keep the movieId and genres from MovieLens to our Disney dataset

In [87]:
#Merge the Disney dataset and MovieLens dataset on movie_title ONLY
merged_disney_df =merged_disney_df pd.merge(disney_df, movielens_df, how='left', on=['movie_title'])
merged_disney_df.head()

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
0,Snow White and the Seven Dwarfs,"Dec 21, 1937",Musical,G,"$184,925,485","$5,228,953,251",1937,594.0,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical,1937
1,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052",1940,596.0,Pinocchio (1940),Animation|Children|Fantasy|Musical,1940
2,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052",1940,5990.0,Pinocchio (2002),Children|Comedy|Fantasy,2002
3,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052",1940,90264.0,Pinocchio (1976),Children|Fantasy|Musical,1976
4,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052",1940,126693.0,Pinocchio (2012),Animation|Children,2012


In [88]:
merged_disney_df.count()

movie_title                 743
release_date                743
genre                       723
MPAA_rating                 666
total_gross                 743
inflation_adjusted_gross    743
year_x                      743
movieId                     578
title                       578
genres                      578
year_y                      577
dtype: int64

In [20]:
#Merge the Disney dataset and MovieLens dataset on movie_title & year
#merged_disney_df = pd.merge(disney_df, movies_df, on=['movie_title','year'])
#merged_disney_df.head()

In [89]:
merged_disney_df.count()

movie_title                 743
release_date                743
genre                       723
MPAA_rating                 666
total_gross                 743
inflation_adjusted_gross    743
year_x                      743
movieId                     578
title                       578
genres                      578
year_y                      577
dtype: int64

## Cleanup the Merged Movies data

### Remove the "$" and "," characters from total_gross & inflation_adjusted_gross. Make these 2 columns as int type

In [90]:
merged_disney_df['total_gross'] = merged_disney_df['total_gross'].str.replace(',','')
merged_disney_df['total_gross'] = merged_disney_df['total_gross'].str.replace('$','')

  


In [93]:
merged_disney_df['inflation_adjusted_gross'] = merged_disney_df['inflation_adjusted_gross'].str.replace(',','')
merged_disney_df['inflation_adjusted_gross'] = merged_disney_df['inflation_adjusted_gross'].str.replace('$','')

  


In [94]:
merged_disney_df['total_gross'] = merged_disney_df['total_gross'].astype(int)
merged_disney_df['inflation_adjusted_gross'] = merged_disney_df['inflation_adjusted_gross'].astype(int)

## Drop genre from dataset as we want genres which have more information

In [25]:
#merged_disney_df.drop('genre', axis=1, inplace=True)
#merged_disney_df.head()

## Keep only movies where total_gross != 0 from dataset

In [95]:
#only 4 Disney movie titles will be dropped

merged_disney_df= merged_disney_df[merged_disney_df['total_gross'] != 0]

In [96]:
merged_disney_df.count()

movie_title                 738
release_date                738
genre                       720
MPAA_rating                 666
total_gross                 738
inflation_adjusted_gross    738
year_x                      738
movieId                     575
title                       575
genres                      575
year_y                      574
dtype: int64

In [97]:
merged_disney_df.head()

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
0,Snow White and the Seven Dwarfs,"Dec 21, 1937",Musical,G,184925485,5228953251,1937,594.0,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical,1937
1,Pinocchio,"Feb 9, 1940",Adventure,G,84300000,2188229052,1940,596.0,Pinocchio (1940),Animation|Children|Fantasy|Musical,1940
2,Pinocchio,"Feb 9, 1940",Adventure,G,84300000,2188229052,1940,5990.0,Pinocchio (2002),Children|Comedy|Fantasy,2002
3,Pinocchio,"Feb 9, 1940",Adventure,G,84300000,2188229052,1940,90264.0,Pinocchio (1976),Children|Fantasy|Musical,1976
4,Pinocchio,"Feb 9, 1940",Adventure,G,84300000,2188229052,1940,126693.0,Pinocchio (2012),Animation|Children,2012


## Save this merged Disney movies dataset as csv

In [29]:
#merged_disney_df.to_csv (r'output/MERGED_disney_movies_total_gross.csv', index = None, header=True) 

## Load MovieLens ratings dataset

In [30]:
#movie lens database
ratings_df = pd.read_csv('../resource/ml-25m/ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [54]:
disney_ratings = pd.merge(ratings_df, merged_disney_df[['movieId','movie_title']], on='movieId')
disney_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,movie_title
0,1,6377,4.0,1147868469,Finding Nemo
1,3,6377,4.0,1439472644,Finding Nemo
2,4,6377,3.0,1573939584,Finding Nemo
3,12,6377,4.0,1167582434,Finding Nemo
4,13,6377,4.0,1238029123,Finding Nemo


In [57]:
disney_ratings.count()

userId         1598445
rating         1598445
timestamp      1598445
movie_title    1598445
dtype: int64

In [56]:
disney_ratings.drop(columns='movieId', inplace=True)
disney_ratings.head()

Unnamed: 0,userId,rating,timestamp,movie_title
0,1,4.0,1147868469,Finding Nemo
1,3,4.0,1439472644,Finding Nemo
2,4,3.0,1573939584,Finding Nemo
3,12,4.0,1167582434,Finding Nemo
4,13,4.0,1238029123,Finding Nemo


### Remove the duplicated MERGED movies

In [58]:
merged_disney_df.head()

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
0,Snow White and the Seven Dwarfs,"Dec 21, 1937",Musical,G,"$184,925,485","$5,228,953,251",1937,594.0,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical,1937
1,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052",1940,596.0,Pinocchio (1940),Animation|Children|Fantasy|Musical,1940
2,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052",1940,5990.0,Pinocchio (2002),Children|Comedy|Fantasy,2002
3,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052",1940,90264.0,Pinocchio (1976),Children|Fantasy|Musical,1976
4,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052",1940,126693.0,Pinocchio (2012),Animation|Children,2012


## JUDAT -- please ignore the codes at the bottom as these are work on progress for removing duplicate movies

## Genre & Genres merge so all movies will have Genres regardless where the information is from 

In [98]:
#Is there any movies with genre = NaN and genres = (no genres listed)?
merged_disney_df.loc[(merged_disney_df['genre'].isna()) &  (merged_disney_df['genres'] == '(no genres listed)')]

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
227,My Boyfriend's Back,"Aug 6, 1993",,PG-13,3218882,6554384,1993,131606.0,My Boyfriend's Back,(no genres listed),


In [99]:
#Check My Boyfriend's Back genres
merged_disney_df.loc[merged_disney_df['movie_title'] == 'My Boyfriend\'s Back']

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
226,My Boyfriend's Back,"Aug 6, 1993",,PG-13,3218882,6554384,1993,2552.0,My Boyfriend's Back (1993),Comedy,1993.0
227,My Boyfriend's Back,"Aug 6, 1993",,PG-13,3218882,6554384,1993,131606.0,My Boyfriend's Back,(no genres listed),


In [100]:
#Manual search on IMDB: https://www.imdb.com/title/tt0107626/
#Genre is: Comedy|Fantasy|Horror|Romance
#Manually add this genres into the dataset
merged_disney_df['genres'][merged_disney_df['movie_title'].str.contains('My Boyfriend\'s Back')] ='Comedy|Fantasy|Horror|Romance'

In [101]:
#Check My Boyfriend's Back genres
merged_disney_df.loc[merged_disney_df['movie_title'] == 'My Boyfriend\'s Back']

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
226,My Boyfriend's Back,"Aug 6, 1993",,PG-13,3218882,6554384,1993,2552.0,My Boyfriend's Back (1993),Comedy|Fantasy|Horror|Romance,1993.0
227,My Boyfriend's Back,"Aug 6, 1993",,PG-13,3218882,6554384,1993,131606.0,My Boyfriend's Back,Comedy|Fantasy|Horror|Romance,


In [102]:
#Check the movie titles with genres == '(no genres listed)'
merged_disney_df.loc[merged_disney_df['genres'] == '(no genres listed)']

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
16,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,149883.0,Cinderella (2000),(no genres listed),2000
19,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,179555.0,Cinderella (1960),(no genres listed),1960
47,Freaky Friday,"Jan 21, 1977",Comedy,,25942000,98067733,1977,136592.0,Freaky Friday (1995),(no genres listed),1995
132,Stella,"Feb 2, 1990",Drama,PG-13,20062347,40077147,1990,163160.0,Stella (1950),(no genres listed),1950
150,White Fang,"Jan 18, 1991",Adventure,PG,34729091,69540672,1991,184969.0,White Fang (1997),(no genres listed),1997
166,Paradise,"Sep 18, 1991",Drama,PG-13,18634643,37313540,1991,162580.0,Paradise (2015),(no genres listed),2015
324,Boys,"May 10, 1996",Drama,PG-13,21930418,41826566,1996,195079.0,Boys (2016),(no genres listed),2016
332,Jack,"Aug 9, 1996",Drama,PG-13,58617334,111792852,1996,158838.0,Jack (2013),(no genres listed),2013
464,The Count of Monte Cristo,"Jan 25, 2002",Drama,PG-13,54228104,78682079,2002,128295.0,The Count of Monte Cristo (1961),(no genres listed),1961
500,Freaky Friday,"Aug 6, 2003",Comedy,PG,110222438,154090360,2003,136592.0,Freaky Friday (1995),(no genres listed),1995


In [103]:
#Get the list of movie titles where genres == '(no genres listed)'
no_genres_list = merged_disney_df['movie_title'].loc[merged_disney_df['genres'] == '(no genres listed)'].unique().tolist()
print(no_genres_list)

['Cinderella', 'Freaky Friday', 'Stella', 'White Fang', 'Paradise', 'Boys', 'Jack', 'The Count of Monte Cristo', 'The Village', 'Into the Woods']


In [104]:
#For all the above movies with genres == (no genres listed), we copy the genre into genres so all 
#these movies will now have genres
#merged_disney_df['genres'] = np.where(merged_disney_df['genres'] == '(no genres listed)', merged_disney_df['genre'], merged_disney_df['genres'])

merged_disney_df.loc[(merged_disney_df.genres == '(no genres listed)')  , 'genres'] = merged_disney_df.genre

In [105]:
merged_disney_df.loc[merged_disney_df['genres'] == '(no genres listed)']

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y


In [106]:
#Check the list of movies with genres == (no genres listed) to confirm if the genres are now populated correctly
merged_disney_df.query('movie_title in @no_genres_list')

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
8,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,1022.0,Cinderella (1950),Animation|Children|Fantasy|Musical|Romance,1950
9,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,63239.0,Cinderella (1997),Children|Fantasy|Musical|Romance,1997
10,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,111901.0,Cinderella (1914),Fantasy|Romance,1914
11,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,129195.0,Cinderella (2011),Drama|Romance,2011
12,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,130073.0,Cinderella (2015),Children|Drama|Fantasy|Romance,2015
...,...,...,...,...,...,...,...,...,...,...,...
709,Cinderella,"Mar 13, 2015",Drama,PG,201151353,201151353,2015,179555.0,Cinderella (1960),Drama,1960
710,Cinderella,"Mar 13, 2015",Drama,PG,201151353,201151353,2015,179605.0,Cinderella (1965),Fantasy|Romance,1965
711,Cinderella,"Mar 13, 2015",Drama,PG,201151353,201151353,2015,188371.0,Cinderella (1979),Animation|Children|Fantasy,1979
712,Cinderella,"Mar 13, 2015",Drama,PG,201151353,201151353,2015,203467.0,Cinderella (2006),Drama|Horror|Mystery|Thriller,2006


In [107]:
merged_disney_df.count()

movie_title                 738
release_date                738
genre                       720
MPAA_rating                 666
total_gross                 738
inflation_adjusted_gross    738
year_x                      738
movieId                     575
title                       575
genres                      575
year_y                      574
dtype: int64

In [108]:
#there is a null row in year_y

merged_disney_df.loc[merged_disney_df['year_y'].isnull()]

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
33,The Absent Minded Professor,"Mar 16, 1961",Comedy,,25381407,310094574,1961,,,,
38,The Sword in the Stone,"Dec 25, 1963",Adventure,,22182353,153870834,1963,,,,
41,The Aristocats,"Apr 24, 1970",Musical,G,55675257,255161499,1970,,,,
44,The Apple Dumpling Gang,"Jul 1, 1975",Comedy,,31916500,131246872,1975,,,,
50,The Rescuers,"Jun 22, 1977",Adventure,,48775599,159743914,1977,,,,
...,...,...,...,...,...,...,...,...,...,...,...
686,Million Dollar Arm,"May 10, 2014",Drama,PG,36447959,37607865,2014,,,,
688,Planes: Fire and Rescue,"Jul 18, 2014",Adventure,PG,59157732,61040349,2014,,,,
691,"Alexander and the Terrible, Horrible,…","Oct 10, 2014",Comedy,PG,66954149,69055550,2014,,,,
724,Star Wars Ep. VII: The Force Awakens,"Dec 18, 2015",Adventure,PG-13,936662225,936662225,2015,,,,


In [109]:
#if the year_y is null, we copy the year_x to year_y
merged_disney_df.loc[(merged_disney_df['year_y'].isnull())  , 'year_y'] = merged_disney_df.year_x

In [110]:
merged_disney_df.count()

movie_title                 738
release_date                738
genre                       720
MPAA_rating                 666
total_gross                 738
inflation_adjusted_gross    738
year_x                      738
movieId                     575
title                       575
genres                      575
year_y                      738
dtype: int64

In [111]:
#Check Cinderella genres
merged_disney_df.loc[merged_disney_df['movie_title'] == 'Cinderella']

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
8,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,1022.0,Cinderella (1950),Animation|Children|Fantasy|Musical|Romance,1950
9,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,63239.0,Cinderella (1997),Children|Fantasy|Musical|Romance,1997
10,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,111901.0,Cinderella (1914),Fantasy|Romance,1914
11,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,129195.0,Cinderella (2011),Drama|Romance,2011
12,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,130073.0,Cinderella (2015),Children|Drama|Fantasy|Romance,2015
13,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,146030.0,Cinderella (1947),Children|Comedy|Fantasy|Musical|Romance,1947
14,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,149875.0,Cinderella (1957),Drama|Romance,1957
15,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,149877.0,Cinderella (1977),Comedy,1977
16,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,149883.0,Cinderella (2000),Drama,2000
17,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,165397.0,Cinderella (1899),Children|Fantasy|Horror|Sci-Fi,1899


In [114]:
#Cast both year_y and year_x as int

merged_disney_df['year_x'] = merged_disney_df['year_x'].astype(int)
merged_disney_df['year_y'] = merged_disney_df['year_y'].astype(int)

In [120]:
#new_movieId = merged_disney_df.loc[(merged_disney_df['year_x'] == merged_disney_df['year_y'])]

In [123]:
merged_disney_df.loc[(merged_disney_df['movieId'].isnull())]

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
33,The Absent Minded Professor,"Mar 16, 1961",Comedy,,25381407,310094574,1961,,,,1961
38,The Sword in the Stone,"Dec 25, 1963",Adventure,,22182353,153870834,1963,,,,1963
41,The Aristocats,"Apr 24, 1970",Musical,G,55675257,255161499,1970,,,,1970
44,The Apple Dumpling Gang,"Jul 1, 1975",Comedy,,31916500,131246872,1975,,,,1975
50,The Rescuers,"Jun 22, 1977",Adventure,,48775599,159743914,1977,,,,1977
...,...,...,...,...,...,...,...,...,...,...,...
686,Million Dollar Arm,"May 10, 2014",Drama,PG,36447959,37607865,2014,,,,2014
688,Planes: Fire and Rescue,"Jul 18, 2014",Adventure,PG,59157732,61040349,2014,,,,2014
691,"Alexander and the Terrible, Horrible,…","Oct 10, 2014",Comedy,PG,66954149,69055550,2014,,,,2014
724,Star Wars Ep. VII: The Force Awakens,"Dec 18, 2015",Adventure,PG-13,936662225,936662225,2015,,,,2015


In [125]:
merged_disney_df.loc[(merged_disney_df['total_gross'] == 936662225)]

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
724,Star Wars Ep. VII: The Force Awakens,"Dec 18, 2015",Adventure,PG-13,936662225,936662225,2015,,,,2015


In [135]:
#merged_disney_df = merged_disney_df[['movieId', 'title', 'genres']].copy()
merged_disney_df.to_csv (r'output/merged_disney_ml.csv', index = None, header=True) 

## Load IMDB

## Keep the codes below as Markdown so as not to run them now. Will change back to code once we're done with the above. 

### Keep only the movies where the genres_length is the longest

merged_disney_df['count'] = merged_disney_df['genres'].str.len()
merged_disney_df.head()

merged_disney_df_g = merged_disney_df.groupby(['movie_title','release_date']).agg({'count':'max'})

merged_disney_df_g = merged_disney_df_g.reset_index()

merged_disney_df_g = merged_disney_df_g.rename(columns={'count':'count_max'})

merged_disney_df = pd.merge(merged_disney_df, merged_disney_df_g, how='left', on=['movie_title'])

merged_disney_df = merged_disney_df[merged_disney_df['count'] == merged_disney_df['count_max']]

# add a iterrative code here to check if row[n] & row[n+1] has the same movie_title name and same count length
# keep the first one. 
for i in merged_disney_df.index:
    try:
         if merged_disney_df["movie_title"][i] == merged_disney_df["movie_title"][i+1] and merged_disney_df['count'][i] == merged_disney_df['count'][i+1]:
            print (merged_disney_df["movie_title"][i])
            print (merged_disney_df["movie_title"][i+1])
            print ('=======================')
    except KeyError:
        pass

merged_disney_df.loc[merged_disney_df['movie_title'] == 'Freaky Friday']

merged_disney_df.loc[merged_disney_df['movie_title'] == 'Freaky Friday']

merged_disney_df.drop(columns=['count', 'count_max'], inplace=True)

merged_disney_df.head()

#Check to see if Pinocchio is unique & has the longest genre length 
merged_disney_df.loc[merged_disney_df['movie_title'] == 'Pinocchio']

In [51]:
#merged_disney_df.rename(columns={'movie_title': 'title'}, inplace = True)
#merged_disney_df.head()

In [52]:
#disney_movies = merged_disney_df[['movieId', 'title', 'genres']].copy()
#disney_movies.to_csv (r'output/test_disney_movies.csv', index = None, header=True) 