In [1]:
import pandas as pd
import numpy as np
import re

## Load the main Disney movies dataset that we want to use

In [2]:
disney_df = pd.read_csv('../resource/disney/disney_movies_total_gross.csv')
disney_df.head()

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross
0,Snow White and the Seven Dwarfs,"Dec 21, 1937",Musical,G,"$184,925,485","$5,228,953,251"
1,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052"
2,Fantasia,"Nov 13, 1940",Musical,G,"$83,320,000","$2,187,090,808"
3,Song of the South,"Nov 12, 1946",Adventure,G,"$65,000,000","$1,078,510,579"
4,Cinderella,"Feb 15, 1950",Drama,G,"$85,000,000","$920,608,730"


In [3]:
disney_df['year'] = pd.DatetimeIndex(disney_df['release_date']).year

disney_df.head()

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year
0,Snow White and the Seven Dwarfs,"Dec 21, 1937",Musical,G,"$184,925,485","$5,228,953,251",1937
1,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052",1940
2,Fantasia,"Nov 13, 1940",Musical,G,"$83,320,000","$2,187,090,808",1940
3,Song of the South,"Nov 12, 1946",Adventure,G,"$65,000,000","$1,078,510,579",1946
4,Cinderella,"Feb 15, 1950",Drama,G,"$85,000,000","$920,608,730",1950


In [4]:
disney_df.count()

movie_title                 579
release_date                579
genre                       562
MPAA_rating                 523
total_gross                 579
inflation_adjusted_gross    579
year                        579
dtype: int64

In [5]:
disney_df['movie_title'] = disney_df['movie_title'].astype(str)

In [6]:
disney_df.dtypes

movie_title                 object
release_date                object
genre                       object
MPAA_rating                 object
total_gross                 object
inflation_adjusted_gross    object
year                         int64
dtype: object

In [7]:
#disney_list_df = disney_df['movie_title']

## Load MovieLens movies dataset

In [8]:
#movie lens database
movielens_df = pd.read_csv('../resource/ml-25m/movies.csv')
movielens_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
movielens_df['title'] = movielens_df['title'].astype(str)

In [10]:
movielens_df.dtypes

movieId     int64
title      object
genres     object
dtype: object

## Split title into movie_title and year

In [11]:
#cleanup movie title to only show title without year
#extract the 4 digit of the years inside parentheses () and create a new column 'year'

movielens_df['movie_title'] = movielens_df['title'].str.split(' \(\d', 1).str[0]
movielens_df['year'] = movielens_df['title'].str.extract('(\(\d{4}\)$)')
movielens_df['year'] = movielens_df['year'].str.replace('\)','')
movielens_df['year'] = movielens_df['year'].str.replace('\(','')

#movies_df.drop('title', axis=1, inplace=True)
movielens_df.head()

  
  import sys


Unnamed: 0,movieId,title,genres,movie_title,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995


In [12]:
#Check year column is computed correctly

movielens_df[movielens_df['year'].str.len()>4]

Unnamed: 0,movieId,title,genres,movie_title,year


In [13]:
#Remove all rows where the year length is more than 4. Confident that these titles are NOT Disney movies
#movies_df.drop(movies_df.loc[movies_df['year'].str.len()>4].index, inplace=True)
#movies_df.head()

In [14]:
#Can't parse year to int as there are some null values
movielens_df[movielens_df['year'].isnull()]

Unnamed: 0,movieId,title,genres,movie_title,year
8463,25936,"Babe Ruth Story, The (1948)",Drama,"Babe Ruth Story, The",
8602,26137,"Heroes of Telemark, The (1965)",Action|Drama|War,"Heroes of Telemark, The",
8910,26628,Jack's Back (1988),Crime|Horror|Mystery|Thriller,Jack's Back,
9120,27189,After the Rain (Ame agaru) (1999),Action|Drama,After the Rain (Ame agaru),
9763,32497,Love Letter (1995),Drama|Romance,Love Letter,
...,...,...,...,...,...
62071,207714,Tales of Found Footage,(no genres listed),Tales of Found Footage,
62104,207884,Enduring Destiny,(no genres listed),Enduring Destiny,
62285,208597,Punk the Capital: Building a Sound Movement,Documentary,Punk the Capital: Building a Sound Movement,
62326,208763,Yosemite: The Fate of Heaven,(no genres listed),Yosemite: The Fate of Heaven,


In [15]:
#Remove year is null rows
#movies_df = movies_df[~movies_df['year'].isnull()]

In [16]:
#Parse year as int
#movies_df['year'] = movies_df['year'].astype(int)

In [17]:
movielens_df.count()

movieId        62423
title          62423
genres         62423
movie_title    62423
year           61857
dtype: int64

In [18]:
movielens_df.dtypes

movieId         int64
title          object
genres         object
movie_title    object
year           object
dtype: object

## Merge the MovieLens dataset to Disney dataset. We want to keep the movieId and genres from MovieLens to our Disney dataset

In [19]:
#Merge the Disney dataset and MovieLens dataset on movie_title ONLY
merged_disney_df = pd.merge(disney_df, movielens_df, how='inner', on=['movie_title'])
merged_disney_df.head()

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
0,Snow White and the Seven Dwarfs,"Dec 21, 1937",Musical,G,"$184,925,485","$5,228,953,251",1937,594,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical,1937
1,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052",1940,596,Pinocchio (1940),Animation|Children|Fantasy|Musical,1940
2,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052",1940,5990,Pinocchio (2002),Children|Comedy|Fantasy,2002
3,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052",1940,90264,Pinocchio (1976),Children|Fantasy|Musical,1976
4,Pinocchio,"Feb 9, 1940",Adventure,G,"$84,300,000","$2,188,229,052",1940,126693,Pinocchio (2012),Animation|Children,2012


In [20]:
#Merge the Disney dataset and MovieLens dataset on movie_title & year
#merged_disney_df = pd.merge(disney_df, movies_df, on=['movie_title','year'])
#merged_disney_df.head()

In [21]:
merged_disney_df.count()

movie_title                 578
release_date                578
genre                       564
MPAA_rating                 514
total_gross                 578
inflation_adjusted_gross    578
year_x                      578
movieId                     578
title                       578
genres                      578
year_y                      577
dtype: int64

## Remove the "$" and "," characters from total_gross & inflation_adjusted_gross. Make these 2 columns as int type

In [22]:
merged_disney_df['total_gross'] = merged_disney_df['total_gross'].str.replace(',','')
merged_disney_df['total_gross'] = merged_disney_df['total_gross'].str.replace('$','')

  


In [23]:
merged_disney_df['inflation_adjusted_gross'] = merged_disney_df['inflation_adjusted_gross'].str.replace(',','')
merged_disney_df['inflation_adjusted_gross'] = merged_disney_df['inflation_adjusted_gross'].str.replace('$','')

  


In [24]:
merged_disney_df['total_gross'] = merged_disney_df['total_gross'].astype(int)
merged_disney_df['inflation_adjusted_gross'] = merged_disney_df['inflation_adjusted_gross'].astype(int)

## Drop genre from dataset as we want genres which have more information

In [25]:
#merged_disney_df.drop('genre', axis=1, inplace=True)
#merged_disney_df.head()

## Keep only movies where total_gross != 0 from dataset

In [26]:
merged_disney_df= merged_disney_df[merged_disney_df['total_gross'] != 0]

In [27]:
merged_disney_df.count()

movie_title                 575
release_date                575
genre                       561
MPAA_rating                 514
total_gross                 575
inflation_adjusted_gross    575
year_x                      575
movieId                     575
title                       575
genres                      575
year_y                      574
dtype: int64

In [28]:
merged_disney_df.head()

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
0,Snow White and the Seven Dwarfs,"Dec 21, 1937",Musical,G,184925485,5228953251,1937,594,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical,1937
1,Pinocchio,"Feb 9, 1940",Adventure,G,84300000,2188229052,1940,596,Pinocchio (1940),Animation|Children|Fantasy|Musical,1940
2,Pinocchio,"Feb 9, 1940",Adventure,G,84300000,2188229052,1940,5990,Pinocchio (2002),Children|Comedy|Fantasy,2002
3,Pinocchio,"Feb 9, 1940",Adventure,G,84300000,2188229052,1940,90264,Pinocchio (1976),Children|Fantasy|Musical,1976
4,Pinocchio,"Feb 9, 1940",Adventure,G,84300000,2188229052,1940,126693,Pinocchio (2012),Animation|Children,2012


## Save this merged Disney movies dataset as csv

In [29]:
merged_disney_df.to_csv (r'output/MERGED_disney_movies_total_gross.csv', index = None, header=True) 

## Load MovieLens ratings dataset

In [30]:
#movie lens database
ratings_df = pd.read_csv('../resource/ml-25m/ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [31]:
merged_disney_ratings = pd.merge(ratings_df, merged_disney_df, on="movieId")
merged_disney_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,title,genres,year_y
0,1,6377,4.0,1147868469,Finding Nemo,"May 30, 2003",Adventure,G,380529370,518148559,2003,Finding Nemo (2003),Adventure|Animation|Children|Comedy,2003
1,3,6377,4.0,1439472644,Finding Nemo,"May 30, 2003",Adventure,G,380529370,518148559,2003,Finding Nemo (2003),Adventure|Animation|Children|Comedy,2003
2,4,6377,3.0,1573939584,Finding Nemo,"May 30, 2003",Adventure,G,380529370,518148559,2003,Finding Nemo (2003),Adventure|Animation|Children|Comedy,2003
3,12,6377,4.0,1167582434,Finding Nemo,"May 30, 2003",Adventure,G,380529370,518148559,2003,Finding Nemo (2003),Adventure|Animation|Children|Comedy,2003
4,13,6377,4.0,1238029123,Finding Nemo,"May 30, 2003",Adventure,G,380529370,518148559,2003,Finding Nemo (2003),Adventure|Animation|Children|Comedy,2003


### Remove the duplicated MERGED movies

In [32]:
merged_disney_df.head()

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
0,Snow White and the Seven Dwarfs,"Dec 21, 1937",Musical,G,184925485,5228953251,1937,594,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical,1937
1,Pinocchio,"Feb 9, 1940",Adventure,G,84300000,2188229052,1940,596,Pinocchio (1940),Animation|Children|Fantasy|Musical,1940
2,Pinocchio,"Feb 9, 1940",Adventure,G,84300000,2188229052,1940,5990,Pinocchio (2002),Children|Comedy|Fantasy,2002
3,Pinocchio,"Feb 9, 1940",Adventure,G,84300000,2188229052,1940,90264,Pinocchio (1976),Children|Fantasy|Musical,1976
4,Pinocchio,"Feb 9, 1940",Adventure,G,84300000,2188229052,1940,126693,Pinocchio (2012),Animation|Children,2012


## JUDAT -- please ignore the codes at the bottom as these are work on progress for removing duplicate movies

## Genre & Genres merge so all movies will have Genres regardless where the information is from 

In [33]:
#Is there any movies with genre = NaN and genres = (no genres listed)?
merged_disney_df.loc[(merged_disney_df['genre'].isna()) &  (merged_disney_df['genres'] == '(no genres listed)')]

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
210,My Boyfriend's Back,"Aug 6, 1993",,PG-13,3218882,6554384,1993,131606,My Boyfriend's Back,(no genres listed),


In [34]:
#Check My Boyfriend's Back genres
merged_disney_df.loc[merged_disney_df['movie_title'] == 'My Boyfriend\'s Back']

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
209,My Boyfriend's Back,"Aug 6, 1993",,PG-13,3218882,6554384,1993,2552,My Boyfriend's Back (1993),Comedy,1993.0
210,My Boyfriend's Back,"Aug 6, 1993",,PG-13,3218882,6554384,1993,131606,My Boyfriend's Back,(no genres listed),


In [35]:
#Manual search on IMDB: https://www.imdb.com/title/tt0107626/
#Genre is: Comedy|Fantasy|Horror|Romance
#Manually add this genres into the dataset
merged_disney_df['genres'][merged_disney_df['movie_title'].str.contains('My Boyfriend\'s Back')] ='Comedy|Fantasy|Horror|Romance'

In [36]:
#Check My Boyfriend's Back genres
merged_disney_df.loc[merged_disney_df['movie_title'] == 'My Boyfriend\'s Back']

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
209,My Boyfriend's Back,"Aug 6, 1993",,PG-13,3218882,6554384,1993,2552,My Boyfriend's Back (1993),Comedy|Fantasy|Horror|Romance,1993.0
210,My Boyfriend's Back,"Aug 6, 1993",,PG-13,3218882,6554384,1993,131606,My Boyfriend's Back,Comedy|Fantasy|Horror|Romance,


In [37]:
#Check the movie titles with genres == '(no genres listed)'
merged_disney_df.loc[merged_disney_df['genres'] == '(no genres listed)']

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
16,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,149883,Cinderella (2000),(no genres listed),2000
19,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,179555,Cinderella (1960),(no genres listed),1960
32,Cinderella,"Mar 13, 2015",Drama,PG,201151353,201151353,2015,149883,Cinderella (2000),(no genres listed),2000
35,Cinderella,"Mar 13, 2015",Drama,PG,201151353,201151353,2015,179555,Cinderella (1960),(no genres listed),1960
62,Freaky Friday,"Jan 21, 1977",Comedy,,25942000,98067733,1977,136592,Freaky Friday (1995),(no genres listed),1995
66,Freaky Friday,"Aug 6, 2003",Comedy,PG,110222438,154090360,2003,136592,Freaky Friday (1995),(no genres listed),1995
136,Stella,"Feb 2, 1990",Drama,PG-13,20062347,40077147,1990,163160,Stella (1950),(no genres listed),1950
151,White Fang,"Jan 18, 1991",Adventure,PG,34729091,69540672,1991,184969,White Fang (1997),(no genres listed),1997
163,Paradise,"Sep 18, 1991",Drama,PG-13,18634643,37313540,1991,162580,Paradise (2015),(no genres listed),2015
289,Boys,"May 10, 1996",Drama,PG-13,21930418,41826566,1996,195079,Boys (2016),(no genres listed),2016


In [38]:
#Get the list of movie titles where genres == '(no genres listed)'
no_genres_list = merged_disney_df['movie_title'].loc[merged_disney_df['genres'] == '(no genres listed)'].unique().tolist()
print(no_genres_list)

['Cinderella', 'Freaky Friday', 'Stella', 'White Fang', 'Paradise', 'Boys', 'Jack', 'The Count of Monte Cristo', 'The Village', 'Into the Woods']


In [39]:
#For all the above movies with genres == (no genres listed), we copy the genre into genres so all 
#these movies will now have genres
#merged_disney_df['genres'] = np.where(merged_disney_df['genres'] == '(no genres listed)', merged_disney_df['genre'], merged_disney_df['genres'])

merged_disney_df.loc[(merged_disney_df.genres == '(no genres listed)')  , 'genres'] = merged_disney_df.genre

In [40]:
merged_disney_df.loc[merged_disney_df['genres'] == '(no genres listed)']

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y


In [41]:
#Check the list of movies with genres == (no genres listed) to confirm if the genres are now populated correctly
merged_disney_df.query('movie_title in @no_genres_list')

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
8,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,1022,Cinderella (1950),Animation|Children|Fantasy|Musical|Romance,1950
9,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,63239,Cinderella (1997),Children|Fantasy|Musical|Romance,1997
10,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,111901,Cinderella (1914),Fantasy|Romance,1914
11,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,129195,Cinderella (2011),Drama|Romance,2011
12,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,130073,Cinderella (2015),Children|Drama|Fantasy|Romance,2015
...,...,...,...,...,...,...,...,...,...,...,...
430,The Village,"Jul 30, 2004",Thriller/Suspense,PG-13,114197520,155021748,2004,131654,The Village (2010),Thriller/Suspense,2010
431,The Village,"Jul 30, 2004",Thriller/Suspense,PG-13,114197520,155021748,2004,194600,The Village (1993),Animation,1993
547,Into the Woods,"Dec 25, 2014",Musical,PG,128002372,130894237,2014,8580,Into the Woods (1991),Adventure|Comedy|Fantasy|Musical,1991
548,Into the Woods,"Dec 25, 2014",Musical,PG,128002372,130894237,2014,118997,Into the Woods (2014),Children|Comedy|Fantasy|Musical,2014


In [42]:
merged_disney_df.count()

movie_title                 575
release_date                575
genre                       561
MPAA_rating                 514
total_gross                 575
inflation_adjusted_gross    575
year_x                      575
movieId                     575
title                       575
genres                      575
year_y                      574
dtype: int64

In [43]:
#there is a null row in year_y

merged_disney_df.loc[merged_disney_df['year_y'].isnull()]

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
210,My Boyfriend's Back,"Aug 6, 1993",,PG-13,3218882,6554384,1993,131606,My Boyfriend's Back,Comedy|Fantasy|Horror|Romance,


In [44]:
#if the year_y is null, we copy the year_x to year_y

merged_disney_df.loc[(merged_disney_df['year_y'].isnull())  , 'year_y'] = merged_disney_df.year_x

In [45]:
merged_disney_df.count()

movie_title                 575
release_date                575
genre                       561
MPAA_rating                 514
total_gross                 575
inflation_adjusted_gross    575
year_x                      575
movieId                     575
title                       575
genres                      575
year_y                      575
dtype: int64

In [46]:
#Check Cinderella genres
merged_disney_df.loc[merged_disney_df['movie_title'] == 'Cinderella']

Unnamed: 0,movie_title,release_date,genre,MPAA_rating,total_gross,inflation_adjusted_gross,year_x,movieId,title,genres,year_y
8,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,1022,Cinderella (1950),Animation|Children|Fantasy|Musical|Romance,1950
9,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,63239,Cinderella (1997),Children|Fantasy|Musical|Romance,1997
10,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,111901,Cinderella (1914),Fantasy|Romance,1914
11,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,129195,Cinderella (2011),Drama|Romance,2011
12,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,130073,Cinderella (2015),Children|Drama|Fantasy|Romance,2015
13,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,146030,Cinderella (1947),Children|Comedy|Fantasy|Musical|Romance,1947
14,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,149875,Cinderella (1957),Drama|Romance,1957
15,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,149877,Cinderella (1977),Comedy,1977
16,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,149883,Cinderella (2000),Drama,2000
17,Cinderella,"Feb 15, 1950",Drama,G,85000000,920608730,1950,165397,Cinderella (1899),Children|Fantasy|Horror|Sci-Fi,1899


In [47]:
#Cast both year_y and year_x as int

merged_disney_df['year_x'] = merged_disney_df['year_x'].astype(int)
merged_disney_df['year_y'] = merged_disney_df['year_y'].astype(int)

In [48]:
new_movieId = merged_disney_df.loc[merged_disney_df['year_x'] == merged_disney_df['year_y']]

In [52]:
new_movieId.count()

movie_title                 389
release_date                389
genre                       380
MPAA_rating                 351
total_gross                 389
inflation_adjusted_gross    389
year_x                      389
movieId                     389
title                       389
genres                      389
year_y                      389
dtype: int64

In [49]:
#merged_disney_df = merged_disney_df.loc[merged_disney_df['year_x'] == merged_disney_df['year_y']]

## Keep the codes below as Markdown so as not to run them now. Will change back to code once we're done with the above. 

### Keep only the movies where the genres_length is the longest

merged_disney_df['count'] = merged_disney_df['genres'].str.len()
merged_disney_df.head()

merged_disney_df_g = merged_disney_df.groupby(['movie_title','release_date']).agg({'count':'max'})

merged_disney_df_g = merged_disney_df_g.reset_index()

merged_disney_df_g = merged_disney_df_g.rename(columns={'count':'count_max'})

merged_disney_df = pd.merge(merged_disney_df, merged_disney_df_g, how='left', on=['movie_title'])

merged_disney_df = merged_disney_df[merged_disney_df['count'] == merged_disney_df['count_max']]

# add a iterrative code here to check if row[n] & row[n+1] has the same movie_title name and same count length
# keep the first one. 
for i in merged_disney_df.index:
    try:
         if merged_disney_df["movie_title"][i] == merged_disney_df["movie_title"][i+1] and merged_disney_df['count'][i] == merged_disney_df['count'][i+1]:
            print (merged_disney_df["movie_title"][i])
            print (merged_disney_df["movie_title"][i+1])
            print ('=======================')
    except KeyError:
        pass

merged_disney_df.loc[merged_disney_df['movie_title'] == 'Freaky Friday']

merged_disney_df.loc[merged_disney_df['movie_title'] == 'Freaky Friday']

merged_disney_df.drop(columns=['count', 'count_max'], inplace=True)

merged_disney_df.head()

#Check to see if Pinocchio is unique & has the longest genre length 
merged_disney_df.loc[merged_disney_df['movie_title'] == 'Pinocchio']

In [50]:
#merged_disney_df.rename(columns={'movie_title': 'title'}, inplace = True)
#merged_disney_df.head()

In [51]:
#disney_movies = merged_disney_df[['movieId', 'title', 'genres']].copy()
#disney_movies.to_csv (r'output/test_disney_movies.csv', index = None, header=True) 