In [41]:
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_rows = 900
pd.options.display.max_columns = 30

In [42]:
movies_df = pd.read_csv( "movie_metadata.csv" )

In [43]:
#check column labels
movies_df.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [44]:
#drop movies that aren't in english
movies_df = movies_df.drop(movies_df[movies_df.language != "English"].index)
#drop movies that aren't USA movies
movies_df = movies_df.drop( movies_df[movies_df.country != 'USA'].index )

In [45]:
#drop labels we aren't likely interested in
movies_df.drop( labels = [ 'color', 'duration', 'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link', 'language',
                       'content_rating', 'aspect_ratio', 'num_critic_for_reviews', 'num_user_for_reviews', 'budget' ],
            axis = 1, inplace = True )

In [46]:
#rearrange column labels
movies_df = movies_df [ [ 'movie_title', 'title_year', 'country', 'gross', 'director_name', 'director_facebook_likes',
               'actor_1_name', 'actor_1_facebook_likes', 'actor_2_name', 'actor_2_facebook_likes', 'actor_3_name',
               'actor_3_facebook_likes', 'cast_total_facebook_likes', 'movie_facebook_likes', 'imdb_score', 'num_voted_users',
               'genres' ] ]

In [47]:
#drop null gross. note: checked that no gross == 0 movies_df[movies_df['gross'] == 0]
#we don't care about movies whose gross we can't measure, otherwise we have to look it up on our own which is tedious
movies_df.drop(movies_df[movies_df.gross.isnull()].index, inplace = True)

In [48]:
#what movies have null values at this point
movies_df[movies_df.isnull().any(axis=1)]
#perhaps we should drop the rows with null values here because if we have an imbalance in number of actor_1, actor_2, actor_3,
#and director it could create a problem when comparing graphs

Unnamed: 0,movie_title,title_year,country,gross,director_name,director_facebook_likes,actor_1_name,actor_1_facebook_likes,actor_2_name,actor_2_facebook_likes,actor_3_name,actor_3_facebook_likes,cast_total_facebook_likes,movie_facebook_likes,imdb_score,num_voted_users,genres
3346,All Is Lost,2013.0,USA,6262942.0,J.C. Chandor,78.0,Robert Redford,0.0,,,,,0,18000,6.9,59545,Action|Adventure|Drama
3528,Gerry,2002.0,USA,236266.0,Gus Van Sant,835.0,Matt Damon,13000.0,Casey Affleck,0.0,,,13000,0,6.2,15104,Adventure|Drama|Mystery
4225,Fantasia,1940.0,USA,76400000.0,James Algar,11.0,Leopold Stokowski,16.0,Deems Taylor,0.0,,,16,3000,7.8,71321,Animation|Family|Fantasy|Music
4519,Sex with Strangers,2002.0,USA,247740.0,Harry Gantz,0.0,,,,,,,0,51,4.7,285,Documentary|Drama
4548,An Inconvenient Truth,2006.0,USA,23808111.0,Davis Guggenheim,49.0,Billy West,861.0,Al Gore,68.0,,,929,0,7.5,67654,Documentary
4720,The Harvest/La Cosecha,2011.0,USA,2245.0,U. Roberto Romano,6.0,,,,,,,0,88,7.2,57,Documentary
5009,Pink Narcissus,1971.0,USA,8231.0,James Bidgood,0.0,Don Brooks,0.0,Bobby Kendall,0.0,,,0,85,6.7,803,Drama|Fantasy


In [49]:
#drop null name actors/directors
movies_df.drop(movies_df[movies_df.director_name.isnull()].index, inplace = True)
movies_df.drop(movies_df[movies_df.actor_1_name.isnull()].index, inplace = True)
movies_df.drop(movies_df[movies_df.actor_2_name.isnull()].index, inplace = True)
movies_df.drop(movies_df[movies_df.actor_3_name.isnull()].index, inplace = True)

In [50]:
#count of actor_1 with 0 likes
print("Number of actor_1 with 0 FB likes: ", movies_df[(movies_df['actor_1_facebook_likes'] == 0)]['actor_1_facebook_likes'].count())
#value counts of actor_1 with 0 likes
print(movies_df[(movies_df['actor_1_facebook_likes'] == 0)]['actor_1_name'].value_counts())
#print rows where these actors appear
movies_df[(movies_df['actor_1_facebook_likes'] == 0)]
#observe that genre is mostly documentary except for the Philippines made thirller movie

Number of actor_1 with 0 FB likes:  5
Chemeeka Walker    1
Elizabeth Streb    1
Heather Berman     1
Ariel Hsing        1
Naderev Sano       1
Name: actor_1_name, dtype: int64


Unnamed: 0,movie_title,title_year,country,gross,director_name,director_facebook_likes,actor_1_name,actor_1_facebook_likes,actor_2_name,actor_2_facebook_likes,actor_3_name,actor_3_facebook_likes,cast_total_facebook_likes,movie_facebook_likes,imdb_score,num_voted_users,genres
4729,Mad Hot Ballroom,2005.0,USA,8044906.0,Marilyn Agrelo,2.0,Heather Berman,0.0,Eva Carrozza,0.0,Paul Daggett,0.0,0,588,7.5,3156,Documentary|Family|Music
4776,Born to Fly: Elizabeth Streb vs. Gravity,2014.0,USA,21199.0,Catherine Gund,0.0,Elizabeth Streb,0.0,Sarah Callan,0.0,Laura Flanders,0.0,0,44,6.8,40,Action|Biography|Documentary|Sport
4938,Antarctic Edge: 70° South,2015.0,USA,4914.0,Dena Seidel,0.0,Naderev Sano,0.0,Hugh Ducklow,0.0,Mike Brett,0.0,0,215,7.0,123,Adventure|Documentary
4940,Top Spin,2014.0,USA,5858.0,Sara Newens,0.0,Ariel Hsing,0.0,Xinhua Jiang,0.0,Michael Landers,0.0,0,116,7.1,260,Documentary
4977,Super Size Me,2004.0,USA,11529368.0,Morgan Spurlock,293.0,Chemeeka Walker,0.0,Amanda Kearsan,0.0,Amelia Giancarlo,0.0,0,0,7.3,85028,Comedy|Documentary|Drama


In [51]:
#count of actor_2 with 0 likes
print("Number of actor_2 with 0 FB likes: ", movies_df[(movies_df['actor_2_facebook_likes'] == 0)]['actor_2_facebook_likes'].count())
#value counts of actor_2 with 0 likes
print(movies_df[(movies_df['actor_2_facebook_likes'] == 0)]['actor_2_name'].value_counts())
#print rows where these actors appear
movies_df[(movies_df['actor_2_facebook_likes'] == 0)]

Number of actor_2 with 0 FB likes:  13
Amanda Kearsan         1
Xinhua Jiang           1
Kyra Nichols           1
Hugh Ducklow           1
Donald Austin          1
Sarah Callan           1
Joe Bishop             1
Jasper Johal           1
Evelyn Jefferson       1
Richard Linklater      1
Val Kilmer             1
Eva Carrozza           1
Megan Ambuhl Graner    1
Name: actor_2_name, dtype: int64


Unnamed: 0,movie_title,title_year,country,gross,director_name,director_facebook_likes,actor_1_name,actor_1_facebook_likes,actor_2_name,actor_2_facebook_likes,actor_3_name,actor_3_facebook_likes,cast_total_facebook_likes,movie_facebook_likes,imdb_score,num_voted_users,genres
502,Red Planet,2000.0,USA,17473245.0,Antony Hoffman,14.0,Bob Neill,2.0,Val Kilmer,0.0,Tom Sizemore,0.0,2,995,5.7,47612,Action|Sci-Fi|Thriller
2382,The Nutcracker,1993.0,USA,2119994.0,Emile Ardolino,22.0,Darci Kistler,2.0,Kyra Nichols,0.0,Margaret Tracey,0.0,2,238,5.7,1153,Family|Fantasy|Music
3790,Standard Operating Procedure,2008.0,USA,228830.0,Errol Morris,239.0,Jeffrey Frost,6.0,Megan Ambuhl Graner,0.0,Ken Davis,0.0,6,489,7.5,3329,Crime|Documentary|War
4729,Mad Hot Ballroom,2005.0,USA,8044906.0,Marilyn Agrelo,2.0,Heather Berman,0.0,Eva Carrozza,0.0,Paul Daggett,0.0,0,588,7.5,3156,Documentary|Family|Music
4776,Born to Fly: Elizabeth Streb vs. Gravity,2014.0,USA,21199.0,Catherine Gund,0.0,Elizabeth Streb,0.0,Sarah Callan,0.0,Laura Flanders,0.0,0,44,6.8,40,Action|Biography|Documentary|Sport
4816,Murderball,2005.0,USA,1523883.0,Henry Alex Rubin,30.0,Mark Zupan,15.0,Joe Bishop,0.0,Andy Cohn,0.0,15,0,7.8,9037,Documentary|Sport
4890,Burn,2012.0,USA,111300.0,Tom Putnam,15.0,Brendan Doogie Milewski,2.0,Donald Austin,0.0,Craig Dougherty,0.0,2,801,7.5,575,Documentary
4899,Short Cut to Nirvana: Kumbh Mela,2004.0,USA,381225.0,Maurizio Benazzo,0.0,The Dalai Lama,66.0,Jasper Johal,0.0,Swami Krishnanad,0.0,66,30,7.2,131,Documentary
4915,The Trials of Darryl Hunt,2006.0,USA,1111.0,Ricki Stern,15.0,Darryl Hunt,2.0,Evelyn Jefferson,0.0,John Reeves,0.0,2,246,7.7,771,Crime|Documentary
4938,Antarctic Edge: 70° South,2015.0,USA,4914.0,Dena Seidel,0.0,Naderev Sano,0.0,Hugh Ducklow,0.0,Mike Brett,0.0,0,215,7.0,123,Adventure|Documentary


In [52]:
#count of actor_3 with 0 likes
print("Number of actor_3 with 0 FB likes: ", movies_df[(movies_df['actor_3_facebook_likes'] == 0)]['actor_3_facebook_likes'].count())
#value counts of actor_3 with 0 likes
print(movies_df[(movies_df['actor_3_facebook_likes'] == 0)]['actor_3_name'].value_counts())
#print rows where these actors appear
movies_df[(movies_df['actor_3_facebook_likes'] == 0)]

Number of actor_3 with 0 FB likes:  27
Tencho Gyalpo        1
Margaret Tracey      1
Renee Leblanc        1
Sean Connery         1
Jean Caffeine        1
Amelia Giancarlo     1
Andy Cohn            1
Aasheekaa Bathija    1
Swami Krishnanad     1
Bill Baird           1
Craig Dougherty      1
Patrick Wilson       1
Benjamin Fletcher    1
Tobey Maguire        1
Paul Daggett         1
Kim De Angelo        1
Laura Flanders       1
Anna Vareschi        1
Michael Landers      1
Tom Sizemore         1
John Reeves          1
Ken Davis            1
Richard Spore        1
Mike Brett           1
David Thewlis        1
Steve Burg           1
Peter Agnefjall      1
Name: actor_3_name, dtype: int64


Unnamed: 0,movie_title,title_year,country,gross,director_name,director_facebook_likes,actor_1_name,actor_1_facebook_likes,actor_2_name,actor_2_facebook_likes,actor_3_name,actor_3_facebook_likes,cast_total_facebook_likes,movie_facebook_likes,imdb_score,num_voted_users,genres
502,Red Planet,2000.0,USA,17473245.0,Antony Hoffman,14.0,Bob Neill,2.0,Val Kilmer,0.0,Tom Sizemore,0.0,2,995,5.7,47612,Action|Sci-Fi|Thriller
1249,Medicine Man,1992.0,USA,45500797.0,John McTiernan,323.0,Lorraine Bracco,472.0,José Wilker,47.0,Sean Connery,0.0,519,694,6.0,17443,Adventure|Drama|Romance
1506,Ride with the Devil,1999.0,USA,630779.0,Ang Lee,0.0,Jeremy W. Auman,3.0,Jeffrey Dover,2.0,Tobey Maguire,0.0,5,633,6.8,11101,Drama|Romance|War|Western
1790,Kundun,1997.0,USA,5532301.0,Martin Scorsese,17000.0,Tenzin Thuthob Tsarong,2.0,Tulku Jamyang Kunga Tenzin,2.0,Tencho Gyalpo,0.0,4,0,7.0,21606,Biography|Drama|History|War
2382,The Nutcracker,1993.0,USA,2119994.0,Emile Ardolino,22.0,Darci Kistler,2.0,Kyra Nichols,0.0,Margaret Tracey,0.0,2,238,5.7,1153,Family|Fantasy|Music
3219,The Brown Bunny,2003.0,USA,365734.0,Vincent Gallo,787.0,Vincent Gallo,787.0,Cheryl Tiegs,96.0,Anna Vareschi,0.0,883,952,5.0,11487,Drama
3398,Anomalisa,2015.0,USA,3442820.0,Duke Johnson,26.0,Jennifer Jason Leigh,1000.0,Tom Noonan,442.0,David Thewlis,0.0,1442,0,7.3,31489,Animation|Comedy|Drama|Romance
3643,Lake of Fire,2006.0,USA,23807.0,Tony Kaye,194.0,Noam Chomsky,103.0,Pat Buchanan,2.0,Bill Baird,0.0,105,570,8.3,2143,Documentary
3790,Standard Operating Procedure,2008.0,USA,228830.0,Errol Morris,239.0,Jeffrey Frost,6.0,Megan Ambuhl Graner,0.0,Ken Davis,0.0,6,489,7.5,3329,Crime|Documentary|War
3923,The Real Cancun,2003.0,USA,3713002.0,Rick de Oliveira,2.0,Laura Ramsey,960.0,Alan Taylor,12.0,Benjamin Fletcher,0.0,972,131,2.7,3611,Documentary


In [53]:
#TODO
#check if the actor_1, actor_2, and actor_3 with 0 likes across movies are unique

In [54]:
#count of directos with 0 likes
print("Number of directors with 0 FB likes: ", movies_df[(movies_df['director_facebook_likes'] == 0)]['director_name'].count())
#count of unique directors with 0 likes
print("Number of unique directors with 0 FB likes: ", len(movies_df[(movies_df['director_facebook_likes'] == 0)]['director_name'].unique()))
#value counts of directors with 0 likes
print(movies_df[(movies_df['director_facebook_likes'] == 0)]['director_name'].value_counts())
#I looked into some (top 5 value counts) of these directors. They do have facebook pages, some with 10's to 100's of thousands
#of likes and some had official pages but others didn't, and some had multiple FB pages.

Number of directors with 0 FB likes:  547
Number of unique directors with 0 FB likes:  201
Steven Soderbergh             15
Spike Lee                     15
Ridley Scott                  14
Robert Zemeckis               13
Michael Bay                   13
Wes Craven                    12
John Carpenter                12
Robert Rodriguez              11
Rob Reiner                    11
Sam Raimi                     10
Richard Linklater             10
Oliver Stone                  10
Kevin Smith                   10
M. Night Shyamalan             9
Francis Ford Coppola           9
Tyler Perry                    9
Zack Snyder                    8
Frank Oz                       8
Garry Marshall                 8
Brian De Palma                 8
Ang Lee                        7
Chris Columbus                 7
Wes Anderson                   7
Bryan Singer                   7
James Wan                      7
Peter Jackson                  6
Michael Mann                   6
Paul Thomas Anders

In [55]:
#ran this with top 5 actors in value_counts and it looks like there isn't a movie they are in where the FB likes is more than 0
movies_df[movies_df['director_name'] == 'Ridley Scott'].count()

movie_title                  14
title_year                   14
country                      14
gross                        14
director_name                14
director_facebook_likes      14
actor_1_name                 14
actor_1_facebook_likes       14
actor_2_name                 14
actor_2_facebook_likes       14
actor_3_name                 14
actor_3_facebook_likes       14
cast_total_facebook_likes    14
movie_facebook_likes         14
imdb_score                   14
num_voted_users              14
genres                       14
dtype: int64

In [56]:
#count of all movies where either the actors or the director have 0 likes
movies_df[(movies_df['actor_1_facebook_likes'] == 0) | (movies_df['actor_2_facebook_likes'] == 0) | (movies_df['actor_3_facebook_likes'] == 0) | (movies_df['director_facebook_likes'] == 0)]['movie_title'].count()

567

In [57]:
#number of unique directors appearing where either the director or actor has 0 likes.
movies_df[(movies_df['actor_1_facebook_likes'] == 0) | (movies_df['actor_2_facebook_likes'] == 0) | (movies_df['actor_3_facebook_likes'] == 0) | (movies_df['director_facebook_likes'] == 0)]['director_name'].value_counts().count()

221

In [58]:
#number of unique actor_1 appearing where either the director or actor has 0 likes
movies_df[(movies_df['actor_1_facebook_likes'] == 0) | (movies_df['actor_2_facebook_likes'] == 0) | (movies_df['actor_3_facebook_likes'] == 0) | (movies_df['director_facebook_likes'] == 0)]['actor_1_name'].value_counts().count()

354

In [59]:
#number of unique actor_2 appearing where either the director or actor has 0 likes
movies_df[(movies_df['actor_1_facebook_likes'] == 0) | (movies_df['actor_2_facebook_likes'] == 0) | (movies_df['actor_3_facebook_likes'] == 0) | (movies_df['director_facebook_likes'] == 0)]['actor_2_name'].value_counts().count()

463

In [60]:
#number of unique actor_3 appearing where either the director or actor has 0 likes
movies_df[(movies_df['actor_1_facebook_likes'] == 0) | (movies_df['actor_2_facebook_likes'] == 0) | (movies_df['actor_3_facebook_likes'] == 0) | (movies_df['director_facebook_likes'] == 0)]['actor_3_name'].value_counts().count()

505

In [61]:
movies_df.shape

(3207, 17)

In [62]:
#drop movies where actors and directors with 0 likes
movies_df = movies_df.drop(movies_df[movies_df.actor_1_facebook_likes == 0].index)
movies_df = movies_df.drop(movies_df[movies_df.actor_2_facebook_likes == 0].index)
movies_df = movies_df.drop(movies_df[movies_df.actor_3_facebook_likes == 0].index)
movies_df = movies_df.drop(movies_df[movies_df.director_facebook_likes == 0].index)

In [63]:
#drop movies not in the US
movies_df = movies_df.drop( movies_df[movies_df.country != 'USA'].index )
#view number of rows left
movies_df.shape

(2640, 17)

In [64]:
#checking if genres type and title type are string
print(isinstance((movies_df.loc[1, 'genres']), str))
print(isinstance((movies_df.loc[1, 'movie_title']), str))

True
True


In [65]:
#check if any of the genres entries is null, in this case none are so we don't have to worry about that in the next step
movies_df.genres.isnull().sum()

0

In [66]:
#genres:
#consider each genre seperately: extract genres delimited by "|" (pipeline)

#this variable will store all genres in the movie dataset
genre_dict = {}

#sample if need to test. Note: not 50 entries because we dropped certain rows & some of the indeces from 0-49 were dropped
#test = pd.DataFrame(movies.loc[0:49])


#change this back to movies.genres
for genres in movies_df.genres:
    #remove spaces from string so that something like "Action " vs " Action" are not treated differently
    genres_no_space = genres.strip()
    #need to extract genre by delimiter |
    genres_list = genres_no_space.split("|")
    
    #this block of code adds the genre as a key in dictionary and updates the count
    for genre in genres_list:
        
        #the key exists and we want to increment the count for movies with this genre
        if genre in genre_dict:
            genre_dict[genre] = genre_dict.get(genre) + 1   
        else: #add this genre if it doesn't already exist in the dict
            key_value = {genre : 1}
            genre_dict.update(key_value)

In [67]:
#REMEMBER: only loop once per row checking every dict entry

#list where we will store the genres. This is just convenient to have
genre_list = []
#add dictionary keys (genres) into our genre_list
for key in genre_dict:
    #don't add any genres whose corresponding movies count is less than 20, due to statistical insignificance
    if genre_dict[key] >= 20:
        genre_list.append(key)

#list where we will store all the movie titles. This is made to create index column of dataframe
movie_title_list = []
for title in movies_df.movie_title:
    movie_title_list.append(title)

#create the df whose columns titles are genres, and index is movie titles
movie_genre_df = pd.DataFrame(index = movie_title_list, columns = genre_list)
movie_genre_df.index.name = 'MovieTitle'

#fill the columns with 1 or 0, where 1 serves to tell us the movie belongs to the genre in the column title and 0 otherwise
for movie in movies_df.index:
    #current title of movie
    title = movies_df.movie_title.loc[movie]
    #current genre string of movie
    genres_str = movies_df.genres.loc[movie]
    
    #We have a genre list from previous code. We can loop over list and check if
    # current movie has certain genres. Note: will need movie title and genre
    for genre in genre_list:      
        #returns positive index of where substring was found in string, -1 if not found
        i = genres_str.find(genre)
        #if the genre is contained in the string then make the column corresponding to
        #this genre and this title in the movies_genre_df to 1 otherwise make it 0  
        if i >= 0:
            movie_genre_df.at[title, genre] = 1
        else:
            movie_genre_df.at[title, genre] = 0

In [69]:
#check movie_df
movies_df

Unnamed: 0,movie_title,title_year,country,gross,director_name,director_facebook_likes,actor_1_name,actor_1_facebook_likes,actor_2_name,actor_2_facebook_likes,actor_3_name,actor_3_facebook_likes,cast_total_facebook_likes,movie_facebook_likes,imdb_score,num_voted_users,genres
1,Pirates of the Caribbean: At World's End,2007.0,USA,309404152.0,Gore Verbinski,563.0,Johnny Depp,40000.0,Orlando Bloom,5000.0,Jack Davenport,1000.0,48350,0,7.1,471220,Action|Adventure|Fantasy
3,The Dark Knight Rises,2012.0,USA,448130642.0,Christopher Nolan,22000.0,Tom Hardy,27000.0,Christian Bale,23000.0,Joseph Gordon-Levitt,23000.0,106759,164000,8.5,1144337,Action|Thriller
5,John Carter,2012.0,USA,73058679.0,Andrew Stanton,475.0,Daryl Sabara,640.0,Samantha Morton,632.0,Polly Walker,530.0,1873,24000,6.6,212204,Action|Adventure|Sci-Fi
7,Tangled,2010.0,USA,200807262.0,Nathan Greno,15.0,Brad Garrett,799.0,Donna Murphy,553.0,M.C. Gainey,284.0,2036,29000,7.8,294810,Adventure|Animation|Comedy|Family|Fantasy|Musi...
13,Pirates of the Caribbean: Dead Man's Chest,2006.0,USA,423032628.0,Gore Verbinski,563.0,Johnny Depp,40000.0,Orlando Bloom,5000.0,Jack Davenport,1000.0,48486,5000,7.3,522040,Action|Adventure|Fantasy
14,The Lone Ranger,2013.0,USA,89289910.0,Gore Verbinski,563.0,Johnny Depp,40000.0,Ruth Wilson,2000.0,Tom Wilkinson,1000.0,45757,48000,6.5,181792,Action|Adventure|Western
16,The Chronicles of Narnia: Prince Caspian,2008.0,USA,141614023.0,Andrew Adamson,80.0,Peter Dinklage,22000.0,Pierfrancesco Favino,216.0,Damián Alcázar,201.0,22697,0,6.6,149922,Action|Adventure|Family|Fantasy
18,Pirates of the Caribbean: On Stranger Tides,2011.0,USA,241063875.0,Rob Marshall,252.0,Johnny Depp,40000.0,Sam Claflin,11000.0,Stephen Graham,1000.0,54083,58000,6.7,370704,Action|Adventure|Fantasy
19,Men in Black 3,2012.0,USA,179020854.0,Barry Sonnenfeld,188.0,Will Smith,10000.0,Michael Stuhlbarg,816.0,Nicole Scherzinger,718.0,12572,40000,6.8,268154,Action|Adventure|Comedy|Family|Fantasy|Sci-Fi
21,The Amazing Spider-Man,2012.0,USA,262030663.0,Marc Webb,464.0,Emma Stone,15000.0,Andrew Garfield,10000.0,Chris Zylka,963.0,28489,56000,7.0,451803,Action|Adventure|Fantasy


In [70]:
#check movie_genre_df
movie_genre_df

Unnamed: 0_level_0,Action,Adventure,Fantasy,Thriller,Sci-Fi,Animation,Comedy,Family,Musical,Romance,Western,Sport,Horror,Crime,Drama,Mystery,History,War,Biography,Music,Documentary
MovieTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Pirates of the Caribbean: At World's End,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
The Dark Knight Rises,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
John Carter,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Tangled,0,1,1,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0
Pirates of the Caribbean: Dead Man's Chest,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
The Lone Ranger,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
The Chronicles of Narnia: Prince Caspian,1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
Pirates of the Caribbean: On Stranger Tides,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Men in Black 3,1,1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
The Amazing Spider-Man,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
#note that this shows we have 446 null values in original movie dataset
print("Number of all null rows: ", movies_df.isnull().any(axis = 1).sum())

#this line shows the number of null values by column
movies_df.isnull().sum()

#TODO: next step is to deal with nulls

In [None]:
#drop all null values (466 values)
movies_df.dropna(inplace = True)
#check the shape, we know we had 3086 when we last checked the shape so: should be 3086 - 466 = 2640
movies_df.shape

In [None]:
#Scatterplot for actor1_facebook_likes and movie gross
scatter1 = movies_df.plot.scatter(x = "actor_1_facebook_likes", y = "gross")
scatter1Mod = movies_df.plot.scatter(x = "actor_1_facebook_likes", y = "gross")
print(scatter1)
#plt.plot([0, 100000], scatter1.y, color = 'red')
#Actual limit 600000. Few outliers beyond 100000, removing for better view of scatterplot
scatter1Mod.set_xlim([0, 100000])
print(scatter1Mod)

scatter1.plot

In [None]:
#Scatterplot for actor2_facebook_likes and movie gross
scatter2 = movies_df.plot.scatter(x = "actor_2_facebook_likes", y = "gross")
scatter2Mod = movies_df.plot.scatter(x = "actor_2_facebook_likes", y = "gross")
print(scatter2)

#Actual limit 140000. Few outliers beyond 40000, removing for better view of scatterplot
scatter2Mod.set_xlim([0, 40000])
print(scatter2Mod)

In [None]:
#Scatterplot for actor3_facebook_likes and movie gross
scatter3 = movies_df.plot.scatter(x = "actor_3_facebook_likes", y = "gross")
scatter3Mod1 = movies_df.plot.scatter(x = "actor_3_facebook_likes", y = "gross")
scatter3Mod2 = movies_df.plot.scatter(x = "actor_3_facebook_likes", y = "gross")
scatter3Mod3 = movies_df.plot.scatter(x = "actor_3_facebook_likes", y = "gross")


print(scatter3)

#No notable outliers. Reducing Xlim to display more accurate scatter plot views
scatter3Mod1.set_xlim([0, 3000])
scatter3Mod2.set_xlim([0, 12000])
scatter3Mod3.set_xlim([0, 1200])


print(scatter3Mod1)
print(scatter3Mod2)

In [None]:
#Scatterplot for director_facebook_likes and movie gross
directScatter = movies_df.plot.scatter(x = "director_facebook_likes", y = "gross")

print(directScatter)