In [None]:
import pandas as pd
import matplotlib.pyplot as plt
#cap how much of a DataFrame pandas renders: at most 60 rows and 30 columns
pd.set_option("display.max_rows", 60)
pd.set_option("display.max_columns", 30)

In [None]:
#load the movie metadata CSV into a DataFrame (relative path, resolved against the current working directory)
movies_df = pd.read_csv( "movie_metadata.csv" )

In [None]:
#check column labels: show every column name of the freshly loaded frame, before any columns are dropped
movies_df.columns

In [None]:
#keep only English-language movies: gather the index labels of every row whose
#language is anything other than "English" (rows with a missing language are
#included, because NaN != "English" evaluates True) and drop those rows
non_english_labels = movies_df.index[movies_df["language"] != "English"]
movies_df = movies_df.drop(index = non_english_labels)

In [None]:
#columns this analysis is not expected to need
unused_columns = [
    'color', 'duration', 'facenumber_in_poster', 'plot_keywords',
    'movie_imdb_link', 'language', 'content_rating', 'aspect_ratio',
    'num_critic_for_reviews', 'num_user_for_reviews', 'budget',
]
#remove them and rebind the name to the narrower frame
movies_df = movies_df.drop( columns = unused_columns )

In [None]:
#desired left-to-right column order: title and basic facts first, then the people
#involved with their like counts, then the popularity/rating figures, genres last
column_order = [
    'movie_title', 'title_year', 'country', 'gross',
    'director_name', 'director_facebook_likes',
    'actor_1_name', 'actor_1_facebook_likes',
    'actor_2_name', 'actor_2_facebook_likes',
    'actor_3_name', 'actor_3_facebook_likes',
    'cast_total_facebook_likes', 'movie_facebook_likes',
    'imdb_score', 'num_voted_users', 'genres',
]
#selecting with the list returns the columns in exactly this order
movies_df = movies_df[ column_order ]

In [None]:
#drop every movie for which any of the three listed actors or the director has
#exactly 0 facebook likes (missing like counts do not equal 0, so those rows stay)
likes_columns = [
    'actor_1_facebook_likes', 'actor_2_facebook_likes',
    'actor_3_facebook_likes', 'director_facebook_likes',
]
has_zero_likes = (movies_df[likes_columns] == 0).any(axis = 1)
movies_df = movies_df.drop(index = movies_df.index[has_zero_likes])

In [None]:
#keep only movies whose country is 'USA'; rows with any other value, or with a
#missing country, are dropped
non_us_labels = movies_df.index[movies_df["country"] != 'USA']
movies_df = movies_df.drop(index = non_us_labels)
#view number of rows (and columns) left
movies_df.shape

In [None]:
#check that every genres entry and every movie_title entry is a Python string.
#The whole column is inspected rather than the row labelled 1: rows have been
#dropped by label in the cells above, so a fixed label may no longer exist
#(KeyError), and a single row says nothing about the rest of the column.
#Each line prints True only when all entries in that column are str.
print(movies_df['genres'].map(lambda value: isinstance(value, str)).all())
print(movies_df['movie_title'].map(lambda value: isinstance(value, str)).all())

In [None]:
#count the missing genres entries. The genre-splitting step that follows calls
#string methods on every entry, so this needs to be 0 (the author's run found none)
movies_df["genres"].isna().sum()

In [None]:
#genres:
#each movie's genres are stored as one string delimited by "|" (pipe);
#consider each genre separately and count how many movies carry it

#maps genre name -> number of movies tagged with that genre
genre_dict = {}

for genres in movies_df.genres:
    #split the movie's genres string into its individual genre names
    for genre in genres.split("|"):
        #strip each piece on its own so that "Action " and " Action" land on the
        #same key; stripping only the whole string would leave spaces that sit
        #next to a "|" untouched
        genre = genre.strip()
        #skip empty pieces (empty string or a stray delimiter)
        if not genre:
            continue
        #first sighting starts at 0, then the count is incremented
        genre_dict[genre] = genre_dict.get(genre, 0) + 1

In [None]:
#list of the genres we keep. Genres whose movie count is below 20 are left out,
#due to statistical insignificance
genre_list = [genre for genre, count in genre_dict.items() if count >= 20]

#all movie titles in row order; used as the index of the indicator table below
movie_title_list = movies_df.movie_title.tolist()

#build the table whose columns are the kept genres and whose rows are the movies:
#1 means the movie belongs to the genre in the column title, 0 otherwise.
# - get_dummies splits every genres string on "|" and compares whole genre names,
#   so a genre whose name is contained in another one (e.g. "Music" inside
#   "Musical") is not flagged by mistake, which a substring search would do
# - rows are filled by position, one per movie, so two movies that share a title
#   each keep their own values instead of being addressed through the same label
# - reindex keeps only the genres in genre_list, in that order, and gives an
#   all-0 column to any listed genre that no movie string contains
#The cells hold integers rather than generic Python objects.
movie_genre_df = (
    movies_df.genres
    .str.get_dummies(sep = "|")
    .reindex(columns = genre_list, fill_value = 0)
)
movie_genre_df.index = pd.Index(movie_title_list, name = 'MovieTitle')

In [None]:
#check movies_df after the row and column filtering above; as the bare last
#expression of the cell, the frame is rendered as the cell's output
movies_df

In [None]:
#check movie_genre_df (the per-movie 1/0 genre table built above); as the bare
#last expression of the cell, the frame is rendered as the cell's output
movie_genre_df

In [None]:
#True wherever a cell of movies_df is missing
null_mask = movies_df.isnull()

#number of rows that contain at least one missing value
#(the author's run reported 446 such rows)
print("Number of all null rows: ", null_mask.any(axis = 1).sum())

#number of missing values in each column, shown as the cell's output
null_mask.sum()

#TODO: next step is to deal with nulls

In [None]:
#drop every row that has a missing value in any column. The result is assigned
#back instead of using inplace=True, so the step reads as an explicit rebinding.
#NOTE: movie_genre_df was built before this step, so it still contains a row for
#each movie removed here
movies_df = movies_df.dropna()
#check the shape: the row count should be the previous count minus the number of
#rows with nulls (per the author's notes: 3086 - 446 = 2640)
movies_df.shape

In [None]:
#Scatterplots for actor_1_facebook_likes and movie gross

#full range of the data
scatter1 = movies_df.plot.scatter(x = "actor_1_facebook_likes", y = "gross")
scatter1.set_title("Gross vs. actor 1 Facebook likes")

#same data with the x axis clipped for a better view. Author's note: actual limit
#600000, few outliers beyond 100000. Only the view changes; no rows are removed
scatter1Mod = movies_df.plot.scatter(x = "actor_1_facebook_likes", y = "gross")
scatter1Mod.set_xlim([0, 100000])
#the trailing ";" keeps the notebook from echoing the returned Text object
scatter1Mod.set_title("Gross vs. actor 1 Facebook likes (x axis limited to 100000)");

In [None]:
#Scatterplots for actor_2_facebook_likes and movie gross

#full range of the data
scatter2 = movies_df.plot.scatter(x = "actor_2_facebook_likes", y = "gross")
scatter2.set_title("Gross vs. actor 2 Facebook likes")

#same data with the x axis clipped for a better view. Author's note: actual limit
#140000, few outliers beyond 40000. Only the view changes; no rows are removed
scatter2Mod = movies_df.plot.scatter(x = "actor_2_facebook_likes", y = "gross")
scatter2Mod.set_xlim([0, 40000])
#the trailing ";" keeps the notebook from echoing the returned Text object
scatter2Mod.set_title("Gross vs. actor 2 Facebook likes (x axis limited to 40000)");

In [None]:
#Scatterplots for actor_3_facebook_likes and movie gross

#full range of the data
scatter3 = movies_df.plot.scatter(x = "actor_3_facebook_likes", y = "gross")
scatter3.set_title("Gross vs. actor 3 Facebook likes")

#Author's note: no notable outliers. The same data is drawn three more times with
#a reduced x limit each, to look at the dense region at different zoom levels
scatter3Mod1 = movies_df.plot.scatter(x = "actor_3_facebook_likes", y = "gross")
scatter3Mod1.set_xlim([0, 3000])
scatter3Mod1.set_title("Gross vs. actor 3 Facebook likes (x axis limited to 3000)")

scatter3Mod2 = movies_df.plot.scatter(x = "actor_3_facebook_likes", y = "gross")
scatter3Mod2.set_xlim([0, 12000])
scatter3Mod2.set_title("Gross vs. actor 3 Facebook likes (x axis limited to 12000)")

scatter3Mod3 = movies_df.plot.scatter(x = "actor_3_facebook_likes", y = "gross")
scatter3Mod3.set_xlim([0, 1200])
#the trailing ";" keeps the notebook from echoing the returned Text object
scatter3Mod3.set_title("Gross vs. actor 3 Facebook likes (x axis limited to 1200)");

In [None]:
#Scatterplot for director_facebook_likes and movie gross
directScatter = movies_df.plot.scatter(x = "director_facebook_likes", y = "gross")
#the trailing ";" keeps the notebook from echoing the returned Text object
directScatter.set_title("Gross vs. director Facebook likes");