In [134]:
import pandas as pd
import numpy as np
from imdb import Cinemagoer
from tqdm.auto import tqdm
import re

# Read movie dataset

In [135]:
df = pd.read_csv("Movies to watch - Ratings.csv")
df = df.iloc[1:,[0,1,2,3,4,5,6]]
df = df.rename(columns={'Unnamed: 0': "Film", "Unnamed: 11": "Average rating"})
df = df.replace("DNF", np.nan)
df

Unnamed: 0,Film,Seb,Jos,Coen,Stijn,Merle,Twan
1,Inception,8.0,8.25,,8.3,8.000,
2,Good Will Hunting,9.0,9.00,,9.1,,
3,Cabin in the Woods,7.0,7.00,6.5,4.6,8.300,
4,Independence Day,7.0,7.20,7,3.7,6.900,
5,Spotlight,10.0,8.80,8.5,7.7,,
...,...,...,...,...,...,...,...
80,American Psycho,8.0,7.00,4.5,9.32,,7.496
81,Shaun of the Dead,7.5,8.00,6.5,7.87,7.545,
82,Silence,9.0,7.60,6.4,6.09,,
83,Arrival,7.5,9.00,8.2,7.34,,


# Download information from IMDB

In [136]:
# select interesting information from the following list:
# ['akas', 'animation department', 'art department', 'art direction', 'aspect ratio', 'assistant director', 'box office', 'camera and electrical department', 'canonical title', 'cast', 
#  'casting department', 'casting director', 'certificates', 'cinematographer', 'color info', 'composer', 'costume department', 'costume designer', 'countries', 'country codes', 
#  'cover url', 'director', 'distributors', 'editor', 'editorial department', 'full-size cover url', 'genres', 'imdbID', 'kind', 'language codes', 'languages', 'localized title', 
#  'location management', 'long imdb canonical title', 'long imdb title', 'make up', 'miscellaneous crew', 'music department', 'original air date', 'original title', 'other companies', 
#  'plot', 'plot outline', 'producer', 'production companies', 'production design', 'production manager', 'rating', 'runtimes', 'script department', 'set decoration', 'smart canonical title', 
#  'smart long imdb canonical title', 'sound crew', 'sound mix', 'special effects', 'special effects companies', 'stunt performer', 'synopsis', 'title', 'transportation department', 'visual effects', 
#  'votes', 'writer', 'year']

interesting_stats = ['aspect ratio', 'box office', 'director', 'genres', 'year', 'rating', 'votes', 'plot']

In [137]:

def retrieve_movie_stats(movie, interesting_stats, ia):    
    res = {}
    # searching the name 
    search = ia.search_movie(movie)
    
    # getting the id
    id = search[0].movieID
    
    # get a movie's info
    movie_info = ia.get_movie(id)
    
    for key in interesting_stats:
        try:
            if key in movie_info:
                if key == "box office":
                    res["Budget"] = movie_info[key]["Budget"]
                    if "Cumulative Worldwide Gross" in movie_info[key]: res["Cumulative Worldwide Gross"] = movie_info[key]["Cumulative Worldwide Gross"]
                elif key == "synopsis":
                    res[key] = movie_info[key][0]
                else:                
                #     if isinstance(movie_info[key], list):
                #         res[key] = ",".join(movie_info[key])
                #     else:                    
                    res[key] = str(movie_info[key])
            else:
                res[key] = np.nan
        except:
            res[key] = np.nan
        
    return res

In [138]:
# create an instance of the Cinemagoer class
cnm = Cinemagoer()

# list interesting columns
interesting_stats = ['aspect ratio', 'box office', 'director', 'genres', 'year', 'rating', 'votes', 'synopsis']
movie_stats = {}
for movie in df.Film.values:
    movie_stats[movie] = retrieve_movie_stats(movie, interesting_stats, cnm)
    
movie_stats = pd.DataFrame.from_dict(movie_stats, orient="index")
movie_stats.to_csv("movie_stats.csv")
movie_stats

Unnamed: 0,aspect ratio,Budget,Cumulative Worldwide Gross,director,genres,year,rating,votes,synopsis,box office
Inception,2.39 : 1,"$160,000,000 (estimated)","$825,532,764, 06 Jan 2011",[<Person id:0634240[http] name:_Christopher No...,"['Action', 'Adventure', 'Sci-Fi', 'Thriller']",2010,8.8,2315131,"A young man, exhausted and delirious, washes u...",
Good Will Hunting,1.85 : 1,"$10,000,000 (estimated)","$225,933,435",[<Person id:0001814[http] name:_Gus Van Sant_>],"['Drama', 'Romance']",1997,8.3,954678,Though Will Hunting (Matt Damon) has genius-le...,
Cabin in the Woods,,"$3,000 (estimated)",,[<Person id:5981437[http] name:_Kyla Jarrett_>],"['Short', 'Drama']",,,,,
Independence Day,2.20 : 1 (70 mm print),"$75,000,000 (estimated)",,[<Person id:0000386[http] name:_Roland Emmeric...,"['Action', 'Adventure', 'Sci-Fi']",1996,7.0,572816,"On July 2, a giant alien mother ship, over 340...",
Spotlight,1.85 : 1,"$20,000,000 (estimated)","$98,275,238",[<Person id:0565336[http] name:_Tom McCarthy_>],"['Biography', 'Crime', 'Drama']",2015,8.1,466101,The opening shot shows the text: BASED ON ACTU...,
...,...,...,...,...,...,...,...,...,...,...
American Psycho,1.78 : 1 (open matte Blu-ray),"$7,000,000 (estimated)","$34,266,564",[<Person id:0366004[http] name:_Mary Harron_>],"['Crime', 'Drama', 'Horror']",2000,7.6,596510,A white background. Red drops begin to fall pa...,
Shaun of the Dead,1.78 : 1 (negative ratio),"GBP4,000,000 (estimated)","$30,039,392",[<Person id:0942367[http] name:_Edgar Wright_>],"['Comedy', 'Horror']",2004,7.9,554306,"The film begins in The Winchester, a tradition...",
Silence,2.39 : 1,"$46,000,000 (estimated)","$23,737,523",[<Person id:0000217[http] name:_Martin Scorses...,"['Drama', 'History']",2016,7.1,112784,"Over darkness, environmental sounds (crickets ...",
Arrival,2.39 : 1,"$47,000,000 (estimated)","$203,388,186",[<Person id:0898288[http] name:_Denis Villeneu...,"['Drama', 'Mystery', 'Sci-Fi']",2016,7.9,683982,The film starts with the voice of Dr. Louise B...,


# Process data

## Budget

In [139]:
movie_stats = pd.read_csv("movie_stats.csv")

In [140]:
def process_budget(x):
    # print(x["Budget"])
    x = re.findall("\d{1,3},{0,1}\d{1,3},{0,1}\d{1,3}", str(x))
    if x == []: return float(np.nan)
    else: return float(x[0].replace(",", ""))
    
movie_stats["Budget"] = movie_stats["Budget"].apply(process_budget)

## Gross

In [141]:
movie_stats["Cumulative Worldwide Gross"] = movie_stats["Cumulative Worldwide Gross"].apply(process_budget)

## genres

In [142]:
# first get a list of all unique categories and also keep the list of categories per movie
all_movie_cats = []
unique_categories = []
for s in movie_stats["genres"].values:
    s = s.replace(" ", "")
    s = s.replace("'", "")
    s = s.replace("[", "")
    s = s.replace("]", "")
    movie_cats = s.split(",")
    all_movie_cats.append(movie_cats)
    for i in movie_cats:
        if i not in unique_categories:
            unique_categories.append(i)
unique_categories

['Action',
 'Adventure',
 'Sci-Fi',
 'Thriller',
 'Drama',
 'Romance',
 'Short',
 'Biography',
 'Crime',
 'History',
 'War',
 'Western',
 'Talk-Show',
 'Animation',
 'Comedy',
 'Fantasy',
 'Music',
 'Family',
 'Sport',
 'Mystery',
 'Horror',
 'Documentary',
 'Musical']

In [143]:
len(movie_stats.iloc[:,0])

84

In [144]:
genre_matrix = {}
# now create a column for every genre with a True of False depending on wheter a movie has a certain genre
for i in range(len(all_movie_cats)):
    movie = movie_stats.iloc[:,0].values[i]
    d = {}
    for genre in unique_categories:
        if genre in all_movie_cats[i]:
            d[genre] = True 
        else:
            d[genre] = False 
    genre_matrix[movie] = d
    #movie_stats.iloc[:,0]
genre_matrix = pd.DataFrame.from_dict(genre_matrix, orient="index").reset_index()
genre_matrix

Unnamed: 0,index,Action,Adventure,Sci-Fi,Thriller,Drama,Romance,Short,Biography,Crime,...,Animation,Comedy,Fantasy,Music,Family,Sport,Mystery,Horror,Documentary,Musical
0,Inception,True,True,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Good Will Hunting,False,False,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Cabin in the Woods,False,False,False,False,True,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Independence Day,True,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Spotlight,False,False,False,False,True,False,False,True,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,American Psycho,False,False,False,False,True,False,False,False,True,...,False,False,False,False,False,False,False,True,False,False
80,Shaun of the Dead,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,True,False,False
81,Silence,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
82,Arrival,False,False,True,False,True,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [145]:
pd.merge(df, movie_stats[columns_to_keep], left_on="Film", right_on="Unnamed: 0").drop("Unnamed: 0", axis=1)

Unnamed: 0,Film,Seb,Jos,Coen,Stijn,Merle,Twan,Budget,Cumulative Worldwide Gross,year,rating,votes,synopsis
0,Inception,8.0,8.25,,8.3,8.000,,160000000.0,825532764.0,2010.0,8.8,2315131.0,"A young man, exhausted and delirious, washes u..."
1,Good Will Hunting,9.0,9.00,,9.1,,,10000000.0,225933435.0,1997.0,8.3,954678.0,Though Will Hunting (Matt Damon) has genius-le...
2,Cabin in the Woods,7.0,7.00,6.5,4.6,8.300,,3000.0,,,,,
3,Independence Day,7.0,7.20,7,3.7,6.900,,75000000.0,,1996.0,7.0,572816.0,"On July 2, a giant alien mother ship, over 340..."
4,Spotlight,10.0,8.80,8.5,7.7,,,20000000.0,98275238.0,2015.0,8.1,466101.0,The opening shot shows the text: BASED ON ACTU...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,American Psycho,8.0,7.00,4.5,9.32,,7.496,7000000.0,34266564.0,2000.0,7.6,596510.0,A white background. Red drops begin to fall pa...
80,Shaun of the Dead,7.5,8.00,6.5,7.87,7.545,,4000000.0,30039392.0,2004.0,7.9,554306.0,"The film begins in The Winchester, a tradition..."
81,Silence,9.0,7.60,6.4,6.09,,,46000000.0,23737523.0,2016.0,7.1,112784.0,"Over darkness, environmental sounds (crickets ..."
82,Arrival,7.5,9.00,8.2,7.34,,,47000000.0,203388186.0,2016.0,7.9,683982.0,The film starts with the voice of Dr. Louise B...


In [146]:
columns_to_keep = ["Unnamed: 0", "Budget", "Cumulative Worldwide Gross", "year", "rating", "votes", "synopsis" ]
clean_df = pd.merge(df, movie_stats[columns_to_keep], left_on="Film", right_on="Unnamed: 0").drop("Unnamed: 0", axis=1)
clean_df = pd.merge(clean_df, genre_matrix, left_on="Film", right_on="index").drop("index", axis=1)
clean_df.to_csv("clean_df.csv")
clean_df

Unnamed: 0,Film,Seb,Jos,Coen,Stijn,Merle,Twan,Budget,Cumulative Worldwide Gross,year,...,Animation,Comedy,Fantasy,Music,Family,Sport,Mystery,Horror,Documentary,Musical
0,Inception,8.0,8.25,,8.3,8.000,,160000000.0,825532764.0,2010.0,...,False,False,False,False,False,False,False,False,False,False
1,Good Will Hunting,9.0,9.00,,9.1,,,10000000.0,225933435.0,1997.0,...,False,False,False,False,False,False,False,False,False,False
2,Cabin in the Woods,7.0,7.00,6.5,4.6,8.300,,3000.0,,,...,False,False,False,False,False,False,False,False,False,False
3,Independence Day,7.0,7.20,7,3.7,6.900,,75000000.0,,1996.0,...,False,False,False,False,False,False,False,False,False,False
4,Spotlight,10.0,8.80,8.5,7.7,,,20000000.0,98275238.0,2015.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,American Psycho,8.0,7.00,4.5,9.32,,7.496,7000000.0,34266564.0,2000.0,...,False,False,False,False,False,False,False,True,False,False
80,Shaun of the Dead,7.5,8.00,6.5,7.87,7.545,,4000000.0,30039392.0,2004.0,...,False,True,False,False,False,False,False,True,False,False
81,Silence,9.0,7.60,6.4,6.09,,,46000000.0,23737523.0,2016.0,...,False,False,False,False,False,False,False,False,False,False
82,Arrival,7.5,9.00,8.2,7.34,,,47000000.0,203388186.0,2016.0,...,False,False,False,False,False,False,True,False,False,False
