In [1]:
# Importing libraries and establishing connection to SQL database

import numpy as np
import pandas as pd
import sqlite3 as sql
import difflib
# Folder that holds every raw data file used by this notebook.
data_path = '../zippedData'
# Open the IMDb SQLite database stored inside that folder.
conn = sql.connect('/'.join((data_path, 'im.db')))

In [2]:
# Declaration of base dataframes

# Studios and gross revenue - 3387 rows
# https://www.boxofficemojo.com/
gross_df = pd.read_csv(data_path + '/bom.movie_gross.csv')
# The Movie Database - https://www.themoviedb.org/?language=en-US
# Reviews - 26517 rows: release date, popularity score, vote average and vote count
reviews_df = pd.read_csv(data_path + '/tmdb.movies.csv')
# Budget and gross revenues - 5782 rows
# https://www.the-numbers.com/
budget_df = pd.read_csv(data_path + '/tn.movie_budgets.csv')
# Movie ratings (R, etc.), genre, director, writer, theater/dvd date, length, box office amount, studio - 1560 rows
movie_info_df = pd.read_csv(data_path + '/rt.movie_info.tsv', sep='\t')
# Rotten Tomatoes - reviews, critic, review rating, critic quality - 54,432 rows
rt_reviews_df = pd.read_csv(data_path + '/rt.reviews.tsv', sep='\t', encoding='latin-1')


def _imdb_credit_query(link_table, role):
    """Build the IMDb query that pairs every rated movie with its credited people.

    Parameters
    ----------
    link_table : str
        Name of the movie-to-person link table (``'directors'`` or ``'writers'``).
    role : str
        Column alias given to the person's ``primary_name`` in the result.

    Returns
    -------
    str
        A SELECT statement returning all ``movie_basics`` columns plus
        ``avg_rating``, ``num_votes``, the ``role`` column and
        ``primary_profession``.

    Both arguments are interpolated into the SQL text as identifiers, so they
    must be trusted constants from this notebook, never user input.
    """
    # Movies without a rating are excluded (inner join); movies without a
    # credited person are kept with NULLs (left joins).
    return """
SELECT DISTINCT b.*,
       r.averagerating AS avg_rating,
       r.numvotes AS num_votes,
       p.primary_name AS {role},
       p.primary_profession
FROM movie_basics b
JOIN movie_ratings r
  ON b.movie_id = r.movie_id
LEFT JOIN {link_table} c
  ON b.movie_id = c.movie_id
LEFT JOIN persons p
  ON c.person_id = p.person_id
""".format(link_table=link_table, role=role)


# Ratings and genres by director from imdb database
director_and_ratings_df = pd.read_sql(_imdb_credit_query('directors', 'director'), conn)
# Ratings and genres by writers from imdb database
writers_df = pd.read_sql(_imdb_credit_query('writers', 'writer'), conn)

In [3]:
# Display the TMDB reviews frame loaded from tmdb.movies.csv.
reviews_df

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.920,2010-07-16,Inception,8.3,22186
...,...,...,...,...,...,...,...,...,...,...
26512,26512,"[27, 18]",488143,en,Laboratory Conditions,0.600,2018-10-13,Laboratory Conditions,0.0,1
26513,26513,"[18, 53]",485975,en,_EXHIBIT_84xxx_,0.600,2018-05-01,_EXHIBIT_84xxx_,0.0,1
26514,26514,"[14, 28, 12]",381231,en,The Last One,0.600,2018-10-01,The Last One,0.0,1
26515,26515,"[10751, 12, 28]",366854,en,Trailer Made,0.600,2018-06-22,Trailer Made,0.0,1


In [4]:
# TMDB genre ids and their display names, taken from
# https://www.themoviedb.org/talk/5daf6eb0ae36680011d7e6ee
_tmdb_genre_pairs = (
    (28, "Action"),
    (12, "Adventure"),
    (16, "Animation"),
    (35, "Comedy"),
    (80, "Crime"),
    (99, "Documentary"),
    (18, "Drama"),
    (10751, "Family"),
    (14, "Fantasy"),
    (36, "History"),
    (27, "Horror"),
    (10402, "Music"),
    (9648, "Mystery"),
    (10749, "Romance"),
    (878, "Science Fiction"),
    (10770, "TV Movie"),
    (53, "Thriller"),
    (10752, "War"),
    (37, "Western"),
)
# Despite the name this is a list of {"id", "name"} records, in the order above.
genre_dictionary = [
    {"id": genre_id, "name": genre_name}
    for genre_id, genre_name in _tmdb_genre_pairs
]

In [5]:
# Preview genre_ids parsed from strings such as "[12, 14, 10751]" into lists of
# id strings. str.strip takes a set of characters (not a regex), so the
# brackets are listed literally with no backslash escapes.
reviews_df.genre_ids.str.strip('[]').str.replace(' ', '', regex=False).str.split(',')

0            [12, 14, 10751]
1        [14, 12, 16, 10751]
2              [12, 28, 878]
3            [16, 35, 10751]
4              [28, 878, 12]
                ...         
26512               [27, 18]
26513               [18, 53]
26514           [14, 28, 12]
26515        [10751, 12, 28]
26516               [53, 27]
Name: genre_ids, Length: 26517, dtype: object

In [6]:
# NOTE(review): everything in this cell is commented out, so it has no effect and
# reviews_df.genre_ids stays as the raw bracketed strings. If re-enabled, the code
# would split genre_ids, explode to one row per genre id, and map each id to its
# name through genre_dictionary. Either delete this cell or restore it on purpose;
# note that the astype('float') line below does not assign its result.
# reviews_df.genre_ids = reviews_df.genre_ids.str.strip('\[\]').str.replace(" ", "").str.strip().str.split(',')
# reviews_df = reviews_df.explode('genre_ids')
# reviews_df.genre_ids = reviews_df.genre_ids.replace(r'^\s*$', np.nan, regex=True)
# reviews_df.genre_ids.astype('float')
# def find_genre(key):
#     if key is np.nan:
#         return 'None'
#     for dictionary in genre_dictionary:
#         if dictionary['id'] == int(key):
#             return dictionary['name']
#     return 'None'

# reviews_df.genre_ids = reviews_df.genre_ids.apply(lambda x: find_genre(x))


In [7]:
# Number of distinct titles in the TMDB reviews frame.
reviews_df['title'].unique().size

24688

In [8]:
# Display the TMDB reviews frame (loaded from tmdb.movies.csv) once more.
reviews_df

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.920,2010-07-16,Inception,8.3,22186
...,...,...,...,...,...,...,...,...,...,...
26512,26512,"[27, 18]",488143,en,Laboratory Conditions,0.600,2018-10-13,Laboratory Conditions,0.0,1
26513,26513,"[18, 53]",485975,en,_EXHIBIT_84xxx_,0.600,2018-05-01,_EXHIBIT_84xxx_,0.0,1
26514,26514,"[14, 28, 12]",381231,en,The Last One,0.600,2018-10-01,The Last One,0.0,1
26515,26515,"[10751, 12, 28]",366854,en,Trailer Made,0.600,2018-06-22,Trailer Made,0.0,1


In [9]:
# Display the budget frame loaded from tn.movie_budgets.csv.
budget_df

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"
...,...,...,...,...,...,...
5777,78,"Dec 31, 2018",Red 11,"$7,000",$0,$0
5778,79,"Apr 2, 1999",Following,"$6,000","$48,482","$240,495"
5779,80,"Jul 13, 2005",Return to the Land of Wonders,"$5,000","$1,338","$1,338"
5780,81,"Sep 29, 2015",A Plague So Pleasant,"$1,400",$0,$0


In [10]:
# Display the studio/gross frame loaded from bom.movie_gross.csv.
gross_df

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010
...,...,...,...,...,...
3382,The Quake,Magn.,6200.0,,2018
3383,Edward II (2018 re-release),FM,4800.0,,2018
3384,El Pacto,Sony,2500.0,,2018
3385,The Swan,Synergetic,2400.0,,2018


In [11]:
# Reformatting data in budget_df

# The monetary columns arrive as strings such as "$425,000,000". They are named
# explicitly rather than selected by position so a change in column order
# cannot silently convert the wrong columns.
money_columns = ['production_budget', 'domestic_gross', 'worldwide_gross']
# Strip "$", "," and "." and cast to an explicit 64-bit integer: a plain 'int'
# can be 32-bit on some platforms, and worldwide_gross holds values above
# 2,147,483,647 (e.g. "$2,776,345,279").
budget_df[money_columns] = (
    budget_df[money_columns]
    .replace(r'[$.,]', '', regex=True)
    .astype('int64')
)
# Net profit = worldwide gross - production budget
budget_df['profit'] = budget_df['worldwide_gross'] - budget_df['production_budget']

In [12]:
# Combine the budget frame with the gross frame by movie name: budget_df.movie
# is matched against gross_df.title, and the outer join keeps rows from both.
name_keys = {'left_on': 'movie', 'right_on': 'title'}
combined = pd.merge(budget_df, gross_df, how='outer', **name_keys)
# 'id' and 'year' are not carried forward; year is implied by release_date
# (may need revisiting, since rows coming only from gross_df have no release_date).
combined = combined.drop(columns=['id', 'year'])
# Keep the budget frame's domestic gross (_x, which seems to have higher
# precision) and fall back to the gross frame's figure (_y) where it is missing.
combined['domestic_gross_x'] = combined['domestic_gross_x'].fillna(combined['domestic_gross_y'])
# Same preference for the name: 'movie' first, 'title' as the fallback.
combined['movie'] = combined['movie'].fillna(combined['title'])
# The fallback columns are now redundant.
merged_budget_and_gross_df = combined.drop(columns=['domestic_gross_y', 'title'])
merged_budget_and_gross_df

Unnamed: 0,release_date,movie,production_budget,domestic_gross_x,worldwide_gross,profit,studio,foreign_gross
0,"Dec 18, 2009",Avatar,425000000.0,760507625.0,2.776345e+09,2.351345e+09,,
1,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000.0,241063875.0,1.045664e+09,6.350639e+08,BV,804600000
2,"Jun 7, 2019",Dark Phoenix,350000000.0,42762350.0,1.497624e+08,-2.002376e+08,,
3,"May 1, 2015",Avengers: Age of Ultron,330600000.0,459005868.0,1.403014e+09,1.072414e+09,BV,946400000
4,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000.0,620181382.0,1.316722e+09,9.997217e+08,,
...,...,...,...,...,...,...,...,...
7926,,The Quake,,6200.0,,,Magn.,
7927,,Edward II (2018 re-release),,4800.0,,,FM,
7928,,El Pacto,,2500.0,,,Sony,
7929,,The Swan,,2400.0,,,Synergetic,


In [13]:
# Normalise movie names for joining: every non-word character becomes a space,
# runs of whitespace collapse to one space, and the result is lower-cased.
# Raw strings keep \W and \s from being parsed as (invalid) Python escapes.
merged_budget_and_gross_df['movie'] = (
    merged_budget_and_gross_df['movie']
    .str.replace(r'\W', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.lower()
)

In [14]:
# Normalise IMDb primary titles for joining: every non-word character becomes a
# space, runs of whitespace collapse to one space, and the result is lower-cased.
# Raw strings keep \W and \s from being parsed as (invalid) Python escapes.
director_and_ratings_df['primary_title'] = (
    director_and_ratings_df['primary_title']
    .str.replace(r'\W', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.lower()
)

In [15]:
# Normalise TMDB titles for joining: every non-word character becomes a space,
# runs of whitespace collapse to one space, and the result is lower-cased.
# Raw strings keep \W and \s from being parsed as (invalid) Python escapes.
reviews_df['title'] = (
    reviews_df['title']
    .str.replace(r'\W', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.lower()
)

In [16]:
# Keep only the rows that carry a production budget.
has_budget = merged_budget_and_gross_df['production_budget'].notna()
merged_budget_and_gross_df = merged_budget_and_gross_df[has_budget]

In [17]:
# Left-join the TMDB reviews, then the IMDb director/rating rows, onto the
# budget frame, matching on the 'movie' column each time.
# NOTE(review): the title is the only join key, so a title shared by several
# right-hand rows repeats the left row, and different films with the same
# title can be paired - confirm whether a year should be part of the key.
left_on_movie = dict(how='left', left_on='movie')
merge_reviews = pd.merge(
    merged_budget_and_gross_df, reviews_df, right_on='title', **left_on_movie
)
merge_reviews_directors = pd.merge(
    merge_reviews, director_and_ratings_df, right_on='primary_title', **left_on_movie
)

In [18]:
# Display the frame produced by joining budgets, TMDB reviews and IMDb director/rating data.
merge_reviews_directors

Unnamed: 0.1,release_date_x,movie,production_budget,domestic_gross_x,worldwide_gross,profit,studio,foreign_gross,Unnamed: 0,genre_ids,...,movie_id,primary_title,original_title_y,start_year,runtime_minutes,genres,avg_rating,num_votes,director,primary_profession
0,"Dec 18, 2009",avatar,425000000.0,760507625.0,2.776345e+09,2.351345e+09,,,6.0,"[28, 12, 14, 878]",...,tt1775309,avatar,Abatâ,2011.0,93.0,Horror,6.1,43.0,Atsushi Wada,director
1,"May 20, 2011",pirates of the caribbean on stranger tides,410600000.0,241063875.0,1.045664e+09,6.350639e+08,BV,804600000,2470.0,"[12, 28, 14]",...,tt1298650,pirates of the caribbean on stranger tides,Pirates of the Caribbean: On Stranger Tides,2011.0,136.0,"Action,Adventure,Fantasy",6.6,447624.0,Rob Marshall,"director,miscellaneous,producer"
2,"Jun 7, 2019",dark phoenix,350000000.0,42762350.0,1.497624e+08,-2.002376e+08,,,,,...,tt6565702,dark phoenix,Dark Phoenix,2019.0,113.0,"Action,Adventure,Sci-Fi",6.0,24451.0,Simon Kinberg,"producer,writer,director"
3,"May 1, 2015",avengers age of ultron,330600000.0,459005868.0,1.403014e+09,1.072414e+09,BV,946400000,14169.0,"[28, 12, 878]",...,tt2395427,avengers age of ultron,Avengers: Age of Ultron,2015.0,141.0,"Action,Adventure,Sci-Fi",7.3,665594.0,Joss Whedon,"writer,producer,director"
4,"Dec 15, 2017",star wars ep viii the last jedi,317000000.0,620181382.0,1.316722e+09,9.997217e+08,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7887,"Apr 2, 1999",following,6000.0,48482.0,2.404950e+05,2.344950e+05,,,,,...,,,,,,,,,,
7888,"Jul 13, 2005",return to the land of wonders,5000.0,1338.0,1.338000e+03,-3.662000e+03,,,,,...,,,,,,,,,,
7889,"Sep 29, 2015",a plague so pleasant,1400.0,0.0,0.000000e+00,-1.400000e+03,,,,,...,tt2107644,a plague so pleasant,A Plague So Pleasant,2013.0,76.0,"Drama,Horror,Thriller",5.4,72.0,Benjamin Roberds,"camera_department,director,writer"
7890,"Sep 29, 2015",a plague so pleasant,1400.0,0.0,0.000000e+00,-1.400000e+03,,,,,...,tt2107644,a plague so pleasant,A Plague So Pleasant,2013.0,76.0,"Drama,Horror,Thriller",5.4,72.0,Jordan Reyes,"director,cinematographer,composer"


In [19]:
# List the column labels of the merged frame (presumably to choose which ones to drop next).
merge_reviews_directors.columns

Index(['release_date_x', 'movie', 'production_budget', 'domestic_gross_x',
       'worldwide_gross', 'profit', 'studio', 'foreign_gross', 'Unnamed: 0',
       'genre_ids', 'id', 'original_language', 'original_title_x',
       'popularity', 'release_date_y', 'title', 'vote_average', 'vote_count',
       'movie_id', 'primary_title', 'original_title_y', 'start_year',
       'runtime_minutes', 'genres', 'avg_rating', 'num_votes', 'director',
       'primary_profession'],
      dtype='object')

In [20]:
# Columns removed from the merged frame, leaving the release date, movie name,
# financial figures, genre_ids, runtime_minutes, genres and director.
unused_columns = [
    'studio',
    'Unnamed: 0',
    'original_language',
    'id',
    'popularity',
    'title',
    'movie_id',
    'primary_title',
    'original_title_x',
    'original_title_y',
    'release_date_y',
    'vote_average',
    'vote_count',
    'start_year',
    'avg_rating',
    'num_votes',
    'primary_profession',
]
# Rebinding the name (instead of inplace=True) with errors='ignore' lets this
# cell be re-run without a KeyError for columns that were already removed.
merge_reviews_directors = merge_reviews_directors.drop(columns=unused_columns, errors='ignore')

In [21]:
# Show only the merged rows that have TMDB genre ids.
merge_reviews_directors.loc[lambda frame: frame['genre_ids'].notna()]

Unnamed: 0,release_date_x,movie,production_budget,domestic_gross_x,worldwide_gross,profit,foreign_gross,genre_ids,runtime_minutes,genres,director
0,"Dec 18, 2009",avatar,425000000.0,760507625.0,2.776345e+09,2.351345e+09,,"[28, 12, 14, 878]",93.0,Horror,Atsushi Wada
1,"May 20, 2011",pirates of the caribbean on stranger tides,410600000.0,241063875.0,1.045664e+09,6.350639e+08,804600000,"[12, 28, 14]",136.0,"Action,Adventure,Fantasy",Rob Marshall
3,"May 1, 2015",avengers age of ultron,330600000.0,459005868.0,1.403014e+09,1.072414e+09,946400000,"[28, 12, 878]",141.0,"Action,Adventure,Sci-Fi",Joss Whedon
6,"Apr 27, 2018",avengers infinity war,300000000.0,678815482.0,2.048134e+09,1.748134e+09,1369.5,"[12, 28, 14]",149.0,"Action,Adventure,Sci-Fi",Anthony Russo
7,"Apr 27, 2018",avengers infinity war,300000000.0,678815482.0,2.048134e+09,1.748134e+09,1369.5,"[12, 28, 14]",149.0,"Action,Adventure,Sci-Fi",Joe Russo
...,...,...,...,...,...,...,...,...,...,...,...
7858,"Apr 21, 2015",ten,25000.0,0.0,0.000000e+00,-2.500000e+04,,"[12, 27, 9648, 53]",83.0,"Adventure,Horror,Mystery",Sophia Cacciola
7859,"Apr 21, 2015",ten,25000.0,0.0,0.000000e+00,-2.500000e+04,,"[12, 27, 9648, 53]",82.0,"Horror,Mystery,Thriller",Chris Robert
7862,"Dec 31, 2014",dry spell,22000.0,0.0,0.000000e+00,-2.200000e+04,,"[35, 10749]",90.0,"Comedy,Romance",Travis Legge
7864,"Jan 4, 2013",all superheroes must die,20000.0,0.0,0.000000e+00,-2.000000e+04,,"[878, 53]",78.0,"Sci-Fi,Thriller",Jason Trost
