In [11]:
import pandas as pd
import sqlite3

conn = sqlite3.connect('./Data/im.db')

BOM_Data = pd.read_csv('./Data/bom.movie_gross.csv')
BOM_Data.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [12]:
RT_Movie_Data = pd.read_csv('./Data/rt.movie_info.tsv', sep='\t')
RT_Movie_Data.head()

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One
2,5,Illeana Douglas delivers a superb performance ...,R,Drama|Musical and Performing Arts,Allison Anders,Allison Anders,"Sep 13, 1996","Apr 18, 2000",,,116 minutes,
3,6,Michael Douglas runs afoul of a treacherous su...,R,Drama|Mystery and Suspense,Barry Levinson,Paul Attanasio|Michael Crichton,"Dec 9, 1994","Aug 27, 1997",,,128 minutes,
4,7,,NR,Drama|Romance,Rodney Bennett,Giles Cooper,,,,,200 minutes,


In [13]:
RT_Review_Data = pd.read_csv('./Data/rt.reviews.tsv', sep='\t', encoding='latin1')
RT_Review_Data.head(10)

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"
5,3,... Cronenberg's Cosmopolis expresses somethin...,,fresh,Michelle Orange,0,Capital New York,"September 11, 2017"
6,3,"Quickly grows repetitive and tiresome, meander...",C,rotten,Eric D. Snider,0,EricDSnider.com,"July 17, 2013"
7,3,Cronenberg is not a director to be daunted by ...,2/5,rotten,Matt Kelemen,0,Las Vegas CityLife,"April 21, 2013"
8,3,"Cronenberg's cold, exacting precision and emot...",,fresh,Sean Axmaker,0,Parallax View,"March 24, 2013"
9,3,Over and above its topical urgency or the bit ...,,fresh,Kong Rithdee,0,Bangkok Post,"March 4, 2013"


Recommendations

1. Best genres to make movies of (top 10 highest grossing genres)

In [14]:
# According to Box Office Mojo

# Clean
# Remove commas and convert to numeric
BOM_Data['domestic_gross'] = BOM_Data['domestic_gross'].astype(str).str.replace(',', '')
BOM_Data['foreign_gross'] = BOM_Data['foreign_gross'].astype(str).str.replace(',', '')

# Convert the cleaned strings to actual numbers
BOM_Data['domestic_gross'] = pd.to_numeric(BOM_Data['domestic_gross'], errors='coerce')
BOM_Data['foreign_gross'] = pd.to_numeric(BOM_Data['foreign_gross'], errors='coerce')

# Handle Missing Values
BOM_Data['domestic_gross'] = BOM_Data['domestic_gross'].fillna(0)
BOM_Data['foreign_gross'] = BOM_Data['foreign_gross'].fillna(0)

# Perform Addition
BOM_Data['Total_Earnings'] = BOM_Data['domestic_gross'] + BOM_Data['foreign_gross']
BOM_Data.sort_values(by='Total_Earnings', ascending=False)

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year,Total_Earnings
727,Marvel's The Avengers,BV,623400000.0,895500000.0,2012,1.518900e+09
1875,Avengers: Age of Ultron,BV,459000000.0,946400000.0,2015,1.405400e+09
3080,Black Panther,BV,700100000.0,646900000.0,2018,1.347000e+09
328,Harry Potter and the Deathly Hallows Part 2,WB,381000000.0,960500000.0,2011,1.341500e+09
2758,Star Wars: The Last Jedi,BV,620200000.0,712400000.0,2017,1.332600e+09
...,...,...,...,...,...,...
3078,2:22,Magn.,400.0,0.0,2017,4.000000e+02
2321,The Chambermaid,FM,300.0,0.0,2015,3.000000e+02
2756,News From Planet Mars,KL,300.0,0.0,2016,3.000000e+02
2757,Satanic,Magn.,300.0,0.0,2016,3.000000e+02


In [15]:
# According to IMDB (recent >= 2005) (good rating >= 7)

pd.read_sql('''
    SELECT mb.primary_title, mb.genres, mr.averagerating
    FROM movie_basics mb
    JOIN movie_ratings mr
    ON mb.movie_id = mr.movie_id
    WHERE mr.averagerating >= 7
    AND mb.start_year >= 2005
    ORDER BY mr.averagerating DESC;
''', conn)

Unnamed: 0,primary_title,genres,averagerating
0,Exteriores: Mulheres Brasileiras na Diplomacia,Documentary,10.0
1,The Dark Knight: The Ballad of the N Word,"Comedy,Drama",10.0
2,Freeing Bernie Baran,"Crime,Documentary",10.0
3,Hercule contre Hermès,Documentary,10.0
4,I Was Born Yesterday!,Documentary,10.0
...,...,...,...
26902,The Good Terrorist,Documentary,7.0
26903,Anavitória: Araguaína - Las Vegas,Documentary,7.0
26904,La Fin des terres,Documentary,7.0
26905,The Projectionist,Documentary,7.0


In [16]:
# According to Rotten tomatoes

# 1. Clean the 'box_office' column
# Convert to string, remove commas, and convert to numeric
RT_Movie_Data['box_office'] = (RT_Movie_Data['box_office']
                               .astype(str)
                               .str.replace(',', '')
                               .str.replace('$', '', regex=False))

RT_Movie_Data['box_office'] = pd.to_numeric(RT_Movie_Data['box_office'], errors='coerce')

# 2. Split genres and explode the dataframe
# This creates a new row for each genre a movie belongs to
RT_Movie_Data['genre'] = RT_Movie_Data['genre'].str.split('|')
RT_Movie_Exploded = RT_Movie_Data.explode('genre')

# 3. Group by genre, sum the revenue, and sort descending
genre_earnings = (RT_Movie_Exploded.groupby('genre')['box_office']
                  .sum()
                  .sort_values(ascending=False))

# View the results
genre_earnings.head()

genre
Comedy                         5.657980e+09
Drama                          5.650752e+09
Action and Adventure           4.923971e+09
Mystery and Suspense           2.808625e+09
Science Fiction and Fantasy    2.590332e+09
Name: box_office, dtype: float64

2. Best runtime (avg runtime of the top 10 grossing movies)
