In [12]:
#importing libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [13]:
#loading data

df = pd.read_csv('tn.movie_budgets.csv')

In [14]:
#dataset exploration
df.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [15]:
df.tail()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
5777,78,"Dec 31, 2018",Red 11,"$7,000",$0,$0
5778,79,"Apr 2, 1999",Following,"$6,000","$48,482","$240,495"
5779,80,"Jul 13, 2005",Return to the Land of Wonders,"$5,000","$1,338","$1,338"
5780,81,"Sep 29, 2015",A Plague So Pleasant,"$1,400",$0,$0
5781,82,"Aug 5, 2005",My Date With Drew,"$1,100","$181,041","$181,041"


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 5782 non-null   int64 
 1   release_date       5782 non-null   object
 2   movie              5782 non-null   object
 3   production_budget  5782 non-null   object
 4   domestic_gross     5782 non-null   object
 5   worldwide_gross    5782 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


The data information shows that there are no missing values.

In [17]:
#changing all the currency values from objects to float

currency_cols = ['production_budget','domestic_gross','worldwide_gross']
for x in currency_cols:
    df[x] = df[x].astype(str).replace({'\$':'',',':''},regex = True)
    df[x] = df[x].astype(float)
    df[x] = df[x].round(0).astype(int)
df.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,425000000,760507625,-2147483648
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875
2,3,"Jun 7, 2019",Dark Phoenix,350000000,42762350,149762350
3,4,"May 1, 2015",Avengers: Age of Ultron,330600000,459005868,1403013963
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747


In [18]:
#check for duplicates

df.duplicated().sum()

0

There are no duplicated entries

In [19]:
#dropping rows where gross revenue is 0

df = df[df['worldwide_gross'] != 0]

In [22]:
#checking the return on investment for each movie in terms of %
df['r_o_i'] = (((df['worldwide_gross'] - df['production_budget'])/df['production_budget'])*100).astype(int)
df

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,r_o_i
5745,46,"Jun 30, 1972",Deep Throat,25000,45000000,45000000,179900
5613,14,"Mar 21, 1980",Mad Max,200000,8750000,99750000,49775
5492,93,"Sep 25, 2009",Paranormal Activity,450000,107918810,194183034,43051
5679,80,"Jul 10, 2015",The Gallows,100000,22764410,41656474,41556
5406,7,"Jul 14, 1999",The Blair Witch Project,600000,140539099,248300000,41283
...,...,...,...,...,...,...,...
5335,36,"Nov 21, 2014",Food Chains,913000,0,176,-99
2152,53,"Aug 24, 1997",The Grimm Brothers' Snow White,26000000,5000,5000,-99
3818,19,"May 8, 2015",Skin Trade,9000000,1242,1242,-99
4081,82,"May 21, 2010",Perrierâs Bounty,6600000,828,828,-99


In [26]:
#sorting data from the highest return on investment
df = df.sort_values(by='r_o_i', ascending=False)
df.head(50)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,r_o_i
5745,46,"Jun 30, 1972",Deep Throat,25000,45000000,45000000,179900
5613,14,"Mar 21, 1980",Mad Max,200000,8750000,99750000,49775
5492,93,"Sep 25, 2009",Paranormal Activity,450000,107918810,194183034,43051
5679,80,"Jul 10, 2015",The Gallows,100000,22764410,41656474,41556
5406,7,"Jul 14, 1999",The Blair Witch Project,600000,140539099,248300000,41283
5709,10,"May 7, 2004",Super Size Me,65000,11529368,22233808,34105
5346,47,"Aug 13, 1942",Bambi,858000,102797000,268000000,31135
5773,74,"Feb 26, 1993",El Mariachi,7000,2040920,2041928,29070
5676,77,"Oct 1, 1968",Night of the Living Dead,114000,12087064,30087064,26292
5210,11,"Nov 21, 1976",Rocky,1000000,117235147,225000000,22400
