In [198]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
%matplotlib inline

In [149]:
pd.set_option('display.max_rows', 500)

In [346]:
# We use the bom.movie_gross.csv data to identify each movie's studio.
# We wanted to know the studio so we can identify trends within individual studios
bom = pd.read_csv('Data/bom.movie_gross.csv')

# We use the tn.movie_budgets.csv to identify each movie's budget and revenue figures.
tn = pd.read_csv('Data/tn.movie_budgets.csv')

In [347]:
# cleaning up an encoding issue for apostrophes
tn.movie = tn.movie.apply(lambda x: x.replace('â\x80\x99', "'"))

# Converting the money columns, which have values stored as strings due to dollar-signs and commas, to numbers.


def convert_currency_str_to_num(dataframe, column):
    dataframe[column] = dataframe[column].apply(lambda x: x.replace('$', ''))
    dataframe[column] = dataframe[column].apply(lambda x: x.replace(',', ''))
    dataframe[column] = dataframe[column].astype('float')


convert_currency_str_to_num(tn, 'production_budget')
convert_currency_str_to_num(tn, 'domestic_gross')
convert_currency_str_to_num(tn, 'worldwide_gross')

tn.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,425000000.0,760507625.0,2776345000.0
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000.0,241063875.0,1045664000.0
2,3,"Jun 7, 2019",Dark Phoenix,350000000.0,42762350.0,149762400.0
3,4,"May 1, 2015",Avengers: Age of Ultron,330600000.0,459005868.0,1403014000.0
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000.0,620181382.0,1316722000.0


In [348]:
# Removing movies from before 2010 to conform with the time period in the other datasets
tn['year'] = tn.release_date.apply(lambda x: int(x[-4:]))
tn = tn.loc[tn.year >= 2010]

In [349]:
# Converting the release_date to a datetime object
tn['release_date'] = pd.to_datetime(tn.release_date)

In [350]:
tn.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,year
1,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,410600000.0,241063875.0,1045664000.0,2011
2,3,2019-06-07,Dark Phoenix,350000000.0,42762350.0,149762400.0,2019
3,4,2015-05-01,Avengers: Age of Ultron,330600000.0,459005868.0,1403014000.0,2015
4,5,2017-12-15,Star Wars Ep. VIII: The Last Jedi,317000000.0,620181382.0,1316722000.0,2017
5,6,2015-12-18,Star Wars Ep. VII: The Force Awakens,306000000.0,936662225.0,2053311000.0,2015


In [351]:
bom.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [352]:
# This function returns True if the movie title has a 4-digit number/year within a parenthetical
# We found some titles in this dataset which had the year appended in a parenthetical,
# but that wasn't the case in the other datasets,
# so they would get excluded from an inner join on the titles.


def parenthetical_is_a_year(words):
    if words.find("(") != -1:
        new_words = words[(words.find("(")+1):(words.find(")"))]
        if (new_words.isdigit()) and (len(new_words) == 4):
            return True
        else:
            return False
    else:
        return False

In [353]:
# This function relies on parenthetical_is_a_year() to determine if there's a year appended to the title,
# and then removes the appended year if it exists


def remove_appended_yr_from_title(dataframe, column):
    dataframe[column] = dataframe[column].apply(
        lambda x: x[:(x.find("(")-1)] if parenthetical_is_a_year(x) else x)

In [354]:
remove_appended_yr_from_title(bom, 'title')

In [355]:
bom.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland,BV,334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,WB,296000000.0,664300000,2010
3,Inception,WB,292600000.0,535700000,2010
4,Shrek Forever After,P/DW,238700000.0,513900000,2010


In [356]:
# This function removes punctuation from the titles and makes all characters lowercase.
# The intent is to remove styling that could inhibit a match when joining different datasets


def title_string_format_standardization(dataframe, column):
    dataframe[column] = dataframe[column].apply(lambda x: str(x).translate(str.maketrans('',
                                                                                         '',
                                                                                         string.punctuation)
                                                                           ).lower())

In [357]:
title_string_format_standardization(bom, 'title')

In [358]:
title_string_format_standardization(tn, 'movie')

In [359]:
bom.head()

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,toy story 3,BV,415000000.0,652000000,2010
1,alice in wonderland,BV,334200000.0,691300000,2010
2,harry potter and the deathly hallows part 1,WB,296000000.0,664300000,2010
3,inception,WB,292600000.0,535700000,2010
4,shrek forever after,P/DW,238700000.0,513900000,2010


In [360]:
tn.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,year
1,2,2011-05-20,pirates of the caribbean on stranger tides,410600000.0,241063875.0,1045664000.0,2011
2,3,2019-06-07,dark phoenix,350000000.0,42762350.0,149762400.0,2019
3,4,2015-05-01,avengers age of ultron,330600000.0,459005868.0,1403014000.0,2015
4,5,2017-12-15,star wars ep viii the last jedi,317000000.0,620181382.0,1316722000.0,2017
5,6,2015-12-18,star wars ep vii the force awakens,306000000.0,936662225.0,2053311000.0,2015


In [361]:
bom.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
title             3387 non-null object
studio            3382 non-null object
domestic_gross    3359 non-null float64
foreign_gross     2037 non-null object
year              3387 non-null int64
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


In [362]:
tn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2194 entries, 1 to 5780
Data columns (total 7 columns):
id                   2194 non-null int64
release_date         2194 non-null datetime64[ns]
movie                2194 non-null object
production_budget    2194 non-null float64
domestic_gross       2194 non-null float64
worldwide_gross      2194 non-null float64
year                 2194 non-null int64
dtypes: datetime64[ns](1), float64(3), int64(2), object(1)
memory usage: 137.1+ KB


In [363]:
print(bom.columns)
print(tn.columns)

# Only keeping pertinent columns of each dataframe.
bom = bom[['title', 'studio']]
tn = tn[['movie', 'release_date', 'production_budget', 'domestic_gross',
         'worldwide_gross']]

# Dropping the 5 rows from bom that have nulls since the count is so low
bom.dropna(inplace=True)

Index(['title', 'studio', 'domestic_gross', 'foreign_gross', 'year'], dtype='object')
Index(['id', 'release_date', 'movie', 'production_budget', 'domestic_gross',
       'worldwide_gross', 'year'],
      dtype='object')


In [364]:
len(bom.studio.unique())

257

In [365]:
tn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2194 entries, 1 to 5780
Data columns (total 5 columns):
movie                2194 non-null object
release_date         2194 non-null datetime64[ns]
production_budget    2194 non-null float64
domestic_gross       2194 non-null float64
worldwide_gross      2194 non-null float64
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 102.8+ KB


In [366]:
# Creating a foreign_gross column.
tn['foreign_gross'] = tn['worldwide_gross'] - tn['domestic_gross']
tn.head()

Unnamed: 0,movie,release_date,production_budget,domestic_gross,worldwide_gross,foreign_gross
1,pirates of the caribbean on stranger tides,2011-05-20,410600000.0,241063875.0,1045664000.0,804600000.0
2,dark phoenix,2019-06-07,350000000.0,42762350.0,149762400.0,107000000.0
3,avengers age of ultron,2015-05-01,330600000.0,459005868.0,1403014000.0,944008100.0
4,star wars ep viii the last jedi,2017-12-15,317000000.0,620181382.0,1316722000.0,696540400.0
5,star wars ep vii the force awakens,2015-12-18,306000000.0,936662225.0,2053311000.0,1116649000.0


In [367]:
sum(bom.duplicated())

0

In [368]:
sum(tn.duplicated())

0

In [369]:
# We did a left merge to explore the non-matches for a later inner join that will ultimately be analyzed.
# The previous data cleaning of the title strings was in response to this EDA.
studio_movie_performance_left = bom.merge(right=tn,
                                          how='left',
                                          left_on='title',
                                          right_on='movie')

In [370]:
sum(studio_movie_performance_left.movie.isna())

1939

In [371]:
studio_movie_performance_left.sort_values('title').head(100)

Unnamed: 0,title,studio,movie,release_date,production_budget,domestic_gross,worldwide_gross,foreign_gross
2394,10 cloverfield lane,Par.,10 cloverfield lane,2016-03-11,5000000.0,72082999.0,108286422.0,36203423.0
1038,10 years,Anch.,,NaT,,,,
1830,1000 times good night,FM,,NaT,,,,
2282,1001 grams,KL,,NaT,,,,
3222,102 not out,Sony,,NaT,,,,
533,111111,Rocket,,NaT,,,,
3161,12 strong,WB,12 strong,2018-01-19,35000000.0,45819713.0,71118378.0,25298665.0
1168,12 years a slave,FoxS,12 years a slave,2013-10-18,20000000.0,56671993.0,181025343.0,124353350.0
94,127 hours,FoxS,127 hours,2010-11-05,18000000.0,18335230.0,60217171.0,41881941.0
486,13 assassins,Magn.,,NaT,,,,


In [372]:
studio_movie_performance_left.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3387 entries, 0 to 3386
Data columns (total 8 columns):
title                3387 non-null object
studio               3387 non-null object
movie                1448 non-null object
release_date         1448 non-null datetime64[ns]
production_budget    1448 non-null float64
domestic_gross       1448 non-null float64
worldwide_gross      1448 non-null float64
foreign_gross        1448 non-null float64
dtypes: datetime64[ns](1), float64(4), object(3)
memory usage: 238.1+ KB


In [373]:
# Inner joining the studio and budget+revenue data sets to ensure we have a set of movies with a complete set of info.
studio_movie_performance_inner = bom.merge(right=tn,
                                           how='inner',
                                           left_on='title',
                                           right_on='movie')

In [374]:
studio_movie_performance_inner.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1448 entries, 0 to 1447
Data columns (total 8 columns):
title                1448 non-null object
studio               1448 non-null object
movie                1448 non-null object
release_date         1448 non-null datetime64[ns]
production_budget    1448 non-null float64
domestic_gross       1448 non-null float64
worldwide_gross      1448 non-null float64
foreign_gross        1448 non-null float64
dtypes: datetime64[ns](1), float64(4), object(3)
memory usage: 101.8+ KB


In [375]:
imdb_titles = pd.read_csv('Exploration/imdb_df_join3.csv')

In [376]:
imdb_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 10 columns):
tconst             146144 non-null object
primary_title      146144 non-null object
original_title     146123 non-null object
start_year         146144 non-null int64
runtime_minutes    114405 non-null float64
genres             146144 non-null object
averagerating      73856 non-null float64
numvotes           73856 non-null float64
directors          140417 non-null object
writers            110261 non-null object
dtypes: float64(3), int64(1), object(6)
memory usage: 11.2+ MB


In [377]:
imdb_titles.head()

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres,averagerating,numvotes,directors,writers
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama",7.0,77.0,nm0712540,"nm0023551,nm1194313,nm0347899,nm1391276"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama",7.2,43.0,nm0002411,
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama,6.9,4517.0,nm0000080,"nm0000080,nm0462648"
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama",6.1,13.0,nm0611531,nm0347899
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy",6.5,119.0,"nm0765384,nm0749914","nm1360635,nm0749914"


In [378]:
len(set(imdb_titles.original_title))

137774

In [379]:
# Creating subsets of the dataframe with the unique IMDb title ID (i.e. tconst).
# These are then concatenated and duplicates are removed.
# This provides a map between all potential IMDb titles and their unique IDs.
primary_titles = imdb_titles[['tconst', 'primary_title']]
original_titles = imdb_titles[['tconst', 'original_title']]
primary_titles.columns = ['tconst', 'title']
original_titles.columns = ['tconst', 'title']

In [380]:
imdb_titles_only = pd.concat([primary_titles, original_titles])

In [381]:
title_string_format_standardization(imdb_titles_only, 'title')

In [382]:
imdb_titles_only.head()

Unnamed: 0,tconst,title
0,tt0063540,sunghursh
1,tt0066787,one day before the rainy season
2,tt0069049,the other side of the wind
3,tt0069204,sabse bada sukh
4,tt0100275,the wandering soap opera


In [383]:
imdb_titles_only.tail()

Unnamed: 0,tconst,title
146139,tt9916538,kuambil lagi hatiku
146140,tt9916622,rodolpho teóphilo o legado de um pioneiro
146141,tt9916706,dankyavar danka
146142,tt9916730,6 gunn
146143,tt9916754,chico albuquerque revelações


In [384]:
imdb_titles_only.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 292288 entries, 0 to 146143
Data columns (total 2 columns):
tconst    292288 non-null object
title     292288 non-null object
dtypes: object(2)
memory usage: 6.7+ MB


In [385]:
# Removing duplicates.
imdb_titles_only = imdb_titles_only.loc[imdb_titles_only.duplicated() == False]
imdb_titles_only.dropna(inplace=True)

In [386]:
imdb_titles_only.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 160557 entries, 0 to 146135
Data columns (total 2 columns):
tconst    160557 non-null object
title     160557 non-null object
dtypes: object(2)
memory usage: 3.7+ MB


In [387]:
imdb_titles_only.reset_index(inplace=True)

In [388]:
imdb_titles_only.drop('index', axis=1, inplace=True)
imdb_titles_only.head()

Unnamed: 0,tconst,title
0,tt0063540,sunghursh
1,tt0066787,one day before the rainy season
2,tt0069049,the other side of the wind
3,tt0069204,sabse bada sukh
4,tt0100275,the wandering soap opera


In [389]:
studio_movie_performance_inner.head()

Unnamed: 0,title,studio,movie,release_date,production_budget,domestic_gross,worldwide_gross,foreign_gross
0,toy story 3,BV,toy story 3,2010-06-18,200000000.0,415004880.0,1068880000.0,653874642.0
1,alice in wonderland,BV,alice in wonderland,2010-03-05,200000000.0,334191110.0,1025491000.0,691300000.0
2,inception,WB,inception,2010-07-16,160000000.0,292576195.0,835524600.0,542948447.0
3,shrek forever after,P/DW,shrek forever after,2010-05-21,165000000.0,238736787.0,756244700.0,517507886.0
4,the twilight saga eclipse,Sum.,the twilight saga eclipse,2010-06-30,68000000.0,300531751.0,706102800.0,405571077.0


In [390]:
# Joining the unique IMDb title ID (i.e. tconst) with the budget, revenue and studio data.
studio_movie_performance_inner_w_imdb = studio_movie_performance_inner.merge(right=imdb_titles_only,
                                                                             how='inner',
                                                                             left_on='title',
                                                                             right_on='title')

In [391]:
studio_movie_performance_inner_w_imdb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2077 entries, 0 to 2076
Data columns (total 9 columns):
title                2077 non-null object
studio               2077 non-null object
movie                2077 non-null object
release_date         2077 non-null datetime64[ns]
production_budget    2077 non-null float64
domestic_gross       2077 non-null float64
worldwide_gross      2077 non-null float64
foreign_gross        2077 non-null float64
tconst               2077 non-null object
dtypes: datetime64[ns](1), float64(4), object(4)
memory usage: 162.3+ KB


In [392]:
studio_movie_performance_inner_w_imdb.head()

Unnamed: 0,title,studio,movie,release_date,production_budget,domestic_gross,worldwide_gross,foreign_gross,tconst
0,toy story 3,BV,toy story 3,2010-06-18,200000000.0,415004880.0,1068880000.0,653874642.0,tt0435761
1,alice in wonderland,BV,alice in wonderland,2010-03-05,200000000.0,334191110.0,1025491000.0,691300000.0,tt1014759
2,alice in wonderland,BV,alice in wonderland,2010-03-05,200000000.0,334191110.0,1025491000.0,691300000.0,tt1926979
3,alice in wonderland,BV,alice in wonderland,2010-03-05,200000000.0,334191110.0,1025491000.0,691300000.0,tt2049386
4,inception,WB,inception,2010-07-16,160000000.0,292576195.0,835524600.0,542948447.0,tt1375666


In [393]:
studio_movie_performance_inner_w_imdb.drop('movie', axis=1, inplace=True)

In [394]:
studio_movie_performance_inner_w_imdb.to_csv(
    'Exploration/studio_movie_performance_inner_w_imdb.csv', index=False)