In [69]:
#Dependencies
#pip install xlrd
import numpy as np
import pandas as pd
import datetime as dt
import plotly.express as px
import plotly.subplots as ps
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [70]:
# Read the raw spreadsheet exported from The Numbers.
df = pd.read_excel('./Resources/The_Numbers_Data.xlsx')

# The sheet has no usable header row, so pandas labels the columns with the
# title cell and 'Unnamed: N'. Map those to descriptive names.
column_names = {
    'The Numbers - Where Data and Movies Meet': 'Release_Date',
    'Unnamed: 1': 'Title',
    'Unnamed: 2': 'Budget',
    'Unnamed: 3': 'Opening_Weekend',
    'Unnamed: 4': 'Domestic_Box_Office',
    'Unnamed: 5': 'Worldwide_Box_Office',
}

# Build the working frame as one chain so that every step returns a new
# DataFrame. Renaming the `df.loc[...]` slice with inplace=True mutated a
# slice of `df` and raised SettingWithCopyWarning.
#   * loc[101:144] is label-based and inclusive; inspect df.loc[99:144] to
#     re-check the range if the spreadsheet changes.
#   * dropna() removes every row with a missing value (per the original note,
#     this is what removes Werewolf by Night, which has no figures).
#   * Release_Date is parsed to datetime and the rows are sorted oldest first.
movies_df = (
    df.loc[101:144]
    .rename(columns=column_names)
    .dropna()
    .assign(Release_Date=lambda frame: pd.to_datetime(frame['Release_Date']))
    .sort_values(by='Release_Date')
)
movies_df



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,Release_Date,Title,Budget,Opening_Weekend,Domestic_Box_Office,Worldwide_Box_Office
144,2008-06-13,The Incredible Hulk,137500000,55414050,134806913,265573859
143,2010-05-07,Iron Man 2,170000000,128122480,312433331,621156389
142,2011-05-06,Thor,150000000,65723338,181030624,449326618
141,2011-07-22,Captain America: The First …,140000000,65058524,176654505,370569776
140,2012-05-04,The Avengers,225000000,207438708,623357910,1515100211
139,2013-05-03,Iron Man 3,200000000,174144585,408992272,1215392272
138,2013-11-08,Thor: The Dark World,150000000,85737841,206362140,644602516
137,2014-04-04,Captain America: The Winter…,170000000,95023721,259746958,714401889
136,2014-08-01,Guardians of the Galaxy,170000000,94320883,333714112,770882395
135,2015-05-01,Avengers: Age of Ultron,365000000,191271109,459005868,1395316979


In [71]:
# Add the two derived money columns in a single assign():
#   International_Box_Office = worldwide gross minus domestic gross
#   Profit                   = worldwide gross minus production budget
worldwide_gross = movies_df['Worldwide_Box_Office']
movies_df = movies_df.assign(
    International_Box_Office=worldwide_gross - movies_df['Domestic_Box_Office'],
    Profit=worldwide_gross - movies_df['Budget'],
)

movies_df.head()

Unnamed: 0,Release_Date,Title,Budget,Opening_Weekend,Domestic_Box_Office,Worldwide_Box_Office,International_Box_Office,Profit
144,2008-06-13,The Incredible Hulk,137500000,55414050,134806913,265573859,130766946,128073859
143,2010-05-07,Iron Man 2,170000000,128122480,312433331,621156389,308723058,451156389
142,2011-05-06,Thor,150000000,65723338,181030624,449326618,268295994,299326618
141,2011-07-22,Captain America: The First …,140000000,65058524,176654505,370569776,193915271,230569776
140,2012-05-04,The Avengers,225000000,207438708,623357910,1515100211,891742301,1290100211


In [72]:
# Cast every monetary column to float in one astype() call.
money_columns = [
    'Budget',
    'Opening_Weekend',
    'Domestic_Box_Office',
    'Worldwide_Box_Office',
    'International_Box_Office',
    'Profit',
]
movies_df = movies_df.astype(dict.fromkeys(money_columns, 'float'))

# Show floats as whole numbers so large values are not rendered in
# scientific notation.
pd.set_option('display.float_format', lambda x: '%.0f' % x)

movies_df.dtypes

Release_Date                datetime64[ns]
Title                               object
Budget                             float64
Opening_Weekend                    float64
Domestic_Box_Office                float64
Worldwide_Box_Office               float64
International_Box_Office           float64
Profit                             float64
dtype: object

In [73]:
# Grouped bar chart: domestic vs international box office for each title.
# Both traces share the Title axis; only the domestic bars get an explicit
# colour, the international bars use the default colour cycle.
movie_titles = movies_df['Title']
fig = go.Figure(data=[
    go.Bar(
        x=movie_titles,
        y=movies_df['Domestic_Box_Office'],
        name='Domestic Box Office',
        marker_color='blue',
    ),
    go.Bar(
        x=movie_titles,
        y=movies_df['International_Box_Office'],
        name='International Box Office',
    ),
])
fig.update_layout(
    title="Domestic vs International Box Office",
    barmode='group',
    xaxis_tickangle=-45,
    autosize=False,
    width=1300,
    height=700,
)
fig.update_traces(textangle=270)
fig.show()

In [74]:
# Bar chart of production budget per title, with SI-abbreviated value labels.
fig = px.bar(
    movies_df,
    x='Title',
    y='Budget',
    title="Budget by Movie Title",
    color_discrete_sequence=["red"],
    text_auto='.2s',
)
# Fixed canvas size; slanted axis labels and vertical bar text keep the
# labels from overlapping.
layout_options = dict(barmode='group', xaxis_tickangle=-45,
                      autosize=False, width=1300, height=700)
fig.update_layout(**layout_options)
fig.update_traces(textangle=270)

fig.show()

In [75]:
# Bar chart of worldwide box office per title, with SI-abbreviated value labels.
fig = px.bar(
    movies_df,
    x='Title',
    y='Worldwide_Box_Office',
    title="Worldwide Box Office by Movie Title",
    color_discrete_sequence=["green"],
    text_auto='.2s',
)
# Same fixed canvas and label orientation as the other per-title bar charts.
layout_options = dict(barmode='group', xaxis_tickangle=-45,
                      autosize=False, width=1300, height=700)
fig.update_layout(**layout_options)
fig.update_traces(textangle=270)
fig.show()

In [76]:
# Stacked bars per title: budget (red) with profit (green) stacked on top.
stacked_columns = ['Budget', "Profit"]
fig = px.bar(
    movies_df,
    x="Title",
    y=stacked_columns,
    title="Budget vs Profit",
    color_discrete_sequence=["red", "green"],
    text_auto='0.2s',
)
fig.update_layout(
    barmode='stack',
    xaxis_tickangle=-45,
    autosize=False,
    width=1300,
    height=700,
)
# Narrower bars with the value text drawn vertically outside each segment.
fig.update_traces(textangle=270, textposition='outside', width=0.5)
fig.show()

In [96]:
# Assign MCU phases.
# Renumber the date-sorted rows 0..n-1 first; the later charts use this index
# as the x position of each movie.
movies_df = movies_df.reset_index(drop=True)

# Release-date window for each phase: (exclusive start, inclusive end).
# A movie released outside every window is left out of the result.
phase_windows = {
    1: ('2008-03-01', '2012-05-04'),
    2: ('2013-05-02', '2015-07-18'),
    3: ('2016-05-03', '2019-07-02'),
    4: ('2021-06-09', '2022-11-11'),
    5: ('2023-01-17', '2024-09-06'),
}

# Select each window, tag it with its phase number and stack the pieces.
# pd.concat replaces the chained DataFrame.append calls: append was deprecated
# in pandas 1.4 and removed in 2.0. Like append, concat keeps each row's index.
release_dates = movies_df['Release_Date']
movies_df = pd.concat([
    movies_df[(release_dates > start) & (release_dates <= end)].assign(Phase=phase)
    for phase, (start, end) in phase_windows.items()
])

# Persist the labelled table (tab-separated despite the .csv extension).
movies_df.to_csv('marvel_box_office.csv', sep='\t', encoding='utf-8')
movies_df

Unnamed: 0,Release_Date,Title,Budget,Opening_Weekend,Domestic_Box_Office,Worldwide_Box_Office,International_Box_Office,Profit,Phase
0,2008-06-13,The Incredible Hulk,137500000,55414050,134806913,265573859,130766946,128073859,1
1,2010-05-07,Iron Man 2,170000000,128122480,312433331,621156389,308723058,451156389,1
2,2011-05-06,Thor,150000000,65723338,181030624,449326618,268295994,299326618,1
3,2011-07-22,Captain America: The First …,140000000,65058524,176654505,370569776,193915271,230569776,1
4,2012-05-04,The Avengers,225000000,207438708,623357910,1515100211,891742301,1290100211,1
5,2013-05-03,Iron Man 3,200000000,174144585,408992272,1215392272,806400000,1015392272,2
6,2013-11-08,Thor: The Dark World,150000000,85737841,206362140,644602516,438240376,494602516,2
7,2014-04-04,Captain America: The Winter…,170000000,95023721,259746958,714401889,454654931,544401889,2
8,2014-08-01,Guardians of the Galaxy,170000000,94320883,333714112,770882395,437168283,600882395,2
9,2015-05-01,Avengers: Age of Ultron,365000000,191271109,459005868,1395316979,936311111,1030316979,2


In [97]:
# Phase One: movies released after 2008-04-02 and on or before 2012-05-04.
ph1 = movies_df[(movies_df['Release_Date'] > '2008-04-02') & (movies_df['Release_Date'] <= '2012-05-04')]

# Worldwide box office per movie (x = row index), bubble size = profit,
# with a single OLS trendline over the whole phase.
fig_p1 = px.scatter(
    ph1,
    x=ph1.index,
    y="Worldwide_Box_Office",
    size='Profit',
    trendline="ols",
    trendline_scope="overall",
    color_discrete_sequence=["blue", "red"],
    title="Phase One Worldwide Box Office by Movie Title",
)

# Label each x position with the title stored in that row. The previous
# hand-written list had six names for five tick positions and started with a
# movie that is not in the frame, so every label was wrong or shifted.
fig_p1.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(),
    xaxis=dict(
        tickmode='array',
        tickvals=ph1.index.tolist(),
        ticktext=ph1['Title'].tolist(),
        title='Movie Title',
    ),
)

fig_p1.show()

In [100]:
# Phase Two: movies released after 2012-05-04 and on or before 2015-08-15.
ph2 = movies_df[(movies_df['Release_Date'] > '2012-05-04') & (movies_df['Release_Date'] <= '2015-08-15')]

# Worldwide box office per movie (x = row index), bubble size = profit,
# with a single OLS trendline over the whole phase.
fig = px.scatter(
    ph2,
    x=ph2.index,
    y="Worldwide_Box_Office",
    size='Profit',
    trendline="ols",
    trendline_scope="overall",
    color_discrete_sequence=["orange", "red"],
    title="Phase Two Worldwide Box Office by Movie Title",
)

# Take tick positions and labels from the rows being plotted rather than from
# a hand-maintained list, so labels cannot drift from the data when the set
# of rows changes.
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(),
    xaxis=dict(
        tickmode='array',
        tickvals=ph2.index.tolist(),
        ticktext=ph2['Title'].tolist(),
        title='Movie Title',
    ),
)

fig.show()


In [103]:
# Phase Three: movies released after 2016-05-03 and on or before 2019-07-02.
ph3 = movies_df[(movies_df['Release_Date'] > '2016-05-03') & (movies_df['Release_Date'] <= '2019-07-02')]

# Worldwide box office per movie (x = row index), bubble size = profit,
# with a single OLS trendline over the whole phase.
fig = px.scatter(
    ph3,
    x=ph3.index,
    y="Worldwide_Box_Office",
    size='Profit',
    trendline="ols",
    trendline_scope="overall",
    color_discrete_sequence=["green", "red"],
    title="Phase Three Worldwide Box Office by Movie Title",
)

# Take tick positions and labels from the rows being plotted rather than from
# a hand-maintained list (which also carried a stray tab character in one
# label), so labels cannot drift from the data.
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(),
    xaxis=dict(
        tickmode='array',
        tickvals=ph3.index.tolist(),
        ticktext=ph3['Title'].tolist(),
        title='Movie Title',
    ),
)

fig.show()


In [102]:
# Phase Four: movies released after 2021-06-09 and on or before 2022-11-11.
ph4 = movies_df[(movies_df['Release_Date'] > '2021-06-09') & (movies_df['Release_Date'] <= '2022-11-11')]

# Worldwide box office per movie (x = row index), bubble size = profit,
# with a single OLS trendline over the whole phase.
fig = px.scatter(
    ph4,
    x=ph4.index,
    y="Worldwide_Box_Office",
    size='Profit',
    trendline="ols",
    trendline_scope="overall",
    color_discrete_sequence=["red", "red"],
    title="Phase Four Worldwide Box Office by Movie Title",
)

# Take tick positions and labels from the rows being plotted rather than from
# a hand-maintained list, so labels cannot drift from the data when the set
# of rows changes.
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(),
    xaxis=dict(
        tickmode='array',
        tickvals=ph4.index.tolist(),
        ticktext=ph4['Title'].tolist(),
        title='Movie Title',
    ),
)

fig.show()

In [90]:
# Phase Five: movies released after 2023-01-17 and on or before 2024-09-06.
ph5 = movies_df[(movies_df['Release_Date'] > '2023-01-17') & (movies_df['Release_Date'] <= '2024-09-06')]

# Worldwide box office per movie (x = row index) with a single OLS trendline.
# Bubble size uses worldwide box office instead of profit here;
# change size to profit once Marvels becomes positive.
fig = px.scatter(
    ph5,
    x=ph5.index,
    y="Worldwide_Box_Office",
    size='Worldwide_Box_Office',
    trendline="ols",
    trendline_scope="overall",
    color_discrete_sequence=["purple", "purple"],
    title="Phase Five  Worldwide Box Office by Movie Title",
)

# Take tick positions and labels from the rows being plotted rather than from
# a hand-maintained list, so labels cannot drift from the data when the set
# of rows changes.
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin=dict(),
    xaxis=dict(
        tickmode='array',
        tickvals=ph5.index.tolist(),
        ticktext=ph5['Title'].tolist(),
        title='Movie Title',
    ),
)

fig.show()

In [104]:
# One facet per phase, each with its own OLS trendline.
# Bubble size uses worldwide box office;
# change size to profit once Marvels becomes positive.
fig = px.scatter(
    movies_df,
    x=movies_df.index,
    y="Worldwide_Box_Office",
    size="Worldwide_Box_Office",
    facet_col="Phase",
    color='Phase',
    trendline="ols",
)

# Let every facet keep its own x range and show its own tick labels.
fig.update_xaxes(matches=None)
fig.update_xaxes(tickangle=45, title="Movie Title")
fig.update_yaxes(title="Worldwide Box Office")
fig.for_each_xaxis(lambda xaxis: xaxis.update(showticklabels=True))

# Build the tick settings for each facet from the rows it shows. Facets are
# laid out in the order the Phase values first appear in the frame, and the
# n-th facet uses layout key 'xaxis' (n == 1) or 'xaxis<n>'.
# The previous hand-written lists mislabelled the first facet (six names for
# five positions, starting with a movie that is not in the frame).
facet_ticks = {}
for position, phase in enumerate(movies_df['Phase'].unique(), start=1):
    phase_rows = movies_df[movies_df['Phase'] == phase]
    axis_key = 'xaxis' if position == 1 else f'xaxis{position}'
    facet_ticks[axis_key] = dict(
        tickmode='array',
        tickvals=phase_rows.index.tolist(),
        ticktext=phase_rows['Title'].tolist(),
    )

fig.update_layout(height=500, width=1200, **facet_ticks)

fig.show()




In [105]:
# Phase Five rows (ph1..ph4 come from the per-phase cells above).
ph5 = movies_df[(movies_df['Release_Date'] > '2023-01-17') & (movies_df['Release_Date'] <= '2024-09-06')]

# Profit per movie, one subplot per phase on a 3x2 grid filled row by row.
phase_frames = [
    ("Phase 1", ph1),
    ("Phase 2", ph2),
    ("Phase 3", ph3),
    ("Phase 4", ph4),
    ("Phase 5", ph5),
]
n_cols = 2
fig = make_subplots(rows=3, cols=n_cols, start_cell="top-left",
                    subplot_titles=tuple(name for name, _ in phase_frames))

# Add one trace per phase and collect its tick settings. Ticks come from the
# plotted rows themselves; the previous hand-written lists mislabelled the
# first subplot (six names for five positions, starting with a movie that is
# not in the frame). The n-th subplot uses layout key 'xaxis' or 'xaxis<n>'.
axis_ticks = {}
for position, (phase_name, phase_rows) in enumerate(phase_frames):
    grid_row, grid_col = divmod(position, n_cols)
    fig.add_trace(
        go.Scatter(x=phase_rows.index, y=phase_rows['Profit'], name=phase_name),
        row=grid_row + 1, col=grid_col + 1,
    )
    axis_key = 'xaxis' if position == 0 else f'xaxis{position + 1}'
    axis_ticks[axis_key] = dict(
        tickmode='array',
        tickvals=phase_rows.index.tolist(),
        ticktext=phase_rows['Title'].tolist(),
    )

fig.update_layout(height=1000, width=1000,
                  title_text="Profit of each Movie by Phase",
                  **axis_ticks)
fig.update_xaxes(tickangle=45)
fig.show()

In [86]:
# Average every numeric column per phase.
# numeric_only=True is required because the frame also holds the text column
# Title, and pandas >= 2.0 raises when asked to average it.
four_phase_avg = movies_df.groupby('Phase').mean(numeric_only=True).reset_index()

# Same averages with the Phase 4 Spider-Man movies removed.
# Rows are selected by title and phase instead of by hard-coded index labels,
# which point at different movies whenever the row numbering changes.
# NOTE(review): this follows the original comment ("from phase 4"); drop the
# Phase condition if every Spider-Man title should be excluded.
is_spiderman = movies_df['Title'].str.contains('Spider-Man', case=False, regex=False, na=False)
no_spider2 = movies_df[~(is_spiderman & (movies_df['Phase'] == 4))]
no_spider_avg = no_spider2.groupby('Phase').mean(numeric_only=True).reset_index()

In [87]:
# Average profit per phase with an overall OLS trendline; bubble size also
# encodes the average profit.
fig = px.scatter(
    four_phase_avg,
    x='Phase',
    y="Profit",
    size='Profit',
    trendline="ols",
    trendline_scope="overall",
)

# Spell out the phase numbers on the x axis.
phase_names = {1: 'One', 2: 'Two', 3: 'Three', 4: 'Four', 5: 'Five'}
fig.update_layout(
    autosize=False,
    width=1000,
    height=500,
    margin=dict(),
    xaxis=dict(
        tickmode='array',
        tickvals=list(phase_names),
        ticktext=list(phase_names.values()),
    ),
)
fig.show()

In [88]:


# Slice labels shared by all four pies.
labels = ['Phase 1', 'Phase 2', 'Phase 3', 'Phase 4', 'Phase 5']

# Colour sequence shared by all four pies.
colors = px.colors.sequential.RdBu

# 2x2 grid; pie traces need 'domain'-type subplot cells.
specs = [[{'type': 'domain'}, {'type': 'domain'}],
         [{'type': 'domain'}, {'type': 'domain'}]]
fig = make_subplots(rows=2, cols=2, specs=specs,
                    subplot_titles=['Worldwide Box Office',
                                    'Worldwide Box Office No Spiderman',
                                    "Profit Phase", "Profit no Spiderman"])

# One pie per (metric, data set): the top row shows worldwide box office, the
# bottom row profit; the left column uses all movies, the right column the
# averages computed without Spider-Man.
pie_layout = [
    (four_phase_avg, 'Worldwide_Box_Office', 'With Spiderman', 1, 1),
    (no_spider_avg, 'Worldwide_Box_Office', 'No Spiderman', 1, 2),
    (four_phase_avg, 'Profit', 'With Spiderman', 2, 1),
    (no_spider_avg, 'Profit', 'No Spiderman', 2, 2),
]
for averages, metric, trace_name, grid_row, grid_col in pie_layout:
    fig.add_trace(
        go.Pie(labels=labels, values=averages[metric],
               name=trace_name, marker_colors=colors),
        grid_row, grid_col,
    )

# Hover text, figure title, legend and fixed canvas size.
fig.update_traces(hoverinfo='label+percent+name')
fig.update_layout(title_text='Worldwide Box Office and Profit Phase',
                  showlegend=True)
fig.update_layout(autosize=False, width=1000, height=500)

fig.show()