In [1]:
#pip install xlrd
#Dependencies
import numpy as np
import pandas as pd
import datetime as dt
import plotly.express as px
import plotly.subplots as ps
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [2]:
# Read the raw workbook. The columns arrive named
# 'The Numbers - Where Data and Movies Meet', 'Unnamed: 1', ... and are
# renamed to meaningful labels below.
df = pd.read_excel('./Resources/The_Numbers_Data.xlsx')

# Uncomment to inspect the sheet and confirm which rows to select:
# print(df.loc[99:144])

# Rows 113-144 (label-based, both ends inclusive) are the ones analysed.
# `rename` is called WITHOUT inplace=True so it returns a new DataFrame:
# renaming a `.loc` slice of `df` in place raised SettingWithCopyWarning.
movies_df = (
    df.loc[113:144]
    .rename(columns={
        'The Numbers - Where Data and Movies Meet': 'Release_Date',
        'Unnamed: 1': 'Title',
        'Unnamed: 2': 'Budget',
        'Unnamed: 3': 'Opening_Weekend',
        'Unnamed: 4': 'Domestic_Box_Office',
        'Unnamed: 5': 'Worldwide_Box_Office',
    })
    # Drop Werewolf by Night (row 115): it has no numbers
    .drop(115)
)

# Assign budget for Wakanda Forever (row 114)
movies_df.at[114, 'Budget'] = 250000000

# Parse the release dates and order the films chronologically; every later
# cell that addresses rows by position relies on this ordering.
movies_df['Release_Date'] = pd.to_datetime(movies_df['Release_Date'])
movies_df = movies_df.sort_values(by='Release_Date')
movies_df.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,Release_Date,Title,Budget,Opening_Weekend,Domestic_Box_Office,Worldwide_Box_Office
144,2008-05-02,Iron Man,186000000,102118668,318604126,585171547
143,2008-06-13,The Incredible Hulk,137500000,55414050,134806913,265573859
142,2010-05-07,Iron Man 2,170000000,128122480,312433331,621156389
141,2011-05-06,Thor,150000000,65723338,181030624,449326618
140,2011-07-22,Captain America: The First …,140000000,65058524,176654505,370569776


In [3]:
# Derive two columns from the worldwide gross:
#   International_Box_Office = worldwide gross - domestic gross
#   Profit                   = worldwide gross - budget
worldwide_gross = movies_df['Worldwide_Box_Office']
movies_df = movies_df.assign(
    International_Box_Office=worldwide_gross - movies_df['Domestic_Box_Office'],
    Profit=worldwide_gross - movies_df['Budget'],
)

movies_df.head()

Unnamed: 0,Release_Date,Title,Budget,Opening_Weekend,Domestic_Box_Office,Worldwide_Box_Office,International_Box_Office,Profit
144,2008-05-02,Iron Man,186000000,102118668,318604126,585171547,266567421,399171547
143,2008-06-13,The Incredible Hulk,137500000,55414050,134806913,265573859,130766946,128073859
142,2010-05-07,Iron Man 2,170000000,128122480,312433331,621156389,308723058,451156389
141,2011-05-06,Thor,150000000,65723338,181030624,449326618,268295994,299326618
140,2011-07-22,Captain America: The First …,140000000,65058524,176654505,370569776,193915271,230569776


In [4]:
# Cast every monetary column to float in one pass.
money_columns = [
    'Budget',
    'Opening_Weekend',
    'Domestic_Box_Office',
    'Worldwide_Box_Office',
    'International_Box_Office',
    'Profit',
]
movies_df = movies_df.astype(dict.fromkeys(money_columns, 'float'))

# Display floats with no decimals so large values are not shown in
# scientific notation.
pd.set_option('display.float_format', lambda value: '%.0f' % value)

movies_df.dtypes

Release_Date                datetime64[ns]
Title                               object
Budget                             float64
Opening_Weekend                    float64
Domestic_Box_Office                float64
Worldwide_Box_Office               float64
International_Box_Office           float64
Profit                             float64
dtype: object

In [5]:
# Grouped bars comparing domestic and international gross for every film.
# Only the domestic trace pins a colour; the international trace uses the
# plotly default.
film_titles = movies_df['Title']
fig = go.Figure(data=[
    go.Bar(
        x=film_titles,
        y=movies_df['Domestic_Box_Office'],
        name='Domestic Box Office',
        marker_color='blue',
    ),
    go.Bar(
        x=film_titles,
        y=movies_df['International_Box_Office'],
        name='International Box Office',
    ),
])
fig.update_layout(
    title="Domestic vs International Box Office",
    barmode='group',
    xaxis_tickangle=-45,
    autosize=False,
    width=1300,
    height=700,
)
fig.update_traces(textangle=270)
fig.show()

In [6]:
# Budget of each film as red bars, with the value printed on every bar.
fig = px.bar(
    movies_df,
    x='Title',
    y='Budget',
    title="Budget by Movie Title",
    color_discrete_sequence=["red"],
    text_auto='.2s',
)
fig.update_layout(
    barmode='group',
    xaxis_tickangle=-45,
    autosize=False,
    width=1300,
    height=700,
)
# Rotate the on-bar text so it runs along the bars.
fig.update_traces(textangle=270)

fig.show()

In [7]:
# Worldwide gross of each film as green bars, with the value printed on
# every bar.
fig = px.bar(
    movies_df,
    x='Title',
    y='Worldwide_Box_Office',
    title="Worldwide Box Office by Movie Title",
    color_discrete_sequence=["green"],
    text_auto='.2s',
)
fig.update_layout(
    barmode='group',
    xaxis_tickangle=-45,
    autosize=False,
    width=1300,
    height=700,
)
# Rotate the on-bar text so it runs along the bars.
fig.update_traces(textangle=270)
fig.show()

In [8]:
# Stacked bars per film: Budget (red) with Profit (green) on top. Because
# Profit is worldwide gross minus budget, the full bar height equals the
# worldwide gross.
fig = px.bar(
    movies_df,
    x="Title",
    y=['Budget', "Profit"],
    title="Budget vs Profit",
    color_discrete_sequence=["red", "green"],
    text_auto='0.2s',
)
fig.update_layout(
    barmode='stack',
    xaxis_tickangle=-45,
    autosize=False,
    width=1300,
    height=700,
)
fig.update_traces(width=0.5, textposition='outside', textangle=270)
fig.show()

In [9]:
# Assign MCU phases.
# Re-number the rows 0..n-1 in release order so the row ranges below line up.
movies_df = movies_df.reset_index(drop=True)

# (first row, last row, phase) -- .loc slices include both end labels.
phase_row_ranges = [
    (0, 5, 1),
    (6, 11, 2),
    (12, 21, 3),
    (22, 30, 4),
]
for first_row, last_row, phase_number in phase_row_ranges:
    movies_df.loc[first_row:last_row, 'Phase'] = phase_number

# Persist the labelled table (tab-separated, UTF-8).
movies_df.to_csv('marvel_box_office.csv', sep='\t', encoding='utf-8')

In [10]:
# Phase 1 films: row labels 0-5 of the date-sorted, re-indexed movies_df.
phase_1 = movies_df.loc[0:5]

# Worldwide box office per Phase 1 film with an overall OLS trendline;
# marker size encodes Profit.
fig_p1 = px.scatter(phase_1, x=phase_1.index, y="Worldwide_Box_Office", size='Profit',
                    trendline="ols", trendline_scope="overall",
                    color_discrete_sequence=["blue", "red"],
                    title="Phase One Worldwide Box Office by Movie Title")
fig_p1.update_layout(
    autosize=False,
    width=800,
    height=500,
    xaxis=dict(
        tickmode='array',
        tickvals=[0, 1, 2, 3, 4, 5],
        # Labels must follow the Release_Date order of the rows:
        # Iron Man 2 (2010) is row 2 and Thor (2011) is row 3.
        ticktext=['Iron Man', 'The Incredible Hulk', 'Iron Man 2', 'Thor',
                  'Captain America', 'The Avengers'],
        title='Movie Title',
    ),
)

fig_p1.show()

In [11]:
# Phase 2 films (row labels 6-11): worldwide box office per film with an
# overall OLS trendline; marker size encodes Profit.
phase_2 = movies_df.loc[6:11]

phase_2_xaxis = dict(
    title='Movie Title',
    tickmode='array',
    tickvals=list(range(6, 12)),
    ticktext=[
        'Iron Man 3',
        'Thor: The Dark World',
        'Captain America Winter Soldier',
        'Guardians of the Galaxy',
        "Avengers: Age of Ultron",
        "Ant Man",
    ],
)

fig = px.scatter(
    phase_2,
    x=phase_2.index,
    y="Worldwide_Box_Office",
    size='Profit',
    title="Phase Two Worldwide Box Office by Movie Title",
    color_discrete_sequence=["orange", "red"],
    trendline="ols",
    trendline_scope="overall",
)
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin={},
    xaxis=phase_2_xaxis,
)

fig.show()

In [12]:
# Phase 3 films: row labels 12-21 (ten films) of the date-sorted movies_df.
phase_3 = movies_df.loc[12:21]

# Worldwide box office per Phase 3 film with an overall OLS trendline;
# marker size encodes Profit.
fig = px.scatter(phase_3, x=phase_3.index, y="Worldwide_Box_Office", size='Profit',
                 trendline="ols", trendline_scope="overall",
                 color_discrete_sequence=["green", "red"],
                 title="Phase Three Worldwide Box Office by Movie Title")
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    xaxis=dict(
        tickmode='array',
        # One tick per row of phase_3 (12..21 inclusive). The previous list
        # stopped at 20 and omitted Avengers: Infinity War, which shifted
        # every label after Black Panther onto the wrong film.
        tickvals=list(range(12, 22)),
        ticktext=['Captain America Civil War', 'Doctor Strange',
                  'Guardians of the Galaxy Vol 2', "Spider-Man: Homecoming",
                  'Thor: Ragnarok', "Black Panther", "Avengers: Infinity War",
                  "Ant-Man and the Wasp", 'Captain Marvel', 'Avengers: Endgame'],
        title='Movie Title',
    ),
)

fig.show()


In [13]:
# Phase 4 films (row labels 22-30): worldwide box office per film with an
# overall OLS trendline; marker size encodes Profit.
phase_4 = movies_df.loc[22:30]

phase_4_xaxis = dict(
    title='Movie Title',
    tickmode='array',
    tickvals=list(range(22, 31)),
    ticktext=[
        'Spider-Man: Far From Home',
        'Black Widow',
        'Shang-Chi',
        'Eternals',
        'Spider-Man: No Way Home',
        "Doctor Strange: MOM",
        'Thor: Love and Thunder',
        "Black Panther: Wakanda Forever",
        "Ant-Man and the Wasp: Quantumania",
    ],
)

fig = px.scatter(
    phase_4,
    x=phase_4.index,
    y="Worldwide_Box_Office",
    size='Profit',
    title="Phase Four Worldwide Box Office by Movie Title",
    color_discrete_sequence=["red", "red"],
    trendline="ols",
    trendline_scope="overall",
)
fig.update_layout(
    autosize=False,
    width=800,
    height=500,
    margin={},
    xaxis=phase_4_xaxis,
)

fig.show()

In [14]:
# One facet per phase (facet_col), each with its own OLS trendline; marker
# size encodes Profit.
fig = px.scatter(movies_df, x=movies_df.index, y="Worldwide_Box_Office", size="Profit",
                 facet_col="Phase", color='Phase', trendline="ols")

# Un-link the facets' x-axes so each shows only its own rows, and make every
# facet draw its own tick labels.
fig.update_xaxes(matches=None)
fig.for_each_xaxis(lambda xaxis: xaxis.update(showticklabels=True))

# Tick labels follow the Release_Date order of the rows in movies_df.
# Phase 1: Iron Man 2 (row 2) precedes Thor (row 3).
# Phase 3: ten rows (12..21), including Avengers: Infinity War at row 18.
fig.update_layout(
    height=500, width=1200,
    xaxis=dict(
        tickmode='array',
        tickvals=[0, 1, 2, 3, 4, 5],
        ticktext=['Iron Man', 'The Incredible Hulk', 'Iron Man 2', 'Thor',
                  'Captain America', 'The Avengers']),
    xaxis2=dict(
        tickmode='array',
        tickvals=[6, 7, 8, 9, 10, 11],
        ticktext=['Iron Man 3', 'Thor: The Dark World',
                  'Captain America Winter Soldier', 'Guardians of the Galaxy',
                  "Avengers: Age of Ultron", "Ant Man"]),
    xaxis3=dict(
        tickmode='array',
        tickvals=[12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
        ticktext=['Captain America Civil War', 'Doctor Strange',
                  'Guardians of the Galaxy Vol 2', "Spider-Man: Homecoming",
                  'Thor: Ragnarok', "Black Panther", "Avengers: Infinity War",
                  "Ant-Man and the Wasp", 'Captain Marvel', 'Avengers: Endgame']),
    xaxis4=dict(
        tickmode='array',
        tickvals=[22, 23, 24, 25, 26, 27, 28, 29, 30],
        ticktext=['Spider-Man: Far From Home', 'Black Widow', 'Shang-Chi', 'Eternals',
                  'Spider-Man: No Way Home', "Doctor Strange: MOM", 'Thor: Love and Thunder',
                  "Black Panther: Wakanda Forever", "Ant-Man and the Wasp: Quantumania"]),
    title_text="Worldwide Box Office with phase trendlines and size of dot shows how profitable")
fig.update_xaxes(tickangle=45, title="Movie Title")
fig.update_yaxes(title="Worldwide Box Office")

fig.show()

In [15]:
# Disabled experiment, kept for reference: the Phase 4 scatter with rows 22
# and 26 (the two Spider-Man films in phase_4) dropped before fitting the
# trendline.
# no_spider1 = phase_4.drop([22, 26])
# fig = px.scatter(no_spider1, x= no_spider1.index, y="Worldwide_Box_Office", size = 'Profit',
#                  trendline="ols", trendline_scope="overall",
#                  color_discrete_sequence=["red", "red"])
# fig.update_layout(
#             autosize=False,
#     width=1000,
#     height=500,
#     margin=dict(
#     ),
#     xaxis = dict(
#         tickmode = 'array',
#         tickvals = [23,24,25,27,28,29,30],
#         ticktext = [ 'Black Widow', 'Shang-Chi', 'Eternals',"Doctor Strange: MOM",
#                     'Thor: Love and Thunder',"Black Panther: Wakanda Forever",
#                       "Ant-Man and the Wasp: Quantumania"],
#         title = 'Movie Title'
#     )
# )

# fig.show()

In [16]:
# Profit of every film, one subplot per phase in a 2x2 grid.
fig = make_subplots(rows=2, cols=2, start_cell="top-left",
                    subplot_titles=("Phase 1", "Phase 2", "Phase 3", "Phase 4"))

# (frame, legend name, grid row, grid column). Each trace carries its own
# phase name; previously all four were labelled "Phase 1" in the legend.
phase_panels = [
    (phase_1, "Phase 1", 1, 1),
    (phase_2, "Phase 2", 1, 2),
    (phase_3, "Phase 3", 2, 1),
    (phase_4, "Phase 4", 2, 2),
]
for phase_frame, phase_name, grid_row, grid_col in phase_panels:
    fig.add_trace(
        go.Scatter(x=phase_frame.index, y=phase_frame['Profit'], name=phase_name),
        row=grid_row, col=grid_col,
    )

# Tick labels follow the Release_Date order of the rows in each phase frame.
# Phase 1: Iron Man 2 (row 2) precedes Thor (row 3).
# Phase 3: ten rows (12..21), including Avengers: Infinity War at row 18.
fig.update_layout(
    height=1000, width=1000,
    xaxis=dict(
        tickmode='array',
        tickvals=[0, 1, 2, 3, 4, 5],
        ticktext=['Iron Man', 'The Incredible Hulk', 'Iron Man 2', 'Thor',
                  'Captain America', 'The Avengers']),
    xaxis2=dict(
        tickmode='array',
        tickvals=[6, 7, 8, 9, 10, 11],
        ticktext=['Iron Man 3', 'Thor: The Dark World',
                  'Captain America Winter Soldier', 'Guardians of the Galaxy',
                  "Avengers: Age of Ultron", "Ant Man"]),
    xaxis3=dict(
        tickmode='array',
        tickvals=[12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
        ticktext=['Captain America Civil War', 'Doctor Strange',
                  'Guardians of the Galaxy Vol 2', "Spider-Man: Homecoming",
                  'Thor: Ragnarok', "Black Panther", "Avengers: Infinity War",
                  "Ant-Man and the Wasp", 'Captain Marvel', 'Avengers: Endgame']),
    xaxis4=dict(
        tickmode='array',
        tickvals=[22, 23, 24, 25, 26, 27, 28, 29, 30],
        ticktext=['Spider-Man: Far From Home', 'Black Widow', 'Shang-Chi', 'Eternals',
                  'Spider-Man: No Way Home', "Doctor Strange: MOM", 'Thor: Love and Thunder',
                  "Black Panther: Wakanda Forever", "Ant-Man and the Wasp: Quantumania"]),
    title_text="Profit of each Movie by Phase")
fig.update_xaxes(tickangle=45)
fig.show()

In [17]:
# Average every numeric column per phase.
# numeric_only=True is required because movies_df also holds non-numeric
# columns (Title, Release_Date): without it, pandas >= 2.0 raises TypeError
# and older versions emit a FutureWarning while dropping them silently.
# as_index=False keeps 'Phase' as a regular column for plotting.
four_phase_avg = movies_df.groupby('Phase', as_index=False).mean(numeric_only=True)

# Same averages with the two Phase 4 Spider-Man films (rows 22 and 26)
# removed first.
no_spider2 = movies_df.drop([22, 26])
no_spider_avg = no_spider2.groupby('Phase', as_index=False).mean(numeric_only=True)

In [18]:
# Average profit per phase with an overall OLS trendline; marker size also
# encodes the average profit.
phase_axis = dict(
    tickmode='array',
    tickvals=[1, 2, 3, 4],
    ticktext=['One', 'Two', 'Three', 'Four'],
)
fig = px.scatter(
    four_phase_avg,
    x='Phase',
    y="Profit",
    size='Profit',
    trendline="ols",
    trendline_scope="overall",
)
fig.update_layout(
    autosize=False,
    width=1000,
    height=500,
    margin={},
    xaxis=phase_axis,
)
fig.show()

In [19]:


# Slice labels shared by all four pies.
labels = ['Phase 1', 'Phase 2', 'Phase 3', 'Phase 4']

# Colour sequence applied to the slices of every pie.
colors = px.colors.sequential.RdBu

# 2x2 grid of pies; pie traces need 'domain'-type subplot cells.
specs = [[{'type': 'domain'}] * 2 for _ in range(2)]
fig = make_subplots(rows=2, cols=2, specs=specs,
                    subplot_titles=['Worldwide Box Office',
                                    'Worldwide Box Office No Spiderman',
                                    "Profit Phase", "Profit no Spiderman"])

# Row = metric, column = data set (all films vs. Spider-Man films removed).
pie_metrics = ['Worldwide_Box_Office', 'Profit']
pie_sources = [(four_phase_avg, 'With Spiderman'), (no_spider_avg, 'No Spiderman')]
for grid_row, metric in enumerate(pie_metrics, start=1):
    for grid_col, (phase_averages, trace_name) in enumerate(pie_sources, start=1):
        fig.add_trace(
            go.Pie(labels=labels, values=phase_averages[metric],
                   name=trace_name, marker_colors=colors),
            grid_row, grid_col,
        )

# Hover text, figure title, legend and size.
fig.update_traces(hoverinfo='label+percent+name')
fig.update(layout_title_text='Worldwide Box Office and Profit Phase',
           layout_showlegend=True)
fig.update_layout(autosize=False, width=1000, height=500)

fig.show()