In [79]:
import pandas as pd
import numpy as np
import plotly as py
import plotly.graph_objs as go
import ipywidgets as widgets
import datetime

import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff

# Switch plotly to notebook mode so iplot() renders figures inline in the cells.
init_notebook_mode(connected=True)

from math import floor

In [8]:
# Load the cleaned movie dataset (path is relative to the notebook's folder).
data = pd.read_csv(filepath_or_buffer="../data/cleaned_data.csv")

In [9]:
# Profit per movie: revenue minus budget, stored as a new column.
data["profit"] = data["revenue"].sub(data["budget"])

In [10]:
# Mean profit per "dow" value, using only rows whose year is after 1990.
# reset_index() turns the group key back into an ordinary "dow" column.
profit_by_dow = (
    data.loc[data["year"] > 1990]
    .groupby("dow")["profit"]
    .mean()
    .reset_index()
)

In [16]:
# Weekday labels for the x axis, in the order the bars are expected to appear.
# NOTE(review): labels are matched to bars purely by position, and groupby
# returns its "dow" keys in sorted order, so the chart is only correct if the
# sorted keys run Sunday..Saturday — confirm how "dow" is encoded in the data.
day_labels = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

# Fail loudly instead of drawing mislabelled bars when a weekday has no rows
# (the labels would otherwise shift onto the wrong bars).
assert len(profit_by_dow) == len(day_labels), "expected one profit row per weekday"

bar_data = [go.Bar(x=day_labels, y=profit_by_dow["profit"])]

# Render the bar chart inline with a title and labelled axes.
iplot({ 'data': bar_data,
            'layout': {
               'title': 'Average Profit by Day of Week',
               'xaxis': {
                 'title': 'Day of Week'},
               'yaxis': {
                'title': 'Profit'}
        }})

Takeaway: based on the chart above, Tuesday appears to be the most profitable day of the week to release a movie.

### Violin Plots

In [17]:
def extract_decade(x):
    """Return the decade that year ``x`` falls in, as a label like ``"1990s"``.

    The year is rounded down to the nearest multiple of ten and the
    letter ``"s"`` is appended.
    """
    decade_start = 10 * floor(x / 10)
    return "{}s".format(decade_start)

In [20]:
# Label every movie with its decade (e.g. "1990s"), derived from the "year" column.
data["decade"] = data["year"].map(extract_decade)

In [21]:
# Bind a second name to the same DataFrame object; no copy is made here.
df = data

In [29]:
# Order rows by decade label (ascending, the default); sort_values returns a
# new frame, which df is rebound to.
df = df.sort_values("decade")

In [34]:
# Build one violin trace per decade. groupby(sort=False) yields the decades in
# the order they first appear in df and splits the frame once, instead of
# recomputing pd.unique() and a full boolean mask on every loop iteration.
da = [
    {
        "type": 'violin',
        "x": decade_rows['decade'],
        "y": decade_rows['vote_average'],
        "name": decade,
        # Draw the inner box plot and the mean line inside each violin.
        "box": {
            "visible": True
        },
        "meanline": {
            "visible": True
        }
    }
    for decade, decade_rows in df.groupby('decade', sort=False)
]

fig = {
    "data": da,
    "layout" : {
        "title": "Average Movie Ratings by Decade",
        # tickmode='linear' is the current spelling of the legacy
        # `autotick=False` axis attribute.
        "xaxis" : dict(title = 'Decade', tickmode='linear', showticklabels=True),
        "yaxis" : dict(title = 'Average Rating')
    }
}

# validate=False is preserved from the original call: the plain-dict figure is
# handed to plotly without being validated first.
iplot(fig, validate = False)

In [37]:
# Display the column labels currently present on df.
df.columns

Index(['budget', 'genres', 'keywords', 'original_language', 'overview',
       'popularity', 'production_companies', 'production_countries',
       'release_date', 'revenue', 'runtime', 'spoken_languages',
       'vote_average', 'vote_count', 'title', 'cast', 'crew', 'year', 'month',
       'day', 'dow', 'profit', 'decade'],
      dtype='object')

### Some profit values are very large — in a few cases on the order of billions

In [57]:
# Show the raw profit numbers as an array.
df["profit"].values

array([  8008844, -91969578,  21755000, ..., 125478348,  -6617054,
       -18000000])

### Bubble Chart

In [78]:
# Layout for the budget-vs-profit bubble chart: white grid lines on both axes,
# with the x axis clipped to budgets between 0 and 4e8.
layout = go.Layout(
    title='Profit vs. Budget',
    xaxis=dict(
        title='Budget',
        gridcolor='rgb(255, 255, 255)',
        range=[0, 4e8],
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    ),
    yaxis=dict(
        title='Profit',
        gridcolor='rgb(255, 255, 255)',
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    )
)
# The trace list gets its own name: binding it to `data` would replace the
# movie DataFrame of that name and break any earlier cell that is re-run.
bubble_traces = [go.Scatter(
    x=df.budget.values, # Budget
    y=df.profit.values,  # Profit (revenue minus budget)
    mode='markers',
    text= df.title.values, # Movie titles, shown on hover
    marker=dict(
        # Marker size and colour both encode the movie's average vote.
        size=2.7*(df.vote_average),
        sizeref=1.0,
        color=df.vote_average.values,
        showscale=True,
        colorscale='Viridis'
    ))]
fig = go.Figure(data=bubble_traces, layout=layout)
iplot(fig)