In [1]:
import sqlite3
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import functions as fs
# %matplotlib inline

In [2]:
conn = sqlite3.connect('movies.db')
cur = conn.cursor()

In [3]:
cur.execute("""SELECT mb.release_date, mb.movie, mb.production_budget, mb.domestic_gross, mb.worldwide_gross, tm.genre_ids, tm.popularity, tm.vote_average, tm.vote_count FROM movie_budgets mb left join tmdb_movies tm on mb.movie = tm.title ;
          """)
gross_df = pd.DataFrame(cur.fetchall())
gross_df.columns = [x[0] for x in cur.description]
print(len(gross_df))
gross_df

6191


Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross,genre_ids,popularity,vote_average,vote_count
0,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279","[28, 12, 14, 878]",26.526,7.4,18676
1,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875","[12, 28, 14]",30.579,6.4,8571
2,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350",,,,
3,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963","[28, 12, 878]",44.383,7.3,13457
4,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747","[14, 12, 878, 28]",34.293,7,8534
...,...,...,...,...,...,...,...,...,...
6186,"Dec 31, 2018",Red 11,"$7,000",$0,$0,,,,
6187,"Apr 2, 1999",Following,"$6,000","$48,482","$240,495",,,,
6188,"Jul 13, 2005",Return to the Land of Wonders,"$5,000","$1,338","$1,338",,,,
6189,"Sep 29, 2015",A Plague So Pleasant,"$1,400",$0,$0,,,,


In [4]:
cur.execute("""SELECT * FROM genre_ids;
          """)
genre_df = pd.DataFrame(cur.fetchall())
genre_df.columns = [x[0] for x in cur.description]
print(len(genre_df))
genre_df

19


Unnamed: 0,Unnamed: 1,id,name
0,0,28,Action
1,1,12,Adventure
2,2,16,Animation
3,3,35,Comedy
4,4,80,Crime
5,5,99,Documentary
6,6,18,Drama
7,7,10751,Family
8,8,14,Fantasy
9,9,36,History


In [5]:
def clean_currency(df, column=None):
    """ Used for removing "$" and "," from currency

    Parameters:

    df (str) : Dataframe to perform cleaning function on
    column ( str): Column within dataframe to perform cleaning function on

    Returns:
    pd.Series: Cleaned df['column']
    
    """
    df[f'{column}'] = df[f'{column}'].apply(lambda x: x.replace(',','').replace('$',''))
    return df[f'{column}']

In [6]:
def to_date(df, column=None):
    """ Used for turning strings to date type pandas series

    Parameters:

    df (str) : Dataframe to perform date function on
    column ( str): Column within dataframe to date cleaning function on

    Returns:
    pd.Series: date type df['column']
    
    """
    df[f'{column}'] = pd.to_datetime(df[f'{column}'])
    return df[f'{column}']

In [7]:
def to_numeric(df, column=None):
    """ Used for turning any appropriate column into numeric values

    Parameters:

    df (str) : Dataframe to perform numeric function on
    column ( str): Column within dataframe to perform numeric function on

    Returns:
    pd.Series: Numeric df['column']
    
    """
    df[f'{column}'] = pd.to_numeric(df[f'{column}'])
    return df[f'{column}']

In [8]:
def gid_to_gname():
    """ Used to access each item within a pandas series, enter the list, and replace genre id's with genre category it represents keeping the list intact.

    Parameters:

    None

    Returns:
    pd.Series: Updated series gathered from genre_dict key: value pairs.
    
    """
    for x in gross_df.genre_ids:
        print(x)
        if x != None:
            for i, ele in enumerate(x):
                if ele in genre_dict.keys():
                    x[i] = genre_dict[ele]
                    # print(ele)
                    print(x[i])

In [9]:
def to_date(df, column=None):
    """ Used for turning strings to date type pandas series

    Parameters:

    df (str) : Dataframe to perform date function on
    column ( str): Column within dataframe to date cleaning function on

    Returns:
    pd.Series: date type df['column']
    
    """
    df[f'{column}'] = pd.to_datetime(df[f'{column}'])
    return df[f'{column}']

In [10]:
def find_corr(df, name=None, x=None, y=None):
    """ Used to identify the correlation between two pandas series

    Parameters:

    df (str) : Dataframe to perform correlation function on
    column ( str): Column within dataframe to correlation cleaning function on

    Returns:
    pd.Series: date type df['column']
    
    """
    name = np.corrcoef(df[f'{x}'], df[f'{y}'])
    name = round(name[0,1], 2)
    return name


In [11]:
genre_dict = genre_df.set_index('id')['name'].to_dict()

In [12]:
gross_df = gross_df.sort_values(by='domestic_gross', ascending=False)
# gross_df.genre_ids = gross_df.genre_ids.apply(lambda x: '0' if x == None else x)
gross_df.genre_ids = gross_df.genre_ids.apply(lambda x: x.strip("[]").replace(',','').split(' ') if x != None else x)
gross_df.genre_ids = gross_df.genre_ids.apply(gid_to_gname())
gross_df.genre_ids = gross_df.genre_ids.astype(str)
# gross_df.genre_ids = gross_df.genre_ids.apply(lambda x: x.replace("[",'').replace(']',''))
gross_df.head()

 '10751']
Adventure
Comedy
Family
None
None
None
['53', '80']
Thriller
Crime
['28', '9648', '53']
Action
Mystery
Thriller
['53', '80']
Thriller
Crime
None
['35', '80']
Comedy
Crime
None
None
['35', '99', '28']
Comedy
Documentary
Action
None
['80', '18', '35']
Crime
Drama
Comedy
None
None
None
None
['12', '28', '53', '878']
Adventure
Action
Thriller
Science Fiction
None
None
None
None
None
None
['18']
Drama
None
None
['18', '10752']
Drama
War
None
None
['16', '12', '10751']
Animation
Adventure
Family
None
['35']
Comedy
None
['14', '18', '35', '10749', '10751']
Fantasy
Drama
Comedy
Romance
Family
['14', '18', '35', '10749', '10751']
Fantasy
Drama
Comedy
Romance
Family
['18', '10749']
Drama
Romance
None
['18']
Drama
None
None
None
['12', '35', '14', '10751']
Adventure
Comedy
Fantasy
Family
['18', '35']
Drama
Comedy
['35', '18', '10749']
Comedy
Drama
Romance
None
None
None
['35']
Comedy
['28', '53']
Action
Thriller
['35']
Comedy
None
None
None
None
None
None
None
None
None
None
['16', '12'

TypeError: 'NoneType' object is not callable

In [13]:
gross_df.genre_ids[1]
# type(gross_df.genre_ids[1])


['Adventure', 'Action', 'Fantasy']

In [14]:
to_numeric(gross_df, column='popularity')

349        NaN
1718       NaN
704     20.931
4685       NaN
771     23.922
         ...  
5564     0.885
5565       NaN
5566       NaN
5567       NaN
5103       NaN
Name: popularity, Length: 6191, dtype: float64

In [15]:
currency_list = ['domestic_gross','worldwide_gross','production_budget']

for i in currency_list:
    clean_currency(gross_df, column=i)
    gross_df[i] = pd.to_numeric(gross_df[i])

In [16]:
to_date(gross_df, column='release_date')

349    2008-07-04
1718   2011-02-11
704    2018-08-03
4685   1999-06-04
771    1997-06-13
          ...    
5564   2017-03-07
5565   2009-04-07
5566   2010-03-23
5567   2015-04-07
5103   2013-01-15
Name: release_date, Length: 6191, dtype: datetime64[ns]

In [17]:
find_corr(grosscat, name='wwcor', x='production_budget', y='worldwide_gross')
find_corr(grosscat, name='dcor', x='production_budget', y='domestic_gross')
find_corr(grosscat, name='gcor', x='domestic_gross', y='worldwide_gross')

NameError: name 'grosscat' is not defined

## main fig

In [18]:
grosscat = gross_df.sort_values(by='worldwide_gross', ascending=False).drop_duplicates(subset=['movie'], keep='last')
grosscat = grosscat[(grosscat['domestic_gross'] != 0) & (grosscat['worldwide_gross'] != 0)]
grosscat['profit'] = grosscat.worldwide_gross - grosscat.production_budget
# fig2 = px.scatter(grosscat, x='production_budget', y='worldwide_gross', trendline='ols', trendline_color_override='#bf9f3d')
# fig3 = px.scatter(grosscat, x='production_budget', y='domestic_gross', trendline='ols', trendline_color_override='#663837')
# trendline = fig2.data[1]
# trendline2 = fig3.data[1]

# fig = go.Figure(data=[
#     go.Scatter(name='World Wide', hovertext=grosscat.movie, x=grosscat.production_budget, y=grosscat.worldwide_gross, mode='markers',marker=dict(color='burlywood',size=10,opacity=0.9, line=dict(width=0.5, color='seashell'))),
#     go.Scatter(name='Domestic', hovertext=grosscat.movie, x=grosscat.production_budget, y=grosscat.domestic_gross, mode='markers',marker=dict(color='#b36360',size=10,opacity=0.9, line=dict(width=0.5, color='seashell'))),
# ])
# fig.add_trace(trendline)
# fig.add_trace(trendline2)

# fig.update_layout(title=dict(text='Revenue and Budget by Genre', y=0.98,x=0.5, xanchor='center', yanchor='auto'),plot_bgcolor='lightslategrey',paper_bgcolor='ivory', width=700, height=400)

# fig.update_traces(marker=dict(line=dict(width=2, color='seashell')),
#                   selector=dict(type='bar')) # marker_color="black"
# fig.show()


In [19]:
gross_df[gross_df['movie'] == 'Avatar']

Unnamed: 0,release_date,movie,production_budget,domestic_gross,worldwide_gross,genre_ids,popularity,vote_average,vote_count
0,2009-12-18,Avatar,425000000,760507625,2776345279,"[Action, Adventure, Fantasy, Science Fiction]",26.526,7.4,18676


In [20]:
gross_df.genre_ids = gross_df.genre_ids.astype(str)
fig2 = px.scatter(grosscat[grosscat.profit<0], x='production_budget', y='profit',color='profit', hover_name='movie', color_continuous_scale=px.colors.sequential.turbid)
fig = px.scatter(grosscat[grosscat.profit>0], x="production_budget", y="profit",
	         color='profit', size='profit', color_continuous_scale=px.colors.sequential.Sunset,
                 hover_name="movie", log_x=False,log_y=False, size_max=30)
fig.add_trace(fig2.data[0])
fig.update_layout(title=dict(text='Profit Generated vs. Production Budget', y=0.98,x=0.5, xanchor='center', yanchor='auto'), hoverlabel=dict(bgcolor='#ECD7A7'),plot_bgcolor='lightslategrey',paper_bgcolor='ivory', width=1400, height=800)
fig.show()

In [21]:
franchise = pd.read_csv(r'C:\Users\Juice\Python_Projects\flatiron\class-materials\section01\projects\dsc-phase-1-project-online\unzipped_data\franchises.csv', encoding='latin-1')
# franchise.set_index('Franchise')
franchise.head()


Unnamed: 0,Franchise,Total,Releases,#1 Release,Lifetime Gross
0,Marvel Cinematic Universe,8546269257,25,Avengers: Endgame,858373000
1,Star Wars,5080236177,21,Star Wars: Episode VII - The Force Awakens,936662225
2,Disney Live Action Reimaginings,3407690881,16,The Lion King,543638043
3,J.K. Rowling's Wizarding World,2786941008,26,Harry Potter and the Deathly Hallows: Part 2,381011219
4,Spider-Man,2496998119,9,Spider-Man,403706375


In [22]:
fig = px.scatter(franchise, x='Releases', y="Total",
	         color='Total', size='Total', color_continuous_scale=px.colors.sequential.Sunset,
                 hover_name="Franchise", size_max=30)
fig.update_layout(title=dict(text='Profit Generated vs. Production Budget', y=0.98,x=0.5, xanchor='center', yanchor='auto'),plot_bgcolor='lightslategrey',paper_bgcolor='ivory', width=1400, height=800)
fig.show()

In [23]:
cur.execute("""SELECT tb.primary_title, nb.primary_name, death_year FROM title_crew join title_principals  using (tconst) join name_basics nb using (nconst) join title_basics tb using(tconst) where category = 'director' and death_year == '' group by tconst;
          """)
crew_df = pd.DataFrame(cur.fetchall())
crew_df.columns = [x[0] for x in cur.description]
crew_df.loc[crew_df['primary_name'] == 'Ravi Punj',['primary_name']] = 'James Cameron'
crew_df.loc[crew_df['primary_title'] == 'Avatar 2', ['primary_title']] = 'Avatar'
print(len(crew_df))
crew_df

127728


Unnamed: 0,primary_title,primary_name,death_year
0,The Wandering Soap Opera,Valeria Sarmiento,
1,A Thin Life,Frank Howson,
2,Bigfoot,Mc Jones,
3,O Silêncio,José Manuel Alves Pereira,
4,Pál Adrienn,Ágnes Kocsis,
...,...,...,...
127723,Kuambil Lagi Hatiku,Azhar Kinoi Lubis,
127724,Rodolpho Teóphilo - O Legado de um Pioneiro,Angela Gurgel,
127725,Dankyavar Danka,Kanchan Nayak,
127726,6 Gunn,Kiran Gawade,


In [24]:
crew_df[crew_df['primary_title'] == 'Avatar 2']

Unnamed: 0,primary_title,primary_name,death_year


In [25]:
crew_df[crew_df['primary_name'] == 'JJ Abrahms']


Unnamed: 0,primary_title,primary_name,death_year


Unnamed: 0,original_title,primary_name,death_year
120452,Titanic,TTESTE,


In [26]:
top15dom = gross_df.sort_values(by='domestic_gross', ascending=False).drop_duplicates(subset=['movie'], keep='last').head(15)
top15ww = gross_df.sort_values(by='worldwide_gross', ascending=False).drop_duplicates(subset=['movie'], keep='last').head(15)
low15dom = gross_df[gross_df['domestic_gross'] != 0].sort_values(by='domestic_gross', ascending=True).drop_duplicates(subset=['movie'], keep='last').head(15)
low15ww = gross_df.sort_values(by='worldwide_gross', ascending=True).drop_duplicates(subset=['movie'], keep='last').head(15)

In [27]:
directors = pd.merge(crew_df,grosscat, how='right', left_on='primary_title',right_on='movie')
directors[directors['primary_name'] == 'James Cameron']

Unnamed: 0,primary_title,primary_name,death_year,release_date,movie,production_budget,domestic_gross,worldwide_gross,genre_ids,popularity,vote_average,vote_count,profit
1034,Avatar,James Cameron,,2009-12-18,Avatar,425000000,760507625,2776345279,"[Action, Adventure, Fantasy, Science Fiction]",26.526,7.4,18676,2351345279
2777,Titanic,James Cameron,,1997-12-19,Titanic,200000000,659363944,2208208395,"[Drama, Romance]",50.39,7.9,17632,2008208395


In [28]:
dir_num = directors.groupby('primary_name').mean()
directors = pd.merge(dir_num, directors,how='inner', on='primary_name')

In [43]:
directors = directors.sort_values(by='worldwide_gross_x', ascending=False)
directors = directors[(directors['primary_name'] != 'Atsushi Wada') & (directors['primary_name'] != 'Chi-kin Kwok')]
directors = directors.iloc[:,:7]
directors.columns = ['name','production_budget','domestic_gross','worldwide_gross','popularity','profit','title']
directors.head()

Unnamed: 0,name,production_budget,domestic_gross,worldwide_gross,popularity,profit,title
1030,James Cameron,312500000.0,709935784.5,2492277000.0,38.458,2179777000.0,Avatar
1031,James Cameron,312500000.0,709935784.5,2492277000.0,38.458,2179777000.0,Titanic
170,Anthony Russo,240000000.0,448882263.0,1300869000.0,45.994333,1060869000.0,Captain America: Civil War
171,Anthony Russo,240000000.0,448882263.0,1300869000.0,45.994333,1060869000.0,Avengers: Infinity War
169,Anthony Russo,240000000.0,448882263.0,1300869000.0,45.994333,1060869000.0,Captain America: The Winter Soldier


In [30]:
# director_rank = pd.read_csv(r'C:\Users\Juice\Python_Projects\flatiron\class-materials\section01\projects\dsc-phase-1-project-online\unzipped_data\directors.csv')


In [31]:
# directors = pd.merge(directors,director_rank, how='inner', left_on='primary_name',right_on='name')
# directors.head()

In [32]:
list(directors.production_budget[:15])

AttributeError: 'DataFrame' object has no attribute 'production_budget'

In [33]:
directors[directors['primary_name'] == 'Steven Spielberg'].sort_values(by='worldwide_gross', ascending=False)

KeyError: 'worldwide_gross'

In [47]:
directors = directors.sort_values('worldwide_gross', ascending=False)
# gross_df.genre_ids = gross_df.genre_ids.astype(str)
fig2 = px.scatter(directors[directors.profit<0], x='production_budget', y='profit',color='profit', hover_name='name',hover_data=["title"],color_continuous_scale=px.colors.sequential.turbid)
fig = px.scatter(directors[directors.profit>0], x="production_budget", y="profit",
	         color='profit', size='profit', color_continuous_scale=px.colors.sequential.Sunset,
                 hover_name="name",hover_data=["title"], log_x=False,log_y=False, size_max=40)
fig.add_trace(fig2.data[0])

# fig.add_trace(go.Scatter(
#     x=directors.production_budget.apply(lambda x:x-15000000),
#     y=directors.profit.apply(lambda x:x+100000000),
#     mode="text",
#     text=directors.name[:5],
#     # textposition="top left",
#     hoverinfo='none',
#     showlegend=False,
#     textfont=dict(
#         size=14,
#         color="seashell",
#     )
    

# ))
# fig.add_annotation(
#             x=list(directors.production_budget[:15]),
#             y=list(directors.profit.values[:15]),
#             text=str('fdsfds'))
# fig.update_annotations(dict(
#             xref="x",
#             yref="y",
#             showarrow=True,
#             arrowhead=7,
#             ax=0,
#             ay=-40
# ))
fig.update_layout(title=dict(text='Profit Generated vs. Production Budget Sized by Director Rating', y=0.98,x=0.5, xanchor='center', yanchor='auto'), hoverlabel=dict(bgcolor='#ECD7A7'),plot_bgcolor='lightslategrey',paper_bgcolor='ivory', width=1400, height=800)
fig.show()

In [35]:
directors.head()

Unnamed: 0,primary_name,production_budget_x,domestic_gross_x,worldwide_gross_x,popularity_x,profit_x,primary_title,death_year,release_date,movie,production_budget_y,domestic_gross_y,worldwide_gross_y,genre_ids,popularity_y,vote_average,vote_count,profit_y
1030,James Cameron,312500000.0,709935784.5,2492277000.0,38.458,2179777000.0,Avatar,,2009-12-18,Avatar,425000000,760507625,2776345279,"[Action, Adventure, Fantasy, Science Fiction]",26.526,7.4,18676,2351345279
1031,James Cameron,312500000.0,709935784.5,2492277000.0,38.458,2179777000.0,Titanic,,1997-12-19,Titanic,200000000,659363944,2208208395,"[Drama, Romance]",50.39,7.9,17632,2008208395
170,Anthony Russo,240000000.0,448882263.0,1300869000.0,45.994333,1060869000.0,Captain America: Civil War,,2016-05-06,Captain America: Civil War,250000000,408084349,1140069413,"[Adventure, Action, Science Fiction]",39.137,7.4,14000,890069413
169,Anthony Russo,240000000.0,448882263.0,1300869000.0,45.994333,1060869000.0,Captain America: The Winter Soldier,,2014-04-04,Captain America: The Winter Soldier,170000000,259746958,714401889,"[Action, Adventure, Science Fiction]",18.073,7.7,11034,544401889
171,Anthony Russo,240000000.0,448882263.0,1300869000.0,45.994333,1060869000.0,Avengers: Infinity War,,2018-04-27,Avengers: Infinity War,300000000,678815482,2048134200,"[Adventure, Action, Fantasy]",80.773,8.3,13948,1748134200


In [36]:
5

5

- include size based on ranking or rating of director in one or some of the plots
- maybe scrape actors list and include in scatter or bars