In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Marvel films dataset, then print its first ten rows followed by
# the dtype pandas inferred for each column.
marvel_csv_path = '../input/marvel-films/df_marvel.csv'
df = pd.read_csv(marvel_csv_path)
for preview in (df.head(10), df.dtypes):
    print(preview)

In [None]:
# Parse the US release dates into pandas datetime values.
df['ReleaseDateUS'] = pd.to_datetime(df['ReleaseDateUS'])

# Relabel the 0/1 flag columns as 'No'/'Yes' so tables and legends read better.
# The labels also map onto themselves: a plain {0: 'No', 1: 'Yes'} mapping
# turns already-converted values into NaN when this cell is run a second time,
# whereas this mapping leaves a converted column unchanged.
YES_NO_LABELS = {0: 'No', 1: 'Yes', 'No': 'No', 'Yes': 'Yes'}
for flag_column in ('Animated', 'MCU'):
    df[flag_column] = df[flag_column].map(YES_NO_LABELS)

In [None]:
# Summary statistics: mean and standard deviation of the numeric columns,
# plus the most frequent value of each categorical column.
Categories = [
    'Distributor', 'MCU', 'Production', 'Director',
    'Producer', 'Animated', 'Race', 'Gender',
]
Continuous = [
    'Budget', 'OpeningWeekendNorthAmerica', 'NorthAmerica',
    'OtherTerritories', 'Worldwide', 'Length',
]

# Each entry pairs a printed heading with a callable producing the table, so
# every statistic is computed only after its heading has been printed.
summary_sections = (
    ('Mean:', lambda: df[Continuous].mean()),
    ('\nStandard Deviation:', lambda: df[Continuous].std()),
    # mode() pads columns with fewer tied modes using NaN; dropna() keeps only
    # the rows in which every category has a value.
    ('\n Most common Category:', lambda: df[Categories].mode().dropna()),
)
for heading, compute_table in summary_sections:
    print(heading)
    print(compute_table())

In [None]:
# Pair plots of the dataset, drawn once per simple category; each category
# is used as the hue and gets its own colour palette.
pairplot_palettes = {
    'Animated': 'Dark2',
    'MCU': 'CMRmap',
    'Gender': 'Paired',
    'Race': 'Set3',
}
for hue_column, palette_name in pairplot_palettes.items():
    last_pair_grid = sns.pairplot(df, hue=hue_column, palette=palette_name)
# Keep the final grid as the cell's last expression, as before.
last_pair_grid

In [None]:
# Simple tables showing relationships between columns.
# First: worldwide gross summed per studio and director, printed with the
# directors as rows and the studios as columns.
print('Total gross of each Director, by Studio')
gross_by_studio_and_director = df.pivot_table(
    values='Worldwide',
    index='Distributor',
    columns='Director',
    aggfunc='sum',
)
print(gross_by_studio_and_director.transpose())
# Author's observation: surprisingly, no director has worked on Marvel movies
# for more than one studio.

In [None]:
# Mean worldwide gross per studio (rows), split by the value of the Race
# column (columns).
print('Average gross for each Studio, by Race of Superhero')
mean_gross_by_studio_and_race = df.pivot_table(
    values='Worldwide',
    index='Distributor',
    columns='Race',
    aggfunc='mean',
)
print(mean_gross_by_studio_and_race)
# Author's observation: only 2 studios have produced Marvel movies with
# multiple races (and only 1 studio had an Asian superhero!), and the
# difference in gross varies with both studios.

In [None]:
# Mean worldwide gross per studio (rows), split by the Gender column (columns).
# The heading now names Gender; it previously repeated the Race heading.
print('Average gross for each Studio, by Gender of Superhero')
data = df.pivot_table(index='Distributor', columns='Gender', values='Worldwide', aggfunc='mean')
print(data)
# Author's observation: only 2 studios have produced Marvel movies with female
# leads, and the difference in average gross by gender is staggering.

# Keep only the studios that have a value for every gender, then reshape the
# table to long form (one row per studio/gender pair) for display.
data = data.dropna().stack()
data


In [None]:
# Bar chart of the mean worldwide gross per studio, split by Gender.
# The heading now names Gender; it previously repeated the Race heading.
print('Average gross for each Studio, by Gender of Superhero')

# Studios lacking a value for any gender are dropped before reshaping the
# pivot table to long form (one row per studio/gender pair).
data = (
    df.pivot_table(index='Distributor', columns='Gender', values='Worldwide', aggfunc='mean')
    .dropna()
    .stack()
    .reset_index()
)
data.columns = ['Studio', 'Gender', 'Gross_mil']
# Store the value in millions so the column content matches its name; it was
# previously scaled only inside the plotting call.
data['Gross_mil'] = data['Gross_mil'] / 1_000_000

gender_ax = sns.barplot(data=data, x='Studio', y='Gross_mil', hue='Gender')
gender_ax.set_ylabel('Average worldwide gross (millions)')

In [None]:
# Bar chart of the mean worldwide gross per studio, split by the Animated flag.
# The heading now names the Animated split; it previously repeated the Race
# heading.
print('Average gross for each Studio, by Animated or not')

# Studios lacking a value for either Animated label are dropped before
# reshaping the pivot table to long form (one row per studio/label pair).
data = (
    df.pivot_table(index='Distributor', columns='Animated', values='Worldwide', aggfunc='mean')
    .dropna()
    .stack()
    .reset_index()
)
data.columns = ['Studio', 'Animated', 'Gross_mil']
# Store the value in millions so the column content matches its name; it was
# previously scaled only inside the plotting call.
data['Gross_mil'] = data['Gross_mil'] / 1_000_000

animated_ax = sns.barplot(data=data, x='Studio', y='Gross_mil', hue='Animated', palette='pastel')
animated_ax.set_ylabel('Average worldwide gross (millions)')

In [None]:
# Average grosses by studio and by director, one panel per gross column.
gross_list = ['OpeningWeekendNorthAmerica', 'NorthAmerica', 'OtherTerritories', 'Worldwide']


def _plot_mean_gross_panels(mean_gross, columns, figsize, palette):
    """Draw one horizontal bar chart per gross column on a 2x2 grid.

    Parameters
    ----------
    mean_gross : pandas.DataFrame
        Frame indexed by the group label (studio, director, ...) holding one
        column for each entry of ``columns``.
    columns : list of str
        Columns to plot; at most four, one per panel of the 2x2 grid.
    figsize : tuple
        Figure size handed to ``plt.figure``.
    palette : str
        Name of the seaborn palette used for the bars.

    Returns
    -------
    matplotlib.figure.Figure
        The figure containing the panels.
    """
    fig = plt.figure(figsize=figsize)
    for position, column in enumerate(columns, start=1):
        # Largest value first, so the biggest bar is drawn at the top.
        ranked = mean_gross.sort_values(by=column, ascending=False)
        ax = fig.add_subplot(2, 2, position)
        sns.barplot(data=ranked, x=ranked[column], y=ranked.index, palette=palette, ax=ax)
        # Only the x axis is numeric here: print its ticks as plain numbers.
        ax.ticklabel_format(axis='x', useOffset=False, style='plain')
        ax.tick_params(labelrotation=45)
        # The bars are horizontal, so the gross goes on x and the group labels
        # on y. No monetary limits are applied to the y axis, because it
        # carries the categories rather than the values.
        ax.set_xlabel(column)
        ax.set_ylabel(ranked.index.name or '')
    fig.tight_layout()
    return fig


df2 = df[['Distributor', *gross_list]].groupby('Distributor').mean()
df3 = df[['Director', *gross_list]].groupby('Director').mean()

_plot_mean_gross_panels(df2, gross_list, figsize=(20, 10), palette='bright')
_plot_mean_gross_panels(df3, gross_list, figsize=(20, 30), palette='Spectral');


In [None]:
# Distribution of film running times: 40-bin histogram with a KDE overlay.
length_ax = plt.gca()
length_ax.set_xlabel('Length in Minutes')
sns.histplot(x=df['Length'], bins=40, kde=True, color='red', ax=length_ax)
    

In [None]:
# Distribution of film budgets, in millions: 40-bin histogram with a KDE overlay.
budget_ax = plt.gca()
budget_ax.set_xlabel('Budget(million)')
budget_in_millions = df['Budget'] / 1000000
sns.histplot(x=budget_in_millions, bins=40, kde=True, color='green', ax=budget_ax)

In [None]:
# How the budget has changed over the years, with one line per value of the
# Production column.
budget_fig, budget_time_ax = plt.subplots(figsize=(15, 10))
sns.lineplot(
    x=df['ReleaseDateUS'],
    y=df['Budget'] / 1_000_000,  # plotted in millions
    hue=df['Production'],
    ax=budget_time_ax,
)
# Axis label spelling corrected (was 'Milllion').
budget_time_ax.set_ylabel('Budget(Million)')
budget_fig.tight_layout()

In [None]:
# Lifetime (summed) grosses by studio, director and producer, one panel per
# gross column.
gross_list = ['OpeningWeekendNorthAmerica', 'NorthAmerica', 'OtherTerritories', 'Worldwide']


def _plot_lifetime_gross_panels(total_gross, columns, figsize, palette):
    """Draw one horizontal bar chart of lifetime gross per column on a 2x2 grid.

    Parameters
    ----------
    total_gross : pandas.DataFrame
        Frame indexed by the group label (studio, director, producer) holding
        one summed column for each entry of ``columns``.
    columns : list of str
        Columns to plot; at most four, one per panel of the 2x2 grid.
    figsize : tuple
        Figure size handed to ``plt.figure``.
    palette : str
        Name of the seaborn palette used for the bars.

    Returns
    -------
    matplotlib.figure.Figure
        The figure containing the panels.
    """
    fig = plt.figure(figsize=figsize)
    for position, column in enumerate(columns, start=1):
        # Largest value first, so the biggest bar is drawn at the top.
        ranked = total_gross.sort_values(by=column, ascending=False)
        ax = fig.add_subplot(2, 2, position)
        # Bars are drawn in billions. No monetary limits are applied to the
        # y axis, because it carries the group labels rather than the values.
        sns.barplot(
            data=ranked,
            x=ranked[column] / 1_000_000_000,
            y=ranked.index,
            palette=palette,
            ax=ax,
        )
        ax.tick_params(labelrotation=45)
        ax.set_title('Lifetime Gross(bil)')
    fig.tight_layout()
    return fig


df2 = df[['Distributor', *gross_list]].groupby('Distributor').sum()
df3 = df[['Director', *gross_list]].groupby('Director').sum()
df4 = df[['Producer', *gross_list]].groupby('Producer').sum()

_plot_lifetime_gross_panels(df2, gross_list, figsize=(20, 10), palette='Set3')
_plot_lifetime_gross_panels(df3, gross_list, figsize=(20, 30), palette='viridis')
_plot_lifetime_gross_panels(df4, gross_list, figsize=(20, 30), palette='gist_rainbow');

In [None]:
# Lastly, the ten films with the highest percentage of their worldwide gross
# coming from North America.
df['US%Gross'] = df['NorthAmerica'].div(df['Worldwide']).mul(100)
df.sort_values('US%Gross', ascending=False).head(10)

In [None]:
# The ten films with the lowest percentage of their worldwide gross coming
# from North America.
north_america_share = df['NorthAmerica'] / df['Worldwide']
df['US%Gross'] = north_america_share * 100
df.sort_values('US%Gross', ascending=True).head(10)

In [None]:

# Bar chart of the twenty films with the highest percentage of their
# worldwide gross coming from North America.
df['US%Gross'] = df['NorthAmerica'].div(df['Worldwide']).mul(100)
df2 = df.sort_values('US%Gross', ascending=False).head(20)
plt.figure(figsize=(20, 20))
sns.barplot(data=df2, x='US%Gross', y='Title')