In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.ticker as mtick
import pickle
import re
from collections import Counter

from ipywidgets import *
from IPython.display import display
import plotly.graph_objects as go

from sklearn.decomposition import PCA

%config InlineBackend.figure_format = 'retina'

#### _This notebook was used for exploratory analysis of all data, including: names, principals, titles, movies, and Academy Awards._

***
### Step 0: Setup

#### Customized Colormap and Color Palette

In [2]:
# Nine shades of "gold", ordered lightest to darkest
# (picked with help from icolorpalette.com).
colors_ext = [
    '#f5edd3', '#ecddab', '#e4cd82',
    '#dbbd5a', '#d2ad32', '#ae8e25',
    '#866d1c', '#5e4c14', '#352b0b',
]
# Three mid-range shades; all three also appear in the extended list above.
colors = ['#dbbd5a', '#d2ad32', '#ae8e25']

# Names given to the two continuous colormaps.
name = 'golds_ext'
cmap_name_ = 'golds'

# Continuous colormaps, each interpolated to 100 levels.
golds_ext = LinearSegmentedColormap.from_list(name, colors_ext, N=100)
golds = LinearSegmentedColormap.from_list(cmap_name_, colors, N=100)

# Discrete seaborn palettes built from the same hex lists
# (capitalised names = palettes, lowercase names = colormaps).
Golds = sns.color_palette(colors)
Golds_ext = sns.color_palette(colors_ext)

#### Reading in datasets

In [3]:
# All inputs are read from ../data relative to the notebook.
name_basics = pd.read_csv('../data/name_basics_ce.csv')

# The ratings file is tab-separated; both title tables are indexed by tconst.
title_ratings = pd.read_csv(
    '../data/title.ratings.tsv',
    sep='\t',
    index_col='tconst',
)
title_basics = pd.read_csv(
    '../data/title_basics_c.csv',
    index_col='tconst',
)

title_principals = pd.read_csv(
    '../data/title_principals_ce.csv',
    low_memory=True,
)

# low_memory=False makes pandas infer each column's dtype from the whole file.
movies = pd.read_csv(
    '../data/movies_1990_2020_with_detail_oscars_complete.csv',
    index_col='tconst',
    low_memory=False,
)

***
### Step 1: Analysis of names data (1910-2020)

In [4]:
# Flatten the comma-separated `primaryProfession` strings into one flat list
# with one entry per (person, profession) mention. Non-string cells (e.g. NaN
# for people without a listed profession) are skipped.
# Iterating over the column values directly avoids the per-row label lookups
# (`name_basics[col][i]`, which also assumed a default 0..n-1 index) and the
# re-splitting of the same string once per profession it contains.
professions = [
    profession
    for entry in name_basics['primaryProfession']
    if isinstance(entry, str)
    for profession in entry.split(',')
]

In [5]:
# Share held by each of the ten most common professions, plus the running
# (cumulative) share of the top-k professions.
# The denominator is the number of profession mentions collected in
# `professions`, so a person listing several professions contributes to it
# several times.
ctr_prof = Counter(professions)
total_mentions = sum(ctr_prof.values())  # hoisted: constant across iterations
cnts = []
# enumerate() over most_common(10) also copes with fewer than ten professions.
for i, (prof, cnt) in enumerate(ctr_prof.most_common(10)):
    cnts.append(cnt)
    # `.2%` formats the ratio directly; `round(x, 4) * 100` leaked binary
    # floating-point noise into the output (e.g. 62.739999999999995%).
    print(f'Profession "{prof}" accounts for {cnt / total_mentions:.2%} of all names in the dataset.')
    print(f'The top {i+1} professions account for {sum(cnts) / total_mentions:.2%} of all names in the dataset.\n')

Profession "actor" accounts for 33.96% of all names in the dataset.
The top 1 professions account for 33.96% of all names in the dataset.

Profession "miscellaneous" accounts for 9.36% of all names in the dataset.
The top 2 professions account for 43.32% of all names in the dataset.

Profession "producer" accounts for 7.95% of all names in the dataset.
The top 3 professions account for 51.27% of all names in the dataset.

Profession "writer" accounts for 6.02% of all names in the dataset.
The top 4 professions account for 57.29% of all names in the dataset.

Profession "camera_department" accounts for 5.45% of all names in the dataset.
The top 5 professions account for 62.739999999999995% of all names in the dataset.

Profession "director" accounts for 4.91% of all names in the dataset.
The top 6 professions account for 67.64% of all names in the dataset.

Profession "art_department" accounts for 3.17% of all names in the dataset.
The top 7 professions account for 70.82000000000001% of

In [6]:
# Names of the twenty most frequent professions, most common first.
top_twenty_prof = [entry[0] for entry in ctr_prof.most_common(20)]

In [None]:
# Count men and women per profession for the twenty most common professions.
#
# A profession is matched as a whole comma-separated token, consistent with
# how `professions` was built (split on ","). A plain substring test would
# also count any longer label that merely contains the name (e.g. 'director'
# inside something like 'assistant_director').
# `na=False` treats people without a listed profession as non-matches instead
# of letting NaN leak into the boolean mask.
is_male = name_basics['gender'] == 'M'
is_female = name_basics['gender'] == 'F'

top_twenty_m_f = []
for profession in top_twenty_prof:
    token_pattern = r'(?:^|,)' + re.escape(profession) + r'(?:,|$)'
    has_profession = name_basics['primaryProfession'].str.contains(token_pattern, regex=True, na=False)
    top_twenty_m_f.append({
        'profession': profession,
        'M': int((is_male & has_profession).sum()),
        'F': int((is_female & has_profession).sum()),
    })

In [None]:
top_twenty_m_f

In [None]:
# One row per profession with raw M / F counts, plus each gender's share of
# the M + F total. Despite the `_perc` suffix the shares are fractions in
# [0, 1], not percentages.
top_twenty_m_f_df = pd.DataFrame(top_twenty_m_f)
gendered_total = top_twenty_m_f_df['M'] + top_twenty_m_f_df['F']
for gender in ('M', 'F'):
    top_twenty_m_f_df[f'{gender}_perc'] = top_twenty_m_f_df[gender] / gendered_total

In [None]:
top_twenty_m_f_df.head()

In [None]:
# Re-wrap the three-colour gold palette and show its hex codes as the cell output.
pal = sns.color_palette(Golds)
pal.as_hex()

In [None]:
# 100%-stacked bar chart: share of men (bottom) and women (stacked on top)
# within each of the twenty most common professions.
fig, ax = plt.subplots(figsize=(17, 7), facecolor='#1d1d1d', edgecolor='#1d1d1d')
ax.set_title('Gender balance across professions / departments', fontsize=20, color='#dfc56e', pad=20)

ax.bar(top_twenty_m_f_df['profession'], top_twenty_m_f_df['M_perc'],
       color='#ae8e25', alpha=1, label='male professionals')
ax.bar(top_twenty_m_f_df['profession'], top_twenty_m_f_df['F_perc'],
       bottom=top_twenty_m_f_df['M_perc'], color='#dbbd5a', alpha=1, label='female professionals')

# Format the y axis through a formatter rather than overwriting the tick
# labels with a fixed list, which can get out of sync with the tick positions
# and makes matplotlib warn about labels set without fixed ticks.
ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
ax.tick_params(axis='x', labelrotation=90, labelsize=11, labelcolor='#ebdaa4')
ax.tick_params(axis='y', labelsize=11, labelcolor='#ebdaa4')
ax.legend()

# Frameless look on the dark background.
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)
ax.set_facecolor('#1d1d1d')
fig.tight_layout()

fig.savefig('../visualizations/gndr_blnc_cat.png', facecolor=fig.get_facecolor(), edgecolor='none')

***
### Step 2: Analysis of title ratings (1910-2020)

In [None]:
title_ratings.head()

In [None]:
# Distribution of the average rating across all rated titles (30 bins).
# Bars are fully opaque with a thin black outline.
rating_hist_style = dict(alpha=1, linewidth=1, edgecolor="k")

plt.figure(figsize=(12, 5))
plt.title('Average ratings of all movies (1910-2020)')
sns.distplot(
    title_ratings['averageRating'],
    bins=30,
    color='#dbbd5a',
    hist_kws=rating_hist_style,
);

_Exporting a dictionary of ratings per title (tconst) to easily integrate average rating with the data (post cleaning and engineering, in EDA). This information will not be used in predicting Oscars because most of the ratings are likely to have been collected long after the awards were decided, and may be influenced by the Academy decisions._

In [None]:
# Map every title id (tconst, the frame's index) to its average rating and
# persist the mapping so other notebooks can join ratings onto their data.
rating = dict(zip(title_ratings.index, title_ratings['averageRating']))

# The context manager closes the file even if pickling fails; the bare
# `open(...)` used before left the handle for the garbage collector.
with open('../pickles/rating.p', 'wb') as fh:
    pickle.dump(rating, fh)

***
### Step 2 (continued): Analysis of titles data (1910-2020)

In [None]:
# Number of titles of any type per start year.
# Uses an explicit figure/axes pair and sets each property once; the previous
# version set the x label and the x ticks twice.
fig, ax = plt.subplots(figsize=(17, 5), facecolor='#1d1d1d', edgecolor='#1d1d1d')
sns.countplot(data=title_basics, x='startYear', color='gold', ax=ax)

ax.set_title('Count of titles produced by year', fontsize=20, color='#dfc56e', pad=20)
ax.set_xlabel('year', fontsize=11, color='#ebdaa4')
ax.set_ylabel('titles (incl. movies, series, TV specials, etc.)', fontsize=11, color='#ebdaa4')
ax.tick_params(axis='x', labelrotation=90, labelsize=7, labelcolor='#ebdaa4')
ax.tick_params(axis='y', labelsize=10, labelcolor='#ebdaa4')

# Frameless look on the dark background.
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)
ax.set_facecolor('#1d1d1d')
fig.tight_layout();

In [None]:
title_basics['titleType'].value_counts()

In [None]:
# Keep only the rows whose titleType is exactly 'movie'.
is_movie_title = title_basics['titleType'] == 'movie'
movie_titles = title_basics[is_movie_title]

In [None]:
# Number of movies per start year.
# The figure is bound to `fig` here so the savefig call below refers to THIS
# figure; before, `fig` was whatever an earlier cell had left behind.
fig, ax = plt.subplots(figsize=(17, 7), facecolor='#1d1d1d', edgecolor='#1d1d1d')
sns.countplot(data=movie_titles, x='startYear', color='#c29e29', ax=ax)

ax.set_title('Number of movies produced worldwide, by year', fontsize=20, color='#dfc56e', pad=20)
ax.tick_params(axis='x', labelrotation=90, labelsize=7, labelcolor='#ebdaa4')
ax.tick_params(axis='y', labelsize=10, labelcolor='#ebdaa4')
ax.set_xlabel('year', fontsize=11, color='#ebdaa4')
ax.set_ylabel('movies', fontsize=11, color='#ebdaa4')

# Frameless look on the dark background.
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)
ax.set_facecolor('#1d1d1d')
fig.tight_layout()

fig.savefig('../visualizations/mvs_yr.png', facecolor=fig.get_facecolor(), edgecolor='none')

In [None]:
# Number of movies per multi-year window.
# `Series.between` is inclusive on both ends, so the windows are defined to
# share no boundary year. The previous 1990-1995 / 1995-2000 / ... windows
# counted 1995, 2000, 2005, 2010 and 2015 twice.
# The last window runs through 2020 and therefore covers six years.
is_movie = title_basics['titleType'] == 'movie'
year_windows = [
    (1990, 1994), (1995, 1999), (2000, 2004),
    (2005, 2009), (2010, 2014), (2015, 2020),
]
for first_year, last_year in year_windows:
    in_window = title_basics['startYear'].between(first_year, last_year)
    n_movies = int((is_movie & in_window).sum())
    print(f"{n_movies} movies were made between {first_year} and {last_year}.")

***
### Step 3: Analysis of principals data (1910-2020)

In [None]:
# Remove every column whose name starts with 'Unn' (presumably the
# 'Unnamed: N' index columns left over from earlier CSV round-trips).
unnamed_cols = [col for col in title_principals.columns if col.startswith('Unn')]
title_principals.drop(columns=unnamed_cols, inplace=True)

In [None]:
# Cast the title year to plain integers so the year axis in the charts below
# is labelled with whole years.
# NOTE(review): astype(int) raises if the column contains NaN; presumably the
# cleaned input has none. Confirm.
title_principals['titleYear'] = title_principals['titleYear'].astype(int)

In [None]:
# Bar per title year, counting the rows credited as 'director'.
# NOTE(review): this counts director credits; if a title can have several
# director rows it is counted once per row. Confirm against the input data.
fig, ax = plt.subplots(figsize=(17, 7), facecolor='#1d1d1d', edgecolor='#1d1d1d')
ax.set_title('Number of movies made each year (1910-2020)', fontsize=20, color='#dfc56e', pad=20)
sns.countplot(data=title_principals[title_principals['category'] == 'director'], x='titleYear',
              dodge=False, color='#dbbd5a', linewidth=1, saturation=1, ax=ax)

ax.tick_params(axis='x', labelrotation=90, labelsize=7, labelcolor='#ebdaa4')
ax.tick_params(axis='y', labelsize=10, labelcolor='#ebdaa4')
ax.set_xlabel('year', fontsize=11, color='#ebdaa4')
ax.set_ylabel('movies', fontsize=11, color='#ebdaa4')
# Whole-number y labels via a formatter instead of a fixed label list, which
# can drift out of sync with the tick positions.
ax.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))

# Frameless look on the dark background.
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)
ax.set_facecolor('#1d1d1d')
fig.tight_layout()

# `fig` now refers to this figure rather than a leftover from an earlier cell.
# NOTE(review): this path is also written by the "movies produced worldwide"
# chart earlier in the notebook, so on a full run this image replaces that
# one. Decide which chart should own 'mvs_yr.png' and rename the other.
fig.savefig('../visualizations/mvs_yr.png', facecolor=fig.get_facecolor(), edgecolor='none')

In [None]:
# Director credits per title year, split by the director's gender.
fig, ax = plt.subplots(figsize=(17, 7), facecolor='#1d1d1d', edgecolor='#1d1d1d')
ax.set_title('Number of movies made each year, by gender of director', fontsize=20, color='#dfc56e', pad=20)

# With dodge=False the bars of both genders share the same x position and
# overlap. The draw order is fixed explicitly (male first, female on top) so
# the result no longer depends on which gender happens to appear first in the
# data; this keeps the female bars visible whenever they are the shorter ones.
sns.countplot(data=title_principals[title_principals['category'] == 'director'], x='titleYear',
              hue='gender', hue_order=['M', 'F'], dodge=False,
              palette={'F': '#dbbd5a', 'M': '#ae8e25'}, linewidth=1, saturation=1, ax=ax)

ax.tick_params(axis='x', labelrotation=90, labelcolor='#ebdaa4')
ax.tick_params(axis='y', labelsize=10, labelcolor='#ebdaa4')
ax.set_xlabel('year', fontsize=11, color='#ebdaa4')
ax.set_ylabel('movies', fontsize=11, color='#ebdaa4')

# Frameless look on the dark background.
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)
ax.set_facecolor('#1d1d1d')
fig.tight_layout()

# `fig` is this cell's figure, not a leftover global from an earlier cell.
fig.savefig('../visualizations/mvs_yr_gndr.png', facecolor=fig.get_facecolor(), edgecolor='none')

In [None]:
def titles_by_gender(category = 'director'):
    """Plot the yearly female / male split of credits for one principal category.

    Reads the module-level `title_principals` frame, keeps the rows whose
    `category` equals `category`, counts `tconst` per (titleYear, gender) and
    draws a 100%-stacked bar chart (female share at the bottom, male share on
    top) with one bar per year.

    Parameters
    ----------
    category : str
        Value of the `category` column to plot, e.g. 'director'.
    """
    in_category = title_principals[title_principals['category'] == category]

    # Rows = years, columns = genders. A year in which one gender has no
    # credits gets 0 rather than NaN; with NaN both proportions of that year
    # became NaN and the whole bar vanished.
    counts = (
        in_category
        .groupby(['titleYear', 'gender'])['tconst']
        .count()
        .unstack(fill_value=0)
        .reindex(columns=['F', 'M'], fill_value=0)
    )
    total = counts['F'] + counts['M']
    f_prop = counts['F'] / total
    m_prop = counts['M'] / total

    fig, ax = plt.subplots(figsize=(17, 7), facecolor='#1d1d1d', edgecolor='#1d1d1d')
    ax.set_title('Proportion of movies made each year by female and male '+category+'s',
                 fontsize=20, color='#dfc56e', pad=20)
    ax.bar(counts.index, f_prop, color='#dbbd5a', label='female '+category, width=0.95)
    ax.bar(counts.index, m_prop, bottom=f_prop, color='#ae8e25', label='male '+category, width=0.95)

    # Percent labels through a formatter, so they always match the tick positions.
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
    ax.tick_params(axis='both', labelsize=11, labelcolor='#ebdaa4')
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)
    ax.set_facecolor('#1d1d1d')
    ax.legend(loc='upper left')
    fig.tight_layout();

# Interactive control offering the listed categories; the chart is redrawn on change.
mvs_yr_gndr_cat_yr = interact(titles_by_gender, category = ['actor', 'director', 'writer', 'producer', 'cinematographer',
                                       'production_designer', 'composer', 'editor']);

#embed_minimal_html('../visualizations/mvs_yr_gndr_cat_yr.html', views = [mvs_yr_gndr_cat_yr], title = '% of movies with male / female principals, by year')

#plt.savefig('../visualizations/mvs_yr_gndr_cat_yr.png', transparent = True)

In [None]:
# Mean `age` per (category, gender) pair, reshaped so that categories are the
# rows and genders are the columns.
avg_age_by_gender_cat = title_principals.groupby(['category','gender'])['age'].mean().unstack()
#avg_age_by_gender_cat

In [None]:
# Grouped bar chart of the mean age per category, one bar per gender.
chart_title = 'Average age of entertainment professionals, by gender and category'
age_ax = avg_age_by_gender_cat.plot.bar(title=chart_title, figsize=(17, 7), cmap=golds, alpha=1, edgecolor='#1d1d1d');

# Restyle the title and the axis text for the dark theme.
age_ax.set_title(chart_title, fontsize=20, color='#dfc56e', pad=20)
plt.xticks(fontsize=11, color='#ebdaa4')
plt.yticks(fontsize=10, color='#ebdaa4')
age_ax.set_ylabel('age', fontsize=11, color='#ebdaa4')

# Frameless look on the dark background.
for side in ('top', 'right', 'bottom', 'left'):
    age_ax.spines[side].set_visible(False)
age_ax.set_facecolor('#1d1d1d')
age_ax.legend(loc='upper left')
plt.tight_layout();

# The figure background is supplied explicitly here because pandas created the figure.
plt.savefig('../visualizations/avg_age_gndr_cat.png', facecolor='#1d1d1d', edgecolor='none')

**Observations:** For nearly all entertainment-related professions, the average female professional is younger than the average male professional.<br>While it's tempting to think that women are simply more talented and start being successful at an earlier age, it's more likely that this is a reflection of double standards in the industry.

In [None]:
def title_principals_by_gender_cat(year = 2010):
    """Plot the mean age per category and gender for titles of a single year.

    Reads the module-level `title_principals` frame, keeps the rows whose
    `titleYear` equals `year` and draws one bar group per category with one
    bar per gender.

    Parameters
    ----------
    year : int
        Title year to plot.
    """
    avg_age_by_gender_cat_yr = (
        title_principals[title_principals['titleYear'] == year]
        .groupby(['category', 'gender'])['age']
        .mean()
        .unstack()
    )
    # DataFrame.plot returns the Axes (the original bound it to a variable
    # named `fig`), so the dark background is applied to both the axes and
    # the figure that owns it.
    ax = avg_age_by_gender_cat_yr.plot(kind = 'bar', figsize = (17,5), cmap = golds, alpha = 1)
    ax.figure.set_facecolor('#1d1d1d')
    ax.set_facecolor('#1d1d1d')
    ax.set_title(f'Average age of entertainment professionals, by gender and category ({year})',
                 fontsize=20, color='#dfc56e', pad=20)
    ax.tick_params(axis='x', labelsize=11, labelcolor='#ebdaa4')
    ax.tick_params(axis='y', labelsize=10, labelcolor='#ebdaa4')
    ax.set_ylabel('age', fontsize=11, color='#ebdaa4')
    ax.set_xlabel('')

# Slider over the years 2010..2020 in steps of one.
interact(title_principals_by_gender_cat, year = (2010,2020,1));

# The background colour is passed literally: the name `fig` inside the function
# is local, so `fig.get_facecolor()` here picked up a leftover global.
# NOTE(review): savefig writes pyplot's *current* figure; whether that is the
# figure drawn by the widget depends on the backend. Verify the saved image.
plt.savefig('../visualizations/avg_age_gndr_cat_yr.png', facecolor='#1d1d1d', edgecolor='none')

In [None]:
# Split violins: age distribution per category, female vs. male halves,
# with violin widths scaled by the number of observations.
fig, ax = plt.subplots(figsize=(17, 7), facecolor='#1d1d1d', edgecolor='#1d1d1d')
ax.set_title('Age distribution across movie professions (1910-2020)', fontsize=20, color='#dfc56e', pad=20)

# The former `kws=dict(...)` argument was removed: violinplot has no such
# parameter. Depending on the seaborn version it was either silently ignored
# or forwarded to matplotlib, which rejects unknown properties.
sns.violinplot(data=title_principals, x='category', y='age', hue='gender', split=True, scale='count',
               palette={'F': '#dbbd5a', 'M': '#ae8e25'}, linewidth=1, saturation=1, ax=ax);

ax.tick_params(axis='x', labelsize=11, labelcolor='#ebdaa4')
ax.tick_params(axis='y', labelsize=10, labelcolor='#ebdaa4')
ax.set_ylabel('age', fontsize=11, color='#ebdaa4')
ax.set_xlabel('')

# Frameless look on the dark background.
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)
ax.set_facecolor('#1d1d1d')
ax.legend(loc='upper left')
fig.tight_layout()

# `fig` is this cell's figure, not a leftover global from an earlier cell.
fig.savefig('../visualizations/avg_age_gndr_cat_vln.png', facecolor=fig.get_facecolor(), edgecolor='none')

***
### Step 4: Analysis of detailed data for movies made between 1990 and 2020

In [None]:
movies.tail(2)

In [None]:
# Plain histogram (no KDE curve) of movie runtimes in 25 bins, drawn with
# opaque gold bars and a thin black outline.
sns.distplot(movies['runtimeMinutes'], bins =25, color = '#dbbd5a', kde = False, hist = True, hist_kws=dict(alpha=1, linewidth = 1, edgecolor="k"));

In [None]:
# Side-by-side runtime histograms for three genres. Each panel keeps the
# movies whose genre flag column equals 1.
runtime_hist_style = dict(alpha=1, linewidth=1, edgecolor="k")

plt.figure(figsize=(20, 5))
for panel, genre in enumerate(['Documentary', 'Drama', 'Comedy'], start=1):
    plt.subplot(1, 3, panel)
    plt.title(f'{genre} runtime histogram')
    genre_runtimes = movies[movies[genre] == 1]['runtimeMinutes']
    # A fresh dict per call, so no panel can affect another's styling.
    sns.distplot(genre_runtimes, bins=25, color='#dbbd5a', kde=False, hist=True,
                 hist_kws=dict(runtime_hist_style));

In [None]:
# Movies per language, largest first. The count is taken over the non-null
# `titleType` values of each language group and exposed as column 'count'.
cnt_by_lang = (
    movies
    .groupby('language')['titleType']
    .count()
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={'titleType': 'count'})
)

In [None]:
# Keep only the languages with at least 1000 movies, so the bar chart stays readable.
cnt_by_lang_pop = cnt_by_lang[cnt_by_lang['count']>=1000]

In [None]:
# Horizontal bar chart of movie counts for the most common languages.
# `fig` is now the Figure itself; before, it was bound to the Axes returned
# by seaborn, which only worked because Axes also has `get_facecolor`.
fig, ax = plt.subplots(figsize=(10, 7), facecolor='#1d1d1d', edgecolor='#1d1d1d')
sns.barplot(data=cnt_by_lang_pop, x='count', y='language', orient='h', color='gold', saturation=1, ax=ax);

ax.set_title('Number of movies made between 1990 and 2020, by language', fontsize=20, color='#dfc56e', pad=20)
ax.tick_params(axis='x', labelsize=11, labelcolor='#ebdaa4')
ax.tick_params(axis='y', labelsize=10, labelcolor='#ebdaa4')
ax.set_ylabel('', fontsize=11, color='#ebdaa4')
ax.set_xlabel('count of movies', fontsize=11, color='#ebdaa4')

# Frameless look on the dark background.
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)
ax.set_facecolor('#1d1d1d')
fig.tight_layout()

fig.savefig('../visualizations/mvs_lng.png', facecolor=fig.get_facecolor(), edgecolor='none')

In [None]:
from utils import list_uniques

In [None]:
def count_by_genre_yr(year, genres=None, data=None):
    """Count the movies of one year per genre.

    Parameters
    ----------
    year : int
        Value of `startYear` to select.
    genres : iterable of str, optional
        Genre flag columns to sum. Defaults to the module-level
        `unique_genres`.
        NOTE(review): `unique_genres` is not defined anywhere in the visible
        part of this notebook; presumably it is built with
        `utils.list_uniques` in a cell that was removed. Restore it before
        relying on the default.
    data : pandas.DataFrame, optional
        Frame with a `startYear` column and one numeric flag column per
        genre. Defaults to the module-level `movies`.

    Returns
    -------
    pandas.DataFrame
        Indexed by genre, with a single column 'count' holding the sum of
        that genre's flag column over the selected year.
    """
    if data is None:
        data = movies
    if genres is None:
        genres = unique_genres

    # Filter once; the original rebuilt the year subset for every genre.
    in_year = data[data['startYear'] == year]
    count_by_genre = {genre: in_year[genre].sum() for genre in genres}
    return pd.DataFrame.from_dict(count_by_genre, orient = 'index', columns = ['count'])

In [None]:
def plot_interact(year = 2020):
    """Draw a bar chart of the number of movies per genre for a single year.

    Parameters
    ----------
    year : int
        Start year passed on to `count_by_genre_yr`.
    """
    # Build the per-genre table once; it used to be computed twice per redraw
    # (once for `data`, once for the x values).
    genre_counts = count_by_genre_yr(year)

    fig, ax = plt.subplots(figsize=(12, 7), facecolor='#1d1d1d', edgecolor='#1d1d1d')
    sns.barplot(data=genre_counts, x=genre_counts.index, y='count', palette=Golds, ax=ax)
    ax.set_title(f'Number of movies made in {year}, by genre', fontsize=20, color='#dfc56e', pad=20)
    ax.tick_params(axis='x', labelrotation=90, labelsize=11, labelcolor='#ebdaa4')
    ax.tick_params(axis='y', labelsize=11, labelcolor='#ebdaa4')
    # Fixed y range so bar heights stay comparable while the slider moves.
    ax.set_ylim(0, 5000)
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)
    ax.set_facecolor('#1d1d1d')
    fig.tight_layout();

# Slider over the years 1990..2020 in steps of one.
interact(plot_interact, year = (1990,2020,1));

#plt.savefig('../visualizations/mvs_cat_yr.png',facecolor=fig.get_facecolor(), edgecolor='none')

***
### Step 5: Analysis of movies and artists nominated or awarded with an Oscar between 1990 and 2020

In [None]:
# Production-company names used as column names of `movies` further below.
# The context manager closes the file handle deterministically.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by
# this project.
with open('../pickles/oscar_prodcos.p', 'rb') as fh:
    oscar_prodcos = pickle.load(fh)

In [None]:
# Per production company: number of movies, number of those flagged as Oscar
# movies, and the ratio of the two.
is_oscar_movie = movies['Oscars'] == 1

movies_by_prodco = {prodco: movies[prodco].sum() for prodco in oscar_prodcos}
oscars_by_prodco = {
    prodco: movies.loc[(movies[prodco] == 1) & is_oscar_movie, 'Oscars'].sum()
    for prodco in oscar_prodcos
}

count_by_prodco_df = pd.DataFrame.from_dict(movies_by_prodco, orient='index', columns=['movies'])
count_by_prodco_df['oscar_movies'] = count_by_prodco_df.index.map(oscars_by_prodco)
# Fraction in [0, 1], despite the `_perc` suffix.
count_by_prodco_df['oscar_as_perc'] = count_by_prodco_df['oscar_movies'].div(count_by_prodco_df['movies'])

In [None]:
count_by_prodco_df

In [None]:
# Long-format copy for plotting: one row per (measure, production company)
# with the value in 'count'; the ratio column is left out.
count_by_prodco_plt = (
    count_by_prodco_df
    .drop(columns=['oscar_as_perc'])
    .unstack()
    .reset_index()
    .rename(columns={'level_0': 'cat', 'level_1': 'production company', 0: 'count'})
)

In [None]:
# Two stacked panels per production company:
#   top    - absolute number of movies and of Oscar movies
#   bottom - Oscar movies as a share of all movies of that company
# The figure is bound to `fig` so that savefig uses this figure's background
# instead of a leftover object from an earlier cell.
fig, (ax_counts, ax_share) = plt.subplots(2, 1, figsize=(18, 8), facecolor='#1d1d1d', edgecolor='#1d1d1d')

sns.barplot(data=count_by_prodco_plt, x='production company', y='count', hue='cat', palette=Golds, ax=ax_counts)
ax_counts.set_title('Number of movies / Oscar movies made by key production companies',
                    fontsize=20, color='#dfc56e', pad=20)
# Company names are shown on the bottom panel only.
ax_counts.set_xticks([])
ax_counts.set_xlabel('')
ax_counts.tick_params(axis='y', labelsize=10, labelcolor='#ebdaa4')
ax_counts.set_ylabel('count', fontsize=11, color='#ebdaa4')

sns.barplot(data=count_by_prodco_df, x=count_by_prodco_df.index, y='oscar_as_perc', palette=Golds, ax=ax_share)
ax_share.set_title('Oscar movies made by key production companies as % of all movies made by each company',
                   fontsize=20, color='#dfc56e', pad=20)
ax_share.tick_params(axis='x', labelrotation=90, labelsize=11, labelcolor='#ebdaa4')
ax_share.tick_params(axis='y', labelsize=10, labelcolor='#ebdaa4')
# Percent labels via a formatter, so they always match the tick positions.
ax_share.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
# This axis shows a share, not a count (the label used to read 'count').
ax_share.set_ylabel('% of movies', fontsize=11, color='#ebdaa4')

# Frameless look on the dark background.
for panel_ax in (ax_counts, ax_share):
    for side in ('top', 'right', 'bottom', 'left'):
        panel_ax.spines[side].set_visible(False)
    panel_ax.set_facecolor('#1d1d1d')
fig.tight_layout()

fig.savefig('../visualizations/mvs_aa_prodco.png', facecolor=fig.get_facecolor(), edgecolor='none')

**Note:** "Oscar density" in the two charts below is defined as "number of individuals credited for movies which were previously nominated or won". _Another interesting thing to look at would be a similar scatterplot that considers only the nominations and awards won by each person individually (this would not count, e.g., actors whose names are listed for movies nominated for cinematography)._

In [None]:
# Combined count per movie: the `o_wins` and `o_noms` columns added together.
movies['o_wins_noms'] = movies['o_wins'] + movies['o_noms']

In [None]:
# Combined count per movie of the two `prev_Oscar_*_nm` columns (earlier wins
# plus earlier nominations; presumably those of the credited people. Confirm).
movies['prev_wins_noms'] = movies['prev_Oscar_wins_nm'] + movies['prev_Oscar_noms_nm']

In [None]:
# Two scatter panels for movies from 1995 onwards: wins + nominations of the
# movie (y) against two measures of earlier Academy recognition (x).
recent_movies = movies[movies['startYear'] >= 1995]  # filtered once for both panels

fig, axes = plt.subplots(1, 2, figsize=(18, 9), facecolor='#1d1d1d', edgecolor='#1d1d1d')
fig.suptitle('Oscar wins & nominations vs Oscar-density of cast and crew,\nby movie (1995-2020)\n\n',
             fontsize=20, color='#dfc56e')

# (x column, x-axis label) per panel
panels = [
    ('prev_Oscars', 'Number of previous Oscar productions involving cast of movie'),
    ('prev_wins_noms', 'Number of previous Academy recognitions awarded to cast of movie'),
]
for ax, (x_col, x_label) in zip(axes, panels):
    # x / y are passed by keyword: newer seaborn releases no longer accept
    # them as positional arguments.
    sns.scatterplot(data=recent_movies, x=x_col, y='o_wins_noms',
                    marker='*', s=500, color='#dbbd5a', alpha=0.3, ax=ax)
    ax.set_xlabel(x_label, fontsize=11, color='#ebdaa4')
    # Label typo fixed ('Oscars wons').
    ax.set_ylabel('Oscar wins + nominations', fontsize=11, color='#ebdaa4')
    ax.tick_params(axis='both', labelsize=10, labelcolor='#ebdaa4')
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)
    ax.set_facecolor('#1d1d1d')
fig.tight_layout()

# `fig` is this cell's figure, not a leftover global from an earlier cell.
fig.savefig('../visualizations/wins_noms_prev_nw.png', facecolor=fig.get_facecolor(), edgecolor='none')

In [None]:
# Per-person nomination / win summary used by the charts below.
# The context manager closes the file handle deterministically.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by
# this project.
with open('../pickles/nm_wins_noms.p', 'rb') as fh:
    nm_wins_noms = pickle.load(fh)

In [None]:
nm_wins_noms.head(2)

In [None]:
# "Academy Sweethearts": people with at least 5 nominations and a win rate of
# at least 40%, ordered by number of nominations.
# sort_values returns a new frame here; sorting the filtered slice in place
# triggered pandas' SettingWithCopyWarning.
subset = (
    nm_wins_noms[(nm_wins_noms['perc_won'] >= 0.4) & (nm_wins_noms['all_nominations'] >= 5)]
    .sort_values(['all_nominations'], ascending=False)
)

fig, axes = plt.subplots(1, 3, figsize=(17, 8), facecolor='#1d1d1d', edgecolor='#1d1d1d')
fig.suptitle('Academy Sweethearts\n', fontsize=20, color='#dfc56e')

# One entry per panel: (column, panel title, x tick positions, x tick label size)
panels = [
    ('all_nominations', '\n\nNominations received from 1990 to 2020', range(0, 15, 5), 8),
    ('wins', 'Awards received from 1990 to 2020', range(0, 5, 1), 10),
    ('perc_won', 'Awards received from 1990 to 2020 \nas % of received nominations', np.linspace(0, 0.9, 10), 10),
]
for position, (ax, (column, panel_title, x_ticks, x_tick_size)) in enumerate(zip(axes, panels)):
    ax.set_title(panel_title, fontsize=16, color='#dfc56e', pad=20)
    # Faint bar with a large star marker at its end.
    sns.barplot(y=subset['name'], x=subset[column], color='#dbbd5a', orient='h', alpha=0.3, ax=ax)
    sns.scatterplot(y=subset['name'], x=subset[column], zorder=10, color='#dbbd5a', marker='*',
                    edgecolor='#6b5717', s=1000, ax=ax)

    ax.set_xticks(list(x_ticks))
    ax.tick_params(axis='x', labelsize=x_tick_size, labelcolor='#ebdaa4')
    if column == 'perc_won':
        ax.xaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))

    ax.set_ylabel('')
    if position == 0:
        # Names are printed once, on the left-most panel.
        ax.tick_params(axis='y', labelsize=11, labelcolor='#ebdaa4')
    else:
        ax.set_yticks([])

    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)
    ax.set_facecolor('#1d1d1d')
fig.tight_layout()

fig.savefig('../visualizations/sweethearts.png', facecolor=fig.get_facecolor(), edgecolor='none')

In [None]:
# "Academy Sweetsnubs": people with at least 7 nominations and no win,
# ordered by number of nominations.
# sort_values returns a new frame here; sorting the filtered slice in place
# triggered pandas' SettingWithCopyWarning.
subset = (
    nm_wins_noms[(nm_wins_noms['perc_won'] == 0) & (nm_wins_noms['all_nominations'] >= 7)]
    .sort_values(['all_nominations'], ascending=False)
)

# The figure is bound to `fig` so that savefig below refers to this figure
# and not to one left over from a previous cell.
fig, axes = plt.subplots(1, 3, figsize=(17, 8), facecolor='#1d1d1d', edgecolor='#1d1d1d')
fig.suptitle('Academy Sweetsnubs\n', fontsize=20, color='#dfc56e')

# One entry per panel: (column, panel title, x tick positions, x tick label size)
panels = [
    ('all_nominations', '\n\nNominations received from 1990 to 2020', range(0, 23, 5), 8),
    ('wins', 'Awards received from 1990 to 2020', range(0, 5, 1), 10),
    ('perc_won', 'Awards received from 1990 to 2020\n as % of received nominations', np.linspace(0, 0.5, 6), 10),
]
for position, (ax, (column, panel_title, x_ticks, x_tick_size)) in enumerate(zip(axes, panels)):
    ax.set_title(panel_title, fontsize=16, color='#dfc56e', pad=20)
    # Faint bar with a large star marker at its end.
    sns.barplot(y=subset['name'], x=subset[column], color='#dbbd5a', orient='h', alpha=0.3, ax=ax)
    sns.scatterplot(y=subset['name'], x=subset[column], zorder=10, color='#dbbd5a', marker='*',
                    edgecolor='#6b5717', s=1000, ax=ax)

    ax.set_xticks(list(x_ticks))
    ax.tick_params(axis='x', labelsize=x_tick_size, labelcolor='#ebdaa4')
    if column == 'perc_won':
        ax.xaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))

    ax.set_ylabel('')
    if position == 0:
        # Names are printed once, on the left-most panel.
        ax.tick_params(axis='y', labelsize=11, labelcolor='#ebdaa4')
    else:
        ax.set_yticks([])

    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)
    ax.set_facecolor('#1d1d1d')
fig.tight_layout()

fig.savefig('../visualizations/sweetsnubsplt.png', facecolor=fig.get_facecolor(), edgecolor='none')

Next, I would like this to feed a chart with all of these names on the y axis and the years 1990-2020 on the x axis, showing an empty star for a nomination in a given year and a gold star for a win.

In [None]:
### FROM

In [None]:
# Academy "sweethearts": index labels of people with at least 5 nominations
# who won at least 40% of them.
academy_shs = nm_wins_noms.loc[
    (nm_wins_noms['perc_won'] >= 0.4) & (nm_wins_noms['all_nominations'] >= 5)
].index

In [None]:
# Display the index labels selected as Academy "sweethearts".
academy_shs

In [None]:
# Academy "sweetsnubs": index labels of people with at least 7 nominations
# and no win at all.
academy_sbs = nm_wins_noms.loc[
    (nm_wins_noms['perc_won'] == 0) & (nm_wins_noms['all_nominations'] >= 7)
].index

In [None]:
# Persist the sweetsnub index; the context manager closes (and flushes) the
# file handle, which a bare open() inside the call never did explicitly.
with open('../pickles/sweetsnubs.p', 'wb') as pickle_file:
    pickle.dump(academy_sbs, pickle_file)

In [None]:
# Persist the sweetheart index; the context manager closes (and flushes) the
# file handle, which a bare open() inside the call never did explicitly.
with open('../pickles/sweethearts.p', 'wb') as pickle_file:
    pickle.dump(academy_shs, pickle_file)

In [None]:
### TO

In [None]:
# Lookup used below to turn nconst identifiers into display names.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by this project.
with open('../pickles/nconst_name.p', 'rb') as pickle_file:
    nconst_name = pickle.load(pickle_file)

In [None]:
# Yearly win / nomination tables for the sweethearts (sh*) and sweetsnubs (sb*).
# Each file is opened in a context manager so its handle is closed after loading.
with open('../pickles/shwins_df.p', 'rb') as pickle_file:
    shwins_df = pickle.load(pickle_file)
with open('../pickles/shnoms_df.p', 'rb') as pickle_file:
    shnoms_df = pickle.load(pickle_file)
with open('../pickles/sbwins_df.p', 'rb') as pickle_file:
    sbwins_df = pickle.load(pickle_file)
with open('../pickles/sbnoms_df.p', 'rb') as pickle_file:
    sbnoms_df = pickle.load(pickle_file)

Unstacking and renaming columns for use in visualization:

In [None]:
def _counts_by_year(counts_df, size_per_unit):
    """Reshape a wide per-year count table into long form for scatter plotting.

    Parameters
    ----------
    counts_df : pandas.DataFrame
        Table with one column per year 1990..2020 (other columns are ignored).
    size_per_unit : int or float
        Marker area assigned per counted unit; stored in the ``size`` column.

    Returns
    -------
    pandas.DataFrame
        One row per (year, row label) with columns ``year``, ``wins`` and
        ``size``, indexed by ``nconst``. The count column is called ``wins``
        for nomination tables as well, so that all four frames share the same
        column names.
    """
    long_df = (
        counts_df[list(range(1990, 2021))]
        .unstack()
        .reset_index()
        .rename(columns = {'level_0': 'year', 0: 'wins', 'level_1': 'nconst'})
    )
    long_df['size'] = long_df['wins'] * size_per_unit
    return long_df.set_index('nconst')


# Wins are drawn slightly larger (150 per unit) than nominations (130 per unit).
shwins_yr = _counts_by_year(shwins_df, 150)
shnoms_yr = _counts_by_year(shnoms_df, 130)
sbwins_yr = _counts_by_year(sbwins_df, 150)
sbnoms_yr = _counts_by_year(sbnoms_df, 130)

In [None]:
# Add a display-name column to each yearly frame by mapping its nconst index
# through the nconst_name lookup.
for yearly_frame in (shwins_yr, shnoms_yr, sbwins_yr, sbnoms_yr):
    yearly_frame['name'] = yearly_frame.index.map(nconst_name)

In [None]:
# Bind the figure so that savefig below reads the face colour of THIS figure
# rather than a `fig` left over from an earlier cell.
fig = plt.figure(figsize = (18, 8), facecolor = '#1d1d1d', edgecolor = '#1d1d1d')
plt.title('Academy loving its sweethearts (1990-2020)', fontsize=20, color='#dfc56e', pad=20)

# Layer 1: faint fixed-size dot for every (year, name) cell, as a background grid.
plt.scatter(x = shnoms_yr['year'], y = shnoms_yr['name'], edgecolors = '#dec267', c = '#f9f5e7', alpha=.1, s = 50, marker = 'o')
# Layer 2: circles sized by the number of nominations in that year.
# (No cmap is passed: matplotlib ignores cmap when `c` is a single colour.)
plt.scatter(x = shnoms_yr['year'], y = shnoms_yr['name'], c = '#f9f5e7', edgecolors = '#dec267', alpha=0.8, s = shnoms_yr['size'], marker = 'o')
# Layer 3: gold stars sized by the number of wins in that year.
plt.scatter(x = shwins_yr['year'], y = shwins_yr['name'], c = '#c9a42b', edgecolors = '#93781f', alpha=0.8, s = shwins_yr['size'], marker = '*')

plt.xticks(fontsize=11, color='#ebdaa4')
plt.yticks(fontsize=11, color='#ebdaa4')
plt.ylabel('', fontsize=11, color='#ebdaa4')
plt.xlabel('')
plt.gca().invert_yaxis()
for side in ('top', 'right', 'bottom', 'left'):
    plt.gca().spines[side].set_visible(False)
plt.gca().set_facecolor('#1d1d1d')
plt.tight_layout();

plt.savefig('../visualizations/shs_wn_yr.png',facecolor=fig.get_facecolor(), edgecolor='none')

In [None]:
# Bind the figure so that savefig below reads the face colour of THIS figure
# rather than a `fig` left over from an earlier cell.
fig = plt.figure(figsize = (18, 8), facecolor = '#1d1d1d', edgecolor = '#1d1d1d')
# The yearly frames are built from the 1990-2020 columns, so the title states that range.
plt.title('Academy snubbing its sweetsnubs (1990-2020)', fontsize=20, color='#dfc56e', pad=20)

# Layer 1: faint fixed-size dot for every (year, name) cell, as a background grid.
plt.scatter(x = sbnoms_yr['year'], y = sbnoms_yr['name'], edgecolors = '#efe2b8', c = '#f9f5e7', alpha=.1, s = 50, marker = 'o')
# Layer 2: circles sized by the number of nominations in that year.
# (No cmap is passed: matplotlib ignores cmap when `c` is a single colour.)
plt.scatter(x = sbnoms_yr['year'], y = sbnoms_yr['name'], c = '#f9f5e7', edgecolors = '#efe2b8', alpha=0.8, s = sbnoms_yr['size'], marker = 'o')

plt.xticks(fontsize=11, color='#ebdaa4')
plt.yticks(fontsize=11, color='#ebdaa4')
plt.xlabel('')
plt.gca().invert_yaxis()
for side in ('top', 'right', 'bottom', 'left'):
    plt.gca().spines[side].set_visible(False)
plt.gca().set_facecolor('#1d1d1d')
plt.tight_layout();

plt.savefig('../visualizations/sbs_wn_yr.png',facecolor=fig.get_facecolor(), edgecolor='none')

### Analysis of wins and nominations by gender and age of winners / nominees

In [None]:
# Per-year tables of nominations-plus-wins and of wins only.
# Context managers close the file handles once each object is loaded.
with open('../pickles/noms_wins_df.p', 'rb') as pickle_file:
    noms_wins_df = pickle.load(pickle_file)
with open('../pickles/wins_df.p', 'rb') as pickle_file:
    wins_df = pickle.load(pickle_file)

In [None]:
# Reshape noms_wins_df to long form: unstack() moves the column labels into the
# row index and reset_index() turns the index levels into ordinary columns
# (they are renamed to title_year / nconst / noms_wins in a later cell).
noms_wins_df_us = noms_wins_df.unstack().reset_index()

In [None]:
# Principals table that supplies the nconst, gender and birthYear columns used
# in the following cells.
# NOTE(review): per the file name this presumably covers 1990-2020 titles with Oscar data — confirm.
title_principals_1990_20_w_Oscars = pd.read_csv('../data/title_principals_1990_20_w_Oscars.csv')

In [None]:
# One row per (nconst, gender) pair holding the mean birthYear, indexed by
# nconst so it can be merged onto the long-form award tables.
nm_g_byr = (
    title_principals_1990_20_w_Oscars
    .groupby(['nconst', 'gender'])['birthYear']
    .mean()
    .reset_index()
    .set_index('nconst')
)

In [None]:
# Name the unstacked columns, attach gender / birth year per person, then
# derive the age at the title year and a 0/1 flag for women.
noms_wins_df_us = (
    noms_wins_df_us
    .rename(columns = {'level_0': 'title_year', 'level_1': 'nconst', 0: 'noms_wins'})
    .merge(nm_g_byr, how = 'left', left_on = 'nconst', right_index = True)
)
noms_wins_df_us = noms_wins_df_us.assign(
    age = noms_wins_df_us['title_year'] - noms_wins_df_us['birthYear'],
    gender_bin = noms_wins_df_us['gender'].map({'F': 1, 'M': 0}),
)
noms_wins_df_us.head()

In [None]:
# Keep only the rows with at least one nomination or win.
noms_wins_df_us = noms_wins_df_us.loc[noms_wins_df_us['noms_wins'].gt(0)]
noms_wins_df_us.head()

In [None]:
# Per title year: mean age and mean of the 0/1 gender flag (i.e. share of women).
age_gndr_yr = (
    noms_wins_df_us
    .groupby('title_year')[['age', 'gender_bin']]
    .mean()
)

# Per title year and gender: mean age, pivoted to one column per gender.
temp = (
    noms_wins_df_us
    .groupby(['title_year', 'gender'])['age']
    .mean()
    .reset_index()
)
age_by_gndr_yr = temp.pivot_table(values = 'age', index = 'title_year', columns = 'gender')

# Combine both yearly tables on their shared title_year index.
age_gndr_by_gndr_yr = pd.merge(
    left = age_gndr_yr,
    right = age_by_gndr_yr,
    left_index = True,
    right_index = True,
)

In [None]:
# Give the yearly columns descriptive names for plotting.
age_gndr_by_gndr_yr = age_gndr_by_gndr_yr.rename(
    columns = {
        'age': 'avg_age',
        'gender_bin': 'part_fem',
        'F': 'avg_age_fem',
        'M': 'avg_age_male',
    }
)
age_gndr_by_gndr_yr.head()

In [None]:
# Same pipeline as for nominations-plus-wins, applied to wins only:
# long form -> attach gender / birth year -> age and 0/1 gender flag -> keep winners.
wins_df_us = (
    wins_df
    .unstack()
    .reset_index()
    .rename(columns = {'level_0': 'title_year', 'level_1': 'nconst', 0: 'wins'})
    .merge(nm_g_byr, how = 'left', left_on = 'nconst', right_index = True)
)
wins_df_us = wins_df_us.assign(
    age = wins_df_us['title_year'] - wins_df_us['birthYear'],
    gender_bin = wins_df_us['gender'].map({'F': 1, 'M': 0}),
)
wins_df_us = wins_df_us.loc[wins_df_us['wins'].gt(0)]

# Per title year: mean age and mean of the 0/1 gender flag (share of women).
wins_age_gndr_yr = (
    wins_df_us
    .groupby('title_year')[['age', 'gender_bin']]
    .mean()
)

# Per title year and gender: mean age, pivoted to one column per gender.
temp = (
    wins_df_us
    .groupby(['title_year', 'gender'])['age']
    .mean()
    .reset_index()
)
wins_age_by_gndr_yr = temp.pivot_table(values = 'age', index = 'title_year', columns = 'gender')

# Combine both yearly tables on their shared title_year index and name the columns.
wins_age_gndr_by_gndr_yr = pd.merge(
    left = wins_age_gndr_yr,
    right = wins_age_by_gndr_yr,
    left_index = True,
    right_index = True,
).rename(
    columns = {
        'age': 'avg_age',
        'gender_bin': 'part_fem',
        'F': 'avg_age_fem',
        'M': 'avg_age_male',
    }
)
wins_age_gndr_by_gndr_yr.head()

####  Time to visualize

In [None]:
fig, ax1 = plt.subplots(figsize=(17,7), facecolor = '#1d1d1d', edgecolor = '#1d1d1d')

# Left axis: yearly share of women among nominees and recipients, with a linear trend.
# Every artist is drawn on an explicit axes so ax1 always refers to the Axes itself.
ax1.bar(age_gndr_by_gndr_yr.index, age_gndr_by_gndr_yr['part_fem'], color = '#f8f2e0', alpha = 1, label = '% of female nominees and recipients')
sns.regplot(data = age_gndr_by_gndr_yr, x = age_gndr_by_gndr_yr.index, y = 'part_fem',
            scatter = False, ci = 0, line_kws={'lw': 2, 'ls': '-', 'color': '#f2e8c5'}, ax = ax1)

ax1.set_title('Average age of Academy Award nominees and recipients', fontsize=20, color='#dfc56e', pad=20)
ax1.set_xlabel('year', fontsize=11, color='#ebdaa4')
ax1.set_ylabel('% of women', fontsize=11, color='#ebdaa4')
ax1.set_xticks(age_gndr_by_gndr_yr.index)
ax1.tick_params(axis = 'both', labelsize = 10, labelcolor = '#ebdaa4')
# A formatter (rather than labels frozen from the pre-draw tick positions) keeps
# the percent labels in sync with wherever the ticks finally land.
ax1.yaxis.set_major_formatter(mtick.PercentFormatter(xmax = 1, decimals = 0))
for side in ('top', 'right', 'bottom', 'left'):
    ax1.spines[side].set_visible(False)
ax1.set_facecolor('#1d1d1d')
ax1.legend();

# Right axis: average age per gender (line with star markers) and its dashed linear trend.
ax2 = ax1.twinx()
for age_column, line_colour, line_label in (
    ('avg_age_fem', '#d8b74d', 'average age of female nominees and recipients'),
    ('avg_age_male', '#ae8e25', 'average age of male nominees and recipients'),
):
    sns.lineplot(data = age_gndr_by_gndr_yr, x = age_gndr_by_gndr_yr.index, y = age_column, color = line_colour,
                 marker = '*', markersize = 20, lw = 1, label = line_label, ax = ax2)
    sns.regplot(data = age_gndr_by_gndr_yr, x = age_gndr_by_gndr_yr.index, y = age_column,
                scatter = False, ci = 0,
                line_kws={'lw': 2, 'ls': '--', 'color': line_colour}, ax = ax2)
ax2.set_xlabel('year', fontsize=11, color='#ebdaa4')
ax2.set_ylabel('age', fontsize=11, color='#ebdaa4')
ax2.set_xticks(age_gndr_by_gndr_yr.index)
ax2.tick_params(axis = 'both', labelsize = 10, labelcolor = '#ebdaa4')
for side in ('top', 'right', 'bottom', 'left'):
    ax2.spines[side].set_visible(False)
ax2.set_facecolor('#1d1d1d')
ax2.legend(loc = 'upper center');

plt.tight_layout()

plt.savefig('../visualizations/avg_age_winnoms_gndr.png',facecolor=fig.get_facecolor(), edgecolor='none')

In [None]:
fig, ax1 = plt.subplots(figsize=(17,7), facecolor = '#1d1d1d', edgecolor = '#1d1d1d')

# Left axis: yearly share of women among award recipients, with a linear trend.
# Every artist is drawn on an explicit axes so ax1 always refers to the Axes itself.
ax1.bar(wins_age_gndr_by_gndr_yr.index, wins_age_gndr_by_gndr_yr['part_fem'], color = '#f8f2e0', alpha = 1, label = '% of female recipients')
sns.regplot(data = wins_age_gndr_by_gndr_yr, x = wins_age_gndr_by_gndr_yr.index, y = 'part_fem',
            scatter = False, ci = 0, line_kws={'lw': 2, 'ls': '-', 'color': '#f2e8c5'}, ax = ax1)

ax1.set_title('Average age of Academy Award recipients', fontsize=20, color='#dfc56e', pad=20)
ax1.set_xlabel('year', fontsize=11, color='#ebdaa4')
ax1.set_ylabel('% of women', fontsize=11, color='#ebdaa4')
ax1.set_xticks(wins_age_gndr_by_gndr_yr.index)
ax1.tick_params(axis = 'both', labelsize = 10, labelcolor = '#ebdaa4')
# A formatter (rather than labels frozen from the pre-draw tick positions) keeps
# the percent labels in sync with wherever the ticks finally land.
ax1.yaxis.set_major_formatter(mtick.PercentFormatter(xmax = 1, decimals = 0))
for side in ('top', 'right', 'bottom', 'left'):
    ax1.spines[side].set_visible(False)
ax1.set_facecolor('#1d1d1d')
ax1.legend();

# Right axis: average age per gender (line with star markers) and its dashed linear trend.
ax2 = ax1.twinx()
for age_column, line_colour, line_label in (
    ('avg_age_fem', '#dbbd5a', 'average age of female recipients'),
    ('avg_age_male', '#ae8e25', 'average age of male recipients'),
):
    sns.lineplot(data = wins_age_gndr_by_gndr_yr, x = wins_age_gndr_by_gndr_yr.index, y = age_column, color = line_colour,
                 marker = '*', markersize = 20, lw = 1, label = line_label, ax = ax2)
    sns.regplot(data = wins_age_gndr_by_gndr_yr, x = wins_age_gndr_by_gndr_yr.index, y = age_column,
                scatter = False, ci = 0,
                line_kws={'lw': 2, 'ls': '--', 'color': line_colour}, ax = ax2)
ax2.set_xlabel('year', fontsize=11, color='#ebdaa4')
ax2.set_ylabel('age', fontsize=11, color='#ebdaa4')
ax2.set_xticks(wins_age_gndr_by_gndr_yr.index)
ax2.tick_params(axis = 'both', labelsize = 10, labelcolor = '#ebdaa4')
for side in ('top', 'right', 'bottom', 'left'):
    ax2.spines[side].set_visible(False)
ax2.legend(loc = 'upper center');

plt.tight_layout()

plt.savefig('../visualizations/avg_age_wins_gndr.png',facecolor=fig.get_facecolor(), edgecolor='none')

### Analysis of wins and nominations by individuals without previous nominations

In [None]:
# Yearly share of first-time winners / nominees, plotted in the next cell.
# Context managers close the file handles once each object is loaded.
with open('../pickles/new_in_all_winners.p', 'rb') as pickle_file:
    new_in_all_winners = pickle.load(pickle_file)
with open('../pickles/new_in_all_nominees.p', 'rb') as pickle_file:
    new_in_all_nominees = pickle.load(pickle_file)

In [None]:
# Bind the figure so that savefig below reads the face colour of THIS figure
# rather than a `fig` left over from an earlier cell.
fig = plt.figure(figsize = (18,9), facecolor = '#1d1d1d', edgecolor = '#1d1d1d')

# (subplot position, yearly shares, panel title); winners go in the lower
# panel and nominees in the upper one, drawn in that order.
newcomer_panels = (
    (2, new_in_all_winners, 'Number # of Oscar newcomers awarded by the Academy as % of all winners, by year (1990-2020)'),
    (1, new_in_all_nominees, 'Number # of Oscar newcomers nominated by the Academy as % of all nominees, by year (1990-2020)'),
)
for panel_position, newcomer_shares, panel_title in newcomer_panels:
    # Derive the bar count from the data instead of hard-coding it.
    n_years = len(newcomer_shares)
    plt.subplot(2,1,panel_position)
    plt.title(panel_title, fontsize=20, color='#dfc56e', pad=20)
    plt.bar(x = range(n_years), height = newcomer_shares, color = '#dfc56e')
    # The first bar is labelled 1991; one tick every five years.
    plt.xticks(range(0,n_years,5), range(1991,1991+n_years,5), fontsize=10, color='#ebdaa4')
    # Dashed reference line: mean share from the 11th bar onwards.
    plt.axhline(np.mean(newcomer_shares[10:]), ls = '--', color='#ebdaa4')
    plt.xlabel('year', fontsize=11, color='#ebdaa4')
    plt.ylabel('% of newcomers', fontsize=11, color='#ebdaa4')
    plt.gca().tick_params(axis = 'y', labelsize = 10, labelcolor = '#ebdaa4')
    for side in ('top', 'right', 'bottom', 'left'):
        plt.gca().spines[side].set_visible(False)
    plt.gca().set_facecolor('#1d1d1d')
    # A formatter keeps the percent labels in sync with the final tick positions.
    plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(xmax = 1, decimals = 0))

plt.tight_layout()

plt.savefig('../visualizations/oscar_newcomers_win.png',facecolor=fig.get_facecolor(), edgecolor='none')

### Inspecting the relationship between rating and number of awards / nominations

In [None]:
# Lookup mapped onto the movies index in the next cell.
# The context manager closes the file handle once the object is loaded.
with open('../pickles/rating.p', 'rb') as pickle_file:
    ratings = pickle.load(pickle_file)

In [None]:
# Attach a rating to every movie by mapping its tconst index through `ratings`.
# NOTE(review): presumably `ratings` is keyed by tconst and holds the average viewer rating — confirm.
movies['avg_rating'] = movies.index.map(ratings)

In [None]:
# Bind the figure so that savefig below reads the face colour of THIS figure
# rather than a `fig` left over from an earlier cell.
fig = plt.figure(figsize = (20,8), facecolor = '#1d1d1d', edgecolor = '#1d1d1d')
plt.suptitle('Academy "ratings" (nominations and wins) vs viewer ratings', fontsize=20, color='#dfc56e')

# Left panel: full rating range, with a dashed marker at a rating of 5.
plt.subplot(1,2,1)
sns.scatterplot(x = movies['avg_rating'], y= movies['o_wins_noms'], marker = '*', color = '#dbbd5a', s = 500, alpha = 0.5)
plt.axvline(5, ls = '--', color = '#9a7d21', lw = 2.5)
# The x axis shows avg_rating, so it is labelled as a rating (not as a budget).
plt.xlabel('average viewer rating', fontsize=11, color='#ebdaa4')
plt.xticks(fontsize=10, color='#ebdaa4')
plt.yticks(fontsize=10, color='#ebdaa4')
plt.ylabel('Academy Award nominations and wins', fontsize=11, color='#ebdaa4')
plt.gca().set_facecolor('#1d1d1d')

# Right panel: same data zoomed in to ratings between 5 and 10.
plt.subplot(1,2,2)
sns.scatterplot(x = movies['avg_rating'], y= movies['o_wins_noms'], marker = '*', color = '#dbbd5a', s = 500, alpha = 0.5)
plt.xlim(5,10)
plt.xlabel('average viewer rating', fontsize=11, color='#ebdaa4')
plt.xticks(fontsize=10, color='#ebdaa4')
plt.yticks(fontsize=10, color='#ebdaa4')
plt.gca().set_facecolor('#1d1d1d')

plt.tight_layout();

plt.savefig('../visualizations/wins_noms_rtgs.png',facecolor=fig.get_facecolor(), edgecolor='none')

In [None]:
# Bind the figure so that savefig below reads the face colour of THIS figure
# rather than a `fig` left over from an earlier cell.
fig = plt.figure(figsize = (20,8), facecolor = '#1d1d1d', edgecolor = '#1d1d1d')
plt.suptitle('Opening weekend revenues and oscar results vs estimated budget', fontsize=20, color='#dfc56e')

# Left panel: budget vs US opening weekend.
# x / y are passed by keyword, matching the other scatterplot calls in this
# notebook; recent seaborn releases no longer accept them positionally.
plt.subplot(1,2,1)
sns.scatterplot(x = movies['budget_USD'], y = movies['US_open_WE'], alpha = 0.3, color = '#d7b546', marker = '*', s = 300)
plt.xlabel('budget (USD)', fontsize=11, color='#ebdaa4')
plt.ylabel('US opening weekend (USD)', fontsize=11, color='#ebdaa4')
plt.xticks(fontsize=10, color='#ebdaa4')
plt.yticks(fontsize=10, color='#ebdaa4')
plt.gca().set_facecolor('#1d1d1d')

# Right panel: budget vs Academy Award nominations and wins.
plt.subplot(1,2,2)
sns.scatterplot(x = movies['budget_USD'], y = movies['o_wins_noms'], alpha = 0.3, color = '#d7b546', marker = '*', s = 300)
plt.xlabel('budget (USD)', fontsize=11, color='#ebdaa4')
plt.ylabel('Academy Award nominations and wins', fontsize=11, color='#ebdaa4')
plt.xticks(fontsize=10, color='#ebdaa4')
plt.yticks(fontsize=10, color='#ebdaa4')
for side in ('top', 'right', 'bottom', 'left'):
    plt.gca().spines[side].set_visible(False)
plt.gca().set_facecolor('#1d1d1d');

plt.savefig('../visualizations/openWE_wins_noms_bdgt.png',facecolor=fig.get_facecolor(), edgecolor='none')

In [None]:
# Budget vs US opening weekend with a fitted regression line.
# x / y are passed by keyword; recent seaborn releases no longer accept them positionally.
plt.figure(figsize = (20,8), facecolor = '#1d1d1d', edgecolor = '#1d1d1d')

# Left panel: full data range.
plt.subplot(1,2,1)
sns.regplot(x = movies['budget_USD'], y = movies['US_open_WE'],  color = '#d7b546', scatter_kws={'alpha':0.3, 'marker': "*"})
plt.gca().set_facecolor('#1d1d1d')

# Right panel: same plot zoomed in to 0-200M USD on both axes.
plt.subplot(1,2,2)
sns.regplot(x = movies['budget_USD'], y = movies['US_open_WE'], color = '#d7b546', scatter_kws={'alpha':0.3, 'marker': "*"})
plt.gca().set_facecolor('#1d1d1d')

plt.xlim(0,200000000)
plt.ylim(0,200000000);

In [None]:
# Budget vs Academy Award nominations and wins with a fitted regression line.
# x / y are passed by keyword; recent seaborn releases no longer accept them positionally.
plt.figure(figsize = (20,8), facecolor = '#1d1d1d', edgecolor = '#1d1d1d')

# Left panel: full data range.
plt.subplot(1,2,1)
sns.regplot(x = movies['budget_USD'], y = movies['o_wins_noms'],  color = '#d7b546', scatter_kws={'alpha':0.3, 'marker': "*"})
plt.gca().set_facecolor('#1d1d1d')

# Right panel: same plot zoomed in to budgets of 0-200M USD.
plt.subplot(1,2,2)
sns.regplot(x = movies['budget_USD'], y = movies['o_wins_noms'], color = '#d7b546', scatter_kws={'alpha':0.3, 'marker': "*"})
plt.gca().set_facecolor('#1d1d1d')

plt.xlim(0,200000000)
# The y axis counts nominations and wins, so the 200M upper limit copied from
# the revenue plot would squash every point onto the baseline; only the lower
# bound of 0 is kept.
plt.ylim(bottom = 0);

In [None]:
# Column labels taken as a positional slice (positions 84 to 106) of `movies`.
# NOTE(review): presumably these are the genre indicator columns; the slice depends
# on the column order of the loaded CSV — confirm if the file layout changes.
genres = list(movies.columns[84:107])

In [None]:
# Every unordered pair of genres, including each genre paired with itself,
# in the order they appear in `genres`. Enumerating positions avoids a linear
# `genres.index(...)` lookup per pair and stays correct if a label repeats.
genre_sets = [
    [first_genre, second_genre]
    for position, first_genre in enumerate(genres)
    for second_genre in genres[position:]
]

In [None]:
# Find the genre pair whose movies have the highest mean of the 'Oscars' column.
best_set = None
best_set_score = 0
for gset in genre_sets:
    # Mean 'Oscars' value over movies flagged with both genres, computed once per pair.
    # An empty selection yields NaN, which never compares greater and is therefore skipped.
    gset_score = movies[(movies[gset[0]]==1) & (movies[gset[1]]==1)]['Oscars'].mean()
    if gset_score > best_set_score:
        best_set = gset
        best_set_score = gset_score
print("")
print(best_set, best_set_score)

In [None]:
# Lookup used below to turn tconst identifiers into movie titles.
# The context manager closes the file handle once the object is loaded.
with open('../pickles/tconst_title.p', 'rb') as pickle_file:
    tconst_title = pickle.load(pickle_file)

In [None]:
# Columns of `movies` that are fed into the PCA in the next cell: runtime,
# genre labels, production company labels, Oscar-history and release fields,
# release-month labels and the budget.
# NOTE(review): the genre / company / month columns are presumably 0/1 indicators — confirm.
X_vars_all = ['runtimeMinutes', 'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
       'Reality-TV', 'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 
       'Warner Bros.', 'Universal Pictures', 'Columbia Pictures',
       'Paramount Pictures', 'Dreamworks Pictures', 'Walt Disney Pictures',
       'Miramax', 'Twentieth Century Fox', 'New Line Cinema', 'Focus Features',
       'Fox Searchlight Pictures', 'Touchstone Pictures',
       'Walt Disney Animation Studios', 'BBC Films', 'TriStar Pictures',
       'New Regency Productions', 'Fox 2000 Pictures', 'The Weinstein Company',
       'Annapurna Pictures', 'Castle Rock Entertainment',
        'oscars_post_release', 'US_open_WE', 'prev_Oscars', 'rel_USA', 'prev_Oscar_wins_nm', 'prev_Oscar_noms_nm',
       'rel__aug', 'rel__dec', 'rel__feb', 'rel__jan', 'rel__jul', 'rel__jun',
       'rel__mar', 'rel__may', 'rel__nov', 'rel__oct', 'rel__sep', 'budget_USD']

In [None]:
# Project the selected feature columns onto five principal components and keep
# the scores in a frame indexed like `movies` (component columns are 0..4).
pca = PCA(n_components = 5)
movies_pca = pca.fit_transform(movies[X_vars_all])
movies_pca_df = pd.DataFrame(data = movies_pca, index = movies.index)

# Title for hover text, plus a 0/1 flag marking movies whose 'Oscars' value is 1.
movies_pca_df = movies_pca_df.assign(
    title = movies_pca_df.index.map(tconst_title),
    oscar_movie = movies_pca_df.index.isin(movies[movies['Oscars']==1].index).astype(int),
)

# Marker styling derived from the flag: symbol and size are the same for both
# groups, only the colour differs.
movies_pca_df = movies_pca_df.assign(
    symbol = movies_pca_df['oscar_movie'].map({0: 'star-open', 1: 'star-open'}),
    size = movies_pca_df['oscar_movie'].map({0: 5, 1: 5}),
    color = movies_pca_df['oscar_movie'].map({0: '#fcfaf5', 1: '#cdab48'}),
)

movies_pca_df.head()

In [None]:
# Interactive scatter of principal-component columns 1 and 3, with the movie
# title shown on hover.
f = go.FigureWidget([go.Scatter(x=movies_pca_df[1], y= movies_pca_df[3],
                                mode='markers', hoverinfo = 'text', hovertext = movies_pca_df['title'], fill = 'none')])

# Per-point marker styling comes from the columns prepared on movies_pca_df.
scatter = f.data[0]
scatter.marker.symbol = movies_pca_df['symbol']
scatter.marker.color = movies_pca_df['color']
scatter.marker.opacity = 0.5
scatter.marker.size = movies_pca_df['size']
f.layout.hovermode = 'closest'
f.update_layout(width=1600, height=850, plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='#1d1d1d')
f.update_xaxes(showgrid=False, showticklabels=False, visible = True, zerolinecolor = '#5e4c14', zerolinewidth= 2)
f.update_yaxes(showgrid=False, showticklabels=False, visible = True, zerolinecolor = '#5e4c14', zerolinewidth= 2)

# create our callback function
def update_point(trace, points, selector):
    """Click callback: enlarge every clicked marker to size 40.

    Colours are written back unchanged. Nothing happens when the click
    selected no point.
    """
    if not points.point_inds:
        return
    c = list(scatter.marker.color)
    s = list(scatter.marker.size)
    for i in points.point_inds:
        s[i] = 40
    # Push both arrays to the widget once, after all sizes are updated.
    with f.batch_update():
        scatter.marker.color = c
        scatter.marker.size = s

scatter.on_click(update_point)

f.write_html("../visualizations/all_movies_cloud.html")
f

***
***

**BACKUP**

In [None]:
def titles_by_gender(category):
    """Plot and save the yearly split of credits between female and male principals.

    Rows of the notebook-level ``title_principals`` frame whose ``category``
    equals ``category`` are counted per ``titleYear`` and ``gender``. The
    female and male counts are turned into proportions of their sum and drawn
    as a stacked bar chart (female at the bottom, male on top).

    Parameters
    ----------
    category : str
        Value matched against ``title_principals['category']``; it is also
        used in the title, the legend labels and the output file name.

    Returns
    -------
    None
        The figure is drawn and written to
        ``../visualizations/mvs_yr_gndr_yr_<category>.png``.
    """
    cnt_by_cat_gen_yr = title_principals[title_principals['category']==category].groupby(['titleYear', 'gender'])['tconst'].count().reset_index()
    # fill_value=0: a year in which only one gender appears keeps a count of 0 for
    # the other one, instead of NaN that would blank out both bars for that year.
    cnt_by_cat_gen_yr_p = cnt_by_cat_gen_yr.pivot_table(values = 'tconst', index = 'titleYear', columns = 'gender', fill_value = 0).reset_index()
    credits_total = cnt_by_cat_gen_yr_p['F'] + cnt_by_cat_gen_yr_p['M']
    cnt_by_cat_gen_yr_p['F_prop'] = cnt_by_cat_gen_yr_p['F'] / credits_total
    cnt_by_cat_gen_yr_p['M_prop'] = cnt_by_cat_gen_yr_p['M'] / credits_total

    plt.figure(figsize = (17,7))
    plt.title('Proportion of movies made each year by female and male '+category+'s', fontsize=20, color='#dfc56e', pad=20)
    plt.bar(cnt_by_cat_gen_yr_p['titleYear'], cnt_by_cat_gen_yr_p['F_prop'], color = '#dbbd5a', label = 'female '+category, width = 0.95)
    plt.bar(cnt_by_cat_gen_yr_p['titleYear'], cnt_by_cat_gen_yr_p['M_prop'], bottom= cnt_by_cat_gen_yr_p['F_prop'], color = '#ae8e25',
            label = 'male '+category, width = 0.95)
    # A formatter (rather than labels frozen from the pre-draw tick positions)
    # keeps the percent labels in sync with the final tick locations.
    plt.gca().yaxis.set_major_formatter(mtick.PercentFormatter(xmax = 1, decimals = 0))
    plt.gca().tick_params(axis = 'both', labelsize = 11, labelcolor = '#ebdaa4')
    for side in ('top', 'right', 'bottom', 'left'):
        plt.gca().spines[side].set_visible(False)
    plt.tight_layout();

    plt.savefig('../visualizations/mvs_yr_gndr_yr_'+category+'.png', transparent = True)

In [None]:
# Draw and save the gender-share chart for rows whose category is 'director'.
titles_by_gender('director')