In [83]:
import pandas as pd
import numpy as np

### Download the data and load it to Pandas. 

You can find them [here](https://drive.google.com/file/d/1NY6cmF9Shjw-dD7BD6bNmfcIVz-kQcFR/view?usp=sharing).

In [84]:
titles = pd.read_csv('_data/titles.csv', index_col=None)
titles.head()

Unnamed: 0,title,year
0,The Rising Son,1990
1,The Thousand Plane Raid,1969
2,Crucea de piatra,1993
3,Country,2000
4,Gaiking II,2011


In [85]:
cast = pd.read_csv('_data/cast.csv', index_col=None)
cast['n'].fillna(99, inplace=True)
cast.head()

Unnamed: 0,title,year,name,type,character,n
0,Closet Monster,2015,Buffy #1,actor,Buffy 4,99.0
1,Suuri illusioni,1985,Homo $,actor,Guests,22.0
2,Battle of the Sexes,2017,$hutter,actor,Bobby Riggs Fan,10.0
3,Secret in Their Eyes,2015,$hutter,actor,2002 Dodger Fan,99.0
4,Steve Jobs,2015,$hutter,actor,1988 Opera House Patron,99.0


### Define a year as a "Superman year" whose films feature more Superman characters than Batman. How many years in film history have been Superman years?

In [86]:
cast_superman = cast[cast['character'] == 'Superman'].drop(columns=['name', 'type', 'n'])
cast_superman.head()

Unnamed: 0,title,year,character
46774,Atom Man vs. Superman,1950,Superman
46782,Corky of Gasoline Alley,1951,Superman
46804,Superman,1948,Superman
48385,American Splendor,2003,Superman
50929,My Amnesia Girl,2010,Superman


In [87]:
cast_batman = cast[cast['character'] == 'Batman'].drop(columns=['name', 'type', 'n'])
cast_batman.head()

Unnamed: 0,title,year,character
1030,Batman Fights Dracula,1967,Batman
16133,Batman v Superman: Dawn of Justice,2016,Batman
16159,Justice League,2017,Batman
16175,Suicide Squad,2016,Batman
79363,The Lego Batman Movie,2017,Batman


In [88]:
cast_man_merged = pd.merge(
  cast_superman,
  cast_batman,
  on=['title', 'year'],
  how='outer').rename(columns={
    'character_x' : 'Superman',
    'character_y' : 'Batman'
  })
cast_man_merged

Unnamed: 0,title,year,Superman,Batman
0,Atom Man vs. Superman,1950,Superman,
1,Corky of Gasoline Alley,1951,Superman,
2,Superman,1948,Superman,
3,American Splendor,2003,Superman,Batman
4,My Amnesia Girl,2010,Superman,
...,...,...,...,...
109,Batman: The Movie,1966,,Batman
110,Batman,1943,,Batman
111,The Cannonball Run,1981,,Batman
112,"Fight! Batman, Fight!",1973,,Batman


In [89]:
cast_man_precount = cast_man_merged[['year', 'Superman', 'Batman']].groupby('year').count()
cast_man_count = cast_man_precount[cast_man_precount['Superman'] != cast_man_precount['Batman']]
cast_man_count.head()

Unnamed: 0_level_0,Superman,Batman
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1938,0,1
1940,0,1
1943,0,1
1948,1,0
1949,0,2


In [105]:
cast_man_count.loc[:,'Whatman'] = np.where(cast_man_count['Superman'] > cast_man_count['Batman'], 'Superman', 'Batman')
cast_man_count[cast_man_count['Whatman'] == 'Superman'].head()

Unnamed: 0_level_0,Superman,Batman,Whatman
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1948,1,0,Superman
1950,1,0,Superman
1951,2,0,Superman
1954,5,0,Superman
1955,1,0,Superman


### How many years have been "Batman years", with more Batman characters than Superman characters?

In [104]:
cast_man_count[cast_man_count['Whatman'] == 'Batman'].head()

Unnamed: 0_level_0,Superman,Batman,Whatman
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1938,0,1,Batman
1940,0,1,Batman
1943,0,1,Batman
1949,0,2,Batman
1953,0,2,Batman


### Count the number of actor roles for each year and the number of actress roles for each year over the history of film.

In [106]:
cast[['year', 'type', 'n']].groupby(['year', 'type']).count().head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n
year,type,Unnamed: 2_level_1
1894,actor,2
1894,actress,1
1900,actor,2
1905,actor,1
1906,actor,14


### Find the difference between the number of actor roles and the number of actress roles for each year over the history of film.

In [107]:
cast_actors = cast[cast['type'] == 'actor']
cast_actresses = cast[cast['type'] == 'actress']

In [115]:
cast_actors_count = cast_actors[['year', 'type']].groupby('year').count().rename(columns={'type' : 'actors'})
cast_actresses_count = cast_actresses[['year', 'type']].groupby('year').count().rename(columns={'type' : 'actresses'})

In [121]:
cast_types_count = pd.merge(
  cast_actors_count,
  cast_actresses_count,
  on=['year'],
  how='outer').fillna(0)

In [144]:
cast_types_count.loc[:, 'diff'] = cast_types_count['actors'] - cast_types_count['actresses'].apply(int)
cast_types_count['diff'] = cast_types_count['diff'].apply(int)

In [145]:
cast_types_count[cast_types_count['diff'] == cast_types_count['diff'].max()]

Unnamed: 0_level_0,actors,actresses,diff
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016,112411,65318.0,47093


### What is the proportion of roles that have been 'actor' roles for each year in the history of film.

In [158]:
cast_types_count.loc[:,'ratio'] = np.where(cast_types_count['actors'] / cast_types_count['actresses'] != np.inf, cast_types_count['actors'] / cast_types_count['actresses'], np.nan)  
cast_types_count.head()

Unnamed: 0_level_0,actors,actresses,diff,ratio
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1894,2,1.0,1,2.0
1900,2,0.0,2,
1905,1,0.0,1,
1906,14,3.0,11,4.666667
1907,5,0.0,5,


### What is the proportion of supporting (n=2) roles that have been 'actor' roles for each year in the history of film.

In [179]:
cast_actress_2 = cast[
  (cast['type'] == 'actress') &
  (cast['n'] == 2)]

cast_actor_2 = cast[
  (cast['type'] == 'actor') &
  (cast['n'] == 2)
]

In [188]:
cast_actor_2_count = cast_actor_2[['year', 'type']].groupby(['year']).count().rename(columns={'type' : 'actors'})
cast_actress_2_count = cast_actress_2[['year', 'type']].groupby(['year']).count().rename(columns={'type' : 'actresses'})

In [196]:
cast_type_2 = pd.merge(cast_actor_2_count, cast_actress_2_count, on='year', how='outer').fillna(0)
cast_type_2['total'] = cast_type_2['actors'] + cast_type_2['actresses']
cast_type_2['actor_raio'] = round(cast_type_2['actors'] / cast_type_2['total'] * 100, 2)
cast_type_2['actor_raio']

year
1906     66.67
1907    100.00
1908    100.00
1910     50.00
1911     73.68
         ...  
2016     56.18
2017     57.86
2018     57.75
2019     77.78
2020     33.33
Name: actor_raio, Length: 114, dtype: float64