In [229]:
import pandas as pd
import numpy as np
import datetime as dt

### Download the data and load it to Pandas. 

You can find them [here](https://drive.google.com/file/d/1NY6cmF9Shjw-dD7BD6bNmfcIVz-kQcFR/view?usp=sharing).

In [230]:
titles = pd.read_csv('data/titles.csv', index_col=None)
titles.head()

Unnamed: 0,title,year
0,The Rising Son,1990
1,The Thousand Plane Raid,1969
2,Crucea de piatra,1993
3,Country,2000
4,Gaiking II,2011


In [231]:
cast = pd.read_csv('data/cast.csv', index_col=None)
cast.head()

Unnamed: 0,title,year,name,type,character,n
0,Closet Monster,2015,Buffy #1,actor,Buffy 4,
1,Suuri illusioni,1985,Homo $,actor,Guests,22.0
2,Battle of the Sexes,2017,$hutter,actor,Bobby Riggs Fan,10.0
3,Secret in Their Eyes,2015,$hutter,actor,2002 Dodger Fan,
4,Steve Jobs,2015,$hutter,actor,1988 Opera House Patron,


### Define a year as a "Superman year" whose films feature more Superman characters than Batman. How many years in film history have been Superman years?

In [232]:
# first, count the number of Superman characters in each year
superman_yearly_count = cast[cast['character'] == 'Superman'].groupby('year')['year'].count().reset_index(name='count')

# second, count the numbero f Batman films in each year
batman_yearly_count =cast[cast['character'] == 'Batman'].groupby('year')['year'].count().reset_index(name='count')

In [233]:
# compare the two tables and create a new column to define which year is a 'Superman year'
supes_bat_comparison = superman_yearly_count.merge(batman_yearly_count, on='year', how='outer', suffixes=('_superman', '_batman')).fillna(0)
supes_bat_comparison.sort_values(by='year')

Unnamed: 0,year,count_superman,count_batman
30,1938,0.0,1.0
31,1940,0.0,1.0
32,1943,0.0,1.0
0,1948,1.0,0.0
33,1949,0.0,2.0
1,1950,1.0,0.0
2,1951,2.0,0.0
34,1953,0.0,2.0
3,1954,5.0,0.0
4,1955,1.0,0.0


In [234]:
# create column called 'superman_year' and determine whether it is one
supes_bat_comparison['superman_year'] = supes_bat_comparison.apply(lambda x: 'Superman year' if x['count_superman'] > x['count_batman'] else '', axis=1)

# sort table by year
supes_bat_comparison.sort_values(by='year')

Unnamed: 0,year,count_superman,count_batman,superman_year
30,1938,0.0,1.0,
31,1940,0.0,1.0,
32,1943,0.0,1.0,
0,1948,1.0,0.0,Superman year
33,1949,0.0,2.0,
1,1950,1.0,0.0,Superman year
2,1951,2.0,0.0,Superman year
34,1953,0.0,2.0,
3,1954,5.0,0.0,Superman year
4,1955,1.0,0.0,Superman year


### How many years have been "Batman years", with more Batman characters than Superman characters?

In [235]:
# create a new column called 'batman_year' that determines whether the year is a batman year
supes_bat_comparison['batman_year'] = supes_bat_comparison.apply(lambda x: 'Batman year' if x['count_batman'] > x['count_superman'] else '', axis=1)

# sort table by year
supes_bat_comparison.sort_values(by='year')

Unnamed: 0,year,count_superman,count_batman,superman_year,batman_year
30,1938,0.0,1.0,,Batman year
31,1940,0.0,1.0,,Batman year
32,1943,0.0,1.0,,Batman year
0,1948,1.0,0.0,Superman year,
33,1949,0.0,2.0,,Batman year
1,1950,1.0,0.0,Superman year,
2,1951,2.0,0.0,Superman year,
34,1953,0.0,2.0,,Batman year
3,1954,5.0,0.0,Superman year,
4,1955,1.0,0.0,Superman year,


### Count the number of actor roles for each year and the number of actress roles for each year over the history of film.

In [236]:
cast['type'].value_counts()

actor      2455335
actress    1179132
Name: type, dtype: int64

In [237]:
actor_table = cast[cast['type'] == 'actor']
actress_table = cast[cast['type'] == 'actress']

In [238]:
actor_counts = actor_table.groupby(by='year').count()['type'].to_frame().reset_index().rename(columns={'type' : 'actor'})
actress_counts = actress_table.groupby(by='year').count()['type'].to_frame().reset_index().rename(columns={'type' : 'actress'})

### Find the difference between the number of actor roles and the number of actress roles for each year over the history of film.

In [239]:
merged_counts = actor_counts.merge(actress_counts, on='year', how='outer').fillna(0).astype('int64')

In [240]:
merged_counts['difference'] = merged_counts.apply(lambda x: x['actor'] - x['actress'], axis=1)
difference_table = merged_counts[merged_counts['difference'] > 0]
difference_table['greater_value'] = difference_table.apply(lambda x: 'actor' if x['actor'] > x['actress'] else 'actress', axis=1)

In [241]:
merged_counts

Unnamed: 0,year,actor,actress,difference
0,1894,2,1,1
1,1900,2,0,2
2,1905,1,0,1
3,1906,14,3,11
4,1907,5,0,5
...,...,...,...,...
118,2021,9,4,5
119,2022,18,11,7
120,2023,6,5,1
121,2025,2,0,2


In [242]:
difference_table

Unnamed: 0,year,actor,actress,difference,greater_value
0,1894,2,1,1,actor
1,1900,2,0,2,actor
2,1905,1,0,1,actor
3,1906,14,3,11,actor
4,1907,5,0,5,actor
...,...,...,...,...,...
118,2021,9,4,5,actor
119,2022,18,11,7,actor
120,2023,6,5,1,actor
121,2025,2,0,2,actor


### What is the proportion of roles that have been 'actor' roles for each year in the history of film.

In [243]:
merged_counts['proportion_of_actors'] = merged_counts.apply(lambda x: x['actor'] / (x['actor'] + x['actress']), axis=1).round(2)

In [244]:
merged_counts

Unnamed: 0,year,actor,actress,difference,proportion_of_actors
0,1894,2,1,1,0.67
1,1900,2,0,2,1.00
2,1905,1,0,1,1.00
3,1906,14,3,11,0.82
4,1907,5,0,5,1.00
...,...,...,...,...,...
118,2021,9,4,5,0.69
119,2022,18,11,7,0.62
120,2023,6,5,1,0.55
121,2025,2,0,2,1.00


### What is the proportion of supporting (n=2) roles that have been 'actor' roles for each year in the history of film.

In [245]:
supporting_cast = cast[cast['n'] == 2]

In [246]:
supporting_actors = supporting_cast[supporting_cast['type'] == 'actor'].groupby(by='year').count()['type'].reset_index().rename(columns={'type' : 'actors'})
supporting_actress = supporting_cast[supporting_cast['type'] == 'actress'].groupby(by='year').count()['type'].reset_index().rename(columns={'type' : 'actresses'})

In [247]:
supporting_counts = supporting_actors.merge(supporting_actress, on='year', how='outer').fillna(0).astype('int64')

In [251]:
supporting_counts['proportion'] = supporting_counts.apply(lambda x: x['actors'] / (x['actors'] + x['actresses']), axis=1).round(2)

In [252]:
supporting_counts

Unnamed: 0,year,actors,actresses,proportion
0,1906,2,1,0.67
1,1907,1,0,1.00
2,1908,2,0,1.00
3,1910,2,2,0.50
4,1911,14,5,0.74
...,...,...,...,...
109,2016,2337,1823,0.56
110,2017,883,643,0.58
111,2018,41,30,0.58
112,2019,7,2,0.78
