In [1]:
import pandas as pd

### Download the data and load it to Pandas. 

You can find them [here](https://drive.google.com/file/d/1NY6cmF9Shjw-dD7BD6bNmfcIVz-kQcFR/view?usp=sharing).

In [2]:
titles = pd.read_csv('titles.csv', index_col=None)
titles.head()

Unnamed: 0,title,year
0,The Rising Son,1990
1,The Thousand Plane Raid,1969
2,Crucea de piatra,1993
3,Country,2000
4,Gaiking II,2011


In [3]:
cast = pd.read_csv('cast.csv', index_col=None)
cast.head()

Unnamed: 0,title,year,name,type,character,n
0,Closet Monster,2015,Buffy #1,actor,Buffy 4,
1,Suuri illusioni,1985,Homo $,actor,Guests,22.0
2,Battle of the Sexes,2017,$hutter,actor,Bobby Riggs Fan,10.0
3,Secret in Their Eyes,2015,$hutter,actor,2002 Dodger Fan,
4,Steve Jobs,2015,$hutter,actor,1988 Opera House Patron,


### Define a year as a "Superman year" whose films feature more Superman characters than Batman. How many years in film history have been Superman years?

In [4]:
superman_character_counts = cast.query('title.str.contains("Superman")', engine='python').groupby('year').size()
superman_character_counts

year
1948     65
1950     44
1951     28
1954    130
1960     13
1967      8
1968     14
1973     41
1978    105
1979     10
1980     75
1983     58
1987     80
1997     15
2006    101
2009      5
2011    107
2016    382
2017     10
dtype: int64

In [5]:
batman_character_counts = cast.query('title.str.contains("Batman")', engine='python').groupby('year').size()
batman_character_counts

year
1943     52
1949     40
1964      1
1965      2
1966     85
1967      8
1973      5
1989     68
1991     12
1992     65
1993     24
1995     89
1997    103
2005    144
2010     22
2012     30
2014     33
2016    481
2017     45
dtype: int64

In [6]:
superman_df = pd.DataFrame(data=superman_character_counts, columns=['Superman_Character_Count'])
batman_df = pd.DataFrame(data=batman_character_counts, columns=['Batman_Character_Count'])
combined_df = pd.concat([superman_df, batman_df], axis=1)
combined_df

Unnamed: 0_level_0,Superman_Character_Count,Batman_Character_Count
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1943,,52.0
1948,65.0,
1949,,40.0
1950,44.0,
1951,28.0,
1954,130.0,
1960,13.0,
1964,,1.0
1965,,2.0
1966,,85.0


In [7]:
combined_df = combined_df.fillna(value=0)
num_SM_years = len(combined_df[combined_df['Superman_Character_Count'] > combined_df['Batman_Character_Count']])
print(f'Number of Superman years: {num_SM_years}')

Number of Superman years: 15


### How many years have been "Batman years", with more Batman characters than Superman characters?

In [8]:
num_BM_years = len(combined_df[combined_df['Superman_Character_Count'] < combined_df['Batman_Character_Count']])
print(f'Number of Batman years: {num_BM_years}')

Number of Batman years: 17


### Count the number of actor roles for each year and the number of actress roles for each year over the history of film.

In [9]:
cast['type'].unique()

array(['actor', 'actress'], dtype=object)

In [10]:
cast.describe()

Unnamed: 0,year,n
count,3634467.0,2244068.0
mean,1988.197,16.82359
std,27.86474,31.90017
min,1894.0,1.0
25%,1969.0,5.0
50%,2000.0,10.0
75%,2011.0,21.0
max,2026.0,33613.0


In [11]:
actor_actress_counts = cast.groupby(['type','year']).size().reset_index(name='count')
actor_actress_counts

Unnamed: 0,type,year,count
0,actor,1894,2
1,actor,1900,2
2,actor,1905,1
3,actor,1906,14
4,actor,1907,5
...,...,...,...
235,actress,2019,150
236,actress,2020,48
237,actress,2021,4
238,actress,2022,11


In [12]:
actor_counts = actor_actress_counts[actor_actress_counts['type'] == 'actor'].set_index('year').rename(columns={'count':'actor_count'}).drop(columns='type')
actress_counts = actor_actress_counts[actor_actress_counts['type'] == 'actress'].set_index('year').rename(columns={'count':'actress_count'}).drop(columns='type')
actor_counts

Unnamed: 0_level_0,actor_count
year,Unnamed: 1_level_1
1894,2
1900,2
1905,1
1906,14
1907,5
...,...
2021,9
2022,18
2023,6
2025,2


In [19]:
combined_actor_actress_df = pd.concat([actor_counts, actress_counts], axis=1)
combined_actor_actress_df = combined_actor_actress_df.fillna(value=0).astype(int)
combined_actor_actress_df

Unnamed: 0_level_0,actor_count,actress_count
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1894,2,1
1900,2,0
1905,1,0
1906,14,3
1907,5,0
...,...,...
2021,9,4
2022,18,11
2023,6,5
2025,2,0


### Find the difference between the number of actor roles and the number of actress roles for each year over the history of film.

In [21]:
combined_actor_actress_df['difference'] = combined_actor_actress_df['actor_count'] - combined_actor_actress_df['actress_count']
combined_actor_actress_df

Unnamed: 0_level_0,actor_count,actress_count,difference
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1894,2,1,1
1900,2,0,2
1905,1,0,1
1906,14,3,11
1907,5,0,5
...,...,...,...
2021,9,4,5
2022,18,11,7
2023,6,5,1
2025,2,0,2


### What is the proportion of roles that have been 'actor' roles for each year in the history of film.

In [22]:
combined_actor_actress_df['proportion'] = combined_actor_actress_df['actress_count'] / combined_actor_actress_df['actor_count'] * 100

In [23]:
combined_actor_actress_df[combined_actor_actress_df['actor_count'] == 0]

Unnamed: 0_level_0,actor_count,actress_count,difference,proportion
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [24]:
combined_actor_actress_df['proportion'] = combined_actor_actress_df.apply(lambda row: 100 if row['proportion'] == 0 else row['proportion'], axis = 1)

In [25]:
combined_actor_actress_df

Unnamed: 0_level_0,actor_count,actress_count,difference,proportion
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1894,2,1,1,50.000000
1900,2,0,2,100.000000
1905,1,0,1,100.000000
1906,14,3,11,21.428571
1907,5,0,5,100.000000
...,...,...,...,...
2021,9,4,5,44.444444
2022,18,11,7,61.111111
2023,6,5,1,83.333333
2025,2,0,2,100.000000


### What is the proportion of supporting (n=2) roles that have been 'actor' roles for each year in the history of film.

In [29]:
# All types
cast[cast['type'] == 'actor'].groupby(['type','year']).size()

type   year
actor  1894     2
       1900     2
       1905     1
       1906    14
       1907     5
               ..
       2021     9
       2022    18
       2023     6
       2025     2
       2026     1
Length: 123, dtype: int64

In [28]:
# Type n=2
cast[(cast['type'] == 'actor') & (cast['n'] == 2)].groupby(['type','year']).size()

type   year
actor  1906       2
       1907       1
       1908       2
       1910       2
       1911      14
               ... 
       2016    2337
       2017     883
       2018      41
       2019       7
       2020       1
Length: 114, dtype: int64