In [1]:
import pandas as pd

### Download the data and load it to Pandas. 

You can find them [here](https://drive.google.com/file/d/1NY6cmF9Shjw-dD7BD6bNmfcIVz-kQcFR/view?usp=sharing).

In [2]:
titles = pd.read_csv('data/titles.csv', index_col=None)
titles.head()

Unnamed: 0,title,year
0,The Rising Son,1990
1,The Thousand Plane Raid,1969
2,Crucea de piatra,1993
3,Country,2000
4,Gaiking II,2011


In [3]:
cast = pd.read_csv('data/cast.csv', index_col=None)
cast.head()

Unnamed: 0,title,year,name,type,character,n
0,Closet Monster,2015,Buffy #1,actor,Buffy 4,
1,Suuri illusioni,1985,Homo $,actor,Guests,22.0
2,Battle of the Sexes,2017,$hutter,actor,Bobby Riggs Fan,10.0
3,Secret in Their Eyes,2015,$hutter,actor,2002 Dodger Fan,
4,Steve Jobs,2015,$hutter,actor,1988 Opera House Patron,


### Define a year as a "Superman year" whose films feature more Superman characters than Batman. How many years in film history have been Superman years?

In [34]:
superman_movies = cast.loc[cast['title'].str.contains('Superman')].groupby('year').character.count().to_frame('SM_Characters')
batman_movies = cast.loc[cast['title'].str.contains('Batman')].groupby('year').character.count().to_frame('BT_Characters')
superman_batman_movies = superman_movies.merge(batman_movies, on = 'year', how='outer').fillna(0)
# superman_batman_movies = superman_batman_movies.fill(0)

In [35]:
def supermanyear(df):
    if df['SM_Characters'] > df['BT_Characters']:
        return 'Superman year'
    if df['BT_Characters'] > df['SM_Characters']:
        return'Batman year'
    else: return 'Tie'

superman_batman_movies['hero_year'] = superman_batman_movies.apply(supermanyear, axis=1)

In [43]:
superman_year = superman_batman_movies.loc[superman_batman_movies['hero_year']=='Superman year'].hero_year.count()
print(f' There are {superman_year} Superman years in History')

 There are 15 Superman years in History


### How many years have been "Batman years", with more Batman characters than Superman characters?

In [45]:
batman_year = superman_batman_movies.loc[superman_batman_movies['hero_year']=='Batman year'].hero_year.count()
print(f' There are {batman_year} Batman years in History')

 There are 17 Batman years in History


### Count the number of actor roles for each year and the number of actress roles for each year over the history of film.

In [50]:
gender_roles_year = cast.loc[(cast['type'] == 'actor') | (cast['type'] == 'actress')].groupby(['year', 'type']).type.count().to_frame('Total-roles')

In [51]:
gender_roles_year

Unnamed: 0_level_0,Unnamed: 1_level_0,Total-roles
year,type,Unnamed: 2_level_1
1894,actor,2
1894,actress,1
1900,actor,2
1905,actor,1
1906,actor,14
...,...,...
2022,actress,11
2023,actor,6
2023,actress,5
2025,actor,2


### Find the difference between the number of actor roles and the number of actress roles for each year over the history of film.

In [81]:
actress_count = cast.loc[cast['type'] == 'actress'].groupby('year').type.count().to_frame('total_actress')
actor_count = cast.loc[cast['type'] == 'actor'].groupby('year').type.count().to_frame('total_actor')
roles_per_gender = actress_count.merge(actor_count, on = 'year', how = 'outer').fillna(0)

In [82]:
def differencegender(df):
    return df['total_actor'] - df['total_actress']
roles_per_gender['geder_differ'] = roles_per_gender.apply(differencegender, axis=1)
roles_per_gender

Unnamed: 0_level_0,total_actress,total_actor,geder_differ
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1894,1.0,2,1.0
1906,3.0,14,11.0
1908,13.0,29,16.0
1910,24.0,44,20.0
1911,66.0,136,70.0
...,...,...,...
1905,0.0,1,1.0
1907,0.0,5,5.0
1909,0.0,3,3.0
2025,0.0,2,2.0


### What is the proportion of roles that have been 'actor' roles for each year in the history of film.

In [83]:
def proportionroles(df):
    x = (df['total_actor'] / (df['total_actor'] +df['total_actress']))*100
    return round(x)

roles_per_gender['actor_proportion'] = roles_per_gender.apply(proportionroles, axis=1)

In [84]:
roles_per_gender

Unnamed: 0_level_0,total_actress,total_actor,geder_differ,actor_proportion
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1894,1.0,2,1.0,67.0
1906,3.0,14,11.0,82.0
1908,13.0,29,16.0,69.0
1910,24.0,44,20.0,65.0
1911,66.0,136,70.0,67.0
...,...,...,...,...
1905,0.0,1,1.0,100.0
1907,0.0,5,5.0,100.0
1909,0.0,3,3.0,100.0
2025,0.0,2,2.0,100.0


### What is the proportion of supporting (n=2) roles that have been 'actor' roles for each year in the history of film.

In [85]:
total_actor_supporting = cast.loc[(cast['type'] == 'actor') & (cast['n'] == 2)].groupby('year').type.count().to_frame('total_supp_actor')
roles_per_gender = roles_per_gender.merge(total_actor_supporting, on = 'year', how = 'outer').fillna(0)
roles_per_gender

Unnamed: 0_level_0,total_actress,total_actor,geder_differ,actor_proportion,total_supp_actor
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1894,1.0,2,1.0,67.0,0.0
1906,3.0,14,11.0,82.0,2.0
1908,13.0,29,16.0,69.0,2.0
1910,24.0,44,20.0,65.0,2.0
1911,66.0,136,70.0,67.0,14.0
...,...,...,...,...,...
1905,0.0,1,1.0,100.0,0.0
1907,0.0,5,5.0,100.0,1.0
1909,0.0,3,3.0,100.0,0.0
2025,0.0,2,2.0,100.0,0.0


In [86]:
def proportionactorsupp(df):
    y = (df['total_supp_actor'] / (df['total_actor'] +df['total_actress']))*100
    return round(y)

roles_per_gender['support_actor_proportion'] = roles_per_gender.apply(proportionactorsupp, axis=1)

roles_per_gender

Unnamed: 0_level_0,total_actress,total_actor,geder_differ,actor_proportion,total_supp_actor,support_actor_proportion
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1894,1.0,2,1.0,67.0,0.0,0.0
1906,3.0,14,11.0,82.0,2.0,12.0
1908,13.0,29,16.0,69.0,2.0,5.0
1910,24.0,44,20.0,65.0,2.0,3.0
1911,66.0,136,70.0,67.0,14.0,7.0
...,...,...,...,...,...,...
1905,0.0,1,1.0,100.0,0.0,0.0
1907,0.0,5,5.0,100.0,1.0,20.0
1909,0.0,3,3.0,100.0,0.0,0.0
2025,0.0,2,2.0,100.0,0.0,0.0
