In [2]:
import pandas as pd

### Download the data and load it to Pandas. 

You can find them [here](https://drive.google.com/file/d/1NY6cmF9Shjw-dD7BD6bNmfcIVz-kQcFR/view?usp=sharing).

In [5]:
#Import titles
titles = pd.read_csv('imdb_pandas/titles.csv', index_col=None)
titles.head()

Unnamed: 0,title,year
0,The Rising Son,1990
1,The Thousand Plane Raid,1969
2,Crucea de piatra,1993
3,Country,2000
4,Gaiking II,2011


In [4]:
#Import cast
cast = pd.read_csv('imdb_pandas/cast.csv', index_col=None)
cast.head()

Unnamed: 0,title,year,name,type,character,n
0,Closet Monster,2015,Buffy #1,actor,Buffy 4,
1,Suuri illusioni,1985,Homo $,actor,Guests,22.0
2,Battle of the Sexes,2017,$hutter,actor,Bobby Riggs Fan,10.0
3,Secret in Their Eyes,2015,$hutter,actor,2002 Dodger Fan,
4,Steve Jobs,2015,$hutter,actor,1988 Opera House Patron,


In [30]:
#Add columns of 1 to cast
cast['count'] = 1

### Define a year as a "Superman year" whose films feature more Superman characters than Batman. How many years in film history have been Superman years?

In [40]:
#Extract all titles containing superman
superman = cast.loc[(cast['title'].str.contains('Superman', regex=False))]

#Apply groupby to get number of characters per year
superman = superman.groupby(['year'])['character'].count().reset_index()

#Rename column
superman = superman.rename(columns={'character': 'Superman Char No'})


In [41]:
#Extract all titles containing batman
batman = cast.loc[(cast['title'].str.contains('Batman', regex=False))]

#Apply groupby to get number of characters per year
batman = batman.groupby(['year'])['character'].count().reset_index()

#Rename columns
batman = batman.rename(columns={'character': 'Batman Char No'})

In [46]:
#Merge superman and batman and fill nulls with 0
heros = pd.concat([superman,batman], axis = 0).fillna(0)
heros.head()

Unnamed: 0,year,Superman Char No,Batman Char No
0,1948,65.0,0.0
1,1950,44.0,0.0
2,1951,28.0,0.0
3,1954,130.0,0.0
4,1960,13.0,0.0


In [48]:
#Calculate difference between number of characters
heros['difference'] = heros['Superman Char No']-heros['Batman Char No']

In [49]:
#Extract positive difference
heros.loc[heros.difference >=0]

Unnamed: 0,year,Superman Char No,Batman Char No,difference
0,1948,65.0,0.0,65.0
1,1950,44.0,0.0,44.0
2,1951,28.0,0.0,28.0
3,1954,130.0,0.0,130.0
4,1960,13.0,0.0,13.0
5,1967,8.0,0.0,8.0
6,1968,14.0,0.0,14.0
7,1973,41.0,0.0,41.0
8,1978,105.0,0.0,105.0
9,1979,10.0,0.0,10.0


### How many years have been "Batman years", with more Batman characters than Superman characters?

In [50]:
#Extract negative difference
heros.loc[heros.difference <0]

Unnamed: 0,year,Superman Char No,Batman Char No,difference
0,1943,0.0,52.0,-52.0
1,1949,0.0,40.0,-40.0
2,1964,0.0,1.0,-1.0
3,1965,0.0,2.0,-2.0
4,1966,0.0,85.0,-85.0
5,1967,0.0,8.0,-8.0
6,1973,0.0,5.0,-5.0
7,1989,0.0,68.0,-68.0
8,1991,0.0,12.0,-12.0
9,1992,0.0,65.0,-65.0


### Count the number of actor roles for each year and the number of actress roles for each year over the history of film.

In [13]:
#Find all rows with actors
actor = cast.loc[cast['type'] == 'actor']

#Find number of actors per year
actor.year.value_counts().reset_index()

Unnamed: 0,index,year
0,2016,112411
1,2015,95679
2,2014,94877
3,2013,90752
4,2012,87314
...,...,...
118,1900,2
119,1894,2
120,2025,2
121,1905,1


In [12]:
#Find all rows with actress
actress = cast.loc[cast['type'] == 'actress']

#Find number of actresses per year
actress.year.value_counts().reset_index()

Unnamed: 0,index,year
0,2016,65318
1,2015,54811
2,2014,53516
3,2013,48718
4,2012,47179
...,...,...
112,2022,11
113,2023,5
114,2021,4
115,1906,3


### Find the difference between the number of actor roles and the number of actress roles for each year over the history of film.

In [20]:
#Create Dataframes from above
A = actor.year.value_counts().reset_index()
B = actress.year.value_counts().reset_index()

#rename year colums
A = A.rename(columns={'year':'actor'})
B = B.rename(columns={'year':'actress'})

In [21]:
#Merge A and B
merged_act = pd.merge(A,B, on='index', how='inner')

merged_act.head(2)

Unnamed: 0,index,actor,actress
0,2016,112411,65318
1,2015,95679,54811


In [22]:
#Find difference between actor and actress for each year
merged_act.actor - merged_act.actress

0      47093
1      40868
2      41361
3      42034
4      40135
       ...  
112        7
113       11
114        5
115        1
116        1
Length: 117, dtype: int64

### What is the proportion of roles that have been 'actor' roles for each year in the history of film.

In [23]:
#Find proportion of actors and actresses in type column
cast.type.value_counts(normalize=True)*100

actor      67.556949
actress    32.443051
Name: type, dtype: float64

In [26]:
#apply groupby for year and find proportion of actors
P_A = cast.groupby('year').apply(lambda x: ((x[x['type'] == 'actor']['count']/x['count'].sum())*100)).reset_index().rename(columns={'count':'Proportion_Actor'})
P_A = P_A[['year', 'Proportion_Actor']]
P_A

Unnamed: 0,year,Proportion_Actor
0,1894,33.333333
1,1894,33.333333
2,1900,50.000000
3,1900,50.000000
4,1905,100.000000
...,...,...
2455330,2023,9.090909
2455331,2023,9.090909
2455332,2025,50.000000
2455333,2025,50.000000


### What is the proportion of supporting (n=2) roles that have been 'actor' roles for each year in the history of film.

In [28]:
#Extract only rows with n=2
support = cast.loc[cast['n'] == 2.0]
support.head()

Unnamed: 0,title,year,name,type,character,n,count
102,Big Apple,2002,Angel 11:11,actor,Angel,2.0,1
126,Asad wa arbaa qutat,2007,4 Cats,actor,Themselves,2.0,1
229,Flutter Green: The Final Chapter,2017,Sergey A.,actor,Demichev,2.0,1
266,Suli,2016,Pragathi A.S.,actor,Shabana,2.0,1
312,Sarrainodu,2016,Aadhi,actor,Vairam Dhanush,2.0,1


In [29]:
#apply groupby for year and find proportion of actors
S_A = support.groupby('year').apply(lambda x: ((x[x['type'] == 'actor']['count']/x['count'].sum())*100)).reset_index().rename(columns={'count':'Proportion_Actor'})
S_A = S_A[['year', 'Proportion_Actor']]
S_A

Unnamed: 0,year,Proportion_Actor
0,1906,33.333333
1,1906,33.333333
2,1907,100.000000
3,1908,50.000000
4,1908,50.000000
...,...,...
74896,2019,11.111111
74897,2019,11.111111
74898,2019,11.111111
74899,2019,11.111111
