In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Download the data

In [4]:
titles = pd.read_csv('titles.csv', index_col=None)
titles.head()

Unnamed: 0,title,year
0,The Rising Son,1990
1,The Thousand Plane Raid,1969
2,Crucea de piatra,1993
3,Country,2000
4,Gaiking II,2011


In [5]:
cast = pd.read_csv('cast.csv', index_col=None)
cast.head()

Unnamed: 0,title,year,name,type,character,n
0,Closet Monster,2015,Buffy #1,actor,Buffy 4,
1,Suuri illusioni,1985,Homo $,actor,Guests,22.0
2,Battle of the Sexes,2017,$hutter,actor,Bobby Riggs Fan,10.0
3,Secret in Their Eyes,2015,$hutter,actor,2002 Dodger Fan,
4,Steve Jobs,2015,$hutter,actor,1988 Opera House Patron,


## What are the ten most common movie names of all time?

In [12]:
# Get the value counts of the top 10 most common movie names

titles.title.value_counts().head(10)

Hamlet                  18
Carmen                  16
Macbeth                 15
Maya                    12
The Outsider            12
The Three Musketeers    11
Temptation              11
Kismet                  11
Freedom                 11
Othello                 11
Name: title, dtype: int64

## Which three years in the 1930s saw the most films released?

In [30]:
# Filter for years between 1930-40, groupby year, aggregate count function on title
# then sort values by title count in descending order.

cast[(cast['year'] >= 1930) & (cast['year'] < 1940)
    ].groupby('year').agg({'title': 'count'}).sort_values('title', ascending=False)[:3]

Unnamed: 0_level_0,title
year,Unnamed: 1_level_1
1937,27462
1936,25755
1938,25206


## Count the number of films that have been released in each decade over the history of cinema.

In [66]:
# First, create a column called 'decade' where 

cast['decade'] = (10 * (cast['year'] // 10)).astype(str) + 's'

# Group by newly created decade column, get total title count

cast.groupby('decade').title.count()

decade
1890s          3
1900s         70
1910s      42604
1920s      71472
1930s     199058
1940s     211842
1950s     211470
1960s     187069
1970s     215147
1980s     283194
1990s     381806
2000s     746571
2010s    1083959
2020s        202
Name: title, dtype: int64

## Count the number of "Hamlet" films made in each decade.

In [71]:
# filter titles for movies called 'Hamlet'. Group by derived 'decade' column.

cast[(cast['title'] == 'Hamlet')].groupby('decade').title.count()

decade
1910s    28
1920s     9
1940s    24
1950s     1
1960s    57
1970s    19
1980s     3
1990s    83
2000s    55
2010s    34
Name: title, dtype: int64

## Count the number of "Rustler" characters in each decade of the history of film.

In [75]:
# Filter for 'Rustler' characters, group by decade. Get character count.

cast[(cast['character'] == 'Rustler')].groupby('decade').character.count()

decade
1920s     6
1930s    84
1940s    18
1950s     8
1960s     2
1970s     9
1980s     1
1990s     4
2000s     2
Name: character, dtype: int64

## Count the number of "Hamlet" characters in each decade.

In [74]:
# filter for characters called 'Hamlet'. Group by derived column 'decade' and get total character count.

cast[(cast['character'] == 'Hamlet')].groupby('decade').character.count()

decade
1910s     6
1920s     1
1930s     6
1940s     1
1950s     3
1960s     7
1970s     8
1980s     2
1990s    11
2000s    19
2010s    28
Name: character, dtype: int64

## What are the 11 most common character names in movie history?

In [83]:
# Get first 11 value counts of character column. Get index and set to list.

cast['character'].value_counts()[:11].index.tolist()

['Himself',
 'Dancer',
 'Extra',
 'Reporter',
 'Doctor',
 'Student',
 'Policeman',
 'Party Guest',
 'Nurse',
 'Bartender',
 'Minor Role']

## Who are the 10 people most often credited as "Herself" in film history?

In [93]:
# Filter for characters called 'Herself'. Get name values, and value counts. Select top 10 and index to list.

Herself = cast[(cast['character'] == 'Herself')].name.value_counts()[:10].index.tolist()
print(f"The 10 most often credited roles as 'Herself' are {Herself}.")

The 10 most often credited roles as 'Herself' are ['Queen Elizabeth II', 'Joyce Brothers', 'Margaret Thatcher', 'Mary Jo Pehl', 'Hillary Clinton', 'Joan Rivers', 'Sumie Sakai', 'Marilyn Monroe', 'Bunny Yeager', 'Caroline Rhea'].


## Who are the 10 people most often credited as "Himself" in film history?

In [95]:
Himself = cast[(cast['character'] == 'Himself')].name.value_counts()[:10].index.tolist()
print(f"The 10 most often credited roles as 'Himself' are {Himself}.")

The 10 most often credited roles as 'Himself' are ['Adolf Hitler', 'Richard Nixon', 'Ronald Reagan', 'John F. Kennedy', 'Winston Churchill', 'George W. Bush', 'Benito Mussolini', 'Ron Jeremy', 'Martin Luther King', 'Franklin D. Roosevelt'].


## Which actors or actresses appeared in the most movies in the year 1945?

In [107]:
cast[(cast['year'] == 1945)].groupby('name').agg({'type':'count'}).sort_values('type', ascending=False).head(5)

Unnamed: 0_level_0,type
name,Unnamed: 1_level_1
Emmett Vogan,39
Sam (II) Harris,32
Bess Flowers,29
Harold Miller,29
Nolan Leary,27


## Which actors or actresses appeared in the most movies in the year 1985?

In [110]:
cast[(cast['year'] == 1985)].groupby('name').agg({'type':'count'}).sort_values('type', ascending=False).head(5)

Unnamed: 0_level_0,type
name,Unnamed: 1_level_1
Mammootty,22
Shakti Kapoor,21
Sukumari,20
Raj Babbar,15
Lou Scheimer,15


## How many roles "Mammootty" has played in each year of his career.

In [118]:
cast[(cast['name'] == 'Mammootty')].groupby('year').agg({'character':'count'})

Unnamed: 0_level_0,character
year,Unnamed: 1_level_1
1971,1
1980,2
1981,3
1982,7
1983,18
1984,16
1985,22
1986,18
1987,9
1988,7


## What are the 10 most frequent roles that start with the phrase "Patron in"?

In [126]:
cast[(cast['character'].str.contains('Patron in'))].groupby('character').agg({'year':'count'}).sort_values('year',ascending=False).head(10)

Unnamed: 0_level_0,year
character,Unnamed: 1_level_1
Patron in Frisky Rabbit,16
Patron in the Coffee House,9
Patron in Chinese Restaurant,9
Patron in Billiard Parlor,5
Patron in Bar,4
Patron in restaurant,4
Patron in Club,3
Patron in Restaurant,3
Patron in cabaret,3
Patron in Audience,2


## What are the 10 most frequent roles that start with the word "Science"?

In [127]:
cast[(cast['character'].str.contains('Science'))].groupby('character').agg({'year':'count'}).sort_values('year',ascending=False).head(10)

Unnamed: 0_level_0,year
character,Unnamed: 1_level_1
Science Teacher,60
The Science Fair Contestants & Families,9
Science Fair Student,9
Science Student,9
Science Fair Judge,6
Science Club Member,5
Science Kid,5
Science Promo Cadet,5
Science Reporter,5
Science,4


## Count number of n-values of the roles that Judi Dench has played over her career.

In [130]:
cast[(cast['name'] == 'Judi Dench')].n.count()

48

## Count number of n-values of Cary Grant's roles through his career.

In [131]:
cast[(cast['name'] == 'Cary Grant')].n.count()

75

## Count number of n-value of the roles that Sidney Poitier has acted over the years.

In [132]:
cast[(cast['name'] == 'Sidney Poitier')].n.count()

42

## How many leading (n=1) roles were available to actors, and how many to actresses, in the 1950s?

In [146]:
actors = cast[(cast['type'] == 'actor') & (cast['n'] == 1.0)].groupby('decade').title.count().iloc[5]
actresses = cast[(cast['type'] == 'actress') & (cast['n'] == 1.0)].groupby('decade').title.count().iloc[5]

print(f'There were {actors} leading roles for actors in the 1950s and {actresses} leading roles for actresses in the 1950s.')

There were 6616 leading roles for actors in the 1950s and 2965 leading roles for actresses in the 1950s.


## How many supporting (n=2) roles were available to actors, and how many to actresses, in the 1950s?

In [None]:
actors_2 = cast[(cast['type'] == 'actor') & (cast['n'] == 1.0)].groupby('decade').title.count().iloc[5]
actresses_2 = cast[(cast['type'] == 'actress') & (cast['n'] == 1.0)].groupby('decade').title.count().iloc[5]

print(f'There were {actors} leading roles for actors in the 1950s and {actresses} leading roles for actresses in the 1950s.')