# Estimating how active actors will be

In [3]:
! wget "https://datasets.imdbws.com/title.principals.tsv.gz"
! wget "https://datasets.imdbws.com/title.basics.tsv.gz"
! wget "https://datasets.imdbws.com/name.basics.tsv.gz"

--2019-04-02 03:15:03--  https://datasets.imdbws.com/title.principals.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 54.230.75.166, 54.230.75.80, 54.230.75.240, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|54.230.75.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 280248846 (267M) [binary/octet-stream]
Saving to: ‘title.principals.tsv.gz’


2019-04-02 03:15:05 (150 MB/s) - ‘title.principals.tsv.gz’ saved [280248846/280248846]

--2019-04-02 03:15:06--  https://datasets.imdbws.com/title.basics.tsv.gz
Resolving datasets.imdbws.com (datasets.imdbws.com)... 54.230.75.166, 54.230.75.80, 54.230.75.240, ...
Connecting to datasets.imdbws.com (datasets.imdbws.com)|54.230.75.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 103185075 (98M) [binary/octet-stream]
Saving to: ‘title.basics.tsv.gz’


2019-04-02 03:15:07 (147 MB/s) - ‘title.basics.tsv.gz’ saved [103185075/103185075]

--2019-04-02 03:15:08--  https://

In [4]:
! gunzip title.principals.tsv.gz
! gunzip title.basics.tsv.gz
! gunzip name.basics.tsv.gz

gzip: title.principals.tsv already exists; do you wish to overwrite (y or n)? ^C
gzip: title.basics.tsv already exists; do you wish to overwrite (y or n)? ^C
gzip: name.basics.tsv already exists; do you wish to overwrite (y or n)? ^C


In [0]:
import pandas as pd
import numpy as np

In [0]:
cast = pd.read_csv('title.principals.tsv', sep='\t', na_values = "\\N", keep_default_na = False, 
                   usecols = ['tconst','ordering','nconst','category','job'],
                   dtype = {'tconst': str, 'ordering': 'Int64', 'nconst': str, 'category': str, 'job': str})
titles = pd.read_csv('title.basics.tsv', sep='\t', na_values = "\\N", keep_default_na = False,
                     usecols = ['tconst','primaryTitle','startYear'],
                    dtype = {'tconst': str, 'primaryTitle': str, 'startYear': 'Int64'})
names = pd.read_csv('name.basics.tsv', sep='\t', na_values = "\\N", keep_default_na = False,
                    usecols = ['nconst','primaryName','birthYear','deathYear'],
                   dtype = {'nconst': str, 'primaryName': str, 'birthYear': 'Int64', 'deathYear': 'Int64'})

In [9]:
cast.head()

Unnamed: 0,tconst,ordering,nconst,category,job
0,tt0000001,1,nm1588970,self,
1,tt0000001,2,nm0005690,director,
2,tt0000001,3,nm0374658,cinematographer,director of photography
3,tt0000002,1,nm0721526,director,
4,tt0000002,2,nm1335271,composer,


In [10]:
titles.head()

Unnamed: 0,tconst,primaryTitle,startYear
0,tt0000001,Carmencita,1894
1,tt0000002,Le clown et ses chiens,1892
2,tt0000003,Pauvre Pierrot,1892
3,tt0000004,Un bon bock,1892
4,tt0000005,Blacksmith Scene,1893


In [11]:
names.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear
0,nm0000001,Fred Astaire,1899,1987.0
1,nm0000002,Lauren Bacall,1924,2014.0
2,nm0000003,Brigitte Bardot,1934,
3,nm0000004,John Belushi,1949,1982.0
4,nm0000005,Ingmar Bergman,1918,2007.0


In [0]:
actors = ["Lauren Bacall", "Cary Grant"]

In [0]:
names_actors = names[names.primaryName.isin(actors)].head(len(actors))

In [0]:
names_actors

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
1,nm0000002,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0038355,tt0037382,tt0117057,tt0071877"
25,nm0000026,Cary Grant,1904.0,1986.0,"actor,soundtrack,producer","tt0056923,tt0036613,tt0048728,tt0053125"


In [0]:
cast_actors = pd.merge(names_actors, cast, on = 'nconst')

In [0]:
movie_actors = pd.merge(cast_actors, titles, on = 'tconst')

In [0]:
movie_list = movie_actors[['primaryName','birthYear','deathYear','category','primaryTitle','startYear']]

In [0]:
movie_list = movie_list[movie_list.category.isin(['actor','actress'])]

In [0]:
movie_list

Unnamed: 0,primaryName,birthYear,deathYear,category,primaryTitle,startYear
0,Lauren Bacall,1924.0,2014.0,actress,To Have and Have Not,1944.0
1,Lauren Bacall,1924.0,2014.0,actress,Confidential Agent,1945.0
2,Lauren Bacall,1924.0,2014.0,actress,The Big Sleep,1946.0
3,Lauren Bacall,1924.0,2014.0,actress,Dark Passage,1947.0
4,Lauren Bacall,1924.0,2014.0,actress,Key Largo,1948.0
5,Lauren Bacall,1924.0,2014.0,actress,Bright Leaf,1950.0
6,Lauren Bacall,1924.0,2014.0,actress,Young Man with a Horn,1950.0
7,Lauren Bacall,1924.0,2014.0,actress,How to Marry a Millionaire,1953.0
8,Lauren Bacall,1924.0,2014.0,actress,Woman's World,1954.0
9,Lauren Bacall,1924.0,2014.0,actress,Blood Alley,1955.0


In [0]:
movie_list.loc["startYear"] = movie_list.startYear.astype(int) 

In [0]:
startYear = 1950
endYear = 1960

In [0]:
movie_list_observed = movie_list[movie_list.startYear.isin(range(startYear,endYear))]
movie_list_unobserved = movie_list[movie_list.startYear >= endYear]

In [0]:
movie_counts = movie_list_observed.groupby(['primaryName','startYear'])['primaryTitle'].count().reset_index()

In [0]:
movie_counts.groupby('primaryName')['primaryTitle'].sum()

primaryName
Cary Grant       13
Lauren Bacall    13
Name: primaryTitle, dtype: int64