In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import pickle

#### _This notebook was used for cleaning and enriching the dataset describing each professional identifiable through the `nconst` key._

***
### Step 1: Basic cleaning

**NOTE:** <br>
Due to the size of the raw data, it is not included in the repo. The `name.basics.tsv` can be obtained from https://www.imdb.com/interfaces/, within the limits of the T&Cs listed under the link.

In [2]:
# Read the raw, tab-separated IMDb name table into a DataFrame.
name_basics = pd.read_csv('../data/name.basics.tsv', sep='\t')

In [3]:
# Preview the first ten rows of the raw table.
name_basics.iloc[:10]

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0072308,tt0031983,tt0050419,tt0053137"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0038355,tt0071877,tt0117057,tt0037382"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0054452,tt0049189,tt0057345,tt0059956"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0078723,tt0072562,tt0080455,tt0077975"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050976,tt0083922,tt0060827,tt0050986"
5,nm0000006,Ingrid Bergman,1915,1982,"actress,soundtrack,producer","tt0038787,tt0038109,tt0077711,tt0034583"
6,nm0000007,Humphrey Bogart,1899,1957,"actor,soundtrack,producer","tt0034583,tt0043265,tt0040897,tt0033870"
7,nm0000008,Marlon Brando,1924,2004,"actor,soundtrack,director","tt0068646,tt0047296,tt0078788,tt0070849"
8,nm0000009,Richard Burton,1925,1984,"actor,soundtrack,producer","tt0057877,tt0059749,tt0087803,tt0061184"
9,nm0000010,James Cagney,1899,1986,"actor,soundtrack,director","tt0029870,tt0031867,tt0042041,tt0035575"


In [4]:
# (rows, columns) of the table as loaded, before any cleaning or enrichment.
name_basics.shape

(10653189, 6)

In [5]:
# Convert `birthYear` to numbers, mapping the '\N' missing-value placeholder to NaN.
# `where` blanks out the placeholder and `pd.to_numeric` parses the rest in one
# vectorized pass; unlike a per-row `int(x)`, this does not raise when the cell is
# re-run on the already-converted (float/NaN) column.
name_basics['birthYear'] = pd.to_numeric(
    name_basics['birthYear'].where(lambda year: year != '\\N')
)

In [6]:
# Convert `deathYear` to numbers, mapping the '\N' missing-value placeholder to NaN.
# `where` blanks out the placeholder and `pd.to_numeric` parses the rest in one
# vectorized pass; unlike a per-row `int(x)`, this does not raise when the cell is
# re-run on the already-converted (float/NaN) column.
name_basics['deathYear'] = pd.to_numeric(
    name_basics['deathYear'].where(lambda year: year != '\\N')
)

In [7]:
# Check the column types after the year conversion above
# (year columns hold NaN for missing values, hence a float dtype).
name_basics.dtypes

nconst                object
primaryName           object
birthYear            float64
deathYear            float64
primaryProfession     object
knownForTitles        object
dtype: object

***
### Step 2: Data engineering

#### 2.1. Extracting gender information from `primaryName` (for all names) and `primaryProfession` (only for "actor"/"actress")

2.1.1. Extracting first name from `primaryName`

In [8]:
# First name = everything before the first space in `primaryName`.
# The vectorized `.str` accessor gives the same token as `x.split(" ")[0]` for
# every string, but propagates a missing name as NaN instead of raising
# AttributeError on a non-string value.
name_basics['first_name'] = name_basics['primaryName'].str.split(" ").str[0]

2.1.2. Evaluating columns with multiple values

In [9]:
# Largest number of commas in any `primaryProfession` entry (missing values are
# stringified first, so they count as zero commas).
name_basics['primaryProfession'].astype(str).str.count(",").max() # there is a max of 3 professions per name

2

In [10]:
# Largest number of commas in any `knownForTitles` entry (missing values are
# stringified first, so they count as zero commas).
name_basics['knownForTitles'].astype(str).str.count(",").max() # there is a max of 7 titles per name

6

2.1.3. Extracting unique professions from column `primaryProfession` for future use in gender determination ("actor" vs "actress")

In [11]:
# Flatten every comma-separated `primaryProfession` entry into one list of labels.
# Iterating the column values directly avoids one label-based lookup per row and
# splits each string exactly once (the index loop re-split it for every label).
# Non-string entries (missing values) are skipped.
professions = [
    profession
    for entry in name_basics['primaryProfession']
    if isinstance(entry, str)
    for profession in entry.split(",")
]

In [12]:
# Tally how many times each profession label occurs.
ctr_prof = Counter()
ctr_prof.update(professions)
ctr_prof

Counter({'soundtrack': 136782,
         'actor': 2453911,
         'miscellaneous': 1085561,
         'actress': 1487353,
         'music_department': 200268,
         'writer': 698775,
         'director': 569304,
         'producer': 922375,
         'make_up_department': 180459,
         'composer': 256861,
         'assistant_director': 206944,
         'camera_department': 631929,
         'editor': 279750,
         'cinematographer': 298429,
         'casting_director': 25881,
         'script_department': 64208,
         'art_director': 75587,
         'stunts': 72052,
         'editorial_department': 150806,
         'costume_department': 122174,
         'animation_department': 165753,
         'art_department': 368408,
         'executive': 25623,
         'special_effects': 56827,
         'production_designer': 66670,
         'production_manager': 173603,
         'sound_department': 316864,
         'talent_agent': 12063,
         'casting_department': 45290,
         'co

2.1.4. Assigning gender to professionals identified as "actor" or  "actress"

In [13]:
def _gender_from_profession(profession_entry):
    """Return 'F' for an entry containing "actress", 'M' for "actor", else 'tbd'."""
    text = str(profession_entry)
    if 'actress' in text:
        return 'F'
    if 'actor' in text:
        return 'M'
    return 'tbd'

name_basics['gender'] = name_basics['primaryProfession'].map(_gender_from_profession)

In [14]:
# Share of each gender label after the profession-based pass.
gender_shares = name_basics['gender'].value_counts(normalize=True)
gender_shares

tbd    0.630057
M      0.230327
F      0.139616
Name: gender, dtype: float64

2.1.5. Assigning gender to professionals not identified as "actor" or "actress", using the `first_name` column

In [15]:
# loading the gender classifier model using last one, two, three, and four letters of first name as determinants of gender (see notebook "Gender from name")
# NOTE: unpickling can execute arbitrary code -- only load a pickle file that comes from a trusted source.

# the context manager closes the file handle as soon as the model is loaded
with open('../pickles/gender_from_name.p', 'rb') as classifier_file:
    gender_classifier = pickle.load(classifier_file)

In [16]:
def gender_indicators(name):
    """Build the suffix features of a first name used for gender classification.

    Parameters
    ----------
    name : str
        A first name. Names shorter than four characters yield the whole
        name for the longer suffixes; an empty name yields empty strings.

    Returns
    -------
    dict
        Keys 'last', 'last_two', 'last_three' and 'last_four', holding the
        last one, two, three and four characters of `name`.
    """
    # Slicing (`name[-1:]`) rather than indexing (`name[-1]`) returns the same
    # character for non-empty names but does not raise IndexError on ''.
    return {'last': name[-1:], 'last_two': name[-2:], 'last_three': name[-3:], 'last_four': name[-4:]}

In [17]:
# applying `gender_classifier` to those names in the dataset whose gender is not identifiable from `primaryProfession`
# Only the rows still marked 'tbd' are selected and classified; rows that already
# carry a gender are left untouched. This replaces a row-wise `apply(axis=1)` over
# the whole frame.

needs_classifier = name_basics['gender'] == 'tbd'
name_basics.loc[needs_classifier, 'gender'] = name_basics.loc[needs_classifier, 'first_name'].map(
    lambda first_name: gender_classifier.classify(gender_indicators(first_name))
)

In [18]:
# standardizing `primaryProfession` by replacing gender-specific "actress" label with a general "actor" (all other profession labels are gender-neutral)
# the pattern is a plain substring, so it is matched literally rather than as a regular expression

name_basics['primaryProfession'] = name_basics['primaryProfession'].str.replace(
    pat='actress', repl='actor', regex=False
)

In [19]:
# Use the `nconst` identifier as the index.
# The guard makes the cell safe to re-run: once `nconst` is the index it is no
# longer a column, and calling `set_index('nconst')` again would raise KeyError.
if name_basics.index.name != 'nconst':
    name_basics = name_basics.set_index('nconst')
name_basics.head()

Unnamed: 0_level_0,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,first_name,gender
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
nm0000001,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0072308,tt0031983,tt0050419,tt0053137",Fred,M
nm0000002,Lauren Bacall,1924.0,2014.0,"actor,soundtrack","tt0038355,tt0071877,tt0117057,tt0037382",Lauren,F
nm0000003,Brigitte Bardot,1934.0,,"actor,soundtrack,music_department","tt0054452,tt0049189,tt0057345,tt0059956",Brigitte,F
nm0000004,John Belushi,1949.0,1982.0,"actor,soundtrack,writer","tt0078723,tt0072562,tt0080455,tt0077975",John,M
nm0000005,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0050976,tt0083922,tt0060827,tt0050986",Ingmar,M


In [20]:
# Persist the cleaned and enriched table; the index column is written under the header `nconst`.
name_basics.to_csv(path_or_buf='../data/name_basics_ce.csv', index_label='nconst')

***
### Step 3: Exporting useful information for easy retrieval during analysis of other datasets

Creating a dictionary mapping the unique `nconst` identifier to an individual's name and saving it as a pickle:

In [22]:
# Pair each index label (`nconst`) with the `primaryName` in the same row.
nconst_name = dict(zip(name_basics.index, name_basics['primaryName']))

In [23]:
# Spot-check the mapping by looking up a single identifier.
nconst_name['nm0000136']

'Johnny Depp'

In [24]:
# Save the nconst -> name mapping as a pickle. The context manager flushes and
# closes the file deterministically instead of leaving that to garbage collection.
with open('../pickles/nconst_name.p', 'wb') as nconst_name_file:
    pickle.dump(nconst_name, nconst_name_file)