# IMDb datasets normalization: 'name_basics' Table

## Modules import

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path


### First Objective:
- Professions
- Title_types
- Genres
- Regions
- Languages

#### Importing dataset name_basics.tsv:

In [2]:
name_basics = pd.read_csv(Path('../data/name_basics.tsv').absolute(), dtype={'nconst': str,
                                                                            'primaryName': str, 
                                                                            'birthYear': float, 
                                                                            'deathYear': float,
                                                                            'primaryProfession': str,
                                                                            'knownForTitles': str },
                                                                    
                                                                     na_values='\\N', 
                                                                     sep='\t'
                                                                     )
name_basics.dtypes

nconst                object
primaryName           object
birthYear            float64
deathYear            float64
primaryProfession     object
knownForTitles        object
dtype: object

In [3]:
name_basics

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0045537,tt0053137,tt0072308,tt0050419"
1,nm0000002,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0038355,tt0117057,tt0075213,tt0037382"
2,nm0000003,Brigitte Bardot,1934.0,,"actress,soundtrack,music_department","tt0057345,tt0056404,tt0054452,tt0049189"
3,nm0000004,John Belushi,1949.0,1982.0,"actor,soundtrack,writer","tt0080455,tt0072562,tt0077975,tt0078723"
4,nm0000005,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0060827,tt0050986,tt0069467,tt0083922"
...,...,...,...,...,...,...
12489117,nm9993714,Romeo del Rosario,,,"animation_department,art_department","tt14069590,tt11657662,tt2455546"
12489118,nm9993716,Essias Loberg,,,,
12489119,nm9993717,Harikrishnan Rajan,,,cinematographer,tt8736744
12489120,nm9993718,Aayush Nair,,,cinematographer,


Extracting professions

In [4]:
professions = name_basics.primaryProfession[name_basics.primaryProfession.notna()].str.lower().apply(lambda x: x.split(',')).explode().value_counts()
print(f'There are {professions.index.size} professions:')
professions.index

There are 43 professions:


Index(['actor', 'actress', 'miscellaneous', 'producer', 'writer',
       'camera_department', 'director', 'art_department', 'sound_department',
       'cinematographer', 'editor', 'composer', 'music_department',
       'assistant_director', 'visual_effects', 'make_up_department',
       'production_manager', 'animation_department', 'editorial_department',
       'soundtrack', 'costume_department', 'transportation_department',
       'art_director', 'stunts', 'script_department', 'location_management',
       'production_designer', 'costume_designer', 'special_effects',
       'casting_department', 'set_decorator', 'executive', 'casting_director',
       'manager', 'talent_agent', 'publicist', 'legal', 'assistant',
       'music_artist', 'podcaster', 'production_department',
       'electrical_department', 'choreographer'],
      dtype='object', name='primaryProfession')

Creating SQL Script

In [5]:
txt = ''
for prof in professions.index:
    txt += f"INSERT INTO Profession(profession_name) VALUES('{prof}');\n"

txt

"INSERT INTO Profession(profession_name) VALUES('actor');\nINSERT INTO Profession(profession_name) VALUES('actress');\nINSERT INTO Profession(profession_name) VALUES('miscellaneous');\nINSERT INTO Profession(profession_name) VALUES('producer');\nINSERT INTO Profession(profession_name) VALUES('writer');\nINSERT INTO Profession(profession_name) VALUES('camera_department');\nINSERT INTO Profession(profession_name) VALUES('director');\nINSERT INTO Profession(profession_name) VALUES('art_department');\nINSERT INTO Profession(profession_name) VALUES('sound_department');\nINSERT INTO Profession(profession_name) VALUES('cinematographer');\nINSERT INTO Profession(profession_name) VALUES('editor');\nINSERT INTO Profession(profession_name) VALUES('composer');\nINSERT INTO Profession(profession_name) VALUES('music_department');\nINSERT INTO Profession(profession_name) VALUES('assistant_director');\nINSERT INTO Profession(profession_name) VALUES('visual_effects');\nINSERT INTO Profession(profession

In [6]:
with open('../sql_scripts/insert_profession.sql', 'w') as file:
    file.write(txt)

In [7]:
name_basics.primaryName.value_counts()

primaryName
Alex             406
David Smith      385
Michael Smith    383
Chris            367
David            363
                ... 
Mervingitha        1
Sherman Xie        1
Brian Graft        1
Ivan Berest        1
Aayush Nair        1
Name: count, Length: 9688145, dtype: int64

In [8]:
name_basics['knownForTitles'] = name_basics['knownForTitles'].str.split(',') 
name_basics['primaryProfession'] = name_basics['primaryProfession'].str.split(',').apply(lambda x: x[0] if type(x) == list else x)


In [10]:
name_basics = name_basics.explode('knownForTitles').drop_duplicates(['primaryName','primaryProfession','knownForTitles'])
name_basics

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899.0,1987.0,soundtrack,tt0045537
0,nm0000001,Fred Astaire,1899.0,1987.0,soundtrack,tt0053137
0,nm0000001,Fred Astaire,1899.0,1987.0,soundtrack,tt0072308
0,nm0000001,Fred Astaire,1899.0,1987.0,soundtrack,tt0050419
1,nm0000002,Lauren Bacall,1924.0,2014.0,actress,tt0038355
...,...,...,...,...,...,...
12489117,nm9993714,Romeo del Rosario,,,animation_department,tt11657662
12489117,nm9993714,Romeo del Rosario,,,animation_department,tt2455546
12489118,nm9993716,Essias Loberg,,,,
12489119,nm9993717,Harikrishnan Rajan,,,cinematographer,tt8736744


In [13]:
names_to_person_table = name_basics.drop_duplicates(['primaryProfession','primaryName'])
names_to_person_table

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899.0,1987.0,soundtrack,tt0045537
1,nm0000002,Lauren Bacall,1924.0,2014.0,actress,tt0038355
2,nm0000003,Brigitte Bardot,1934.0,,actress,tt0057345
3,nm0000004,John Belushi,1949.0,1982.0,actor,tt0080455
4,nm0000005,Ingmar Bergman,1918.0,2007.0,writer,tt0060827
...,...,...,...,...,...,...
12489116,nm9993713,Sambit Mishra,,,writer,tt15134202
12489117,nm9993714,Romeo del Rosario,,,animation_department,tt14069590
12489118,nm9993716,Essias Loberg,,,,
12489119,nm9993717,Harikrishnan Rajan,,,cinematographer,tt8736744


In [15]:
sql_profession = pd.read_csv('../sql_tables_export/profession_table.csv')
sql_profession
dict_to_replace = {x[1].profession_name: x[1].profession_id for x in sql_profession.iterrows()}
dict_to_replace

{'actor': 1,
 'actress': 2,
 'miscellaneous': 3,
 'producer': 4,
 'writer': 5,
 'camera_department': 6,
 'director': 7,
 'art_department': 8,
 'sound_department': 9,
 'cinematographer': 10,
 'editor': 11,
 'composer': 12,
 'music_department': 13,
 'assistant_director': 14,
 'visual_effects': 15,
 'make_up_department': 16,
 'production_manager': 17,
 'animation_department': 18,
 'editorial_department': 19,
 'soundtrack': 20,
 'costume_department': 21,
 'transportation_department': 22,
 'art_director': 23,
 'stunts': 24,
 'script_department': 25,
 'location_management': 26,
 'production_designer': 27,
 'costume_designer': 28,
 'special_effects': 29,
 'casting_department': 30,
 'set_decorator': 31,
 'executive': 32,
 'casting_director': 33,
 'manager': 34,
 'talent_agent': 35,
 'publicist': 36,
 'legal': 37,
 'assistant': 38,
 'music_artist': 39,
 'podcaster': 40,
 'production_department': 41,
 'electrical_department': 42,
 'choreographer': 43}

## Ver para que se uso

In [12]:
# sample_titles = pd.read_csv('../sample_titles.csv')

In [13]:
# sample_titles

In [14]:
# regex = '|'.join(sample_titles.tconst)


In [15]:
# regex = '|'.join(sample_titles.tconst)
# name_selection = name_basics[name_basics.knownForTitles.str.contains(regex)]
# name_selection

In [16]:
# name_selection = name_selection[name_selection.primaryProfession.notna()]
# name_selection

## Seguimos

In [16]:
names_to_person_table['primaryProfession'] = names_to_person_table['primaryProfession'].replace(dict_to_replace).astype(float)
names_to_person_table

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  names_to_person_table['primaryProfession'] = names_to_person_table['primaryProfession'].replace(dict_to_replace).astype(float)


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899.0,1987.0,20.0,tt0045537
1,nm0000002,Lauren Bacall,1924.0,2014.0,2.0,tt0038355
2,nm0000003,Brigitte Bardot,1934.0,,2.0,tt0057345
3,nm0000004,John Belushi,1949.0,1982.0,1.0,tt0080455
4,nm0000005,Ingmar Bergman,1918.0,2007.0,5.0,tt0060827
...,...,...,...,...,...,...
12489116,nm9993713,Sambit Mishra,,,5.0,tt15134202
12489117,nm9993714,Romeo del Rosario,,,18.0,tt14069590
12489118,nm9993716,Essias Loberg,,,,
12489119,nm9993717,Harikrishnan Rajan,,,10.0,tt8736744


In [17]:
names_to_person_table = names_to_person_table[names_to_person_table.primaryProfession.notna()]
names_to_person_table

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899.0,1987.0,20.0,tt0045537
1,nm0000002,Lauren Bacall,1924.0,2014.0,2.0,tt0038355
2,nm0000003,Brigitte Bardot,1934.0,,2.0,tt0057345
3,nm0000004,John Belushi,1949.0,1982.0,1.0,tt0080455
4,nm0000005,Ingmar Bergman,1918.0,2007.0,5.0,tt0060827
...,...,...,...,...,...,...
12489112,nm9993709,Lu Bevins,,,4.0,tt17717854
12489116,nm9993713,Sambit Mishra,,,5.0,tt15134202
12489117,nm9993714,Romeo del Rosario,,,18.0,tt14069590
12489119,nm9993717,Harikrishnan Rajan,,,10.0,tt8736744


In [18]:
names_to_person_table['primaryProfession'] = names_to_person_table['primaryProfession'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  names_to_person_table['primaryProfession'] = names_to_person_table['primaryProfession'].astype(int)


In [21]:
names_to_person_table[['primaryProfession','primaryName','birthYear', 'deathYear']].to_csv('../csv_tables_to_import/person/person_import.csv')

In [93]:
# name_selection.primaryName = name_selection.primaryName.str.replace('"', '\'')
# print(name_selection.primaryName.str.contains('"').sum())

0


In [77]:
# txt = 'INSERT INTO Person(primary_profession_id, primary_name, birth_year, death_year) VALUES'
# for idx, item in name_selection.iterrows():
#     txt += f'({item.primaryProfession}, "{item.primaryName}", {item.birthYear}, {item.deathYear}),'
# txt = txt[:-1] + ';'
# txt

# with open('./sql_scripts/insert_person.sql', 'w') as file:
#     file.write(txt)

In [78]:
# Max name length
# name_selection.primaryName.str.len().max()

33

In [94]:
names_to_person_table.to_csv('../csv_tables_to_import/person/person_table.csv')