# SQL scripts creation with Python for IMDb datasets normalization

## Modules import

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc

from pathlib import Path


### First Objective - extraction of:
- Professions
- Title_types
- Genres
- Regions
- Languages

#### Importing dataset name_basics.tsv:

In [4]:
name_basics = pd.read_csv(Path('./data/name_basics.tsv').absolute(), sep='\t')
name_basics.dtypes

nconst               object
primaryName          object
birthYear            object
deathYear            object
primaryProfession    object
knownForTitles       object
dtype: object

In [5]:
name_basics

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0045537,tt0053137,tt0072308,tt0050419"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0038355,tt0117057,tt0075213,tt0037382"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0057345,tt0056404,tt0054452,tt0049189"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0080455,tt0072562,tt0077975,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0060827,tt0050986,tt0069467,tt0083922"
...,...,...,...,...,...,...
12489117,nm9993714,Romeo del Rosario,\N,\N,"animation_department,art_department","tt14069590,tt11657662,tt2455546"
12489118,nm9993716,Essias Loberg,\N,\N,,\N
12489119,nm9993717,Harikrishnan Rajan,\N,\N,cinematographer,tt8736744
12489120,nm9993718,Aayush Nair,\N,\N,cinematographer,\N


Extracting professions

In [7]:
professions = name_basics.primaryProfession[name_basics.primaryProfession.notna()].str.lower().apply(lambda x: x.split(',')).explode().value_counts()
print(f'There are {professions.index.size} professions:')
professions.index

There are 43 professions:


Index(['actor', 'actress', 'miscellaneous', 'producer', 'writer',
       'camera_department', 'director', 'art_department', 'sound_department',
       'cinematographer', 'editor', 'composer', 'music_department',
       'assistant_director', 'visual_effects', 'make_up_department',
       'production_manager', 'animation_department', 'editorial_department',
       'soundtrack', 'costume_department', 'transportation_department',
       'art_director', 'stunts', 'script_department', 'location_management',
       'production_designer', 'costume_designer', 'special_effects',
       'casting_department', 'set_decorator', 'executive', 'casting_director',
       'manager', 'talent_agent', 'publicist', 'legal', 'assistant',
       'music_artist', 'podcaster', 'production_department',
       'electrical_department', 'choreographer'],
      dtype='object', name='primaryProfession')

Creating SQL Script

In [8]:
txt = ''
for prof in professions.index:
    txt += f"INSERT INTO Profession(profession_name) VALUES('{prof}');\n"

txt

"INSERT INTO Profession(profession_name) VALUES('actor');\nINSERT INTO Profession(profession_name) VALUES('actress');\nINSERT INTO Profession(profession_name) VALUES('miscellaneous');\nINSERT INTO Profession(profession_name) VALUES('producer');\nINSERT INTO Profession(profession_name) VALUES('writer');\nINSERT INTO Profession(profession_name) VALUES('camera_department');\nINSERT INTO Profession(profession_name) VALUES('director');\nINSERT INTO Profession(profession_name) VALUES('art_department');\nINSERT INTO Profession(profession_name) VALUES('sound_department');\nINSERT INTO Profession(profession_name) VALUES('cinematographer');\nINSERT INTO Profession(profession_name) VALUES('editor');\nINSERT INTO Profession(profession_name) VALUES('composer');\nINSERT INTO Profession(profession_name) VALUES('music_department');\nINSERT INTO Profession(profession_name) VALUES('assistant_director');\nINSERT INTO Profession(profession_name) VALUES('visual_effects');\nINSERT INTO Profession(profession

In [10]:
with open('./sql_scripts/insert_profession.sql', 'w') as file:
    file.write(txt)

In [None]:
del name_basics

### TODO SEGUIR DESPUES CUANDO TENGA LAS OTRAS COSAS

In [20]:
name_basics.primaryName.value_counts()

primaryName
Alex             406
David Smith      385
Michael Smith    383
Chris            367
David            363
                ... 
Mervingitha        1
Sherman Xie        1
Brian Graft        1
Ivan Berest        1
Aayush Nair        1
Name: count, Length: 9688145, dtype: int64

In [24]:
name_basics[name_basics.primaryName == 'David Smith']

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
761051,nm0807866,David Smith,\N,\N,"visual_effects,camera_department,editorial_dep...","tt0119051,tt0240510,tt0206634,tt0114746"
761056,nm0807871,David Smith,1872,1930,"director,cinematographer,writer","tt0009833,tt0015155,tt0014243,tt0014759"
761057,nm0807872,David Smith,\N,\N,"editorial_department,editor,miscellaneous","tt0218327,tt0271061,tt0116530,tt0103893"
761058,nm0807873,David Smith,\N,\N,"camera_department,cinematographer","tt4718770,tt0159876,tt0498964,tt0165042"
761059,nm0807874,David Smith,1933,\N,,"tt0978075,tt1815526"
...,...,...,...,...,...,...
12361216,nm9844468,David Smith,\N,\N,visual_effects,\N
12383201,nm9870098,David Smith,\N,\N,actor,tt7062722
12440169,nm9936275,David Smith,\N,\N,"visual_effects,animation_department","tt11428036,tt7536920,tt4167720"
12462193,nm9962197,David Smith,\N,\N,miscellaneous,tt0227730


In [29]:
# Cantidad de primaryProfession de las personas 
(name_basics.primaryProfession.apply(lambda x: len(x.split(',')) if type(x) != float else 0)).value_counts()

primaryProfession
1    7290290
0    2720336
3    1242437
2    1236059
Name: count, dtype: int64

## Title AKAS

#### Importing dataset title_akas.tsv:

In [13]:
title_akas = pd.read_csv(Path('./data/title_akas.tsv').absolute(), sep='\t')
title_akas.dtypes

  title_akas = pd.read_csv(Path('./data/title_akas.tsv').absolute(), sep='\t')


titleId            object
ordering            int64
title              object
region             object
language           object
types              object
attributes         object
isOriginalTitle    object
dtype: object

In [16]:
title_akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0
...,...,...,...,...,...,...,...,...
35764621,tt9916852,5,Episódio #3.20,PT,pt,\N,\N,0
35764622,tt9916852,6,Episodio #3.20,IT,it,\N,\N,0
35764623,tt9916852,7,एपिसोड #3.20,IN,hi,\N,\N,0
35764624,tt9916856,1,The Wind,DE,\N,imdbDisplay,\N,0


In [39]:
regions_codes = list(title_akas.region.unique())
# regions_codes.pop('\\N')
regions_codes = [x for x in regions_codes if x != '\\N']
regions_codes[:10]

['UA', 'DE', 'HU', 'GR', 'RU', 'US', 'JP', 'FR', 'RO', 'GB']

Creating sql script

In [42]:
txt = 'INSERT INTO Regions(region_code) VALUES'
for item in regions_codes:
    txt += f"('{item}'),"
txt = txt[:-1] + ';'
txt

"INSERT INTO Regions(region_name) VALUES('UA'),('DE'),('HU'),('GR'),('RU'),('US'),('JP'),('FR'),('RO'),('GB'),('CA'),('PT'),('AU'),('ES'),('FI'),('PL'),('AR'),('RS'),('UY'),('IT'),('BR'),('DK'),('TR'),('XWW'),('XEU'),('SK'),('CZ'),('SE'),('NZ'),('MX'),('NO'),('XYU'),('AT'),('VE'),('CSHH'),('SI'),('SUHH'),('IN'),('TW'),('LT'),('NL'),('CO'),('IR'),('BG'),('SG'),('BE'),('VN'),('HR'),('DZ'),('CH'),('BF'),('PH'),('XWG'),('HK'),('CN'),('XSA'),('EC'),('EE'),('IS'),('PR'),('DDDE'),('IL'),('EG'),('XKO'),('CL'),('IE'),('JM'),('KR'),('PE'),('GE'),('BY'),('BA'),('AE'),('PA'),('TH'),('ZA'),('TJ'),('XSI'),('MY'),('LV'),('ID'),('PK'),('BD'),('CU'),('AL'),('BO'),('XAS'),('CR'),('PY'),('DO'),('GT'),('SV'),('UZ'),('BUMM'),('YUCS'),('XPI'),('BJ'),('AZ'),('SY'),('NG'),('CM'),('MA'),('GL'),('MN'),('LI'),('LU'),('MZ'),('BM'),('KZ'),('MD'),('LB'),('IQ'),('TM'),('MK'),('TN'),('HT'),('AM'),('LK'),('ME'),('CG'),('CI'),('NP'),('QA'),('TO'),('SN'),('GH'),('JO'),('KP'),('KG'),('NE'),('GN'),('VDVN'),('TD'),('SO'),(

In [43]:
with open('./sql_scripts/insert_region.sql', 'w') as file:
    file.write(txt)

Extracting language codes

In [44]:
language_codes = list(title_akas.language.unique())
# language_codes.pop('\\N')
language_codes = [x for x in language_codes if x != '\\N']
language_codes[:10]

['ja', 'en', 'sv', 'tr', 'es', 'sr', 'cs', 'ru', 'fr', 'hi']

Creating sql script

In [48]:
txt = 'INSERT INTO Languages(language_code) VALUES'
for item in language_codes:
    txt += f"('{item}'),"
txt = txt[:-1] + ';'
txt

with open('./sql_scripts/insert_language.sql', 'w') as file:
    file.write(txt)

In [50]:
del title_akas

Maybe there are some titles that are incorrect. Check.

In [30]:
#ttid_aka_camencita = title_akas.loc[title_akas.title == 'Carmencita', :].titleId.values

## Title BASICS

In [51]:
title_basics = pd.read_csv(Path('./data/title_basics.tsv').absolute(), sep='\t')
title_basics.dtypes

  title_basics = pd.read_csv(Path('./data/title_basics.tsv').absolute(), sep='\t')


tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear         object
endYear           object
runtimeMinutes    object
genres            object
dtype: object

In [52]:
title_basics#.originalTitle.str.len().plot(kind='box')

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
9815514,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N,"Action,Drama,Family"
9815515,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
9815516,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
9815517,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [53]:
# sum(title_basics.primaryTitle == title_basics.originalTitle) 
# title_basics.shape[0] - sum(title_basics.primaryTitle == title_basics.originalTitle) 

Extracting title_types

In [54]:
title_types = title_basics.titleType.value_counts().index
title_types


Index(['tvEpisode', 'short', 'movie', 'video', 'tvSeries', 'tvMovie',
       'tvMiniSeries', 'tvSpecial', 'videoGame', 'tvShort', 'tvPilot'],
      dtype='object', name='titleType')

Creating sql script:

In [55]:
txt = 'INSERT INTO Title_types(type_name) VALUES'
for item in title_types:
    txt += f"('{item}'),"
txt = txt[:-1] + ';'
txt

with open('./sql_scripts/insert_title_types.sql', 'w') as file:
    file.write(txt)

Extracting genres:


In [60]:
genres = title_basics.genres.str.split(',').explode().value_counts().index
genres = [x for x in genres if x != '\\N']
genres

Creating sql script:

In [62]:
txt = 'INSERT INTO Genres(genre_name) VALUES'
for item in genres:
    txt += f"('{item}'),"
txt = txt[:-1] + ';'
txt

with open('./sql_scripts/insert_genres.sql', 'w') as file:
    file.write(txt)

In [63]:
del title_basics

In [79]:
gc.collect()

1436

## Importing tables created with SQL

### Creating Titles table

In [4]:
sql_types = pd.read_csv('./sql_tables_export/title_type_table.csv')
sql_types

Unnamed: 0,type_id,type_name,last_modified
0,1,tvEpisode,2023-05-27 19:39:01
1,2,short,2023-05-27 19:39:01
2,3,movie,2023-05-27 19:39:01
3,4,video,2023-05-27 19:39:01
4,5,tvSeries,2023-05-27 19:39:01
5,6,tvMovie,2023-05-27 19:39:01
6,7,tvMiniSeries,2023-05-27 19:39:01
7,8,tvSpecial,2023-05-27 19:39:01
8,9,videoGame,2023-05-27 19:39:01
9,10,tvShort,2023-05-27 19:39:01


In [5]:
dict_to_replace = {x[1].type_name: x[1].type_id for x in sql_types.iterrows()}
dict_to_replace

{'tvEpisode': 1,
 'short': 2,
 'movie': 3,
 'video': 4,
 'tvSeries': 5,
 'tvMovie': 6,
 'tvMiniSeries': 7,
 'tvSpecial': 8,
 'videoGame': 9,
 'tvShort': 10,
 'tvPilot': 11}

In [14]:
title_basics = pd.read_csv(Path('./data/title_basics.tsv').absolute(), sep='\t')
title_basics

  title_basics = pd.read_csv(Path('./data/title_basics.tsv').absolute(), sep='\t')


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
9815514,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N,"Action,Drama,Family"
9815515,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
9815516,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
9815517,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [15]:
title_basics = title_basics.loc[:, ['titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes']]
title_basics

Unnamed: 0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes
0,short,Carmencita,Carmencita,0,1894,\N,1
1,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5
2,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4
3,short,Un bon bock,Un bon bock,0,1892,\N,12
4,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1
...,...,...,...,...,...,...,...
9815514,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N
9815515,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N
9815516,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N
9815517,short,The Wind,The Wind,0,2015,\N,27


In [16]:
title_basics.titleType = title_basics.titleType.replace(dict_to_replace)
title_basics

NameError: name 'dict_to_replace' is not defined

In [None]:
title_basics.startYear = title_basics.startYear.replace('\\N', 'NULL')#.astype(float)
print(title_basics.dtypes)
title_basics.startYear.value_counts()

titleType          int64
primaryTitle      object
originalTitle     object
isAdult           object
startYear         object
endYear           object
runtimeMinutes    object
dtype: object


startYear
NULL    1329182
2021     454186
2018     431651
2017     427994
2019     422818
         ...   
1884          1
1874          1
1883          1
2028          1
1885          1
Name: count, Length: 153, dtype: int64

In [None]:
title_basics.startYear[title_basics.startYear.isna()]

Series([], Name: startYear, dtype: object)

In [None]:
title_basics.isAdult.value_counts().index

Index([0, 1, '0', '1', 2019, 2023, 2020, 1981, 2017, '\N', 2022, 2014, 2005], dtype='object', name='isAdult')

In [None]:
title_basics.loc[title_basics.isAdult.isin([ 2019,2023,2020,1981,2017,2022,2014,2005,'\\N']), 'isAdult'] = 0

In [None]:
title_basics.isAdult.value_counts().index

Index([0, 1, '0', '1'], dtype='object', name='isAdult')

In [None]:
title_basics.loc[title_basics.isAdult == '0', 'isAdult'] = 0
title_basics.loc[title_basics.isAdult == '1', 'isAdult'] = 1
title_basics.isAdult.value_counts().index

Index([0, 1], dtype='object', name='isAdult')

In [None]:
title_basics.isAdult = title_basics.isAdult.astype(int)
title_basics.dtypes

titleType          int64
primaryTitle      object
originalTitle     object
isAdult            int64
startYear         object
endYear           object
runtimeMinutes    object
dtype: object

In [None]:
title_basics.isAdult.value_counts()

isAdult
0    9509066
1     306453
Name: count, dtype: int64

In [17]:
title_basics.runtimeMinutes = title_basics.runtimeMinutes.replace('\\N', 'NULL')
title_basics.runtimeMinutes.value_counts()

runtimeMinutes
NULL    6922775
30       217768
22       159126
60       155975
44        75682
         ...   
1554          1
612           1
850           1
562           1
2088          1
Name: count, Length: 886, dtype: int64

In [18]:
title_basics.endYear = title_basics.endYear.replace('\\N', 'NULL')
title_basics.endYear.value_counts()

endYear
NULL    9709498
2018       6243
2017       6229
2019       6171
2020       5877
         ...   
1944          1
2028          1
1906          1
1941          1
2030          1
Name: count, Length: 96, dtype: int64

In [19]:
title_basics

Unnamed: 0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes
0,short,Carmencita,Carmencita,0,1894,,1
1,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5
2,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4
3,short,Un bon bock,Un bon bock,0,1892,,12
4,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1
...,...,...,...,...,...,...,...
9815514,tvEpisode,Episode #3.17,Episode #3.17,0,2010,,
9815515,tvEpisode,Episode #3.19,Episode #3.19,0,2010,,
9815516,tvEpisode,Episode #3.20,Episode #3.20,0,2010,,
9815517,short,The Wind,The Wind,0,2015,,27


In [20]:
title_basics.primaryTitle = title_basics.primaryTitle.str.replace('"', '\'')
print(title_basics.primaryTitle.str.contains('"').sum())

title_basics.originalTitle = title_basics.originalTitle.str.replace('"', '\'')
print(title_basics.originalTitle.str.contains('"').sum())


0
0


In [21]:
title_basics.loc[~title_basics.runtimeMinutes.str.isnumeric() & ~title_basics.runtimeMinutes.str.contains('NULL') ]

Unnamed: 0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes
1097327,tvEpisode,Rolling in the Deep Dish\tRolling in the Deep ...,0,2019,\N,,Reality-TV
1507899,tvEpisode,Die Bauhaus-Stadt Tel Aviv - Vorbild für die M...,0,2019,\N,,Talk-Show
1895784,tvEpisode,...ein angenehmer Unbequemer...\t...ein angene...,0,1981,\N,,Documentary
2006749,tvEpisode,GGN Heavyweight Championship Lungs With Mike T...,0,2020,\N,,Talk-Show
2160886,tvEpisode,Jeopardy! College Championship Semifinal Game ...,0,2020,\N,,"Game-Show,Short"
2306033,tvEpisode,Anthony Davis High Brow Tank\tAnthony Davis Hi...,0,2017,\N,,Reality-TV
2994978,tvEpisode,Bay of the Triffids/Doctor of Doom\tBay of the...,0,\N,\N,,"Animation,Comedy,Family"
6470569,tvEpisode,LATINO Accents QUIZ! w@MrHReviews @EchoBaseNet...,0,2023,\N,,"News,Talk-Show"
6523749,tvEpisode,Nord-Koreas röda prinsessa\tNord-Koreas röda p...,0,2022,\N,,Documentary
6560761,tvEpisode,War Room Round Table: Building an AI Networkin...,0,2023,\N,,Talk-Show


In [22]:
title_basics.loc[~title_basics.runtimeMinutes.str.isnumeric(), 'runtimeMinutes'] = 'NULL'

In [23]:
title_basics.loc[~title_basics.runtimeMinutes.str.isnumeric() & ~title_basics.runtimeMinutes.str.contains('NULL') ]

Unnamed: 0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes


In [24]:
#it = title_basics.iterrows()
# next(it)[1].originalTitle
title_basics[title_basics.endYear == "2018"]

Unnamed: 0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes
80179,tvSeries,Strike Force,Strike Force,0,1981,2018,60
92428,tvSeries,Murphy Brown,Murphy Brown,0,1988,2018,30
92453,tvSeries,Roseanne,Roseanne,0,1988,2018,21
103809,tvSeries,The X-Files,The X Files,0,1993,2018,45
118023,tvSeries,The Jerry Springer Show,The Jerry Springer Show,0,1991,2018,60
...,...,...,...,...,...,...,...
9809498,tvSeries,Roadworthy: What Matters is Inside,Roadworthy: What Matters is Inside,0,2018,2018,
9809852,tvMiniSeries,Better Call Saul: Madrigal Electromotive Secur...,Better Call Saul: Madrigal Electromotive Secur...,0,2018,2018,3
9811561,tvMiniSeries,The Minds Behind: Fear The Walking Dead,The Minds Behind: Fear The Walking Dead,0,2018,2018,
9811818,tvSeries,Girls on Food,Girls on Food,0,2016,2018,


In [25]:
title_basics = title_basics[title_basics.endYear == "2018"]

In [27]:
txt = 'INSERT INTO Titles(title_type_id, title_primary, title_original, title_adult, title_start_year, title_end_year, title_runtime) VALUES'
for idx, item in title_basics.iterrows():
    txt += f'({item.titleType}, "{item.primaryTitle}", "{item.originalTitle}", {item.isAdult}, {item.startYear}, {item.endYear}, {item.runtimeMinutes}),'
txt = txt[:-1] + ';'
txt

with open('./sql_scripts/insert_title.sql', 'w') as file:
    file.write(txt)

In [47]:
txt[-10:]

NameError: name 'txt' is not defined

### Creating Person table

In [4]:
sql_profession = pd.read_csv('./sql_tables_export/profession_table.csv')
sql_profession

Unnamed: 0,profession_id,profession_name,last_modified
0,1,actor,2023-05-28 23:48:55
1,2,actress,2023-05-28 23:48:55
2,3,miscellaneous,2023-05-28 23:48:55
3,4,producer,2023-05-28 23:48:55
4,5,writer,2023-05-28 23:48:55
5,6,camera_department,2023-05-28 23:48:55
6,7,director,2023-05-28 23:48:55
7,8,art_department,2023-05-28 23:48:55
8,9,sound_department,2023-05-28 23:48:55
9,10,cinematographer,2023-05-28 23:48:55


In [83]:
dict_to_replace = {x[1].profession_name: x[1].profession_id for x in sql_profession.iterrows()}
dict_to_replace

{'actor': 1,
 'actress': 2,
 'miscellaneous': 3,
 'producer': 4,
 'writer': 5,
 'camera_department': 6,
 'director': 7,
 'art_department': 8,
 'sound_department': 9,
 'cinematographer': 10,
 'editor': 11,
 'composer': 12,
 'music_department': 13,
 'assistant_director': 14,
 'visual_effects': 15,
 'make_up_department': 16,
 'production_manager': 17,
 'animation_department': 18,
 'editorial_department': 19,
 'soundtrack': 20,
 'costume_department': 21,
 'transportation_department': 22,
 'art_director': 23,
 'stunts': 24,
 'script_department': 25,
 'location_management': 26,
 'production_designer': 27,
 'costume_designer': 28,
 'special_effects': 29,
 'casting_department': 30,
 'set_decorator': 31,
 'executive': 32,
 'casting_director': 33,
 'manager': 34,
 'talent_agent': 35,
 'publicist': 36,
 'legal': 37,
 'assistant': 38,
 'music_artist': 39,
 'podcaster': 40,
 'production_department': 41,
 'electrical_department': 42,
 'choreographer': 43}

In [5]:
name_basics = pd.read_csv(Path('./data/name_basics.tsv').absolute(), sep='\t')
name_basics

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0045537,tt0053137,tt0072308,tt0050419"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0038355,tt0117057,tt0075213,tt0037382"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0057345,tt0056404,tt0054452,tt0049189"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0080455,tt0072562,tt0077975,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0060827,tt0050986,tt0069467,tt0083922"
...,...,...,...,...,...,...
12489117,nm9993714,Romeo del Rosario,\N,\N,"animation_department,art_department","tt14069590,tt11657662,tt2455546"
12489118,nm9993716,Essias Loberg,\N,\N,,\N
12489119,nm9993717,Harikrishnan Rajan,\N,\N,cinematographer,tt8736744
12489120,nm9993718,Aayush Nair,\N,\N,cinematographer,\N


In [13]:
#name_basics.primaryName.value_counts().head(50)
name_basics[name_basics.primaryName == 'David Johnson'].sort_values('birthYear')#.nconst.value_counts()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
401830,nm0424900,David Johnson,1954,\N,"cinematographer,camera_department,director","tt0370263,tt0432021,tt2474932,tt0120804"
1148402,nm10243353,David Johnson,1960,2020,producer,\N
1883953,nm11022542,David Johnson,1987,\N,,tt0407423
7735590,nm4117801,David Johnson,1987,\N,,"tt0065323,tt0407423,tt0905590,tt0896893"
10957098,nm8106133,David Johnson,1991,\N,,"tt0896893,tt0407423,tt8777482,tt0905590"
...,...,...,...,...,...,...
4627651,nm13919866,David Johnson,\N,\N,actor,tt0171219
4620882,nm13912678,David Johnson,\N,\N,actor,\N
4596220,nm13886315,David Johnson,\N,\N,"director,writer,producer","tt22456434,tt22940282,tt21942336,tt25729196"
4517725,nm13800878,David Johnson,\N,\N,actor,tt14120534


In [41]:
#name_basics[name_basics.knownForTitles.str.contains('tt0370263')]
name_basics[(name_basics.knownForTitles.notna()) & (name_basics.knownForTitles.str.contains('tt0370263'))]
#name_basics.knownForTitles.str.contains('tt0370263')

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
447,nm0000448,Lance Henriksen,1940,\N,"actor,producer,miscellaneous","tt0370263,tt0114214,tt0090605,tt0107076"
1969,nm0001971,Ewen Bremner,1972,\N,"actor,producer,soundtrack","tt0117951,tt2763304,tt0451279,tt0370263"
3777,nm0003792,Dino Herrmann,\N,\N,"sound_department,music_department,composer","tt3758172,tt0319262,tt2980210,tt0370263"
5100,nm0005125,Sanaa Lathan,1971,\N,"actress,producer,soundtrack","tt0370263,tt0437777,tt3862750,tt0199725"
6805,nm0006832,Martin Hobbs,\N,\N,"visual_effects,miscellaneous,special_effects","tt0120053,tt0370263,tt1343727,tt1386703"
...,...,...,...,...,...,...
10397956,nm7397610,Jeff Stroot,\N,\N,"editorial_department,miscellaneous,editor","tt1856101,tt0480249,tt0796366,tt0370263"
11533694,nm8833997,Daniele Riva,\N,\N,"visual_effects,transportation_department","tt0370263,tt0443734"
11535294,nm8836044,Brano Danis,\N,\N,"editorial_department,miscellaneous,visual_effects","tt1706620,tt0355295,tt0167190,tt0370263"
12102116,nm9536892,Lukas Katakalidis,\N,\N,art_department,"tt0338526,tt1229238,tt1706620,tt0370263"


In [112]:
title_basics = pd.read_csv(Path('./data/title_basics.tsv').absolute(), sep='\t')
title_basics = title_basics.loc[title_basics.endYear == "2018", ['tconst','titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres']]
title_basics

  title_basics = pd.read_csv(Path('./data/title_basics.tsv').absolute(), sep='\t')


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
80179,tt0081940,tvSeries,Strike Force,Strike Force,0,1981,2018,60,"Action,Crime,Drama"
92428,tt0094514,tvSeries,Murphy Brown,Murphy Brown,0,1988,2018,30,Comedy
92453,tt0094540,tvSeries,Roseanne,Roseanne,0,1988,2018,21,"Comedy,Drama"
103809,tt0106179,tvSeries,The X-Files,The X Files,0,1993,2018,45,"Crime,Drama,Mystery"
118023,tt0120974,tvSeries,The Jerry Springer Show,The Jerry Springer Show,0,1991,2018,60,Talk-Show
...,...,...,...,...,...,...,...,...,...
9809498,tt9903854,tvSeries,Roadworthy: What Matters is Inside,Roadworthy: What Matters is Inside,0,2018,2018,\N,Reality-TV
9809852,tt9904638,tvMiniSeries,Better Call Saul: Madrigal Electromotive Secur...,Better Call Saul: Madrigal Electromotive Secur...,0,2018,2018,3,"Comedy,Short"
9811561,tt9908252,tvMiniSeries,The Minds Behind: Fear The Walking Dead,The Minds Behind: Fear The Walking Dead,0,2018,2018,\N,Talk-Show
9811818,tt9908812,tvSeries,Girls on Food,Girls on Food,0,2016,2018,\N,Reality-TV


In [45]:
name_selection = name_basics[name_basics.knownForTitles.str.contains('|'.join(title_basics.tconst))]
name_selection

In [55]:
name_selection = name_selection[name_selection.primaryProfession.notna()]
name_selection

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
95,nm0000096,Gillian Anderson,1968,\N,"actress,producer,soundtrack","tt0106179,tt0455590,tt0442632,tt2294189"
140,nm0000141,David Duchovny,1960,\N,"actor,producer,director","tt0120902,tt0904208,tt0106179,tt0443701"
297,nm0000298,Candice Bergen,1946,\N,"actress,producer,soundtrack","tt0094514,tt0901476,tt0212346,tt0256415"
1224,nm0001226,James Foley,1953,\N,"director,writer,producer","tt1856010,tt0104348,tt0098994,tt0090670"
1342,nm0001344,Gregg Henry,1952,\N,"actor,soundtrack,music_department","tt0439815,tt0120784,tt2015381,tt1837576"
...,...,...,...,...,...,...
12488628,nm9993167,Reno Sommerhalder,\N,\N,cinematographer,"tt19152310,tt8742988,tt0765274"
12488789,nm9993342,Anna Gyllenklev,\N,\N,"set_decorator,art_department,costume_department","tt11386694,tt3580332,tt7938078,tt8737938"
12489033,nm9993621,Nicole Pilande,\N,\N,actress,"tt8694116,tt11965688"
12489045,nm9993635,Linda Georgeson,\N,\N,actress,tt6268466


In [58]:
name_selection.primaryProfession = name_selection.primaryProfession.apply(lambda x: x.split(',')[0] )

In [61]:
name_selection.birthYear = name_selection.birthYear.replace('\\N', 'NULL')
name_selection.deathYear = name_selection.deathYear.replace('\\N', 'NULL')

name_selection

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
95,nm0000096,Gillian Anderson,1968,,actress,"tt0106179,tt0455590,tt0442632,tt2294189"
140,nm0000141,David Duchovny,1960,,actor,"tt0120902,tt0904208,tt0106179,tt0443701"
297,nm0000298,Candice Bergen,1946,,actress,"tt0094514,tt0901476,tt0212346,tt0256415"
1224,nm0001226,James Foley,1953,,director,"tt1856010,tt0104348,tt0098994,tt0090670"
1342,nm0001344,Gregg Henry,1952,,actor,"tt0439815,tt0120784,tt2015381,tt1837576"
...,...,...,...,...,...,...
12488628,nm9993167,Reno Sommerhalder,,,cinematographer,"tt19152310,tt8742988,tt0765274"
12488789,nm9993342,Anna Gyllenklev,,,set_decorator,"tt11386694,tt3580332,tt7938078,tt8737938"
12489033,nm9993621,Nicole Pilande,,,actress,"tt8694116,tt11965688"
12489045,nm9993635,Linda Georgeson,,,actress,tt6268466


In [65]:
name_repeated = name_selection.primaryName.value_counts()[name_selection.primaryName.value_counts() > 1].index

Index(['Grace Chang', 'David Clarke', 'Balder Fors', 'Aaron Smith',
       'Mark Johnson', 'Carlos Ruiz', 'Sarah Jones', 'James Boyd',
       'Joshua Johnson', 'Matt Thompson',
       ...
       'Alex McArdle', 'Walter Hill', 'Michael Chan', 'Matt Hunter',
       'Tony Perez', 'Nathan Palmer', 'James Bishop', 'Yiming Qin',
       'Fernandes André', 'Anders Hansen'],
      dtype='object', name='primaryName', length=1345)

In [70]:
name_selection[name_selection.primaryName == 'Aaron Smith']

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
8252135,nm4721345,Aaron Smith,,,actor,"tt0108709,tt1587000,tt6015808"
8259373,nm4729460,Aaron Smith,,,miscellaneous,tt0363307
9086777,nm5663376,Aaron Smith,,,camera_department,"tt3044834,tt5126254,tt3212026,tt2896970"
9602842,nm6344615,Aaron Smith,,,art_department,"tt2374744,tt3170832,tt6599482,tt7908628"
12362524,nm9846092,Aaron Smith,,,actor,"tt1415000,tt8169088"


In [80]:
name_selection = name_selection.loc[name_selection.primaryName.drop_duplicates().index.to_list(), :]
name_selection

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
95,nm0000096,Gillian Anderson,1968,,actress,"tt0106179,tt0455590,tt0442632,tt2294189"
140,nm0000141,David Duchovny,1960,,actor,"tt0120902,tt0904208,tt0106179,tt0443701"
297,nm0000298,Candice Bergen,1946,,actress,"tt0094514,tt0901476,tt0212346,tt0256415"
1224,nm0001226,James Foley,1953,,director,"tt1856010,tt0104348,tt0098994,tt0090670"
1342,nm0001344,Gregg Henry,1952,,actor,"tt0439815,tt0120784,tt2015381,tt1837576"
...,...,...,...,...,...,...
12488628,nm9993167,Reno Sommerhalder,,,cinematographer,"tt19152310,tt8742988,tt0765274"
12488789,nm9993342,Anna Gyllenklev,,,set_decorator,"tt11386694,tt3580332,tt7938078,tt8737938"
12489033,nm9993621,Nicole Pilande,,,actress,"tt8694116,tt11965688"
12489045,nm9993635,Linda Georgeson,,,actress,tt6268466


In [87]:
name_selection.primaryProfession = name_selection.primaryProfession.replace(dict_to_replace).astype(int)
name_selection

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
95,nm0000096,Gillian Anderson,1968,,2,"tt0106179,tt0455590,tt0442632,tt2294189"
140,nm0000141,David Duchovny,1960,,1,"tt0120902,tt0904208,tt0106179,tt0443701"
297,nm0000298,Candice Bergen,1946,,2,"tt0094514,tt0901476,tt0212346,tt0256415"
1224,nm0001226,James Foley,1953,,7,"tt1856010,tt0104348,tt0098994,tt0090670"
1342,nm0001344,Gregg Henry,1952,,1,"tt0439815,tt0120784,tt2015381,tt1837576"
...,...,...,...,...,...,...
12488628,nm9993167,Reno Sommerhalder,,,10,"tt19152310,tt8742988,tt0765274"
12488789,nm9993342,Anna Gyllenklev,,,31,"tt11386694,tt3580332,tt7938078,tt8737938"
12489033,nm9993621,Nicole Pilande,,,2,"tt8694116,tt11965688"
12489045,nm9993635,Linda Georgeson,,,2,tt6268466


In [100]:
# YEAR admite valores desde 1901 a 2155 en mysql
name_selection.birthYear[name_selection.birthYear != 'NULL'].sort_values(ascending=True).head(50)

10403494    1858
8888633     1860
707812      1872
566122      1874
4472753     1891
4066192     1893
483946      1896
463198      1897
435986      1898
306827      1900
318710      1900
1767382     1900
211737      1905
473544      1906
11287998    1907
5744276     1907
737073      1907
4474872     1908
309941      1908
6921551     1909
73220       1911
424383      1911
8930176     1913
615495      1914
7700808     1915
5826404     1916
7027062     1916
7842754     1916
2368428     1917
104445      1917
410135      1920
5172173     1920
704951      1920
158763      1921
7168265     1921
172669      1921
802247      1921
4231688     1922
139333      1922
4040741     1922
915154      1922
11870037    1922
4473059     1922
407730      1923
288062      1923
218905      1923
4477765     1923
689778      1923
339539      1923
36446       1924
Name: birthYear, dtype: object

In [109]:
name_selection.deathYear[name_selection.deathYear != 'NULL'].sort_values(ascending=True).head(50)

73220       1969
7700808     1971
2368428     1973
10653162    1974
424383      1979
36446       1980
6236114     1982
211737      1984
180572      1989
5744276     1989
708132      1990
473544      1990
758663      1990
2726883     1990
436723      1991
337820      1992
6921551     1992
638835      1993
104445      1993
1651433     1995
7168265     1995
9499775     1995
5460126     1996
449561      1997
6515331     1998
336039      1998
309941      1998
6138786     1998
666539      1998
310496      1999
245400      1999
158763      1999
8571232     1999
410135      1999
6771531     2000
885316      2000
692822      2000
413494      2000
497371      2000
90200       2000
706447      2001
11314776    2001
526754      2001
7524580     2001
663692      2002
338544      2002
397525      2002
10208084    2002
6665439     2002
10803732    2003
Name: deathYear, dtype: object

In [107]:
name_selection = name_selection[~((name_selection.birthYear.str.startswith('18')) | (name_selection.birthYear =='1900'))]

In [91]:
name_selection.primaryName = name_selection.primaryName.str.replace('"', '\'')
print(name_selection.primaryName.str.contains('"').sum())

0


In [108]:
txt = 'INSERT INTO Person(primary_profession_id, primary_name, birth_year, death_year) VALUES'
for idx, item in name_selection.iterrows():
    txt += f'({item.primaryProfession}, "{item.primaryName}", {item.birthYear}, {item.deathYear}),'
txt = txt[:-1] + ';'
txt

with open('./sql_scripts/insert_person.sql', 'w') as file:
    file.write(txt)

In [93]:
name_selection.primaryName.str.len().max()

46

### Creando Tabla Title Genres


In [125]:
sql_titles = pd.read_csv('./sql_tables_export/titles_table.csv')
sql_titles

Unnamed: 0,title_id,title_type_id,title_primary,title_original,title_adult,title_start_year,title_end_year,title_runtime,last_modified
0,1,5,Strike Force,Strike Force,0,1981,2018,60.0,2023-05-29 19:20:35
1,2,5,Murphy Brown,Murphy Brown,0,1988,2018,30.0,2023-05-29 19:20:35
2,3,5,Roseanne,Roseanne,0,1988,2018,21.0,2023-05-29 19:20:35
3,4,5,The X-Files,The X Files,0,1993,2018,45.0,2023-05-29 19:20:35
4,5,5,The Jerry Springer Show,The Jerry Springer Show,0,1991,2018,60.0,2023-05-29 19:20:35
...,...,...,...,...,...,...,...,...,...
6238,6239,5,Roadworthy: What Matters is Inside,Roadworthy: What Matters is Inside,0,2018,2018,,2023-05-29 19:20:35
6239,6240,7,Better Call Saul: Madrigal Electromotive Secur...,Better Call Saul: Madrigal Electromotive Secur...,0,2018,2018,3.0,2023-05-29 19:20:35
6240,6241,7,The Minds Behind: Fear The Walking Dead,The Minds Behind: Fear The Walking Dead,0,2018,2018,,2023-05-29 19:20:35
6241,6242,5,Girls on Food,Girls on Food,0,2016,2018,,2023-05-29 19:20:35


In [135]:
dict_to_replace_titles = {x[1].title_primary: x[1].title_id for x in sql_titles.iterrows()}
dict_to_replace_titles

{'Strike Force': 1,
 'Murphy Brown': 2,
 'Roseanne': 3,
 'The X-Files': 4,
 'The Jerry Springer Show': 5,
 'Room 101': 6,
 'Robot Wars': 7,
 'Une famille formidable': 8,
 "Les Guignols de l'info": 9,
 "Film '72": 10,
 'Oggy and the Cockroaches': 11,
 'Evening Magazine': 12,
 'The Old Grey Whistle Test': 13,
 'Bob the Builder': 14,
 'Caillou': 15,
 'Night Flight': 16,
 "Russell Coight's All Aussie Adventures": 17,
 'Trailer Park Boys': 18,
 'ABC TGIF': 19,
 'The Wright Stuff': 20,
 'De drie wijzen': 21,
 'Televisión registrada': 22,
 "America's Next Top Model": 23,
 'City of Men': 24,
 'Kokkisota': 25,
 'MythBusters': 26,
 'Voetbal Inside': 27,
 'Macabre Theatre': 28,
 'La academia': 29,
 'The Staircase': 30,
 'Homo Zapping': 31,
 'Daily Politics': 32,
 'C.I.D.': 33,
 'The Venture Bros.': 34,
 'The X Factor UK': 35,
 'Iron Chef America: The Series': 36,
 'Pressiklubi': 37,
 "Canada's Worst Driver": 38,
 'Seconds from Disaster': 39,
 'Arucitys': 40,
 'Primer plano': 41,
 'Aeschbacher': 4

In [118]:
title_basics.genres = title_basics.genres.str.split(',')
title_basics[['primaryTitle', 'genres']]

Unnamed: 0,primaryTitle,genres
80179,Strike Force,"[Action, Crime, Drama]"
92428,Murphy Brown,[Comedy]
92453,Roseanne,"[Comedy, Drama]"
103809,The X-Files,"[Crime, Drama, Mystery]"
118023,The Jerry Springer Show,[Talk-Show]
...,...,...
9809498,Roadworthy: What Matters is Inside,[Reality-TV]
9809852,Better Call Saul: Madrigal Electromotive Secur...,"[Comedy, Short]"
9811561,The Minds Behind: Fear The Walking Dead,[Talk-Show]
9811818,Girls on Food,[Reality-TV]


In [124]:
title_basics = title_basics.explode('genres')
title_basics[['primaryTitle', 'genres']]

Unnamed: 0,primaryTitle,genres
80179,Strike Force,Action
80179,Strike Force,Crime
80179,Strike Force,Drama
92428,Murphy Brown,Comedy
92453,Roseanne,Comedy
...,...,...
9809852,Better Call Saul: Madrigal Electromotive Secur...,Comedy
9809852,Better Call Saul: Madrigal Electromotive Secur...,Short
9811561,The Minds Behind: Fear The Walking Dead,Talk-Show
9811818,Girls on Food,Reality-TV


In [133]:
genres_titles = title_basics.copy()

In [136]:
genres_titles.primaryTitle = genres_titles.primaryTitle.replace(dict_to_replace_titles)
genres_titles

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
80179,tt0081940,tvSeries,1,Strike Force,0,1981,2018,60,Action
80179,tt0081940,tvSeries,1,Strike Force,0,1981,2018,60,Crime
80179,tt0081940,tvSeries,1,Strike Force,0,1981,2018,60,Drama
92428,tt0094514,tvSeries,2,Murphy Brown,0,1988,2018,30,Comedy
92453,tt0094540,tvSeries,3,Roseanne,0,1988,2018,21,Comedy
...,...,...,...,...,...,...,...,...,...
9809852,tt9904638,tvMiniSeries,6240,Better Call Saul: Madrigal Electromotive Secur...,0,2018,2018,3,Comedy
9809852,tt9904638,tvMiniSeries,6240,Better Call Saul: Madrigal Electromotive Secur...,0,2018,2018,3,Short
9811561,tt9908252,tvMiniSeries,6241,The Minds Behind: Fear The Walking Dead,0,2018,2018,\N,Talk-Show
9811818,tt9908812,tvSeries,6242,Girls on Food,0,2016,2018,\N,Reality-TV


In [147]:
genres_titles = genres_titles[['primaryTitle', 'genres']].drop_duplicates()#.value_counts()

In [148]:
genres_titles

Unnamed: 0,primaryTitle,genres
80179,1,Action
80179,1,Crime
80179,1,Drama
92428,2,Comedy
92453,3,Comedy
...,...,...
9809852,6240,Comedy
9809852,6240,Short
9811561,6241,Talk-Show
9811818,6242,Reality-TV


In [152]:
sql_genres = pd.read_csv('./sql_tables_export/genres_table.csv')
sql_genres

Unnamed: 0,genre_id,genre_name,last_modified
0,1,Drama,2023-05-29 19:20:15
1,2,Comedy,2023-05-29 19:20:15
2,3,Talk-Show,2023-05-29 19:20:15
3,4,Short,2023-05-29 19:20:15
4,5,Documentary,2023-05-29 19:20:15
5,6,Romance,2023-05-29 19:20:15
6,7,News,2023-05-29 19:20:15
7,8,Family,2023-05-29 19:20:15
8,9,Reality-TV,2023-05-29 19:20:15
9,10,Animation,2023-05-29 19:20:15


In [153]:
dict_to_replace_genres = {x[1].genre_name: x[1].genre_id for x in sql_genres.iterrows()}
dict_to_replace_genres

{'Drama': 1,
 'Comedy': 2,
 'Talk-Show': 3,
 'Short': 4,
 'Documentary': 5,
 'Romance': 6,
 'News': 7,
 'Family': 8,
 'Reality-TV': 9,
 'Animation': 10,
 'Crime': 11,
 'Action': 12,
 'Adventure': 13,
 'Music': 14,
 'Game-Show': 15,
 'Adult': 16,
 'Sport': 17,
 'Fantasy': 18,
 'Mystery': 19,
 'Horror': 20,
 'Thriller': 21,
 'History': 22,
 'Sci-Fi': 23,
 'Biography': 24,
 'Musical': 25,
 'War': 26,
 'Western': 27,
 'Film-Noir': 28}

In [154]:
genres_titles.genres = genres_titles.genres.replace(dict_to_replace_genres)
genres_titles

Unnamed: 0,primaryTitle,genres
80179,1,12
80179,1,11
80179,1,1
92428,2,2
92453,3,2
...,...,...
9809852,6240,2
9809852,6240,4
9811561,6241,3
9811818,6242,9


In [158]:
genres_titles.genres.value_counts()

genres
2     1404
1     1390
5     1094
6      538
9      495
10     371
3      350
11     341
12     324
\N     265
13     256
17     237
8      230
22     178
19     159
18     154
21     152
14     146
15     136
4      116
20     101
23      94
7       78
24      77
16      30
26      28
25      26
27       5
Name: count, dtype: int64

In [159]:
genres_titles = genres_titles[genres_titles.genres != '\\N']
genres_titles.genres.value_counts()

genres
2     1404
1     1390
5     1094
6      538
9      495
10     371
3      350
11     341
12     324
13     256
17     237
8      230
22     178
19     159
18     154
21     152
14     146
15     136
4      116
20     101
23      94
7       78
24      77
16      30
26      28
25      26
27       5
Name: count, dtype: int64

In [162]:
txt = 'INSERT INTO Title_Genres(title_id, genre_id) VALUES'
for idx, item in genres_titles.iterrows():
    txt += f'({item.primaryTitle}, {item.genres}),'
txt = txt[:-1] + ';'
txt

with open('./sql_scripts/insert_title_genres.sql', 'w') as file:
    file.write(txt)

## Title_CREW

In [45]:
crew = pd.read_csv(Path('./data/title_crew.tsv').absolute(), sep='\t')
crew.dtypes

tconst       object
directors    object
writers      object
dtype: object

In [46]:
crew

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N
...,...,...,...
9818204,tt9916848,"nm5519454,nm5519375","nm6182221,nm1628284,nm2921377"
9818205,tt9916850,"nm5519454,nm5519375","nm6182221,nm1628284,nm2921377"
9818206,tt9916852,"nm5519454,nm5519375","nm6182221,nm1628284,nm2921377"
9818207,tt9916856,nm10538645,nm6951431


## EPISODE

In [42]:
title_episode = pd.read_csv(Path('./data/title_episode.tsv').absolute(), sep='\t')
title_episode


Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
0,tt0041951,tt0041038,1,9
1,tt0042816,tt0989125,1,17
2,tt0042889,tt0989125,\N,\N
3,tt0043426,tt0040051,3,42
4,tt0043631,tt0989125,2,16
...,...,...,...,...
7452150,tt9916846,tt1289683,3,18
7452151,tt9916848,tt1289683,3,17
7452152,tt9916850,tt1289683,3,19
7452153,tt9916852,tt1289683,3,20


## Title_Principals

In [2]:
principals = pd.read_csv(Path('./data/title_principals.tsv').absolute(), sep='\t')
principals

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N
...,...,...,...,...,...,...
55919902,tt9916880,4,nm10535738,actress,\N,"[""Horrid Henry""]"
55919903,tt9916880,5,nm0996406,director,principal director,\N
55919904,tt9916880,6,nm1482639,writer,\N,\N
55919905,tt9916880,7,nm2586970,writer,books,\N


In [6]:
jobs = principals.category.value_counts().index
jobs

Index(['actor', 'self', 'actress', 'writer', 'director', 'producer',
       'cinematographer', 'composer', 'editor', 'production_designer',
       'archive_footage', 'archive_sound'],
      dtype='object', name='category')