In [2]:
#import necessary packages for data cleaning
import pandas as pd
import numpy as np

In [5]:
#loading in the main data set from https://datasets.imdbws.com/
df = pd.read_csv('title.basics.tsv', delimiter='\t', low_memory=False)

In [6]:
#preview df to ensure that data was brought in properly
df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [7]:
#analyze the length of df
len(df)

6721152

In [35]:
#create a new df with nothing but the titleType movie
dfMovie = df[(df['titleType'] == 'movie')]

In [36]:
#preview of dfMovie
dfMovie.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
145,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,20,"Documentary,News,Sport"
332,tt0000335,movie,Soldiers of the Cross,Soldiers of the Cross,0,1900,\N,\N,"Biography,Drama"
499,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
571,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Biography,Crime,Drama"


In [37]:
#additional preview of dfMovie
dfMovie.info()
print(len(dfMovie))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 548292 entries, 8 to 6721102
Data columns (total 9 columns):
tconst            548292 non-null object
titleType         548292 non-null object
primaryTitle      548292 non-null object
originalTitle     548292 non-null object
isAdult           548292 non-null int64
startYear         548292 non-null object
endYear           548292 non-null object
runtimeMinutes    548292 non-null object
genres            548292 non-null object
dtypes: int64(1), object(8)
memory usage: 41.8+ MB
548292


In [38]:
#need to convert startYear as integers to narrow down data set to movies released in 2005-2020
dfMovie['startYear'].dropna()

8          1894
145        1897
332        1900
499        1905
571        1906
           ... 
6721041    2015
6721068    2007
6721080    2013
6721091    2017
6721102    2013
Name: startYear, Length: 548292, dtype: object

In [39]:
#drop rows with placeholder values
dfMovie = dfMovie[(dfMovie['startYear'] != '\\N')]

In [40]:
#Preview changes to dfMovie
dfMovie.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
145,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,20,"Documentary,News,Sport"
332,tt0000335,movie,Soldiers of the Cross,Soldiers of the Cross,0,1900,\N,\N,"Biography,Drama"
499,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
571,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Biography,Crime,Drama"


In [41]:
#change all values to integers
dfMovie.startYear = dfMovie.startYear.astype('int32', copy=False)

In [42]:
#verify change 
dfMovie.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 485061 entries, 8 to 6721102
Data columns (total 9 columns):
tconst            485061 non-null object
titleType         485061 non-null object
primaryTitle      485061 non-null object
originalTitle     485061 non-null object
isAdult           485061 non-null int64
startYear         485061 non-null int32
endYear           485061 non-null object
runtimeMinutes    485061 non-null object
genres            485061 non-null object
dtypes: int32(1), int64(1), object(7)
memory usage: 35.2+ MB


In [43]:
#Narrow dataset to only include movies within desired dates
dfMovie = dfMovie[(dfMovie['startYear'] >= 2005) & (dfMovie['startYear'] <= 2020)]

In [45]:
#ensure there are no values I do not want
dfMovie.startYear.unique()

array([2014, 2020, 2010, 2018, 2005, 2017, 2006, 2008, 2019, 2007, 2009,
       2012, 2013, 2011, 2015, 2016])

In [46]:
#export dfMovie to a CSV for future work
dfMovie.to_csv('movie.csv', index=False)

In [48]:
#this ensures that there are no duplicate values within the df
dfMovie.duplicated(keep='first').unique()

array([False])

In [126]:
#here we will reload our csv file for additional work.
dfMaster = pd.read_csv('movie.csv', low_memory=False)

In [127]:
dfMaster.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0016906,movie,Frivolinas,Frivolinas,0,2014,\N,80,"Comedy,Musical"
1,tt0062336,movie,El tango del viudo y su espejo deformante,El tango del viudo y su espejo deformante,0,2020,\N,70,Drama
2,tt0064322,movie,The Woman with the Knife,La femme au couteau,0,2010,\N,80,"Drama,Thriller"
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,\N,122,Drama
4,tt0069204,movie,Sabse Bada Sukh,Sabse Bada Sukh,0,2018,\N,\N,"Comedy,Drama"


In [128]:
#here we will drop the endYear table as it is not necessary to have.
dfMaster.drop(columns = 'endYear')

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
0,tt0016906,movie,Frivolinas,Frivolinas,0,2014,80,"Comedy,Musical"
1,tt0062336,movie,El tango del viudo y su espejo deformante,El tango del viudo y su espejo deformante,0,2020,70,Drama
2,tt0064322,movie,The Woman with the Knife,La femme au couteau,0,2010,80,"Drama,Thriller"
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122,Drama
4,tt0069204,movie,Sabse Bada Sukh,Sabse Bada Sukh,0,2018,\N,"Comedy,Drama"
...,...,...,...,...,...,...,...,...
209201,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,57,Documentary
209202,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,100,Documentary
209203,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013,\N,Comedy
209204,tt9916730,movie,6 Gunn,6 Gunn,0,2017,116,\N


In [72]:
#here we will begin to process additional data
dfWritersDir = pd.read_csv('title.crew.tsv', delimiter = '\t', low_memory=False)

In [108]:
dfWritersDir.tail()

Unnamed: 0,tconst,directors,writers
6724395,tt9916848,"nm5519454,nm5519375","nm6182221,nm1628284,nm2921377"
6724396,tt9916850,"nm5519454,nm5519375","nm6182221,nm1628284,nm2921377"
6724397,tt9916852,"nm5519375,nm5519454","nm6182221,nm1628284,nm2921377"
6724398,tt9916856,nm10538645,nm6951431
6724399,tt9916880,nm0996406,"nm1482639,nm2586970"


In [74]:
dfNames = pd.read_csv('name.basics.tsv', delimiter = '\t', low_memory=False)

In [75]:
dfNames.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0043044,tt0050419,tt0053137,tt0072308"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,producer","tt0054452,tt0059956,tt0057345,tt0049189"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0078723,tt0080455,tt0077975,tt0072562"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050976,tt0083922,tt0060827,tt0050986"


In [129]:
dfMaster.set_index('tconst')

Unnamed: 0_level_0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
tt0016906,movie,Frivolinas,Frivolinas,0,2014,\N,80,"Comedy,Musical"
tt0062336,movie,El tango del viudo y su espejo deformante,El tango del viudo y su espejo deformante,0,2020,\N,70,Drama
tt0064322,movie,The Woman with the Knife,La femme au couteau,0,2010,\N,80,"Drama,Thriller"
tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,\N,122,Drama
tt0069204,movie,Sabse Bada Sukh,Sabse Bada Sukh,0,2018,\N,\N,"Comedy,Drama"
...,...,...,...,...,...,...,...,...
tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,\N,57,Documentary
tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,\N,100,Documentary
tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013,\N,\N,Comedy
tt9916730,movie,6 Gunn,6 Gunn,0,2017,\N,116,\N


In [130]:
dfWritersDir.set_index('tconst')

Unnamed: 0_level_0,directors,writers
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0000001,nm0005690,\N
tt0000002,nm0721526,\N
tt0000003,nm0721526,\N
tt0000004,nm0721526,\N
tt0000005,nm0005690,\N
...,...,...
tt9916848,"nm5519454,nm5519375","nm6182221,nm1628284,nm2921377"
tt9916850,"nm5519454,nm5519375","nm6182221,nm1628284,nm2921377"
tt9916852,"nm5519375,nm5519454","nm6182221,nm1628284,nm2921377"
tt9916856,nm10538645,nm6951431


In [131]:
dfMaster = dfMaster.merge(dfWritersDir,on='tconst',how='left')


In [132]:
dfMaster.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,directors,writers
0,tt0016906,movie,Frivolinas,Frivolinas,0,2014,\N,80,"Comedy,Musical",nm0136068,\N
1,tt0062336,movie,El tango del viudo y su espejo deformante,El tango del viudo y su espejo deformante,0,2020,\N,70,Drama,"nm0749914,nm0765384","nm0749914,nm1146177"
2,tt0064322,movie,The Woman with the Knife,La femme au couteau,0,2010,\N,80,"Drama,Thriller",nm0863604,nm0863604
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,\N,122,Drama,nm0000080,"nm0000080,nm0462648"
4,tt0069204,movie,Sabse Bada Sukh,Sabse Bada Sukh,0,2018,\N,\N,"Comedy,Drama",nm0611531,nm0347899


In [133]:
dfMaster.drop(columns='endYear')

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,writers
0,tt0016906,movie,Frivolinas,Frivolinas,0,2014,80,"Comedy,Musical",nm0136068,\N
1,tt0062336,movie,El tango del viudo y su espejo deformante,El tango del viudo y su espejo deformante,0,2020,70,Drama,"nm0749914,nm0765384","nm0749914,nm1146177"
2,tt0064322,movie,The Woman with the Knife,La femme au couteau,0,2010,80,"Drama,Thriller",nm0863604,nm0863604
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122,Drama,nm0000080,"nm0000080,nm0462648"
4,tt0069204,movie,Sabse Bada Sukh,Sabse Bada Sukh,0,2018,\N,"Comedy,Drama",nm0611531,nm0347899
...,...,...,...,...,...,...,...,...,...,...
209201,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,57,Documentary,"nm9272491,nm9272490","nm9272490,nm9272491"
209202,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,100,Documentary,nm0652213,"nm0652213,nm10538576"
209203,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013,\N,Comedy,nm7764440,nm7933903
209204,tt9916730,movie,6 Gunn,6 Gunn,0,2017,116,\N,nm10538612,nm10538612


In [134]:
dfMaster = dfMaster[(dfMaster['genres'] !=r'\N')]

In [135]:
dfMaster  = dfMaster.drop(columns = 'endYear')

In [136]:
dfMaster  = dfMaster.drop(columns = 'primaryTitle')

In [137]:
dfMaster.head()

Unnamed: 0,tconst,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,writers
0,tt0016906,movie,Frivolinas,0,2014,80,"Comedy,Musical",nm0136068,\N
1,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,"nm0749914,nm0765384","nm0749914,nm1146177"
2,tt0064322,movie,La femme au couteau,0,2010,80,"Drama,Thriller",nm0863604,nm0863604
3,tt0069049,movie,The Other Side of the Wind,0,2018,122,Drama,nm0000080,"nm0000080,nm0462648"
4,tt0069204,movie,Sabse Bada Sukh,0,2018,\N,"Comedy,Drama",nm0611531,nm0347899


In [138]:
dfMaster.set_index('directors')

Unnamed: 0_level_0,tconst,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres,writers
directors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
nm0136068,tt0016906,movie,Frivolinas,0,2014,80,"Comedy,Musical",\N
"nm0749914,nm0765384",tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,"nm0749914,nm1146177"
nm0863604,tt0064322,movie,La femme au couteau,0,2010,80,"Drama,Thriller",nm0863604
nm0000080,tt0069049,movie,The Other Side of the Wind,0,2018,122,Drama,"nm0000080,nm0462648"
nm0611531,tt0069204,movie,Sabse Bada Sukh,0,2018,\N,"Comedy,Drama",nm0347899
...,...,...,...,...,...,...,...,...
nm4457074,tt9916538,movie,Kuambil Lagi Hatiku,0,2019,123,Drama,"nm4843252,nm4900525,nm2679404"
"nm9272491,nm9272490",tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,57,Documentary,"nm9272490,nm9272491"
nm0652213,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,0,2007,100,Documentary,"nm0652213,nm10538576"
nm7764440,tt9916706,movie,Dankyavar Danka,0,2013,\N,Comedy,nm7933903


In [139]:
def splitDataFrameList(df,target_column,separator):
    ''' df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
    The values in the other columns are duplicated across the newly divided rows.
    '''
    def splitListToRows(row,row_accumulator,target_column,separator):
        split_row = row[target_column].split(separator)
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows,axis=1,args = (new_rows,target_column,separator))
    new_df = pd.DataFrame(new_rows)
    return new_df
#code found https://gist.github.com/jlln/338b4b0b55bd6984f883 Thank you for the awesome function


In [140]:
dfMaster = dfMaster[(dfMaster['directors'] !=r'\N')]

In [141]:
dfMaster['directors'] = dfMaster.directors.astype('str', copy=False)

In [142]:
dfMaster = splitDataFrameList(dfMaster, 'directors', ',')

In [143]:
dfMaster.head()

Unnamed: 0,tconst,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,writers
0,tt0016906,movie,Frivolinas,0,2014,80,"Comedy,Musical",nm0136068,\N
1,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0749914,"nm0749914,nm1146177"
2,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0765384,"nm0749914,nm1146177"
3,tt0064322,movie,La femme au couteau,0,2010,80,"Drama,Thriller",nm0863604,nm0863604
4,tt0069049,movie,The Other Side of the Wind,0,2018,122,Drama,nm0000080,"nm0000080,nm0462648"


In [144]:
dfMaster = dfMaster[(dfMaster['writers'] !=r'\N')]

In [145]:
dfMaster.head()

Unnamed: 0,tconst,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,writers
1,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0749914,"nm0749914,nm1146177"
2,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0765384,"nm0749914,nm1146177"
3,tt0064322,movie,La femme au couteau,0,2010,80,"Drama,Thriller",nm0863604,nm0863604
4,tt0069049,movie,The Other Side of the Wind,0,2018,122,Drama,nm0000080,"nm0000080,nm0462648"
5,tt0069204,movie,Sabse Bada Sukh,0,2018,\N,"Comedy,Drama",nm0611531,nm0347899


In [146]:
dfMaster.dropna()

Unnamed: 0,tconst,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,writers
1,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0749914,"nm0749914,nm1146177"
2,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0765384,"nm0749914,nm1146177"
3,tt0064322,movie,La femme au couteau,0,2010,80,"Drama,Thriller",nm0863604,nm0863604
4,tt0069049,movie,The Other Side of the Wind,0,2018,122,Drama,nm0000080,"nm0000080,nm0462648"
5,tt0069204,movie,Sabse Bada Sukh,0,2018,\N,"Comedy,Drama",nm0611531,nm0347899
...,...,...,...,...,...,...,...,...,...
226122,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,57,Documentary,nm9272490,"nm9272490,nm9272491"
226123,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,0,2007,100,Documentary,nm0652213,"nm0652213,nm10538576"
226124,tt9916706,movie,Dankyavar Danka,0,2013,\N,Comedy,nm7764440,nm7933903
226125,tt9916754,movie,Chico Albuquerque - Revelações,0,2013,49,Documentary,nm9272490,"nm8349149,nm9272490"


In [147]:
dfMaster.head()

Unnamed: 0,tconst,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,writers
1,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0749914,"nm0749914,nm1146177"
2,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0765384,"nm0749914,nm1146177"
3,tt0064322,movie,La femme au couteau,0,2010,80,"Drama,Thriller",nm0863604,nm0863604
4,tt0069049,movie,The Other Side of the Wind,0,2018,122,Drama,nm0000080,"nm0000080,nm0462648"
5,tt0069204,movie,Sabse Bada Sukh,0,2018,\N,"Comedy,Drama",nm0611531,nm0347899


In [149]:
dfMaster['writers'] = dfMaster.writers.astype('str', copy=False)

In [150]:
dfMaster = splitDataFrameList(dfMaster, 'writers', ',')

In [151]:
dfMaster.head()

Unnamed: 0,tconst,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,writers
0,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0749914,nm0749914
1,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0749914,nm1146177
2,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0765384,nm0749914
3,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0765384,nm1146177
4,tt0064322,movie,La femme au couteau,0,2010,80,"Drama,Thriller",nm0863604,nm0863604


In [154]:
dfNames.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0043044,tt0050419,tt0053137,tt0072308"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,producer","tt0054452,tt0059956,tt0057345,tt0049189"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0078723,tt0080455,tt0077975,tt0072562"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050976,tt0083922,tt0060827,tt0050986"


In [156]:
dfNames.set_index('nconst')

Unnamed: 0_level_0,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0043044,tt0050419,tt0053137,tt0072308"
nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,producer","tt0054452,tt0059956,tt0057345,tt0049189"
nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0078723,tt0080455,tt0077975,tt0072562"
nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050976,tt0083922,tt0060827,tt0050986"
...,...,...,...,...,...
nm9993714,Romeo del Rosario,\N,\N,"animation_department,art_department",tt2455546
nm9993716,Essias Loberg,\N,\N,,\N
nm9993717,Harikrishnan Rajan,\N,\N,cinematographer,tt8736744
nm9993718,Aayush Nair,\N,\N,cinematographer,\N


In [157]:
dfNames.dropna()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0043044,tt0050419,tt0053137,tt0072308"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,producer","tt0054452,tt0059956,tt0057345,tt0049189"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0078723,tt0080455,tt0077975,tt0072562"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050976,tt0083922,tt0060827,tt0050986"
...,...,...,...,...,...,...
10022850,nm9993709,Lu Bevins,\N,\N,"director,writer,cinematographer","tt11772940,tt11772858,tt11702702,tt11772904"
10022854,nm9993713,Sambit Mishra,\N,\N,writer,tt8325250
10022855,nm9993714,Romeo del Rosario,\N,\N,"animation_department,art_department",tt2455546
10022857,nm9993717,Harikrishnan Rajan,\N,\N,cinematographer,tt8736744


In [158]:
dfNames = dfNames[(dfNames['knownForTitles'] !=r'\N')]


In [161]:
dfNames = dfNames[(dfNames['birthYear'] !=r'\N')]

In [162]:
dfNames = dfNames[(dfNames['deathYear'] !=r'\N')]

In [165]:
dfNames.set_index('nconst')

Unnamed: 0_level_0,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0043044,tt0050419,tt0053137,tt0072308"
nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0078723,tt0080455,tt0077975,tt0072562"
nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050976,tt0083922,tt0060827,tt0050986"
nm0000006,Ingrid Bergman,1915,1982,"actress,soundtrack,producer","tt0038787,tt0038109,tt0034583,tt0036855"
...,...,...,...,...,...
nm9990008,Olavo Bilac,1865,1918,writer,tt8735938
nm9993432,Albert Minns,1920,1985,,tt0189339
nm9993434,Richard Blackmarr,1929,2013,,tt0189339
nm9993435,William Riva,1919,1999,set_decorator,tt0189339


In [166]:
dfMaster.set_index('directors')

Unnamed: 0_level_0,tconst,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres,writers
directors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
nm0749914,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0749914
nm0749914,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm1146177
nm0765384,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0749914
nm0765384,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm1146177
nm0863604,tt0064322,movie,La femme au couteau,0,2010,80,"Drama,Thriller",nm0863604
...,...,...,...,...,...,...,...,...
nm7764440,tt9916706,movie,Dankyavar Danka,0,2013,\N,Comedy,nm7933903
nm9272490,tt9916754,movie,Chico Albuquerque - Revelações,0,2013,49,Documentary,nm8349149
nm9272490,tt9916754,movie,Chico Albuquerque - Revelações,0,2013,49,Documentary,nm9272490
nm8349149,tt9916754,movie,Chico Albuquerque - Revelações,0,2013,49,Documentary,nm8349149


In [169]:
dfTest = dfMaster.join(dfNames,how='left')
# dfMaster = dfMaster.merge(dfWritersDir,on='tconst',how='left')

In [173]:
dfTest.head()

Unnamed: 0,tconst,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,writers,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0749914,nm0749914,nm0000001,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0043044,tt0050419,tt0053137,tt0072308"
1,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0749914,nm1146177,nm0000002,Lauren Bacall,1924.0,2014.0,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
2,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0765384,nm0749914,,,,,,
3,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0765384,nm1146177,nm0000004,John Belushi,1949.0,1982.0,"actor,soundtrack,writer","tt0078723,tt0080455,tt0077975,tt0072562"
4,tt0064322,movie,La femme au couteau,0,2010,80,"Drama,Thriller",nm0863604,nm0863604,nm0000005,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0050976,tt0083922,tt0060827,tt0050986"


In [174]:
dfTest.tail()

Unnamed: 0,tconst,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,writers,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
350199,tt9916706,movie,Dankyavar Danka,0,2013,\N,Comedy,nm7764440,nm7933903,,,,,,
350200,tt9916754,movie,Chico Albuquerque - Revelações,0,2013,49,Documentary,nm9272490,nm8349149,,,,,,
350201,tt9916754,movie,Chico Albuquerque - Revelações,0,2013,49,Documentary,nm9272490,nm9272490,,,,,,
350202,tt9916754,movie,Chico Albuquerque - Revelações,0,2013,49,Documentary,nm8349149,nm8349149,nm0369260,Doris Haug,1927.0,2014.0,miscellaneous,tt0056913
350203,tt9916754,movie,Chico Albuquerque - Revelações,0,2013,49,Documentary,nm8349149,nm9272490,,,,,,


In [175]:
dfTest.dropna()

Unnamed: 0,tconst,titleType,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,writers,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0749914,nm0749914,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0043044,tt0050419,tt0053137,tt0072308"
1,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0749914,nm1146177,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0038355,tt0117057,tt0071877"
3,tt0062336,movie,El tango del viudo y su espejo deformante,0,2020,70,Drama,nm0765384,nm1146177,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0078723,tt0080455,tt0077975,tt0072562"
4,tt0064322,movie,La femme au couteau,0,2010,80,"Drama,Thriller",nm0863604,nm0863604,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050976,tt0083922,tt0060827,tt0050986"
5,tt0069049,movie,The Other Side of the Wind,0,2018,122,Drama,nm0000080,nm0000080,nm0000006,Ingrid Bergman,1915,1982,"actress,soundtrack,producer","tt0038787,tt0038109,tt0034583,tt0036855"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350169,tt9914286,movie,Sokagin Çocuklari,0,2019,98,"Drama,Family",nm4394529,nm1902682,nm0369224,Jochen Hauer,1899,1966,actor,"tt0026517,tt0027581,tt0140096,tt0044647"
350174,tt9914662,movie,Wien is 't Hof van Commerce,0,2018,\N,Comedy,nm10537404,nm10537404,nm0369229,Robert Hauer-Riedl,1942,2005,actor,"tt0090554,tt0094784,tt0240794,tt0108829"
350179,tt9914828,movie,The War of Godzilla,0,2015,102,"Action,Comedy,Family",nm10537549,nm10537598,nm0369234,Angelika Hauff,1922,1983,actress,"tt0044641,tt0158245,tt0037696,tt0317139"
350193,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,57,Documentary,nm9272491,nm9272490,nm0369250,Max Haufler,1910,1965,"actor,director,writer","tt0403271,tt0195057,tt0057427,tt0030118"


In [176]:
#from here we will join the table with the writers
dfMaster.to_csv('Master.csv', index=False)