# Merging different IMDb dataframes into one dataset

### *Author: Hank Hoang*
### *Date created: 28/06/2023*
### *Date finished: 29/06/2023*


## Note
* Due to limited facilities(insufficient RAM), the principal.tsv dataset is too large (2.51GB) for my laptop, my Jupiter notebook couldn't load it, even Google colab can't bear this case, and the dataset made it collapse. So I skipped this dataset for this time.
* Merge Crew and Name_basic datasets into one dataset to retrieve director names.
* Using inner join to merge datasets.
* \N’ is used to denote that a particular field is missing or null for that title/name.

In [1]:
# Import libraries
import pandas as pd

In [2]:
data1=pd.read_csv('title_basic.tsv',sep='\t')
data2=pd.read_csv('rating.tsv',sep='\t')

  data1=pd.read_csv('title_basic.tsv',sep='\t')


In [3]:
data1.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


Basically, originalTitle is pretty much the same as primaryTitle but primaryTitle is more well-known for commercial purposes.Therefore, delete originalTitle column

In [4]:
# Delete 'originalTitle' column
data1.drop('originalTitle', inplace=True, axis=1)

In [5]:
data1.head()

Unnamed: 0,tconst,titleType,primaryTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [6]:
data2.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1982
1,tt0000002,5.8,265
2,tt0000003,6.5,1838
3,tt0000004,5.5,178
4,tt0000005,6.2,2625


In [7]:
df_inner = pd.merge(data1, data2, on='tconst', how='inner')
df_inner

Unnamed: 0,tconst,titleType,primaryTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,1982
1,tt0000002,short,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",5.8,265
2,tt0000003,short,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance",6.5,1838
3,tt0000004,short,Un bon bock,0,1892,\N,12,"Animation,Short",5.5,178
4,tt0000005,short,Blacksmith Scene,0,1893,\N,1,"Comedy,Short",6.2,2625
...,...,...,...,...,...,...,...,...,...,...
1324193,tt9916730,movie,6 Gunn,0,2017,\N,116,\N,8.3,10
1324194,tt9916766,tvEpisode,Episode #10.15,0,2019,\N,43,"Family,Game-Show,Reality-TV",7.0,21
1324195,tt9916778,tvEpisode,Escape,0,2019,\N,\N,"Crime,Drama,Mystery",7.2,36
1324196,tt9916840,tvEpisode,Horrid Henry's Comic Caper,0,2014,\N,11,"Adventure,Animation,Comedy",7.5,7


In [8]:
data2

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1982
1,tt0000002,5.8,265
2,tt0000003,6.5,1838
3,tt0000004,5.5,178
4,tt0000005,6.2,2625
...,...,...,...
1324193,tt9916730,8.3,10
1324194,tt9916766,7.0,21
1324195,tt9916778,7.2,36
1324196,tt9916840,7.5,7


In [9]:
data3=pd.read_csv('name_basic.tsv',sep='\t')

In [10]:
data3.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0031983,tt0050419,tt0072308,tt0053137"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack","tt0037382,tt0038355,tt0075213,tt0117057"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,soundtrack,music_department","tt0056404,tt0049189,tt0057345,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer","tt0077975,tt0072562,tt0078723,tt0080455"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050976,tt0083922,tt0050986,tt0069467"


In [11]:
#Delete unnecessary columns
data3.drop(['birthYear','deathYear','primaryProfession','knownForTitles'], inplace=True, axis=1)

In [12]:
data3.head()

Unnamed: 0,nconst,primaryName
0,nm0000001,Fred Astaire
1,nm0000002,Lauren Bacall
2,nm0000003,Brigitte Bardot
3,nm0000004,John Belushi
4,nm0000005,Ingmar Bergman


In [13]:
data4=pd.read_csv('crew.tsv',sep='\t')

In [14]:
data4.head()

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N


In [15]:
# Delete 'writers' column
data4.drop('writers', inplace=True, axis=1)

In [16]:
#Change name for director column into nconst
data4.rename(columns={'directors': 'nconst'}, inplace=True)

In [17]:
data4

Unnamed: 0,tconst,nconst
0,tt0000001,nm0005690
1,tt0000002,nm0721526
2,tt0000003,nm0721526
3,tt0000004,nm0721526
4,tt0000005,nm0005690
...,...,...
9963675,tt9916848,nm1485677
9963676,tt9916850,nm1485677
9963677,tt9916852,nm1485677
9963678,tt9916856,nm10538645


### Merge crew and name_basic datasets into one

In [18]:
df_inner2 = pd.merge(data3, data4, on='nconst', how='inner')
df_inner2

Unnamed: 0,nconst,primaryName,tconst
0,nm0000005,Ingmar Bergman,tt0038468
1,nm0000005,Ingmar Bergman,tt0038675
2,nm0000005,Ingmar Bergman,tt0039834
3,nm0000005,Ingmar Bergman,tt0040418
4,nm0000005,Ingmar Bergman,tt0040622
...,...,...,...
4617723,nm9993694,Chinmay Mishra,tt18361688
4617724,nm9993694,Chinmay Mishra,tt18687502
4617725,nm9993696,Ibrahim-Aloduley,tt8744160
4617726,nm9993708,Eli Bevins,tt9046122


In [19]:
# Change column name to denote directors' name
df_inner2.rename(columns={'primaryName': 'Director Name'}, inplace=True)

In [20]:
df_inner2.head()

Unnamed: 0,nconst,Director Name,tconst
0,nm0000005,Ingmar Bergman,tt0038468
1,nm0000005,Ingmar Bergman,tt0038675
2,nm0000005,Ingmar Bergman,tt0039834
3,nm0000005,Ingmar Bergman,tt0040418
4,nm0000005,Ingmar Bergman,tt0040622


In [21]:
# Delete 'nconst' column
df_inner2.drop('nconst', inplace=True, axis=1)

In [22]:
df_inner2

Unnamed: 0,Director Name,tconst
0,Ingmar Bergman,tt0038468
1,Ingmar Bergman,tt0038675
2,Ingmar Bergman,tt0039834
3,Ingmar Bergman,tt0040418
4,Ingmar Bergman,tt0040622
...,...,...
4617723,Chinmay Mishra,tt18361688
4617724,Chinmay Mishra,tt18687502
4617725,Ibrahim-Aloduley,tt8744160
4617726,Eli Bevins,tt9046122


## Merge all transformed datasets into the final dataset

In [23]:
df = pd.merge(df_inner, df_inner2, on='tconst', how='inner')
df.head()

Unnamed: 0,tconst,titleType,primaryTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,Director Name
0,tt0000001,short,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,1982,William K.L. Dickson
1,tt0000002,short,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",5.8,265,Émile Reynaud
2,tt0000003,short,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance",6.5,1838,Émile Reynaud
3,tt0000004,short,Un bon bock,0,1892,\N,12,"Animation,Short",5.5,178,Émile Reynaud
4,tt0000005,short,Blacksmith Scene,0,1893,\N,1,"Comedy,Short",6.2,2625,William K.L. Dickson


In [24]:
# Change column names 
df.rename(columns={'genres': 'Genres','primaryTitle':'Title','titleType':'Type','runtimeMinutes':'Runtime','averageRating':'Rating','numVotes':'Number of Votes'}, inplace=True)

In [25]:
df.head()

Unnamed: 0,tconst,Type,Title,isAdult,startYear,endYear,Runtime,Genres,Rating,Number of Votes,Director Name
0,tt0000001,short,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,1982,William K.L. Dickson
1,tt0000002,short,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",5.8,265,Émile Reynaud
2,tt0000003,short,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance",6.5,1838,Émile Reynaud
3,tt0000004,short,Un bon bock,0,1892,\N,12,"Animation,Short",5.5,178,Émile Reynaud
4,tt0000005,short,Blacksmith Scene,0,1893,\N,1,"Comedy,Short",6.2,2625,William K.L. Dickson


In [26]:
#Saving dataset
df.to_csv('IMDb_data.csv', index=False)