In [23]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

**Dataset: IMDB - Principles**

In [37]:
prin = pd.read_csv('zippedData/imdb.title.principals.csv.gz')
prin.head(12)

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,
2,tt0111414,3,nm3739909,producer,producer,
3,tt0323808,10,nm0059247,editor,,
4,tt0323808,1,nm3579312,actress,,"[""Beth Boothby""]"
5,tt0323808,2,nm2694680,actor,,"[""Steve Thomson""]"
6,tt0323808,3,nm0574615,actor,,"[""Sir Lachlan Morrison""]"
7,tt0323808,4,nm0502652,actress,,"[""Lady Delia Morrison""]"
8,tt0323808,5,nm0362736,director,,
9,tt0323808,6,nm0811056,producer,producer,


In [9]:
prin.ordering = prin.ordering.apply(lambda x: int(x))

In [14]:
prin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028186 entries, 0 to 1028185
Data columns (total 6 columns):
tconst        1028186 non-null object
ordering      1028186 non-null int64
nconst        1028186 non-null object
category      1028186 non-null object
job           177684 non-null object
characters    393360 non-null object
dtypes: int64(1), object(5)
memory usage: 47.1+ MB


In [26]:
len(prin[(prin.category != prin.job) & prin.job.notna()].head())

5

Removing the job column only affects 5 rows (of <1 million)

In [29]:
prin.drop(columns='job', inplace = True)

In [31]:
len(prin[prin.category.isin(['actor', 'actress'])])

402926

At first glance we see characters column is more than half Null, but looking closer, we find that most actors have an assoiciated character. Even if 15% are Him/herself

In [75]:
prin.characters.value_counts().head()

Himself     43584
Herself     16127
Narrator     2218
Alex          656
David         621
Name: characters, dtype: int64

In [39]:
prin.characters =prin.characters.apply(lambda x: x.strip('["]') if isinstance(x, str) else x)

In [33]:
prin.ordering.unique()

array([ 1,  2,  3, 10,  4,  5,  6,  7,  8,  9], dtype=int64)

In [41]:
prin.ordering[prin.category =='director'].unique()

array([ 2,  5,  3,  6,  1,  4,  7, 10,  8,  9], dtype=int64)

-No connection between ordering and category

I'm tempted to convert the dataset to one line per film, but I think it suits our needs as it is

In [52]:
prin.to_csv('clean_data/clean_principles.csv', index = False)

**Dataset: IMDB - title basics**

In [76]:
bas = pd.read_csv('zippedData/imdb.title.basics.csv.gz')

In [124]:
bas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 146143 entries, 0 to 146143
Data columns (total 6 columns):
tconst             146143 non-null object
primary_title      146143 non-null object
original_title     146143 non-null object
start_year         146143 non-null int64
runtime_minutes    146143 non-null float64
genres             140735 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 12.8+ MB


In [79]:
bas.head()

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"
2,tt0069049,The Other Side of the Wind,The Other Side of the Wind,2018,122.0,Drama
3,tt0069204,Sabse Bada Sukh,Sabse Bada Sukh,2018,,"Comedy,Drama"
4,tt0100275,The Wandering Soap Opera,La Telenovela Errante,2017,80.0,"Comedy,Drama,Fantasy"


In [85]:
bas.primary_title.value_counts
bas.original_title.value_counts()
bas.start_year.value_counts()
bas.runtime_minutes.value_counts()

90.0     7131
80.0     3526
85.0     2915
100.0    2662
95.0     2549
         ... 
382.0       1
724.0       1
808.0       1
287.0       1
540.0       1
Name: runtime_minutes, Length: 367, dtype: int64

In [112]:
#fill null 'original_title' entries with the primary title
bas.loc[(bas.original_title.isna()),
        'original_title'] = bas.loc[(bas.original_title.isna()),
                                    'primary_title']

In [88]:
bas[bas.start_year == 2115]
#Checked it out, "upcoming experimental science fiction film" -year is correct

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
89506,tt5174640,100 Years,100 Years,2115,,Drama


In [92]:
#But will remove as an extreme outlier; MS probably shan't want to replicate it
bas.drop(89506, inplace = True)

In [94]:
bas.describe()

Unnamed: 0,start_year,runtime_minutes
count,146143.0,114405.0
mean,2014.621111,86.187247
std,2.720952,166.36059
min,2010.0,1.0
25%,2012.0,70.0
50%,2015.0,87.0
75%,2017.0,99.0
max,2027.0,51420.0


In [120]:
#replacing null runtimes with the median
bas.loc[bas.runtime_minutes.isna(), 'runtime_minutes'] =87.0

In [121]:
bas.describe()
#Note: lowered the std, and interquartile range

Unnamed: 0,start_year,runtime_minutes
count,146143.0,146143.0
mean,2014.621111,86.363753
std,2.720952,147.192154
min,2010.0,1.0
25%,2012.0,75.0
50%,2015.0,87.0
75%,2017.0,95.0
max,2027.0,51420.0


Genres are something I'll being looking at and only a small percentage are missing, so I'll drop rows

In [127]:
#create list of all genres
bas.dropna(inplace = True)

In [167]:
#clean list to remove multiples with only differing spaces
genres = []
for f in bas.genres:
    for g in f.split(','):
        if not g in genres:
            genres.append(g)

In [182]:
#create a column for each genre, give value True if the string appears 
#...in the original genre_ids column
for g in genres:
    bas[g] = bas.genres.apply(lambda x: g in x)

In [194]:
bas[['genres']+ genres].head()

Unnamed: 0,genres,Action,Crime,Drama,Biography,Comedy,Fantasy,Horror,Thriller,Adventure,...,Music,Sport,Western,Musical,Adult,News,Talk-Show,Reality-TV,Game-Show,Short
0,"Action,Crime,Drama",True,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"Biography,Drama",False,False,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Drama,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,"Comedy,Drama",False,False,True,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,"Comedy,Drama,Fantasy",False,False,True,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [195]:
bas.to_csv('clean_data/clean_basics.csv', index = False)

**Dataset: IMDB - name**

In [131]:
nam = pd.read_csv('zippedData/imdb.name.basics.csv.gz')

In [132]:
nam.head()

Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"
3,nm0062195,Axel Baumann,,,"camera_department,cinematographer,art_department","tt0114371,tt2004304,tt1618448,tt1224387"
4,nm0062798,Pete Baxter,,,"production_designer,art_department,set_decorator","tt0452644,tt0452692,tt3458030,tt2178256"


In [134]:
nam.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606648 entries, 0 to 606647
Data columns (total 6 columns):
nconst                606648 non-null object
primary_name          606648 non-null object
birth_year            82736 non-null float64
death_year            6783 non-null float64
primary_profession    555308 non-null object
known_for_titles      576444 non-null object
dtypes: float64(2), object(4)
memory usage: 27.8+ MB


The main use we'll have for this dataset is just to attach names to other tables via nconst

In [135]:
nam.describe()

Unnamed: 0,birth_year,death_year
count,82736.0,6783.0
mean,1967.043826,2000.523367
std,22.12219,43.95153
min,1.0,17.0
25%,1957.0,2001.0
50%,1971.0,2013.0
75%,1981.0,2016.0
max,2014.0,2019.0


In [151]:
nam[nam.birth_year < 1650]

Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
1244,nm0963344,John Bunyan,1628.0,1688.0,"writer,soundtrack","tt0393584,tt0194913,tt0234464,tt1336619"
11901,nm0549265,Christopher Marlowe,1564.0,1593.0,"writer,soundtrack","tt0114279,tt0062898,tt0287837,tt0101798"
21094,nm0613556,Shikibu Murasaki,973.0,1031.0,writer,"tt0092875,tt0043580,tt0094100,tt0256385"
38106,nm0674518,Charles Perrault,1628.0,1703.0,writer,"tt0053285,tt1587310,tt0042332,tt8651654"
39848,nm0596942,Molière,1622.0,1673.0,"writer,soundtrack,miscellaneous","tt0154249,tt0026069,tt0017448,tt0370853"
52088,nm1063158,Cheng'en Wu,1506.0,1581.0,writer,"tt1163129,tt6168860,tt0112778,tt1753783"
52523,nm0148859,Miguel de Cervantes y Saavedra,1547.0,1616.0,"writer,soundtrack","tt0204285,tt6151214,tt0023956,tt0039330"
59713,nm0000636,William Shakespeare,1564.0,1616.0,"writer,soundtrack,miscellaneous","tt8632012,tt3894536,tt5932378,tt7344870"
71418,nm0478548,Madame de La Fayette,1634.0,1693.0,writer,"tt1599975,tt0158117,tt0204761,tt1263778"
74627,nm0017266,Mariana Alcoforado,1640.0,1723.0,writer,"tt0062836,tt0076312,tt2219674"


As I go back looking for implausible birth years, the more I find they are correct (e.g. Murasaki Shikibu (紫 式部, English: Lady Murasaki; c. 973 or 978 – c. 1014 or 1031) was a Japanese novelist, poet and lady-in-waiting at the Imperial court during the Heian period. -Wikipedia). Except for Michael Vignola, born (Nov 1st) 1980

In [150]:
nam.loc[nam['birth_year'] == 1.0, 'birth_year'] = 1980.0

In [152]:
nam.death_year.value_counts()

2018.0    624
2017.0    621
2016.0    592
2015.0    540
2013.0    437
         ... 
1847.0      1
1848.0      1
1850.0      1
1854.0      1
1031.0      1
Name: death_year, Length: 214, dtype: int64

Despite only a small fraction having birth years, it's still a large enough number if to do something separate with, so I'll leave it as is. However I don't think there's enough data in the death_year column to be of use.

In [160]:
nam.drop(columns = 'death_year', inplace = True)

For primary_profession and known_for_titles, I'll leave the null values as they are. Too many entries to drop the column, not worth dropping the rows for a column that might not be used.

In [163]:
nam.primary_profession.value_counts()
nam.known_for_titles.value_counts()

In [164]:
nam.to_csv('clean_data/clean_names.csv', index = False)

**Dataset: TMDB -Movies**

In [3]:
tmdb = pd.read_csv('zippedData/tmdb.movies.csv.gz')
tmdb.head()

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [4]:
tmdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26517 entries, 0 to 26516
Data columns (total 10 columns):
Unnamed: 0           26517 non-null int64
genre_ids            26517 non-null object
id                   26517 non-null int64
original_language    26517 non-null object
original_title       26517 non-null object
popularity           26517 non-null float64
release_date         26517 non-null object
title                26517 non-null object
vote_average         26517 non-null float64
vote_count           26517 non-null int64
dtypes: float64(2), int64(3), object(5)
memory usage: 2.0+ MB


In [5]:
tmdb.genre_ids.value_counts()

[99]                 3700
[]                   2479
[18]                 2268
[35]                 1660
[27]                 1145
                     ... 
[27, 80, 18]            1
[10751, 80, 18]         1
[878, 80]               1
[10402, 14, 35]         1
[9648, 18, 10770]       1
Name: genre_ids, Length: 2477, dtype: int64

In [6]:
genre_dict = {'28':"Action", '12':"Adventure", '16':"Animation", '35':"Comedy", 
              '80':"Crime", '99':"Documentary", '18':"Drama", '10751':"Family", 
              '14':"Fantasy", '36':"History", '27':"Horror", '10402':"Music", 
              '9648':"Mystery", '10749':"Romance", '878':"Science Fiction", 
              '10770':"TV Movie", '53':"Thriller", '10752':"War",'37':"Western"}

In [7]:
#create list of all genres
genres = []
for f in tmdb.genre_ids:
    for g in f.strip('[]').split(','):
        if not g in genres:
            genres.append(g)

In [8]:
#clean list to remove multiples with only differing spaces
genres = set(list(map(lambda x: x.strip(' '), genres)))

In [9]:
#create a column for each genre, give value True if the string appears 
#...in the original genre_ids column
for g in genres:
    tmdb[g] = tmdb.genre_ids.apply(lambda x: g in x)

In [10]:
tmdb.head(2)

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,Unnamed: 11,35,80,37,53,28,18,10402,10751,36,27,99,10752,14,878,10770,16,12,10749,9648
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,True,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,True,True,False,False


In [11]:
tmdb.drop(columns ='', inplace = True)

In [12]:
tmdb.rename(columns = genre_dict, inplace = True)

In [13]:
tmdb.head()

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,Comedy,Crime,Western,Thriller,Action,Drama,Music,Family,History,Horror,Documentary,War,Fantasy,Science Fiction,TV Movie,Animation,Adventure,Romance,Mystery
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,True,True,False,False
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False


In [14]:
tmdb.iloc[:,:10]
tmdb.iloc[:,10:].sort_index(axis=1)  #genre columns in alphabetical order
tmdb = tmdb.iloc[:,:10].join(tmdb.iloc[:,10:].sort_index(axis=1))

In [15]:
tmdb.drop(columns = ['Unnamed: 0', 'genre_ids'], inplace = True)

In [16]:
tmdb.id.value_counts()

292086    3
463839    3
11976     3
391872    3
416572    3
         ..
356987    1
350846    1
479871    1
500353    1
524288    1
Name: id, Length: 25497, dtype: int64

In [17]:
tmdb[tmdb.id == 292086] #duplicate entries found

Unnamed: 0,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
5838,292086,en,Come Morning,3.013,2012-10-21,Come Morning,6.3,5,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False
11836,292086,en,Come Morning,3.013,2012-10-21,Come Morning,6.3,5,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False
21565,292086,en,Come Morning,3.013,2012-10-21,Come Morning,6.3,5,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False


In [18]:
tmdb.drop_duplicates(inplace = True)

In [34]:
#check for common implausible values
tmdb.original_language.value_counts()
tmdb.original_title.value_counts()
tmdb.popularity.value_counts()
tmdb.release_date.value_counts()
tmdb.title.value_counts()
tmdb.vote_average.value_counts()
tmdb.vote_count.value_counts()

1       6438
2       2994
3       1720
4       1303
5        945
        ... 
953        1
969        1
985        1
1001       1
2049       1
Name: vote_count, Length: 1693, dtype: int64

In [41]:
tmdb.to_csv('clean_data/clean_tmdb.cs', index = False)

In [19]:
tmdb.describe()

Unnamed: 0,id,popularity,vote_average,vote_count
count,25497.0,25497.0,25497.0,25497.0
mean,294203.960505,3.043279,5.979331,178.79578
std,154690.24966,4.261045,1.866094,914.150311
min,27.0,0.6,0.0,1.0
25%,154770.0,0.6,5.0,1.0
50%,307125.0,1.321,6.0,5.0
75%,420707.0,3.49,7.0,25.0
max,608444.0,80.773,10.0,22186.0


In [21]:
tmdb.sort_values(by=['vote_count']).tail(20)

Unnamed: 0,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
14172,140607,en,Star Wars: The Force Awakens,32.281,2015-12-18,Star Wars: The Force Awakens,7.4,12641,True,True,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False
14184,150540,en,Inside Out,24.797,2015-06-19,Inside Out,8.0,12691,False,False,True,True,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False
2468,10195,en,Thor,38.068,2011-05-06,Thor,6.7,12764,True,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
2474,1771,en,Captain America: The First Avenger,25.808,2011-07-22,Captain America: The First Avenger,6.9,12810,True,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
14169,99861,en,Avengers: Age of Ultron,44.383,2015-05-01,Avengers: Age of Ultron,7.3,13457,True,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
17437,297761,en,Suicide Squad,16.53,2016-08-05,Suicide Squad,5.9,13533,True,True,False,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,False
5182,49026,en,The Dark Knight Rises,26.22,2012-07-20,The Dark Knight Rises,7.7,13933,True,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,True,False,False
23811,299536,en,Avengers: Infinity War,80.773,2018-04-27,Avengers: Infinity War,8.3,13948,True,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
17382,271110,en,Captain America: Civil War,39.137,2016-05-06,Captain America: Civil War,7.4,14000,True,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
14193,135397,en,Jurassic World,20.709,2015-06-12,Jurassic World,6.6,14056,True,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False


In [25]:
tmdb[tmdb['Western']].sort_values(by = ['release_date'])

Unnamed: 0,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
525,71244,en,Monte Walsh,3.728,1970-10-07,Monte Walsh,6.3,20,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
2434,79540,en,The Righteous and the Wicked,0.6,2010-01-01,The Righteous and the Wicked,2.0,1,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True
1634,114219,en,Straight to Hell Returns,0.665,2010-01-01,Straight to Hell Returns,2.0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
2202,248412,en,Western X,0.6,2010-01-02,Western X,5.3,2,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
862,29776,en,6 Guns,1.995,2010-03-30,6 Guns,4.7,17,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True
1445,39049,en,Gunfight at La Mesa,0.889,2010-05-18,Gunfight at La Mesa,5.0,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
884,40852,en,American Bandits: Frank and Jesse James,1.96,2010-05-18,American Bandits: Frank and Jesse James,3.5,3,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
1150,175987,en,Red Dead Redemption: The Man from Blackwater,1.383,2010-05-29,Red Dead Redemption: The Man from Blackwater,4.4,4,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
836,101520,en,Reel Injun,2.069,2010-06-18,Reel Injun,7.1,14,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True
53,20533,en,Jonah Hex,13.467,2010-06-18,Jonah Hex,4.6,708,True,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,True,False,True
