# Modules and data

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import os
import pickle
import scraping_functions # Let's also import the script with our scraping functions

Loading the first dataframe (i.e. the one with the API data)

In [2]:
# API data: the first CSV column becomes the row index and every other
# column is kept as a raw 'object' value until the casting step.
df = pd.read_csv(
    "https://raw.githubusercontent.com/JacopoMalatesta/imdb_most_popular_films/main/data/df_api.csv",
    dtype='object',
    index_col=0,
)

Loading the second dataframe (i.e. the first dataframe of scraped data)

In [3]:
# First scraped dataset: the first CSV column becomes the row index and every
# other column is kept as a raw 'object' value.
df2 = pd.read_csv(
    "https://raw.githubusercontent.com/JacopoMalatesta/imdb_most_popular_films/main/data/scraped_df_1.csv",
    dtype='object',
    index_col=0,
)

Loading the third dataframe (i.e. the second dataframe of scraped data)

In [4]:
# Second scraped dataset: the first CSV column becomes the row index and every
# other column is kept as a raw 'object' value.
df3 = pd.read_csv(
    "https://raw.githubusercontent.com/JacopoMalatesta/imdb_most_popular_films/main/data/scraped_df_2.csv",
    dtype='object',
    index_col=0,
)

Let's also load the film IDs

In [8]:
# Unpickle the stored film IDs from the local data folder.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open('data/film_ids.txt', 'rb') as ids_file:
    film_ids = pickle.load(ids_file)

# First dataframe

In [9]:
# Preview the first five rows of the API dataframe.
df.head(n=5)

Unnamed: 0,id,title,release_date,runtime,country,language,genre,studios,budget,revenue
0,tt0111161,The Shawshank Redemption,1994-09-23,142,United States of America,English,Drama;Crime,Castle Rock Entertainment,25000000,28341469
1,tt0468569,The Dark Knight,2008-07-14,152,United Kingdom;United States of America,English;Mandarin,Drama;Action;Crime;Thriller,DC Comics;Legendary Pictures;Syncopy;Isobel Gr...,185000000,1004558444
2,tt1375666,Inception,2010-07-15,148,United Kingdom;United States of America,English;Japanese,Action;Science Fiction;Adventure,Legendary Pictures;Syncopy;Warner Bros. Pictures,160000000,825532764
3,tt0137523,Fight Club,1999-10-15,139,Germany;United States of America,English,Drama,Regency Enterprises;Fox 2000 Pictures;Taurus F...,63000000,100853753
4,tt0109830,Forrest Gump,1994-07-06,142,United States of America,English,Comedy;Drama;Romance,Paramount;The Steve Tisch Company,55000000,677387716


All IDs are unique

In [10]:
# Count the rows whose 'id' already appeared in an earlier row.
df.duplicated(subset="id").sum()

0

We have virtually zero null values

In [11]:
# Share of missing values per column (mean of the boolean null mask).
df.isnull().mean()

id              0.0000
title           0.0000
release_date    0.0000
runtime         0.0000
country         0.0000
language        0.0006
genre           0.0000
studios         0.0016
budget          0.0000
revenue         0.0000
dtype: float64

# Second dataframe

Let's now have a look at the second dataframe. This is the first of two datasets containing scraped data.

In [13]:
# Preview the first five rows of the first scraped dataframe.
df2.head(n=5)

Unnamed: 0,id,director,writer,imdb_rating,imdb_rating_count,metascore,user_review_count,critic_review_count,color,aspect_ratio,last_updated
0,tt0111161,Frank Darabont,Frank Darabont;Stephen King,9.3,2508904,80,9750,190,Color,1.85 : 1,2021-12-22
1,tt0468569,Christopher Nolan,David S. Goyer;Christopher Nolan;Jonathan Nolan,9.0,2458874,84,7767,427,,2.39 : 1,2021-12-22
2,tt1375666,Christopher Nolan,Christopher Nolan,8.8,2205560,74,67,479,Color,1.85 : 1,2021-12-22
3,tt0137523,David Fincher,Chuck Palahniuk;Jim Uhls,8.8,1973247,66,4131,366,Color,2.39 : 1,2021-12-22
4,tt0109830,Robert Zemeckis,Eric Roth;Winston Groom,8.8,1936080,82,2811,164,Color,2.39 : 1,2021-12-22


Same number of rows as the dataframe of API data

In [14]:
# True when both dataframes have the same number of rows.
len(df) == len(df2)

True

There's one duplicate ID.

In [15]:
# Count the rows whose 'id' already appeared in an earlier row
# (missing IDs compare equal to each other here).
df2.duplicated(subset="id").sum()

1

This is because we have two null values in the ID column

In [16]:
# Number of rows with a missing 'id'.
df2["id"].isna().sum()

2

11% of the rows have missing values in the color variable, 7% in the metascore variable

In [17]:
# Share of missing values per column, largest first.
df2.isnull().mean().sort_values(ascending=False)

color                  0.1096
metascore              0.0708
aspect_ratio           0.0348
user_review_count      0.0044
writer                 0.0038
critic_review_count    0.0018
imdb_rating_count      0.0014
imdb_rating            0.0014
director               0.0014
id                     0.0004
last_updated           0.0000
dtype: float64

Are these missing values the result of some mistake we made while scraping or are they actually missing on IMDB?

The color information is missing from the IMDB pages of these films. So the NAs in this Series are not due to mistakes in the scraping process.

In [18]:
# Rows with no value in 'color'.
df2.loc[df2["color"].isna()]

Unnamed: 0,id,director,writer,imdb_rating,imdb_rating_count,metascore,user_review_count,critic_review_count,color,aspect_ratio,last_updated
1,tt0468569,Christopher Nolan,David S. Goyer;Christopher Nolan;Jonathan Nolan,9.0,2458874,84,7767,427,,2.39 : 1,2021-12-22
9,tt0068646,Francis Ford Coppola,Francis Ford Coppola;Mario Puzo,9.2,1728848,100,4727,249,,1.85 : 1,2021-12-22
10,tt0816692,Christopher Nolan,Christopher Nolan;Jonathan Nolan,8.6,1656264,74,4840,626,,2.39 : 1,2021-12-22
21,tt0076759,George Lucas,George Lucas,8.6,1291693,90,2010,202,,,2021-12-22
44,tt0103064,James Cameron,William Wisher;James Cameron,8.5,1045418,75,1421,302,,2.39 : 1,2021-12-22
...,...,...,...,...,...,...,...,...,...,...,...
4960,tt1361336,Tim Story,Joseph Barbera;William Hanna;Kevin Costello,5.3,31161,32,669,144,,1.85 : 1,2021-12-22
4964,tt0068638,Sam Peckinpah,Walter Hill;Jim Thompson,7.4,31115,55,319,84,,2.35 : 1,2021-12-22
4974,tt0061655,Roman Polanski,Roman Polanski;Gérard Brach,7.2,31032,56,170,100,,2.35 : 1,2021-12-22
4981,tt0063518,Franco Zeffirelli,Franco Brusati;Masolino D'Amico;William Shakes...,7.6,30962,69,230,38,,1.85 : 1,2021-12-22


Same thing for the 'metascore' Series: the Metascore of the films with missing values in this column is also missing on IMDB.

In [19]:
# Rows with no value in 'metascore'.
df2.loc[df2["metascore"].isna()]

Unnamed: 0,id,director,writer,imdb_rating,imdb_rating_count,metascore,user_review_count,critic_review_count,color,aspect_ratio,last_updated
622,tt1028528,Quentin Tarantino,Quentin Tarantino,7.0,281646,,742,229,Color,2.35 : 1,2021-12-22
648,tt1028532,Lasse Hallström,Kaneto Shindô;Stephen P. Lindsey,8.1,270182,,568,95,Color,1.85 : 1,2021-12-22
664,tt0061722,,,,,,,,,,2021-12-22
908,tt0032553,Charles Chaplin,Charles Chaplin,8.4,215483,,305,118,Black and White,1.37 : 1,2021-12-22
919,tt0043014,Billy Wilder,Charles Brackett;Billy Wilder;D.M. Marshman Jr.,8.4,213558,,668,190,Black and White,1.37 : 1,2021-12-22
...,...,...,...,...,...,...,...,...,...,...,...
4947,tt0038559,Charles Vidor,E.A. Ellington;Marion Parsonnet;Jo Eisinger,7.6,31279,,187,103,Black and White,1.37 : 1,2021-12-22
4956,tt2359810,Aanand L. Rai,Himanshu Sharma,7.6,31194,,159,22,Color,,2021-12-22
4985,tt2101569,Brin Hill,Joss Whedon,7.0,30923,,81,34,Color,1.33 : 1,2021-12-22
4994,tt3767372,Shoojit Sircar,Juhi Chaturvedi,7.6,30877,,132,37,Color,2.35 : 1,2021-12-22


Same thing for the 'aspect_ratio' Series

In [20]:
# Rows with no value in 'aspect_ratio'.
df2.loc[df2['aspect_ratio'].isna()]

Unnamed: 0,id,director,writer,imdb_rating,imdb_rating_count,metascore,user_review_count,critic_review_count,color,aspect_ratio,last_updated
21,tt0076759,George Lucas,George Lucas,8.6,1291693,90,2010,202,,,2021-12-22
23,tt0108052,Steven Spielberg,Steven Zaillian;Thomas Keneally,8.9,1283317,94,2053,167,Black and White,,2021-12-22
27,tt0080684,Irvin Kershner,Lawrence Kasdan;George Lucas;Leigh Brackett,8.7,1219988,82,1360,233,Color,,2021-12-22
32,tt0088763,Robert Zemeckis,Bob Gale;Robert Zemeckis,8.5,1129288,87,1362,257,Color,,2021-12-22
57,tt0081505,Stanley Kubrick,Diane Johnson;Stanley Kubrick;Stephen King,8.4,959512,66,2062,350,Color,,2021-12-22
...,...,...,...,...,...,...,...,...,...,...,...
4858,tt1024715,Clark Gregg,Clark Gregg;Chuck Palahniuk,6.4,32284,47,82,148,,,2021-12-22
4896,tt7725596,Amit Ravindernath Sharma,Jyoti Kapoor;Akshat Ghildial;Shantanu Srivastava,8.0,31913,,247,21,Color,,2021-12-22
4911,tt6485666,Atlee Kumar,Vijayendra Prasad;Ramanagirivasan;Atlee Kumar,7.8,31740,,28,13,Color,,2021-12-22
4920,tt1620719,Abhinav Kashyap,Abhinav Kashyap;Dilip Shukla,6.2,31659,,156,37,Color,,2021-12-22


## Filling missing values

### Missing films

For some reason we couldn't scrape any info about seven films.

In [21]:
# Rows with no value in 'director'.
df2.loc[df2['director'].isna()]

Unnamed: 0,id,director,writer,imdb_rating,imdb_rating_count,metascore,user_review_count,critic_review_count,color,aspect_ratio,last_updated
664,tt0061722,,,,,,,,,,2021-12-22
991,,,,,,,,,,,2021-12-22
1057,tt0099810,,,,,,,,,,2021-12-22
1376,tt0102138,,,,,,,,,,2021-12-22
3162,,,,,,,,,,,2021-12-22
3172,tt5461944,,,,,,,,,,2021-12-22
4062,tt8368512,,,,,,,,,,2021-12-22


Let's extract the film IDs of the seven missing films into a list

In [22]:
# IDs (as a plain list, in row order) of the rows that have no director.
missing_film_ids = df2.loc[df2['director'].isnull(), 'id'].tolist()

Let's fill the two missing IDs

In [23]:
# (position in missing_film_ids, position in film_ids) for the two entries
# whose ID is missing.
for list_pos, film_pos in ((1, 991), (4, 3162)):
    missing_film_ids[list_pos] = film_ids[film_pos]

Let's fill them in the dataframe as well

In [24]:
# Write the ID stored in film_ids into the df2 rows labelled 991 and 3162.
for row_label in (991, 3162):
    df2.loc[row_label, 'id'] = film_ids[row_label]

Let's scrape those pieces of data again 

In [25]:
# Re-scrape the IMDB page of every film in missing_film_ids and write the
# scraped values into the matching df2 row(s).

# df2 column -> scraping_functions helper that extracts it from the parsed page.
field_scrapers = {
    'director': scraping_functions.scrape_director,
    'writer': scraping_functions.scrape_writer,
    'imdb_rating': scraping_functions.scrape_imdb_rating,
    'imdb_rating_count': scraping_functions.scrape_rating_count,
    'metascore': scraping_functions.scrape_metascore,
    'user_review_count': scraping_functions.scrape_user_review_count,
    'critic_review_count': scraping_functions.scrape_critic_review_count,
    'color': scraping_functions.scrape_color,
    'aspect_ratio': scraping_functions.scrape_aspect_ratio,
}

for film_id in missing_film_ids:

    # A timeout keeps one unresponsive request from hanging the notebook forever.
    response = requests.get(f"https://www.imdb.com/title/{film_id}", timeout=30)
    if not response.ok:
        # Leave the row untouched instead of parsing an HTTP error page.
        print(f"Skipping {film_id}: HTTP {response.status_code}")
        continue

    # NOTE: no parser is named, so bs4 picks the best one installed; left as-is
    # so the scraping helpers receive the same kind of tree as before.
    soup = BeautifulSoup(response.content)

    # Build the row mask once per film rather than once per column.
    film_rows = df2['id'] == film_id
    for column, scrape in field_scrapers.items():
        df2.loc[film_rows, column] = scrape(soup)

We successfully scraped the data for all films.

In [26]:
# Rows whose ID is one of the re-scraped films.
df2.loc[df2['id'].isin(missing_film_ids)]

Unnamed: 0,id,director,writer,imdb_rating,imdb_rating_count,metascore,user_review_count,critic_review_count,color,aspect_ratio,last_updated
664,tt0061722,Mike Nichols,Buck Henry;Charles Webb;Calder Willingham,8.0,265496,83,820,189,Color,1.78 : 1,2021-12-22
991,tt0079817,Sylvester Stallone,Sylvester Stallone,7.3,201537,61,245,93,,,2021-12-22
1057,tt0099810,John McTiernan,Tom Clancy;Donald E. Stewart;Larry Ferguson,7.6,190974,58,329,116,,2.35 : 1,2021-12-22
1376,tt0102138,Oliver Stone,Jim Marrs;Oliver Stone;Jim Garrison,8.0,150318,72,529,89,Color,2.39 : 1,2021-12-22
3162,tt0060176,Michelangelo Antonioni,Julio Cortázar;Tonino Guerra;Michelangelo Anto...,7.6,59866,82,311,146,,1.85 : 1,2021-12-22
3172,tt5461944,Anthony Maras,Anthony Maras;John Collee,7.6,59630,62,444,168,Color,2.35 : 1,2021-12-22
4062,tt8368512,Dominic Cooke,Tom O'Connor,7.2,42483,65,299,149,Color,2.39 : 1,2021-12-22


### Missing user review counts

In [28]:
# Rows with no value in 'user_review_count'.
df2.loc[df2['user_review_count'].isna()]

Unnamed: 0,id,director,writer,imdb_rating,imdb_rating_count,metascore,user_review_count,critic_review_count,color,aspect_ratio,last_updated
84,tt1454468,Alfonso Cuarón,Jonás Cuarón;Alfonso Cuarón,7.7,798771,96.0,,783,,2.39 : 1,2021-12-22
86,tt0458339,Joe Johnston,Joe Simon;Stephen McFeely;Christopher Markus,6.9,795242,66.0,,562,,2.39 : 1,2021-12-22
93,tt2582802,Damien Chazelle,Damien Chazelle,8.5,778287,88.0,,583,,2.39 : 1,2021-12-22
109,tt0083658,Ridley Scott,Philip K. Dick;David Webb Peoples;Hampton Fancher,8.1,732620,84.0,,274,,2.39 : 1,2021-12-22
110,tt0947798,Darren Aronofsky,John J. McLaughlin;Mark Heyman;Andres Heinz,8.0,731790,79.0,,687,,2.35 : 1,2021-12-22
111,tt0211915,Jean-Pierre Jeunet,Guillaume Laurant;Jean-Pierre Jeunet,8.3,731124,69.0,,147,,2.39 : 1,2021-12-22
1180,tt0061852,Wolfgang Reitherman,Ken Anderson;Ralph Wright;Larry Clemmons,7.6,174666,65.0,,118,Color,,2021-12-22
1679,tt0063350,George A. Romero,George A. Romero;John A. Russo,7.9,123932,89.0,,232,Black and White,,2021-12-22
1848,tt1622979,Steven Quale,Jeffrey Reddick;Eric Heisserer,5.8,112379,50.0,,279,Color,,2021-12-22
1882,tt10295212,Vishnuvardhan,Sandeep Shrivastava,8.7,109992,,,25,Color,,2021-12-22


Let's extract the IDs of these films in a Pandas series

In [29]:
# 'id' values (as a Series) of the rows that have no user review count.
missing_user_review_ids = df2["id"].loc[df2["user_review_count"].isna()]

Let's scrape the user review count again

In [30]:
# Re-scrape the user review count for every film in missing_user_review_ids
# and write it into the matching df2 row(s).
for film_id in missing_user_review_ids:
    # A timeout keeps one unresponsive request from hanging the notebook forever.
    response = requests.get(f"https://www.imdb.com/title/{film_id}", timeout=30)
    if not response.ok:
        # Leave the value missing instead of parsing an HTTP error page.
        print(f"Skipping {film_id}: HTTP {response.status_code}")
        continue

    # NOTE: no parser is named, so bs4 picks the best one installed; left as-is
    # so the scraping helper receives the same kind of tree as before.
    soup = BeautifulSoup(response.content)

    df2.loc[df2['id'] == film_id, 'user_review_count'] = scraping_functions.scrape_user_review_count(soup)

We managed to retrieve the user review count for all those films.

In [31]:
# Rows whose ID is one of the films whose user review count was re-scraped.
df2.loc[df2["id"].isin(missing_user_review_ids)]

Unnamed: 0,id,director,writer,imdb_rating,imdb_rating_count,metascore,user_review_count,critic_review_count,color,aspect_ratio,last_updated
84,tt1454468,Alfonso Cuarón,Jonás Cuarón;Alfonso Cuarón,7.7,798771,96.0,2206,783,,2.39 : 1,2021-12-22
86,tt0458339,Joe Johnston,Joe Simon;Stephen McFeely;Christopher Markus,6.9,795242,66.0,1006,562,,2.39 : 1,2021-12-22
93,tt2582802,Damien Chazelle,Damien Chazelle,8.5,778287,88.0,1429,583,,2.39 : 1,2021-12-22
109,tt0083658,Ridley Scott,Philip K. Dick;David Webb Peoples;Hampton Fancher,8.1,732620,84.0,1675,274,,2.39 : 1,2021-12-22
110,tt0947798,Darren Aronofsky,John J. McLaughlin;Mark Heyman;Andres Heinz,8.0,731790,79.0,1379,687,,2.35 : 1,2021-12-22
111,tt0211915,Jean-Pierre Jeunet,Guillaume Laurant;Jean-Pierre Jeunet,8.3,731124,69.0,1549,147,,2.39 : 1,2021-12-22
1180,tt0061852,Wolfgang Reitherman,Ken Anderson;Ralph Wright;Larry Clemmons,7.6,174666,65.0,3555,118,Color,,2021-12-22
1679,tt0063350,George A. Romero,George A. Romero;John A. Russo,7.9,123932,89.0,710,232,Black and White,,2021-12-22
1848,tt1622979,Steven Quale,Jeffrey Reddick;Eric Heisserer,5.8,112379,50.0,192,279,Color,,2021-12-22
1882,tt10295212,Vishnuvardhan,Sandeep Shrivastava,8.7,109992,,3089,25,Color,,2021-12-22


These are the % of null values now

In [33]:
# Share of missing values per column after the re-scrape, largest first.
df2.isnull().mean().sort_values(ascending=False)

color                  0.1088
metascore              0.0694
aspect_ratio           0.0336
writer                 0.0024
critic_review_count    0.0004
last_updated           0.0000
user_review_count      0.0000
imdb_rating_count      0.0000
imdb_rating            0.0000
director               0.0000
id                     0.0000
dtype: float64

# Third dataframe

In [32]:
# Share of missing values per column in the third dataframe, largest first.
df3.isnull().mean().sort_values(ascending=False)

art_director           0.1236
costume_designer       0.1098
production_designer    0.0912
composer               0.0438
cinematographer        0.0278
editor                 0.0046
producers              0.0014
actors                 0.0000
id                     0.0000
dtype: float64

No duplicate IDs

In [34]:
# Duplicate-ID check for the third dataframe (df3), which this section covers;
# the cell previously inspected df, so df3's IDs were never verified.
df3['id'].duplicated().sum()

0

Films with missing 'cinematographer' are mostly animated films (which do not have a cinematographer)

In [35]:
# Rows with no value in 'cinematographer'.
df3.loc[df3['cinematographer'].isna()]

Unnamed: 0,id,actors,cinematographer,editor,composer,producers,production_designer,art_director,costume_designer
50,tt0110357,Rowan Atkinson;Matthew Broderick;Niketa Calame...,,Ivan Bilancio,Hans Zimmer,Alice Dewey Goldstone;Don Hahn;Sarah McArthur;...,Chris Sanders,Andy Gaskill,
52,tt1049413,Ed Asner;Christopher Plummer;Jordan Nagai;Bob ...,,Kevin Nolting,Michael Giacchino,John Lasseter;Kori Rae;Denise Ream;Jonas River...,Ricky Nierva,Daniel Lopez Muñoz,
62,tt0114709,Tom Hanks;Tim Allen;Don Rickles;Jim Varney;Wal...,,Robert Gordon;Lee Unkrich,Randy Newman,Bonnie Arnold;Ed Catmull;Ralph Guggenheim;Stev...,,Ralph Eggleston,
72,tt0198781,John Goodman;Billy Crystal;Mary Gibbs;Steve Bu...,,Jim Stewart,Randy Newman,Darla K. Anderson;Karen Dufilho-Rosen;John Las...,Harley Jessup;Bob Pauley,Tia W. Kratter;Dominique Louis,
87,tt0435761,Tom Hanks;Tim Allen;Joan Cusack;Ned Beatty;Don...,,Ken Schretzmann,Randy Newman,Darla K. Anderson;John Lasseter;Nicole Paradis...,Bob Pauley,Daniel Arriaga;Robert Kondo;Daisuke 'Dice' Tsu...,
...,...,...,...,...,...,...,...,...,...
4676,tt0485601,Evan McGuire;Christen Mooney;Brendan Gleeson;M...,,Fabienne Alvarez-Giro,Bruno Coulais,Didier Brunner;James Flynn;Tomm Moore;Ivan Rou...,,Ross Stewart,
4723,tt0082509,Don Francks;Caroline Semple;Richard Romanus;Su...,,Ian Llande;Mick Manning;Gerald Tripp,Elmer Bernstein,Vic Atkinson;John Coates;Michael C. Gross;John...,Michael C. Gross,Pat Gavin,
4729,tt3666024,Emmanuel Garijo;Tom Hudson;Baptiste Goy;Axel D...,,Céline Kélépikis,Laurent Perez Del Mar,Rémi Burah;Pascal Caucheteux;Christophe Jankov...,Michael Dudok de Wit,,
4763,tt0066473,Martin Balsam;Sô Yamamura;Jason Robards;Joseph...,,Pembroke J. Herring;Shinya Inoue;James E. Newcom,Jerry Goldsmith,Richard Fleischer;Keinosuke Kubo;Otto Lang;Mas...,,Richard Day;Taizô Kawashima;Yoshirô Muraki;Jac...,


# Joining the dataframes

Let's join the second dataframe to the first one by using the film id as our key. They have exactly the same IDs, so a left, inner or right join would all produce the same result.

In [36]:
# Inner join of the API data with the first scraped dataset on the film ID.
# validate='one_to_one' makes pandas raise a MergeError if either side holds a
# repeated ID, instead of silently multiplying rows in the result.
cleaned_df = pd.merge(left=df, right=df2, how='inner', on='id', validate='one_to_one')

Let's now join the third dataframe to the other two.

In [37]:
# Inner join of the merged frame with the second scraped dataset on the film ID.
# validate='one_to_one' makes pandas raise a MergeError if either side holds a
# repeated ID, instead of silently multiplying rows in the result.
cleaned_df = pd.merge(left=cleaned_df, right=df3, how='inner', on='id', validate='one_to_one')

Let's have a look at the output

In [40]:
# Preview the first five rows of the merged dataframe.
cleaned_df.head(n=5)

Unnamed: 0,id,title,release_date,runtime,country,language,genre,studios,budget,revenue,...,aspect_ratio,last_updated,actors,cinematographer,editor,composer,producers,production_designer,art_director,costume_designer
0,tt0111161,The Shawshank Redemption,1994-09-23,142,United States of America,English,Drama;Crime,Castle Rock Entertainment,25000000,28341469,...,1.85 : 1,2021-12-22,Tim Robbins;Morgan Freeman;Bob Gunton;William ...,Roger Deakins,Richard Francis-Bruce,Thomas Newman,Liz Glotzer;David V. Lester;Niki Marvin,Terence Marsh;Soheil,Peter Landsdown Smith,Elizabeth McBride
1,tt0468569,The Dark Knight,2008-07-14,152,United Kingdom;United States of America,English;Mandarin,Drama;Action;Crime;Thriller,DC Comics;Legendary Pictures;Syncopy;Isobel Gr...,185000000,1004558444,...,2.39 : 1,2021-12-22,Christian Bale;Heath Ledger;Aaron Eckhart;Mich...,Wally Pfister,Lee Smith,James Newton Howard;Hans Zimmer,Kevin de la Noy;Jordan Goldberg;Philip Lee;Ben...,Nathan Crowley,Mark Bartholomew;James Hambidge;Craig Jackson;...,Lindy Hemming
2,tt1375666,Inception,2010-07-15,148,United Kingdom;United States of America,English;Japanese,Action;Science Fiction;Adventure,Legendary Pictures;Syncopy;Warner Bros. Pictures,160000000,825532764,...,1.85 : 1,2021-12-22,Leonardo DiCaprio;Joseph Gordon-Levitt;Elliot ...,Wally Pfister,Lee Smith,Hans Zimmer,Zakaria Alaoui;John Bernard;Chris Brigham;Jord...,Guy Hendrix Dyas,Luke Freeborn;Matthew Gray;Brad Ricker;Dean Wo...,Jeffrey Kurland
3,tt0137523,Fight Club,1999-10-15,139,Germany;United States of America,English,Drama,Regency Enterprises;Fox 2000 Pictures;Taurus F...,63000000,100853753,...,2.39 : 1,2021-12-22,Edward Norton;Brad Pitt;Meat Loaf;Zach Grenier...,Jeff Cronenweth,James Haygood,Dust Brothers;John King;Michael Simpson,Ross Grayson Bell;Ceán Chaffin;John S. Dorsey;...,Alex McDowell,Melique Berger;Chris Gorak,Michael Kaplan
4,tt0109830,Forrest Gump,1994-07-06,142,United States of America,English,Comedy;Drama;Romance,Paramount;The Steve Tisch Company,55000000,677387716,...,2.39 : 1,2021-12-22,Tom Hanks;Rebecca Williams;Sally Field;Michael...,Don Burgess,Arthur Schmidt,Alan Silvestri,Wendy Finerman;Charles Newirth;Steve Starkey;S...,Rick Carter,Leslie McDonald;William James Teegarden,Joanna Johnston


We ended up with the right number of rows

In [41]:
# (number of rows, number of columns) of the merged dataframe.
cleaned_df.shape

(5000, 28)

In [51]:
# Column labels of the merged dataframe, in their current order.
cleaned_df.columns

Index(['id', 'title', 'release_date', 'runtime', 'country', 'language',
       'genre', 'studios', 'budget', 'revenue', 'director', 'writer',
       'imdb_rating', 'imdb_rating_count', 'metascore', 'user_review_count',
       'critic_review_count', 'color', 'aspect_ratio', 'last_updated',
       'actors', 'cinematographer', 'editor', 'composer', 'producers',
       'production_designer', 'art_director', 'costume_designer'],
      dtype='object')

In [53]:
# Put the columns into their final order, grouped by theme.
cleaned_df = cleaned_df.reindex(
    columns=[
        # film identity and release info
        'id', 'title', 'release_date', 'runtime',
        'country', 'language', 'genre', 'studios',
        # technical specs and money
        'color', 'aspect_ratio', 'budget', 'revenue',
        # ratings and reviews
        'imdb_rating', 'imdb_rating_count', 'metascore',
        'user_review_count', 'critic_review_count',
        # cast and crew
        'director', 'writer', 'actors', 'cinematographer', 'editor', 'composer',
        'production_designer', 'art_director', 'costume_designer', 'producers',
        # update date
        'last_updated',
    ]
)

# Casting

Right now all the columns are of the 'object' type. We need to cast them to the correct dtype.

In [42]:
# Per-column non-null counts and dtypes before any casting.
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   5000 non-null   object
 1   title                5000 non-null   object
 2   release_date         5000 non-null   object
 3   runtime              5000 non-null   object
 4   country              5000 non-null   object
 5   language             4997 non-null   object
 6   genre                5000 non-null   object
 7   studios              4992 non-null   object
 8   budget               5000 non-null   object
 9   revenue              5000 non-null   object
 10  director             5000 non-null   object
 11  writer               4988 non-null   object
 12  imdb_rating          5000 non-null   object
 13  imdb_rating_count    5000 non-null   object
 14  metascore            4653 non-null   object
 15  user_review_count    5000 non-null   object
 16  critic

Let's first cast the following columns to integers. The ones with missing values will actually be cast to floats.

In [43]:
# Columns that hold whole-number values.
integer_cols = [
    'runtime',
    'budget',
    'revenue',
    'imdb_rating_count',
    'metascore',
    'user_review_count',
    'critic_review_count',
]

# Parse each column with pd.to_numeric, asking for the smallest integer dtype
# that fits (downcast='integer').
cleaned_df[integer_cols] = cleaned_df[integer_cols].apply(pd.to_numeric, downcast='integer')

Let's cast the release date and the last-updated date to datetime variables

In [48]:
# Parse both date columns with pd.to_datetime, one column at a time.
date_cols = ['release_date', 'last_updated']
cleaned_df[date_cols] = cleaned_df[date_cols].apply(pd.to_datetime)

Let's cast the IMDB avg rating to float

In [45]:
# Parse the rating strings into numbers.
cleaned_df['imdb_rating'] = cleaned_df['imdb_rating'].pipe(pd.to_numeric)

Everything looks fine

In [50]:
# Per-column non-null counts and dtypes after casting.
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   id                   5000 non-null   object        
 1   title                5000 non-null   object        
 2   release_date         5000 non-null   datetime64[ns]
 3   runtime              5000 non-null   int16         
 4   country              5000 non-null   object        
 5   language             4997 non-null   object        
 6   genre                5000 non-null   object        
 7   studios              4992 non-null   object        
 8   budget               5000 non-null   int32         
 9   revenue              5000 non-null   int64         
 10  director             5000 non-null   object        
 11  writer               4988 non-null   object        
 12  imdb_rating          5000 non-null   float64       
 13  imdb_rating_count    5000 non-nul

# Saving the cleaned dataframes

In [55]:
# Write the cleaned dataframe to disk; the row index is written as the first
# CSV column (to_csv default).
cleaned_df.to_csv(path_or_buf="data/cleaned_df.csv")