# Modules and data

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import os

Loading the first dataframe (i.e. the one with the API data)

In [2]:
# API data: the first CSV column becomes the index and every column is read with object dtype
df = pd.read_csv(
    "https://raw.githubusercontent.com/JacopoMalatesta/imdb_most_popular_films/main/data/df_api.csv",
    dtype='object',
    index_col=0,
)

Loading the second dataframe (i.e. the first dataframe of scraped data)

In [3]:
# First scraped dataset: the first CSV column becomes the index and every column is read with object dtype
df2 = pd.read_csv(
    "https://raw.githubusercontent.com/JacopoMalatesta/imdb_most_popular_films/main/data/scraped_df_1.csv",
    dtype='object',
    index_col=0,
)

Loading the third dataframe (i.e. the second dataframe of scraped data)

In [4]:
# Second scraped dataset: the first CSV column becomes the index and every column is read with object dtype
df3 = pd.read_csv(
    "https://raw.githubusercontent.com/JacopoMalatesta/imdb_most_popular_films/main/data/scraped_df_2.csv",
    dtype='object',
    index_col=0,
)

# First dataframe

In [5]:
# Preview the first five rows of the API dataframe
df.head(n=5)

Unnamed: 0,id,title,release_date,runtime,country,language,genre,studios,budget,revenue
0,tt0111161,The Shawshank Redemption,1994-09-23,142,United States of America,English,Drama;Crime,Castle Rock Entertainment,25000000,28341469
1,tt0468569,The Dark Knight,2008-07-14,152,United Kingdom;United States of America,English;Mandarin,Drama;Action;Crime;Thriller,DC Comics;Legendary Pictures;Syncopy;Isobel Gr...,185000000,1004558444
2,tt1375666,Inception,2010-07-15,148,United Kingdom;United States of America,English;Japanese,Action;Science Fiction;Adventure,Legendary Pictures;Syncopy;Warner Bros. Pictures,160000000,825532764
3,tt0137523,Fight Club,1999-10-15,139,Germany;United States of America,English,Drama,Regency Enterprises;Fox 2000 Pictures;Taurus F...,63000000,100853753
4,tt0109830,Forrest Gump,1994-07-06,142,United States of America,English,Comedy;Drama;Romance,Paramount;The Steve Tisch Company,55000000,677387716


All IDs are unique

In [6]:
# Number of rows whose id repeats an earlier row's id
df.duplicated(subset="id").sum()

0

We have virtually zero null values

In [7]:
# Fraction of null values per column: the mean of the boolean null mask
# equals (number of nulls) / (number of rows), without a per-column lambda.
df.isna().mean()

id              0.0000
title           0.0000
release_date    0.0000
runtime         0.0000
country         0.0000
language        0.0006
genre           0.0000
studios         0.0016
budget          0.0000
revenue         0.0000
dtype: float64

# Second dataframe

Let's now have a look at the second dataframe. This is the first of two datasets containing scraped data.

In [8]:
# Preview the first five rows of the first scraped dataframe
df2.head(n=5)

Unnamed: 0,id,director,writer,imdb_rating,imdb_rating_count,metascore,user_review_count,critic_review_count,color,aspect_ratio
0,tt0111161,Frank Darabont,Stephen King;Frank Darabont,9.3,2506833,80,9750,190,Color,1.85 : 1
1,tt0468569,Christopher Nolan,David S. Goyer;Jonathan Nolan;Christopher Nolan,9.0,2456425,84,7764,427,,2.39 : 1
2,tt1375666,Christopher Nolan,Christopher Nolan,8.8,2203914,74,4466,479,Color,2.39 : 1
3,tt0137523,David Fincher,Jim Uhls;Chuck Palahniuk,8.8,1971788,66,4127,366,Color,2.39 : 1
4,tt0109830,Robert Zemeckis,Winston Groom;Eric Roth,8.8,1934719,82,2807,164,Color,2.39 : 1


Same number of rows as the dataframe of API data

In [9]:
# Both dataframes should have the same number of rows
len(df) == len(df2)

True

No duplicated IDs

In [10]:
# Number of rows whose id repeats an earlier row's id
df2.duplicated(subset="id").sum()

0

12% of the rows have missing values in the color variable, 7% in the metascore variable

In [11]:
# Fraction of null values per column, largest first: the mean of the boolean
# null mask equals (number of nulls) / (number of rows), without a per-column lambda.
df2.isna().mean().sort_values(ascending=False)

color                  0.1182
metascore              0.0698
aspect_ratio           0.0334
writer                 0.0030
user_review_count      0.0020
critic_review_count    0.0010
imdb_rating_count      0.0006
imdb_rating            0.0006
director               0.0006
id                     0.0000
dtype: float64

Are these missing values the result of some mistake we made while scraping, or are they actually missing on the IMDb website?

If we open the pages of the films with missing values in the 'color' variable we can see that the color information is missing. So the missing values in this Series are not due to mistakes in the scraping process.

In [12]:
# Rows with no value in the 'color' column
df2.loc[df2["color"].isna()]

Unnamed: 0,id,director,writer,imdb_rating,imdb_rating_count,metascore,user_review_count,critic_review_count,color,aspect_ratio
1,tt0468569,Christopher Nolan,David S. Goyer;Jonathan Nolan;Christopher Nolan,9.0,2456425,84,7764,427,,2.39 : 1
9,tt0068646,Francis Ford Coppola,Francis Ford Coppola;Mario Puzo,9.2,1727603,100,4728,249,,1.85 : 1
10,tt0816692,Christopher Nolan,Jonathan Nolan;Christopher Nolan,8.6,1654589,74,4838,626,,2.39 : 1
21,tt0076759,George Lucas,George Lucas,8.6,1290992,90,2011,202,,
45,tt0114814,Bryan Singer,Christopher McQuarrie,8.5,1039305,77,1447,147,,2.39 : 1
...,...,...,...,...,...,...,...,...,...,...
4957,tt0033045,Ernst Lubitsch,Samson Raphaelson;Ben Hecht;Miklós László,8.1,31144,96,173,82,,1.37 : 1
4963,tt0068638,Sam Peckinpah,Jim Thompson;Walter Hill,7.4,31105,55,162,84,,2.35 : 1
4964,tt1361336,Tim Story,Joseph Barbera;Kevin Costello;William Hanna,5.3,31093,32,669,144,,1.85 : 1
4981,tt0063518,Franco Zeffirelli,Masolino D'Amico;Franco Brusati;William Shakes...,7.6,30939,69,230,38,,1.85 : 1


Same thing for the 'metascore' Series: the Metascore of the films with missing values in this column is also missing on IMDB.

In [13]:
# Rows with no value in the 'metascore' column
df2.loc[df2["metascore"].isna()]

Unnamed: 0,id,director,writer,imdb_rating,imdb_rating_count,metascore,user_review_count,critic_review_count,color,aspect_ratio
559,tt0448115,,,,,,,,,
622,tt1028528,Quentin Tarantino,Quentin Tarantino,7.0,281554,,742,229,Color,2.35 : 1
648,tt1028532,Lasse Hallström,Kaneto Shindô;Stephen P. Lindsey,8.1,270004,,568,95,Color,1.85 : 1
906,tt0032553,Charles Chaplin,Charles Chaplin,8.4,215387,,305,118,,1.37 : 1
918,tt0043014,Billy Wilder,Charles Brackett;D.M. Marshman Jr.;Billy Wilder,8.4,213461,,668,190,,1.37 : 1
...,...,...,...,...,...,...,...,...,...,...
4947,tt0038559,Charles Vidor,Marion Parsonnet;E.A. Ellington;Jo Eisinger,7.6,31257,,187,103,Color,1.37 : 1
4955,tt2359810,Aanand L. Rai,Himanshu Sharma,7.6,31175,,159,22,Color,
4983,tt2101569,Brin Hill,Joss Whedon,7.0,30920,,81,34,Color,
4991,tt3767372,Shoojit Sircar,Juhi Chaturvedi,7.6,30869,,132,37,Color,2.35 : 1


Same thing for the 'aspect_ratio' Series

In [14]:
# Rows with no value in the 'aspect_ratio' column
df2.loc[df2['aspect_ratio'].isna()]

Unnamed: 0,id,director,writer,imdb_rating,imdb_rating_count,metascore,user_review_count,critic_review_count,color,aspect_ratio
21,tt0076759,George Lucas,George Lucas,8.6,1290992,90,2011,202,,
23,tt0108052,Steven Spielberg,Thomas Keneally;Steven Zaillian,8.9,1282467,94,2052,167,Color,
27,tt0080684,Irvin Kershner,Lawrence Kasdan;George Lucas;Leigh Brackett,8.7,1219238,82,12,233,Color,
32,tt0088763,Robert Zemeckis,Robert Zemeckis;Bob Gale,8.5,1128514,87,1361,256,Color,
49,tt0086190,Richard Marquand,Lawrence Kasdan;George Lucas,8.3,997245,58,920,206,Color,
...,...,...,...,...,...,...,...,...,...,...
4901,tt0827503,Cem Yilmaz;Ali Taner Baltaci,Cem Yilmaz,7.4,31833,,,5,,
4910,tt6485666,Atlee Kumar,Vijayendra Prasad;Atlee Kumar;Ramanagirivasan,7.8,31710,,120,13,Color,
4955,tt2359810,Aanand L. Rai,Himanshu Sharma,7.6,31175,,159,22,Color,
4972,tt1772288,Dan Fogelman,Dan Fogelman,7.0,31058,58,125,151,Color,


## Filling missing values

### Missing films

For some reason we couldn't scrape any info about three films.

In [15]:
# Rows with no value in the 'director' column
df2.loc[df2['director'].isna()]

Unnamed: 0,id,director,writer,imdb_rating,imdb_rating_count,metascore,user_review_count,critic_review_count,color,aspect_ratio
559,tt0448115,,,,,,,,,
4609,tt9354842,,,,,,,,,
4704,tt11045422,,,,,,,,,


Let's extract the film IDs of the three missing films into a Pandas Series

In [16]:
# IDs of the rows with no director, selected with a single .loc lookup
missing_film_ids = df2.loc[df2['director'].isna(), 'id']

Let's recreate the functions we used to scrape the data

In [17]:
# Film ID
def scrape_film_id(soup):
    """Read the film ID from the page's ``imdb:pageConst`` meta tag.

    Parameters
    ----------
    soup : parsed page exposing ``find`` (a BeautifulSoup object in this notebook).

    Returns
    -------
    The ``content`` attribute of ``<meta property="imdb:pageConst">``, or
    ``np.nan`` when the lookup fails (e.g. the tag is not on the page).
    """
    try:
        film_id = soup.find("meta", {"property": "imdb:pageConst"}).get("content")
    except Exception:
        # Best-effort scraping: a failed lookup is recorded as a missing value.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate, so a long scraping run can still be stopped.
        return np.nan
    return film_id
# Directors
def scrape_director(soup):
    """Collect the director names linked on the page.

    Parameters
    ----------
    soup : parsed page exposing ``find_all`` (a BeautifulSoup object in this notebook).

    Returns
    -------
    str or float
        The distinct names joined with ``';'`` in the order they first appear
        on the page, or ``np.nan`` when no name is found or the lookup fails.
    """
    try:
        a_tags = soup.find_all(href = re.compile("tt_ov_dr"), class_="ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link")

        # dict.fromkeys removes repeated names while keeping first-seen order;
        # a set would give an order that can differ from one run to the next.
        unique_names = list(dict.fromkeys(a.text for a in a_tags))

        directors = ';'.join(unique_names)

    except Exception:
        # Best-effort scraping; KeyboardInterrupt / SystemExit still propagate.
        return np.nan

    # find_all returns an empty list when nothing matches: report that as a
    # missing value instead of an empty string.
    return directors if directors else np.nan
# Writers
def scrape_writer(soup):
    """Collect the writer names linked on the page.

    Parameters
    ----------
    soup : parsed page exposing ``find_all`` (a BeautifulSoup object in this notebook).

    Returns
    -------
    str or float
        The distinct names joined with ``';'`` in the order they first appear
        on the page, or ``np.nan`` when no name is found or the lookup fails.
    """
    try:
        a_tags = soup.find_all(href = re.compile("tt_ov_wr"), class_="ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link")

        # dict.fromkeys removes repeated names while keeping first-seen order;
        # a set would give an order that can differ from one run to the next.
        unique_names = list(dict.fromkeys(a.text for a in a_tags))

        writers = ';'.join(unique_names)

    except Exception:
        # Best-effort scraping; KeyboardInterrupt / SystemExit still propagate.
        return np.nan

    # find_all returns an empty list when nothing matches: report that as a
    # missing value instead of an empty string.
    return writers if writers else np.nan
# IMDB average rating
def scrape_imdb_rating(soup):
    """Read the IMDb average rating from the page.

    Parameters
    ----------
    soup : parsed page exposing ``find_all`` (a BeautifulSoup object in this notebook).

    Returns
    -------
    The text of the first matching rating ``<span>``, or ``np.nan`` when no
    span matches or the lookup fails.
    """
    try:
        # NOTE(review): the class name looks build-generated and may change
        # whenever the site is redeployed - confirm it still matches.
        spans = soup.find_all("span", class_="AggregateRatingButton__RatingScore-sc-1ll29m0-1 iTLWoV")

        imdb_rating = spans[0].text

    except Exception:
        # Best-effort scraping (an empty result list raises IndexError above).
        # ``Exception`` rather than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still propagate.
        return np.nan

    return imdb_rating
        
# IMDB rating count
def scrape_rating_count(soup):
    """Read the number of IMDb ratings from the page's JSON-LD script tag.

    Parameters
    ----------
    soup : parsed page exposing ``find`` (a BeautifulSoup object in this notebook).

    Returns
    -------
    str or float
        The characters following ``"ratingCount":`` in the first
        ``<script type="application/ld+json">`` tag, or ``np.nan`` when the
        key is not present or the lookup fails.
    """
    pattern = r'(?<="ratingCount":)[\d.]+'

    try:
        string = str(soup.find("script", {"type": "application/ld+json"}))
    except Exception:
        # Best-effort scraping; KeyboardInterrupt / SystemExit still propagate.
        return np.nan

    # A missing tag stringifies to "None", which simply does not match.
    match = re.search(pattern, string)
    if match is None:
        return np.nan

    return match.group(0)
# Metascore
def scrape_metascore(soup):
    """Read the Metascore from the page.

    Parameters
    ----------
    soup : parsed page exposing ``find`` (a BeautifulSoup object in this notebook).

    Returns
    -------
    The text of the first ``<span class="score-meta">``, or ``np.nan`` when
    the span is absent or the lookup fails.
    """
    try:
        metascore = soup.find("span", class_="score-meta").text

    except Exception:
        # Best-effort scraping: ``find`` yields None when the span is missing,
        # and the attribute access then fails. ``Exception`` rather than a bare
        # ``except`` so that KeyboardInterrupt and SystemExit still propagate.
        return np.nan

    return metascore
# User review count
def scrape_user_review_count(soup):
    """Read the number of user reviews from the page's ``__NEXT_DATA__`` script tag.

    Parameters
    ----------
    soup : parsed page exposing ``find`` (a BeautifulSoup object in this notebook).

    Returns
    -------
    str or float
        The digits of the ``"total"`` value that is immediately followed by
        the ``ReviewsConnection`` / ``criticReviewsTotal`` keys, or ``np.nan``
        when that sequence is not present or the lookup fails.
    """
    pattern = r'(?<="total":)\d+(?=,"__typename":"ReviewsConnection"},"criticReviewsTotal":)'

    try:
        string = str(soup.find("script", {'id': '__NEXT_DATA__'}))
    except Exception:
        # Best-effort scraping; KeyboardInterrupt / SystemExit still propagate.
        return np.nan

    # A missing tag stringifies to "None", which simply does not match.
    match = re.search(pattern, string)
    if match is None:
        return np.nan

    return match.group(0)
# Critic review count
def scrape_critic_review_count(soup):
    """Read the number of critic reviews from the page.

    Parameters
    ----------
    soup : parsed page exposing ``find_all`` (a BeautifulSoup object in this notebook).

    Returns
    -------
    str or float
        The first run of digits in the text of the first ``three-Elements``
        span whose markup contains ``'Critic'``, or ``np.nan`` when there is
        no such span, it holds no digits, or the lookup fails.
    """
    try:
        spans = soup.find_all("span", class_= re.compile("three-Elements"))

        # First span mentioning 'Critic'; None when there is none, so no
        # throw-away filtered list is built just to take its first element.
        critic_span = next((span for span in spans if 'Critic' in str(span)), None)
        if critic_span is None:
            return np.nan

        match = re.search(r'\d+', critic_span.text)

    except Exception:
        # Best-effort scraping; KeyboardInterrupt / SystemExit still propagate.
        return np.nan

    return match.group(0) if match else np.nan
# Color
def scrape_color(soup):
    """Read the film's coloration from the page's JSON script tag.

    Parameters
    ----------
    soup : parsed page exposing ``find`` (a BeautifulSoup object in this notebook).

    Returns
    -------
    str or float
        The ``"text"`` value of the first ``Coloration`` entry matched by the
        pattern below, or ``np.nan`` when nothing matches or the lookup fails.
    """
    # NOTE(review): ``\w+`` cannot match a value containing spaces, and the
    # lookahead only accepts entries with an empty "attributes" list - this may
    # account for some of the missing values; confirm against the page JSON.
    pattern = r'(?<="text":")\w+(?=","attributes":\[\],"__typename":"Coloration"})'

    try:
        string = str(soup.find("script", {"type": "application/json"}))
    except Exception:
        # Best-effort scraping; KeyboardInterrupt / SystemExit still propagate.
        return np.nan

    # A missing tag stringifies to "None", which simply does not match.
    match = re.search(pattern, string)
    if match is None:
        return np.nan

    return match.group(0)
# Aspect ratio
def scrape_aspect_ratio(soup):
    """Read the film's aspect ratio from the page's JSON script tag.

    Parameters
    ----------
    soup : parsed page exposing ``find`` (a BeautifulSoup object in this notebook).

    Returns
    -------
    str or float
        The run of digits, dots, whitespace and colons following the first
        ``"aspectRatio":"`` in the first ``<script type="application/json">``
        tag, or ``np.nan`` when the key is not present or the lookup fails.
    """
    pattern = r'(?<="aspectRatio":")[\d.\s:]+'

    try:
        string = str(soup.find("script", {"type": "application/json"}))
    except Exception:
        # Best-effort scraping; KeyboardInterrupt / SystemExit still propagate.
        return np.nan

    # A missing tag stringifies to "None", which simply does not match.
    match = re.search(pattern, string)
    if match is None:
        return np.nan

    return match.group(0)


Let's scrape those pieces of data again 

In [18]:
# df2 column -> scraper that fills it. One table instead of nine copy-pasted
# assignments, so adding or removing a column is a one-line change.
rescrape_columns = {
    'director': scrape_director,
    'writer': scrape_writer,
    'imdb_rating': scrape_imdb_rating,
    'imdb_rating_count': scrape_rating_count,
    'metascore': scrape_metascore,
    'user_review_count': scrape_user_review_count,
    'critic_review_count': scrape_critic_review_count,
    'color': scrape_color,
    'aspect_ratio': scrape_aspect_ratio,
}

for film_id in missing_film_ids:

    # timeout: a stalled connection raises instead of hanging the kernel forever
    content = requests.get(f"https://www.imdb.com/title/{film_id}", timeout=30).content
    # Explicit parser: without it bs4 uses whichever parser happens to be
    # installed (and warns), so results could differ between machines.
    soup = BeautifulSoup(content, "html.parser")

    # Row mask for this film, computed once and reused for every column
    is_current_film = df2['id'] == film_id

    for column, scraper in rescrape_columns.items():
        df2.loc[is_current_film, column] = scraper(soup)

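All three films now have data. The only values still missing are the metascore and the aspect ratio of the third film (tt11045422).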

In [19]:
# Rows of the films that were just re-scraped
df2.loc[df2['id'].isin(missing_film_ids)]

Unnamed: 0,id,director,writer,imdb_rating,imdb_rating_count,metascore,user_review_count,critic_review_count,color,aspect_ratio
559,tt0448115,David F. Sandberg,Henry Gayden;Bill Parker;Darren Lemke,7.0,302935,71.0,2157,395,Color,2.39 : 1
4609,tt9354842,Michael Fimognari,Sofia Alvarez;J. Mills Goodloe;Jenny Han,6.0,34966,54.0,167,58,Color,2.39 : 1
4704,tt11045422,Rakeysh Omprakash Mehra,Vijay Maurya;Anjum Rajabali,5.2,33954,,752,19,Color,


### Missing user review counts

In [20]:
# Rows with no value in the 'user_review_count' column
df2.loc[df2['user_review_count'].isna()]

Unnamed: 0,id,director,writer,imdb_rating,imdb_rating_count,metascore,user_review_count,critic_review_count,color,aspect_ratio
2720,tt10280276,David Dhawan,Rumi Jaffery;Farhad Samji,4.2,72596,,,25,,
2730,tt8695030,Jim Jarmusch,Jim Jarmusch,5.5,72358,53.0,,345,,
3847,tt0091225,Willard Huyck,Willard Huyck;Steve Gerber;Gloria Katz,4.7,45726,28.0,,112,,
4091,tt2178470,Ayan Mukherjee,Hussain Dalal;Ayan Mukherjee,7.2,41919,,,28,,
4578,tt0096332,Philip Kaufman,Milan Kundera;Philip Kaufman;Jean-Claude Carri...,7.3,35328,73.0,,58,,
4751,tt13491110,Mahesh Manjrekar,Siddharth Salvi;Mahesh Manjrekar;Abhijeet Shir...,7.6,33511,,,8,,
4901,tt0827503,Cem Yilmaz;Ali Taner Baltaci,Cem Yilmaz,7.4,31833,,,5,,


Let's extract the IDs of these films in a Pandas series

In [21]:
# IDs of the rows that have no user review count
missing_user_review_ids = df2.loc[df2["user_review_count"].isna(), "id"]

Let's scrape the user review count again

In [22]:
for film_id in missing_user_review_ids:
    # timeout: a stalled connection raises instead of hanging the kernel forever
    content = requests.get(f"https://www.imdb.com/title/{film_id}", timeout=30).content
    # Explicit parser: without it bs4 uses whichever parser happens to be
    # installed (and warns), so results could differ between machines.
    soup = BeautifulSoup(content, "html.parser")

    df2.loc[df2['id'] == film_id, 'user_review_count'] = scrape_user_review_count(soup)

We managed to retrieve the user review count for six of the seven films; only tt13491110 is still missing.

In [23]:
# Rows of the films whose user review count was just re-scraped
df2.loc[df2["id"].isin(missing_user_review_ids)]

Unnamed: 0,id,director,writer,imdb_rating,imdb_rating_count,metascore,user_review_count,critic_review_count,color,aspect_ratio
2720,tt10280276,David Dhawan,Rumi Jaffery;Farhad Samji,4.2,72596,,2984.0,25,,
2730,tt8695030,Jim Jarmusch,Jim Jarmusch,5.5,72358,53.0,1533.0,345,,
3847,tt0091225,Willard Huyck,Willard Huyck;Steve Gerber;Gloria Katz,4.7,45726,28.0,363.0,112,,
4091,tt2178470,Ayan Mukherjee,Hussain Dalal;Ayan Mukherjee,7.2,41919,,159.0,28,,
4578,tt0096332,Philip Kaufman,Milan Kundera;Philip Kaufman;Jean-Claude Carri...,7.3,35328,73.0,143.0,58,,
4751,tt13491110,Mahesh Manjrekar,Siddharth Salvi;Mahesh Manjrekar;Abhijeet Shir...,7.6,33511,,,8,,
4901,tt0827503,Cem Yilmaz;Ali Taner Baltaci,Cem Yilmaz,7.4,31833,,16.0,5,,


# Third dataframe