In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

import time
import json
from tqdm import tqdm

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

In [87]:
# Load the raw IMDb movie-id listing, then show its shape and first rows.
df = pd.read_csv('../data/raw/imdb_movies_ids.csv')
print(df.shape)
df.head()

(35653, 7)


Unnamed: 0,id,title,year,runtime,rating,votes,plot
0,tt0172495,1. Gladiator,2000,2000,8.5 (1.6M),(1.6M),A former Roman General sets out to exact venge...
1,tt0217869,2. Unbreakable,2000,2000,7.3 (441K),(441K),A man learns something extraordinary about him...
2,tt0208092,3. Snatch,2000,2000,8.2 (909K),(909K),"Unscrupulous boxing promoters, violent bookmak..."
3,tt0144084,4. American Psycho,2000,2000,7.6 (714K),(714K),A wealthy New York City investment banking exe...
4,tt0180093,5. Requiem for a Dream,2000,2000,8.3 (897K),(897K),The drug-induced utopias of four Coney Island ...


In [9]:
# Inspect per-column dtypes and non-null counts of the raw listing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35653 entries, 0 to 35652
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       35653 non-null  object
 1   title    35653 non-null  object
 2   year     35650 non-null  object
 3   runtime  35650 non-null  object
 4   rating   35653 non-null  object
 5   votes    35653 non-null  object
 6   plot     33752 non-null  object
dtypes: object(7)
memory usage: 1.9+ MB


In [89]:
# Convert abbreviated vote strings such as "(1.6M)" or "(441K)" into integer counts.
# astype(str) keeps this cell re-runnable after 'votes' has already been converted.
votes_text = df['votes'].astype(str).str.replace(r'[()]', '', regex=True).str.strip()
# Split each value into its numeric part and an optional K/M magnitude suffix.
# Values that do not match come back as NaN and make the final int cast fail loudly.
votes_parts = votes_text.str.extract(r'^(?P<number>\d+(?:\.\d+)?)(?P<suffix>[KM]?)$')
# No suffix (plain counts) maps to NaN here, hence the fillna(1).
votes_scale = votes_parts['suffix'].map({'K': 1000, 'M': 1000000}).fillna(1)
# Round before casting so a float product just below a whole number is not truncated.
df['votes'] = (votes_parts['number'].astype(float) * votes_scale).round().astype(int)
df['votes'].head()

0    1600000
1     441000
2     909000
3     714000
4     897000
Name: votes, dtype: int64

# With 35,653 IMDb movie records, scraping the summary and synopsis of every movie would create a huge dataset, so we keep only the movies with more than 500 votes to reduce its size.

In [90]:
# Keep only the titles whose vote count is strictly greater than 500.
min_votes_500 = df.loc[df['votes'].gt(500)]
min_votes_500.shape

(7554, 7)

In [91]:
# Preview the first rows of the vote-filtered listing.
min_votes_500.head()

Unnamed: 0,id,title,year,runtime,rating,votes,plot
0,tt0172495,1. Gladiator,2000,2000,8.5 (1.6M),1600000,A former Roman General sets out to exact venge...
1,tt0217869,2. Unbreakable,2000,2000,7.3 (441K),441000,A man learns something extraordinary about him...
2,tt0208092,3. Snatch,2000,2000,8.2 (909K),909000,"Unscrupulous boxing promoters, violent bookmak..."
3,tt0144084,4. American Psycho,2000,2000,7.6 (714K),714000,A wealthy New York City investment banking exe...
4,tt0180093,5. Requiem for a Dream,2000,2000,8.3 (897K),897000,The drug-induced utopias of four Coney Island ...


In [92]:
# Persist the vote-filtered listing; index=False leaves the DataFrame index out of the file.
min_votes_500.to_csv('../data/raw/imdb_movies_ids_min_votes_500.csv', index=False)

In [20]:
def create_url(id, attribute=[]):
    """Build the IMDb URL for a title.

    Args:
        id: IMDb title identifier (converted with ``str``), e.g. ``'tt0172495'``.
        attribute: Collection of requested page attributes; when it contains
            ``'synopsis'`` the plot-summary sub-page URL is returned.

    Returns:
        The title page URL, or the plot-summary page URL when requested.
    """
    page_suffix = "plotsummary/" if 'synopsis' in attribute else ""
    return f"https://www.imdb.com/title/{id!s}/{page_suffix}"

In [21]:
# Configure Chrome and start the browser session that load_page() drives through
# the module-level `driver`.
chrome_options = Options()
# NOTE(review): in Chrome's managed content settings the value 2 means "block",
# so this presumably disables image loading to speed up scraping — confirm.
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)

driver = webdriver.Chrome(options=chrome_options)

In [22]:
def load_page(url, wait_seconds=1):
    """Open ``url`` in the shared browser and return the parsed page.

    Uses the module-level ``driver`` to navigate, pauses, then parses the
    current page source with BeautifulSoup's ``html.parser``.

    Args:
        url: Address to load.
        wait_seconds: Pause after navigation before the page source is read.
            Defaults to 1, the previously hard-coded delay; pass 0 to skip it.

    Returns:
        A ``BeautifulSoup`` object built from ``driver.page_source``.
    """
    driver.get(url)
    # Fixed pause after navigation (presumably to let client-side content render
    # and to throttle requests — confirm before lowering the default).
    if wait_seconds > 0:
        time.sleep(wait_seconds)

    return BeautifulSoup(driver.page_source, 'html.parser')

In [23]:
def _person_names(entries):
    """Return the 'name' of each JSON-LD person entry; accepts a list, a single object, or nothing."""
    if not entries:
        return []
    if isinstance(entries, dict):
        # JSON-LD may carry a lone object instead of a one-element list.
        entries = [entries]
    return [entry.get('name', '') for entry in entries]


def get_movie_data(id):
    """Scrape the detail fields of one IMDb title.

    Loads the title page, reads its ``application/ld+json`` script block, then
    loads the plot-summary page for the synopsis and summary text.

    Args:
        id: IMDb title identifier passed to ``create_url``.

    Returns:
        A dict with the keys ``image_link``, ``imdb_id``, ``rating_count``,
        ``best_rating``, ``worst_rating``, ``rating``, ``genre``, ``keywords``,
        ``trailer_link``, ``actors`` and ``director``; ``synopsis`` and
        ``summary`` are added only when the matching section is found.
        An empty dict is returned when the title page has no JSON-LD block.

    Raises:
        Whatever ``json.loads`` raises when the JSON-LD block cannot be parsed;
        page-loading errors from ``load_page`` propagate as well.
    """
    soup = load_page(create_url(id))
    ld_json_tag = soup.find('script', {"type": "application/ld+json"})
    movie = {}

    if ld_json_tag is None:
        return movie

    movie_data = json.loads(ld_json_tag.string)
    # `or` fallbacks also cover fields that are present but null in the JSON.
    aggregate_rating = movie_data.get('aggregateRating') or {}
    trailer = movie_data.get('trailer') or {}
    keywords = movie_data.get('keywords') or ''

    # Key order is kept stable because it determines the column order of the
    # DataFrame later built from these dicts.
    movie['image_link'] = movie_data.get('image', '')
    movie['imdb_id'] = id
    movie['rating_count'] = aggregate_rating.get('ratingCount', '')
    movie['best_rating'] = aggregate_rating.get('bestRating', '')
    movie['worst_rating'] = aggregate_rating.get('worstRating')
    movie['rating'] = aggregate_rating.get('ratingValue')
    movie['genre'] = movie_data.get('genre')
    # Drop empty pieces so a missing keyword string yields [] instead of [''].
    movie['keywords'] = [keyword for keyword in keywords.split(',') if keyword]
    movie['trailer_link'] = trailer.get('url', '')
    movie['actors'] = _person_names(movie_data.get('actor'))
    movie['director'] = _person_names(movie_data.get('director'))

    # Synopsis and summary live on the separate plot-summary page.
    synopsis_soup = load_page(create_url(id, attribute=['synopsis']))
    synopsis_div = synopsis_soup.select_one('[data-testid="sub-section-synopsis"] .ipc-html-content-inner-div')
    if synopsis_div is not None:
        movie['synopsis'] = synopsis_div.text
    summary_div = synopsis_soup.select_one('[data-testid="sub-section-summaries"] .ipc-html-content-inner-div')
    if summary_div is not None:
        movie['summary'] = summary_div.text

    return movie

In [None]:
# Scrape every title that passed the vote filter, collecting one dict per movie.
movies = []
failed_ids = []  # ids whose scrape raised, kept so they can be inspected or retried
try:
    for imdb_id in tqdm(min_votes_500['id']):
        try:
            movie = get_movie_data(imdb_id)
        except Exception as e:
            # Best-effort scrape: report which id failed and carry on with the rest.
            failed_ids.append(imdb_id)
            print(f"{imdb_id}: {e}")
            continue
        # get_movie_data returns an empty dict when the page has no JSON-LD block.
        if movie:
            movies.append(movie)
finally:
    # Always close the browser, even when the run is interrupted part-way.
    driver.quit()

In [20]:
# Turn the scraped movie dicts into a DataFrame and persist it as CSV;
# index=False leaves the DataFrame index out of the file.
movies_df = pd.DataFrame(movies)
movies_df.to_csv('../data/raw/imdb_movies_detail_min_votes_500.csv', index=False)