Importing Libraries 📚

In [1]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException

Getting all movie links from IMDb 🎥

In [2]:
# Download the IMDb Top 250 chart and build `links`, the list of absolute
# title-page URLs (one per chart row) consumed by the scraping cell below.

# Browser-like headers: presumably IMDb rejects or localizes requests that use
# the default python-requests User-Agent -- TODO confirm against the live site.
request_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "accept-language": "en-US,en;q=0.9",
}

# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(
    "https://www.imdb.com/chart/top/?ref_=nv_mv_250",
    headers=request_headers,
    timeout=30,
)
# Fail here with the HTTP status instead of later with an opaque AttributeError.
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")
table = soup.find("table", {"class" : "chart full-width"})
if table is None:
    raise RuntimeError(
        "Top 250 chart table not found in the response; "
        "the page layout may have changed or the request was blocked."
    )

# The first <tr> is skipped, as in the original (presumably the header row).
rows = table.find_all("tr")[1:]

links = []
for row in rows:
    title_col = row.find("td", {"class" : "titleColumn"})
    # Rows without a linked title cell are not movie entries; skip them
    # rather than crash on a None lookup.
    if title_col is None or title_col.a is None:
        continue
    links.append("https://www.imdb.com" + title_col.a["href"])

Getting Data from IMDB 🔍

In [14]:
# Scrape every title page in `links` and build:
#   * IMDB_df  - one row per movie (columns listed in `movie_columns`)
#   * names / name_id - parallel lists of credited people and their numeric
#     IMDb name ids, consumed by the cast cell further down.
# Title, year, certificate, runtime, genres and gross come from the rendered
# page (Selenium); director/writer/star come from a separate requests fetch
# of the same URL parsed with BeautifulSoup.

chrome_options = Options()
chrome_options.add_argument("--lang=en-US")
driver = webdriver.Chrome(options = chrome_options)

# Column order of the final movie table.
movie_columns = ["id", "title", "year", "parental_guide", "runtime", "genre",
                 "director", "writer", "star", "gross_us_canada"]

# Loop-invariant values, built once instead of once per movie.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36", "accept-language": "en-US,en;q=0.9"}
runtime_pattern = re.compile(r"(?:(\d+)h\s*)?(?:(\d+)m)?")


def _text_or_none(browser, css_selector):
    """Return the text of the first element matching `css_selector`, or None if there is none."""
    try:
        return browser.find_element(By.CSS_SELECTOR, css_selector).text
    except NoSuchElementException:
        return None


def _runtime_minutes(duration_text):
    """Convert text such as '2h 22m' to total minutes.

    Returns None when the text starts with neither an hours nor a minutes
    component, so an unrelated string is not recorded as a 0-minute runtime.
    """
    # Every part of the pattern is optional, so match() never returns None.
    match = runtime_pattern.match(duration_text)
    hours, minutes = match.group(1), match.group(2)
    if hours is None and minutes is None:
        return None
    return int(hours or 0) * 60 + int(minutes or 0)


names, name_id = [], []
records = []  # one dict per movie; turned into a DataFrame once, after the loop

try:
    for url in links:
        driver.get(url)

        # Every column starts as None so a failed lookup is a missing value.
        res = dict.fromkeys(movie_columns)
        res["id"] = re.findall(r"\d+", url)[0]

        res["title"] = _text_or_none(driver, "h1[data-testid='hero__pageTitle'] span")
        res["year"] = _text_or_none(driver, "a[href*='releaseinfo']")
        res["parental_guide"] = _text_or_none(driver, "a[href*='/parentalguide/certificates']")

        # find_elements returns an empty list instead of raising, so the
        # length is checked explicitly before indexing.
        # NOTE(review): position 5 is kept from the original code and
        # presumably depends on the page layout -- confirm.
        inline_items = driver.find_elements(By.XPATH, "//li[@class='ipc-inline-list__item']")
        if len(inline_items) > 5:
            res["runtime"] = _runtime_minutes(inline_items[5].text)

        genre_chips = driver.find_elements(By.XPATH, "//a[@class='ipc-chip ipc-chip--on-baseAlt']")
        res["genre"] = ", ".join(chip.text for chip in genre_chips)

        try:
            page = requests.get(url, headers = headers, timeout = 30)
            soup = BeautifulSoup(page.content, "html.parser")
            scrape = soup.find_all("ul", class_ = "ipc-metadata-list ipc-metadata-list--dividers-all title-pc-list ipc-metadata-list--baseAlt")
            credits_block = scrape[0]

            # Flatten every <li> to text and collapse the section rows to
            # bare labels, so the names sit between consecutive labels.
            # NOTE(review): a person whose name begins with "Director",
            # "Writer" or "Star" would also be collapsed -- confirm acceptable.
            cast_list = []
            for item in credits_block.find_all("li"):
                text = item.text
                if text.startswith("Director"):
                    text = "Director"
                elif text.startswith("Writer"):
                    text = "Writer"
                elif text.startswith("Star"):
                    text = "Star"
                cast_list.append(text)

            index_director = cast_list.index("Director")
            index_writer = cast_list.index("Writer")
            index_star = cast_list.index("Star")

            res["director"] = ", ".join(cast_list[index_director+1:index_writer])
            res["writer"] = ", ".join(cast_list[index_writer+1:index_star])
            res["star"] = ", ".join(cast_list[index_star+1:])

            # Name and id are read from the same anchor so the two lists
            # cannot drift out of step.
            # Assumes each credited person is a single /name/ link -- TODO confirm.
            movie_names, movie_name_ids = [], []
            for a in credits_block.find_all("a", href = True):
                if a["href"].startswith("/name/"):
                    digits = re.findall(r"\d+", a["href"].replace("/name/", ""))
                    if digits:
                        movie_names.append(a.text)
                        movie_name_ids.append(digits[0])
            names += movie_names
            name_id += movie_name_ids

        except (IndexError, ValueError, requests.RequestException):
            # Credits block missing, a section label absent, or the fetch
            # failed: leave director/writer/star as None for this movie.
            res["director"] = res["writer"] = res["star"] = None

        # NOTE(review): position 3 is kept from the original code and is
        # presumably the "Gross US & Canada" entry -- confirm.
        detail_items = driver.find_elements(By.XPATH,"//span[@class='ipc-metadata-list-item__list-content-item']")
        if len(detail_items) > 3:
            res["gross_us_canada"] = detail_items[3].text.replace(",", "").replace("$", "")

        records.append(res)
finally:
    # Always close the browser, even if scraping aborts part-way.
    driver.quit()

# Build the frame once; per-row DataFrame.append is quadratic and was removed
# in pandas 2.0.
IMDB_df = pd.DataFrame(records, columns = movie_columns)

# Nullable integer dtype: stays integral while allowing missing runtimes.
IMDB_df["runtime"] = IMDB_df["runtime"].astype("Int64")

DataFrame for Movie Details 🎬

In [15]:
# Keep a gross figure only when its string form is purely numeric; anything
# else (missing values, leftover text) is replaced by None.
IMDB_df["gross_us_canada"] = IMDB_df["gross_us_canada"].apply(
    lambda raw: None if not str(raw).isnumeric() else raw
)

# Write the movie table to disk without the row index.
IMDB_df.to_csv(path_or_buf = "IMDB_Movies.csv", index = False)

DataFrame for Cast Details 🎭

In [16]:
# Pair each collected person name with its IMDb name id, drop repeated
# (name, name_id) rows, and write the result to disk without the row index.
IMDB_character_df = (
    pd.DataFrame(dict(name = names, name_id = name_id))
    .drop_duplicates()
)
IMDB_character_df.to_csv(path_or_buf = "IMDB_Names.csv", index = False)