# Import Libraries

In [51]:
import pandas as pd
import numpy as np
import re
import lxml
from bs4 import BeautifulSoup
from requests import get
from tqdm import tqdm

# Scrape data from IMDB

In [343]:
urls = ['https://www.imdb.com/search/title/?title_type=feature&ref_=adv_prv',
        'https://www.imdb.com/search/title/?title_type=feature&start=51&ref_=adv_nxt',
        'https://www.imdb.com/search/title/?title_type=feature&start=101&ref_=adv_nxt',
        'https://www.imdb.com/search/title/?title_type=feature&start=151&ref_=adv_nxt',
        'https://www.imdb.com/search/title/?title_type=feature&start=201&ref_=adv_nxt',
        'https://www.imdb.com/search/title/?title_type=feature&start=251&ref_=adv_nxt',
        'https://www.imdb.com/search/title/?title_type=feature&start=301&ref_=adv_nxt',
        'https://www.imdb.com/search/title/?title_type=feature&start=351&ref_=adv_nxt',
        'https://www.imdb.com/search/title/?title_type=feature&start=401&ref_=adv_nxt',
        'https://www.imdb.com/search/title/?title_type=feature&start=451&ref_=adv_nxt'
       ]

In [348]:
movies = []

for url in tqdm(urls, position=0, leave=True):
    
    # Get data from url and parse it.
    page = get(url)
    soup = BeautifulSoup(page.content, 'lxml') # 'html' give same result
    
    # Scrape from wanted fields
    content = soup.find(id="main")
    movie_data = content.find_all("div", class_='lister-item mode-advanced')
    
    for i in range(len(movie_data)):

        # Header
        header = movie_data[i].find('h3')
        try:
            link = header.find('a').attrs['href']
            link = 'imdb.com' + link
        except:
            link = None    
        try:
            title = header.find('a').text
        except:
            title = None
        try:    
            year = header.find('span', class_='lister-item-year text-muted unbold').text.replace('(','').replace(')','')
            try:
                year = int(year)
            except: # some has got 'II 2020'
                year = int(year.split(' ')[1])
        except:
            year = None

        # Info
        info = movie_data[i].find_all('p', class_='text-muted')[0]
        try:
            certificate = info.find('span', class_='certificate').text
        except:
            certificate = None
        try:
            runtime = int(info.find('span', class_='runtime').text.split(' ')[0])
        except:
            runtime = None
        try:
            genre = info.find('span', class_='genre').text.replace(' ','').replace('\n','').split(',')
        except:
            genre = None

        # Score
        score = movie_data[i].find('div', class_='ratings-bar')
        try:
            rating = float(score.find('strong').text)
        except:
            rating = None
        try:
            metascore = int(score.find('span', class_='metascore favorable').text.replace(' ',''))
        except:
            metascore = None

        # Story
        try:
            story = movie_data[i].find_all('p', class_='text-muted')[1].text.replace('\n','').strip()
        except:
            story = None

        # People
        people = movie_data[i].find('p', class_='').text

        if '|' in people: # movies
            try:
                director = people.split('|')[0].replace('\n','').strip().replace('Directors:','').replace('Director:','')
                director = [name.strip() for name in director.split(',')]
            except:
                director = None
            try:
                cast = people.split('|')[1].replace('\n','').strip().replace('Stars:','')
                cast = [name.strip() for name in cast.split(',')]
            except:
                cast = None

        else: # series
            director = None
            try:
                cast = people.replace('\n','').strip().replace('Stars:','')
                cast = [name.strip() for name in cast.split(',')]
            except:
                cast = None

        # Store each in a dict
        movie_dict = {}
        movie_dict['link'] = link
        movie_dict['title'] = title
        movie_dict['year'] = year
        movie_dict['certificate'] = certificate
        movie_dict['runtime'] = runtime
        movie_dict['genre'] = genre
        movie_dict['rating'] = rating
        movie_dict['metascore'] = metascore
        movie_dict['story'] = story
        movie_dict['director'] = director
        movie_dict['cast'] = cast

        # Append the dict to a list
        movies.append(movie_dict)

100%|██████████| 10/10 [00:16<00:00,  1.68s/it]


# Convert to DataFrame
DataFrame converts ```year```, ```runtime```, and ```metascore``` which were ```int``` to ```float```. Further processings, include ```.dropna()``` and ```.astype()```, are required to convert them back.

In [369]:
# Convert to DataFrame and rearrange columns order
df = pd.DataFrame(movies, columns=['title','year','certificate','runtime','genre','rating','metascore','story','director','cast','link'])
df

Unnamed: 0,title,year,certificate,runtime,genre,rating,metascore,story,director,cast,link
0,Uncut Gems,2019.0,R,135.0,"[Crime, Drama, Thriller]",7.5,90.0,With his debts mounting and angry collectors c...,"[Benny Safdie, Josh Safdie]","[Adam Sandler, Julia Fox, Idina Menzel, Mesfin...",imdb.com/title/tt5727208/
1,Tenet,2020.0,,,"[Action, Sci-Fi, Thriller]",,,Armed with only one word -- Tenet -- and fight...,[Christopher Nolan],"[John David Washington, Robert Pattinson, Eliz...",imdb.com/title/tt6723592/
2,The Vast of Night,2019.0,PG-13,89.0,"[Drama, Mystery, Sci-Fi]",6.7,84.0,"In the twilight of the 1950s, on one fateful n...",[Andrew Patterson],"[Sierra McCormick, Jake Horowitz, Gail Cronaue...",imdb.com/title/tt6803046/
3,Secret Society of Second-Born Royals,2020.0,,,"[Action, Adventure, Comedy]",,,It follows Sam's adventures at a top-secret tr...,[Anna Mastro],"[Elodie Yung, Olivia Deeble, Peyton Elizabeth ...",imdb.com/title/tt10324122/
4,The Wrong Missy,2020.0,,90.0,"[Comedy, Romance]",5.7,,Tim thinks he's invited the woman of his dream...,[Tyler Spindel],"[David Spade, Lauren Lapkus, Nick Swardson, Ge...",imdb.com/title/tt9619798/
5,Joker,2019.0,18,122.0,"[Crime, Drama, Thriller]",8.5,,"In Gotham City, mentally troubled comedian Art...",[Todd Phillips],"[Joaquin Phoenix, Robert De Niro, Zazie Beetz,...",imdb.com/title/tt7286456/
6,The Last Days of American Crime,2020.0,,148.0,"[Action, Crime, Thriller]",3.7,,"In the not-too-distant future, as a final resp...",[Olivier Megaton],"[Neels Clasen, Edgar Ramírez, Tony Caprari, Ka...",imdb.com/title/tt1552211/
7,The Hunt,2020.0,R,90.0,"[Action, Horror, Thriller]",6.4,,Twelve strangers wake up in a clearing. They d...,[Craig Zobel],"[Betty Gilpin, Hilary Swank, Ike Barinholtz, W...",imdb.com/title/tt8244784/
8,365 dni,2020.0,,114.0,[Drama],3.8,,Massimo is a member of the Sicilian Mafia fami...,"[Barbara Bialowas, Tomasz Mandes]","[Michele Morrone, Anna Maria Sieklucka, Bronis...",imdb.com/title/tt10886166/
9,Snowpiercer,2013.0,R,126.0,"[Action, Drama, Sci-Fi]",7.1,84.0,In a future where a failed climate-change expe...,[Bong Joon Ho],"[Chris Evans, Jamie Bell, Tilda Swinton, Ed Ha...",imdb.com/title/tt1706620/


# Save to CSV File
to load .csv into DataFrame, use ```pd.read_csv('file_path')```.

In [365]:
df.to_csv('imbd_movies.csv', index=False)

# Useful Links

credit: https://github.com/Reljod/Python-Data-Scraping-IMDb-Movie-site-using-BeautifulSoup-Series-1-

resource: https://www.imdb.com/search/title/?title_type=feature,tv_series&view=advanced