# Import Libraries

In [2]:
import pandas as pd
import numpy as np
import re
import lxml
from bs4 import BeautifulSoup
from requests import get
from tqdm import tqdm

# Scrape data from IMDB

In [3]:
urls = ['https://www.imdb.com/search/title/?title_type=feature&ref_=adv_prv']

index = 51
for i in range(199): # after 10k, url style chages. Have to come up with another solution.
    urls.append('https://www.imdb.com/search/title/?title_type=feature&start=%d&ref_=adv_nxt'
                % index)
    index += 50

In [5]:
movies = []

for url in tqdm(urls, position=0, leave=True):
    
    # Get data from url and parse it.
    page = get(url)
    soup = BeautifulSoup(page.content, 'lxml') # 'html' give same result
    
    # Scrape from wanted fields
    content = soup.find(id="main")
    movie_data = content.find_all("div", class_='lister-item mode-advanced')
    
    for i in range(len(movie_data)):

        # Header
        header = movie_data[i].find('h3')
        try:
            link = header.find('a').attrs['href']
            link = 'imdb.com' + link
        except:
            link = None    
        try:
            title = header.find('a').text
        except:
            title = None
        try:    
            year = header.find('span', class_='lister-item-year text-muted unbold').text.replace('(','').replace(')','')
            try:
                year = int(year)
            except: # some has got 'II 2020'
                year = int(year.split(' ')[1])
        except:
            year = None

        # Info
        info = movie_data[i].find_all('p', class_='text-muted')[0]
        try:
            certificate = info.find('span', class_='certificate').text
        except:
            certificate = None
        try:
            runtime = int(info.find('span', class_='runtime').text.split(' ')[0])
        except:
            runtime = None
        try:
            genre = info.find('span', class_='genre').text.replace('\n','').strip()#.split(',')
        except:
            genre = None

        # Score
        score = movie_data[i].find('div', class_='ratings-bar')
        try:
            rating = float(score.find('strong').text)
        except:
            rating = None
        try:
            metascore = int(score.find('span', class_='metascore favorable').text.replace(' ',''))
        except:
            metascore = None

        # Story
        try:
            story = movie_data[i].find_all('p', class_='text-muted')[1].text.replace('\n','').strip()
        except:
            story = None

        # People
        people = movie_data[i].find('p', class_='').text

        if '|' in people: # movies
            try:
                director = people.split('|')[0].replace('\n','').strip().replace('Directors:','').replace('Director:','')
                #director = [name.strip() for name in director.split(',')]
            except:
                director = None
            try:
                cast = people.split('|')[1].replace('\n','').strip().replace('Stars:','')
                #cast = [name.strip() for name in cast.split(',')]
            except:
                cast = None

        else: # series
            director = None
            try:
                cast = people.replace('\n','').strip().replace('Stars:','')
                #cast = [name.strip() for name in cast.split(',')]
            except:
                cast = None

        # Store each in a dict
        movie_dict = {}
        movie_dict['link'] = link
        movie_dict['title'] = title
        movie_dict['year'] = year
        movie_dict['certificate'] = certificate
        movie_dict['runtime'] = runtime
        movie_dict['genre'] = genre
        movie_dict['rating'] = rating
        movie_dict['metascore'] = metascore
        movie_dict['story'] = story
        movie_dict['director'] = director
        movie_dict['cast'] = cast

        # Append the dict to a list
        movies.append(movie_dict)

100%|██████████| 200/200 [07:41<00:00,  2.52s/it]


# Convert to DataFrame
DataFrame converts ```year```, ```runtime```, and ```metascore``` which were ```int``` to ```float```. Further processings, include ```.dropna()``` and ```.astype()```, are required to convert them back.

In [6]:
# Convert to DataFrame and rearrange columns order
df = pd.DataFrame(movies, columns=['title','year','certificate','runtime','genre','rating','metascore','story','director','cast','link'])
df

Unnamed: 0,title,year,certificate,runtime,genre,rating,metascore,story,director,cast,link
0,365 dni,2020.0,,114.0,"Drama, Romance",3.5,,Massimo is a member of the Sicilian Mafia fami...,"Barbara Bialowas, Tomasz Mandes","Michele Morrone, Anna Maria Sieklucka, Bronisl...",imdb.com/title/tt10886166/
1,Da 5 Bloods,2020.0,R,154.0,"Adventure, Drama, War",6.7,82.0,Four African American vets battle the forces o...,Spike Lee,"Delroy Lindo, Jonathan Majors, Clarke Peters, ...",imdb.com/title/tt9777644/
2,Artemis Fowl,2020.0,PG,95.0,"Adventure, Family, Fantasy",4.1,,"Artemis Fowl, a young criminal prodigy, hunts ...",Kenneth Branagh,"Ferdia Shaw, Lara McDonnell, Josh Gad, Tamara ...",imdb.com/title/tt3089630/
3,The King of Staten Island,2020.0,R,136.0,"Comedy, Drama",7.2,68.0,Scott has been a case of arrested development ...,Judd Apatow,"Pete Davidson, Bel Powley, Ricky Velez, Lou Wi...",imdb.com/title/tt9686708/
4,Knives Out,2019.0,15,130.0,"Comedy, Crime, Drama",7.9,82.0,A detective investigates the death of a patria...,Rian Johnson,"Daniel Craig, Chris Evans, Ana de Armas, Jamie...",imdb.com/title/tt8946378/
5,You Should Have Left,2020.0,R,93.0,"Drama, Horror, Mystery",5.3,,"A former banker, his actress wife, and their s...",David Koepp,"Kevin Bacon, Amanda Seyfried, Avery Tiiu Essex...",imdb.com/title/tt8201852/
6,Palm Springs,2020.0,,90.0,"Comedy, Romance",7.2,83.0,When carefree Nyles and reluctant maid of hono...,Max Barbakow,"Andy Samberg, Cristin Milioti, J.K. Simmons, P...",imdb.com/title/tt9484998/
7,Tenet,2020.0,PG-13,,"Action, Sci-Fi, Thriller",,,Armed with only one word -- Tenet -- and fight...,Christopher Nolan,"John David Washington, Robert Pattinson, Eliza...",imdb.com/title/tt6723592/
8,Uncut Gems,2019.0,R,135.0,"Crime, Drama, Thriller",7.5,90.0,With his debts mounting and angry collectors c...,"Benny Safdie, Josh Safdie","Adam Sandler, Julia Fox, Idina Menzel, Mesfin ...",imdb.com/title/tt5727208/
9,Gisaengchung,2019.0,15,132.0,"Comedy, Drama, Thriller",8.6,96.0,Greed and class discrimination threaten the ne...,Bong Joon Ho,"Kang-ho Song, Sun-kyun Lee, Yeo-jeong Jo, Woo-...",imdb.com/title/tt6751668/


# Save to CSV File
to load .csv into DataFrame, use ```pd.read_csv('file_path')```.

In [7]:
df.to_csv('imbd_movies.csv', index=False)

# Useful Links

credit: https://github.com/Reljod/Python-Data-Scraping-IMDb-Movie-site-using-BeautifulSoup-Series-1-

resource: https://www.imdb.com/search/title/?title_type=feature,tv_series&view=advanced