# Import Libraries

In [2]:
import pandas as pd
import numpy as np
import re
import lxml
from bs4 import BeautifulSoup
from requests import get
from tqdm import tqdm

# Scrape data from IMDB

### By popularities

In [3]:
# By popularities
urls = ['https://www.imdb.com/search/title/?title_type=feature&ref_=adv_prv']

index = 51
for i in range(199): # after 10k, url style chages. Have to come up with another solution.
    urls.append('https://www.imdb.com/search/title/?title_type=feature&start=%d&ref_=adv_nxt'
                % index)
    index += 50

### By genres, so classes are nearly balanced

In [34]:
def generate_url_by_genre(genres, num_page):
    '''
    Generates urls for scraping.
    
    Arguments:
    genres -- List of genres for scraping.
    num_page -- Number of pages to scrape for each genre.
    
    Returns:
    urls -- List of urls for scraping.
    '''
    
    urls = []
    for genre in genres:
        first_page = 'https://www.imdb.com/search/title/?title_type=feature&genres=%s&explore=title_type,genres&ref_=adv_prv'%genre
        urls.append(first_page)

        index = 51
        for i in range(num_page-1): # after 10k, url style chages. Have to come up with another solution.
            urls.append('https://www.imdb.com/search/title/?title_type=feature&genres=%s&start=%d&explore=title_type,genres&ref_=adv_nxt'
                        % (genre, index))
            index += 50
        
    print('Total of %d urls'%len(urls))
    return urls

In [50]:
genres = ['comedy','sci-fi','horror','romance','action','thriller','drama','mystery','crime','animation','adventure','fantasy']
urls = generate_url_by_genre(genres, num_page=199)

Total of 2388 urls


### Scrape

In [51]:
movies = []

for url in tqdm(urls, position=0, leave=True):
    
    # Get data from url and parse it.
    page = get(url)
    soup = BeautifulSoup(page.content, 'lxml') # 'html' give same result
    
    # Scrape from wanted fields
    content = soup.find(id="main")
    movie_data = content.find_all("div", class_='lister-item mode-advanced')
    
    for i in range(len(movie_data)):

        # Header
        header = movie_data[i].find('h3')
        try:
            link = header.find('a').attrs['href']
            link = 'imdb.com' + link
        except:
            link = None    
        try:
            title = header.find('a').text
        except:
            title = None
        try:    
            year = header.find('span', class_='lister-item-year text-muted unbold').text.replace('(','').replace(')','')
            try:
                year = int(year)
            except: # some has got 'II 2020'
                year = int(year.split(' ')[1])
        except:
            year = None

        # Info
        info = movie_data[i].find_all('p', class_='text-muted')[0]
        try:
            certificate = info.find('span', class_='certificate').text
        except:
            certificate = None
        try:
            runtime = int(info.find('span', class_='runtime').text.split(' ')[0])
        except:
            runtime = None
        try:
            genre = info.find('span', class_='genre').text.replace('\n','').strip()#.split(',')
        except:
            genre = None

        # Score
        score = movie_data[i].find('div', class_='ratings-bar')
        try:
            rating = float(score.find('strong').text)
        except:
            rating = None
        try:
            metascore = int(score.find('span', class_='metascore favorable').text.replace(' ',''))
        except:
            metascore = None

        # Story
        try:
            story = movie_data[i].find_all('p', class_='text-muted')[1].text.replace('\n','').strip()
        except:
            story = None

        # People
        people = movie_data[i].find('p', class_='').text

        if '|' in people: # movies
            try:
                director = people.split('|')[0].replace('\n','').strip().replace('Directors:','').replace('Director:','')
                #director = [name.strip() for name in director.split(',')]
            except:
                director = None
            try:
                cast = people.split('|')[1].replace('\n','').strip().replace('Stars:','')
                #cast = [name.strip() for name in cast.split(',')]
            except:
                cast = None

        else: # series
            director = None
            try:
                cast = people.replace('\n','').strip().replace('Stars:','')
                #cast = [name.strip() for name in cast.split(',')]
            except:
                cast = None

        # Store each in a dict
        movie_dict = {}
        movie_dict['link'] = link
        movie_dict['title'] = title
        movie_dict['year'] = year
        movie_dict['certificate'] = certificate
        movie_dict['runtime'] = runtime
        movie_dict['genre'] = genre
        movie_dict['rating'] = rating
        movie_dict['metascore'] = metascore
        movie_dict['story'] = story
        movie_dict['director'] = director
        movie_dict['cast'] = cast

        # Append the dict to a list
        movies.append(movie_dict)

100%|██████████| 2388/2388 [2:00:06<00:00,  3.96s/it]  


# Convert to DataFrame
DataFrame converts ```year```, ```runtime```, and ```metascore``` which were ```int``` to ```float```. Further processings, include ```.dropna()``` and ```.astype()```, are required to convert them back.

In [52]:
# Convert to DataFrame and rearrange columns order
df = pd.DataFrame(movies, columns=['title','year','certificate','runtime','genre','rating','metascore','story','director','cast','link'])
df

Unnamed: 0,title,year,certificate,runtime,genre,rating,metascore,story,director,cast,link
0,The King of Staten Island,2020.0,R,136.0,"Comedy, Drama",7.2,68.0,Scott has been a case of arrested development ...,Judd Apatow,"Pete Davidson, Bel Powley, Ricky Velez, Lou Wi...",imdb.com/title/tt9686708/
1,Knives Out,2019.0,15,130.0,"Comedy, Crime, Drama",7.9,82.0,A detective investigates the death of a patria...,Rian Johnson,"Daniel Craig, Chris Evans, Ana de Armas, Jamie...",imdb.com/title/tt8946378/
2,Palm Springs,2020.0,,90.0,"Comedy, Romance",7.2,83.0,When carefree Nyles and reluctant maid of hono...,Max Barbakow,"Andy Samberg, Cristin Milioti, J.K. Simmons, P...",imdb.com/title/tt9484998/
3,Gisaengchung,2019.0,15,132.0,"Comedy, Drama, Thriller",8.6,96.0,Greed and class discrimination threaten the ne...,Bong Joon Ho,"Kang-ho Song, Sun-kyun Lee, Yeo-jeong Jo, Woo-...",imdb.com/title/tt6751668/
4,Once Upon a Time... in Hollywood,2019.0,18,161.0,"Comedy, Drama",7.7,83.0,A faded television actor and his stunt double ...,Quentin Tarantino,"Leonardo DiCaprio, Brad Pitt, Margot Robbie, E...",imdb.com/title/tt7131622/
5,The Gentlemen,2019.0,15,113.0,"Action, Comedy, Crime",7.9,,An American expat tries to sell off his highly...,Guy Ritchie,"Matthew McConaughey, Charlie Hunnam, Michelle ...",imdb.com/title/tt8367814/
6,Bill & Ted Face the Music,2020.0,,,"Comedy, Music, Sci-Fi",,,Once told they'd save the universe during a ti...,Dean Parisot,"Keanu Reeves, Alex Winter, Samara Weaving, Bri...",imdb.com/title/tt1086064/
7,Gulabo Sitabo,2020.0,,124.0,"Comedy, Drama",6.5,,Two scheming men get caught up in a game of on...,Shoojit Sircar,"Amitabh Bachchan, Ayushmann Khurrana, Vijay Ra...",imdb.com/title/tt10333912/
8,The Personal History of David Copperfield,2019.0,PG,119.0,"Comedy, Drama",6.3,75.0,A modern take on Charles Dickens's classic tal...,Armando Iannucci,"Dev Patel, Hugh Laurie, Tilda Swinton, Peter C...",imdb.com/title/tt6439020/
9,Eurovision Song Contest: The Story of Fire Saga,2020.0,PG-13,123.0,"Comedy, Music",6.7,,When aspiring musicians Lars and Sigrit are gi...,David Dobkin,"Will Ferrell, Rachel McAdams, Pierce Brosnan, ...",imdb.com/title/tt8580274/


Remove duplicated rows using ```.drop_duplicates()```

In [53]:
df.drop_duplicates(subset=['title'], inplace=True)

In [54]:
df

Unnamed: 0,title,year,certificate,runtime,genre,rating,metascore,story,director,cast,link
0,The King of Staten Island,2020.0,R,136.0,"Comedy, Drama",7.2,68.0,Scott has been a case of arrested development ...,Judd Apatow,"Pete Davidson, Bel Powley, Ricky Velez, Lou Wi...",imdb.com/title/tt9686708/
1,Knives Out,2019.0,15,130.0,"Comedy, Crime, Drama",7.9,82.0,A detective investigates the death of a patria...,Rian Johnson,"Daniel Craig, Chris Evans, Ana de Armas, Jamie...",imdb.com/title/tt8946378/
2,Palm Springs,2020.0,,90.0,"Comedy, Romance",7.2,83.0,When carefree Nyles and reluctant maid of hono...,Max Barbakow,"Andy Samberg, Cristin Milioti, J.K. Simmons, P...",imdb.com/title/tt9484998/
3,Gisaengchung,2019.0,15,132.0,"Comedy, Drama, Thriller",8.6,96.0,Greed and class discrimination threaten the ne...,Bong Joon Ho,"Kang-ho Song, Sun-kyun Lee, Yeo-jeong Jo, Woo-...",imdb.com/title/tt6751668/
4,Once Upon a Time... in Hollywood,2019.0,18,161.0,"Comedy, Drama",7.7,83.0,A faded television actor and his stunt double ...,Quentin Tarantino,"Leonardo DiCaprio, Brad Pitt, Margot Robbie, E...",imdb.com/title/tt7131622/
5,The Gentlemen,2019.0,15,113.0,"Action, Comedy, Crime",7.9,,An American expat tries to sell off his highly...,Guy Ritchie,"Matthew McConaughey, Charlie Hunnam, Michelle ...",imdb.com/title/tt8367814/
6,Bill & Ted Face the Music,2020.0,,,"Comedy, Music, Sci-Fi",,,Once told they'd save the universe during a ti...,Dean Parisot,"Keanu Reeves, Alex Winter, Samara Weaving, Bri...",imdb.com/title/tt1086064/
7,Gulabo Sitabo,2020.0,,124.0,"Comedy, Drama",6.5,,Two scheming men get caught up in a game of on...,Shoojit Sircar,"Amitabh Bachchan, Ayushmann Khurrana, Vijay Ra...",imdb.com/title/tt10333912/
8,The Personal History of David Copperfield,2019.0,PG,119.0,"Comedy, Drama",6.3,75.0,A modern take on Charles Dickens's classic tal...,Armando Iannucci,"Dev Patel, Hugh Laurie, Tilda Swinton, Peter C...",imdb.com/title/tt6439020/
9,Eurovision Song Contest: The Story of Fire Saga,2020.0,PG-13,123.0,"Comedy, Music",6.7,,When aspiring musicians Lars and Sigrit are gi...,David Dobkin,"Will Ferrell, Rachel McAdams, Pierce Brosnan, ...",imdb.com/title/tt8580274/


# Save to CSV File
to load .csv into DataFrame, use ```pd.read_csv('file_path')```.

In [55]:
df.to_csv('imbd_movies_by_genre.csv', index=False)

# Useful Links

credit: https://github.com/Reljod/Python-Data-Scraping-IMDb-Movie-site-using-BeautifulSoup-Series-1-

resource: https://www.imdb.com/search/title/?title_type=feature,tv_series&view=advanced