In [1]:
import os
import re
import time
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as soup

![](https://miro.medium.com/max/1400/1*1QcqrOoDE1rKa0NTp1iEtw.png)

<a href = "https://github.com/Ankitkalauni/Movie-Recommendation-System"><center>IMDB Website Web Scraping for College Project</center></a>

In [2]:
# Column-oriented accumulator: one independent list per scraped field,
# filled in place while the result pages are walked.
movie_info = {
    field: []
    for field in ('movie_title', 'genre', 'director', 'cast')
}

# Number of result pages processed so far (shared by the scraping cells).
counter = 0

![](https://miro.medium.com/max/875/1*VWezDbLoABPoRo1q8_ACrQ.png)

In [3]:
def get_movies(url, max_pages=3000):
    '''
    Fetch one IMDb search-results page and extract its movie entries.

    Inputs:
        url: link to the webpage.
        max_pages: value of the global `counter` at which pagination stops
            (the notebook runs out of memory beyond that point).

    Returns:
        movies: list of the 'lister-item-content' div tags on the page; empty
            when the page has no 'lister-list' container.
        next_: absolute link of the next page to scrape, or None when the page
            has no next-page link or the page cap has been reached.

    Raises:
        requests.RequestException: the request failed, timed out, or came back
            with an HTTP error status.

    Credit: @ankitkalauni
    '''
    time.sleep(0.01)  # short pause between consecutive requests
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    file = soup(response.text, 'html.parser')

    # Container holding every movie entry of the page; absent on empty pages.
    block = file.find('div', class_ = 'lister-list')
    movies = block.find_all('div', class_ = 'lister-item-content') if block is not None else []

    # The pagination bar carries the "next" anchor; either may be missing.
    nav = file.find('div', class_ = 'desc')
    link = nav.find('a', class_ = 'lister-page-next next-page') if nav is not None else None
    href = link.get('href') if link is not None else None

    if not href:
        # No further page: echo the URL where pagination ended, as before.
        print(url)
        return movies, None

    if counter >= max_pages:
        return movies, None

    # urljoin avoids the doubled slash that plain concatenation produced
    # ("https://www.imdb.com//search/...").
    return movies, urljoin("https://www.imdb.com/", href)

In [4]:
import time
import os

def _linked_names(fragment):
    '''
    Collect the text of every "/name/" link inside an HTML fragment.

    Returns the names joined with ', ' as a plain str, or np.nan when the
    fragment contains no such link.
    '''
    links = soup(fragment, 'html.parser').select('a[href^="/name/"]')
    names = [link.get_text().strip() for link in links]
    return ', '.join(names) if names else np.nan


def loop_moives(movies, next_page):
    '''
    Extract title, genre, director(s) and cast from each movie tag and append
    them to the global `movie_info` lists. Fields that cannot be found are
    recorded as np.nan.

    Inputs:
        movies: list of movies to traverse the child's siblings content.
        next_page: link to the next page, if None the scraping will stop.

    Returns:
        next_page: the link to the next page to scrape, passed through
            unchanged; if None the scrape will stop.

    Credit: @ankitkalauni
    '''
    for movie in movies:

        ##==================================================================
        # Title: get_text() yields a plain str. Storing the NavigableString
        # returned by `.string` would keep the whole page's parse tree alive
        # for as long as `movie_info` exists.
        title_links = movie.select('a[href^="/title/"]')
        movie_name = title_links[0].get_text() if title_links else np.nan

        ##==================================================================
        # Genre list, e.g. "Action, Sci-Fi".
        genre_block = movie.find('p', class_ = 'text-muted')
        genre_tag = genre_block.find('span', class_ = 'genre') if genre_block is not None else None
        genre = genre_tag.get_text().strip() if genre_tag is not None else np.nan

        ##==================================================================
        # Directors and cast share one paragraph: links before the literal
        # 'Stars' are treated as directors, links after it as cast.
        # NOTE(review): entries whose label is not exactly 'Stars' (e.g. a
        # singular form) end up with NaN cast -- confirm against the markup.
        credits = movie.find('p', class_ = '')
        if credits is None:
            all_directors = all_cast = np.nan
        else:
            parts = str(credits).split('Stars')
            all_directors = _linked_names(parts[0])
            all_cast = _linked_names(parts[1]) if len(parts) > 1 else np.nan

        ##==================================================================

        movie_info['movie_title'].append(movie_name)
        movie_info['genre'].append(genre)
        movie_info['director'].append(all_directors)
        movie_info['cast'].append(all_cast)

    # Single-line progress indicator (carriage return rewrites the line).
    print(f'\r Done with Page {counter}', end = ' ')
    return next_page

## Version Control:
v5 -> 2021 Hollywood Movies (2.2k)

v8 -> All Indian/Bollywood Movies (140k)

v9 -> Feature Movies from the range 2010–2020

In [5]:
%%time

# first_link = "https://www.imdb.com/search/title/?year=2021&title_type=feature&" #top hollywood movies 2021 version 5
# first_link = "https://www.imdb.com/search/title/?country_of_origin=IN" #140k Indian Movies version 9

# Release years to scrape, one pagination walk per year.
movies_year = list(range(1999, 2010))

for year in movies_year:
    # First results page of feature films for this release year.
    first_link = f"https://www.imdb.com/search/title/?year={year}&title_type=feature&"

    print(f"Started Collecting Movies of year: {year}")
    while first_link:
        movies_list, next_page = get_movies(first_link)
        first_link = loop_moives(movies_list, next_page)
        if next_page is None:
            # Last page of this year: move on without counting another page.
            break
        counter += 1

Started Collecting Movies of year: 1999
 Done with Page 78 https://www.imdb.com//search/title/?title_type=feature&year=1999-01-01,1999-12-31&start=3951
 Done with Page 79 Started Collecting Movies of year: 2000
 Done with Page 159 https://www.imdb.com//search/title/?title_type=feature&year=2000-01-01,2000-12-31&start=4051
 Done with Page 160 Started Collecting Movies of year: 2001
 Done with Page 244 https://www.imdb.com//search/title/?title_type=feature&year=2001-01-01,2001-12-31&start=4251
 Done with Page 245 Started Collecting Movies of year: 2002
 Done with Page 331 https://www.imdb.com//search/title/?title_type=feature&year=2002-01-01,2002-12-31&start=4351
 Done with Page 332 Started Collecting Movies of year: 2003
 Done with Page 417 https://www.imdb.com//search/title/?title_type=feature&year=2003-01-01,2003-12-31&start=4301
 Done with Page 418 Started Collecting Movies of year: 2004
 Done with Page 507 https://www.imdb.com//search/title/?title_type=feature&year=2004-01-01,2004-1

In [6]:
# Turn the per-column lists into a table and report its (rows, columns) size.
movies = pd.DataFrame(movie_info)
print(movies.shape)

# NOTE(review): the file name says 2010-2020, but `movies_year` in the scraping
# cell covers 1999-2009 -- confirm which one is intended before reusing the file.
movies.to_csv('Movies_2010_2020.csv', index=False)

(56314, 4)


In [7]:
# Count the missing values in each column.
movies.isna().sum()

movie_title       0
genre          6305
director          0
cast           4838
dtype: int64

In [8]:
# Preview the scraped table (the cell's last expression is rendered as its output).
movies

Unnamed: 0,movie_title,genre,director,cast
0,The Matrix,"Action, Sci-Fi","Lana Wachowski, Lilly Wachowski","Keanu Reeves, Laurence Fishburne, Carrie-Anne ..."
1,Fight Club,Drama,David Fincher,"Brad Pitt, Edward Norton, Meat Loaf, Zach Grenier"
2,The Green Mile,"Crime, Drama, Fantasy",Frank Darabont,"Tom Hanks, Michael Clarke Duncan, David Morse,..."
3,"Girl, Interrupted","Biography, Drama",James Mangold,"Winona Ryder, Angelina Jolie, Clea DuVall, Bri..."
4,Eyes Wide Shut,"Drama, Mystery, Thriller",Stanley Kubrick,"Tom Cruise, Nicole Kidman, Todd Field, Sydney ..."
...,...,...,...,...
56309,Alor Thikana,Drama,Bidesh Sarkar,"Anjana Basu, Swantana Basu, Jagannath Guha, Ra..."
56310,"Dongbei, dongbei",Drama,Zou Peng,"Wu Rui-Peng, Yi-Wen Tian, Liu Xing-Ping"
56311,homo gamer,Fantasy,Stepan Grusha,"Inessa Perelygina-Vladimirova, Ilya Shidlovsky..."
56312,Czeski film,"Action, Comedy",Lukasz Jedynasty,"Adrian Jarocki, Kacper Kozicki, Katarzyna Szum..."


## Resources for learning web scraping with Beautiful Soup 4 (BS4):

1. [Web Scraping Cheat Sheet (2021), Python for Web Scraping](https://medium.com/geekculture/web-scraping-cheat-sheet-2021-python-for-web-scraping-cad1540ce21c)

2. [Beautiful Soup Documentation](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)

3. [StackOverflow](https://stackoverflow.com/)

4. [Beautiful Soup 4 Tutorial - Tech with Tim](https://www.youtube.com/watch?v=gRLHr664tXA&list=PLzMcBGfZo4-lSq2IDrA6vpZEV92AmQfJK)