## Import packages

In [21]:
import requests
from bs4 import BeautifulSoup
import time
from time import sleep
import pandas as pd
import numpy as np

url = 'https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start=001&ref_=adv_nxt'
res = requests.get(url)
soup = BeautifulSoup(res.text, "html.parser")

## Create page urls

In [25]:
def generate_page_urls(base_url, num_pages):
    page_urls = []
        
    for counter in range(0, num_pages + 1):
        full_url = base_url + str(counter) + '01&ref_=adv_nxt'
        page_urls.append(full_url)
        
    return page_urls

## Extract data

In [128]:
def extract_movie_data(page_urls):
    movie_list = []
    
    for page_url in page_urls:
        res = requests.get(page_url)
        soup= BeautifulSoup(res.text, "html.parser")
        movies = soup.find_all(class_= "lister-item mode-advanced")
        
        for movie in movies:
            movie_rank = movie.find(class_="lister-item-index unbold text-primary").text
            movie_url = "https://imdb.com" + movie.find("a").attrs["href"]
            movie_title = movie.find('h3').find('a').text
            movie_date = movie.find(class_="lister-item-year text-muted unbold").text
            movie_date = movie_date.replace('(','').replace(')','')
            movie_rating = movie.find('strong').text
            movie_meta = movie.find('span', class_ = 'metascore favorable')
            movie_genre = movie.find(class_="genre").text
            movie_genre = movie_genre.replace('\n','').replace("  ", "")
            movie_runtime = movie.find(class_="runtime").text
            num_votes = movie.find_all('span', attrs={'name': 'nv'})
            number_votes = num_votes[0].text
            movie_grosses = num_votes[1].text if len(num_votes) > 1 else 'N/A'
            director = movie.find('p',class_= '').find_all('a')[0].text
            actor_1 = movie.find('p', class_ = '').find_all('a')[1].text
            actor_2 = movie.find('p', class_ = '').find_all('a')[2].text
            actor_3 = movie.find('p', class_ = '').find_all('a')[3].text
            actor_4 = movie.find('p', class_ = '').find_all('a')[4].text
            summary = movie.find('p').find(class_="text-muted")
            
                    
            movie_list.append({"rank": movie_rank,
                              "movie url": movie_url,
                              "title" : movie_title,
                              "date" : movie_date,
                              "IMDb rating" : movie_rating,
                              "metascore" : movie_meta,
                              "genre" : movie_genre,
                               "director" : director,
                              "actor_1": actor_1,
                              "actor_2" : actor_2,
                              "actor_3" : actor_3,
                              "actor_4" : actor_4,
                              "runtime" : movie_runtime,
                              "votes" : number_votes,
                              "gross" : movie_grosses,
                              "summary" : summary})
            
        #sleep (1)
            
    return movie_list

In [129]:
base_url = 'https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start='
num_pages = 0
page_urls =  generate_page_urls(base_url, num_pages)
movie_list = extract_movie_data(page_urls)
movie_list

[{'rank': '1.',
  'movie url': 'https://imdb.com/title/tt0111161/',
  'title': 'The Shawshank Redemption',
  'date': '1994',
  'IMDb rating': '9.3',
  'metascore': <span class="metascore favorable">80        </span>,
  'genre': 'Drama',
  'director': 'Frank Darabont',
  'actor_1': 'Tim Robbins',
  'actor_2': 'Morgan Freeman',
  'actor_3': 'Bob Gunton',
  'actor_4': 'William Sadler',
  'runtime': '142 min',
  'votes': '2,388,501',
  'gross': '$28.34M',
  'summary': None},
 {'rank': '2.',
  'movie url': 'https://imdb.com/title/tt0068646/',
  'title': 'The Godfather',
  'date': '1972',
  'IMDb rating': '9.2',
  'metascore': <span class="metascore favorable">100        </span>,
  'genre': 'Crime, Drama',
  'director': 'Francis Ford Coppola',
  'actor_1': 'Marlon Brando',
  'actor_2': 'Al Pacino',
  'actor_3': 'James Caan',
  'actor_4': 'Diane Keaton',
  'runtime': '175 min',
  'votes': '1,655,142',
  'gross': '$134.97M',
  'summary': None},
 {'rank': '3.',
  'movie url': 'https://imdb.com/

## Create dataframe

In [137]:
movie_dataframe = pd.DataFrame(movie_list)
movie_dataframe.to_csv("IMDb_top1000_dataframe.csv", index=False)
movie_dataframe

Unnamed: 0,rank,movie url,title,date,IMDb rating,metascore,genre,director,actor_1,actor_2,actor_3,actor_4,runtime,votes,gross,summary
0,1.,https://imdb.com/title/tt0111161/,The Shawshank Redemption,1994,9.3,[80 ],Drama,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,142 min,2388501,$28.34M,
1,2.,https://imdb.com/title/tt0068646/,The Godfather,1972,9.2,[100 ],"Crime, Drama",Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,175 min,1655142,$134.97M,
2,3.,https://imdb.com/title/tt10189514/,Soorarai Pottru,2020,9.1,,Drama,Sudha Kongara,Suriya,Madhavan,Paresh Rawal,Aparna Balamurali,153 min,60614,,
3,4.,https://imdb.com/title/tt0468569/,The Dark Knight,2008,9.0,[84 ],"Action, Crime, Drama",Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,152 min,2351178,$534.86M,
4,5.,https://imdb.com/title/tt0071562/,The Godfather: Part II,1974,9.0,[90 ],"Crime, Drama",Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,202 min,1151039,$57.30M,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96.,https://imdb.com/title/tt0338013/,Eternal Sunshine of the Spotless Mind,2004,8.3,[89 ],"Drama, Romance, Sci-Fi",Michel Gondry,Jim Carrey,Kate Winslet,Tom Wilkinson,Gerry Robert Byrne,108 min,925899,$34.40M,
96,97.,https://imdb.com/title/tt0211915/,Amélie,2001,8.3,[69 ],"Comedy, Romance",Jean-Pierre Jeunet,Audrey Tautou,Mathieu Kassovitz,Rufus,Lorella Cravotta,122 min,712859,$33.23M,
97,98.,https://imdb.com/title/tt0208092/,Snatch,2000,8.3,,"Comedy, Crime",Guy Ritchie,Jason Statham,Brad Pitt,Benicio Del Toro,Dennis Farina,104 min,793411,$30.33M,
98,99.,https://imdb.com/title/tt0180093/,Requiem for a Dream,2000,8.3,[68 ],Drama,Darren Aronofsky,Ellen Burstyn,Jared Leto,Jennifer Connelly,Marlon Wayans,102 min,779048,$3.64M,
