In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Request headers for the exploratory request below: a desktop Firefox
# User-Agent plus an English Accept-Language preference.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) "
        "Gecko/20100101 Firefox/119.0"
    ),
    "Accept-Language": "en-US, en;q=0.5",
}

In [3]:
# URL of the TMDb movie listing page
tmdb_movies_url = "https://www.themoviedb.org/movie"

In [4]:
# Download the listing page. Without a timeout `requests` waits indefinitely,
# so a stalled connection would hang the kernel.
response = requests.get(tmdb_movies_url, headers=headers, timeout=30)

In [5]:
# Check if the request was successful: display the HTTP status code of the
# response (200 is the value the functions below treat as success).
response.status_code


200

In [6]:
# Parse the downloaded bytes into a BeautifulSoup tree using Python's
# built-in HTML parser.
doc = BeautifulSoup(response.content, features='html.parser')

In [18]:
def get_movies_page(url, timeout=30):
    """
    Download a web page using `requests` and parse its HTML with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to `requests.get`.
        Defaults to 30 so that a stalled connection cannot block forever.

    Returns
    -------
    BeautifulSoup
        The parsed document.

    Raises
    ------
    Exception
        If the response status code is anything other than 200.
    requests.exceptions.RequestException
        Propagated from `requests.get` (connection errors, timeouts, ...).
    """
    # Browser-like headers are sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/119.0',
        'Accept-Language': 'en-US, en;q=0.5',
    }

    # Access the webpage using `requests`
    response = requests.get(url, headers=headers, timeout=timeout)

    # Check if the request was successful
    if response.status_code != 200:
        raise Exception(f'Failed to load page {url}')

    # Parse the `response` text using BeautifulSoup
    return BeautifulSoup(response.text, 'html.parser')


In [8]:
# Skip the first 4 <h2> tags; the text of the <a> inside each remaining
# one is collected as a movie name.
movies_names_tags = doc.find_all('h2')[4:]
names = [heading.a.text.strip() for heading in movies_names_tags]
print(names)

['Leo', 'Trolls Band Together', 'Oppenheimer', "Five Nights at Freddy's", 'The Creator', 'Expend4bles', 'Fast X', 'Mission: Impossible - Dead Reckoning Part One', 'Hell House LLC Origins: The Carmichael Manor', 'Mousa', 'Napoleon', 'Reign of Chaos', 'The Super Mario Bros. Movie', 'The Collective', 'Princess Khutulun', 'The Hunger Games: The Ballad of Songbirds & Snakes', 'The Equalizer 3', 'The Marvels', 'Blue Beetle', 'Cobweb']


In [9]:
# Similarly, we can extract the movie links.

links = [heading.a['href'] for heading in movies_names_tags]
print(links)

['/movie/1075794', '/movie/901362', '/movie/872585', '/movie/507089', '/movie/670292', '/movie/299054', '/movie/385687', '/movie/575264', '/movie/1035982', '/movie/775244', '/movie/753342', '/movie/951546', '/movie/502356', '/movie/1060090', '/movie/960481', '/movie/695721', '/movie/926393', '/movie/609681', '/movie/565770', '/movie/709631']


In [10]:
# Let's create functions to extract the movies names and movie URLs.

def get_movies_names(doc):
    """
    Collect the movie names found in a parsed listing page.

    Every `h2` tag of `doc` except the first four is used; for each one the
    whitespace-stripped text of its `a` child is returned, in page order.
    """
    title_headings = doc.find_all('h2')[4:]  # the first 4 <h2> tags are skipped
    return [heading.a.text.strip() for heading in title_headings]

In [11]:
def get_movies_rating(doc):
    """
    Collect the user ratings found in a parsed listing page.

    Looks up every `div` whose class is `user_score_chart` and returns the
    value of each one's `data-percent` attribute, unchanged, in page order.
    """
    score_divs = doc.find_all('div', {'class': 'user_score_chart'})
    return [score_div.attrs['data-percent'] for score_div in score_divs]

In [12]:
def get_movies_urls(doc):
    """
    Build the absolute URL of every movie found in a parsed listing page.

    Every `h2` tag of `doc` except the first four is used; the `href` of its
    `a` child is appended to the site root 'https://www.themoviedb.org'.
    """
    site_root = 'https://www.themoviedb.org'
    title_headings = doc.find_all('h2')[4:]  # the first 4 <h2> tags are skipped
    return [site_root + heading.a['href'] for heading in title_headings]

In [13]:
# Let's read a movie page
def get_detailed_movie_page(movies_url):
    """
    Download a single movie page and parse its HTML with BeautifulSoup.

    Delegates to `get_movies_page`, so the request is made the same way as
    every other page download in this notebook. Previously this function
    called `requests.get(movies_url)` with no headers at all.

    Parameters
    ----------
    movies_url : str
        Address of the movie page to download.

    Returns
    -------
    BeautifulSoup
        The parsed movie page.

    Raises
    ------
    Exception
        'Failed to load page <url>' when the status code is not 200
        (raised by `get_movies_page`).
    """
    return get_movies_page(movies_url)

In [14]:
def _split_facts_tokens(tokens):
    """
    Split the whitespace-separated tokens of the `facts` block into
    (release_date, genre_tokens, runtime_tokens).
    """
    def looks_like_date(token):
        # Matches the slash-separated dates seen in this notebook's output,
        # e.g. '11/17/2023'.
        return token.count('/') == 2 and token.replace('/', '').isdigit()

    def looks_like_runtime(token):
        # Matches runtime pieces such as '1h' or '42m'.
        return len(token) > 1 and token[-1] in 'hm' and token[:-1].isdigit()

    date_index = next(
        (i for i, token in enumerate(tokens) if looks_like_date(token)),
        None,
    )
    if date_index is None:
        # No recognisable date: keep the original fixed-position behaviour
        # (token 1 is the date, the last two tokens are the runtime).
        return tokens[1], tokens[3:-2], tokens[-2:]

    remainder = tokens[date_index + 1:]
    # The original code always skipped the token after the date; presumably
    # that is a parenthesised country marker such as '(US)' -- TODO confirm.
    # It is dropped only when it actually has that shape.
    if remainder and remainder[0].startswith('(') and remainder[0].endswith(')'):
        remainder = remainder[1:]

    # Peel runtime tokens off the end; everything in between is genre.
    split_at = len(remainder)
    while split_at > 0 and looks_like_runtime(remainder[split_at - 1]):
        split_at -= 1

    return tokens[date_index], remainder[:split_at], remainder[split_at:]


def get_movies_info(doc):
    """
    Function to get the movie information -
    release date, genre, runtime and director.

    The release date, genre and runtime are read from the text of the
    `div` with class `facts`. The date and runtime tokens are located by
    their shape instead of by fixed position, so a missing leading token or
    a one-token runtime (e.g. '45m') no longer shifts the other fields.

    Parameters
    ----------
    doc : BeautifulSoup
        Parsed movie page.

    Returns
    -------
    tuple
        (release_date, genre, runtime, director) where `release_date` and
        `director` are strings and `genre` / `runtime` are lists of
        whitespace-separated tokens (callers join them with a space).

    Raises
    ------
    AttributeError
        If the page has no `div` with class `facts`.
    IndexError
        If no matching scroller `div` is found, or if the facts text has no
        date-shaped token and fewer than two tokens.
    """
    facts_tag = doc.find('div', class_='facts')
    release_date, genre, runtime = _split_facts_tokens(facts_tag.text.split())

    scroller_tags = doc.find_all('div', {'class': 'scroller_wrap should_fade is_fading'})
    # First line of text in the first matching scroller.
    # NOTE(review): the sample output in this notebook shows e.g.
    # 'Cillian Murphy' for Oppenheimer, so this looks like the first-billed
    # cast member rather than the director -- confirm the selector.
    director = scroller_tags[0].text.strip().partition("\n")[0]

    return release_date, genre, runtime, director

In [50]:
def get_movies_info_1(doc):
    """
    Function to get the movie information -
    release date, genre, runtime, director and overview.

    The first four values come from `get_movies_info`, so the two functions
    share one parsing implementation and cannot drift apart. The overview is
    the raw text of the `div` with class `overview`; it is not stripped, so
    surrounding newlines are kept.

    Parameters
    ----------
    doc : BeautifulSoup
        Parsed movie page.

    Returns
    -------
    tuple
        (release_date, genre, runtime, director, overview).

    Raises
    ------
    AttributeError
        If the page has no `div` with class `overview` (plus whatever
        `get_movies_info` raises for its own lookups).
    """
    release_date, genre, runtime, director = get_movies_info(doc)

    overview_tag = doc.find('div', class_='overview')
    overview = overview_tag.text

    return release_date, genre, runtime, director, overview

In [52]:
def get_all_movies_details_1(urls):
    """
    Visit every movie page in `urls` and gather its details column by column.

    Returns five parallel lists, in this order: genres, release dates,
    runtimes, directors and overviews. Genre and runtime tokens are joined
    with a single space.
    """
    rows = []
    for movie_url in urls:
        page = get_movies_page(movie_url)
        # get_movies_info_1 returns release_date, genre, runtime, director, overview.
        date, genre_tokens, runtime_tokens, director_name, summary = get_movies_info_1(page)
        rows.append((
            " ".join(genre_tokens),
            date,
            " ".join(runtime_tokens),
            director_name,
            summary,
        ))

    # Turn the per-movie rows into one list per attribute.
    genres = [row[0] for row in rows]
    release_dates = [row[1] for row in rows]
    runtimes = [row[2] for row in rows]
    directors = [row[3] for row in rows]
    overviews = [row[4] for row in rows]
    return genres, release_dates, runtimes, directors, overviews

In [46]:
def get_all_movies_details(urls):
    """
    Visit every movie page in `urls` and gather its details column by column.

    Returns four parallel lists, in this order: genres, release dates,
    runtimes and directors. Genre and runtime tokens are joined with a
    single space.
    """
    rows = []
    for movie_url in urls:
        page = get_movies_page(movie_url)
        # get_movies_info returns release_date, genre, runtime, director.
        date, genre_tokens, runtime_tokens, director_name = get_movies_info(page)
        rows.append((
            " ".join(genre_tokens),
            date,
            " ".join(runtime_tokens),
            director_name,
        ))

    # Turn the per-movie rows into one list per attribute.
    genres = [row[0] for row in rows]
    release_dates = [row[1] for row in rows]
    runtimes = [row[2] for row in rows]
    directors = [row[3] for row in rows]
    return genres, release_dates, runtimes, directors

In [53]:
def scrape_movies1(num_pages=7):
    """
    Scrape the TMDb movie listing into a DataFrame, including each movie's
    overview.

    For every listing page the function collects the movie names, ratings and
    URLs, then visits each movie page for its genre, release date, runtime,
    director and overview.

    Each listing page is now downloaded once, through `get_movies_page`;
    previously it was fetched twice and the first response was discarded.

    Parameters
    ----------
    num_pages : int, optional
        Number of listing pages to scrape, starting at page 1. The default
        of 7 matches the original loop (`page_count < 8`, i.e. pages 1-7).

    Returns
    -------
    pandas.DataFrame
        One row per movie with the columns name, rating, genre,
        release_date, runtime, director, overview and url.

    Raises
    ------
    Exception
        'Failed to load page <url>' when a page does not return status 200
        (raised by `get_movies_page`).
    """
    # Define lists for all the movie attributes
    all_names = []
    all_ratings = []
    all_genres = []
    all_release_dates = []
    all_runtimes = []
    all_directors = []
    all_overviews = []
    all_urls = []

    for page_number in range(1, num_pages + 1):
        movies_url = "https://www.themoviedb.org/movie?page=%d" % page_number

        # Download and parse the listing page (single request per page)
        doc = get_movies_page(movies_url)

        urls = get_movies_urls(doc)
        genres, release_dates, runtimes, directors, overviews = get_all_movies_details_1(urls)

        # Append each movie attribute to respective lists
        all_names += get_movies_names(doc)
        all_ratings += get_movies_rating(doc)
        all_genres += genres
        all_release_dates += release_dates
        all_runtimes += runtimes
        all_directors += directors
        all_overviews += overviews
        all_urls += urls

    # Defining a dictionary to store the movie information
    movies_dict = {
        'name': all_names,
        'rating': all_ratings,
        'genre': all_genres,
        'release_date': all_release_dates,
        'runtime': all_runtimes,
        'director': all_directors,
        'overview': all_overviews,
        'url': all_urls
    }
    return pd.DataFrame(movies_dict)

In [19]:
def scrape_movies(num_pages=7):
    """
    Scrape the TMDb movie listing into a DataFrame.

    For every listing page the function collects the movie names, ratings and
    URLs, then visits each movie page for its genre, release date, runtime
    and director.

    Each listing page is now downloaded once, through `get_movies_page`;
    previously it was fetched twice and the first response was discarded.

    Parameters
    ----------
    num_pages : int, optional
        Number of listing pages to scrape, starting at page 1. The default
        of 7 matches the original loop (`page_count < 8`, i.e. pages 1-7).

    Returns
    -------
    pandas.DataFrame
        One row per movie with the columns name, rating, genre,
        release_date, runtime, director and url.

    Raises
    ------
    Exception
        'Failed to load page <url>' when a page does not return status 200
        (raised by `get_movies_page`).
    """
    # Define lists for all the movie attributes
    all_names = []
    all_ratings = []
    all_genres = []
    all_release_dates = []
    all_runtimes = []
    all_directors = []
    all_urls = []

    for page_number in range(1, num_pages + 1):
        movies_url = "https://www.themoviedb.org/movie?page=%d" % page_number

        # Download and parse the listing page (single request per page)
        doc = get_movies_page(movies_url)

        urls = get_movies_urls(doc)
        genres, release_dates, runtimes, directors = get_all_movies_details(urls)

        # Append each movie attribute to respective lists
        all_names += get_movies_names(doc)
        all_ratings += get_movies_rating(doc)
        all_genres += genres
        all_release_dates += release_dates
        all_runtimes += runtimes
        all_directors += directors
        all_urls += urls

    # Defining a dictionary to store the movie information
    movies_dict = {
        'name': all_names,
        'rating': all_ratings,
        'genre': all_genres,
        'release_date': all_release_dates,
        'runtime': all_runtimes,
        'director': all_directors,
        'url': all_urls
    }
    return pd.DataFrame(movies_dict)

In [54]:
# Run the scraper variant that also collects the overview (`scrape_movies1`)
# and keep the resulting DataFrame in `movies_df`.
movies_df = scrape_movies1()
movies_df.head() # View the first few rows of the output

Unnamed: 0,name,rating,genre,release_date,runtime,director,overview,url
0,Leo,76.35,"Animation, Comedy, Family",11/17/2023,1h 42m,Adam Sandler,\nJaded 74-year-old lizard Leo has been stuck ...,https://www.themoviedb.org/movie/1075794
1,Trolls Band Together,72.33,"Animation, Family, Music, Fantasy, Comedy",11/17/2023,1h 32m,Anna Kendrick,"\nWhen Branch’s brother, Floyd, is kidnapped f...",https://www.themoviedb.org/movie/901362
2,Oppenheimer,81.57,"Drama, History",07/21/2023,3h 1m,Cillian Murphy,\nThe story of J. Robert Oppenheimer's role in...,https://www.themoviedb.org/movie/872585
3,Five Nights at Freddy's,78.49000000000001,"Horror, Mystery",10/27/2023,1h 50m,Josh Hutcherson,"\nRecently fired and desperate for work, a tro...",https://www.themoviedb.org/movie/507089
4,The Creator,71.36999999999999,"Science Fiction, Action, Thriller",09/29/2023,2h 14m,John David Washington,\nAmid a future war between the human race and...,https://www.themoviedb.org/movie/670292


In [44]:
# Full overview text of the row labelled 1
movies_df.loc[1, 'overview']

'\nWhen Branch’s brother, Floyd, is kidnapped for his musical talents by a pair of nefarious pop-star villains, Branch and Poppy embark on a harrowing and emotional journey to reunite the other brothers and rescue Floyd from a fate even worse than pop-culture obscurity.\n'

In [21]:
# Save the dataset to `.csv` format.
# `index` is documented as a bool; `index=None` only worked because it is
# falsy, so pass `False` explicitly to leave the row index out of the file.
movies_df.to_csv('movies.csv', index=False)

In [22]:
# Load the saved CSV back into a DataFrame and preview its first rows
df = pd.read_csv('movies.csv')
df.head()

Unnamed: 0,name,rating,genre,release_date,runtime,director,url
0,Leo,76.62,"Animation, Comedy, Family",11/17/2023,1h 42m,Adam Sandler,https://www.themoviedb.org/movie/1075794
1,Trolls Band Together,72.3,"Animation, Family, Music, Fantasy, Comedy",11/17/2023,1h 32m,Anna Kendrick,https://www.themoviedb.org/movie/901362
2,Oppenheimer,81.55,"Drama, History",07/21/2023,3h 1m,Cillian Murphy,https://www.themoviedb.org/movie/872585
3,Five Nights at Freddy's,78.0,"Horror, Mystery",10/27/2023,1h 50m,Josh Hutcherson,https://www.themoviedb.org/movie/507089
4,The Creator,71.36,"Science Fiction, Action, Thriller",09/29/2023,2h 14m,John David Washington,https://www.themoviedb.org/movie/670292


In [23]:
# (number of rows, number of columns) of the reloaded data
df.shape

(140, 7)