In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
## IMDb "moviemeter" chart page: per the original note, the most viewed movie pages on IMDb
## within a specified period of time; the query string asks for rank-ascending order, simple mode, page 1
url = 'https://www.imdb.com/chart/moviemeter?sort=rk,asc&mode=simple&page=1'

In [3]:
## download the chart page
## timeout: requests waits indefinitely by default, so a stalled connection would hang the notebook
## raise_for_status: stop here on an HTTP error instead of silently parsing an error page later
page = requests.get(url, timeout=30)
page.raise_for_status()
page

<Response [200]>

In [4]:
## uncomment the line below to inspect the raw page source returned by the request
# page.content

In [5]:
## parse the downloaded page body into a searchable tree, using the "html.parser" backend
soup = BeautifulSoup(page.content, "html.parser")

In [6]:
## get movie names
## collects every <td> whose class is 'titleColumn'; the following cells read the
## title link (<a>) and the year span out of each of these cells
movie_name_info = soup.find_all('td', class_='titleColumn')

In [7]:
## tidy movie names
## each title is the text of the cell's <a> link with newlines and surrounding whitespace removed
## only a trailing "(YYYY)" suffix is dropped, so titles that themselves contain
## parentheses (e.g. "(500) Days of Summer") are kept whole instead of being cut at the first "("
movie_names = []
for cell in movie_name_info:
    title = cell.find('a').get_text().replace('\n', "").strip()
    head, paren, tail = title.rpartition("(")
    # tail == "YYYY)" means the text ended with a bracketed four-digit year
    if paren and len(tail) == 5 and tail.endswith(")") and tail[:4].isdecimal():
        title = head.rstrip()
    movie_names.append(title)

In [8]:
## get movie ids
## the id is the piece at index 2 of the link's href once split on "/"
## (presumably hrefs look like /title/<id>/... -- confirm against the page source)
movie_ids = [
    cell.find('a').get("href").split("/")[2]
    for cell in movie_name_info
]

In [9]:
## get year of movie; the brackets must be removed so the value can be converted to int later
## if year isn't available (no year span, or text that is not a plain 4-digit year), we assign None
movie_years = []
for cell in movie_name_info:
    year_span = cell.find('span', class_='secondaryInfo')
    year = None
    if year_span is not None:
        text = year_span.get_text().replace("(", '').replace(")", '').strip()
        # keep only a plain four-digit year; anything else would break the later Int64 conversion
        if len(text) == 4 and text.isdecimal():
            year = text
    movie_years.append(year)

In [10]:
## get movie rating info
## collects every <td> matching the class string 'ratingColumn imdbRating'
## NOTE(review): per the BeautifulSoup docs a multi-word class_ string is compared against the
## exact attribute value, so this depends on the classes appearing in this order -- confirm
movie_rating_info = soup.find_all('td', class_='ratingColumn imdbRating')

In [11]:
## get movie ratings
## the rating is the cell's text with every newline removed; it is still a string at this point
movie_ratings = [cell.get_text().replace("\n", "") for cell in movie_rating_info]

In [12]:
## create dataframe using pandas: one column per scraped list, in ID / Name / Rating / Year order
d = dict(ID=movie_ids, Name=movie_names, Rating=movie_ratings, Year=movie_years)
df = pd.DataFrame(d)

In [13]:
## preview the first 10 rows, still in the order they were scraped from the page
df.head(10)

Unnamed: 0,ID,Name,Rating,Year
0,tt9114286,Black Panther: Wakanda Forever,7.3,2022
1,tt10999120,Spirited,6.6,2022
2,tt9764362,The Menu,7.5,2022
3,tt1596342,Disenchanted,5.8,2022
4,tt17220704,A Christmas Story Christmas,7.0,2022
5,tt13320662,Slumberland,6.7,2022
6,tt9288822,The Wonder,6.7,2022
7,tt1630029,Avatar: The Way of Water,,2022
8,tt10168670,Bones and All,7.3,2022
9,tt6443346,Black Adam,6.8,2022


In [14]:
## change rating column to type float and Year to Int
## blank (or otherwise non-numeric) ratings become NaN through errors='coerce'
## NOTE: Series.replace([''], None) is deliberately avoided -- depending on the pandas version an
## explicit value=None with a list to_replace is ignored and the blanks are forward-filled
## with the previous row's rating instead of becoming missing
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce').astype('float')
df['Year'] = df['Year'].astype('Int64')

In [15]:
## order the rows from highest to lowest rating (rows without a rating go last by pandas' default)
df = df.sort_values('Rating', ascending=False)

In [16]:
## top 10 highest rated movies among the scraped 'hottest' (most viewed) movies
## df was sorted by Rating in descending order above, so head(10) gives the best-rated rows
df.head(10)

Unnamed: 0,ID,Name,Rating,Year
84,tt0111161,The Shawshank Redemption,9.3,1994
64,tt0068646,The Godfather,9.2,1972
48,tt15327088,Kantara,8.9,2022
98,tt0110912,Pulp Fiction,8.9,1994
24,tt15501640,Drishyam 2,8.8,2022
30,tt13833688,The Whale,8.5,2022
31,tt1745960,Top Gun: Maverick,8.4,2022
16,tt14208870,The Fabelmans,8.3,2022
95,tt0993846,The Wolf of Wall Street,8.2,2013
36,tt11813216,The Banshees of Inisherin,8.2,2022
