## Import Modules

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

## Request page source from URL

In [2]:
# Address of the IMDb Top 250 chart page that the notebook requests below.
url = "https://www.imdb.com/chart/top/"

In [3]:
# HTTP headers sent with the request. The User-Agent mimics a mobile browser
# (presumably so the site does not reject the default client string -- confirm).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148',
}

In [4]:
# Fetch the chart page.
# - timeout: without one, requests waits indefinitely on an unresponsive server
#   and the cell never finishes.
# - raise_for_status: turns a 4xx/5xx reply into an exception here, instead of
#   letting an error page flow into the parsing cells below.
page = requests.get(url, headers=HEADERS, timeout=30)
page.raise_for_status()
page

<Response [200]>

In [5]:
# Preview the beginning of the raw response body. Only a short slice is shown:
# displaying the complete HTML document would flood the notebook output.
page.content[:500]



In [6]:
# Parse the response body with Python's built-in HTML parser.
soup = BeautifulSoup(page.content, "html.parser")

# Show only the first part of the prettified markup; printing the whole
# document would bury the rest of the notebook under page source.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html lang="en-US" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width" name="viewport"/>
  <script>
   if(typeof uet === 'function'){ uet('bb', 'LoadTitle', {wb: 1}); }
  </script>
  <script>
   window.addEventListener('load', (event) => {
        if (typeof window.csa !== 'undefined' && typeof window.csa === 'function') {
            var csaLatencyPlugin = window.csa('Content', {
                element: {
                    slotId: 'LoadTitle',
                    type: 'service-call'
                }
            });
            csaLatencyPlugin('mark', 'clickToBodyBegin', 1714391579523);
        }
    })
  </script>
  <title>
   IMDb Top 250 Movies
  </title>
  <meta content="As rated by regular IMDb voters." data-id="main" name="description"/>
  <meta content="IMDb" property="og:site_name"/>
  <meta content="IMDb Top 250 Movies" property="og:title"/>
  

In [7]:
# Scrape the movie titles: collect every <div> that carries the
# 'ipc-title-link-no-icon' CSS class (one per chart entry in the output below).
scraped_movies = soup.find_all(name='div', attrs={'class': 'ipc-title-link-no-icon'})
scraped_movies

[<div class="ipc-title ipc-title--base ipc-title--title ipc-title-link-no-icon ipc-title--on-textPrimary sc-b189961a-9 iALATN cli-title"><a class="ipc-title-link-wrapper" href="/title/tt0111161/?ref_=chttp_t_1" tabindex="0"><h3 class="ipc-title__text">1. Les Évadés</h3></a></div>,
 <div class="ipc-title ipc-title--base ipc-title--title ipc-title-link-no-icon ipc-title--on-textPrimary sc-b189961a-9 iALATN cli-title"><a class="ipc-title-link-wrapper" href="/title/tt0068646/?ref_=chttp_t_2" tabindex="0"><h3 class="ipc-title__text">2. Le Parrain</h3></a></div>,
 <div class="ipc-title ipc-title--base ipc-title--title ipc-title-link-no-icon ipc-title--on-textPrimary sc-b189961a-9 iALATN cli-title"><a class="ipc-title-link-wrapper" href="/title/tt0468569/?ref_=chttp_t_3" tabindex="0"><h3 class="ipc-title__text">3. The Dark Knight : Le Chevalier noir</h3></a></div>,
 <div class="ipc-title ipc-title--base ipc-title--title ipc-title-link-no-icon ipc-title--on-textPrimary sc-b189961a-9 iALATN cli

In [8]:
# Reduce each title container to its visible text (e.g. '1. Les Évadés'):
# newlines are removed first, then leading/trailing spaces are trimmed.
movies = [
    title_tag.get_text().replace('\n', "").strip(" ")
    for title_tag in scraped_movies
]
movies

['1. Les Évadés',
 '2. Le Parrain',
 '3. The Dark Knight : Le Chevalier noir',
 '4. Le Parrain, 2ᵉ partie',
 '5. 12 Hommes en colère',
 '6. La Liste de Schindler',
 '7. Le Seigneur des anneaux : Le Retour du roi',
 '8. Pulp Fiction',
 "9. Le Seigneur des anneaux : La Communauté de l'anneau",
 '10. Le Bon, la Brute et le Truand',
 '11. Forrest Gump',
 '12. Le Seigneur des anneaux : Les Deux Tours',
 '13. Fight Club',
 '14. Inception',
 "15. L'Empire contre-attaque",
 '16. Matrix',
 '17. Les affranchis',
 "18. Vol au-dessus d'un nid de coucou",
 '19. Seven',
 '20. Dune: Deuxième partie',
 '21. Interstellar',
 '22. La vie est belle',
 '23. Les 7 Samouraïs',
 '24. Le Silence des agneaux',
 '25. Il faut sauver le soldat Ryan',
 '26. La Cité de Dieu',
 '27. La vie est belle',
 '28. La Ligne verte',
 '29. Terminator 2 : Le Jugement dernier',
 '30. Star Wars: Épisode IV - Un nouvel espoir',
 '31. Retour vers le futur',
 '32. Le Voyage de Chihiro',
 '33. Le Pianiste',
 '34. Parasite',
 '35. Psy

In [9]:


# Strip the leading chart position ("1. ", "2. ", ...) from every title.
# The pattern is anchored at the start, so digits inside a title are untouched.
pattern = r'^\d+\.\s'
regex_movies = [re.sub(pattern, '', ranked_title) for ranked_title in movies]
regex_movies

['Les Évadés',
 'Le Parrain',
 'The Dark Knight : Le Chevalier noir',
 'Le Parrain, 2ᵉ partie',
 '12 Hommes en colère',
 'La Liste de Schindler',
 'Le Seigneur des anneaux : Le Retour du roi',
 'Pulp Fiction',
 "Le Seigneur des anneaux : La Communauté de l'anneau",
 'Le Bon, la Brute et le Truand',
 'Forrest Gump',
 'Le Seigneur des anneaux : Les Deux Tours',
 'Fight Club',
 'Inception',
 "L'Empire contre-attaque",
 'Matrix',
 'Les affranchis',
 "Vol au-dessus d'un nid de coucou",
 'Seven',
 'Dune: Deuxième partie',
 'Interstellar',
 'La vie est belle',
 'Les 7 Samouraïs',
 'Le Silence des agneaux',
 'Il faut sauver le soldat Ryan',
 'La Cité de Dieu',
 'La vie est belle',
 'La Ligne verte',
 'Terminator 2 : Le Jugement dernier',
 'Star Wars: Épisode IV - Un nouvel espoir',
 'Retour vers le futur',
 'Le Voyage de Chihiro',
 'Le Pianiste',
 'Parasite',
 'Psychose',
 'Spider-Man: Across the Spider-Verse',
 'Gladiator',
 'Le Roi lion',
 'Léon',
 'Les Infiltrés',
 'American History X',
 'W

In [10]:
# Sanity check: number of titles that were scraped from the chart page.
len(movies)

250

In [11]:
# Scrape the rating badges: collect every <span> that carries the
# 'ipc-rating-star--imdb' CSS class (score plus vote count, see output below).
scraped_ratings = soup.find_all(name='span', attrs={'class': 'ipc-rating-star--imdb'})
scraped_ratings

[<span aria-label="IMDb rating: 9.3" class="ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating" data-testid="ratingGroup--imdb-rating"><svg class="ipc-icon ipc-icon--star-inline" fill="currentColor" height="24" role="presentation" viewbox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg"><path d="M12 20.1l5.82 3.682c1.066.675 2.37-.322 2.09-1.584l-1.543-6.926 5.146-4.667c.94-.85.435-2.465-.799-2.567l-6.773-.602L13.29.89a1.38 1.38 0 0 0-2.581 0l-2.65 6.53-6.774.602C.052 8.126-.453 9.74.486 10.59l5.147 4.666-1.542 6.926c-.28 1.262 1.023 2.26 2.09 1.585L12 20.099z"></path></svg>9.3<span class="ipc-rating-star--voteCount"> (<!-- -->2.9M<!-- -->)</span></span>,
 <span aria-label="IMDb rating: 9.2" class="ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating" data-testid="ratingGroup--imdb-rating"><svg class="ipc-icon ipc-icon--star-inline" fill="currentColor" height="24" role="presentation" viewbox="0 0 24 24" width="2

In [12]:
# Reduce each rating badge to its visible text with newlines removed,
# e.g. '9.3\xa0(2.9M)' -- the score followed by the vote count.
ratings = [badge.get_text().replace('\n', '') for badge in scraped_ratings]
ratings

['9.3\xa0(2.9M)',
 '9.2\xa0(2M)',
 '9.0\xa0(2.9M)',
 '9.0\xa0(1.4M)',
 '9.0\xa0(865K)',
 '9.0\xa0(1.5M)',
 '9.0\xa0(2M)',
 '8.9\xa0(2.2M)',
 '8.9\xa0(2M)',
 '8.8\xa0(811K)',
 '8.8\xa0(2.3M)',
 '8.8\xa0(1.8M)',
 '8.8\xa0(2.3M)',
 '8.8\xa0(2.6M)',
 '8.7\xa0(1.4M)',
 '8.7\xa0(2.1M)',
 '8.7\xa0(1.3M)',
 '8.7\xa0(1.1M)',
 '8.6\xa0(1.8M)',
 '8.7\xa0(376K)',
 '8.7\xa0(2.1M)',
 '8.6\xa0(499K)',
 '8.6\xa0(367K)',
 '8.6\xa0(1.5M)',
 '8.6\xa0(1.5M)',
 '8.6\xa0(801K)',
 '8.6\xa0(743K)',
 '8.6\xa0(1.4M)',
 '8.6\xa0(1.2M)',
 '8.6\xa0(1.4M)',
 '8.5\xa0(1.3M)',
 '8.6\xa0(849K)',
 '8.5\xa0(912K)',
 '8.5\xa0(959K)',
 '8.5\xa0(718K)',
 '8.6\xa0(369K)',
 '8.5\xa0(1.6M)',
 '8.5\xa0(1.1M)',
 '8.5\xa0(1.2M)',
 '8.5\xa0(1.4M)',
 '8.5\xa0(1.2M)',
 '8.5\xa0(989K)',
 '8.5\xa0(1.4M)',
 '8.5\xa0(310K)',
 '8.6\xa0(69K)',
 '8.5\xa0(1.1M)',
 '8.5\xa0(606K)',
 '8.5\xa0(929K)',
 '8.5\xa0(283K)',
 '8.5\xa0(259K)',
 '8.5\xa0(522K)',
 '8.5\xa0(349K)',
 '8.5\xa0(951K)',
 '8.5\xa0(196K)',
 '8.5\xa0(1.7M)',
 '8.4\xa0(710K)',

In [13]:
# Pull the score out of each rating string, e.g. '9.3\xa0(2.9M)' -> '9.3'.
#
# The pattern is anchored to the start of the text (optional leading
# whitespace allowed) and the fractional part is optional. An unanchored
# r'\d+\.\d+' would silently return the vote count ('2.9' from '(2.9M)')
# if a score were ever rendered without a decimal part.
regex_ratings = []
pattern = r'^\s*(\d+(?:\.\d+)?)'
for rating_text in ratings:
    match = re.search(pattern, rating_text)
    if match is None:
        # Fail loudly with the offending text rather than with an opaque
        # AttributeError from calling .group() on None.
        raise ValueError(f"Could not find a rating in {rating_text!r}")
    regex_ratings.append(match.group(1))

regex_ratings

['9.3',
 '9.2',
 '9.0',
 '9.0',
 '9.0',
 '9.0',
 '9.0',
 '8.9',
 '8.9',
 '8.8',
 '8.8',
 '8.8',
 '8.8',
 '8.8',
 '8.7',
 '8.7',
 '8.7',
 '8.7',
 '8.6',
 '8.7',
 '8.7',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.5',
 '8.6',
 '8.5',
 '8.5',
 '8.5',
 '8.6',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.6',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.4',
 '8.4',
 '9.0',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.3',
 '8.3',
 '8.4',
 '8.4',
 '8.3',
 '8.4',
 '8.4',
 '8.3',
 '8.4',
 '8.3',
 '8.3',
 '8.4',
 '8.3',
 '8.4',
 '8.4',
 '8.3',
 '8.4',
 '8.3',
 '8.4',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.2',
 '8.2',
 '8.3',
 '8.3',
 '8.2',
 '8.3',
 '8.2',
 '8.2',
 '8.3',


## Store the Scraped Data

In [14]:
# Titles and ratings come from two independent scrapes and are paired purely
# by position, so refuse to build the table if the lists fell out of step.
if len(regex_movies) != len(regex_ratings):
    raise ValueError(
        f"Scraped {len(regex_movies)} titles but {len(regex_ratings)} ratings; "
        "cannot pair them by position"
    )

# Build the frame in one step. Ratings are converted from text to numbers so
# the column can be sorted, filtered and aggregated correctly.
data = pd.DataFrame({
    'Movie Names': regex_movies,
    'Ratings': pd.to_numeric(regex_ratings),
})
data.head()

Unnamed: 0,Movie Names,Ratings
0,Les Évadés,9.3
1,Le Parrain,9.2
2,The Dark Knight : Le Chevalier noir,9.0
3,"Le Parrain, 2ᵉ partie",9.0
4,12 Hommes en colère,9.0


In [15]:
# Persist the scraped table as a CSV file in the working directory; the
# DataFrame's positional index is left out of the file.
csv_filename = 'IMDB Top Movies.csv'
data.to_csv(csv_filename, index=False)