<a href="https://colab.research.google.com/github/Jahnavi-run/Scraped-Data-to-RDF/blob/main/imdb_scraper_topmovies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMDB Scraper
**Notebook to scrape movies from IMDB, particularly this page: https://m.imdb.com/chart/top/. The script gets 100 movies.**

## Install the required Packages

In [1]:
!pip install scrapy crochet --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m700.9 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.8/259.8 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.9/104.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.6/74.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h

## Importing the required packages.

In [2]:
import scrapy
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from crochet import setup, wait_for
import pandas as pd
setup()

## Scraping using the CSS classes.

In [3]:
class IMDBSpider(scrapy.Spider):
    name = 'imdb_top250_detailed'
    allowed_domains = ['imdb.com']
    start_urls = ['https://www.imdb.com/chart/top/']

    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'ROBOTSTXT_OBEY': True,
        'CONCURRENT_REQUESTS': 10,
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'imdb_top250_detailed.csv',
    }

    def parse(self, response):
        movies = set(response.css('li.ipc-metadata-list-summary-item a::attr(href)').getall())
        for movie in movies:
            yield response.follow(movie, self.parse_movie)

    def parse_movie(self, response):
        yield {
            'movie_title': response.css('span.hero__primary-text::text').get(),
            'director': list(set(response.css('.ipc-metadata-list__item:contains("Director") .ipc-inline-list__item a::text').getall())),
            'synopsis': response.css("span.sc-42125d72-0.gKbnVu::text").get(),
            'actors': list(set(response.css('.ipc-metadata-list__item:contains("Stars") .ipc-inline-list__item a::text').getall())),
            'genre': response.css("div.ipc-chip-list__scroller span::text").getall(),
            'rating': response.css('span.sc-d541859f-1.imUuxf::text').get(),
        }

In [4]:
from twisted.internet import defer

# Configure logging to suppress verbose output
configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s'})

# Initialize the crawler runner
runner = CrawlerRunner()

# Store scraped data
scraped_items = []

@wait_for(600)  # Timeout after 600 seconds
@defer.inlineCallbacks
def crawl():
    yield runner.crawl(IMDBSpider, cb_kwargs={'items': scraped_items})

crawl()

INFO:scrapy.addons:Enabled addons:
[]
2025-03-25 07:21:28 [scrapy.addons] INFO: Enabled addons:
[]
INFO:scrapy.extensions.telnet:Telnet Password: 7937d34a14d475f0
2025-03-25 07:21:28 [scrapy.extensions.telnet] INFO: Telnet Password: 7937d34a14d475f0
INFO:scrapy.middleware:Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2025-03-25 07:21:28 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
INFO:scrapy.crawler:Overridden settings:
{'CONCURRENT_REQUESTS': 10,
 'ROBOTSTXT_OBEY': True,
 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chro

## Save the CSV file to Google Drive

In [5]:
import os
import shutil
from google.colab import drive

drive.mount('/content/drive')

imdb_folder_path = '/content/drive/MyDrive/imdb'

# Create the 'imdb' folder if it doesn't exist
if not os.path.exists(imdb_folder_path):
    os.makedirs(imdb_folder_path)

# Copy the CSV file to the 'imdb' folder
shutil.copy('/content/imdb_top250_detailed.csv', imdb_folder_path)


Mounted at /content/drive


'/content/drive/MyDrive/imdb/imdb_top250_detailed.csv'

The CSV file contains duplicates, commas, and square brackets in some values. Data needs to be cleaned using OpenRefine or Excel before using further for converting into RDF or some other use case.