<a href="https://colab.research.google.com/github/Kratos-024/Basic/blob/master/Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/Kratos-024/RamayanaExtractionCode.git

Cloning into 'RamayanaExtractionCode'...
remote: Enumerating objects: 27, done.
remote: Counting objects: 100% (27/27), done.
remote: Compressing objects: 100% (23/23), done.
remote: Total 27 (delta 2), reused 27 (delta 2), pack-reused 0 (from 0)
Receiving objects: 100% (27/27), 1.49 MiB | 3.81 MiB/s, done.
Resolving deltas: 100% (2/2), done.


In [2]:
%%writefile ramayana_spider.py
"""
Outline of the Algorithm:
1. Start from the homepage of Valmiki Ramayana (https://www.valmikiramayan.net/).
2. Extract the frame source that contains the list of books (Kandas).
3. Visit each book's page and extract links to all chapters (sargas).
4. Visit each chapter and extract the verse translations.
5. Parse the sarga URL to determine the book and chapter number.
6. Output all verses with metadata like book name, chapter number, sarga, shloka number, and translation.
"""

import scrapy
from scrapy.http import HtmlResponse

class RamayanaSpider(scrapy.Spider):
    """Crawl valmikiramayan.net and yield one item per extracted translation text.

    Callback chain: ``parse`` (frame on the start page) -> ``parse_frame``
    (book links) -> ``parse_book_*`` (chapter links of one book) ->
    ``parse_book_sarga`` (follows nested frames) -> ``parse_verse`` (items).

    NOTE(review): ``count`` and ``S_no`` are running counters shared by every
    callback, so the "Chapter" and "S.No" values follow the order in which
    responses happen to be processed -- presumably not the reading order of
    the text. Confirm before relying on them downstream.
    """

    name = "ramayana"
    allowed_domains = ["www.valmikiramayan.net"]
    start_urls = ["https://www.valmikiramayan.net/"]

    # Running number of sarga pages parsed (emitted as "Chapter").
    count = 0
    # Running number of items emitted (emitted as "S.No").
    S_no = 0

    # (substring looked for in the book URL, name of the callback handling it).
    # Order matters: the first matching entry wins.
    _BOOK_ROUTES = (
        ("baala", "parse_book_baala"),
        ("kish", "parse_book_krish"),
        ("aranya", "parse_book_aranya"),
        ("yuddha", "parse_book_yuddha"),
        ("ayodhya", "parse_book_ayodha"),
        ("sundara", "parse_book_sundara"),
    )

    # (substring identifying the kanda in the page URL, emitted kanda name,
    #  marker that precedes the sarga number, number of characters after the
    #  marker that are scanned for digits).
    # Order matters: the first matching entry wins.
    _KANDA_RULES = (
        ("balasans", "bala", "balasans", 2),
        ("kishkindha", "kishkindha", "/kishkindhasans", 3),
        ("yuddha", "yuddha", "/yuddhasans", 3),
        ("ayodhya", "ayodhya", "/ayodhyasans", 3),
        ("sundara", "sundara", "/sundarasans", 3),
        ("aranya", "aranya", "/aranyasans", 3),
    )

    def parse(self, response):
        """Follow the first ``<frame>`` of the start page to the book list."""
        frame_src = response.css('frame::attr(src)').get()
        if not frame_src:
            self.logger.info("No iframe found in the page")
            return
        yield scrapy.Request(url=response.urljoin(frame_src), callback=self.parse_frame)

    def parse_frame(self, response):
        """Yield one request per recognised book link, routed to its handler.

        Links that match none of the ``_BOOK_ROUTES`` substrings are ignored.
        """
        site_root = 'https://www.valmikiramayan.net/'
        book_hrefs = response.xpath("/html/body/ol/li/a/@href").getall()

        for href in book_hrefs:
            book_url = site_root + href
            for marker, handler_name in self._BOOK_ROUTES:
                if marker in book_url:
                    self.count = 0
                    # Look the handler up by name so subclass overrides apply.
                    yield scrapy.Request(url=book_url, callback=getattr(self, handler_name))
                    break

    def _chapter_requests(self, hrefs, book_dir):
        """Yield a sarga request for each href, prefixed with the book's utf8 directory."""
        base = 'https://www.valmikiramayan.net/utf8/' + book_dir + '/'
        for href in hrefs:
            yield scrapy.Request(url=base + href, callback=self.parse_book_sarga)

    # Per-book handlers: they differ only in the selector used to find the
    # chapter (sarga) links and in the directory the links are resolved against.
    def parse_book_sundara(self, response):
        """Yield sarga requests for the Sundara book page."""
        hrefs = response.xpath('/html/body/center[2]/table/tr/td[2]/a/@href').getall()
        yield from self._chapter_requests(hrefs, 'sundara')

    def parse_book_aranya(self, response):
        """Yield sarga requests for the Aranya book page."""
        hrefs = response.css("table tr td center table tr td a::attr('href')").getall()
        yield from self._chapter_requests(hrefs, 'aranya')

    def parse_book_baala(self, response):
        """Yield sarga requests for the Bala book page."""
        hrefs = response.xpath('/html/body/center[2]/table/tr/td[2]/a/@href').getall()
        yield from self._chapter_requests(hrefs, 'baala')

    def parse_book_ayodha(self, response):
        """Yield sarga requests for the Ayodhya book page."""
        hrefs = response.css("table a.nav::attr(href)").getall()
        yield from self._chapter_requests(hrefs, 'ayodhya')

    def parse_book_krish(self, response):
        """Yield sarga requests for the Kishkindha book page."""
        hrefs = response.css("table a.nav::attr(href)").getall()
        yield from self._chapter_requests(hrefs, 'kish')

    def parse_book_yuddha(self, response):
        """Yield sarga requests for the Yuddha book page."""
        hrefs = response.css("table a.nav::attr(href)").getall()
        yield from self._chapter_requests(hrefs, 'yuddha')

    def parse_book_sarga(self, response):
        """Follow the first ``<frame>`` if the page has one, else parse verses.

        The callback re-enters itself for the framed document, so nested
        framesets are followed until a page without a ``<frame>`` is reached.
        """
        frame_url = response.css("frame::attr(src)").get()
        if frame_url:
            yield scrapy.Request(url=response.urljoin(frame_url), callback=self.parse_book_sarga)
        else:
            yield from self.parse_verse(response)

    def parse_verse(self, response):
        """Yield one item per ``p.tat`` text node found on a sarga page.

        The kanda name and sarga number are derived from the page URL using
        ``_KANDA_RULES``. Pages whose URL matches no rule, or lacks the
        expected filename marker, are logged and skipped instead of raising.
        """
        url = response.url

        rule = next((r for r in self._KANDA_RULES if r[0] in url), None)
        if rule is None:
            self.logger.warning("Cannot identify kanda from URL: %s", url)
            return
        _, book, marker, width = rule

        _, found, tail = url.partition(marker)
        if not found:
            self.logger.warning("Cannot identify sarga from URL: %s", url)
            return
        # Only the first `width` characters after the marker are scanned, and
        # any non-digit among them (e.g. the "." of ".htm") is dropped.
        sarga = ''.join(c for c in tail[:width] if c.isdigit())

        self.count += 1

        # Extract all translations (tatparya)
        translations = response.css('p.tat::text').getall()

        for Shloka, translation in enumerate(translations, start=1):
            self.S_no += 1
            yield {
                "S.No": self.S_no,
                "Chapter": self.count,
                "Kanda": book,
                "Sarga": sarga,
                "Shloka": Shloka,
                "Translation": translation
            }
"""
Outline of the Algorithm:
1. Start from the homepage of Valmiki Ramayana (https://www.valmikiramayan.net/).
2. Extract the frame source that contains the list of books (Kandas).
3. Visit each book's page and extract links to all chapters (sargas).
4. Visit each chapter and extract the verse translations.
5. Parse the sarga URL to determine the book and chapter number.
6. Output all verses with metadata like book name, chapter number, sarga, shloka number, and translation.
"""

import scrapy
from scrapy.http import HtmlResponse

class RamayanaSpider(scrapy.Spider):
    """Crawl valmikiramayan.net and yield one item per extracted translation text.

    Callback chain: ``parse`` (frame on the start page) -> ``parse_frame``
    (book links) -> ``parse_book_*`` (chapter links of one book) ->
    ``parse_book_sarga`` (follows nested frames) -> ``parse_verse`` (items).

    NOTE(review): ``count`` and ``S_no`` are running counters shared by every
    callback, so the "Chapter" and "S.No" values follow the order in which
    responses happen to be processed -- presumably not the reading order of
    the text. Confirm before relying on them downstream.
    """

    name = "ramayana"
    allowed_domains = ["www.valmikiramayan.net"]
    start_urls = ["https://www.valmikiramayan.net/"]

    # Running number of sarga pages parsed (emitted as "Chapter").
    count = 0
    # Running number of items emitted (emitted as "S.No").
    S_no = 0

    # (substring looked for in the book URL, name of the callback handling it).
    # Order matters: the first matching entry wins.
    _BOOK_ROUTES = (
        ("baala", "parse_book_baala"),
        ("kish", "parse_book_krish"),
        ("aranya", "parse_book_aranya"),
        ("yuddha", "parse_book_yuddha"),
        ("ayodhya", "parse_book_ayodha"),
        ("sundara", "parse_book_sundara"),
    )

    # (substring identifying the kanda in the page URL, emitted kanda name,
    #  marker that precedes the sarga number, number of characters after the
    #  marker that are scanned for digits).
    # Order matters: the first matching entry wins.
    _KANDA_RULES = (
        ("balasans", "bala", "balasans", 2),
        ("kishkindha", "kishkindha", "/kishkindhasans", 3),
        ("yuddha", "yuddha", "/yuddhasans", 3),
        ("ayodhya", "ayodhya", "/ayodhyasans", 3),
        ("sundara", "sundara", "/sundarasans", 3),
        ("aranya", "aranya", "/aranyasans", 3),
    )

    def parse(self, response):
        """Follow the first ``<frame>`` of the start page to the book list."""
        frame_src = response.css('frame::attr(src)').get()
        if not frame_src:
            self.logger.info("No iframe found in the page")
            return
        yield scrapy.Request(url=response.urljoin(frame_src), callback=self.parse_frame)

    def parse_frame(self, response):
        """Yield one request per recognised book link, routed to its handler.

        Links that match none of the ``_BOOK_ROUTES`` substrings are ignored.
        """
        site_root = 'https://www.valmikiramayan.net/'
        book_hrefs = response.xpath("/html/body/ol/li/a/@href").getall()

        for href in book_hrefs:
            book_url = site_root + href
            for marker, handler_name in self._BOOK_ROUTES:
                if marker in book_url:
                    self.count = 0
                    # Look the handler up by name so subclass overrides apply.
                    yield scrapy.Request(url=book_url, callback=getattr(self, handler_name))
                    break

    def _chapter_requests(self, hrefs, book_dir):
        """Yield a sarga request for each href, prefixed with the book's utf8 directory."""
        base = 'https://www.valmikiramayan.net/utf8/' + book_dir + '/'
        for href in hrefs:
            yield scrapy.Request(url=base + href, callback=self.parse_book_sarga)

    # Per-book handlers: they differ only in the selector used to find the
    # chapter (sarga) links and in the directory the links are resolved against.
    def parse_book_sundara(self, response):
        """Yield sarga requests for the Sundara book page."""
        hrefs = response.xpath('/html/body/center[2]/table/tr/td[2]/a/@href').getall()
        yield from self._chapter_requests(hrefs, 'sundara')

    def parse_book_aranya(self, response):
        """Yield sarga requests for the Aranya book page."""
        hrefs = response.css("table tr td center table tr td a::attr('href')").getall()
        yield from self._chapter_requests(hrefs, 'aranya')

    def parse_book_baala(self, response):
        """Yield sarga requests for the Bala book page."""
        hrefs = response.xpath('/html/body/center[2]/table/tr/td[2]/a/@href').getall()
        yield from self._chapter_requests(hrefs, 'baala')

    def parse_book_ayodha(self, response):
        """Yield sarga requests for the Ayodhya book page."""
        hrefs = response.css("table a.nav::attr(href)").getall()
        yield from self._chapter_requests(hrefs, 'ayodhya')

    def parse_book_krish(self, response):
        """Yield sarga requests for the Kishkindha book page."""
        hrefs = response.css("table a.nav::attr(href)").getall()
        yield from self._chapter_requests(hrefs, 'kish')

    def parse_book_yuddha(self, response):
        """Yield sarga requests for the Yuddha book page."""
        hrefs = response.css("table a.nav::attr(href)").getall()
        yield from self._chapter_requests(hrefs, 'yuddha')

    def parse_book_sarga(self, response):
        """Follow the first ``<frame>`` if the page has one, else parse verses.

        The callback re-enters itself for the framed document, so nested
        framesets are followed until a page without a ``<frame>`` is reached.
        """
        frame_url = response.css("frame::attr(src)").get()
        if frame_url:
            yield scrapy.Request(url=response.urljoin(frame_url), callback=self.parse_book_sarga)
        else:
            yield from self.parse_verse(response)

    def parse_verse(self, response):
        """Yield one item per ``p.tat`` text node found on a sarga page.

        The kanda name and sarga number are derived from the page URL using
        ``_KANDA_RULES``. Pages whose URL matches no rule, or lacks the
        expected filename marker, are logged and skipped instead of raising.
        """
        url = response.url

        rule = next((r for r in self._KANDA_RULES if r[0] in url), None)
        if rule is None:
            self.logger.warning("Cannot identify kanda from URL: %s", url)
            return
        _, book, marker, width = rule

        _, found, tail = url.partition(marker)
        if not found:
            self.logger.warning("Cannot identify sarga from URL: %s", url)
            return
        # Only the first `width` characters after the marker are scanned, and
        # any non-digit among them (e.g. the "." of ".htm") is dropped.
        sarga = ''.join(c for c in tail[:width] if c.isdigit())

        self.count += 1

        # Extract all translations (tatparya)
        translations = response.css('p.tat::text').getall()

        for Shloka, translation in enumerate(translations, start=1):
            self.S_no += 1
            yield {
                "S.No": self.S_no,
                "Chapter": self.count,
                "Kanda": book,
                "Sarga": sarga,
                "Shloka": Shloka,
                "Translation": translation
            }


Writing ramayana_spider.py
