In [None]:
%%bash

pip install --no-cache-dir -qU scrapy

In [None]:
import os
import pathlib
import scrapy
from scrapy.crawler import CrawlerProcess

class HtmlSpider(scrapy.Spider):
    """Download the holiday index page and each listed country's 2023 page.

    Files are written below ``Data/Input`` in the current working directory:
    the index page as ``front.html`` and every country page as
    ``<country name>/2023.html``.
    """

    name = "html"

    def start_requests(self):
        """Yield the initial request for the holiday index page."""
        urls = ["https://www.timeanddate.com/holidays"]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_front)

    def parse_front(self, response):
        """Save the index page and follow the link of every listed country.

        Args:
            response: The response for the holiday index page.

        Yields:
            One request per country entry that has both a name and a link,
            handled by ``parse_page``.
        """
        input_dir = os.path.join(os.getcwd(), 'Data/Input')
        os.makedirs(input_dir, exist_ok=True)
        # Fixed file name: the later cells of this notebook open "front.html".
        # Deriving the name from the URL did not reliably produce that name.
        filename = os.path.join(input_dir, "front.html")
        pathlib.Path(filename).write_bytes(response.body)
        self.log(f"Saved file {filename}")

        for country in response.css('ul.category-list__list > li'):
            # normalize-space(.) yields the entry's text as a plain string;
            # interpolating the raw selector list would put its repr in the path.
            country_name = country.xpath("normalize-space(.)").get()
            url = country.xpath("./a/@href").get()
            # Check for a missing link *before* building the URL from it.
            if not country_name or url is None:
                continue
            # Keep the scraped name from introducing extra path components.
            safe_name = country_name.replace("/", "-").replace(os.sep, "-")
            country_dir = os.path.join(input_dir, safe_name)
            # Requests are handled asynchronously, so the target directory
            # travels with each request instead of living on the spider, where
            # the last loop iteration would overwrite it for every callback.
            yield response.follow(
                url=url.rstrip("/") + "/2023?hol=1",
                callback=self.parse_page,
                cb_kwargs={"country_dir": country_dir},
            )

    def parse_page(self, response, country_dir=None):
        """Save one country page as ``2023.html`` inside ``country_dir``.

        Args:
            response: The response for a country's 2023 holiday page.
            country_dir: Directory to write into. When omitted, falls back to
                a ``country_dir`` attribute on the spider, if one is set.

        Raises:
            ValueError: If no target directory is available.
        """
        if country_dir is None:
            # Backward-compatible fallback for code that sets the attribute.
            country_dir = getattr(self, "country_dir", None)
        if country_dir is None:
            raise ValueError("parse_page needs a country_dir to save into")
        os.makedirs(country_dir, exist_ok=True)
        filename = os.path.join(country_dir, "2023.html")
        pathlib.Path(filename).write_bytes(response.body)
        self.log(f"Saved file {filename}")

# Create a crawler process with default settings, schedule HtmlSpider on it
# and start the crawl.
# NOTE(review): a later cell of this notebook also calls process.start(); the
# reactor behind CrawlerProcess presumably cannot be started twice in the same
# kernel, so running both cells may need a kernel restart in between — confirm.
process = CrawlerProcess()
process.crawl(HtmlSpider)
process.start()

In [None]:
import os
import scrapy

input_dir = "./Data/Input"
# The page was stored as raw response bytes, so decode it explicitly instead of
# relying on the platform's default encoding.
# NOTE(review): assumes the saved page is UTF-8 encoded — confirm.
with open(os.path.join(input_dir, 'front.html'), 'r', encoding='utf-8') as file:
    html = file.read()

# Explore the saved index page: one <li> per entry of the category list.
sel = scrapy.Selector(text=html)
blocks = sel.css("ul.category-list__list > li")
countries = blocks.xpath(".//text()").getall()
links = blocks.xpath("./a/@href").getall()
links

In [None]:
import os
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from itemloaders.processors import MapCompose, TakeFirst
from w3lib.html import remove_tags

# Working directory at the time this cell runs; the other two paths are
# relative to it.
base_dir = os.getcwd()
# Relative locations of the scraped input pages and of the generated output.
input_dir, output_dir = "Data/Input", "Data/Output"

class HolidayItem(scrapy.Item):
    """Item with a single ``country`` field.

    Incoming values are passed through ``remove_tags``; the output processor
    ``TakeFirst`` reduces the collected values to one.
    """

    country = scrapy.Field(
        output_processor=TakeFirst(),
        input_processor=MapCompose(remove_tags),
    )
    

class HolidaysSpider(scrapy.Spider):
    """Load country names from the locally saved holiday index page."""

    name = 'Holidays'

    def start_requests(self):
        """Yield a request for the saved ``front.html`` through a file URI."""
        # Local import: the cell defining this spider does not import pathlib.
        from pathlib import Path

        # Path.as_uri() produces a well-formed file URI (escaped characters,
        # Windows drive letters); gluing "file://" onto an OS path does not.
        urls = [
            Path(base_dir, input_dir, "front.html").resolve().as_uri()
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_front)

    def parse_front(self, response):
        """Yield one ``HolidayItem`` per entry of the country list.

        Args:
            response: The response for the saved index page.

        Yields:
            A loaded ``HolidayItem`` whose ``country`` comes from the text of
            each list entry.
        """
        for country in response.css('ul.category-list__list > li'):
            loader = ItemLoader(item=HolidayItem(), selector=country)
            loader.add_xpath("country", ".//text()")
            link = country.xpath("./a/@href").get()
            # An entry without an anchor has no href; concatenating onto None
            # would raise and abort the callback for all remaining entries.
            if link is not None:
                print(link.rstrip("/") + "/2023?hol=1")
            yield loader.load_item()

    # TODO: unfinished draft for parsing the per-country holiday tables.
    # def parse_pages(self, response):
    #     tables = response.css('table#holidays-table')
    #     dates = tables.xpath('.//th[@class="nw"]/text()').getall()
    #     print(dates)

# Create a crawler process whose FEEDS setting exports the scraped items in
# "jsonlines" format to <base_dir>/<output_dir>/holiday.jl, then schedule
# HolidaysSpider on it and start the crawl.
# NOTE(review): an earlier cell of this notebook already calls process.start();
# the reactor behind CrawlerProcess presumably cannot be started twice in the
# same kernel, so this cell may need a fresh kernel — confirm.
process = CrawlerProcess(
    settings={
        "FEEDS":{f"{base_dir}/{output_dir}/holiday.jl":{"format":"jsonlines"}}
    }
)
process.crawl(HolidaysSpider)
process.start()

In [None]:
import scrapy
import requests

url = 'https://www.timeanddate.com/holidays/afghanistan/2023?hol=1'
# Without a timeout the call can block the kernel indefinitely, and without a
# status check an error page would be parsed as if it were the holiday page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content

# Probe the downloaded page for the holiday table.
sel = scrapy.Selector(text=html)
tables = sel.css('table#holidays-table')
tables
# dates = sel.css('th.nw::text').extract()
# holidays = sel.css('td > a::text').extract()
