<a href="https://colab.research.google.com/github/MengOonLee/WebScrapy/blob/master/CountryHoliday/CountryHoliday.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%bash
# Install or upgrade (-U) Scrapy and Selenium quietly (-q), bypassing pip's cache.
pip install --no-cache-dir -qU scrapy selenium

In [None]:
%%writefile CountryHoliday.py
import time
import logging
logging.getLogger().setLevel(logging.ERROR)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import wait, expected_conditions
import scrapy
from scrapy import crawler

class CountryHolidaySpider(scrapy.Spider):
    """Scrape the holidays of every country on timeanddate.com for one year.

    Scrapy schedules the requests, but each page is loaded again in a
    headless Chrome session (Selenium) and the items are parsed from the
    browser's ``page_source`` instead of the Scrapy response body.

    Args:
        year: Year to scrape. It is interpolated into every country URL and
            appended to the date text of each item, so anything that formats
            cleanly in an f-string (``"2025"`` or ``2025``) works.
        **kwargs: Forwarded unchanged to ``scrapy.Spider.__init__``.
    """

    name = "CountryHoliday"

    def __init__(self, year, **kwargs):
        super().__init__(**kwargs)
        options = webdriver.chrome.options.Options()
        for flag in (
            "--headless",
            "--no-sandbox",
            "--enable-javascript",
            "--disable-cookies",
            "--disable-notifications",
            "--disable-web-security",
            "--incognito",
        ):
            options.add_argument(flag)
        self.driver = webdriver.Chrome(options=options)
        self.year = year

    def closed(self, reason):
        """Quit the Chrome session when Scrapy closes the spider.

        Without this hook the browser session created in ``__init__`` is
        never released by the spider.
        """
        self.driver.quit()

    def start_requests(self):
        """Yield the request for the page that lists all countries."""
        urls = [
            "https://www.timeanddate.com/holidays/?allcountries"
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_country)

    def _load_page(self, url, xpath):
        """Open *url* in the browser, wait for *xpath*, return a Selector.

        Waits up to 10 seconds for an element matching *xpath* to be present;
        Selenium's wait raises if it never appears.
        """
        self.driver.get(url)
        wait.WebDriverWait(self.driver, timeout=10).until(
            expected_conditions.presence_of_element_located(
                (By.XPATH, xpath)))
        return scrapy.Selector(text=self.driver.page_source)

    def parse_country(self, response):
        """Yield one holiday-page request per country link on the index page.

        The country name (first text node of the list item) travels with the
        request in ``meta["country"]``.
        """
        selector = self._load_page(
            response.url, "//article[@class='category-list']")
        for row in selector.css("ul.category-list__list > li"):
            link = row.css("a::attr(href)").get()
            if link is None:
                # Nothing to request: skip before pausing so no delay is
                # spent on rows that produce no request.
                continue
            country = row.css("::text").get()
            # Crude pause between country requests.
            # NOTE(review): time.sleep blocks while waiting; Scrapy's
            # DOWNLOAD_DELAY setting may be a better fit - confirm.
            time.sleep(10)
            url = f"https://www.timeanddate.com{link}{self.year}?hol=1"
            yield scrapy.Request(url, callback=self.parse_holiday,
                meta={"country": country})

    def parse_holiday(self, response):
        """Yield one item per visible holiday row of a country page.

        Each item has the keys ``CountryName``, ``Daily_Period`` (the row's
        first text node followed by the year) and ``Holiday`` (the row's
        third text node).
        """
        selector = self._load_page(
            response.url, "//table[@id='holidays-table']")
        for row in selector.css("tr.showrow"):
            info = row.css("::text").getall()
            if len(info) < 3:
                # A row without date/weekday/name text would raise IndexError
                # and abort the remaining rows of this page; skip it instead.
                continue
            yield {
                "CountryName": response.meta["country"],
                "Daily_Period": f"{info[0]} {self.year}",
                "Holiday": info[2],
            }

# Export every scraped item to items.jl in the "jsonlines" feed format.
feed_settings = {"FEEDS": {"items.jl": {"format": "jsonlines"}}}

# Run the spider for the year 2025; start() launches the crawl.
process = crawler.CrawlerProcess(settings=feed_settings)
process.crawl(CountryHolidaySpider, year="2025")
process.start()

Overwriting CountryHoliday.py


In [None]:
%%bash
# Run the spider script produced by the %%writefile cell; its feed settings
# export the scraped items to items.jl.
python CountryHoliday.py

In [None]:
from selenium import webdriver
from selenium.webdriver.support import wait, expected_conditions
from selenium.webdriver.common.by import By
import scrapy
import datetime

# Ad-hoc check of the holiday-table parsing against a single country page.

# Year of the calendar to fetch; used for both the URL and the parsed dates
# so the two can never disagree.
year = 2024

options = webdriver.chrome.options.Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--enable-javascript")
options.add_argument("--disable-cookies")
options.add_argument("--disable-notifications")
options.add_argument("--disable-web-security")
options.add_argument("--incognito")
driver = webdriver.Chrome(options=options)

url = f"https://www.timeanddate.com/holidays/afghanistan/{year}?hol=1"
try:
    driver.get(url)

    # Wait up to 10 seconds for the holidays table to be present.
    wait.WebDriverWait(driver, timeout=10).until(
        expected_conditions.presence_of_element_located(
        (By.XPATH, "//table[@id='holidays-table']")))

    selector = scrapy.Selector(text=driver.page_source)
finally:
    # Always end the browser session, even when the wait above times out.
    driver.quit()

for row in selector.css("tr.showrow"):
    info = row.css("::text").getall()
    if len(info) < 3:
        # Not enough text nodes for a date and a holiday name; skip the row
        # instead of failing with IndexError.
        continue
    # info[0] is the day and abbreviated month; info[2] is the holiday name.
    Daily_Period = datetime.datetime.strptime(
        f"{info[0]} {year}", "%d %b %Y").date()
    Holiday = info[2]
    print({
        "Daily_Period":Daily_Period,
        "Holiday":Holiday
    })

{'Daily_Period': datetime.date(2024, 2, 15), 'Holiday': 'Liberation Day'}
{'Daily_Period': datetime.date(2024, 3, 11), 'Holiday': 'First Day of Ramadan'}
{'Daily_Period': datetime.date(2024, 4, 10), 'Holiday': 'Eid al-Fitr'}
{'Daily_Period': datetime.date(2024, 4, 11), 'Holiday': 'Eid al-Fitr Holiday'}
{'Daily_Period': datetime.date(2024, 4, 12), 'Holiday': 'Eid al-Fitr Holiday'}
{'Daily_Period': datetime.date(2024, 4, 13), 'Holiday': 'Eid al-Fitr Holiday'}
{'Daily_Period': datetime.date(2024, 4, 28), 'Holiday': 'Afghan Victory Day'}
{'Daily_Period': datetime.date(2024, 5, 1), 'Holiday': 'Labor Day'}
{'Daily_Period': datetime.date(2024, 6, 16), 'Holiday': 'Day of Arafat'}
{'Daily_Period': datetime.date(2024, 6, 17), 'Holiday': 'Eid al-Qurban'}
{'Daily_Period': datetime.date(2024, 6, 18), 'Holiday': 'Eid al-Qurban Holiday'}
{'Daily_Period': datetime.date(2024, 6, 19), 'Holiday': 'Eid al-Qurban Holiday'}
{'Daily_Period': datetime.date(2024, 6, 20), 'Holiday': 'Eid al-Qurban Holiday'}
{'D