<a href="https://colab.research.google.com/github/MengOonLee/WebScrapy/blob/master/Groceries/Lotus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%bash
# Install (or upgrade to) the latest Scrapy and Selenium, bypassing pip's cache.
pip install --upgrade --no-cache-dir scrapy selenium

In [63]:
%%bash
# Remove the previous run's downloaded images and JSON-lines feed.
rm -rf Images/Lotus Lotus.jl

In [58]:
%%writefile Lotus.py
import os
import logging
logging.getLogger().setLevel(logging.ERROR)
import time
import re

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import wait, expected_conditions

import scrapy
from scrapy import crawler
from scrapy.pipelines.images import ImagesPipeline

class LotusItem(scrapy.Item):
    """One scraped product, as populated by ``LotusSpider.parse_item``."""

    # Breadcrumb texts of the listing page (first entry dropped), joined
    # with "/".
    category = scrapy.Field()
    # Text of the product-title link on the listing tile; None when the
    # selector matched nothing.
    name = scrapy.Field()
    # Concatenated <p> text nodes of the listing tile, excluding nodes that
    # start with "RM" followed by a digit.
    price = scrapy.Field()
    # First text node inside the "scrollable-force-tabpanel-0" element of the
    # product page, or None when that element could not be read.
    information = scrapy.Field()
    # A single image URL string (src of the tile's first <img>), not a list.
    image_urls = scrapy.Field()
    # Never assigned by the spider; presumably filled in by the images
    # pipeline with its download results -- TODO confirm.
    images = scrapy.Field()

class LotusImagesPipeline(ImagesPipeline):
    """Images pipeline that stores each picture as ``<category>/<name>.jpg``.

    ``item["image_urls"]`` is expected to hold a single URL string (that is
    what the spider in this file stores), not the list used by the stock
    pipeline.
    """

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image, if it has one.

        The item's category and name travel in ``meta`` so that
        ``file_path`` can build the destination path from them.

        Args:
            item: Scraped item with ``image_urls``, ``category`` and ``name``.
            info: Pipeline bookkeeping object supplied by Scrapy (unused).
        """
        image_url = item.get("image_urls")
        if not image_url:
            # The listing tile had no <img src>; building a Request from
            # None/"" would raise, so there is simply nothing to download.
            return
        yield scrapy.Request(image_url, meta={
            "category": item["category"],
            "name": item["name"],
        })

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path (relative to IMAGES_STORE) for an image.

        "/" and " " in the product name are replaced with "_"; the category
        is used as-is, so any "/" in it creates nested directories.  When the
        request carries no product name, the parent pipeline's default path
        is used instead of failing on ``None.replace``.
        """
        name = request.meta.get("name")
        if not name:
            return super().file_path(
                request, response=response, info=info, item=item)
        safe_name = name.replace("/", "_").replace(" ", "_")
        return os.path.join(request.meta["category"], safe_name) + ".jpg"

class LotusSpider(scrapy.Spider):
    """Crawl Lotus's product listings, rendering pages in headless Chrome.

    Scrapy schedules the requests, but every callback re-opens the response
    URL in one shared Selenium driver and parses the driver's page source
    (presumably because the listings are rendered client-side).

    Class attributes below can be overridden on a subclass to tune timing.
    """
    name = 'Lotus'

    # Seconds to wait for an expected element before giving up on it.
    wait_timeout = 10
    # Number of END/HOME key presses per scroll pass, and the pause in
    # seconds after each END press.
    scroll_repeats = 10
    scroll_pause = 3
    # Text nodes starting with "RM<digit>" are excluded from the price text.
    _rm_amount = re.compile(r"^RM\d")

    def __init__(self, **kwargs):
        """Start the headless Chrome instance shared by all callbacks."""
        super().__init__(**kwargs)
        options = webdriver.ChromeOptions()
        for argument in (
            "--headless",
            "--no-sandbox",
            "--enable-javascript",
            "--enable-cookies",
            "--disable-notifications",
            "--disable-web-security",
            "--incognito",
        ):
            options.add_argument(argument)
        self.driver = webdriver.Chrome(options=options)

    def closed(self, reason):
        """Quit the browser when Scrapy closes the spider.

        Without this hook the Chrome process started in ``__init__`` is
        never shut down.
        """
        self.driver.quit()

    def start_requests(self):
        """Yield the initial category-listing requests."""
        urls = [
            "https://www.lotuss.com.my/en/category/grocery/commodities/rice"
            # "https://www.lotuss.com.my/en/category/meat-poultry/pre-packed"
        ]

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_products)

    def _wait_for(self, xpath):
        """Block until an element matching ``xpath`` is present.

        Raises:
            TimeoutException: nothing matched within ``wait_timeout`` seconds.
        """
        wait.WebDriverWait(self.driver, timeout=self.wait_timeout).until(
            expected_conditions.presence_of_element_located(
                (By.XPATH, xpath)))

    def _category_requests(self, response):
        """Return requests for the links in the page's carousel, if present.

        A carousel that does not show up within the timeout is treated as
        "no sub-categories" and yields an empty list.
        """
        try:
            self._wait_for("//div[@class='carousel']")
        except TimeoutException:
            self.logger.debug("No category carousel on %s", response.url)
            return []
        selector = scrapy.Selector(text=self.driver.page_source)
        anchors = selector.css("div.carousel a")
        # Built eagerly so no browser state is needed once yielding starts.
        return list(response.follow_all(anchors,
            callback=self.parse_products))

    def _scroll_until_stable(self):
        """Press END/HOME repeatedly until the page height stops changing.

        Presumably this triggers lazy loading of further product tiles.
        """
        page = self.driver.find_element(By.TAG_NAME, "html")
        height_script = "return document.body.scrollHeight"
        last_height = self.driver.execute_script(height_script)
        while True:
            for _ in range(self.scroll_repeats):
                page.send_keys(Keys.END)
                time.sleep(self.scroll_pause)
                page.send_keys(Keys.HOME)
            new_height = self.driver.execute_script(height_script)
            if new_height == last_height:
                break
            last_height = new_height

    def parse_products(self, response):
        """Parse a listing page.

        Yields requests for carousel links (handled by this same callback)
        and one ``parse_item`` request per product tile that has a link,
        carrying category, name, price and image URL in ``meta``.

        Raises:
            Exception: whatever went wrong while loading the product list
                (e.g. ``TimeoutException`` when it never appears); raised
                only after the carousel requests have been yielded.
        """
        self.driver.get(response.url)

        # Everything that depends on the browser's current page is gathered
        # before the first ``yield``: the driver is shared, and Scrapy may run
        # another callback (which navigates it elsewhere) between yields.
        category_requests = self._category_requests(response)

        page_source = None
        deferred_error = None
        try:
            self._wait_for("//div[@id='product-list']")
            self._scroll_until_stable()
            page_source = self.driver.page_source
        except Exception as err:
            # Re-raised below, once the carousel requests have been handed
            # over, so a page without products still gets its links followed.
            deferred_error = err

        yield from category_requests
        if deferred_error is not None:
            raise deferred_error

        selector = scrapy.Selector(text=page_source)
        crumbs = selector.css("ol.MuiBreadcrumbs-ol ::text").getall()
        category = "/".join(crumbs[1:])

        for product in selector.css("div.product-grid-item"):
            link = product.css("a::attr(href)").get()
            if link is None:
                continue
            price_parts = [
                text for text in product.css("p ::text").getall()
                if not self._rm_amount.search(text)
            ]
            yield response.follow(link, callback=self.parse_item,
                meta={
                    "category": category,
                    "name": product.css("a#product-title ::text").get(),
                    "price": "".join(price_parts),
                    "image_urls": product.css("img::attr(src)").get(),
                })

    def parse_item(self, response):
        """Parse a product page and yield the finished ``LotusItem``.

        The listing-page data arrives through ``response.meta``; only the
        ``information`` text is read from the product page itself, and it is
        ``None`` when the panel does not appear within the timeout.
        """
        self.driver.get(response.url)

        try:
            self._wait_for("//div[@id='scrollable-force-tabpanel-0']")
        except TimeoutException:
            information = None
        else:
            selector = scrapy.Selector(text=self.driver.page_source)
            information = selector.css(
                "div#scrollable-force-tabpanel-0 ::text").get()

        yield LotusItem(
            category=response.meta["category"],
            name=response.meta["name"],
            price=response.meta["price"],
            information=information,
            image_urls=response.meta["image_urls"],
        )

# Crawl settings: items go to a JSON-lines feed, images are downloaded by
# the pipeline defined in this script (hence the "__main__." prefix).
crawl_settings = {
    "IMAGES_STORE": "./Images/Lotus",
    "ITEM_PIPELINES": {"__main__.LotusImagesPipeline": 1},
    "FEEDS": {"Lotus.jl": {"format": "jsonlines"}},
}

# Create the image store up front so it exists before the crawl starts.
os.makedirs(crawl_settings["IMAGES_STORE"], exist_ok=True)

process = crawler.CrawlerProcess(settings=crawl_settings)
process.crawl(LotusSpider)
# Blocks until the crawl has finished.
process.start()

Overwriting Lotus.py


In [None]:
%%bash
# Run the crawler script written by the %%writefile cell above.
python Lotus.py