<a href="https://colab.research.google.com/github/Ghonem22/Learning/blob/main/Web%20Scraping/Scraping_floordecor_products.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import time
from numpy.random import uniform



class url_domain_validation:
    '''
    Mixin with a helper that turns site-relative links into absolute urls.
    '''

    def validate_url_domain(self, url, domain):
        '''
        Return url unchanged when it already starts with "http";
        otherwise return domain followed by url.
        '''
        return url if url.startswith("http") else domain + url



class get_sub_departments_urls(url_domain_validation):
    '''
    Collect the sub-department urls linked from each department page, and the
    product urls linked from a sub-department page.
    '''

    def __init__(self, hdr=None, domain="https://www.flooranddecor.com", page_departments=None):
        '''
        hdr: HTTP headers sent with every request; defaults to {'User-Agent': 'Mozilla/5.0'}.
        domain: prefix added to every link that does not start with "http".
        page_departments: department paths (or full urls) to crawl; defaults to the
            seven department paths listed below.
        '''
        # The defaults are built per instance: a dict/list literal in the signature
        # would be one shared object reused (and mutable) across all instances.
        if page_departments is None:
            page_departments = ['/tile', '/stone', '/wood', '/laminate', '/vinyl',
                                '/decoratives', '/installation-materials']

        self.page_departments = page_departments
        self.hdr = {'User-Agent': 'Mozilla/5.0'} if hdr is None else hdr
        self.domain = domain

    def _fetch_links(self, url, css_class):
        '''
        Download url and return the absolute href of every <a> element whose
        class is css_class. Anchors without an href are reported and skipped.
        '''
        req = Request(url, headers=self.hdr)
        # Close the HTTP response as soon as the body is read instead of leaking it.
        with urlopen(req) as response:
            text = response.read()
        soup = BeautifulSoup(text, "lxml")

        links = []
        for anchor in soup.find_all("a", attrs={"class": css_class}):
            href = anchor.get('href')
            if href is None:
                # Nothing to follow; keep crawling the remaining anchors.
                print("err occured while scraping:  {}".format(anchor))
                continue
            links.append(self.validate_url_domain(href, self.domain))
        return links

    def get_all_sub_departments_urls(self):
        '''
        Visit every department in self.page_departments and return the list of
        all sub-department urls found (links with class "clp-link").
        Sleeps 10-20 seconds after each department page.
        '''
        sub_departments_urls_list = []

        for department in self.page_departments:
            department_url = self.validate_url_domain(department, self.domain)
            print(department_url)
            sub_departments_urls_list.extend(self._fetch_links(department_url, 'clp-link'))

            print("******** num of scrapped sub-departments till now is {}".format(len(sub_departments_urls_list)))
            # Random pause between requests.
            time.sleep(uniform(10, 20))

        return sub_departments_urls_list

    def get_sub_department_products_urls(self, url):
        '''
        Take a sub-department url and return the urls of all the products it
        lists (links with class "b-product_tile-figure_link").
        Sleeps 10-20 seconds after the request.
        '''
        products_urls = self._fetch_links(url, 'b-product_tile-figure_link')
        # Random pause between requests.
        time.sleep(uniform(10, 20))

        return products_urls



class get_product_info:
    '''
    Extract the information of one product from its product page.

    The page is downloaded the first time ``soup`` is accessed and reused by
    every getter afterwards. A getter returns an empty value when the element
    it looks for is missing from the page; errors raised while downloading the
    page are not swallowed by any getter.
    '''

    # Names given to the specification containers, by their position on the page.
    _SPEC_GROUPS = ("DIMENSIONS", "DETAILS", "INSTALLATION & WARRANTY")

    def __init__(self, product_url, hdr=None):
        '''
        product_url: url of the product page.
        hdr: HTTP headers for the request; defaults to {'User-Agent': 'Mozilla/5.0'}.
        '''
        self.product_url = product_url
        # Built per instance so that instances never share one mutable default dict.
        self.hdr = {'User-Agent': 'Mozilla/5.0'} if hdr is None else hdr
        self._soup = None

    @property
    def soup(self):
        '''Parsed product page; fetched on first access, cached afterwards.'''
        if self._soup is None:
            print("Getting page content...")
            req = Request(self.product_url, headers=self.hdr)
            # Close the HTTP response as soon as the body is read.
            with urlopen(req) as response:
                text = response.read()
            self._soup = BeautifulSoup(text, "lxml")

        return self._soup

    def get_images(self):
        '''
        Return the "data-src" url of every thumbnail image.
        Thumbnails without an <img> or without "data-src" are skipped.
        '''
        images_urls = []
        for figure in self.soup.find_all("figure", attrs={"class": 'b-pdp_thumbnail-figure'}):
            img = figure.find("img", attrs={"class": "b-pdp_thumbnail-figure_img"})
            image_url = img.get('data-src') if img is not None else None
            if image_url is not None:
                images_urls.append(image_url)
        return images_urls

    def get_price(self):
        '''Return the text of the first price element, or '' when there is none.'''
        prices = self.soup.find_all("span", attrs={"class": 'b-pdp_price-cost'})
        return prices[0].text if prices else ''

    @staticmethod
    def _spec_key(name_tag):
        '''Return the property name held by a specification-name span.'''
        if name_tag.string is not None:
            return name_tag.string.strip()
        # .string can be None (bs4 returns None when a tag has several children);
        # in that case the name is taken from the second line of the tag's text.
        lines = name_tag.text.split('\n')
        return lines[1].strip() if len(lines) > 1 else name_tag.text.strip()

    @staticmethod
    def _spec_value(value_tag):
        '''Return the stripped value held by a specification-number span.'''
        if value_tag.string is not None:
            return value_tag.string.strip()
        return value_tag.text.strip()

    def get_specifications(self):
        '''
        Return the specifications as {group name: {property name: value}}.

        The page's specification containers are mapped, in order, to
        "DIMENSIONS", "DETAILS" and "INSTALLATION & WARRANTY"; any further
        container is stored under the group "". A property that occurs more
        than once inside a group maps to a flat list of all its values.
        '''
        specifications = {group: {} for group in self._SPEC_GROUPS}
        containers = self.soup.find_all("section", attrs={"class": 'b-pdp_specifications-container'})

        for i, container in enumerate(containers):
            group = self._SPEC_GROUPS[i] if i < len(self._SPEC_GROUPS) else ""
            section = specifications.setdefault(group, {})
            names = container.find_all("span", attrs={"class": 'b-pdp_specifications-name'})
            values = container.find_all("span", attrs={"class": 'b-pdp_specifications-number'})

            for name_tag, value_tag in zip(names, values):
                key = self._spec_key(name_tag)
                value = self._spec_value(value_tag)
                existing = section.get(key)

                if not existing:
                    section[key] = value
                elif isinstance(existing, list):
                    # Third and later occurrences extend the same flat list.
                    existing.append(value)
                else:
                    section[key] = [existing, value]

        return specifications

    def get_categories(self):
        '''
        Return (main_category, category, sub_category) taken from the 2nd, 3rd
        and 4th breadcrumb links, or ('', '', '') when fewer than four exist.
        '''
        crumbs = self.soup.find_all("a", attrs={"class": 'b-breadcrumbs-item_link'})
        if len(crumbs) < 4:
            return '', '', ''

        main_category, category, sub_category = (
            str(crumb.string).replace('\n\t', '').strip() for crumb in crumbs[1:4]
        )
        return main_category, category, sub_category

    def get_title(self):
        '''Return the text of the product title, or '' when it is missing.'''
        title_tag = self.soup.find("h1", attrs={"class": 'b-pdp_title-name'})
        return title_tag.text if title_tag is not None else ''

    def get_SKU_and_size(self):
        '''
        Return (SKU, size): the text of the first two detail values,
        or ('', '') when fewer than two exist.
        '''
        details = self.soup.find_all("span", attrs={"class": 'b-pdp_details-element_value'})
        if len(details) < 2:
            return '', ''
        return details[0].text, details[1].text

    def get_brand(self):
        '''Return the "alt" attribute of the brand image, or '' when there is no brand image.'''
        brand_img = self.soup.find("img", attrs={"class": 'b-pdp_title-brand_img'})
        return brand_img.get('alt') if brand_img is not None else ''

    def get_discription(self):
        '''Return the text of the description element, or '' when it is missing.'''
        description_tag = self.soup.find("div", attrs={"class": 'b-pdp_specifications-txt'})
        return description_tag.text if description_tag is not None else ''



class floor_and_decoor_Scraper:
    '''
    The website is divided into 7 departments, each of which is divided into multiple sub-departments.
    So our methodology is:

    1. scrape all the sub-department urls through composition with the "get_sub_departments_urls" class.
    2. iterate over the sub-department urls and scrape all the products' urls with a method of the same object.
    3. through composition with the "get_product_info" class, iterate over all the products' urls
       and extract their information.
    '''

    def __init__(self, hdr=None, domain="https://www.flooranddecor.com", departments=None):
        '''
        hdr: HTTP headers used for every request; defaults to {'User-Agent': 'Mozilla/5.0'}.
        domain: site domain used to complete relative links.
        departments: department paths to crawl; defaults to the seven paths listed below.
        '''
        # Defaults are built per instance rather than shared through the signature.
        if departments is None:
            departments = ['/tile', '/stone', '/wood', '/laminate', '/vinyl',
                           '/decoratives', '/installation-materials']

        self.hdr = {'User-Agent': 'Mozilla/5.0'} if hdr is None else hdr
        self.domain = domain
        # hdr and domain are forwarded as well, so custom values actually reach the crawler.
        self.sub_departments_urls_scrapper = get_sub_departments_urls(
            hdr=self.hdr, domain=self.domain, page_departments=departments)
        self.products_urls = []                 # all products urls collected so far, without duplicates
        self.skipped_sub_department_urls = []   # sub-department urls that failed in the last crawl

    def get_all_products_urls(self):
        '''
        Manage the "get_sub_departments_urls" object: get all sub-department urls,
        then iterate over them to collect all the products' urls.

        Returns the list of unique product urls collected so far; the same list
        is kept in self.products_urls. Sub-department urls that raised an error
        are stored in self.skipped_sub_department_urls.
        '''
        skipped_urls = []
        department_urls = self.sub_departments_urls_scrapper.get_all_sub_departments_urls()

        # iterate over all sub-department urls
        for category_index, sub_department_url in enumerate(department_urls):
            # NOTE(review): presumably asks the site for up to 5000 products on one page -- confirm.
            sub_category_url = '{}?start=0&sz=5000'.format(sub_department_url)
            try:
                self.products_urls.extend(
                    self.sub_departments_urls_scrapper.get_sub_department_products_urls(sub_category_url))
                print("************** Crawling department num {} is finished with total crawled urls:  {}  **************".format(category_index, len(set(self.products_urls))))
                time.sleep(uniform(10, 20))

            except Exception:
                # `except Exception` (not a bare except) so Ctrl-C can still stop the crawl.
                print("there is an error occured when we tried to crawl: {}".format(sub_category_url))
                skipped_urls.append(sub_category_url)
                # Longer pause after a failure before trying the next url.
                time.sleep(uniform(200, 300))

        self.skipped_sub_department_urls = skipped_urls
        # Drop duplicates while keeping first-seen order, so a later
        # Scraping_all_products() call does not scrape a product twice.
        self.products_urls = list(dict.fromkeys(self.products_urls))
        return list(self.products_urls)

    def Scraping_all_products(self, products_urls=None):
        '''
        Scrape every product url and return (all_products, skipped_urls).

        products_urls: urls to scrape; when empty or None, self.products_urls is used.
        all_products maps each product title to a dict of its information.
        skipped_urls lists the urls that raised an error; no entry is stored for them.
        '''
        all_products = {}
        skipped_urls = []
        if not products_urls:
            products_urls = self.products_urls

        for product_index, product_url in enumerate(products_urls):

            try:
                product = get_product_info(product_url=product_url, hdr=self.hdr)       # Composition

                product_title = product.get_title()
                # The id is the last "-"-separated token of the url, without its extension.
                product_id = product_url.split('-')[-1].split('.')[0]
                main_category, category, sub_category = product.get_categories()
                SKU, size = product.get_SKU_and_size()

                # The record is stored only once it is complete.
                all_products[product_title] = {
                    "url": product_url,
                    "product id": product_id,
                    "images urls": product.get_images(),
                    "price": product.get_price(),
                    "specifications": product.get_specifications(),
                    "main category": main_category,
                    "category": category,
                    "sub category": sub_category,
                    "SKU": SKU,
                    "size": size,
                    "brand": product.get_brand(),
                    "discription": product.get_discription(),
                }
                time.sleep(uniform(20, 30))

                print("************** Crawling product num {} is finished  **************".format(product_index))

            except Exception:
                print("there is an error occured when we tried to crawl: {}".format(product_url))
                skipped_urls.append(product_url)
                # Longer pause after a failure before trying the next url.
                time.sleep(uniform(100, 200))

        return all_products, skipped_urls

## Test the code with one department

In [None]:
# Crawl only the '/tile' department so the test run stays short.
crawler = floor_and_decoor_Scraper(hdr = {'User-Agent': 'Mozilla/5.0'}, domain = "https://www.flooranddecor.com", departments= ['/tile'] )
products_urls = crawler.get_all_products_urls()
# Show the first 50 collected product urls.
print(products_urls[:50])

## Test scraping product information from product URLs

In [3]:
# Build a scraper with the default headers, domain and departments.
crawler = floor_and_decoor_Scraper()

# Three product pages used as a small sample.
urls = ["https://www.flooranddecor.com/glass-decoratives/ivory-glass-tile-100465673.html",
        "https://www.flooranddecor.com/glass-decoratives/harbour-island-polished-linear-mosaic-100268952.html",
        "https://www.flooranddecor.com/glass-tile/rhea-1-in.-glass-hexagon-mosaic-100899822.html"
        ]
        
# Scrape only the urls above; urls that fail are returned in skipped_urls.
all_products_info, skipped_urls = crawler.Scraping_all_products(urls)

Getting page content...
************** Crawling product num 0 is finished  **************
Getting page content...
************** Crawling product num 1 is finished  **************
Getting page content...
************** Crawling product num 2 is finished  **************


In [11]:
# The scraped products are keyed by product title.
all_products_info.keys()

dict_keys(['Ivory Glass Tile', 'Harbour Island Polished Linear Mosaic', 'Rhea 1 in. Glass Hexagon Mosaic'])

In [12]:
# Display the full scraped record of every product.
all_products_info

{'Harbour Island Polished Linear Mosaic': {'SKU': '100268952',
  'brand': 'Montage',
  'category': 'shop by material',
  'discription': '\nLiven up any room with this linear 12 x 12 Harbour Island Polished Linear Mosaic in blue.The long lines of a linear shaped glass decorative can make a small room seem bigger.Decorative accents can be used to enhance kitchens, bathrooms, and other areas of your home. A stunning glass or stone backsplash adds unique detail to any design. The wide range of shapes and colors found in our glass presents a great way to express creativity.This product can be installed on a shower wall.\n',
  'images urls': ['https://i8.amplience.net/i/flooranddecor/100268952_harbour-island-polished-linear-mosaic_display?fmt=auto',
   'https://i8.amplience.net/i/flooranddecor/100268952_harbour-island-polished-linear-mosaic_1?fmt=auto',
   'https://i8.amplience.net/i/flooranddecor/100268952_context?fmt=auto'],
  'main category': 'decoratives',
  'price': '$12.99 / piece',
  

# Scraping all products

In [None]:
crawler = floor_and_decoor_Scraper(hdr = {'User-Agent': 'Mozilla/5.0'}, domain = "https://www.flooranddecor.com")
products_urls = crawler.get_all_products_urls()
# Scrape the de-duplicated list returned above, so no product page is fetched twice.
# Urls that weren't scraped successfully are saved in skipped_urls and can be re-scraped later.
all_products_info, skipped_urls = crawler.Scraping_all_products(products_urls)