<a href="https://colab.research.google.com/github/Ghonem22/Action-Detection/blob/master/Scraping_floordecor_products_data_(x).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [64]:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import time
from numpy.random import uniform
import pandas as pd


class url_domain_validation:
        
    # validate if the url start with the domain, if notL add the domain
    def validate_url_domain(self, url, domain):
        if not url.startswith("http"):
            url = domain + url
        return url



class get_sub_departments_urls(url_domain_validation):
    '''
    getting all the sub departments urls and all products urls    
    '''
    
    def __init__(self, hdr = {'User-Agent': 'Mozilla/5.0'}, domain = "https://www.flooranddecor.com", page_departments= ['/tile', '/stone', '/wood', '/laminate', '/vinyl',
                                          '/decoratives', '/installation-materials']):
        
        self.page_departments = page_departments
        self.hdr = hdr
        self.domain = domain

    # Crawl all the urls of the sub-departments under the departments we defined.  
    def get_all_sub_departments_urls(self):
        '''
        Take departments' names as a list, and scrape all the sub-departments' urls 
        '''
        sub_departments_urls_list = []

        for deparement in self.page_departments:
            deparement_url = self.validate_url_domain(deparement, self.domain)
            print(deparement_url)
            req = Request(deparement_url, headers= self.hdr)
            text = urlopen(req).read()
            soup = BeautifulSoup(text,"lxml")
            urls = soup.find_all("a", attrs={"class":'clp-link'})

            for url in urls:
                validated_url = self.validate_url_domain(url['href'], self.domain)
                sub_departments_urls_list.append(validated_url)

            print("******** num of scrapped sub-departments till now is {}".format(len(sub_departments_urls_list)))
            time.sleep(uniform(40, 60))

        return sub_departments_urls_list


    def get_sub_department_products_urls(self, url):
        '''
        take sub_department url as an input, and return all the products'urls it contains
        '''
        products_urls = []
        req = Request(url, headers= self.hdr)
        text = urlopen(req).read()
        soup = BeautifulSoup(text,"lxml")
        urls = soup.find_all("a", attrs={"class":'b-product_tile-figure_link'})
        
        for url in urls:
            validated_url = self.validate_url_domain(url['href'], self.domain)

            try:
                products_urls.append(validated_url)
            except:
                print("err occured while scraping:  {}".format(url))

        # print("******** num of scrapped products'urls till now is {}".format(len(products_urls)))
        time.sleep(uniform(20, 30))

        return products_urls    



class get_product_info:
    '''
    using product url, getting its information 
    '''
    def __init__(self, product_url, hdr = {'User-Agent': 'Mozilla/5.0'}):
        self.product_url = product_url
        self.hdr = hdr
        self._soup = None

    @property
    def soup(self):
        if not self._soup:
            print("Getting page content...")
            req = Request(self.product_url, headers= self.hdr)
            text = urlopen(req).read()
            self._soup = BeautifulSoup(text,"lxml")

        return self._soup

    def get_images(self):
        images = self.soup.find_all("figure", attrs={"class":'b-pdp_thumbnail-figure'})
        images_urls = []
        for image in images:
            try:
                image_url = image.find("img", attrs = {"class":"b-pdp_thumbnail-figure_img"})['data-src']
                images_urls.append(image_url)
            except:
                pass
        return images_urls


    def get_price(self):
        try:
            price = self.soup.find_all("span", attrs={"class":'b-pdp_price-cost'})[0].text
        except:
            price = ''

        return price


    def get_specifications(self):

        product_properties = self.soup.find_all("section", attrs={"class":'b-pdp_specifications-container'})

        '''
        specifications is devided into three sections, we here scraped each section and saved them together under
        the umberella of "specifications"
        '''
        specifications = {"DIMENSIONS":{},"DETAILS":{},"INSTALLATION & WARRANTY":{}  }
        groups = ["DIMENSIONS", "DETAILS", "INSTALLATION & WARRANTY", ""]
        
        for i, product_property in enumerate(product_properties):
            peoperties = product_property.find_all("span", attrs={"class":'b-pdp_specifications-name'})
            peoperties_val = product_property.find_all("span", attrs={"class":'b-pdp_specifications-number'})

            for pro, pro_val in zip(peoperties,peoperties_val):
                try:
                    if specifications[groups[i]].get(pro.string.strip()):
                        specifications[groups[i]][pro.string.strip()] = [specifications[groups[i]][pro.string.strip()]]
                        specifications[groups[i]][pro.string.strip()].append(pro_val.string.strip())

                    else:
                        specifications[groups[i]][pro.string.strip()] = pro_val.string.strip()
                except:
                    key = pro.text.split('\n')[1].strip()
                    if specifications[groups[i]].get(key):
                        specifications[groups[i]][key] = [specifications[groups[i]][key]]
                        specifications[groups[i]][key].append(pro_val.string.strip())

                    else:
                        specifications[groups[i]][key] = pro_val.string.strip()

        return specifications

    def  get_categories(self):

        try:
            categories = self.soup.find_all("a", attrs={"class":'b-breadcrumbs-item_link'})

            main_category = str(categories[1].string).replace('\n\t','').strip()
            category = str(categories[2].string).replace('\n\t','').strip()
            sub_category = str(categories[3].string).replace('\n\t','').strip()
        except:
            main_category = ''
            category = ''
            sub_category = ''

        return main_category, category, sub_category


    def get_title(self):
        try:
            properties = self.soup.find("h1", attrs={"class":'b-pdp_title-name'})
            title = properties.text
        except:
            title = ''
        return title

    def get_SKU_and_size(self):
        try:
            properties = self.soup.find_all("span", attrs={"class":'b-pdp_details-element_value'})
            SKU = properties[0].text
            size = properties[1].text
        except:
            SKU = '' 
            size = ''

        return SKU, size

    def get_brand(self):
        properties = self.soup.find("img", attrs={"class":'b-pdp_title-brand_img'})
        try:
            brand = properties.get('alt')
        except:
            brand = ''
        return brand

    def get_discription(self):
        properties = self.soup.find("div", attrs={"class":'b-pdp_specifications-txt'})
        try:
            discription = properties.text
        except:
            discription = ''
        return discription



class floor_and_decoor_Scraper:
    '''
    The website is devided into 7 department, each one is devided into multiple sub-departments.
    So, out methodology is based on: 
    
    1. scraping all the sub_departments urls using composition with "get_sub_departments_urls" class.
    2. itrate over all the sub_departments urls and scrape all the products' urls using a method through the same relationship.
    3. using  the composition relation with "get_product_info" class, we will itrate over all the products' urls
        and exrtact their information.
    '''

    def __init__(self, hdr = {'User-Agent': 'Mozilla/5.0'}, domain = "https://www.flooranddecor.com", departments= ['/tile', '/stone', '/wood', '/laminate', '/vinyl',
                                          '/decoratives', '/installation-materials']):
        self.hdr = hdr
        self.domain = domain
        self.sub_departments_urls_scrapper = get_sub_departments_urls(page_departments= departments)
        self.products_urls = []         # to save all products urls       
        self.skipped_urls = []
        self.all_products = {}
        self.skipped_scrapping_urls = []

    def get_all_products_urls(self):
        '''
        This method work as a manager for "get_sub_departments_urls" class, it used the benifits we got using composition to get all departments urls,
        and then itrate over them to get all the products' urls.
        '''
        skipped_urls = []
        department_urls = self.sub_departments_urls_scrapper.get_all_sub_departments_urls()

        # itrate over all main catefories urls
        for category_index, sub_department_url in enumerate(department_urls):
            try:
                sub_catefory_url = '{}?start=0&sz=5000'.format(sub_department_url)
                self.products_urls.extend(self.sub_departments_urls_scrapper.get_sub_department_products_urls(sub_catefory_url))
                print("************** Crawling department num {} is finished with total crawled urls:  {}  **************".format(category_index,len(list(set(self.products_urls)))))
                time.sleep(uniform(40, 70))

            except:
                print("there is an error occured when we tried to crawl: {}".format(sub_catefory_url))
                self.skipped_urls.append(sub_catefory_url)
                time.sleep(uniform(200, 300))

        return list(set(self.products_urls)), self.skipped_urls


    def Scraping_all_products(self, products_urls= None):

        all_products = {}
        skipped_urls = []
        if not products_urls:
            products_urls = self.products_urls

        for product_index, product_url in enumerate(products_urls):

            try:
                product = get_product_info(product_url = product_url)       # Composition

                product_title = product.get_title()
                product_id = product_url.split('-')[-1].split('.')[0]
                try:
                    main_key = product_title
                    self.all_products[main_key] = {}
                except:
                    main_key = product_title 
                    self.all_products[main_key] = {}
                    
                self.all_products[main_key]["url"] = product_url    
                self.all_products[main_key]["product id"] = product_id
                self.all_products[main_key]["images urls"] = product.get_images()
                self.all_products[main_key]["price"] = product.get_price()
                self.all_products[main_key]["specifications"] = product.get_specifications()

                main_category, category, sub_category = product.get_categories()
                self.all_products[main_key]["main category"] = main_category
                self.all_products[main_key]["category"] = category
                self.all_products[main_key]["sub category"] = sub_category

                SKU, size = product.get_SKU_and_size()
                self.all_products[main_key]["SKU"] = SKU
                self.all_products[main_key]["size"] = size

                self.all_products[main_key]["brand"] = product.get_brand()

                self.all_products[main_key]["discription"] = product.get_discription()
                time.sleep(uniform(60, 120))

                print("************** Crawling product num {} is finished  **************".format(product_index))
                if product_index % 3 == 0:
                    time.sleep(uniform(100, 150))

                if product_index % 40 == 0:
                    scrapped_dataframe = pd.DataFrame.from_dict(self.all_products)
                    scrapped_dataframe.to_csv('Scrapped_floor_products_data_1_{}.csv'.format(product_index), index=True)

                    skipped_dataframe = pd.DataFrame.from_dict(self.skipped_scrapping_urls)
                    skipped_dataframe.to_csv('Skipped_floor_products_data_1_{}.csv'.format(product_index), index=False)

            except:
                print("there is an error occured when we tried to crawl: {}".format(product_url))
                self.skipped_scrapping_urls.append(product_url)
                time.sleep(uniform(150, 250))

        return self.all_products, self.skipped_scrapping_urls

In [59]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [60]:
import os
%cd /content/drive/MyDrive
os.mkdir("floor")

%cd floor

/content/drive/MyDrive
/content/drive/MyDrive/floor


In [61]:
# create scraper object
crawler = floor_and_decoor_Scraper(hdr = {'User-Agent': 'Mozilla/5.0'}, domain = "https://www.flooranddecor.com")


In [62]:
import pandas as pd
df = pd.read_csv("/content/all_products_urls.csv")
products_utls = list(df['urls'])
len(products_utls)

4575

In [None]:
# here we saved any urls that weren't scrapped successfully in skipped_urls, we can re-scrape them later
all_products_info, skipped_urls = crawler.Scraping_all_products(products_utls[0:300])

Getting page content...
************** Crawling product num 0 is finished  **************
Getting page content...
************** Crawling product num 1 is finished  **************
Getting page content...
************** Crawling product num 2 is finished  **************
Getting page content...
************** Crawling product num 3 is finished  **************
Getting page content...
Getting page content...
there is an error occured when we tried to crawl: https://www.flooranddecor.com//schluter-installation-materials/schluter-kerdi-shower-kit-38in.-x-60in.-offset-abs-flange-2-100606896.html
Getting page content...
Getting page content...
there is an error occured when we tried to crawl: https://www.flooranddecor.com//porcelain-tile/atlas-blue-matte-porcelain-tile-100884501.html
Getting page content...
Getting page content...
there is an error occured when we tried to crawl: https://www.flooranddecor.com//duralux-performance-flooring/concrete-americano-rigid-core-luxury-vinyl-plank---foam

In [49]:
scrapped_dataframe = pd.DataFrame.from_dict(all_products_info)
scrapped_dataframe.to_csv('floor_products_data_1_{}.csv'.format(10), index=True)


In [51]:
import pandas as pd
data = pd.read_csv("/content/floor_products_data_1_10.csv", index_col=0)
data


Unnamed: 0,Calacatta Bluette I Marble Base Molding,Goldblatt 3in. Spring Steel Joint Knife,Schluter Bara-Rw Balcony Edge 1-3/16in. Aluminum Black Brown
url,https://www.flooranddecor.com//finishing-piece...,https://www.flooranddecor.com//installation-to...,https://www.flooranddecor.com//tile-metal-trim...
product id,100885854,100523778,100521061
images urls,['https://i8.amplience.net/i/flooranddecor/100...,['https://i8.amplience.net/i/flooranddecor/100...,['https://i8.amplience.net/i/flooranddecor/100...
price,$18.99 / piece,$2.79 / piece,$54.73 / piece
specifications,"{'DIMENSIONS': {'Size': '5 x 12', 'Product Len...","{'DIMENSIONS': {'Product Length': '9.5', 'Prod...","{'DIMENSIONS': {'Size': '1 3/16in.', 'Product ..."
main category,stone,installation materials,installation materials
category,shop by type,shop tile & stone installation,shop tile & stone installation
sub category,finishing pieces,installation tools,metal trims & transitional pieces
SKU,100885854,,100521061
size,5 x 12,,1 3/16in.


## Test the code with one department

In [None]:
crawler = floor_and_decoor_Scraper(hdr = {'User-Agent': 'Mozilla/5.0'}, domain = "https://www.flooranddecor.com", departments=  ['/tile', '/stone', '/wood'] )
products_urls, skipped_urls = crawler.get_all_products_urls()


https://www.flooranddecor.com/tile
******** num of scrapped sub-departments till now is 36
https://www.flooranddecor.com/stone
******** num of scrapped sub-departments till now is 66
https://www.flooranddecor.com/wood
******** num of scrapped sub-departments till now is 99
************** Crawling department num 0 is finished with total crawled urls:  2000  **************
************** Crawling department num 1 is finished with total crawled urls:  2267  **************
************** Crawling department num 2 is finished with total crawled urls:  2638  **************
************** Crawling department num 3 is finished with total crawled urls:  2641  **************
************** Crawling department num 4 is finished with total crawled urls:  2647  **************
************** Crawling department num 5 is finished with total crawled urls:  2655  **************
************** Crawling department num 6 is finished with total crawled urls:  2659  **************
************** Crawling de

NameError: ignored

In [None]:
len(products_urls)

3462

In [None]:
products_urls

NameError: ignored

## Testing scraping products information using their urls

In [None]:
crawler = floor_and_decoor_Scraper()

urls = ["https://www.flooranddecor.com/glass-decoratives/ivory-glass-tile-100465673.html",
        "https://www.flooranddecor.com/glass-decoratives/harbour-island-polished-linear-mosaic-100268952.html",
        "https://www.flooranddecor.com/glass-tile/rhea-1-in.-glass-hexagon-mosaic-100899822.html"
        ]
        
all_products_info, skipped_urls = crawler.Scraping_all_products(urls)

Getting page content...
************** Crawling product num 0 is finished  **************
Getting page content...
************** Crawling product num 1 is finished  **************
Getting page content...
************** Crawling product num 2 is finished  **************


In [None]:
all_products_info.keys()

dict_keys(['Ivory Glass Tile', 'Harbour Island Polished Linear Mosaic', 'Rhea 1 in. Glass Hexagon Mosaic'])

In [None]:
all_products_info['Ivory Glass Tile']

{'SKU': '100465673',
 'brand': 'Pure',
 'category': 'shop by material',
 'discription': '\nAdd an accent to your design with this 3 x 9 Ivory Glass Tile with a polished or high gloss finish.Straight and perfectly cut-to-size edges of this decorative create continuity.Decorative accents can be used to enhance kitchens, bathrooms, and other areas of your home. A stunning glass backsplash adds unique detail to any design. The wide range of shapes and colors found in our glass presents a great way to express creativity.This product can be installed on a shower wall.\n',
 'images urls': ['https://i8.amplience.net/i/flooranddecor/100465673_ivory-glass-tile_display?fmt=auto',
  'https://i8.amplience.net/i/flooranddecor/100465673_ivory-glass-tile_1?fmt=auto',
  'https://i8.amplience.net/i/flooranddecor/100465673_context?fmt=auto'],
 'main category': 'decoratives',
 'price': '$1.69 / piece',
 'product id': '100465673',
 'size': '3 x 9',
 'specifications': {'DETAILS': {'Color': 'White',
   'Edge

In [None]:
all_products_info

{'Harbour Island Polished Linear Mosaic': {'SKU': '100268952',
  'brand': 'Montage',
  'category': 'shop by material',
  'discription': '\nLiven up any room with this linear 12 x 12 Harbour Island Polished Linear Mosaic in blue.The long lines of a linear shaped glass decorative can make a small room seem bigger.Decorative accents can be used to enhance kitchens, bathrooms, and other areas of your home. A stunning glass or stone backsplash adds unique detail to any design. The wide range of shapes and colors found in our glass presents a great way to express creativity.This product can be installed on a shower wall.\n',
  'images urls': ['https://i8.amplience.net/i/flooranddecor/100268952_harbour-island-polished-linear-mosaic_display?fmt=auto',
   'https://i8.amplience.net/i/flooranddecor/100268952_harbour-island-polished-linear-mosaic_1?fmt=auto',
   'https://i8.amplience.net/i/flooranddecor/100268952_context?fmt=auto'],
  'main category': 'decoratives',
  'price': '$12.99 / piece',
  

# Scraping all products

In [10]:
# create scraper object
crawler = floor_and_decoor_Scraper(hdr = {'User-Agent': 'Mozilla/5.0'}, domain = "https://www.flooranddecor.com")


## if you want to start scraping from zero point, uncomment this cell and run it

In [None]:
# if we want to scrape all the products'urls by ourselves again
products_urls, skipped_urls = crawler.get_all_products_urls()

# all_products_info, skipped_urls = crawler.Scraping_all_products()

https://www.flooranddecor.com/tile
******** num of scrapped sub-departments till now is 36
https://www.flooranddecor.com/stone
******** num of scrapped sub-departments till now is 66
https://www.flooranddecor.com/wood
******** num of scrapped sub-departments till now is 99
https://www.flooranddecor.com/laminate
******** num of scrapped sub-departments till now is 121
https://www.flooranddecor.com/vinyl
******** num of scrapped sub-departments till now is 151
https://www.flooranddecor.com/decoratives
******** num of scrapped sub-departments till now is 179


KeyboardInterrupt: ignored

In [None]:
products_urls

NameError: ignored

# Save results

In [None]:
import pandas as pd
df = pd.DataFrame(products_urls, columns=["urls"])
df.to_csv('floor_products_urls1.csv', index=False)

## I've scraped all the products url's, we can read them dirctly from the csv file instead of scraping them again.

In [5]:
import pandas as pd
df = pd.read_csv("/content/all_products_urls.csv")
products_utls = list(df['urls'])
len(products_utls)

4575

In [1]:

# here we saved any urls that weren't scrapped successfully in skipped_urls, we can re-scrape them later
all_products_info, skipped_urls = crawler.Scraping_all_products(products_utls[:300])

NameError: ignored

In [6]:
all_products_info

{'Calacatta Bluette I Marble Base Molding': {'SKU': '100885854',
  'brand': 'Viviano',
  'category': 'shop by type',
  'discription': '\nAdd a gorgeous accent to your next project with this 5 x 12 Calacatta Bluette I Marble Base Molding.Add interest to your decorative design with an elaborate or simple border. Borders add elegance and are used to frame a tiled area.\n',
  'images urls': ['https://i8.amplience.net/i/flooranddecor/100885854_calacatta-bluette-i-marble-base-molding_1?fmt=auto',
   'https://i8.amplience.net/i/flooranddecor/100885854_2?fmt=auto'],
  'main category': 'stone',
  'price': '$18.99 / piece',
  'product id': '100885854',
  'size': '5 x 12',
  'specifications': {'DETAILS': {'Color': 'Blue',
    'Color Variation': '3',
    'Country of Origin': 'Italy',
    'Edge': 'Straight or Rectified',
    'Finish': 'Polished or High Gloss',
    'Material': 'Marble',
    'Water Resistance': 'Water Resistant (with proper sealant)'},
   'DIMENSIONS': {'Box Length': '12.200',
    'B

In [7]:
new = pd.DataFrame.from_dict(all_products_info)

In [8]:
new.head()

Unnamed: 0,Calacatta Bluette I Marble Base Molding,Goldblatt 3in. Spring Steel Joint Knife,Schluter Bara-Rw Balcony Edge 1-3/16in. Aluminum Black Brown,Schluter Kerdi-Shower Side Linear Tray 39in. x 39in.,Schluter Kerdi-Shower Kit 38in. x 60in. Offset ABS Flange 2
url,https://www.flooranddecor.com//finishing-piece...,https://www.flooranddecor.com//installation-to...,https://www.flooranddecor.com//tile-metal-trim...,https://www.flooranddecor.com//schluter-instal...,https://www.flooranddecor.com//schluter-instal...
product id,100885854,100523778,100521061,100583764,100606896
images urls,[https://i8.amplience.net/i/flooranddecor/1008...,[https://i8.amplience.net/i/flooranddecor/1005...,[https://i8.amplience.net/i/flooranddecor/1005...,[https://i8.amplience.net/i/flooranddecor/1005...,[https://i8.amplience.net/i/flooranddecor/1006...
price,$18.99 / piece,$2.79 / piece,$54.73 / piece,$187.31 / piece,$537.95 / piece
specifications,"{'DIMENSIONS': {'Size': '5 x 12', 'Product Len...","{'DIMENSIONS': {'Product Length': '9.5', 'Prod...","{'DIMENSIONS': {'Size': '1 3/16in.', 'Product ...","{'DIMENSIONS': {'Size': '39in. X 39in.', 'Prod...","{'DIMENSIONS': {'Size': '38in. x 60in.', 'Prod..."
