In [454]:
class AmazonScrapper:
    """Scrape Amazon search-result pages through an already running Chrome.

    The page HTML is fetched with Selenium, parsed with BeautifulSoup and kept
    in ``self.src``. The ``filter_asin_by_*`` methods share one convention:
    they return ``1`` when the product passes the check and ``0`` when it is
    rejected.

    Third-party modules are imported inside the methods that need them, so the
    class does not rely on names that happen to exist in the notebook kernel.
    """

    def __init__(self, base_url, paging_url, 
                 driver_path='C:/Users/Jimoh/Documents/ML projects/Scraping/chromedriver_win32/chromedriver.exe', 
                 brand_list_path = 'C:/Users/Jimoh/Documents/GitHub/Scraper/brandList.csv'):
        """Store the configuration and turn ``paging_url`` into a template.

        Args:
            base_url: URL of the first results page (loaded for page 1).
            paging_url: A sample URL of a later results page; it must contain
                a ``page=<number>`` query parameter.
            driver_path: Path to the chromedriver executable.
            brand_list_path: CSV file whose column ``'0'`` holds brand names.

        Raises:
            ValueError: If ``paging_url`` has no ``page=<number>`` parameter.
        """
        self.driver_path = driver_path
        self.brand_list_path = brand_list_path
        self.base_url = base_url
        self.paging_url = paging_url
        self.proc_paging_url()

    def proc_paging_url(self):
        """Rewrite ``self.paging_url`` into a ``str.format`` template.

        The ``qid=<digits>`` parameter is dropped, and the page number in
        ``page=<digits>`` and ``sr_pg_<digits>`` is replaced by ``{}``.
        Multi-digit page numbers are handled, and a URL without ``qid`` or
        ``sr_pg_`` is accepted.

        Raises:
            ValueError: If the URL contains neither ``page=<digits>`` nor an
                already substituted ``page={}``.
        """
        import re

        url = re.sub(r'qid=\d+&*', '', self.paging_url)
        url, page_hits = re.subn(r'page=\d+', 'page={}', url)
        # 'page={}' is tolerated so that calling this method twice is harmless.
        if not page_hits and 'page={}' not in url:
            raise ValueError(
                "paging_url must contain a 'page=<number>' parameter: {!r}".format(self.paging_url)
            )
        self.paging_url = re.sub(r'sr_pg_\d+', 'sr_pg_{}', url)

    @property
    def brand_list(self):
        """DataFrame read from ``brand_list_path``, indexed by its ``'0'`` column.

        The CSV is re-read on every access.
        """
        import pandas as pd

        return pd.read_csv(self.brand_list_path).set_index('0')

    def get_paging_url(self, page_no):
        """Return the paging URL with ``page_no`` filled into the template."""
        return self.paging_url.format(page_no, page_no)

    def set_src(self, page_no=1):
        """Load one results page and store its parsed HTML in ``self.src``.

        Attaches to a Chrome instance at ``localhost:9222``. Page 1 loads
        ``base_url``; any larger page number loads the formatted paging URL.
        For other values no navigation happens and the browser's current page
        is parsed.

        Args:
            page_no: 1-based number of the results page to load.
        """
        from bs4 import BeautifulSoup
        from selenium import webdriver
        from selenium.webdriver.chrome.options import Options

        options = Options()
        options.add_experimental_option('debuggerAddress', 'localhost:9222')

        # NOTE(review): `executable_path` is not accepted by recent Selenium 4
        # releases; switch to a `Service` object if the installed version
        # rejects it.
        driver = webdriver.Chrome(options=options, executable_path=self.driver_path)
        try:
            if page_no == 1:
                driver.get(self.base_url)
            elif page_no > 1:
                driver.get(self.get_paging_url(page_no))

            self.src = BeautifulSoup(driver.page_source, features='html.parser')
        finally:
            # Release the driver even when loading or parsing fails.
            driver.quit()

    @property
    def get_product_panes(self):
        """All ``div`` elements of ``self.src`` marked as search results."""
        return self.src.find_all('div', {'data-component-type': 's-search-result'})

    def get_product_asin(self, product_pane):
        """Return the ``data-asin`` attribute of a product pane."""
        return product_pane.attrs['data-asin']

    def get_product_desc(self, product_pane):
        """Return the stripped text of the pane's title link (``h2 > a``)."""
        return product_pane.h2.a.text.strip()

    def get_product_link(self, product_pane):
        """Return the absolute amazon.com URL of the pane's title link."""
        return 'https://www.amazon.com' + product_pane.h2.a['href']

    def get_helium_pane(self, product_pane):
        """Return the first ``div`` nested in the pane's ``bsr-...`` container.

        The container is the first ``div`` whose ``id`` matches ``bsr-``
        followed by word characters.
        NOTE(review): presumably this markup is injected by a browser
        extension (Helium 10?) -- confirm. A pane without it raises
        ``AttributeError`` here.
        """
        import re

        return product_pane.find('div', {'id': re.compile(r'bsr-\w+')}).div

    def filter_asin_by_allowed_brands(self, product_pane):
        """Check the first word of the product title against the brand list.

        The comparison is a case-insensitive substring test against every
        entry of the brand-list index.

        Returns:
            ``0`` if the first title word occurs in any brand-list entry,
            otherwise ``1`` (also when the title is empty).
        """
        words = self.get_product_desc(product_pane).split()
        if not words:
            return 1

        brand = words[0].casefold()
        # Iterate the index: after `set_index('0')` the brand names live
        # there, while iterating the DataFrame itself would yield column names.
        for entry in self.brand_list.index:
            if brand in str(entry).casefold():
                return 0
        return 1

    def filter_asin_by_seller_count(self, product_pane, count = 1):
        """Reject products that have more than ``count`` sellers.

        The number is read from the leading token of the second link in the
        second ``div`` of the helium pane's second top-level section.

        Returns:
            ``0`` if the parsed seller count exceeds ``count``, else ``1``.
        """
        helium_pane = self.get_helium_pane(product_pane)
        detail_section = helium_pane.find_all('div', recursive=False)[1]
        seller_link = detail_section.find_all('div')[1].find_all('a')[1]
        seller_count = int(seller_link.text.split(' ')[0].replace(',', ''))
        return 0 if seller_count > count else 1

    def filter_asin_by_brn_no(self, product_pane, max_rank=150000):
        """Reject products whose rank exceeds ``max_rank`` in any category.

        Each ``div`` in the helium pane's first top-level section is expected
        to start with a token such as ``#12,265``.

        Args:
            product_pane: Search-result element to inspect.
            max_rank: Highest rank that is still accepted.

        Returns:
            ``0`` as soon as one rank is above ``max_rank``, else ``1``.
        """
        helium_pane = self.get_helium_pane(product_pane)
        rank_rows = helium_pane.find_all('div', recursive=False)[0].find_all('div')
        for row in rank_rows:
            # Drop the leading '#' and the thousands separators.
            rank = int(row.text.split()[0][1:].replace(',', ''))
            if rank > max_rank:
                return 0
        return 1

    def filter_asin_by_price(self, product_pane):
        """Placeholder: price filtering is not implemented yet."""
        pass

    def save_result(self):
        """Placeholder: saving results is not implemented yet."""
        pass

    def apply_filters(self):
        """Placeholder: running all filters is not implemented yet."""
        pass


In [455]:
# First results page of the search; passed to AmazonScrapper as `base_url`.
base = (
    'https://www.amazon.com/s'
    '?bbn=16225007011&dc&i=computers-intl-ship'
    '&k=-thfjfd%20keyboards&qid=1626710483&ref=glow_cls&refresh=1'
    '&rh=p_6%3AATVPDKIKX0DER%2Cp_36%3A2000-'
    '&rnid=386442011&s=relevancerank'
)

In [456]:
# Sample URL of results page 2; AmazonScrapper derives its paging template from it.
page = (
    'https://www.amazon.com/s'
    '?k=-thfjfd+keyboards&i=computers-intl-ship&bbn=16225007011'
    '&rh=p_6%3AATVPDKIKX0DER%2Cp_36%3A2000-'
    '&s=relevancerank&dc&page=2&qid=1626759178'
    '&refresh=1&rnid=386442011&ref=sr_pg_2'
)

In [457]:
# Build the scraper with the default driver and brand-list paths.
scraper = AmazonScrapper(base_url=base, paging_url=page)

In [458]:
# Load search-results page 30 and collect its product panes. Requires a Chrome
# instance reachable at localhost:9222 (the address `set_src` attaches to).
scraper.set_src(30)
panes = scraper.get_product_panes

In [459]:
# `panes` must already be in the kernel: it is the list returned by
# `scraper.get_product_panes` after a `scraper.set_src(...)` call.
pane = panes[10]

In [460]:
# Title text of the selected pane.
scraper.get_product_desc(product_pane=pane)

'Razer Blade 15 Base Gaming Laptop 2020: Intel Core i7-10750H 6-Core, NVIDIA GeForce GTX 1660 Ti, 15.6" FHD 1080p 144Hz, 16GB RAM, 256GB SSD, CNC Aluminum, Chroma RGB Lighting, Thunderbolt 3, Black'

In [461]:
# Inner div of the pane's `bsr-...` container (ranks and seller details).
scraper.get_helium_pane(product_pane=pane)

<div class="sc-lahOFi hyIXZ"><div class="sc-dhWmbD fDsoTu"><div class="sc-DhIVs gOgEBp">#12,265 in <span><a class="sc-iawIMh bZcxwc" href="/gp/bestsellers/pc/ref=pd_zg_ts_pc">Computers &amp; Accessories</a></span></div><div class="sc-DhIVs gOgEBp">#1,690 in <span><a class="sc-iawIMh bZcxwc" href="/gp/bestsellers/pc/13896615011/ref=pd_zg_hrsr_pc">Traditional Laptop Computers</a></span></div></div><div class="sc-iQTVEV kcoRTK"><div class="sc-dqFsfS cxdBhe"><span class="sc-jMZYIo"><strong>ASIN:</strong>B086MGY9TZ</span><span class="sc-bhrzfF iwbQlR"><svg class="sc-cdxCiY ezvtPS" viewbox="0 0 32 32" xmlns="http://www.w3.org/2000/svg"><g fill="none" fill-rule="evenodd"><path d="M2 10h16a2 2 0 012 2v16a2 2 0 01-2 2H2a2 2 0 01-2-2V12a2 2 0 012-2zm0 4v14h16V14H2z" fill="#0081ff"></path><path d="M18 20h12V4H14v10h-2V4a2 2 0 012-2h16a2 2 0 012 2v16a2 2 0 01-2 2H18v-2z" fill="#0081ff"></path><path d="M0 0h32v32H0z"></path></g></svg>Show</span></div><div class="sc-dqFsfS cxdBhe"><a class="sc-cuzpc

In [462]:
# Absolute amazon.com URL of the selected product.
scraper.get_product_link(product_pane=pane)

'https://www.amazon.com/Razer-Blade-Base-Gaming-Laptop/dp/B086MGY9TZ/ref=sr_1_707?dchild=1&keywords=-thfjfd+keyboards&m=ATVPDKIKX0DER&qid=1626813133&refinements=p_6%3AATVPDKIKX0DER%2Cp_36%3A2000-&refresh=1&rnid=386442011&s=computers-intl-ship&sr=1-707'

In [463]:
# Brand check on the first title word: 1 = kept, 0 = rejected.
scraper.filter_asin_by_allowed_brands(product_pane=pane)

1

In [464]:
# Rank check: 1 = every listed rank is within the limit, 0 = rejected.
scraper.filter_asin_by_brn_no(product_pane=pane)

1

In [473]:
# Seller-count check with at most 2 sellers allowed: 1 = kept, 0 = rejected.
scraper.filter_asin_by_seller_count(product_pane=pane, count=2)

0

In [48]:
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd

In [11]:
opts = Options()

In [12]:
driver_path='C:/Users/Jimoh/Documents/ML projects/Scraping/chromedriver_win32/chromedriver.exe'
log_path='C:/Users/Jimoh/Documents/GitHub/Scraper'

In [20]:
opts.add_experimental_option('debuggerAddress', 'localhost:9222')

In [26]:
driver = webdriver.Chrome(executable_path= driver_path, options= opts)

In [27]:
driver.get('https://www.amazon.com')

In [30]:
data = bs(driver.page_source)

In [46]:
product_panes = data.find_all('div', {'data-component-type':'s-search-result'})

In [39]:
product_pane.attrs['data-asin']

'B08Z85TSJ3'

In [77]:
# brand_list = []
# with open('C:/Users/Jimoh/Downloads/brainz - Sheet2.csv', 'r') as f:
#     reader = csv.reader(f)
#     for i in reader:
#         brand_list += i

In [84]:
# brand_list = list(set(brand_list))
# brand_list.remove('')

# ser = pd.Series(brand_list).str.strip()
# ser.to_csv('C:/Users/Jimoh/Documents/GitHub/Scraper/brandList.csv', index= False)

# pd.read_csv('C:/Users/Jimoh/Documents/GitHub/Scraper/brandList.csv').iloc[:,0].to_list()

[<div class="sg-col-4-of-12 s-result-item s-asin sg-col-4-of-16 sg-col sg-col-4-of-20" data-asin="B0148NNKTC" data-component-id="38" data-component-type="s-search-result" data-index="0" data-uuid="25ab8f75-0403-456c-a912-5eb37c25730b"><div class="sg-col-inner">
 <span cel_widget_id="MAIN-SEARCH_RESULTS-0" class="celwidget slot=MAIN template=SEARCH_RESULTS widgetId=search-results">
 <div class="s-expand-height s-include-content-margin s-latency-cf-section {{ borderCssClass }}"><div id="bsr-B0148NNKTC"><div class="sc-lahOFi hyIXZ"><div class="sc-dhWmbD fDsoTu"><div class="sc-DhIVs gOgEBp">#18 in <span><a class="sc-iawIMh bZcxwc" href="/gp/bestsellers/electronics/ref=pd_zg_ts_electronics">Electronics</a></span></div><div class="sc-DhIVs gOgEBp">#1 in <span><a class="sc-iawIMh bZcxwc" href="/gp/bestsellers/electronics/1292115011/ref=pd_zg_hrsr_electronics">Computer Monitors</a></span></div></div><div class="sc-iQTVEV kcoRTK"><div class="sc-dqFsfS cxdBhe"><span class="sc-jMZYIo"><strong>ASI

In [485]:
# Kernel sanity check.
print('nice')

nice


In [320]:
%time
pd.read_csv(brand_list_path).set_index('0')#.filter(like= 'A', axis= 'index').index#.iloc[:,0]#.to_list()

Wall time: 0 ns


DANYA B
CAMP CHEF
MEXX
PLAYMOBILÂ®
ANGELA LAFRAMBOISE
...
VIZIR
GRAMINEX
CHROME WHEELS
FROZEN (DISNEY)
DISCREET
