In [10]:
from bs4 import BeautifulSoup as bs
import requests
import random
import time
import re
import pandas as pd

In [2]:
def get_brands(response):
    """Collect the brand filter links from a search results page.

    Args:
        response: HTTP response whose ``content`` is the HTML of the
            search results page.

    Returns:
        dict mapping each brand name (text of the brand ``span``) to the
        ``href`` of the navigation link at the same position. An empty dict
        is returned when the page has no ``brandsRefinements`` container.
    """
    page_soup = bs(response.content, "lxml")
    refinements = page_soup.find("div", {"id": "brandsRefinements"})

    # find() returns None when the container is absent; bail out instead of
    # failing with AttributeError on the lookups below.
    if refinements is None:
        print('missing brand refinements')
        return {}

    brands = refinements.find_all("span", {"class": "a-size-base a-color-base"})
    links = refinements.find_all("a", {"class": "a-link-normal s-navigation-item"})

    # Names and links are paired by position; zip stops at the shorter list,
    # so a count mismatch no longer raises IndexError.
    return {brand.text: link['href'] for brand, link in zip(brands, links)}
    

In [3]:
def get_product_name(product):
    """Return the title text of a product listing, or 'N/A' if it is missing.

    Note: a later cell in this notebook defines ``get_product_name`` again;
    when the cells are run in order, that later definition replaces this one.

    Args:
        product: listing element exposing ``find(name, attrs)``.

    Returns:
        The ``text`` of the title ``span``, or ``'N/A'`` when no such span
        is found.
    """
    product_title = 'N/A'
    try:
        # attrs must be a dict ({'class': ...}); the previous set literal
        # ({'class', ...}) was a typo for the key/value form.
        product_title = product.find(
            'span', {'class': 'a-size-base-plus a-color-base a-text-normal'}
        ).text
    except AttributeError:
        print('missing title')
    return product_title

In [4]:
def get_product_name(product):
    """Look up the product's display name.

    Args:
        product: listing element exposing ``find(name, attrs)``.

    Returns:
        The ``text`` of the name ``span``, or ``"N/A"`` when the lookup
        raises ``AttributeError`` (for example because no span matched).
    """
    name_selector = {'class': 'a-size-base-plus a-color-base a-text-normal'}
    try:
        name_span = product.find('span', name_selector)
        return name_span.text
    except AttributeError:
        print('missing product name')
        return "N/A"
    
        
def get_prices(product):
    """Read the current and original price of a product listing.

    The listing's ``a-offscreen`` spans are collected; the first one is taken
    as the current price. When a second span exists it is taken as the
    original price and the product is flagged as discounted; otherwise the
    original price equals the current price.

    Args:
        product: listing element exposing ``find_all(name, attrs)``.

    Returns:
        dict with keys ``"current price"``, ``"original price"`` and
        ``"discounted"``. Any value that could not be read stays ``'N/A'``.
    """
    result = {
        "current price": 'N/A',
        "original price": 'N/A',
        "discounted": 'N/A',
    }

    try:
        offscreen_spans = product.find_all("span", {'class': 'a-offscreen'})
        result["current price"] = offscreen_spans[0].text

        has_second_price = len(offscreen_spans) >= 2
        original_span = offscreen_spans[1] if has_second_price else offscreen_spans[0]
        result["original price"] = original_span.text
        result["discounted"] = has_second_price
    except (AttributeError, IndexError):
        print('missing product price')

    return result

def get_product_review_number(product):
    """Return the review-count text of a product listing.

    Args:
        product: listing element exposing ``find(name, attrs)``.

    Returns:
        The ``text`` of the first ``span`` with class ``a-size-base``, or
        ``"N/A"`` when the lookup raises ``AttributeError``.
    """
    try:
        return product.find('span', {'class': 'a-size-base'}).text
    except AttributeError:
        print('missing product review number')
    return "N/A"

def get_product_number_of_stars(product):
    """Return the star-rating text of a product listing.

    Args:
        product: listing element exposing ``find(name, attrs)``.

    Returns:
        The ``text`` of the ``span`` with class ``a-icon-alt``, or ``"N/A"``
        when the lookup raises ``AttributeError``.
    """
    fallback = "N/A"
    try:
        rating_span = product.find('span', {'class': 'a-icon-alt'})
        stars_text = rating_span.text
    except AttributeError:
        print('missing product stars')
        return fallback
    else:
        return stars_text

def get_shipping_information(product):
    """Return the shipping-related text found in a product listing.

    The previous version computed the matches but never returned them, so
    every caller received ``None``.

    Args:
        product: listing element that can be called with ``text=<pattern>``
            to obtain the text nodes matching the pattern.

    Returns:
        The matching strings, whitespace-stripped and joined with ``" | "``,
        or ``"N/A"`` when nothing matches or ``product`` cannot be searched.
    """
    product_shipping = "N/A"
    try:
        # Case-insensitive so capitalised text such as "Shipping" or "Ships"
        # is matched as well as lowercase "ship".
        matches = product(text=re.compile('ship', re.IGNORECASE))
    except (AttributeError, TypeError):
        # TypeError covers a None product, which is not callable.
        matches = []

    if matches:
        product_shipping = " | ".join(str(match).strip() for match in matches)
    else:
        print('missing product shipping')

    return product_shipping

In [5]:
def scrap(brands, url, header, max_pages=1, max_brands=1):
    """Scrape product listings for the given brand filter links.

    Args:
        brands: dict mapping brand name to the link that is appended to
            ``url`` to form the brand's search URL.
        url: prefix prepended to every brand link.
        header: HTTP headers passed to ``requests.get``.
        max_pages: number of result pages requested per brand. The default
            of 1 keeps the behaviour of the former hard-coded
            "limit one page" break (the loop was written over pages 1-7).
        max_brands: number of brands processed, or ``None`` for all of them.
            The default of 1 keeps the behaviour of the former hard-coded
            "limit to one brand" break.

    Returns:
        dict keyed by a running 1-based index; each value is a dict
        describing one product (name, brand, prices, reviews, stars,
        shipping, page number and rank within the page).
    """
    product_dict = {}
    index = 1
    for brand_count, (brand, link) in enumerate(brands.items(), start=1):
        if max_brands is not None and brand_count > max_brands:
            break

        for page in range(1, max_pages + 1):
            # Random pause before every request.
            wait_time = random.uniform(7, 16)
            time.sleep(wait_time)

            page_url = url + link + '&page=' + str(page)
            print("Scraping: " + page_url)
            response = requests.get(page_url, headers=header)
            page_soup = bs(response.content, 'lxml')
            products = page_soup.find_all(
                'div',
                {'class': 's-expand-height s-include-content-margin s-border-bottom'},
            )

            # find_all returns an empty result (never None) when nothing
            # matches, so test for emptiness to stop paging this brand.
            if not products:
                break

            for rank_count, product in enumerate(products, start=1):
                price_info = get_prices(product)
                product_dict[index] = {
                    "product_name": get_product_name(product),
                    "brand_name": brand,
                    "current_price": price_info["current price"],
                    "original_price": price_info["original price"],
                    "discounted": price_info["discounted"],
                    "number_of_reviews": get_product_review_number(product),
                    "number_of_stars": get_product_number_of_stars(product),
                    "product_shipping": get_shipping_information(product),
                    "page": page,
                    "page_rank": rank_count,
                }
                index += 1
            print("Number of products scrapped: " + str(len(product_dict)))
    return product_dict
            
            

In [11]:


# Site prefix; also passed to scrap(), which prepends it to each brand link.
url = 'https://www.amazon.com'
product_url = url + '/s?k=iphone+case'
header = {'user-agent': ''}

response = requests.get(product_url, headers=header)

if response.status_code != 200:
    # status_code is an int, so it must be converted before concatenation
    # (the bare "+" raised TypeError). Raising stops the cell with a visible
    # failure instead of exit(0), which signalled a successful exit.
    raise RuntimeError("Error: " + str(response.status_code))

print("Response: " + str(response.status_code))

print(response)
brands = get_brands(response)

products = scrap(brands, url, header)

print("Total number of products scraped: " + str(len(products)))

# write results to a file
df = pd.DataFrame.from_dict(products, orient='index')
df.to_csv('./output-test-2.csv')


Response: 200
<Response [200]>
Scraping: https://www.amazon.com/s?k=iphone+case&rh=p_89%3ASpigen&dc&qid=1564186583&rnid=2528832011&ref=sr_nr_p_89_1&page=1
Number of products scrapped: 48
Total number of products scraped: 48
