In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 30 16:12:51 2019

@author: Karim Cissé
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re # regex

def get_brands(response):
    brands_dict = {}
    html = response.content
    page_soup = BeautifulSoup(html, "html.parser")
    output = page_soup.find("div", {"id": "brandsRefinements"})
    brands = output.find_all("span", {"class": "a-size-base a-color-base"})
    
    links = output.find_all("a", {"class":"a-link-normal s-navigation-item"})
    
    for i in range(len(brands)):
        brands_dict[brands[i].text] = links[i]['href']
        
    return(brands_dict)

'''def get_user_agent():
    user_agents = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36", "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/44.0.2403.155 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36", "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36", "Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0", "Mozilla/5.0 (X11; Linux i586; rv:63.0) Gecko/20100101 Firefox/63.0"]
    return {"user_agent" : user_agents[random.randint(0,len(user_agents)-1)]}
'''
def get_user_agent():
    user_agents = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36", "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/44.0.2403.155 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36", "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36", "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36", "Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0", "Mozilla/5.0 (X11; Linux i586; rv:63.0) Gecko/20100101 Firefox/63.0"]
    return {"user_agent" : ''}


def get_product_name(product):
    product_name = "N/A"
    try:
        product_name = product.find('span', {'class' : 'a-size-base-plus a-color-base a-text-normal'}).text
    except AttributeError:
        print('missing product name')
        
    return product_name
    
        
def get_prices(product):
    discounted = 'N/A'
    current_price = 'N/A'
    original_price = 'N/A'
    
    try:
        price = product.find_all("span", {'class': 'a-offscreen'})
        current_price = price[0].text

        if (len(price) < 2):
            original_price = price[0].text
            discounted = False
        else:
            original_price = price[1].text
            discounted = True
    except (AttributeError, IndexError):
        print('missing product price')
        
    return {
            "current price" : current_price,
            "original price" : original_price,
            "discounted" : discounted
           }

def get_product_review_number(product):
    product_review_number = "N/A"
    try:
        product_review_number = product.find('span', {'class': 'a-size-base'}).text
    except AttributeError:
        print('missing product review number') 
    
    return product_review_number

def get_product_number_of_stars(product):
    product_number_of_stars = "N/A"
    
    try:
        product_number_of_stars = product.find('span', {'class': 'a-icon-alt'}).text
    except AttributeError:
        print('missing product stars') 
        
    return product_number_of_stars

def get_shipping_information(product):
    try:
        product_shipping = product(text=re.compile('ship'))
    except AttributeError:
        product_shipping = "N/A"
        print('missing product shipping')   



def scrap(brands, url, headers):
    product_dict = {}
    index = 1
    for brand, link in brands.items():
        for page in range(1,8):
            
            wait_time = random.uniform(7,16)
            time.sleep(wait_time) # sleep sp Amazon doesn't get angry
            print("scraping: " + url + link + "&page=" + str(page))
            response = requests.get(url + link, headers=get_user_agent())
            html = response.content
            page_soup = BeautifulSoup(html, 'html.parser')
            products = page_soup.find_all('div', {'class': 's-expand-height s-include-content-margin s-border-bottom'})
            
            # if there are no products on the page the go to the next brand
            if products is None:
                break
                
            rank_count = 1
            for product in products:
                price_info = get_prices(product)
                product_dict[index] = {
                        "product_name" : get_product_name(product),
                        "brand_name" : brand,
                        "current_price" : price_info["current price"],
                        "original_price" : price_info["original price"],
                        "discounted" : price_info["discounted"],
                        "number_of_reviews" : get_product_review_number(product),
                        "number_of_stars" : get_product_number_of_stars(product),
                        "product_shipping" : get_shipping_information(product),
                        "page" : page,
                        "page_rank" : rank_count
                        }
                rank_count = rank_count + 1
                index = index + 1
            print("Number of products scrapped: " + str(len(product_dict)))
            break # limit to one page
        break # limit to one brand     
    return product_dict


url = "https://www.amazon.com"    
product_url = url + "/s?k=iphone+case"
response = requests.get(product_url, headers=get_user_agent())


if response.status_code != 200:
    print("Error status code: " + str(response.status_code))
    exit(0)
else:
    print("Response status code: " + str(response.status_code))

# It is neccessary to get the brands first to be able to search for more products since Amazon limits the number of products presneted
brands = get_brands(response)

products = scrap(brands, url, headers=get_user_agent())

print("Total number of products scraped: " + str(len(products)))

# write results to a file
df = pd.DataFrame.from_dict(products, orient='index')
df.to_csv('./output-test-2.csv')




    

Response status code: 200
scraping: https://www.amazon.com/iphone-case/s?k=iphone+case&rh=p_89%3ASpigen&page=1
Number of products scrapped: 48
Total number of products scraped: 48


In [13]:
df = pd.read_csv('./output-test-2.csv')

In [14]:
df.head()

Unnamed: 0.1,Unnamed: 0,product_name,brand_name,current_price,original_price,discounted,number_of_reviews,number_of_stars,product_shipping,page,page_rank
0,1,,Spigen,$15.99,$15.99,False,3489,4.4 out of 5 stars,,1,1
1,2,,Spigen,$11.99,$11.99,False,3040,4.4 out of 5 stars,,1,2
2,3,,Spigen,$15.99,$15.99,False,1631,4.6 out of 5 stars,,1,3
3,4,,Spigen,$13.99,$13.99,False,2704,4.4 out of 5 stars,,1,4
4,5,,Spigen,$15.99,$15.99,False,1363,4.7 out of 5 stars,,1,5
