In [324]:
import csv
from urllib.parse import quote_plus, urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup


In [326]:
def generate_url(product_search):
    """Build the Amazon search-results URL for a search phrase.

    Parameters
    ----------
    product_search : str
        Free-text search phrase, e.g. ``"ipad"``.

    Returns
    -------
    str
        The search URL with the phrase URL-encoded by ``quote_plus``
        (spaces become ``+``, reserved characters are percent-escaped).
    """
    return 'https://www.amazon.com/s?k={}'.format(quote_plus(product_search))
    
    

In [328]:
def request_http(url, timeout=10):
    """Download a page and return its HTML text, or None on failure.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        ``requests`` applies no timeout unless one is passed, so a stalled
        connection would otherwise block the notebook indefinitely.

    Returns
    -------
    str or None
        The response body when the server answers with status 200;
        ``None`` for any other status code or when the request itself
        fails (connection error, timeout, malformed URL, ...).
    """
    # Browser-like headers sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
        'Accept-Language': 'en-US,en;q=0.5',
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Keep the "text or None" contract instead of leaking a network
        # exception into the scraping loop.
        return None
    if response.status_code == 200:
        return response.text
    return None
    
# request_http returns the response body (HTML text) on success, otherwise None.

In [330]:
def collect_all_product_links(html):
    """Return the product-link ``<a>`` tags found in a search-results page.

    Parameters
    ----------
    html : str or None
        Markup of the search-results page. ``None`` or an empty string
        (e.g. the result of a failed download) yields an empty list.

    Returns
    -------
    list
        The ``<a>`` Tag objects whose ``class`` attribute matches the
        product-link class string used below; empty when nothing matches.
    """
    # A failed download arrives here as None; BeautifulSoup(None) would raise
    # a TypeError, so report "no links" instead.
    if not html:
        return []
    # Soup object containing all of the page data
    soup = BeautifulSoup(html, "html.parser")
    # Fetch links as a list of Tag objects
    return soup.find_all(
        "a",
        attrs={'class': 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'},
    )

In [332]:
# Quick manual check: collect_all_product_links(request_http(generate_url('ipad')))

In [334]:
def get_href_of_product_links(links):
    """Extract the ``href`` attribute of every link.

    Parameters
    ----------
    links : iterable
        Objects exposing ``.get('href')`` (e.g. BeautifulSoup Tag objects).

    Returns
    -------
    list
        One entry per link, in order; ``None`` where a link has no ``href``.
    """
    return [anchor.get('href') for anchor in links]
        
    

In [336]:
def generate_children_pages(href_list):
    """Turn product hrefs into absolute product-page URLs.

    Parameters
    ----------
    href_list : iterable of (str or None)
        ``href`` values taken from the search-results links.

    Returns
    -------
    list of str
        Absolute URLs, in input order. Missing/empty hrefs are skipped.
        Site-relative hrefs are resolved against ``https://amazon.com``;
        hrefs that are already absolute URLs are kept unchanged.
    """
    base_url = "https://amazon.com"
    # urljoin (rather than "+") copes with hrefs that lack a leading slash or
    # are already absolute; the `if href` filter drops None and "".
    return [urljoin(base_url, href) for href in href_list if href]
        

In [338]:
def get_product_title(soup):
    """Return the stripped text of the ``<span id="productTitle">`` element.

    Falls back to an empty string when the lookup raises ``AttributeError``
    (for instance because no such span exists and ``find`` returned None).
    """
    try:
        title_tag = soup.find("span", attrs={"id": 'productTitle'})
        return title_tag.text.strip()
    except AttributeError:
        return ""

In [340]:
def get_product_price(soup):
    """Return the stripped text of the first ``<span class="a-offscreen">``.

    Falls back to an empty string when the lookup raises ``AttributeError``
    (for instance because no such span exists and ``find`` returned None).
    """
    try:
        price_tag = soup.find("span", attrs={"class": 'a-offscreen'})
        return price_tag.text.strip()
    except AttributeError:
        return ""

In [342]:
def get_product_rating(soup):
    """Return the stripped text of the first ``<span class="a-icon-alt">``.

    Falls back to an empty string when the lookup raises ``AttributeError``
    (for instance because no such span exists and ``find`` returned None).
    """
    try:
        rating_tag = soup.find("span", attrs={"class": 'a-icon-alt'})
        return rating_tag.text.strip()
    except AttributeError:
        return ""

In [344]:
def get_product_reviews(soup):
    """Return the stripped text of ``<span id="acrCustomerReviewText">``.

    Falls back to an empty string when the lookup raises ``AttributeError``
    (for instance because no such span exists and ``find`` returned None).
    """
    try:
        reviews_tag = soup.find("span", attrs={"id": 'acrCustomerReviewText'})
        return reviews_tag.text.strip()
    except AttributeError:
        return ""
    

In [346]:
def get_product_availability(soup):
    """Return the availability text shown inside ``<div id="availability">``.

    The text is taken from the span with class
    ``a-size-medium a-color-success`` nested in that div. If either element
    is missing (the lookup raises ``AttributeError``), the string
    ``"Not Available"`` is returned instead.
    """
    try:
        container = soup.find("div", attrs={"id": 'availability'})
        status_tag = container.find("span", attrs={"class": 'a-size-medium a-color-success'})
        return status_tag.text.strip()
    except AttributeError:
        return "Not Available"

In [348]:
def get_nb_of_sales_past_month(soup):
    """Return the text of ``<span id="social-proofing-faceout-title-tk_bought">``.

    The text is stripped of surrounding whitespace, like the other product
    field extractors in this notebook, so no stray spaces or newlines end up
    in the CSV. Falls back to an empty string when the lookup raises
    ``AttributeError`` (for instance because the span is absent).
    """
    try:
        sales_tag = soup.find("span", attrs={"id": 'social-proofing-faceout-title-tk_bought'})
        return sales_tag.text.strip()
    except AttributeError:
        return ""

In [356]:
def get_data_from_children_page(children_pages):
    """Download every product page and collect its fields column-wise.

    Parameters
    ----------
    children_pages : iterable of str
        Absolute product-page URLs.

    Returns
    -------
    dict of str -> list
        Keys ``product_name``, ``price``, ``rating``, ``reviews``,
        ``availability`` and ``sales_past_month``; each list holds one entry
        per successfully downloaded page, in input order. Pages for which
        ``request_http`` returns ``None`` are skipped.
    """
    # Deliberately not called `dict`, which would shadow the builtin type.
    product_data = {
        "product_name": [],
        "price": [],
        "rating": [],
        "reviews": [],
        "availability": [],
        "sales_past_month": [],
    }
    for page in children_pages:
        children_html = request_http(page)
        if children_html is None:
            # Failed download: BeautifulSoup(None) would raise a TypeError and
            # abort the whole run, so move on to the next product instead.
            continue
        children_soup = BeautifulSoup(children_html, "html.parser")
        product_data['product_name'].append(get_product_title(children_soup))
        product_data['price'].append(get_product_price(children_soup))
        product_data['rating'].append(get_product_rating(children_soup))
        product_data['reviews'].append(get_product_reviews(children_soup))
        product_data['availability'].append(get_product_availability(children_soup))
        product_data['sales_past_month'].append(get_nb_of_sales_past_month(children_soup))
    return product_data

In [358]:
def saving_csv(dict, csv_path="amazon_data.csv"):
    """Build a DataFrame from the scraped columns and write it to CSV.

    Rows whose ``product_name`` is empty or missing are dropped before
    saving.

    Parameters
    ----------
    dict : dict of str -> list
        Column name -> list of values; must contain ``product_name``.
        (The parameter name shadows the builtin inside this function only;
        it is kept so existing callers keep working.)
    csv_path : str or path-like, optional
        Destination file (default ``"amazon_data.csv"``).

    Returns
    -------
    pandas.DataFrame
        The filtered frame that was written to ``csv_path``.
    """
    amazon_df = (
        pd.DataFrame.from_dict(dict)
        # Series.replace returns a new Series; its result has to be assigned
        # back, otherwise the empty names survive and dropna removes nothing.
        .assign(product_name=lambda frame: frame['product_name'].replace('', np.nan))
        .dropna(subset=['product_name'])
    )
    amazon_df.to_csv(csv_path, header=True, index=False)
    return amazon_df
    

In [360]:
if __name__ == '__main__':
    # End-to-end run: search -> product links -> product pages -> CSV.
    product = 'ipad'
    url = generate_url(product)
    response = request_http(url)
    if response is None:
        # request_http signals a failed download with None; stop here with a
        # clear message instead of failing later inside the HTML parser.
        raise RuntimeError("Could not download the Amazon search page: {}".format(url))
    links = collect_all_product_links(response)
    href_list = get_href_of_product_links(links)
    children_pages = generate_children_pages(href_list)
    # Not named `dict`: a global with that name would replace the builtin
    # type for every later cell run in this kernel.
    product_data = get_data_from_children_page(children_pages)
    amazon_data = saving_csv(product_data)
    

In [362]:
# Preview the scraped products; head() keeps the output bounded if the search returns many rows.
amazon_data.head(10)

Unnamed: 0,product_name,price,rating,reviews,availability,sales_past_month
0,Apple iPad (9th Generation): with A13 Bionic c...,$197.01,4.8 out of 5 stars,"66,919 ratings",In Stock,10K+ bought in past month
1,Apple iPad (10th Generation): with A14 Bionic ...,$316.79,4.8 out of 5 stars,"16,616 ratings",In Stock,10K+ bought in past month
2,"Apple iPad (10.2-Inch, Wi-Fi, 32GB) - Space Gr...",$149.00,4.4 out of 5 stars,"9,362 ratings",In Stock,4K+ bought in past month
3,Apple iPad Mini (6th Generation): with A15 Bio...,$349.59,4.8 out of 5 stars,"10,446 ratings",In Stock,5K+ bought in past month
4,Apple iPad (2018 Model) with Wi-Fi only 32GB A...,$119.99,4.5 out of 5 stars,"9,228 ratings",Not Available,4K+ bought in past month
5,"2022 Apple iPad (10.9-inch, Wi-Fi, 64GB) - Sil...",$309.99,4.6 out of 5 stars,472 ratings,Not Available,50+ bought in past month
6,Apple iPad Air 13-inch (M2): Built for Apple I...,$872.29,4.6 out of 5 stars,202 ratings,Not Available,500+ bought in past month
7,"Apple iPad 9.7in with WiFi, 32GB 2017 Newest M...",$126.00,4.5 out of 5 stars,"1,494 ratings",In Stock,400+ bought in past month
8,Apple iPad Pro 12.9-inch (6th Generation): wit...,"$1,189.97",4.7 out of 5 stars,"2,114 ratings",Not Available,
9,"2020 Apple iPad (10.2-inch, Wi-Fi, 128GB) - Si...",$229.00,4.6 out of 5 stars,228 ratings,In Stock,100+ bought in past month
