# Amazon price scraper

In [16]:
import csv
import time
from bs4 import BeautifulSoup
from selenium import webdriver

In [17]:
def get_url(search_term, page):
    """Build the amazon.com.tr search-results URL for one page of a query.

    Args:
        search_term: Free-text search query, e.g. ``"gaming laptop"``.
        page: Result page number. It is written into both the ``page`` query
            parameter and the trailing ``sr_pg_<n>`` part of ``ref``.

    Returns:
        The complete search URL as a string.
    """
    # Imported locally so the function does not depend on the file-level imports.
    from urllib.parse import quote_plus

    # NOTE(review): crid/qid look like values copied from one browser session;
    # they are kept verbatim here - confirm whether they are actually required.
    template = "https://www.amazon.com.tr/s?k={}&page={}&__mk_tr_TR=%C3%85M%C3%85%C5%BD%C3%95%C3%91&crid=6B9N7CEYX8L5&qid=1730488024&ref=sr_pg_{}"

    # quote_plus still turns spaces into "+", and additionally percent-encodes
    # reserved characters (&, #, +, ?, ...) that would otherwise be read as
    # query-string syntax and truncate or corrupt the search term.
    encoded_term = quote_plus(search_term)

    return template.format(encoded_term, page, page)

# Turkish -> English lookup for the header labels scraped from a product's
# technical-details table.
# A product may lack some of these fields or list others that are not covered
# here; this is only a hand-picked set of the common ones.
_HEADER_TRANSLATIONS = (
    # (Turkish header, English column name)
    ("Marka", "Brand"),
    ("Üretici", "Manufacturer"),
    ("Paket Boyutları", "Package Dimensions"),
    ("Üretici Referansı", "Manufacturer Reference"),
    ("Çözünürlük", "Resolution"),
    ("İşlemci Markası", "Processor Brand"),
    ("İşlemci Türü", "Processor Type"),
    ("İşlemci Hızı", "Processor Speed"),
    ("RAM Boyutu", "RAM Size"),
    ("Bilgisayar Bellek Türü", "Computer Memory Type"),
    ("Sabit Sürücü Boyutu", "Hard Drive Size"),
    ("Sabit Disk Açıklaması", "Hard Drive Description"),
    ("Sabit Sürücü Arabirimi", "Hard Drive Interface"),
    ("Grafik İşlemci Üreticisi", "Graphics Processor Manufacturer"),
    ("Kablosuz Türü", "Wireless Type"),
    ("Voltaj", "Voltage"),
    ("İşletim Sistemi", "Operating System"),
    ("ASIN", "ASIN"),
    ("Ürün Ağırlığı", "Product Weight"),
)

# Insertion order follows the table above.
translation_dict = dict(_HEADER_TRANSLATIONS)

In [18]:
def _star_breakdown(product_soup):
    """Return {5..1: aria-label text} from the review histogram ('' if absent)."""
    labels = {stars: '' for stars in (5, 4, 3, 2, 1)}

    histogram = product_soup.find('ul', id='histogramTable')
    if histogram is None:
        return labels

    for entry in histogram.find_all('li'):
        # The label lives on either an <a> or a <span> carrying aria-label.
        tagged = entry.find(['a', 'span'], {'aria-label': True})
        if tagged is None:
            continue
        label = tagged['aria-label'].strip()
        # Checked from 5 down to 1; the first star level found in the label wins.
        for stars in labels:
            if f"{stars} yıldız" in label:
                labels[stars] = label  # stored uncleaned on purpose
                break
    return labels


def _release_date(product_soup):
    """Return the 'Satışa Sunulduğu İlk Tarih' value from the details table, or ''."""
    table = product_soup.find('table', id='productDetails_detailBullets_sections1')
    if table is None:
        return ''

    for row in table.find_all('tr'):
        header = row.find('th', 'a-color-secondary')
        if header is not None and "Satışa Sunulduğu İlk Tarih" in header.text:
            cell = row.find('td', class_='a-size-base')
            return cell.text.strip() if cell is not None else ''
    return ''


def _tech_specs(product_soup):
    """Return the tech-spec rows whose header is in translation_dict, keyed in English."""
    specs = {}
    table = product_soup.find('table', id='productDetails_techSpec_section_1')
    if table is None:
        return specs

    for row in table.find_all('tr'):
        header = row.find('th', 'a-color-secondary a-size-base prodDetSectionEntry')
        value = row.find('td', 'a-size-base prodDetAttrValue')
        if header is None or value is None:
            continue
        english_key = translation_dict.get(header.text.strip())
        if english_key:  # headers without a translation are dropped
            specs[english_key] = value.text.strip()
    return specs


def extract_record(item, driver):
    """Extract one product's data from a search-result tile and its product page.

    The title, link, price, rating and review count are read from ``item``.
    The function then navigates ``driver`` to the product page, waits a fixed
    10 seconds, and reads the star histogram, release date and technical
    specifications from the rendered page.

    Args:
        item: Parsed search-result element (a BeautifulSoup tag).
        driver: Selenium WebDriver; it is left on the product page afterwards.

    Returns:
        A dict with the keys ``Description``, ``Price``, ``Rating``,
        ``ReviewCount``, ``Url``, ``5StarPct`` .. ``1StarPct`` and
        ``ReleaseDate``, plus one key per translated technical spec that was
        found. Fields that could not be located are empty strings.
        ``None`` when the tile has no title link or no price, so the caller
        can skip it.
    """
    # Imported locally so the function does not depend on the file-level imports.
    from urllib.parse import urljoin

    # Title and link. A tile without an <h2><a href=...> cannot be followed to a
    # product page, so it is skipped instead of raising AttributeError/TypeError
    # and aborting the whole scrape.
    heading = item.h2
    atag = heading.a if heading is not None else None
    if atag is None:
        return None
    href = atag.get("href")
    if not href:
        return None
    description = atag.text.strip()
    # urljoin prefixes site-relative links and leaves absolute ones untouched.
    url = urljoin("https://www.amazon.com.tr", href)

    # Price is mandatory: tiles without one are skipped.
    price_parent = item.find('span', 'a-price')
    price_tag = price_parent.find('span', 'a-offscreen') if price_parent is not None else None
    if price_tag is None:
        return None
    price = price_tag.text

    # Rating and review count are optional; each stays '' when not present.
    rating_icon = item.i
    rating = rating_icon.text if rating_icon is not None else ''

    review_count = ''
    reviews_block = item.find('div', {'data-cy': 'reviews-block'})
    if reviews_block is not None:
        count_tag = reviews_block.find('span', {'class': 'a-size-base s-underline-text'})
        if count_tag is not None:
            review_count = count_tag.text

    # Go to the product page to extract additional details.
    driver.get(url)
    time.sleep(10)  # fixed delay to allow the page to load fully
    product_soup = BeautifulSoup(driver.page_source, 'html.parser')

    star_texts = _star_breakdown(product_soup)

    # Compile result
    result = {
        "Description": description,
        "Price": price,
        "Rating": rating,
        "ReviewCount": review_count,
        "Url": url,
        "5StarPct": star_texts[5],
        "4StarPct": star_texts[4],
        "3StarPct": star_texts[3],
        "2StarPct": star_texts[2],
        "1StarPct": star_texts[1],
        "ReleaseDate": _release_date(product_soup),
    }
    result.update(_tech_specs(product_soup))  # tech specs become extra columns
    return result

In [None]:
def main(search_term, max_pages=4, output_path='output.csv'):
    """Scrape search results for ``search_term`` and write them to a CSV file.

    Opens a Chrome WebDriver, walks result pages 1..``max_pages``, calls
    ``extract_record`` for every search-result tile, and writes the collected
    records to ``output_path``. No file is written when nothing was collected.

    Args:
        search_term: Query passed to ``get_url``.
        max_pages: Number of result pages to walk, starting at page 1. Adjust
            it to the number of result pages that exist for the search term.
            The default of 4 matches the previous hard-coded ``range(1, 5)``.
        output_path: Destination CSV path; defaults to ``output.csv``.
    """
    # Startup the webdriver
    driver = webdriver.Chrome()
    records = []

    try:
        for page in range(1, max_pages + 1):
            driver.get(get_url(search_term, page))
            # The result list is parsed up front, so extract_record may navigate
            # the driver away from this page while the tiles are iterated.
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            results = soup.find_all('div', {'data-component-type': 's-search-result'})
            for item in results:
                record = extract_record(item, driver)
                if record:  # None means the tile was skipped
                    records.append(record)
    finally:
        # Always end the browser session, even if scraping raised. quit() ends
        # the whole session, whereas close() only closes the current window.
        driver.quit()

    if not records:
        return

    # Fixed columns first, then one column per translated tech-spec key.
    # DictWriter fills keys missing from a record with an empty string.
    fieldnames = [
        "Description", "Price", "Rating", "ReviewCount", "Url",
        "5StarPct", "4StarPct", "3StarPct", "2StarPct", "1StarPct", "ReleaseDate",
    ] + list(translation_dict.values())

    # utf-8-sig writes a BOM at the start of the file.
    with open(output_path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(records)

In [22]:
# Run the scraper for a search term. Make sure the number of result pages that
# main() walks matches how many pages exist for that term.
# The guard keeps an import of this file from launching a browser; in a notebook
# or when run as a script, __name__ is "__main__", so the call still executes.
if __name__ == "__main__":
    main('laptop')