In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [4]:
# Amazon Link with "apple macbook" keyword in search

URL = "https://www.amazon.com/s?k=apple+macbook&ref=nb_sb_noss"
HEADERS = ({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0',
    'Accept-Langugae': 'en-US, en; q=0.5'
})

In [5]:
webpage = requests.get(URL, headers=HEADERS)
webpage

<Response [200]>

In [6]:
soup = BeautifulSoup(webpage.content, 'html.parser')

links = soup.find_all("a", attrs={
    'class': 'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'
})

In [9]:
links_list = []


for link in links:
    links_list.append(link.get('href'))

links_list

['/Apple-2024-MacBook-13-inch-Laptop/dp/B0CX22ZW1T/ref=sr_1_1?dib=eyJ2IjoiMSJ9.6d1ivEKF9ITfXdoIONkPjhfFZWEV9wVPJN9YpSxF1SbHHXTtU-omer1G0AJCk0SWJh0sTQouwgnkyRDdo5o3r3rtMAvXK9bOYfyLGNJWzZL-ASH658YAkQoMFwWsj_SwcpIqXgY-f-DAGUcNhlLiVXOHjzIhP-rAdsU-lL-OM9AOqTtoJJiyRxKQUN3LG8tjmlcqO8BViKvXoG9SUzCRwiw17XE7mkB5wCUhbPUDqqA.QpvTxGVjzzGi-PngbN_IB9yvTugyKhz_KwMUNq25NXM&dib_tag=se&keywords=apple+macbook&qid=1714615399&sr=8-1',
 '/2022-Apple-MacBook-Laptop-chip/dp/B0B3CDZLTB/ref=sr_1_2?dib=eyJ2IjoiMSJ9.6d1ivEKF9ITfXdoIONkPjhfFZWEV9wVPJN9YpSxF1SbHHXTtU-omer1G0AJCk0SWJh0sTQouwgnkyRDdo5o3r3rtMAvXK9bOYfyLGNJWzZL-ASH658YAkQoMFwWsj_SwcpIqXgY-f-DAGUcNhlLiVXOHjzIhP-rAdsU-lL-OM9AOqTtoJJiyRxKQUN3LG8tjmlcqO8BViKvXoG9SUzCRwiw17XE7mkB5wCUhbPUDqqA.QpvTxGVjzzGi-PngbN_IB9yvTugyKhz_KwMUNq25NXM&dib_tag=se&keywords=apple+macbook&qid=1714615399&sr=8-2',
 '/Apple-Macbook-13-inch-Storage-English/dp/B0751N2Y78/ref=sr_1_3?dib=eyJ2IjoiMSJ9.6d1ivEKF9ITfXdoIONkPjhfFZWEV9wVPJN9YpSxF1SbHHXTtU-omer1G0AJCk0SWJh0sTQouwgnkyRDdo5o3r3

In [10]:
def get_title(soup):
    try:
        title = soup.find("span", attrs={'id': 'productTitle'}).text.strip()
    except AttributeError:
        title = ""

    return title


def get_price(soup):
    try:
        price = soup.find("span", attrs={'class': 'a-price a-text-price a-size-medium apexPriceToPay'}).find(
            'span', attrs={'class': 'a-offscreen'}).text
    except AttributeError:
        price = ""

    return price


def get_screen_size(soup):
    try:
        screen_size = soup.find("tr", attrs={'class': 'a-spacing-small po-display.size'}).find(
            "td",  attrs={"class": "a-span9"}).find("span", attrs={'class': 'a-size-base po-break-word'}).text
    except AttributeError:
        screen_size = ""

    return screen_size


def get_color(soup):
    try:
        color = soup.find("tr", attrs={'class': 'a-spacing-small po-color'}).find("td", attrs={
            "class": "a-span9"}).find("span", attrs={'class': 'a-size-base po-break-word'}).text
    except AttributeError:
        color = ""

    return color


def get_hard_disk_size(soup):
    try:
        hard_disk_size = soup.find("tr", attrs={'class': 'a-spacing-small po-hard_disk.size'}).find(
            "td", attrs={"class": "a-span9"}).find("span", attrs={'class': 'a-size-base po-break-word'}).text
    except AttributeError:
        hard_disk_size = ""

    return hard_disk_size


def get_ram_memory_size(soup):
    try:
        ram_memory_size = soup.find("tr", attrs={'class': 'a-spacing-small po-ram_memory.installed_size'}).find(
            "td", attrs={"class": "a-span9"}).find("span", attrs={'class': 'a-size-base po-break-word'}).text
    except AttributeError:
        ram_memory_size = ""

    return ram_memory_size

In [11]:
info = {'title': [], 'price': [], 'screen_size': [],
        'color': [], 'hard_disk_size': [], 'ram_memory_size': [], }

for link in links_list:
    product_list = "https://amazon.com" + link
    new_webpage = requests.get(product_list, headers=HEADERS)
    new_soup = BeautifulSoup(new_webpage.content, 'html.parser')

    info['title'].append(get_title(new_soup))
    info['price'].append(get_price(new_soup))
    info['screen_size'].append(get_screen_size(new_soup))
    info['color'].append(get_color(new_soup))
    info['hard_disk_size'].append(get_hard_disk_size(new_soup))
    info['ram_memory_size'].append(get_ram_memory_size(new_soup))

In [14]:
import numpy as np

amazon_df = pd.DataFrame.from_dict(info)
amazon_df['title'].replace('', np.nan, inplace=True)
amazon_df.to_csv("amazon_apple_macbook_data.csv", header=True, index=False)

amazon_df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  amazon_df['title'].replace('', np.nan, inplace=True)


Unnamed: 0,title,price,screen_size,color,hard_disk_size,ram_memory_size
0,Apple 2024 MacBook Air 13-inch Laptop with M3 ...,$979.11,13.6 Inches,Space Gray,256 GB,8 GB
1,Apple 2022 MacBook Air Laptop with M2 chip: 13...,$806.88,13.6 Inches,Silver,256 GB,8 GB
2,2017 Apple Macbook Air with 1.8GHz Intel Core ...,$234.00,13.3 Inches,,128 GB,8 GB
3,Apple 2023 MacBook Air Laptop with M2 chip: 15...,,15.3 Inches,Midnight,256 GB,8 GB
4,Apple 2023 MacBook Pro Laptop M3 Pro chip with...,"$1,655.08",14.2 Inches,Silver,512 GB,18 GB
5,Apple 2023 MacBook Pro Laptop M3 chip with 8‑c...,"$1,378.13",14.2 Inches,Silver,512 GB,8 GB
6,"Apple 2022 13"" MacBook Air M2, 16GB RAM, 256GB...","$1,198.55",13.6 Inches,,256 GB,16 GB
7,Mid 2017 Apple MacBook Air with 1.8GHz Intel C...,$266.00,13.3 Inches,Silver,256 GB,8 GB
8,Apple 2023 MacBook Pro Laptop M3 Pro chip with...,"$2,287.06",16.2 Inches,Space Black,512 GB,36 GB
9,Apple MacBook Air MJVE2LL/A 13-inch Laptop (1....,$223.00,13.3 Inches,Silver,128 GB,4 GB
