Import required libraries

In [1]:
pip install requests beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup
import re 

Amazon URL

In [4]:
URL = "https://www.amazon.in/s?k=laptop&crid=3H4ZV8KKLMX73&sprefix=lapto%2Caps%2C348&ref=nb_sb_noss_2"

Make a header to mimic a browser visit (We make headers to tell that we are not any hackers or scammers)

In [5]:
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/144.0.0.0 Safari/537.36"}

Send request to URL

Beautifulsoup to parse the HTML Content (It helps to convert the scarp data to html form)

In [None]:
data = []
print("Data list cleared")

for page in range(1, 7):  
    params = {"k": "laptop", "page": page}
    
    # Use verify=False to bypass SSL verification (for testing purposes)
    response = requests.get(URL, params=params, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    
    products = soup.find_all("div", {"data-component-type": "s-search-result"})

    
    
    for product in products:
        # Extracting the product title
        title_tag = product.find("h2")
        if not title_tag:
            continue
        title_text = title_tag.get_text(strip=True)

        
        # Extracting the product price
        price_tag = product.find("span", {"class": "a-price-whole"})
        price_text = price_tag.get_text(strip=True) if price_tag else "N/A"

        #Extracting the processor
        processor = re.search(r"(i[3579]|Ryzen\s?\d)", title_text)
        processor = processor.group() if processor else None
        
        # Extracting the product brand
        match = re.match(r'^\W*([A-Za-z]+)', title_text)
        brand = match.group(1).upper() if match else "UNKNOWN"

        # Extracting RAM (improved version)
        ram_match = re.search(r'(\d+)\s*GB\s*(RAM|DDR\d+|LPDDR\d+)', title_text, re.IGNORECASE)
        ram = ram_match.group(1) + "GB" if ram_match else "N/A"
        
        # Extracting the product rating 
        rating_tag = product.find("span", class_="a-icon-alt")
        rating = "N/A"
        if rating_tag:
            rating_match = re.search(r'(\d+\.?\d*)', rating_tag.get_text())
            rating = rating_match.group(1) if rating_match else "N/A"

        # Extracting the Product SSD - Simplified
        ssd_match = re.search(r'(\d+)\s*(GB|TB)\s*(?:SSD|NVMe)', title_text, re.IGNORECASE) or re.search(r'(\d+)\s*(GB|TB)', title_text)
        ssd = (ssd_match.group(1) + ssd_match.group(2).upper()) if ssd_match else "N/A"
      

        # Extracting the Product Windows Version - Improved version
        windows_version = "N/A"
        
        # Try: "Windows 11", "Windows 10", "Win11", "Win10", etc.
        windows_match = re.search(r'(?:Windows\s*|Win\s*)(\d+)', title_text, re.IGNORECASE)
        if windows_match:
            windows_version = "Windows " + windows_match.group(1)
        
        # Extracting the Product Color
        color_match = re.search(r'\b(Black|Silver|Gray|Grey|White|Blue|Red|Gold|Green|Brown|Pink|Purple|Yellow|Orange|Champagne|Midnight|Space|Cosmic|Stardust|Graphite|Ash|Onyx|Platinum|Metallic)\b', title_text, re.IGNORECASE)
        color = color_match.group(1) if color_match else "N/A"

        # Extracting Discount Percentage
        discount_tag = product.find("span", string=re.compile(r'%'))
        discount = discount_tag.get_text(strip=True) if discount_tag else "N/A"

        screen_match = re.search(r'(\d{1,2}\.?\d?)\s*(?=[^\d]*cm|[^\d]*["\'])',title_text,re.IGNORECASE)
        screen_size = screen_match.group(1) if screen_match else "N/A"



        # Store the extracted data in a dictionary and append to the list
        data.append({
            "Title": title_text,
            "Price": price_text,
            "Processor":processor,
            "Brand": brand,
            'RAM': ram,
            "Rating": rating,
            "Storage": ssd,
            "Windows": windows_version,
            "Color": color,
            "Discount": discount,
            "Screen Size": screen_size
        })

      
    print(f"Page {page} scraped")
    time.sleep(1)


Data list cleared
Page 1 scraped
Page 2 scraped
Page 3 scraped
Page 4 scraped
Page 5 scraped
Page 6 scraped


In [38]:
for product in data:
    print("Title:", product["Title"])
    print("Price:", product["Price"])
    print("Brand:", product["Brand"])
    print("Processor:",product["Processor"])
    print("RAM:", product.get("RAM"))
    print("Rating:", product.get("Rating"))
    print("Storage:", product.get("Storage"))
    print("Windows:", product.get("Windows"))
    print("Color:", product.get("Color"))
    print("Discount:",product.get("Discount"))
    print("Screen Size:",product.get("screen_size"))
    print("-" * 50)

Title: HP Omnibook 5 OLED, Snapdragon X Processor (16GB LPDDR5x,1TB SSD) 2K OLED, Micro-Edge, 16''/40.6cm, Win11, M365*Office24, Glacier Silver, 1.59kg, fb0001QU, FHD Camera, Backlit, Next-Gen AI Laptop
Price: 69,990
Brand: HP
Processor: None
RAM: 16GB
Rating: 4.1
Storage: 1TB
Windows: Windows 11
Color: Silver
Discount: (19% off)
Screen Size: None
--------------------------------------------------
Title: HP 15, 13th Gen Intel Core i5-13420H, (16GB DDR4, 512GB SSD), FHD, Anti-Glare, Micro-Edge, 15.6''/39.6cm, Win11, M365 Basic(1yr)* Office24, Silver, 1.65kg, FR0045TU, Camera w/Shutter, Backlit Laptop
Price: 60,490
Brand: HP
Processor: i5
RAM: 16GB
Rating: 4.0
Storage: 512GB
Windows: Windows 11
Color: Silver
Discount: (12% off)
Screen Size: None
--------------------------------------------------
Title: EBook 11.6" HD Laptop | Best Student & Office Work Laptop | Celeron N4020 | 4GB DDR4 | 128GB eMMC + M.2 SSD Expandable Slot | Win 11 Home |31Wh Battery | UHD Graphics 600 | Black
Price: 10

In [39]:
data

[{'Title': "HP Omnibook 5 OLED, Snapdragon X Processor (16GB LPDDR5x,1TB SSD) 2K OLED, Micro-Edge, 16''/40.6cm, Win11, M365*Office24, Glacier Silver, 1.59kg, fb0001QU, FHD Camera, Backlit, Next-Gen AI Laptop",
  'Price': '69,990',
  'Processor': None,
  'Brand': 'HP',
  'RAM': '16GB',
  'Rating': '4.1',
  'Storage': '1TB',
  'Windows': 'Windows 11',
  'Color': 'Silver',
  'Discount': '(19% off)',
  'Screen Size': '16'},
 {'Title': "HP 15, 13th Gen Intel Core i5-13420H, (16GB DDR4, 512GB SSD), FHD, Anti-Glare, Micro-Edge, 15.6''/39.6cm, Win11, M365 Basic(1yr)* Office24, Silver, 1.65kg, FR0045TU, Camera w/Shutter, Backlit Laptop",
  'Price': '60,490',
  'Processor': 'i5',
  'Brand': 'HP',
  'RAM': '16GB',
  'Rating': '4.0',
  'Storage': '512GB',
  'Windows': 'Windows 11',
  'Color': 'Silver',
  'Discount': '(12% off)',
  'Screen Size': '15.6'},
 {'Title': 'EBook 11.6" HD Laptop | Best Student & Office Work Laptop | Celeron N4020 | 4GB DDR4 | 128GB eMMC + M.2 SSD Expandable Slot | Win 11 

In [40]:
df=pd.DataFrame(data)
df

Unnamed: 0,Title,Price,Processor,Brand,RAM,Rating,Storage,Windows,Color,Discount,Screen Size
0,"HP Omnibook 5 OLED, Snapdragon X Processor (16...",69990,,HP,16GB,4.1,1TB,Windows 11,Silver,(19% off),16
1,"HP 15, 13th Gen Intel Core i5-13420H, (16GB DD...",60490,i5,HP,16GB,4.0,512GB,Windows 11,Silver,(12% off),15.6
2,"EBook 11.6"" HD Laptop | Best Student & Office ...",10990,,EBOOK,4GB,5.0,4GB,Windows 11,Black,(56% off),11.6
3,"HP 15, AMD Ryzen 3 7320U (8GB DDR4, 512GB SSD)...",,Ryzen 3,HP,8GB,4.0,512GB,Windows 11,Silver,,15.6
4,"BrowseBook 14.1"" FHD IPS Laptop | Best Student...",12990,,BROWSEBOOK,4GB,3.0,128GB,Windows 11,Grey,(57% off),14.1
...,...,...,...,...,...,...,...,...,...,...,...
127,"HP Pavilion Plus, Intel Core i5-1335U-13th Gen...",61990,i5,HP,16GB,3.6,512GB,Windows 11,Silver,(26% off),14
128,Lenovo V15 G4 AMD Ryzen 5 7520U 15.6 inch FHD ...,,Ryzen 5,LENOVO,16GB,4.0,512GB,Windows 11,Grey,,
129,"Dyazo Slim 15"" to 15.6 Inch Laptop Sleeve, Lap...",279,,DYAZO,,4.3,,,Grey,(72% off),15
130,"Primebook 2 Max 2025 (New Launch) | 8GB RAM, 2...",22990,,PRIMEBOOK,8GB,4.4,8GB,,Gray,(34% off),


In [41]:
df['Product Name'] = df['Title'].str.split(r'[|,]').str[0].str.strip()
col = df.pop('Product Name')
df.insert(0, 'Product Name', col)
df.drop("Title",axis=1,inplace=True)

In [42]:
df.to_csv("amazon_laptops_raw.csv")

In [43]:
df

Unnamed: 0,Product Name,Price,Processor,Brand,RAM,Rating,Storage,Windows,Color,Discount,Screen Size
0,HP Omnibook 5 OLED,69990,,HP,16GB,4.1,1TB,Windows 11,Silver,(19% off),16
1,HP 15,60490,i5,HP,16GB,4.0,512GB,Windows 11,Silver,(12% off),15.6
2,"EBook 11.6"" HD Laptop",10990,,EBOOK,4GB,5.0,4GB,Windows 11,Black,(56% off),11.6
3,HP 15,,Ryzen 3,HP,8GB,4.0,512GB,Windows 11,Silver,,15.6
4,"BrowseBook 14.1"" FHD IPS Laptop",12990,,BROWSEBOOK,4GB,3.0,128GB,Windows 11,Grey,(57% off),14.1
...,...,...,...,...,...,...,...,...,...,...,...
127,HP Pavilion Plus,61990,i5,HP,16GB,3.6,512GB,Windows 11,Silver,(26% off),14
128,Lenovo V15 G4 AMD Ryzen 5 7520U 15.6 inch FHD ...,,Ryzen 5,LENOVO,16GB,4.0,512GB,Windows 11,Grey,,
129,"Dyazo Slim 15"" to 15.6 Inch Laptop Sleeve",279,,DYAZO,,4.3,,,Grey,(72% off),15
130,Primebook 2 Max 2025 (New Launch),22990,,PRIMEBOOK,8GB,4.4,8GB,,Gray,(34% off),
