In [5]:
import requests 
from bs4 import BeautifulSoup 
import time 
from urllib.parse import urljoin, urlencode
import random 
import re
import json
import mysql.connector
import csv 
import os 

In [6]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Cache-Control": "max-age=0",
    "cookie": "paste_cookie",
}

In [3]:
def crawling_page(url, csv_writer):
    try:
        response = requests.get(url, headers = headers)
        #Test lỗi trong yêu cầu requests
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        items = soup.find_all('div', class_='item-cell')
        
        for item in items:
            #itemID:
            item_id = item.find('div', class_='item-container').get('id')
            
            #Title:
            title_element = item.find('a', class_='item-title')
            title = title_element.text.strip() if title_element else "N/A"
            
            #Branding:
            branding_element = item.find('div', class_='item-branding')
            branding = branding_element.img['title'] if branding_element and branding_element.img else "N/A"
            
            #Rating and Number of Ratings:
            rating_element = item.find('a', class_='item-rating')
            rating = None 
            if rating_element:
                rating = float(rating_element.get('title').replace("Rating +", "")) if rating_element and rating_element.get('title') else "N/A"
            
            num_ratings_element = item.find('span', class_='item-rating-num')
            num_ratings = None
            if num_ratings_element:
                num_ratings = int(re.search(r'\d+', num_ratings_element.text).group()) if num_ratings_element else "N/A"
            
            #Current Price (convert to number):
            price_element = item.find('li', class_='price-current')
            if price_element.strong and price_element.sup:
                price_text = price_element.strong.text.replace(",", "") + price_element.sup.text
                current_price = float(price_text)
            else: 
                price_was_element = item.find('li', class_='price-was')
                if price_was_element:
                    pattern = r'\d{1,3}(?:,\d{3})*(?:\.\d+)?'
                    match = re.search(pattern, price_was_element.text)
                    if match: 
                        current_price = float(match.group().replace(',', ''))
                else: 
                    current_price = "N/A"
            #current_price = price_element.text.strip().replace('$', '').replace(',', '') if price_element else "N/A"
            
            #Shipping:
            shipping_element = item.find('li', class_='price-ship')
            shipping = 0
            if shipping_element.text.startswith('$'):
                shipping = float(re.search(r'\d+\.\d+', shipping_element.text).group())
            #shipping = shipping_element.text.strip() if shipping_element else "N/A"
            
            #New Column: Total Price:
            Total_price = current_price + shipping 
            #ImageURL:
            image_url = item.find('img')['src']
            
            #List_features:
            list_features = item.find('ul', class_='item-features')
            if list_features:
                list_features = list_features.find_all('li')
                features = {}
                max_resolution = None 
                hdmi = None 
                displayport = None
                card = None
                model = None
            
                for features in list_features:
                    features = features.text
                    
                    if features.startswith("Max Resolution"): 
                        max_resolution = features.split(':')[1].strip()
                            
                    if features.startswith("HDMI"):
                        hdmi = features.split(':')[1].strip()
                        
                    if features.startswith("DisplayPort"):
                        displayport = features.split(':')[1].strip()
                        
                    if features.startswith("Card"):
                        card = features.split(':')[1].strip()
                        
                    if features.startswith("Model"):
                        model = features.split(':')[1].strip()
                    
                    features = {'Max Resolution': max_resolution,
                                'HDMI': hdmi,
                                'DisplayPort': displayport,
                                'Card': card,
                                'Model': model}
                
                features = json.dumps(features)
            else: 
                features = None 
            
            GPU_info = {"Item ID": item_id,
                        "Title": title,
                        "Brand": branding,
                        "Rating": rating,
                        "Number of Ratings": num_ratings,
                        "Current Price": current_price,
                        "Shipping": shipping,
                        "Total Price": Total_price,
                        "Image URL": image_url,
                        "Features": features}
        
        
        print("\n".join([f"{key}: {value}" for key, value in GPU_info.items()]))
        print("\n")
        
        csv_writer.writerow(GPU_info)
    
    #Print lỗi requests ra
    except requests.RequestException as e:
        print(f"Error request: {e}")

base_url = "https://www.newegg.com/GPUs-Video-Graphics-Cards/SubCategory/ID-48"

total_pages = 80

#Lặp qua các trang
for page_number in range(1, total_pages + 1):
    params = {'page': page_number}
    url = urljoin(base_url, '?' + urlencode(params))
#Gọi hàm để crawling từ trang hiện tại
    crawling_page(url, writer)
#Đợi 1 thời gian ngăn mới chuyển sang trang tiếp theo, tránh tình trạng quá tải server    
    time.sleep(2)

Item ID: 14-150-877
Title: XFX SPEEDSTER MERC310 Radeon RX 7900 XT 20GB GDDR6 PCI Express 4.0 x16 Video Card RX-79TMERCB9
Brand: N/A
Rating: 4.4
Number of Ratings: 40
Current Price: 659.99
Shipping: 0
Total Price: 659.99
Image URL: https://c1.neweggimages.com/ProductImageCompressAll300/14-150-877-15.jpg
Features: {"Max Resolution": "7680 x 4320", "HDMI": "1 x HDMI 2.1", "DisplayPort": "3 x DisplayPort 1.4", "Card": null, "Model": "RX-79TMERCB9"}


Item ID: 14-993-004
Title: SPARKLE Intel Arc A770 TITAN OC Edition, 16GB GDDR6, ThermalSync, TORN Cooling, Axial Fan, Metal Backplate, SA770T-16GOC
Brand: N/A
Rating: 4.8
Number of Ratings: 4
Current Price: 269.99
Shipping: 0
Total Price: 269.99
Image URL: https://c1.neweggimages.com/ProductImageCompressAll300/14-993-004-01.png
Features: {"Max Resolution": "7680 x 4320", "HDMI": "1 x HDMI 2.0b", "DisplayPort": "3 x DisplayPort 2.0", "Card": "305 mm x 103 mm", "Model": "SA770T-16GOC"}


Item ID: 14-500-566
Title: ZOTAC GAMING GeForce RTX 4080 