In [1]:
import time
import csv
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup


In [9]:



# Initialize WebDriver
driver = webdriver.Chrome()

# Function to remove everything after the first '('
def remove_after_bracket(input_string):
    """
    Removes everything in the string after the first occurrence of '('.
    If '(' is not found, returns the original string.
    """
    return input_string.split('(')[0].strip()

# Function to extract CPU name
def extract_cpu_name(product_name):
    """
    Extracts and cleans the CPU name by removing text after '('.
    """
    return remove_after_bracket(product_name)

# Function to scrape a page and extract product data
def scrape_page(url, page_num):
    driver.get(url)
    time.sleep(3)  # Wait for the page to load
    
    # Parse the page source with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    # Find all product divs in the list
    product_divs = soup.find_all('a', class_='p-name')
    
    if not product_divs:
        return None
    
    products = []
    for product_div in product_divs:
        try:
            # Extract product details
            product_name = product_div.find('h3').get_text(strip=True).upper()  # Full product name in uppercase
            cpu_name = extract_cpu_name(product_name).upper()  # Cleaned CPU name in uppercase
            
            # Extract original price
            original_price = product_div.find_next('div', class_='price-container').find('del', class_='p-old-price').get_text(strip=True).upper()
            
            # Extract discounted price
            discounted_price = product_div.find_next('div', class_='price-container').find('span', class_='p-price').get_text(strip=True).upper()

            # Clean up the prices (remove unwanted characters like ' đ')
            original_price = original_price.replace(" Đ", "").strip()
            discounted_price = discounted_price.replace(" Đ", "").strip()

            # Append to products list
            products.append({
                'Name': product_name,  # Full product name in uppercase
                'Product Name': cpu_name,  # Cleaned CPU name in uppercase
                'Original Price': original_price,  # Original price in uppercase
                'Discounted Price': discounted_price  # Discounted price in uppercase
            })
        except Exception as e:
            print(f"Error extracting product info: {e}")
    
    return products

# Function to check if there's a next page
def has_next_page(page_num):
    try:
        next_button = driver.find_element(By.XPATH, f"//a[contains(@href, '?page={page_num + 1}')]")
        return next_button.is_displayed() and next_button.is_enabled()
    except Exception:
        return False

# Function to scrape a given category
def scrape_category(base_url, category_name):
    page_num = 1
    all_products = []

    print(f"Scraping category: {category_name}")

    while True:
        url = f"{base_url}?page={page_num}"
        print(f"Scraping page {page_num}...")
        
        products = scrape_page(url, page_num)
        if products is None:
            print(f"No products found on page {page_num}, stopping.")
            break
        
        all_products.extend(products)

        if not has_next_page(page_num):
            print("No more pages to scrape.")
            break
        
        page_num += 1

    return all_products

# Function to write products to a CSV file
def write_to_csv(products, filename):
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['Name', 'Product Name', 'Original Price', 'Discounted Price'])
        writer.writeheader()
        for product in products:
            writer.writerow(product)

# Base URLs for scraping
cpu_base_url = 'https://www.anphatpc.com.vn/cpu-bo-vi-xu-ly.html'

# Scrape the CPU category
cpu_products = scrape_category(cpu_base_url, "CPU")

# Write products to CSV file
write_to_csv(cpu_products, 'AnPhat_CPU.csv')

# Close the WebDriver
driver.quit()


Scraping category: CPU
Scraping page 1...
Scraping page 2...
Scraping page 3...
Error extracting product info: 'NoneType' object has no attribute 'get_text'
Error extracting product info: 'NoneType' object has no attribute 'get_text'
Error extracting product info: 'NoneType' object has no attribute 'get_text'
Error extracting product info: 'NoneType' object has no attribute 'get_text'
Scraping page 4...
Error extracting product info: 'NoneType' object has no attribute 'get_text'
Error extracting product info: 'NoneType' object has no attribute 'get_text'
Error extracting product info: 'NoneType' object has no attribute 'get_text'
Error extracting product info: 'NoneType' object has no attribute 'get_text'
No more pages to scrape.
