In [1]:
import requests
from bs4 import BeautifulSoup
import time
import csv
import random
import pandas as pd
import re


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains


In [14]:
# Function to click the "View More" button until no more products are loaded
def load_all_products():
    try:
        while True:
            # Wait for the "View More" button to be clickable
            view_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "btn-view-more"))
            )
            
            # Check if the button is visible, then click it
            if view_more_button.is_displayed():
                view_more_button.click()
                time.sleep(3)  # Wait for products to load
            else:
                break  # Exit if the button is no longer displayed
    except Exception as e:
        print("No more 'View More' button or error:", e)


In [15]:
# Function to scrape product data after all products are loaded
def scrape_product_data():
    # Wait for product list to be visible
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "product-list-holder"))
    )

    # Get all products displayed on the page
    products = driver.find_elements(By.CLASS_NAME, "p-info")
    product_data = []

    for product in products:
        try:
            # Extract product name
            product_name = product.find_element(By.CLASS_NAME, "p-name").text.strip()
            
            # Extract original price and discounted price
            original_price = product.find_element(By.CLASS_NAME, "p-mprice").text.strip() if product.find_elements(By.CLASS_NAME, "p-mprice") else "N/A"
            discounted_price = product.find_element(By.CLASS_NAME, "p-price").text.strip() if product.find_elements(By.CLASS_NAME, "p-price") else "N/A"

            # Save the product data (excluding SKU)
            product_data.append([product_name, original_price, discounted_price])
        except Exception as e:
            print(f"Error extracting product info: {e}")

    return product_data


In [16]:
# Save the scraped data to a CSV file
def save_to_csv(data, filename):
    try:
        with open(filename, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            # Write headers (excluding SKU)
            writer.writerow(["Product Name", "Original Price", "Discounted Price"])
            # Write product data (excluding SKU)
            for product in data:
                writer.writerow(product)
        print(f"Data saved to {filename}")
    except Exception as e:
        print(f"Error saving data: {e}")


In [17]:
# Main function to scrape from each URL
def scrape_from_urls(urls, fileanme):
    all_products = []

    for url in urls:
        print(f"Scraping products from: {url}")
        driver.get(url)
        
        # Give the page time to load
        time.sleep(3)
        
        # Load all products by clicking the "View More" button
        load_all_products()

        # Scrape the product data from the page
        products = scrape_product_data()

        # Add the scraped products to the main list
        all_products.extend(products)

    # Save all the scraped product data to a CSV file
    save_to_csv(all_products, fileanme)


In [18]:
# List of vendor URLs
GPU_urls_list = [
    "https://hacom.vn/vga-nvidia",
    "https://hacom.vn/vga-amd",
    "https://hacom.vn/vga-intel"
]

CPU_urls_list = [
    "https://hacom.vn/cpu-intel",
    "https://hacom.vn/cpu-amd"
]

# Set up Selenium WebDriver (no need for specifying path if ChromeDriver is in PATH)
driver = webdriver.Chrome()

# Run the scraping process
scrape_from_urls(GPU_urls_list,"HaComp_GPU_List.csv")
scrape_from_urls(CPU_urls_list,"HaComp_CPU_List.csv")

# Close the driver after scraping is complete
driver.quit()


Scraping products from: https://hacom.vn/vga-nvidia
No more 'View More' button or error: Message: 
Stacktrace:
#0 0x5563161504ca <unknown>
#1 0x556315c63620 <unknown>
#2 0x556315cb2306 <unknown>
#3 0x556315cb25a1 <unknown>
#4 0x556315cf7a04 <unknown>
#5 0x556315cd639d <unknown>
#6 0x556315cf4de0 <unknown>
#7 0x556315cd6113 <unknown>
#8 0x556315ca4be0 <unknown>
#9 0x556315ca5bbe <unknown>
#10 0x55631611ce4b <unknown>
#11 0x556316120de2 <unknown>
#12 0x556316109d2c <unknown>
#13 0x556316121957 <unknown>
#14 0x5563160ef4bf <unknown>
#15 0x55631613f348 <unknown>
#16 0x55631613f510 <unknown>
#17 0x55631614f346 <unknown>
#18 0x7f2dff6dfac3 <unknown>

Scraping products from: https://hacom.vn/vga-amd
No more 'View More' button or error: Message: 
Stacktrace:
#0 0x5563161504ca <unknown>
#1 0x556315c63620 <unknown>
#2 0x556315cb2306 <unknown>
#3 0x556315cb25a1 <unknown>
#4 0x556315cf7a04 <unknown>
#5 0x556315cd639d <unknown>
#6 0x556315cf4de0 <unknown>
#7 0x556315cd6113 <unknown>
#8 0x556315ca4b

In [19]:
# URL of the webpage
url = "https://www.cpubenchmark.net/top-gaming-cpus.html"

# Send a GET request to the webpage
response = requests.get(url)
if response.status_code != 200:
    print(f"Failed to retrieve data. HTTP Status Code: {response.status_code}")
else:
    print("Webpage successfully retrieved!")


Webpage successfully retrieved!


In [20]:
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

# Find the parent container with class "chartlist"
chartlist = soup.find("ul", {"class": "chartlist"})
if not chartlist:
    print("Could not find the 'chartlist' container on the page.")
else:
    print("Found 'chartlist' container!")


Found 'chartlist' container!


In [21]:
# Initialize a list to store CPU data
cpu_data = []

# Iterate over each list item in the chartlist
for cpu in chartlist.find_all("li", {"class": "platform-cpu"}):
    try:
        # Extract the CPU name
        prdname = cpu.find("span", {"class": "prdname"}).text.strip()

        # Extract the count
        count = cpu.find("span", {"class": "count"}).text.strip()

        # Append to the CPU data list
        cpu_data.append({"Product Name": prdname, "Score": count})
    except AttributeError:
        # Skip items that don't match the expected structure
        print("Skipping an item due to missing data.")

# Display the extracted data
print("Scraped CPU Data:")
for item in cpu_data:
    print(f"Product Name: {item['Product Name']}, Score: {item['Score']}")

Scraped CPU Data:
Product Name: AMD Ryzen 7 9800X3D, Score: 12,206
Product Name: Intel Core Ultra 9 285K, Score: 11,172
Product Name: AMD Ryzen 9 7900X3D, Score: 10,889
Product Name: AMD Ryzen 5 7600X3D, Score: 10,717
Product Name: AMD Ryzen 7 7800X3D, Score: 10,331
Product Name: AMD Ryzen 9 7950X3D, Score: 9,870
Product Name: Intel Core Ultra 7 265K, Score: 9,453
Product Name: Intel Core Ultra 5 245K, Score: 9,310
Product Name: Intel Core i9-14900KS, Score: 9,252
Product Name: AMD Ryzen 5 5600X3D, Score: 8,942
Product Name: Intel Core i9-13900KS, Score: 8,610
Product Name: AMD Ryzen 7 5800X3D, Score: 8,368
Product Name: AMD Ryzen 9 9900X, Score: 8,273
Product Name: Intel Core i9-14900K, Score: 8,206
Product Name: Intel Core i9-14900KF, Score: 8,165
Product Name: Intel Core i9-13900K, Score: 7,900
Product Name: AMD Ryzen 7 5700X3D, Score: 7,861
Product Name: Intel Core i7-14700KF, Score: 7,738
Product Name: Intel Core i9-13900KF, Score: 7,626
Product Name: Intel Core i7-14700K, Score: 

In [22]:
# Convert Product Name to uppercase and rename "Count" to "Score"
for item in cpu_data:
    item["Product Name"] = item["Product Name"].upper()
    item["Score"] = int(item.pop("Score").replace(",", ""))  # Rename and convert "Count" to integer

# Find the highest score to normalize
max_score = max(item["Score"] for item in cpu_data)

# Add a normalized percentage column
for item in cpu_data:
    item["Normalized Score (%)"] = round((item["Score"] / max_score) * 100, 2)  # Calculate percentage

# Save the modified data to a CSV file
output_file = "CPU_Performance_list.csv"
try:
    with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
        # Add "Normalized Score (%)" to the fieldnames
        fieldnames = ["Product Name", "Score", "Normalized Score (%)"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        writer.writerows(cpu_data)

    print(f"Data successfully saved to '{output_file}' with normalized scores.")
except Exception as e:
    print(f"Failed to save data to CSV. Error: {e}")


Data successfully saved to 'CPU_Performance_list.csv' with normalized scores.


In [23]:
# Print a confirmation message
print(f"Total CPUs scraped: {len(cpu_data)}")
print(f"CSV file created: {output_file}")


Total CPUs scraped: 300
CSV file created: CPU_Performance_list.csv


In [24]:
# Merging the performance to the specific CPU SKU

# Step 1: Read the two CSV files
eshop_df = pd.read_csv('HaComp_CPU_List.csv')  # Replace with the path to your first file
cpu_score_df = pd.read_csv('CPU_Performance_list.csv')  # Replace with the path to your second file

# Step 2: Clean up column names (remove any leading/trailing spaces)
cpu_score_df.columns = cpu_score_df.columns.str.strip()  # Remove extra spaces from column names
eshop_df.columns = eshop_df.columns.str.strip()

# Step 3: Ensure all 'Product Name' values are strings and handle NaN values
eshop_df['Product Name'] = eshop_df['Product Name'].astype(str).fillna('')
cpu_score_df['Product Name'] = cpu_score_df['Product Name'].astype(str).fillna('')

eshop_df = eshop_df.rename(columns={
    'Original Price': "Ha Comp's Price (Original)",
    'Discounted Price': "Ha Comp's Price (Discounted)"
})

# Step 4: Define the function to match product names and retrieve the Normalize Score (%)
def match_product_name(product_name, score_df):
    for _, row in score_df.iterrows():
        if row['Product Name'] in product_name:  # Check if the name from the second file is in the first file's name
            return row['Normalized Score (%)']  # Use the correct column name
    return None  # Return None if no match is found

# Step 5: Apply the matching function to the e-shop DataFrame
eshop_df['Normalized Score (%)'] = eshop_df['Product Name'].apply(
    lambda x: match_product_name(x, cpu_score_df)
)

# Step 6: Save the updated DataFrame to a new CSV file
eshop_df.to_csv('HaCom_With_Score.csv', index=False)

# Step 7: Optionally, preview the updated DataFrame
print(eshop_df.head())

  Product Name Ha Comp's Price (Original) Ha Comp's Price (Discounted)  \
0          nan                        NaN                          NaN   
1          nan                        NaN                          NaN   
2          nan                        NaN                          NaN   
3          nan                        NaN                          NaN   
4          nan                        NaN                          NaN   

   Normalized Score (%)  
0                   NaN  
1                   NaN  
2                   NaN  
3                   NaN  
4                   NaN  


In [3]:

# Load CSV
df = pd.read_csv("HaComp_CPU_List.csv")

# Remove blank rows and columns
df = df.dropna(how='all')
df = df.dropna(axis=1, how='all')

# Fill missing values with 0
df = df.fillna(0)

# Drop duplicate rows
df = df.drop_duplicates()

# Save the cleaned data
df.to_csv("HaComp_CPU_List.csv", index=False)
