In [32]:
import random
import time
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

#### Get soup Functions

In [33]:
# Pool of browser User-Agent strings; get_soup() picks one at random for each
# attempt and passes it to headless Chrome via the "user-agent=" argument.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/89.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15",
]

def get_soup(url, max_retries=3):
    """Load ``url`` in headless Chrome and return the rendered page as soup.

    Each attempt starts a fresh Chrome instance with a randomly chosen
    User-Agent from ``user_agents``, waits briefly for dynamic content, and
    parses ``driver.page_source`` with BeautifulSoup's ``html.parser``.

    Args:
        url: Address of the page to load.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when every attempt
        failed (failures are printed, not raised).
    """
    retries = 0
    while retries < max_retries:
        try:
            # Randomly select a User-Agent for this attempt
            user_agent = random.choice(user_agents)

            # Use Selenium to handle dynamic content loading
            options = webdriver.ChromeOptions()
            options.add_argument("--headless")
            options.add_argument(f"user-agent={user_agent}")
            driver = webdriver.Chrome(options=options)

            try:
                driver.get(url)
                time.sleep(1)  # Wait for the content to load, adjust as needed
                page_source = driver.page_source
            finally:
                # Always shut the browser down, even when loading fails;
                # otherwise every failed attempt leaves a Chrome process behind.
                driver.quit()

            # Parse the page source with BeautifulSoup
            soup = BeautifulSoup(page_source, 'html.parser')

            # Random 1-2 second pause between pages to mimic human behavior
            time.sleep(random.uniform(1, 2))

            print(url)
            return soup

        except RequestException as e:
            retries += 1
            print(f"Request failed: {e}. Retrying ({retries}/{max_retries})...")
            time.sleep(2 ** retries)  # Exponential backoff

        except Exception as e:
            retries += 1
            print(f"Error: {e}. Retrying ({retries}/{max_retries})...")
            time.sleep(2 ** retries)  # Exponential backoff

    print(f"Failed to fetch {url} after {max_retries} retries.")
    return None


#### Amazon Functions

In [34]:
def extract_products_amazon(soup, limit = 9999):
    """Collect product title, price and link from an Amazon search page.

    Args:
        soup: Parsed search-results page (anything offering ``find_all``).
        limit: Maximum number of products to return.

    Returns:
        A list of dicts with keys ``'Model'``, ``'Price'`` and ``'Link'``.
        Result cards missing any of the three pieces are skipped.
    """
    products = []
    for container in soup.find_all('div', {'data-component-type': 's-search-result'}):
        if len(products) >= limit:
            break
        try:
            model = container.find('span', {'class': 'a-size-base-plus a-color-base a-text-normal'}).text
            price = container.find('span', {'class': 'a-offscreen'}).text
            link = container.find('a', {'class': 'a-link-normal s-no-hover s-underline-text s-underline-link-text s-link-style a-text-normal'})['href']
        except (AttributeError, TypeError, KeyError):
            # AttributeError: a span was not found (None has no .text).
            # TypeError: the anchor was not found (None is not subscriptable).
            # KeyError: the anchor exists but carries no href attribute.
            continue
        products.append({
            'Model': model,
            'Price': price,
            'Link': "https://www.amazon.com.mx" + link,
        })

    return products

def get_data_amazon(data, components,websites):
    """Search Amazon for every component and append the hits to ``data``.

    Args:
        data: List that result rows are appended to (mutated in place).
        components: Mapping of part label -> search query.
        websites: Mapping of site name -> search URL prefix; the ``"Amazon"``
            entry is used.

    Returns:
        The same ``data`` list, with one
        ``["Amazon", part, query, price, model, link]`` row per product
        (at most 5 per query).
    """
    for part, query in components.items():
        # quote_plus turns spaces into '+' like before, and additionally
        # escapes characters such as '&' or '#' that would corrupt the URL.
        url = websites["Amazon"] + quote_plus(query)
        soup = get_soup(url)
        if soup:
            print("Retreived succesfully.")
            products = extract_products_amazon(soup, limit = 5)
            for product in products:
                data.append(["Amazon", part, query, product['Price'], product['Model'], product['Link']])
        else:
            print("Failed to retrieve the page.")
    return data

#### Cyberpuerta Functions

In [35]:

def extract_products_cyberpuerta(soup):
    """Collect product title, price and link from a Cyberpuerta search page.

    Args:
        soup: Parsed search-results page (anything offering ``find_all``).

    Returns:
        A list of dicts with keys ``'Model'``, ``'Price'`` and ``'Link'``.
        Containers missing the anchor, its href or the price label are skipped.
    """
    products = []
    for container in soup.find_all('div', {'class': 'emproduct_right'}):
        try:
            # The first anchor supplies both the title and the product URL.
            anchor = container.find('a')
            model = anchor.text
            price = container.find('label', {'class': 'price'}).text
            link = anchor['href']
        except (AttributeError, TypeError, KeyError):
            # Anchor or price label not found, or anchor without an href.
            continue
        products.append({'Model': model, 'Price': price, 'Link': link})

    return products

def get_data_cyberpuerta(data, components, websites):
    """Search Cyberpuerta for every component and append the hits to ``data``.

    Args:
        data: List that result rows are appended to (mutated in place).
        components: Mapping of part label -> search query.
        websites: Mapping of site name -> search URL prefix; the
            ``"Cyberpuerta"`` entry is used.

    Returns:
        The same ``data`` list, with one
        ``["Cyberpuerta", part, query, price, model, link]`` row per product.
    """
    for part, query in components.items():
        # quote_plus turns spaces into '+' like before, and additionally
        # escapes characters such as '&' or '#' that would corrupt the URL.
        url = websites["Cyberpuerta"] + quote_plus(query)
        soup = get_soup(url)
        if soup:
            print("Retreived succesfully.")
            products = extract_products_cyberpuerta(soup)
            for product in products:
                data.append(["Cyberpuerta", part, query, product['Price'], product['Model'],  product['Link']])
        else:
            print("Failed to retrieve the page.")
    return data

#### Google Functions

In [38]:
def extract_google_shopping_data(soup, limit = 9999):
    """Collect product title, price and link from a Google Shopping page.

    Args:
        soup: Parsed results page (anything offering ``select``).
        limit: Maximum number of products to return.

    Returns:
        A list of dicts with keys ``'Model'``, ``'Price'`` and ``'Link'``.
        Grid entries missing a piece, or whose link is neither a ``/url``
        redirect nor an absolute ``http`` URL, are skipped.
    """
    products = []
    for container in soup.select('div.sh-dgr__grid-result'):
        if len(products) >= limit:
            break
        try:
            model = container.find('h3', {'class': 'tAxDx'}).text.strip()
            price = container.find('span', {'class': 'a8Pemb OFFNJ'}).text.strip()
            link = container.find('a', {'class': 'shntl'})['href']
        except (AttributeError, TypeError, KeyError):
            # AttributeError: title/price element not found.
            # TypeError: anchor not found (None is not subscriptable).
            # KeyError: anchor without an href attribute.
            continue

        # Ensure the link is complete
        if link.startswith("/url"):
            full_link = "https://www.google.com" + link
        elif link.startswith("http"):
            full_link = link
        else:
            continue

        products.append({'Model': model, 'Price': price, 'Link': full_link})
    return products


def get_data_google_shopping(data, components, websites):
    """Search Google Shopping for every component and append hits to ``data``.

    Args:
        data: List that result rows are appended to (mutated in place).
        components: Mapping of part label -> search query.
        websites: Mapping of site name -> search URL prefix; the ``"Google"``
            entry is used.

    Returns:
        The same ``data`` list, with one
        ``["Google", part, query, price, model, link]`` row per product
        (at most 15 per query).
    """
    for part, query in components.items():
        # quote_plus turns spaces into '+' like before, and additionally
        # escapes characters such as '&' or '#' that would corrupt the URL.
        url = websites["Google"] + quote_plus(query)
        soup = get_soup(url)
        if soup:
            print("Retreived succesfully.")
            products = extract_google_shopping_data(soup, limit = 15)
            for product in products:
                data.append(["Google", part, query, product['Price'], product['Model'],  product['Link']])
        else:
            print("Failed to retrieve the page.")
    return data

#### Tester

In [44]:
# Product page kept for reference alongside the component list below:
#https://pcreathors.mx/producto/ma11-equipo-amd-ryzen-9-7950x-64gb-ram-rtx-4080-super-aero-oc-2tb-ssd

# Part label -> search query sent to every store.
components = {
    "CPU": "AMD 7950X",
    "MB": "Tarjeta madre AM5 AMD",
    "GPU": "4070 Ti Super",
    "GAB": "Gabinete computadora",
    "SSD": "SSD 2TB",
    "RAM": "DDR5 RAM 64 Gb",
    "COOL": "MSI MAG CORELIQUID E360",
    "PSU": "PSU 850W 80 PLUS Gold"
}

# Store name -> search URL prefix; the encoded query is appended to it.
websites = {
    "Amazon": "https://www.amazon.com.mx/s?k=",
    "Cyberpuerta": "https://www.cyberpuerta.mx/index.php?cl=search&searchparam=",
    "Google" : "https://www.google.com/search?tbm=shop&q="
}

# Each get_data_* call appends its rows to the same list and returns it, so
# `data` accumulates the results of all three stores.
data = []
data = get_data_amazon(data, components, websites)
data = get_data_cyberpuerta(data, components, websites)
data = get_data_google_shopping(data, components, websites)


https://www.amazon.com.mx/s?k=AMD+7950X
Retreived succesfully.
https://www.amazon.com.mx/s?k=Tarjeta+madre+AM5+AMD
Retreived succesfully.
https://www.amazon.com.mx/s?k=4070+Ti+Super
Retreived succesfully.
https://www.amazon.com.mx/s?k=Gabinete+computadora
Retreived succesfully.
https://www.amazon.com.mx/s?k=SSD+2TB
Retreived succesfully.
https://www.amazon.com.mx/s?k=DDR5+RAM+64+Gb
Retreived succesfully.
https://www.amazon.com.mx/s?k=MSI+MAG+CORELIQUID+E360
Retreived succesfully.
https://www.amazon.com.mx/s?k=PSU+850W+80+PLUS+Gold
Retreived succesfully.
https://www.cyberpuerta.mx/index.php?cl=search&searchparam=AMD+7950X
Retreived succesfully.
https://www.cyberpuerta.mx/index.php?cl=search&searchparam=Tarjeta+madre+AM5+AMD
Retreived succesfully.
https://www.cyberpuerta.mx/index.php?cl=search&searchparam=4070+Ti+Super
Retreived succesfully.
https://www.cyberpuerta.mx/index.php?cl=search&searchparam=Gabinete+computadora
Retreived succesfully.
https://www.cyberpuerta.mx/index.php?cl=searc

In [45]:
# One row per scraped product; the column order mirrors the lists appended by
# the get_data_* functions (store, part, query, price, model text, link).
df = pd.DataFrame(data, columns=['location', 'part', 'query','price','full',  'link'])
# Persist the raw scrape as a tab-separated file.
df.to_csv('computer_components.tsv', sep='\t', index=False)
# Keep an untouched copy so later cleaning cells can restart from the raw data.
backup = df.copy()
backup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228 entries, 0 to 227
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   location  228 non-null    object
 1   part      228 non-null    object
 2   query     228 non-null    object
 3   price     228 non-null    object
 4   full      228 non-null    object
 5   link      228 non-null    object
dtypes: object(6)
memory usage: 10.8+ KB


In [46]:
# Restart from the raw copy so this cell can be re-run safely.
df = backup.copy()
# Trim surrounding whitespace from every string cell; other values pass through.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases in
# favour of DataFrame.map — confirm the installed pandas version before switching.
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

# Updated clean_price function to handle various formats
def clean_price(price):
    """Convert a scraped price such as ``'$1,234.56'`` or ``'MXN 2,199.00'`` to float.

    Args:
        price: Raw price value. Strings may carry surrounding whitespace;
            numbers are passed through so the function can safely be applied
            to an already-cleaned column.

    Returns:
        The price as a ``float``, or ``NaN`` when the value does not start
        with ``'$'`` or ``'M'`` (as in ``'MXN'``), cannot be parsed, or is not
        a string/number at all.
    """
    nan = float('NaN')

    # Already numeric (e.g. the column was cleaned before): keep the value.
    if isinstance(price, (int, float)) and not isinstance(price, bool):
        return float(price)
    if not isinstance(price, str):
        return nan

    price = price.strip()
    if price.startswith('$'):
        currency_marker = '$'
    elif price.startswith('M'):
        currency_marker = 'MXN'
    else:
        # Non-monetary text such as "Ver opciones"
        return nan

    # Drop the currency marker and thousands separators before parsing.
    try:
        return float(price.replace(currency_marker, '').replace(',', ''))
    except ValueError:
        return nan

# Apply the clean_price function to the Price column
# Apply the clean_price function to the price column
df['price'] = df['price'].apply(clean_price)
# Drop every row containing a NaN in any column (unparseable prices become NaN above).
df = df.dropna()
# Overwrites the raw TSV written earlier with the cleaned data.
df.to_csv('computer_components.tsv', sep='\t', index=False)
df.head()



Unnamed: 0,location,part,query,price,full,link
0,Amazon,CPU,AMD 7950X,11248.11,AMD Ryzen 9 7950X3D Procesador de Escritorio d...,https://www.amazon.com.mx/AMD-7950X3D-Procesad...
1,Amazon,CPU,AMD 7950X,10154.54,AMD Ryzen™ 9 7950X Procesador,https://www.amazon.com.mx/AMD-RyzenTM-9-7950X-...
2,Amazon,CPU,AMD 7950X,6944.0,AMD Ryzen™ 9 7900X Procesador,https://www.amazon.com.mx/AMD-RyzenTM-9-7900X-...
3,Amazon,CPU,AMD 7950X,7794.04,Amd CPU RYZEN 7 7800X3D Radeon Graphics AM5 (1...,https://www.amazon.com.mx/AMD-7800X3D-Radeon-G...
4,Amazon,CPU,AMD 7950X,7943.13,AMD Ryzen 9 7900X3D Procesador de Escritorio d...,https://www.amazon.com.mx/AMD-Ryzen-7900X3D-Pr...


In [29]:
# Drop rows with NaN in the price column
df = df.dropna(subset=['price'])

# Get the index labels of the minimum price for each part
min_price_indices = df.groupby('part')['price'].idxmin()

# Select the rows with the minimum prices (one row per part)
cheapest_options = df.loc[min_price_indices]

# Save the cheapest option per part as a tab-separated file
cheapest_options.to_csv('cheapest_options.tsv', sep='\t', index=False)
cheapest_options


Unnamed: 0,location,part,query,price,full,link
31,Amazon,COOL,MSI MAG CORELIQUID E360,1714.77,MSI MAG CORELIQUID C360 Refrigeración Líquida ...,https://www.amazon.com.mx/MSI-CORELIQUID-Venti...
2,Amazon,CPU,AMD 7950X,6944.0,AMD Ryzen™ 9 7900X Procesador,https://www.amazon.com.mx/AMD-RyzenTM-9-7900X-...
17,Amazon,GAB,MSI MAG PANO M100R PZ,780.0,MSI Gabinete Gaming mag Shield M301,https://www.amazon.com.mx/MSI-Gabinete-Gaming-...
53,Cyberpuerta,GPU,4070 Ti Super,15179.0,Tarjeta de Video Zotac NVIDIA GAMING GeForce R...,https://www.cyberpuerta.mx/Computo-Hardware/Co...
41,Cyberpuerta,MB,MSI B650M,2199.0,"Tarjeta Madre MSI Micro-ATX PRO B650M-P, S-AM5...",https://www.cyberpuerta.mx/Computo-Hardware/Co...
36,Amazon,PSU,XPG KYBER 850W 80 PLUS Gold,1519.0,XPG Fuente De Poder Gamer KYBER 750W 80 Plus G...,https://www.amazon.com.mx/XPG-Fuente-Poder-Gam...
29,Amazon,RAM,DDR5 64GB,3089.0,Corsair Vengeance DDR5 RAM de 64 GB (2 x 32 GB...,https://www.amazon.com.mx/Corsair-optimizada-c...
73,Cyberpuerta,SSD,SSD 2TB,1809.0,"SSD Acer FA200 NVMe, 2TB, PCI Express 4.0, M.2",https://www.cyberpuerta.mx/Computo-Hardware/Di...


In [None]:
# Total price of the build: sum of the cheapest option found for each part.
sum_t = cheapest_options['price'].values.sum()
print(f'The configuration requires {sum_t}')

The configuration requires 21999.21


In [2]:
from PC_parts_scrapper import get_data_amazon, get_data_cyberpuerta, get_data_google_shopping, clean_price
import pandas as pd
def scrape(components, csv = ''):
    """Scrape all stores for ``components`` and save the cleaned results.

    Args:
        components: Mapping of part label -> search query.
        csv: Optional path to a tab-separated file from an earlier run; when
            given, its rows are placed before the newly scraped ones.

    Returns:
        DataFrame with columns ``location, part, query, price, full, link``.
        The same frame is written to ``computer_components.tsv``.

    Raises:
        FileNotFoundError: If ``csv`` is given but does not exist. The file is
            read before any scraping starts, so a wrong path fails
            immediately instead of discarding a finished scrape.
    """
    # Load earlier results first: scraping is slow, reading a TSV is not.
    previous_df = pd.read_csv(csv, sep='\t') if csv else None

    websites = {
        "Amazon": "https://www.amazon.com.mx/s?k=",
        "Cyberpuerta": "https://www.cyberpuerta.mx/index.php?cl=search&searchparam=",
        "Google" : "https://www.google.com/search?tbm=shop&q="
    }

    # Each get_data_* call appends its rows to the list and returns it.
    data = []
    data = get_data_amazon(data, components, websites)
    data = get_data_cyberpuerta(data, components, websites)
    data = get_data_google_shopping(data, components, websites)

    df = pd.DataFrame(data, columns=['location', 'part', 'query','price','full',  'link'])

    # Apply the clean_price function to the price column and drop rows
    # containing NaN (unparseable prices become NaN).
    df['price'] = df['price'].apply(clean_price)
    df = df.dropna()
    if previous_df is not None:
        df = pd.concat([previous_df, df], ignore_index=True)
    df.to_csv('computer_components.tsv', sep='\t', index=False)

    return df
    
# Batch 1: called without `csv`, so computer_components.tsv is written with
# only this batch's rows.
components = {
    "CPU": "AMD 7950X",
    "MB": "Tarjeta madre AM5 AMD",
    "GPU": "4070 Ti Super",
    "GAB": "Gabinete computadora ATX",
}
scrape(components)
# Batch 2: passing the TSV makes scrape() read the earlier rows and save them
# together with the new ones.
components = {
    "SSD": "SSD 2TB",
    "RAM": "DDR5 RAM 64 Gb",
    "COOL": "MSI MAG CORELIQUID E360",
    "PSU": "PSU 850W 80 PLUS Gold"
}
scrape(components, csv = 'computer_components.tsv')
# Batch 3: alternative queries (Intel CPU and motherboard, 4070 Ti, 4TB SSD),
# added under the same part labels.
components = {
    "CPU": "Intel Core i7 14700",
    "MB": "Tarjeta madre LGA1700 INTEL",
    "GPU": "4070 Ti",
    "SSD": "SSD 4TB",
}
scrape(components, csv = 'computer_components.tsv')
# Batch 4: further alternatives; as the last expression, this call's returned
# frame is what the cell displays.
components = {
    "GPU": "4070 Super",
    "GAB": "Gabinete Midi-Tower",
    "RAM": "DDR5 RAM 32 Gb",
    "COOL": "Corsair iCUE Link H150i RGB",
}
scrape(components, csv = 'computer_components.tsv')

https://www.amazon.com.mx/s?k=AMD+7950X
Retreived succesfully.
https://www.amazon.com.mx/s?k=Tarjeta+madre+AM5+AMD
Retreived succesfully.
https://www.amazon.com.mx/s?k=4070+Ti+Super
Retreived succesfully.
https://www.amazon.com.mx/s?k=Gabinete+computadora+ATX
Retreived succesfully.
https://www.cyberpuerta.mx/index.php?cl=search&searchparam=AMD+7950X
Retreived succesfully.
https://www.cyberpuerta.mx/index.php?cl=search&searchparam=Tarjeta+madre+AM5+AMD
Retreived succesfully.
https://www.cyberpuerta.mx/index.php?cl=search&searchparam=4070+Ti+Super
Retreived succesfully.
https://www.cyberpuerta.mx/index.php?cl=search&searchparam=Gabinete+computadora+ATX
Retreived succesfully.
https://www.google.com/search?tbm=shop&q=AMD+7950X
Retreived succesfully.
https://www.google.com/search?tbm=shop&q=Tarjeta+madre+AM5+AMD
Retreived succesfully.
https://www.google.com/search?tbm=shop&q=4070+Ti+Super
Retreived succesfully.
https://www.google.com/search?tbm=shop&q=Gabinete+computadora+ATX
Retreived succ

Unnamed: 0,location,part,query,price,full,link
0,Amazon,CPU,AMD 7950X,9982.60,AMD Ryzen™ 9 7950X Procesador,https://www.amazon.com.mx/AMD-RyzenTM-9-7950X-...
1,Amazon,CPU,AMD 7950X,11248.11,AMD Ryzen 9 7950X3D Procesador de Escritorio d...,https://www.amazon.com.mx/AMD-7950X3D-Procesad...
2,Amazon,CPU,AMD 7950X,6944.00,AMD Ryzen™ 9 7900X Procesador,https://www.amazon.com.mx/AMD-RyzenTM-9-7900X-...
3,Amazon,CPU,AMD 7950X,7553.48,Amd CPU RYZEN 7 7800X3D Radeon Graphics AM5 (1...,https://www.amazon.com.mx/AMD-7800X3D-Radeon-G...
4,Amazon,CPU,AMD 7950X,7943.13,AMD Ryzen 9 7900X3D Procesador de Escritorio d...,https://www.amazon.com.mx/AMD-Ryzen-7900X3D-Pr...
...,...,...,...,...,...,...
289,Google,COOL,Corsair iCUE Link H150i RGB,3799.00,Enf. Líquido Corsair iCUE H150i Elite Capellix...,https://www.google.com/url?url=https://ddtech....
290,Google,COOL,Corsair iCUE Link H150i RGB,3443.49,Corsair Refrigeración Líquida Icue H150i Elite...,https://www.google.com/url?url=https://www.tra...
291,Google,COOL,Corsair iCUE Link H150i RGB,2148.06,Enfriamiento Liquido Corsair H150 Argb 360Mm I...,https://www.google.com/url?url=https://mipc.co...
292,Google,COOL,Corsair iCUE Link H150i RGB,4199.00,Corsair Icue Link H100i Rgb Aio Enfriamiento L...,https://www.google.com/url?url=https://www.liv...
