-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
116 lines (93 loc) · 3.95 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from time import sleep
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import json
from bs4 import BeautifulSoup
import requests
def extract_clean_string(string: str):
    """Return *string* without U+00A0 / U+00E9 characters and without outer whitespace.

    Every no-break space (U+00A0) and every "e with acute" (U+00E9) is
    deleted wherever it occurs; the result is then stripped of leading and
    trailing whitespace.
    """
    # Mapping a code point to None makes str.translate delete that character.
    unwanted = {0x00A0: None, 0x00E9: None}
    return string.translate(unwanted).strip()
def _select_text(soup, selector):
    """Return the stripped text of the first match for *selector*, or None."""
    node = soup.select_one(selector)
    return node.text.strip() if node is not None else None


def _select_attr(soup, selector, attribute):
    """Return *attribute* of the first match for *selector*, or None.

    None is returned both when nothing matches and when the matched element
    does not carry the attribute.
    """
    node = soup.select_one(selector)
    # Tag.get() yields None for a missing attribute, whereas tag[...] raises
    # KeyError.
    return node.get(attribute) if node is not None else None


def parse_product(product_html):
    """Extract the fields of a single product card from its HTML.

    Args:
        product_html: HTML markup (a string) of one product list item.

    Returns:
        A dict with the keys ``name``, ``link``, ``image_url``, ``brand``,
        ``description``, ``price`` and ``discounted_price``. Each value is a
        string, or None when the corresponding element or attribute is not
        present in the markup. Text values are stripped of surrounding
        whitespace.
    """
    soup = BeautifulSoup(product_html, 'html.parser')
    return {
        'name': _select_text(soup, '.woo-loop-product__title a'),
        'link': _select_attr(soup, '.woo-loop-product__title a', 'href'),
        'image_url': _select_attr(soup, '.mf-product-thumbnail img', 'src'),
        'brand': _select_text(soup, '.meta-brand a'),
        'description': _select_text(
            soup, '.woocommerce-product-details__short-description'
        ),
        # First price amount found in the card.
        # NOTE(review): with standard WooCommerce sale markup this is the
        # original (struck-through) price, not the sale price - confirm this
        # is what the API expects.
        'price': _select_text(soup, 'span.woocommerce-Price-amount.amount'),
        # WooCommerce sale markup puts the old price in <del> and the reduced
        # price in <ins>; reading <del> here would just repeat 'price'.
        'discounted_price': _select_text(
            soup, '.price ins .woocommerce-Price-amount bdi'
        ),
    }
def get_product_list_and_parse(html):
    """Parse every product found in a listing page.

    Args:
        html: Full HTML source of the listing page.

    Returns:
        A list with one ``parse_product`` result per ``<li>`` element found
        inside the first ``.products-content`` element. The list is empty
        when the page has no such container.
    """
    soup = BeautifulSoup(html, 'html.parser')
    product_container = soup.select_one('.products-content')
    if product_container is None:
        # Without this guard the find_all() below would raise AttributeError
        # on pages that have no listing container.
        return []
    # NOTE(review): find_all('li') also matches <li> elements nested inside a
    # product card (for example bullet lists in a description); if that
    # occurs on the target site, narrow this to the product items only.
    return [
        parse_product(str(item))
        for item in product_container.find_all('li')
    ]
def get_html_with_selenium(url):
    """Load *url* in headless Chrome and return the products parsed from it.

    Despite its name, this function does not return HTML: the page source is
    passed to ``get_product_list_and_parse`` and that result is returned.

    Side effects: writes ``screenshot.png`` to the current working directory
    and, on failure, prints the error to stdout.

    Args:
        url: Address of the page to open.

    Returns:
        Whatever ``get_product_list_and_parse`` returns for the page source,
        or None if any exception was raised along the way.
    """
    browser = None
    products = None
    try:
        chrome_options = Options()
        # '--disable-gpu' is kept because the original author noted it was
        # necessary the last time they checked.
        for flag in ('--headless', '--disable-gpu'):
            chrome_options.add_argument(flag)
        browser = webdriver.Chrome(options=chrome_options)
        browser.get(url)
        # Fixed pause after navigation - presumably to let the page finish
        # rendering before it is captured.
        sleep(2)
        browser.save_screenshot('screenshot.png')
        page_html = browser.page_source
        products = get_product_list_and_parse(page_html)
    except Exception as e:
        print(f"Error: {e}")
        products = None
    finally:
        # Shut the browser down whether or not scraping succeeded.
        if browser:
            browser.quit()
    return products
def write_to_file(data, filename):
    """Serialize *data* as JSON into *filename*, replacing existing content.

    Args:
        data: Any object ``json.dump`` can serialize.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, mode='w') as out:
        # Two-space indentation keeps the file readable by humans.
        json.dump(data, fp=out, indent=2)
API_ENDPOINT = 'http://localhost:3000/products'

# Upper bound, in seconds, for each POST; without a timeout requests can
# block indefinitely on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 10


def main():
    """Scrape the target listing page and POST each product to API_ENDPOINT.

    Each product is printed before it is sent. A failed POST (connection
    error, timeout or HTTP error status) is reported on stdout and the
    remaining products are still sent.
    """
    target_url = 'https://nextlevelpc.ma/produits-les-plus-vendus/'
    # Other listing URLs that were tried with this script:
    #   https://nextlevelpc.ma/page/2/?s=intel+i5+&post_type=product
    #   https://nextlevelpc.ma/?s=i5+&post_type=product
    products = get_html_with_selenium(target_url)
    if products is None:
        # Scraping failed; get_html_with_selenium already printed the error.
        return

    for product in products:
        print(product)
        try:
            # NOTE(review): data= sends a form-encoded body, and requests
            # omits keys whose value is None. If the API expects JSON, use
            # json=product instead - confirm against the server.
            response = requests.post(
                url=API_ENDPOINT,
                data=product,
                timeout=REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Failed to post product {product.get('name')!r}: {exc}")


if __name__ == "__main__":
    main()