In [1]:
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from io import BytesIO
import re

from openpyxl import load_workbook

import threading
import time

In [2]:
import sys, os
# Make the DB_and_Azure directory importable so sql_db_functions can be loaded.
# NOTE: '__file__' is a quoted string literal here, not the __file__ variable, so
# os.path.dirname() returns '' and the appended entry is the relative path
# '..'/'DB_and_Azure'. It therefore resolves against the current working
# directory at import time — presumably the notebook's folder; confirm before
# running from elsewhere.
sys.path.append(os.path.join(os.path.dirname('__file__'), '..', 'DB_and_Azure'))
import sql_db_functions as SQLf


## Creating functions

In [3]:
class TimeoutException(Exception):
    """Signals that fetching a URL did not finish within the allowed time."""

def create_soup(url, timeout=5):
    """Download ``url`` and parse the response body with BeautifulSoup.

    The request runs in a worker thread so the caller waits at most
    ``timeout`` seconds of wall-clock time, even if the server keeps
    trickling data.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait. Used both as the ``requests`` connect/read
            timeout and as the limit for joining the worker thread.

    Returns:
        The ``BeautifulSoup`` object built from the response content using
        the 'html.parser' backend.

    Raises:
        TimeoutException: If the fetch does not complete in time, whether the
            limit is hit by ``requests`` itself or by the thread join.
        Exception: Any other error raised while fetching or parsing is
            re-raised unchanged in the calling thread.
    """
    soup = None
    error = None

    def fetch_soup():
        nonlocal soup, error
        try:
            ua = UserAgent()
            header = {'User-Agent': str(ua.chrome)}

            # Pass the timeout to requests as well: without it the worker
            # could stay blocked on the socket long after the caller gave up.
            response = requests.get(url, headers=header, timeout=timeout)
            soup = BeautifulSoup(response.content, 'html.parser')
        except requests.exceptions.Timeout:
            # Report network-level timeouts with the same exception type as
            # the join timeout so callers' retry logic treats them alike.
            error = TimeoutException("Timeout while fetching the URL")
        except Exception as e:
            error = e

    # Daemon thread: an abandoned fetch must not keep the interpreter alive.
    thread = threading.Thread(target=fetch_soup, daemon=True)
    thread.start()
    thread.join(timeout)

    if thread.is_alive():
        raise TimeoutException("Timeout while fetching the URL")
    if error is not None:
        raise error

    return soup

In [4]:
def contains_required_number(line):
    """Return True when ``line`` contains the marker 'http', else False.

    Despite the function name, the only marker currently checked is the
    substring 'http'.
    """
    required_markers = ('http', 'http')
    for marker in required_markers:
        if marker in line:
            return True
    return False

In [5]:
def get_image_links(prod_soup):
    """Collect image URLs from a parsed product page.

    Args:
        prod_soup: Parsed page exposing ``select``.

    Returns:
        A list with the ``src`` of every matched ``<img>`` whose position in
        the match list is even and greater than zero (positions 2, 4, 6, ...).
    """
    matched_imgs = prod_soup.select('div div div div div ul li button div img')

    # Position 0 and all odd positions are skipped.
    return [
        img['src']
        for position, img in enumerate(matched_imgs)
        if position > 0 and position % 2 == 0
    ]

In [6]:
def get_description(prod_soup):
    """Build a single description string from a parsed product page.

    Args:
        prod_soup: Parsed page exposing ``select`` and ``find_all``.

    Returns:
        The text of every ``<dd>`` element (each preceded by a space),
        followed by the label ' Materials: ', followed by the text of the
        first two elements carrying the class 'f8f91e f02838' (each preceded
        by a space).
    """
    # Text of all <dd> elements.
    dd_text = ''.join(' ' + node.text for node in prod_soup.select('dd'))

    # At most two elements feed the materials part.
    material_nodes = prod_soup.find_all(class_ = 'f8f91e f02838' )
    material_text = ''.join(' ' + node.text for node in material_nodes[0:2])

    return dd_text + ' Materials: ' + material_text
    

In [7]:
def get_price(prod_soup):
    """Extract the price from a parsed product page as a float.

    The text of the first ``span.edbe20`` element is stripped of everything
    except digits, ',' and '.'. Periods are then dropped and a comma becomes
    the decimal point, so '1.299,00' yields 1299.0 and '1.299' yields 1299.0.

    Args:
        prod_soup: Parsed page exposing ``select``.

    Returns:
        The price as a float.

    Raises:
        IndexError: If no ``span.edbe20`` element is found.
        ValueError: If the cleaned text cannot be converted to a float.
    """
    raw_price = prod_soup.select('span.edbe20')[0].text

    # Keep only digits and the two separator characters.
    numeric = re.sub(r'[^\d,\.]', '', raw_price)

    had_comma = ',' in numeric
    had_period = '.' in numeric

    # Periods are always removed; a comma (if any) becomes the decimal point.
    if had_period:
        numeric = numeric.replace('.', '')
    if had_comma:
        numeric = numeric.replace(',', '.')

    return float(numeric)

In [8]:
def HM(category_url,n_products,Clothing_type):
    """Scrape products from a category page and store them in the database.

    Every fifth link matched on the category page is treated as a product
    link: its page is fetched, images/description/price are extracted and the
    record is inserted through ``SQLf.sql_db_functions``.

    Args:
        category_url: URL of the category listing page.
        n_products: Upper bound compared against the per-link counter; the
            loop stops once this many links have been visited (links, not
            stored products).
        Clothing_type: Label forwarded unchanged to the database insert.

    Returns:
        False if a product page timed out on all 3 attempts (the function
        stops immediately); otherwise None once the loop finishes.
    """

    soup = create_soup(category_url)
    item_list = soup.select('li section article div div div div ul li a')


    br = 0        # links visited so far
    pass_i = 0    # links seen since the last product attempt
    for item in item_list:
        pass_i +=1
        if br == n_products:
            break
        elif pass_i == 5:
            # Reset before attempting the product. Resetting only on success
            # left the counter above 5 after a failure, so it never matched
            # again and every remaining product was silently skipped.
            pass_i = 0

            retry_attempts = 3
            while retry_attempts > 0:
                try:
                    # Scrape product details
                    try:

                        prod_url = item['href']
                        prod_soup = create_soup(prod_url)
                    except TimeoutException:
                        print(f"Timeout fetching product {br}. Retrying in 5 seconds...")
                        time.sleep(5)
                        retry_attempts -= 1
                        continue
                    except Exception as e:
                        print(f"Error fetching product {br}: {e}")
                        break


                    prod_images_links = get_image_links(prod_soup=prod_soup)
                    prod_description = get_description(prod_soup=prod_soup)
                    prod_price = get_price(prod_soup=prod_soup)


                    print(f'Starting product {br}')

                    time.sleep(3)
                    # Links to image, load to blob and return prod_images_names

                    time.sleep(1)
                    conn, cursor = SQLf.sql_db_functions.connect_sql()
                    try:
                        SQLf.sql_db_functions.insert_description_image_to_db(
                            conn=conn,
                            cursor=cursor,
                            brand='HM',
                            descript=prod_description,
                            price=prod_price,
                            prod_link = prod_url,
                            Clothing_type = Clothing_type,
                            images_links=prod_images_links
                        )
                    finally:
                        # Release DB resources even when the insert fails, and
                        # close the cursor before its connection.
                        try:
                            cursor.close()
                        finally:
                            conn.close()

                    time.sleep(10)
                    break  # Exit the retry loop if successful
                except Exception as e:
                    print(f"Error processing product {br}: {e}")
                    break  # Exit the retry loop if an exception occurs

            if retry_attempts == 0:
                print(f"Failed to process product {br} after 3 attempts. Exiting function.")
                return False  # Exit the function if failed after 3 attempts

        br += 1



# Manual tests on a single product page

In [19]:
  
# Run the three extractors against one known product page.
prod_url = 'https://www2.hm.com/it_it/productpage.1247954001.html'
prod_soup = create_soup(prod_url)

prod_images_links = get_image_links(prod_soup)
prod_description = get_description(prod_soup)
prod_price = get_price(prod_soup)

In [14]:
# Step-by-step walk through the description-building logic on one product page.
prod_url = 'https://www2.hm.com/it_it/productpage.1247954001.html'
prod_soup = create_soup(prod_url)

# Concatenate the text of every <dd> element, each preceded by a space.
description_list = prod_soup.select('dd')
description = ''
for d in description_list:
    description += ' ' + d.text

# Append the label, then the text of the first two elements with this class.
description_list = prod_soup.find_all(class_ = 'f8f91e f02838' )
description += ' Materials: '
for d in description_list[:2]:
    description += ' ' + d.text


In [18]:
# Display the description string assembled above.
description

' Il modello è alto 177cm/5\'10" e indossa la taglia S Corto Senza maniche Slim fit A fascia, Senza spalline Marrone scuro, Tinta unita DIVIDED Mesh L\'elastan è una fibra sintetica elastica ricavata dal petrolio (una risorsa fossile). Il poliestere è una fibra sintetica che si ottiene dal petrolio (una risorsa fossile). Materials: '

In [10]:
# Display the image URLs collected by get_image_links for the test product.
prod_images_links

['https://image.hm.com/assets/hm/64/e5/64e575ec868c59263700db854477c56b6db21caf.jpg?imwidth=2160',
 'https://image.hm.com/assets/hm/37/69/37690d9a17be5948d0b73bfbd7ed84ec9b60e276.jpg?imwidth=2160']

In [52]:
def get_image_links(prod_soup):
    """Collect image URLs from a parsed product page.

    NOTE: this redefines (and, once run, shadows) the get_image_links defined
    earlier in the notebook; the selection rule is the same.

    Args:
        prod_soup: Parsed page exposing ``select``.

    Returns:
        A list with the ``src`` of every matched ``<img>`` whose position in
        the match list is even and greater than zero (positions 2, 4, 6, ...).
    """
    matched_imgs = prod_soup.select('div div div div div ul li button div img')

    links = []
    for position, img in enumerate(matched_imgs):
        # Skip position 0 and every odd position.
        if position == 0 or position % 2 != 0:
            continue
        links.append(img['src'])

    return links

In [53]:
# Call the extractor on the already-fetched product page and display its result.
get_image_links(prod_soup)

['https://image.hm.com/assets/hm/21/db/21db7c306fb50d30321d5429a24d8ab361a91f39.jpg?imwidth=2160',
 'https://image.hm.com/assets/hm/93/db/93dbf233e0c7dfa9ff366941c7ac586edd3ab406.jpg?imwidth=2160',
 'https://image.hm.com/assets/hm/ea/af/eaaf76a442f8aca352cbbb3e90e8d361ba5b6e9a.jpg?imwidth=2160',
 'https://image.hm.com/assets/hm/c9/a1/c9a176ffd8f3c3284070be8f5ac6d37d91455179.jpg?imwidth=2160',
 'https://image.hm.com/assets/hm/64/e5/64e575ec868c59263700db854477c56b6db21caf.jpg?imwidth=2160',
 'https://image.hm.com/assets/hm/37/69/37690d9a17be5948d0b73bfbd7ed84ec9b60e276.jpg?imwidth=2160']

# Run the scraper on a category page

In [None]:
# Category listing page to scrape.
url = 'https://www2.hm.com/it_it/donna/acquista-per-prodotto/top.html'

# n_products is compared against HM's per-link counter, not against the number
# of stored products. Original note: pass the number of assets times 6.
HM(category_url=url, n_products=120, Clothing_type='TOP DA DONNA E MAGLIETTE')
