# 0 - IMPORTS 

In [227]:
from bs4 import BeautifulSoup
from datetime import datetime

import requests
import pandas as pd
import numpy as np 

# 1 - START 

In [2]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""


In [3]:
soup = BeautifulSoup(html_doc, 'html.parser')

In [4]:
soup 


<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [5]:
soup.body 

<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>

In [6]:
soup.find_all( 'a', id = 'link2')[0].string

'Lacie'

# 2 - Data Extraction (H&M) - Showcase page data

In [None]:
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
page = requests.get( url, headers = headers)

soup = BeautifulSoup(page.text, 'html.parser')

products = soup.find('ul', class_ = 'products-listing small')

products_list = products.find_all('article', class_ = 'hm-product-item' )

In [None]:
# Products ID
products_id = [n.get('data-articlecode')for n in products_list]

# Products Category 
products_category = [n.get('data-category')for n in products_list]

# Product name by link class
products_list1 = products.find_all('a', class_ = 'link')

# Products names
products_names = [p.get_text()for p in products_list1]

# Producs Price 
price = products.find_all('span', class_ = 'price regular')
products_price = [p.get_text() for p in price]

In [64]:
data = pd.DataFrame([products_id, products_category, products_names, products_price ]).T
data.columns = ['products_id', 'products_category', 'products_names', 'products_price']

# Scrapy datetime
data['scrapy_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# 3 - Pagination at H&M

In [None]:
# Pagination
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
url = "https://www2.hm.com/en_us/men/products/jeans.html"
page = requests.get( url, headers = headers)
soup = BeautifulSoup(page.text, 'html.parser')
total_item = soup.find_all('h2', class_ = 'load-more-heading')[0].get('data-total')
page_number = np.round (int(total_item) / 36)
url02 = url + '?page-size=' + str(int(page_number * 36 ))

In [None]:
# Requests with url02
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}
page = requests.get( url02, headers = headers)
soup = BeautifulSoup(page.text, 'html.parser')
products = soup.find('ul', class_ = 'products-listing small')
products_list = products.find_all('article', class_ = 'hm-product-item' )

In [210]:
# Products ID
products_id = [n.get('data-articlecode')for n in products_list]

# Products Category 
products_category = [n.get('data-category')for n in products_list]

# Product name by link class
products_list1 = products.find_all('a', class_ = 'link')

# Products names
products_names = [p.get_text()for p in products_list1]

# Producs Price 
price = products.find_all('span', class_ = 'price regular')
products_price = [p.get_text() for p in price]

In [211]:
data = pd.DataFrame([products_id, products_category, products_names, products_price ]).T
data.columns = ['products_id', 'products_category', 'products_names', 'products_price']

# Scrapy datetime
data['scrapy_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# 4 - Products colors

## 4.1 Just one product

In [196]:
# API Requests
url = 'https://www2.hm.com/en_us/productpage.1130309007.html'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}

page = requests.get(url, headers = headers)
soup = BeautifulSoup(page.text, 'html.parser')

# Product color
product_list = soup.find_all('a', class_ ="filter-option miniature")
color_name = [p.get('data-color')for p in product_list]
products_id = [p.get('data-articlecode') for p in product_list]

#Data frame with color names and products id 
df_color = pd.DataFrame([products_id, color_name]).T
df_color.columns = ['products_id', 'color_name']

# style ID + color id
df_color['style_id'] = df_color['products_id'].apply(lambda x: x[:-3])
df_color['color_id'] = df_color['products_id'].apply(lambda x: x[-3:])

## 4.2 Multiple Products 

In [213]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'}

df_detalis = pd.DataFrame()

for i in range (len (data)):

    url = 'https://www2.hm.com/en_us/productpage.'+ data.loc[i, 'products_id'] + '.html'
    page = requests.get(url, headers = headers)
    soup = BeautifulSoup(page.text, 'html.parser')

    # Product color
    product_list = soup.find_all('a', class_ ="filter-option miniature")
    color_name = [p.get('data-color')for p in product_list]
    products_id = [p.get('data-articlecode') for p in product_list]

    #Data frame with color names and products id 
    df_color = pd.DataFrame([products_id, color_name]).T
    df_color.columns = ['products_id', 'color_name']

    # style ID + color id
    df_color['style_id'] = df_color['products_id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['products_id'].apply(lambda x: x[-3:])

    df_detalis = pd.concat([df_detalis, df_color], axis= 0 )




In [214]:
df_detalis

Unnamed: 0,products_id,color_name,style_id,color_id
0,1130309001,Denim black,1130309,001
1,1130309002,Light denim blue,1130309,002
2,1130309004,Dark denim gray,1130309,004
3,1130309005,Light denim blue,1130309,005
4,1130309008,Denim red,1130309,008
...,...,...,...,...
22,0811993068,Dark denim blue,0811993,068
23,0811993070,Denim blue,0811993,070
24,0811993071,Denim blue,0811993,071
25,0811993072,Denim blue,0811993,072


In [216]:
# Style_id and color_id for data
data['style_id'] = data['products_id'].apply(lambda x: x[:-3])
data['color_id'] = data['products_id'].apply(lambda x: x[-3:])

# 5 - End of web scraping

In [221]:
df_raw = pd.merge(data, df_detalis[['style_id', 'color_name']], how = 'left', on = 'style_id' )

In [224]:
df_raw.sample(20)

Unnamed: 0,products_id,products_category,products_names,products_price,scrapy_time,style_id,color_id,color_name
3473,1024711006,men_jeans_slim,Slim Jeans,$ 39.99,2024-03-06 16:56:49,1024711,6,Dark blue
1684,979945049,men_jeans_loose,Loose Jeans,$ 39.99,2024-03-06 16:56:49,979945,49,Black
2041,979945045,men_jeans_loose,Loose Jeans,$ 39.99,2024-03-06 16:56:49,979945,45,Olive green
2287,1008549005,men_jeans_regular,Straight Regular Jeans,$ 29.99,2024-03-06 16:56:49,1008549,5,Dark gray
1833,979945039,men_jeans_loose,Loose Jeans,$ 39.99,2024-03-06 16:56:49,979945,39,Light denim blue
611,1210576002,men_jeans_regular,Straight Regular Jeans,$ 39.99,2024-03-06 16:56:49,1210576,2,Denim blue
1618,979945049,men_jeans_loose,Loose Jeans,$ 39.99,2024-03-06 16:56:49,979945,49,Light gray
3,1130309007,men_jeans_loose,Baggy Jeans,$ 39.99,2024-03-06 16:56:49,1130309,7,Light denim blue
3358,1024256011,men_jeans_slim,Slim Jeans,$ 24.99,2024-03-06 16:56:49,1024256,11,Pale denim blue
3766,875105037,men_jeans_relaxed,Relaxed Jeans,$ 39.99,2024-03-06 16:56:49,875105,37,Denim blue
