# Environment

## Installations

In [4]:
# Optional one-time setup: uncomment the lines below to install the scraping
# dependencies into the current environment.
#!pip install beautifulsoup4
#!pip install pandas
#!pip3 install bs4



## Imports

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import numpy as np
import math

# Web Scraping

## Beautiful Soup

### Step 1

Top page

In [42]:
# Request information
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'}
# Use a timeout so the cell cannot hang forever, and fail loudly on HTTP
# errors instead of parsing an error page.
page = requests.get(url, headers=headers, timeout=30)
page.raise_for_status()

# BeautifulSoup html page
soup = BeautifulSoup(page.text, 'html.parser')

# Product grid and the <article> element of every product card
products = soup.find('ul', class_="products-listing small")
if products is None:
    raise ValueError(
        f'Product listing not found at {url}; '
        'the page layout may have changed or the request was blocked.'
    )
product_list = products.find_all('article', class_='hm-product-item')


def _text_or_none(tag):
    """Return the text of a BeautifulSoup tag, or None when the tag was not found."""
    return tag.get_text() if tag is not None else None


# product id
product_id = [p.get('data-articlecode') for p in product_list]

# product category
product_category = [p.get('data-category') for p in product_list]

# product names -- looked up inside each card so that a card without a name
# link yields None instead of shifting every following row
products_names = [_text_or_none(p.find('a', class_='link')) for p in product_list]

# product price -- same per-card lookup, for the same alignment reason
product_price = [_text_or_none(p.find('span', class_='price regular')) for p in product_list]

# DataFrame with the scraped data; one row per product card
data = pd.DataFrame({
    'product_id': product_id,
    'name': products_names,
    'category': product_category,
    'price': product_price,
})
# Same timestamp string on every row, taken when this cell runs
data['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H-%M-%S')

In [43]:
# Display the scraped product table built in the previous cell
data

Unnamed: 0,product_id,name,category,price,scrapy_datetime
0,690449051,Skinny Jeans,men_jeans_ripped,$ 39.99,2021-10-13 11-24-08
1,985197005,Slim Jeans,men_jeans_slim,$ 19.99,2021-10-13 11-24-08
2,1004476004,Freefit® Slim Jeans,men_jeans_slim,$ 49.99,2021-10-13 11-24-08
3,938875007,Slim Tapered Jeans,men_jeans_slim,$ 39.99,2021-10-13 11-24-08
4,811993028,Regular Jeans,men_jeans_regular,$ 29.99,2021-10-13 11-24-08
5,927964002,Regular Tapered Crop Jeans,men_jeans_regular,$ 19.99,2021-10-13 11-24-08
6,1018704002,Relaxed Pull-on Jeans,men_jeans_relaxed,$ 24.99,2021-10-13 11-24-08
7,1013317004,Hybrid Regular Tapered Joggers,men_jeans_regular,$ 39.99,2021-10-13 11-24-08
8,1018704001,Relaxed Pull-on Jeans,men_jeans_relaxed,$ 24.99,2021-10-13 11-24-08
9,974202002,Regular Denim Joggers,men_jeans_loose,$ 29.99,2021-10-13 11-24-08


In [11]:
# product color

In [12]:
# product composition

### Step 2

Pagination

In [4]:
# Request information
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'}
# Use a timeout so the cell cannot hang forever, and fail loudly on HTTP
# errors so later cells never parse an error page.
page = requests.get(url, headers=headers, timeout=30)
page.raise_for_status()

# BeautifulSoup html page
soup = BeautifulSoup(page.text, 'html.parser')

In [5]:
# Total product count, read from the data-total attribute of the "load more" heading
load_more_heading = soup.find('h2', class_='load-more-heading')
total_items = int(load_more_heading.get('data-total'))
total_items

87

In [6]:
# Number of pages needed for all items, rounded up so a partially filled
# last page is still counted (the listing is split in groups of 36).
items_per_page = 36
page_number = math.ceil(total_items / items_per_page)
page_number

3

In [7]:
# Listing URL with a page-size query parameter large enough to cover every page
url_2 = f'{url}?page-size={page_number * 36}'
url_2

'https://www2.hm.com/en_us/men/products/jeans.html?page-size=108'

### Step 3

Specific product page (subpage)

In [102]:
#------------------------------API Request----------------------------
url = 'https://www2.hm.com/en_us/productpage.0690449051.html'
header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'}
# Use a timeout so the cell cannot hang forever, and fail loudly on HTTP
# errors instead of parsing an error page.
response = requests.get(url, headers=header, timeout=30)
response.raise_for_status()
page = response.text

#------------------------------BeautifulSoup Object--------------------
# Name the parser explicitly: letting BeautifulSoup guess emits a warning and
# can pick a different parser depending on what is installed.
soup = BeautifulSoup(page, 'html.parser')

#------------------------------Product Color---------------------------

# One <a> element per colour variant offered on the product page
product_detail = soup.find_all('a', class_ = 'filter-option miniature')
product_color = [p.get('data-color') for p in product_detail]
product_code = [p.get('data-articlecode') for p in product_detail]

df_color = pd.DataFrame({'product_code': product_code, 'product_color': product_color})
# The article code is split into its last three characters (colour) and the
# remaining prefix (style); the .str accessor also tolerates missing codes.
df_color['color_id'] = df_color['product_code'].str[-3:]
df_color['style_id'] = df_color['product_code'].str[:-3]
df_color.head()

Unnamed: 0,product_code,product_color,color_id,style_id
0,690449001,Light denim blue/trashed,1,690449
1,690449002,Denim blue,2,690449
2,690449006,Black/washed,6,690449
3,690449007,Light denim blue,7,690449
4,690449009,Black washed out,9,690449


In [103]:
#------------------------------Product Composition---------------------------
# Each description item becomes a list of its non-empty text lines:
# the first entry is the label, the rest are its values.
product_composition = [
    list(filter(None, p.get_text().split('\n')))
    for p in soup.find_all('div', class_ = 'pdp-description-list-item')
]

# Transpose so every description item is a column, then promote the label
# row (row 0) to the column header.
composition_raw = pd.DataFrame(product_composition).T
composition_raw.columns = composition_raw.iloc[0]

# Keep only the attributes of interest; .copy() makes the selection an
# independent frame so the column assignments below are unambiguous.
df_composition = composition_raw.drop(index=0)[['Fit', 'Composition', 'Art. No.']].copy()
df_composition.columns = ['fit', 'composition', 'product_code']

# Items have different numbers of values, so shorter columns are padded with
# missing values; carry the last seen value down to fill them.
df_composition = df_composition.ffill()

# Split the article code into style prefix and three-character colour suffix
df_composition['style_id'] = df_composition['product_code'].str[:-3]
df_composition['color_id'] = df_composition['product_code'].str[-3:]
df_composition

Unnamed: 0,fit,composition,product_code,style_id,color_id
1,Skinny fit,"Cotton 98%, Elastane 2%",690449051,690449,51


In [104]:
#------------------------------SKU Product---------------------------
# Attach fit and composition to every colour variant that shares the style_id;
# the left join keeps all colour rows even without a matching style.
sku_attributes = df_composition[['style_id', 'fit', 'composition']]
df_sku = df_color.merge(sku_attributes, how='left', on='style_id')
df_sku

Unnamed: 0,product_code,product_color,color_id,style_id,fit,composition
0,690449001,Light denim blue/trashed,1,690449,Skinny fit,"Cotton 98%, Elastane 2%"
1,690449002,Denim blue,2,690449,Skinny fit,"Cotton 98%, Elastane 2%"
2,690449006,Black/washed,6,690449,Skinny fit,"Cotton 98%, Elastane 2%"
3,690449007,Light denim blue,7,690449,Skinny fit,"Cotton 98%, Elastane 2%"
4,690449009,Black washed out,9,690449,Skinny fit,"Cotton 98%, Elastane 2%"
5,690449011,White,11,690449,Skinny fit,"Cotton 98%, Elastane 2%"
6,690449013,Black/washed,13,690449,Skinny fit,"Cotton 98%, Elastane 2%"
7,690449021,Dark denim blue/trashed,21,690449,Skinny fit,"Cotton 98%, Elastane 2%"
8,690449022,Black/trashed,22,690449,Skinny fit,"Cotton 98%, Elastane 2%"
9,690449024,Dark blue/Trashed,24,690449,Skinny fit,"Cotton 98%, Elastane 2%"
