In [1]:
#!pip install beautifulsoup4

# Business Understanding

### 1 - Problema de Negócio
    - Qual o melhor preço de venda para calças?
    
### Saída: (Produto Final)
    1. A resposta para a pergunta
        - Mediana dos preços dos concorrentes
        
    2. Formato da entrega
        - Tabela ou gráfico
        
    3. Local da entrega
        - App streamlit
        
### Processo (Passo a Passo)
    1. Passo a passo para construir o cálculo da mediana ou média
        - Realizar o cálculo da mediana sobre o produto, tipo e cor
        
    2. Definir o formato da entrega (visualizaçao, tabela, frase)
        - Gráficos de barras com a mediana dos preços dos produtos, por tipo e cor dos últimos 30 dias.
        - Tabela com as seguintes colunas: id | product_name | product_type | product_color | product_price
        - Definiçao do schema: colunas e seu tipo
        - Definição da infraestrutura de armazenamento (SQLITE3)
        - Design do ETL (Scripts de Extração, Transformação e Carga)
        - Planejamento de agendamento dos scripts (dependência entre os scripts)
        - Fazer as visualizações
        - Entrega do produto final
        
    3. Decidir o local de entrega (powerBi, telegram, email, streamlit, intranet)
        - App com Streamlit      

### Entrada (fonte de dados)
    1. Fonte de dados:
        - Site da H&M
        - Site da Macys
        
    2. Ferramentas:
        - Pyhton 3.8.0
        - Bibliotecas de webscrapping (BS4, Selenium)
        - Pycharm
        - Jupyter Notebook (análises e prototipagens)
        - Crontjob, Airflow
        - Streamlit

### 2 - Problema de Negócio
    - Quantos tipos de calças e suas cores para o produto inicial?
    
### Saída: (Produto Final)
    1. A resposta para a pergunta
        - Tipos e cores de calças mais frequentes por concorrentes
        
    2. Formato da entrega
        - Lista/Tabela/Gráfico
        
    3. Local da entrega
        - App streamlit
        
### Processo (Passo a Passo)
    1. Passo a passo para construir o cálculo da mediana ou média
        - Realizar webscrapping identificando tipo e cor mais frequentes nos sites dos concorrentes
        
    2. Definir o formato da entrega (visualizaçao, tabela, frase)
        - Gráficos de barras com a frequência dos produtos, por tipo e cor dos últimos 30 dias.
        - Tabela com as seguintes colunas: id | product_name | product_type | product_color | supplier
        - Definiçao do schema: colunas e seu tipo
        - Definição da infraestrutura de armazenamento (SQLITE3)
        - Design do ETL (Scripts de Extração, Transformação e Carga)
        - Planejamento de agendamento dos scripts (dependência entre os scripts)
        - Fazer as visualizações
        - Entrega do produto final
        
    3. Decidir o local de entrega (powerBi, telegram, email, Streamlit, intranet)
        - App com Streamlit      

### Entrada (fonte de dados)
    1. Fonte de dados:
        - Site da H&M
        - Site da Macys
        
    2. Ferramentas:
        - Pyhton 3.8.0
        - Bibliotecas de webscrapping (BS4, Selenium)
        - Pycharm
        - Jupyter Notebook (análises e prototipagens)
        - Crontjob, Airflow
        - Streamlit

### 3 - Problema de Negócio
    - Quais as matérias-primas para confecionar as calças?
    
### Saída: (Produto Final)
    1. A resposta para a pergunta
        - MP mais frequentes por tipo de calça
        
    2. Formato da entrega
        - Lista/Tabela/Gráfico
        
    3. Local da entrega
        - App streamlit
        
### Processo (Passo a Passo)
    1. Passo a passo para construir o cálculo da mediana ou média
        - Realizar webscrapping identificando as MP mais frequentes usadas por tipo de calças
        
    2. Definir o formato da entrega (visualizaçao, tabela, frase)
        - Gráficos de barras com as MP dos produtos dos últimos 30 dias.
        - Tabela com as seguintes colunas: id | product_name | product_type | product_composition
        - Definiçao do schema: colunas e seu tipo
        - Definição da infraestrutura de armazenamento (SQLITE3)
        - Design do ETL (Scripts de Extração, Transformação e Carga)
        - Planejamento de agendamento dos scripts (dependência entre os scripts)
        - Fazer as visualizações
        - Entrega do produto final
        
    3. Decidir o local de entrega (powerBi, telegram, email, Streamlit, intranet)
        - App com Streamlit      

### Entrada (fonte de dados)
    1. Fonte de dados:
        - Site da H&M
        - Site da Macys
        
    2. Ferramentas:
        - Pyhton 3.8.0
        - Bibliotecas de webscrapping (BS4, Selenium)
        - Pycharm
        - Jupyter Notebook (análises e prototipagens)
        - Crontjob, Airflow
        - Streamlit

# Showcase

In [21]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import numpy as np
import re

In [3]:
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
    
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
page = requests.get (url, headers=headers)

soup = BeautifulSoup (page.text, 'html.parser')

In [4]:
products = soup.find ('ul', class_= 'products-listing small')
products_list = products.find_all ('article', class_ = 'hm-product-item')

#product id
products_id = [p.get('data-articlecode') for p in products_list]

#product category
products_category = [p.get('data-category') for p in products_list]

In [5]:
#products name
products_list = products.find_all ('a', class_ = 'link')
products_name = [p.get_text() for p in products_list]

In [6]:
#product price
products_list = products.find_all ('span', class_ = 'price regular')
products_price = [p.get_text() for p in products_list]

In [7]:
#dataframe
data = pd.DataFrame([products_id, products_category, products_name, products_price]).T
data.columns = ['product_id', 'product_category', 'product_name', 'product_price']

#scrapy datetime
data['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H: %M: %S')

In [8]:
data

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime
0,690449022,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-09-14 20: 40: 49
1,427159006,men_jeans_ripped,Trashed Skinny Jeans,$ 39.99,2021-09-14 20: 40: 49
2,730863005,men_jeans_skinny,Skinny Jeans,$ 29.99,2021-09-14 20: 40: 49
3,636207010,men_jeans_slim,Slim Jeans,$ 19.99,2021-09-14 20: 40: 49
4,636207006,men_jeans_slim,Slim Jeans,$ 19.99,2021-09-14 20: 40: 49
5,720504008,men_jeans_skinny,Skinny Jeans,$ 24.99,2021-09-14 20: 40: 49
6,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2021-09-14 20: 40: 49
7,985197001,men_jeans_slim,Slim Jeans,$ 19.99,2021-09-14 20: 40: 49
8,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-09-14 20: 40: 49
9,690449043,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-09-14 20: 40: 49


# Pagination

In [9]:
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
    
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
page = requests.get (url, headers=headers)

soup = BeautifulSoup (page.text, 'html.parser')

In [10]:
total_item = soup.find_all ('h2', class_='load-more-heading')[0].get('data-total')
total_item

page_number = np.round(int(total_item)/36)

url2 = url + "?page-size=" + str(int(page_number*36))

# One product

In [11]:
#API request
url = 'https://www2.hm.com/en_us/productpage.0636207006.html'
    
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}
page = requests.get (url, headers=headers)

#beautiful soup object
soup = BeautifulSoup (page.text, 'html.parser')

In [12]:
#color product
product_list = soup.find_all('a', class_='filter-option miniature')
product_color = [p.get('data-color') for p in product_list]

product_id = [p.get('data-articlecode') for p in product_list]

df_color = pd.DataFrame([product_id, product_color]).T
df_color.columns = ['product_id', 'product_color']

#generate style id + color id
df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])

In [13]:
product_composition_list = soup.find_all ('div', class_='pdp-description-list-item')
product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composition_list]

#rename df
df_composition = pd.DataFrame(product_composition).T
df_composition.columns = df_composition.iloc[0]

#delete first row
df_composition = df_composition.iloc[1:].fillna(method='ffill')

#generate style id+color id
df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])

#merge data color + decomposition
data_sku = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition']], how='left', on='style_id')

# Multiple products

In [14]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36'}

#unique features
aux = []

cols = ['Art. No.', 'Composition', 'Fit', 'Size']
df_pattern = pd.DataFrame(columns=cols)

#empty dataframe
df_detail = pd.DataFrame()

for i in range(len(data)):
    #API request
    url = 'https://www2.hm.com/en_us/productpage.' + data.loc[i, 'product_id'] + '.html'
    
    page = requests.get (url, headers=headers)
    
    #beautiful soup object
    soup = BeautifulSoup (page.text, 'html.parser')
    
    
    #color product
    product_list = soup.find_all('a', class_='filter-option miniature')
    product_color = [p.get('data-color') for p in product_list]
    product_id = [p.get('data-articlecode') for p in product_list]
    
    df_color = pd.DataFrame([product_id, product_color]).T
    df_color.columns = ['product_id', 'product_color']
    
    #generate style id + color id
    df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])


    product_composition_list = soup.find_all ('div', class_='pdp-description-list-item')
    product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composition_list]

    #rename df
    df_composition = pd.DataFrame(product_composition).T
    df_composition.columns = df_composition.iloc[0]

    #delete first row
    df_composition = df_composition.iloc[1:].fillna(method='ffill')

    #garantee same size of attributes
    df_composition = pd.concat([df_pattern, df_composition], axis=0)
    
    #generate style id+color id
    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
    df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])

    aux = aux + df_composition.columns.tolist()
    
    #merge data color + decomposition
    data_sku = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition', 'Size']], how='left', on='style_id')
    
    #all products
    df_detail = pd.concat([df_detail, data_sku], axis=0)

In [15]:
data['style_id'] = data['product_id'].apply(lambda x: x[:-3])
data['color_id'] = data['product_id'].apply(lambda x: x[-3:])

data_raw = pd.merge(data, df_detail[['style_id', 'product_color', 'Fit', 'Composition', 'Size']], how = 'left', on = 'style_id') 

In [25]:
data_raw.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,product_color,Fit,Composition,Size
0,690449022,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-09-14 20: 40: 49,690449,22,Light denim blue/trashed,Skinny fit,Lining: Polyester 100%,"The model is 184cm/6'0"" and wears a size 31/32"
1,690449022,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-09-14 20: 40: 49,690449,22,Light denim blue/trashed,Skinny fit,"Cotton 98%, Elastane 2%","The model is 184cm/6'0"" and wears a size 31/32"
2,690449022,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-09-14 20: 40: 49,690449,22,Denim blue,Skinny fit,Lining: Polyester 100%,"The model is 184cm/6'0"" and wears a size 31/32"
3,690449022,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-09-14 20: 40: 49,690449,22,Denim blue,Skinny fit,"Cotton 98%, Elastane 2%","The model is 184cm/6'0"" and wears a size 31/32"
4,690449022,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-09-14 20: 40: 49,690449,22,Black/washed,Skinny fit,Lining: Polyester 100%,"The model is 184cm/6'0"" and wears a size 31/32"


In [37]:
data_raw.dtypes

product_id          object
product_category    object
product_name        object
product_price       object
scrapy_datetime     object
style_id            object
color_id            object
product_color       object
Fit                 object
Composition         object
Size                object
dtype: object

# Cleaning

In [91]:
data = data_raw
#product id
data = data.dropna(subset=['product_id'])

#product name
data['product_name'] = data['product_name'].apply(lambda x: x.replace(' ','_').lower())

#product_price
data['product_price'] = data['product_price'].apply(lambda x: x.replace('$', '')).astype(float)

#scrapy datetime
data['scrapy_datetime'] = pd.to_datetime(data['scrapy_datetime'], format='%Y-%m-%d  %H: %M: %S')

#style id
data['style_id'] = data['style_id'].astype(int)

#color id
data['color_id'] = data['color_id'].astype(int)

#color name
data['product_color'] = data['product_color'].apply(lambda x: x.replace(' ', '_').replace('/', '_').lower() if pd.notnull(x) else x)

#fit
data['fit'] = data['Fit'].apply(lambda x: x.replace(' ','_').lower() if pd.notnull(x) else x)

#size number
data['size_number'] = data['Size'].apply(lambda x: re.search('\d{3}cm', x).group(0) if pd.notnull(x) else x)
data['size_number'] = data['size_number'].apply(lambda x: re.search('\d+', x).group(0) if pd.notnull(x) else x)

#size model
data['size_model'] = data['Size'].str.extract('(\d+/\\d+)')

#composition
data = data[~data['Composition'].str.contains('Pocket lining:', na=False)]
data = data[~data['Composition'].str.contains('Lining:', na=False)]
data = data[~data['Composition'].str.contains('Shell:', na=False)]
data = data[~data['Composition'].str.contains('Pocket:', na=False)]

#break composition by comma
df1 = data['Composition'].str.split(',', expand=True)
#data = data.drop(columns=['Size', 'Fit', 'Composition'], axis=1)

#cotton | polyester | elastane | elasterell
df_ref = pd.DataFrame(index=np.arange(len(data)),  columns=['cotton', 'polyester', 'elastane', 'elasterell'])

#cotton
df_cotton = df1[0]
df_cotton.name = 'cotton'

df_ref = pd.concat([df_ref, df_cotton], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep='last')]

#polyester
df_polyester = df1.loc[df1[1].str.contains('Polyester', na=True),1]
df_polyester.name = 'polyester'

df_ref = pd.concat([df_ref, df_polyester], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep='last')]

#elastane
df_elastane = df1.loc[df1[1].str.contains('Elastane', na=True),1]
df_elastane.name = 'elastane'

df_ref = pd.concat([df_ref, df_elastane], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep='last')]

#elasterell
df_elasterell = df1.loc[df1[1].str.contains('Elasterell-P', na=True),1]
df_elasterell.name = 'elastane'

df_ref = pd.concat([df_ref, df_elasterell], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep='last')]

#join dfs
data = pd.concat([data, df_ref], axis=1)

#format composition
data['cotton'] = data['cotton'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
data['elastane'] = data['elastane'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
data['polyester'] = data['polyester'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
data['elasterell'] = data['elasterell'].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)

In [92]:
data

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,product_color,Fit,Composition,Size,fit,size_number,size_model,elasterell,cotton,polyester,elastane
0,,,,,NaT,,,,,,,,,,,,,
1,0690449022,men_jeans_ripped,skinny_jeans,39.99,2021-09-14 20:40:49,690449.0,22.0,light_denim_blue_trashed,Skinny fit,"Cotton 98%, Elastane 2%","The model is 184cm/6'0"" and wears a size 31/32",skinny_fit,184,31/32,,0.98,,
2,,,,,NaT,,,,,,,,,,,,,
3,0690449022,men_jeans_ripped,skinny_jeans,39.99,2021-09-14 20:40:49,690449.0,22.0,denim_blue,Skinny fit,"Cotton 98%, Elastane 2%","The model is 184cm/6'0"" and wears a size 31/32",skinny_fit,184,31/32,,0.98,,
4,,,,,NaT,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1383,1004476005,men_jeans_slim,freefit®_slim_jeans,49.99,2021-09-14 20:40:49,1004476.0,5.0,dark_denim_blue,Slim fit,"Cotton 90%, Elasterell-P 8%, Elastane 2%",,slim_fit,,,,0.90,,0.08
1384,1004476005,men_jeans_slim,freefit®_slim_jeans,49.99,2021-09-14 20:40:49,1004476.0,5.0,black_no_fade_black,Slim fit,"Cotton 90%, Elasterell-P 8%, Elastane 2%",,slim_fit,,,,0.90,,0.08
1385,1004476005,men_jeans_slim,freefit®_slim_jeans,49.99,2021-09-14 20:40:49,1004476.0,5.0,light_denim_blue,Slim fit,"Cotton 90%, Elasterell-P 8%, Elastane 2%","The model is 182cm/6'0"" and wears a size 31/32",slim_fit,182,31/32,,0.90,,0.08
1386,1004476005,men_jeans_slim,freefit®_slim_jeans,49.99,2021-09-14 20:40:49,1004476.0,5.0,blue,Slim fit,"Cotton 90%, Elasterell-P 8%, Elastane 2%","The model is 182cm/6'0"" and wears a size 31/32",slim_fit,182,31/32,,0.90,,0.08


In [84]:
df1[1].unique()

array([' Elastane 2%', ' Polyester 6%', ' Elastane 1%', ' Polyester 10%',
       ' Polyester 26%', ' Elasterell-P 8%', nan, ' Polyester 21%'],
      dtype=object)