# 1 - Imports

In [49]:
from bs4 import BeautifulSoup as bts
from datetime import datetime

import requests
import pandas as pd
import numpy as np
import re

pd.set_option('display.float_format', '{:.2f}'.format)

# 2 - Helper Functions

## 2.1 - Data processing

In [30]:
###################################################################
#                     Data Processing H&M
###################################################################

def dp_hm(path):
    """Load the raw H&M scrape and return a cleaned copy.

    Parameters
    ----------
    path : str or file-like
        Location of the ';'-separated CSV produced by the H&M scraper.

    Returns
    -------
    pandas.DataFrame
        The data without duplicated rows, without rows missing any of the
        key columns, without rows whose ``product_fit`` holds one of the two
        known "model size" sentences, and with a fresh 0..n-1 index.
    """
    # Columns that must be present for a row to be usable.
    key_columns = ['product_id', 'product_name', 'product_price',
                   'product_composition', 'product_fit']

    # Out-of-context values seen in 'product_fit' (model size sentences).
    misplaced_fits = [
        """The model is 189cm/6'2" and wears a size 31/32""",
        """The model is 187cm/6'2" and wears a size 31/32""",
    ]

    cleaned = (
        pd.read_csv(path, sep=';')
        .drop_duplicates()            # remove duplicated rows
        .dropna(subset=key_columns)   # remove rows missing key data
    )

    is_misplaced = cleaned['product_fit'].isin(misplaced_fits)

    return cleaned[~is_misplaced].reset_index(drop=True)



###################################################################
#                     Data Processing Macy's
###################################################################

def dp_macys(path2):
    """Load the Macy's scrape, drop duplicated rows and derive ``product_fit``.

    Parameters
    ----------
    path2 : str or file-like
        Location of the comma-separated CSV produced by the Macy's scraper.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated data with an extra ``product_fit`` column derived
        from the text in ``product_ref``.
    """
    # Keyword -> fit label, checked in this order; the first keyword found in
    # the reference text wins ('Regular' is reported as 'Straight Jeans').
    fit_by_keyword = (
        ('Straight', 'Straight Jeans'),
        ('Regular', 'Straight Jeans'),
        ('Bootcut', 'Bootcut Jeans'),
        ('Relaxed', 'Relaxed Jeans'),
        ('Skinny', 'Skinny Jeans'),
        ('Slim', 'Slim Jeans'),
        ('Baggy', 'Baggy Jeans'),
        ('Athletic', 'Athletic Jeans'),
    )

    def classify_fit(reference):
        for keyword, label in fit_by_keyword:
            if keyword in reference:
                return label
        # No known keyword in the reference text.
        return 'Modern Jeans'

    frame = pd.read_csv(path2)
    frame = frame.drop_duplicates()

    # Extract the jeans style from 'product_ref'.
    frame['product_fit'] = frame['product_ref'].apply(classify_fit)

    return frame

## 2.2 - Regular expression

In [24]:
###################################################################
#                         Regex H&M
###################################################################

# Patterns are raw strings so that '\s', '\d' and '\w' reach the regex engine
# without going through (invalid) string-literal escape sequences.
# NOTE(review): '[Cotton]' is a character class (one of C, o, t, n), not the
# literal word "Cotton"; it is kept unchanged so the same rows keep matching.
_HM_SHELL_RE = re.compile(r'Shell:\s(.+%)[A-Z]|Shell:\s(.+%)')
_HM_LINED_RE = re.compile(r'Composition([Cotton].+%)[A-Z]|%([Cotton].+%)')
_HM_PLAIN_RE = re.compile(r'Composition([Cotton].+%)|%([Cotton].+%)')
_HM_DIGITS_RE = re.compile(r'\d+')
_HM_PRICE_RE = re.compile(r'\w.+')

_HM_SHARE_COLUMNS = ('cotton', 'spandex', 'polyester', 'elastomultiester')


def _hm_main_fabric(text):
    """Return the main-fabric fragment (e.g. 'Cotton 99%, Spandex 1%') of *text*."""
    if "Shell" in text:
        pattern = _HM_SHELL_RE
    elif "Pocket" in text or "Lining" in text:
        pattern = _HM_LINED_RE
    else:
        pattern = _HM_PLAIN_RE

    found = pattern.search(text)
    if found is None:
        raise ValueError('unrecognised product_composition: {!r}'.format(text))

    # Exactly one of the two alternatives matched; prefer the second group
    # when it is set, otherwise use the first.
    first, second = found.group(1), found.group(2)
    return first if second is None else second


def _hm_fraction(fragment):
    """Turn the first integer of e.g. ' Spandex 2%' into a fraction (0.02)."""
    found = _HM_DIGITS_RE.search(fragment)
    if found is None:
        raise ValueError('no percentage found in {!r}'.format(fragment))
    return int(found.group(0)) / 100


def _hm_fabric_shares(fabric):
    """Return (cotton, spandex, polyester, elastomultiester) fractions.

    Materials are read by position in the comma-separated fragment:
    the first component is taken as cotton, the second is checked for each of
    the other materials, and the third is used for spandex when the second is
    not spandex.
    NOTE(review): a polyester/elastomultiester share in third position is not
    picked up, and a non-spandex third component is reported as spandex; this
    positional reading is kept for compatibility and should be confirmed
    against the data.
    """
    components = fabric.split(',')
    second = components[1] if len(components) > 1 else None
    third = components[2] if len(components) > 2 else None

    if second is not None and 'Spandex' in second:
        spandex = second
    elif third is not None:
        spandex = third
    else:
        spandex = 'Spandex 0%'

    if second is not None and 'Polyester' in second:
        polyester = second
    else:
        polyester = 'Polyester 0%'

    if second is not None and 'Elastomultiester' in second:
        elasto = second
    else:
        elasto = 'Elastomultiester 0%'

    return (_hm_fraction(components[0]), _hm_fraction(spandex),
            _hm_fraction(polyester), _hm_fraction(elasto))


def _hm_price(raw):
    """Return *raw* from its first word character onwards (drops e.g. '$ ')."""
    return _HM_PRICE_RE.search(raw).group(0)


def regex_hm(data):
    """Derive fabric shares and clean the price strings of the H&M data.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the string column ``product_composition`` and the
        columns ``product_price`` and ``product_price_new`` (the latter may
        hold missing values).

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with four added float
        columns ``cotton``, ``spandex``, ``polyester`` and
        ``elastomultiester`` holding fractions in [0, 1] as written in the
        composition text, and with both price columns stripped of whatever
        precedes their first word character.

    Raises
    ------
    ValueError
        If a composition string matches none of the expected layouts or a
        fabric component carries no number.
    """
    # Extract information from the 'product_composition' column. Each row is
    # searched once; rows with fewer than three components are handled
    # without relying on the split producing a fixed number of columns.
    shares = [_hm_fabric_shares(_hm_main_fabric(text))
              for text in data['product_composition']]

    result = data.copy()
    for position, column in enumerate(_HM_SHARE_COLUMNS):
        result[column] = [row[position] for row in shares]

    # Extract the price from 'product_price' and 'product_price_new'.
    result['product_price'] = result['product_price'].apply(_hm_price)
    result['product_price_new'] = result['product_price_new'].apply(
        lambda raw: _hm_price(raw) if pd.notnull(raw) else raw)

    return result




###################################################################
#                         Regex Macy's
###################################################################

# Raw strings keep '\d' and '\.' as regex syntax instead of invalid
# string-literal escapes. The first decimal number is matched directly: the
# previous leading '.' consumed the first digit whenever the number was not
# preceded by another character, and 'Sale..' assumed exactly two filler
# characters between the word and the amount.
_MACYS_PRICE_RE = re.compile(r'(\d+\.\d+)')
_MACYS_SALE_RE = re.compile(r'Sale\D*(\d+\.\d+)')


def regex_macys(data):
    """Split the scraped Macy's price text into list price and sale price.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a string column ``product_price`` holding the scraped
        price text.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place: ``product_sale`` holds the
        decimal number following the word "Sale" as a string, or the integer
        ``0`` when the text has no "Sale"; ``product_price`` is replaced by
        the first decimal number of the text, as a string.
    """
    raw_prices = data['product_price']

    # The sale price is read from the untouched text before 'product_price'
    # is overwritten below.
    data['product_sale'] = raw_prices.apply(
        lambda text: _MACYS_SALE_RE.search(text).group(1) if "Sale" in text else 0)
    data['product_price'] = raw_prices.apply(
        lambda text: _MACYS_PRICE_RE.search(text).group(1))

    return data


## 2.3 - Plotting

# 3 - Web Scraping

## 3.1 - Web scraping H&M

A extração dos dados do site da H&M foi realizada com a biblioteca Selenium, sendo executada na IDE PyCharm. O arquivo ".py" pode ser encontrado no repositório.

## 3.2 - Web scraping Macy's

In [None]:
# Scrapes the Macy's men's jeans listing (all result pages), then prepends the
# new rows to the history kept in 'data.csv'.

url = 'https://www.macys.com/shop/mens-clothing/mens-jeans/Productsperpage/120?id=11221'

headers = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'}


def _fetch_listing(page_url):
    """Download one listing page and return it parsed with BeautifulSoup."""
    # A timeout keeps a stalled connection from hanging the cell forever.
    response = requests.get(page_url, headers=headers, timeout=30)
    # Surface HTTP errors here instead of failing later on a missing grid.
    response.raise_for_status()
    return bts(response.text, 'html.parser')


def _parse_listing(listing):
    """Return (ids, names, references, prices) of the products on one page."""
    vitrine = listing.find('ul', class_='items grid-x small-up-2 medium-up-3 large-up-3')
    if vitrine is None:
        raise ValueError('product grid not found in the listing page')

    # Product id
    ids = [item.get('id')
           for item in vitrine.find_all('div', class_='productThumbnail redesignEnabled')]

    # Product name
    names = [tag.get_text().strip()
             for tag in vitrine.find_all('div', class_='productBrand')]

    # Product reference: only every other link is read.
    # NOTE(review): presumably each product carries two 'productDescLink'
    # anchors - confirm against the page markup.
    references = [link.get('title')
                  for link in vitrine.find_all('a', class_='productDescLink')[::2]]

    # Product price
    prices = [tag.get_text().strip()
              for tag in vitrine.find_all('div', class_='prices')]

    return ids, names, references, prices


# Step 1: All items of the first page

soup = _fetch_listing(url)
product_id, product_name, product_ref, product_price = _parse_listing(soup)


# Step 2: Extracting data all pages

# Pagination: the number of <option> entries gives the number of pages.
pagination = soup.find('div', class_='cell small-12')
if pagination is None:
    raise ValueError('pagination block not found in the first listing page')
page_count = len(pagination.find_all('option'))

for page_number in range(2, page_count + 1):

    url2 = 'https://www.macys.com/shop/mens-clothing/mens-jeans/Pageindex,Productsperpage/' + str(page_number) + ',120?id=11221'

    ids2, names2, refs2, prices2 = _parse_listing(_fetch_listing(url2))

    product_id.extend(ids2)
    product_name.extend(names2)
    product_ref.extend(refs2)
    product_price.extend(prices2)


# Step 3: Creating dataframe

data1 = pd.DataFrame([product_id, product_name, product_ref, product_price]).T
data1.columns = ['product_id', 'product_name', 'product_ref', 'product_price']
data1['data_scrapy'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

# On the very first run there is no history file yet; start it from this scrape
# instead of losing the freshly downloaded data to a FileNotFoundError.
try:
    data = pd.read_csv('data.csv')
except FileNotFoundError:
    data = None

if data is None:
    data_macys = data1
else:
    data_macys = pd.concat([data1, data], axis=0, ignore_index=True)

data_macys.to_csv('data.csv', index=False)

# 4 - Data Processing

## 4.1 - Processing H&M

In [61]:
# Clean the raw H&M scrape, then derive the fabric shares and price columns.
data_hm = regex_hm(dp_hm('data_h&m.csv'))

def dtypes_hm(data):
    """Convert ``product_id`` to whole-number strings and return the frame.

    The column is cast to int64 first so that float ids lose their decimal
    part before being turned into text. The frame is modified in place.
    """
    whole_ids = data['product_id'].astype('int64')
    data['product_id'] = whole_ids.astype('str')

    return data

# Scrape timestamps are stored as day/month/year hour:minute text; parse them
# into datetime values.
scrape_format = "%d/%m/%Y %H:%M"
data_hm['data_scrapy'] = data_hm['data_scrapy'].apply(
    lambda stamp: datetime.strptime(stamp, scrape_format))
data_hm.dtypes
data_hm

Unnamed: 0,product_id,product_name,product_price,product_price_new,product_color,product_fit,product_size,product_materials,product_composition,data_scrapy,cotton,spandex,polyester,elastomultiester
0,1024256001.00,Slim Jeans,29.99,,Black,Slim fit,"The model is 185cm/6'1"" and wears a size 31/32",Lining: Recycled cotton 20%,"CompositionPocket lining: Polyester 65%, Cotto...",2022-04-12 13:40:00,0.99,0.01,0.00,0.00
1,1024256002.00,Slim Jeans,29.99,,Light denim blue,Slim fit,"The model is 189cm/6'2"" and wears a size 31/32","Lining: Recycled polyester 65%, Recycled cotto...","CompositionShell: Cotton 99%, Spandex 1%Pocket...",2022-04-12 13:40:00,0.99,0.01,0.00,0.00
2,1024256003.00,Slim Jeans,29.99,,Light denim blue,Slim fit,"The model is 189cm/6'2"" and wears a size 31/32",Shell: Recycled cotton 20%,"CompositionShell: Cotton 99%, Spandex 1%Pocket...",2022-04-12 13:40:00,0.99,0.01,0.00,0.00
3,1024256004.00,Slim Jeans,29.99,,Denim blue,Slim fit,,"Lining: Recycled polyester 65%, Recycled cotto...","CompositionShell: Cotton 99%, Spandex 1%Pocket...",2022-04-12 13:40:00,0.99,0.01,0.00,0.00
4,1024256005.00,Slim Jeans,29.99,,Dark blue,Slim fit,"The model is 180cm/5'11"" and wears a size 31/32",Shell: Recycled cotton 20%,"CompositionPocket lining: Polyester 65%, Cotto...",2022-04-12 13:40:00,0.99,0.01,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2375,993887002.00,Hybrid Regular Denim Joggers,39.99,,Denim blue,Regular fit,"The model is 187cm/6'2"" and wears a size 31/32",Lining: Recycled polyester 65%,"CompositionCotton 77%, Polyester 21%, Spandex 2%",2022-04-12 23:24:00,0.77,0.02,0.21,0.00
2376,993887004.00,Hybrid Regular Denim Joggers,39.99,26.99,Black,Regular fit,,"Shell: Recycled polyester 20%, Recycled cotton...","CompositionCotton 79%, Polyester 20%, Spandex 1%",2022-04-12 23:24:00,0.79,0.01,0.20,0.00
2377,1048642001.00,Regular Bootcut Jeans,39.99,,Denim blue,Regular fit,,Lining: Recycled polyester 80%,"CompositionShell: Cotton 99%, Spandex 1%Pocket...",2022-04-12 23:24:00,0.99,0.01,0.00,0.00
2378,875105001.00,Relaxed Jeans,39.99,22.99,Denim blue,Relaxed fit,"The model is 189cm/6'2"" and wears a size 31/32",Shell: Recycled cotton 20%,CompositionCotton 100%,2022-04-12 23:24:00,1.00,0.00,0.00,0.00


In [53]:
# Worked example: parse a timestamp string and show each of its fields.
date = '2021-05-21 11:22:03'
datem = datetime.strptime(date, "%Y-%m-%d %H:%M:%S")

# Printed one per line: day 21, month 5, year 2021, hour 11, minute 22, second 3
print(datem.day, datem.month, datem.year,
      datem.hour, datem.minute, datem.second, sep='\n')

21
5
2021
11
22
3


In [54]:
# Show the type of the value returned by datetime.strptime.
type(datem)

datetime.datetime

# 5 - Data Description

# 6 - EDA

# 7 - Key Insights

# 8 - Conclusion and Next Steps