# 1 - Imports

In [1]:
from bs4 import BeautifulSoup as bts
from datetime import datetime

import requests
import pandas as pd
import re

# Render floats with two decimal places whenever pandas displays them.
pd.options.display.float_format = '{:.2f}'.format

# 2 - Helper Functions

## 2.1 - Data processing

In [2]:
###################################################################
#                     Data Processing H&M
###################################################################

def dp_hm(path):
    """Load the scraped H&M CSV and return a cleaned DataFrame.

    Cleaning steps, in order:

    1. drop fully duplicated rows;
    2. drop rows with a missing value in any of the main columns;
    3. drop rows whose ``product_fit`` holds one of two known
       out-of-context texts (a model-size sentence instead of a fit name);
    4. renumber the index 0..n-1.

    Parameters
    ----------
    path
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows with a fresh, contiguous integer index.
    """
    required_columns = [
        'product_id',
        'product_name',
        'product_price',
        'product_composition',
        'product_fit',
    ]

    # Values seen in 'product_fit' that describe the model, not the fit.
    out_of_context_fits = [
        """The model is 189cm/6'2" and wears a size 31/32""",
        """The model is 187cm/6'2" and wears a size 31/32""",
    ]

    data = pd.read_csv(path)

    # Remove duplicated rows.
    data = data.drop_duplicates()

    # Remove rows with null/missing values in the main columns.
    data = data.dropna(subset=required_columns)

    # Remove out-of-context rows.
    data = data.loc[~data['product_fit'].isin(out_of_context_fits), :]

    # Reset the index only after the last filter: resetting earlier left gaps
    # in the index, which breaks callers that address rows by 0..n-1 labels.
    return data.reset_index(drop=True)


## 2.2 - Regular expression

In [8]:
###################################################################
#                         Regex H&M
###################################################################

# Patterns that isolate the main-fabric part of a raw composition string.
# In each pattern exactly one of the two groups participates in a match.
_SHELL_PATTERN = re.compile(r'Shell:\s(.+%)[A-Z]|Shell:\s(.+%)')
_LINED_PATTERN = re.compile(r'Composition([Cotton].+%)[A-Z]|%([Cotton].+%)')
_PLAIN_PATTERN = re.compile(r'Composition([Cotton].+%)|%([Cotton].+%)')

_DIGITS_PATTERN = re.compile(r'\d+')
_PRICE_PATTERN = re.compile(r'\w.+')


def _main_fabric(composition):
    """Return the main-fabric text captured from a raw composition string."""
    if 'Shell' in composition:
        pattern = _SHELL_PATTERN
    elif 'Pocket' in composition or 'Lining' in composition:
        pattern = _LINED_PATTERN
    else:
        pattern = _PLAIN_PATTERN

    match = pattern.search(composition)
    if match is None:
        raise ValueError(f'unrecognised product_composition: {composition!r}')

    second_group = match.group(2)
    return match.group(1) if second_group is None else second_group


def _fraction(component):
    """Return the first integer found in ``component`` divided by 100."""
    match = _DIGITS_PATTERN.search(component)
    if match is None:
        raise ValueError(f'no percentage found in component: {component!r}')
    return int(match.group(0)) / 100


def _fabric_fractions(fabric):
    """Return (cotton, spandex, polyester, elastomultiester) for one fabric text."""
    components = fabric.split(',')
    second = components[1] if len(components) > 1 else None
    third = components[2] if len(components) > 2 else None

    # The first component is always taken as cotton.
    cotton = _fraction(components[0])

    # Spandex is the second component when it names spandex, otherwise the
    # third component if there is one.
    # NOTE(review): the third component is used whatever material it names;
    # kept as-is from the original logic - confirm this is intended.
    if second is not None and 'Spandex' in second:
        spandex_component = second
    else:
        spandex_component = third
    spandex = 0.0 if spandex_component is None else _fraction(spandex_component)

    # Polyester and elastomultiester are only looked for in the second component.
    polyester = 0.0
    elastomultiester = 0.0
    if second is not None:
        if 'Polyester' in second:
            polyester = _fraction(second)
        if 'Elastomultiester' in second:
            elastomultiester = _fraction(second)

    return cotton, spandex, polyester, elastomultiester


def _strip_price(raw_price):
    """Return the price text matched by ``\\w.+`` (drops leading non-word characters)."""
    match = _PRICE_PATTERN.search(raw_price)
    if match is None:
        raise ValueError(f'unrecognised price: {raw_price!r}')
    return match.group(0)


def regex_hm(data):
    """Add fabric-fraction columns and clean the price columns.

    Reads ``product_composition``, ``product_price`` and ``product_price_new``
    from ``data``. The input frame is not modified; a copy is returned with
    its index unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain string values in ``product_composition`` and
        ``product_price``; ``product_price_new`` may contain nulls, which
        are left untouched.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with four float columns appended, in this order:
        ``cotton``, ``spandex``, ``polyester``, ``elastomultiester`` (each a
        percentage divided by 100, 0.0 when the material is absent), and with
        the price columns reduced to the text starting at the first word
        character.

    Raises
    ------
    ValueError
        If a composition, component or price does not match its pattern.
    """
    result = data.copy()

    # Extract the material fractions from 'product_composition'. Rows are
    # visited by position, so any index (not only 0..n-1) is supported.
    fractions = [
        _fabric_fractions(_main_fabric(composition))
        for composition in result['product_composition']
    ]

    material_columns = ('cotton', 'spandex', 'polyester', 'elastomultiester')
    for position, column in enumerate(material_columns):
        values = [row[position] for row in fractions]
        # Assign a plain array so pandas pairs values by position rather
        # than by index label; the explicit dtype keeps empty input as float.
        result[column] = pd.Series(values, dtype='float64').values

    # Extract the price from 'product_price' and 'product_price_new'.
    result['product_price'] = result['product_price'].apply(_strip_price)
    result['product_price_new'] = result['product_price_new'].apply(
        lambda price: _strip_price(price) if pd.notnull(price) else price
    )

    return result

# 3 - Web Scraping

# 4 - Data Processing

# 5 - Data Description

# 6 - EDA

# 7 - Key Insights

# 8 - Conclusion and Next Steps