# BUSINESS PROBLEM: STAR JEANS

Eduardo and Marcelo are two Brazilians, friends and business partners. After several successful businesses, they are planning to enter the US fashion market with an e-commerce business model.

The initial idea is to enter the market with just one product for a specific audience: in this case, jeans for men. The objective is to keep operating costs low and scale as they gain customers.

However, even with the entry product and audience defined, the two partners have no experience in the fashion market, so they do not know how to define basic things such as the price, the types of pants and the materials used to manufacture each piece.

Thus, the two partners hired a Data Science consultancy to answer the following questions:

* What is the best selling price for the pants?
* How many types of pants, and in which colors, should the initial product line have?
* What are the raw materials needed to make the pants?

The main competitors of the Star Jeans company are the American companies H&M and Macy's.

# 1. Imports

In [34]:
# webscrapping
import requests
from bs4 import BeautifulSoup

# date
from datetime import datetime

# data manipulation
import pandas as pd
import numpy as np

# monitoring
from tqdm import tqdm

# regex
import re

# database
import sqlite3
from sqlalchemy import create_engine

# 2. Web Scraping (EXTRACT)

In [2]:
# extract data from competitors: download the H&M men's jeans listing page
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
# browser-like User-Agent sent with the request
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36"}

# request
# - timeout: requests waits indefinitely by default, so a stalled connection
#   would hang the notebook
# - raise_for_status: surface HTTP errors here instead of as an obscure
#   parsing failure in the next cell
page = requests.get(url, headers = headers, timeout = 30)
page.raise_for_status()

In [3]:
# instantiate BeautifulSoup on the HTML of the listing page downloaded above
soup = BeautifulSoup(page.text, 'html.parser')

# find total number of products (read from the 'data-total' attribute of the
# first 'load-more-heading' <h2>; the value is a string)
total_item = soup.find_all('h2', class_ = 'load-more-heading')[0].get('data-total')

# find number of items shown in page ('data-items-shown' on the same heading)
items_page = soup.find_all('h2', class_ = 'load-more-heading')[0].get('data-items-shown')

# get number of pages
# NOTE(review): `pages` is not used by any later cell in this notebook, so only
# the products present in this first response are scraped - confirm whether
# the remaining pages should be requested as well
pages = np.ceil(int(total_item) / int(items_page))

# get section with all products (reused below to look up names and prices)
products = soup.find('ul', class_ = 'products-listing small')

# get list of products overall (one <article> per product; carries the
# article code and category as data attributes)
product_list = soup.find_all('article', class_ = 'hm-product-item')

# notebook display: how many products were found on this page
len(product_list)

36

## 2.1. Generate table

Table format: 

Id | Category | Name | Price | Color | Composition

### 2.1.1 Id, Category, Name, Price

In [4]:
# product_id
# article code of every product tile on the listing page
product_id = [article.get('data-articlecode') for article in product_list]

# category label of every product tile, in the same order as product_id
product_category = [article.get('data-category') for article in product_list]

In [5]:
# get list of products to get name
product_list = products.find_all('a', class_ = 'link')

# product_name
product_name = [p.get_text() for p in product_list]

In [6]:
# get list of products to get price
product_list = products.find_all('span', class_ = 'price regular')

# product_price
product_price = [p.get_text() for p in product_list]

In [7]:
# pass data to DataFrame
df_products = pd.DataFrame([product_id, product_category, product_name, product_price]).T
df_products.columns = ['product_id', 'product_category', 'product_name', 'product_price']

# scrapy datetime
df_products['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

df_products.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime
0,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-10-08 21:03:55
1,985197005,men_jeans_slim,Slim Jeans,$ 19.99,2021-10-08 21:03:55
2,1008549001,men_jeans_regular,Regular Jeans,$ 19.99,2021-10-08 21:03:55
3,974202002,men_jeans_loose,Regular Denim Joggers,$ 29.99,2021-10-08 21:03:55
4,1004476004,men_jeans_slim,Freefit® Slim Jeans,$ 49.99,2021-10-08 21:03:55


### 2.1.2. Color, Composition

In [8]:
# getting color for all products
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36"}
cols = ['Art. No.', 'Composition', 'Fit', 'Product safety', 'Size']
df_pattern = pd.DataFrame(columns = cols)
df_details = pd.DataFrame()

for index in tqdm(product_id):
    url = f'https://www2.hm.com/en_us/productpage.{index}.html'
    
    # request
    page = requests.get(url, headers = headers)
    
    # instantiate BeatifulSoup
    soup = BeautifulSoup(page.text, 'html.parser')
    
    # COLOR
    ## product list
    product_list = soup.find_all('a', class_ = 'filter-option miniature')
    
    ## color name
    color_name = [p.get('data-color') for p in product_list]
    
    # ID FOR MERGE
    ## product_id
    color_product_id = [p.get('data-articlecode') for p in product_list]
    
    # pass to dataframe
    df_color = pd.DataFrame([color_product_id, color_name]).T
    df_color.columns = ['product_id', 'color_name']
    
    # generate style id + color id
    df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])
    
    # COMPOSITION
    ## composition list
    product_composition_list = soup.find_all('div', class_ = 'pdp-description-list-item')
    
    ## composition names
    product_composition = [list(filter(None, c.get_text().split('\n'))) for c in product_composition_list]
    
    # pass to dataframe
    df_composition = pd.DataFrame(product_composition).T
    
    # set columns
    df_composition.columns = df_composition.iloc[0]
    
    # delete first row and fill na
    df_composition = df_composition.iloc[1:].fillna(method = 'ffill')

    # garantee same number of columns
    df_composition = pd.concat([df_pattern, df_composition], axis = 0).reset_index(drop = True)
    
    # generate style id + color id
    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
    df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])
    
    # merge color and composition
    df_color_composition = df_color.merge(df_composition[['style_id', 'Fit', 'Composition', 'Size', 'Product safety']], 
                                          how = 'left', on = 'style_id')
    
    # all details from products
    df_details = pd.concat([df_details, df_color_composition], axis = 0).reset_index(drop = True)

df_details.head()

100%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [00:32<00:00,  1.12it/s]


Unnamed: 0,product_id,color_name,style_id,color_id,Fit,Composition,Size,Product safety
0,690449001,Light denim blue/trashed,690449,1,Skinny fit,"Cotton 98%, Elastane 2%",,
1,690449002,Denim blue,690449,2,Skinny fit,"Cotton 98%, Elastane 2%",,
2,690449006,Black/washed,690449,6,Skinny fit,"Cotton 98%, Elastane 2%",,
3,690449007,Light denim blue,690449,7,Skinny fit,"Cotton 98%, Elastane 2%",,
4,690449009,Black washed out,690449,9,Skinny fit,"Cotton 98%, Elastane 2%",,


### 2.1.3. Merge with main product list

In [9]:
# generate style id + color id for df_products
df_products['style_id'] = df_products['product_id'].apply(lambda x: x[:-3])
# df_products['color_id'] = df_products['product_id'].apply(lambda x: x[-3:])

# final merge
data_raw = df_products.merge(df_details[['style_id', 'color_id', 'color_name', 'Fit', 'Composition', 'Size', 'Product safety']], 
                                how = 'left', on = ['style_id'])
data_raw.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,Fit,Composition,Size,Product safety
0,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-10-08 21:03:55,690449,1,Light denim blue/trashed,Skinny fit,"Cotton 98%, Elastane 2%",,
1,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-10-08 21:03:55,690449,2,Denim blue,Skinny fit,"Cotton 98%, Elastane 2%",,
2,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-10-08 21:03:55,690449,6,Black/washed,Skinny fit,"Cotton 98%, Elastane 2%",,
3,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-10-08 21:03:55,690449,7,Light denim blue,Skinny fit,"Cotton 98%, Elastane 2%",,
4,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-10-08 21:03:55,690449,9,Black washed out,Skinny fit,"Cotton 98%, Elastane 2%",,


In [10]:
# notebook display: (rows, columns) of the merged raw table
data_raw.shape

(1294, 12)

In [11]:
# lower column names
data_raw.columns = data_raw.columns.str.lower()

In [12]:
# persist the raw scrape without the DataFrame index; the cleaning step reloads this file
data_raw.to_csv('./data/data_raw.csv', index = False)

# 3. Data Cleaning (TRANSFORM)

In [13]:
# reload the raw scrape from disk; pandas infers dtypes, so the id columns come
# back as integers (see the dtypes output below)
df = pd.read_csv('./data/data_raw.csv')
df.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,fit,composition,size,product safety
0,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-10-08 21:03:55,690449,1,Light denim blue/trashed,Skinny fit,"Cotton 98%, Elastane 2%",,
1,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-10-08 21:03:55,690449,2,Denim blue,Skinny fit,"Cotton 98%, Elastane 2%",,
2,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-10-08 21:03:55,690449,6,Black/washed,Skinny fit,"Cotton 98%, Elastane 2%",,
3,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-10-08 21:03:55,690449,7,Light denim blue,Skinny fit,"Cotton 98%, Elastane 2%",,
4,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-10-08 21:03:55,690449,9,Black washed out,Skinny fit,"Cotton 98%, Elastane 2%",,


In [14]:
# check missing values
df.isna().sum()

product_id             0
product_category       0
product_name           0
product_price          0
scrapy_datetime        0
style_id               0
color_id               0
color_name             0
fit                    0
composition            0
size                 386
product safety      1268
dtype: int64

In [15]:
# check data types
df.dtypes

product_id           int64
product_category    object
product_name        object
product_price       object
scrapy_datetime     object
style_id             int64
color_id             int64
color_name          object
fit                 object
composition         object
size                object
product safety      object
dtype: object

In [16]:
# product_id
df = df.dropna(subset = ['product_id'])
df['product_id'] = df['product_id'].astype(int)

# product name - change format
df['product_name'] = df['product_name'].apply(lambda x: x.lower().replace(' ', '_'))

# product price - remove $
df['product_price'] = df['product_price'].apply(lambda x: x.replace('$', '') if pd.notnull(x) else x).astype(float)

# scrapy datetime
df['scrapy_datetime'] = pd.to_datetime(df['scrapy_datetime'], errors = 'coerce')

# style id
df['style_id'] = df['style_id'].astype(int)

# color id
df['color_id'] = df['color_id'].astype(int)

# color name - change format
df['color_name'] = df['color_name'].apply(lambda x: x.lower().replace(' ', '_').replace('/', '_') if pd.notnull(x) else x)

# fit
df['fit'] = df['fit'].apply(lambda x: x.lower().replace(' ', '_') if pd.notnull(x) else x)

# size number
df['size_number'] = df['size'].apply(lambda x: re.search('\d{3}cm', x).group(0) if pd.notnull(x) else x)
df['size_number'] = df['size_number'].apply(lambda x: re.search('\d+', x).group(0) if pd.notnull(x) else x)

# size model
df['size_model'] = df['size'].str.extract('(\d+/\\d+)')

# composition
df = df[~df['composition'].str.contains('Pocket lining:', na = False)]
df = df[~df['composition'].str.contains('Lining:', na = False)]
df = df[~df['composition'].str.contains('Shell:', na = False)]
df = df[~df['composition'].str.contains('Pocket:', na = False)]

# drop duplicates
df = df.drop_duplicates(subset = ['product_id', 'product_category', 'product_name', 'product_price',
                                  'scrapy_datetime', 'style_id', 'color_id', 'color_name', 'fit'],
                        keep = 'last')

# reset index
df = df.reset_index(drop = True)

# break composition by comma
df_aux = df['composition'].str.split(',', expand = True)

df.head(2)

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,fit,composition,size,product safety,size_number,size_model
0,690449051,men_jeans_ripped,skinny_jeans,39.99,2021-10-08 21:03:55,690449,43,light_denim_blue_trashed,skinny_fit,"Cotton 98%, Elastane 2%","The model is 184cm/6'0"" and wears a size 31/32",,184,31/32
1,690449051,men_jeans_ripped,skinny_jeans,39.99,2021-10-08 21:03:55,690449,1,light_denim_blue_trashed,skinny_fit,"Cotton 98%, Elastane 2%","The model is 187cm/6'2"" and wears a size 32/32",,187,32/32


In [17]:
# cotton / polyester / elastane / elasterell
df_ref = pd.DataFrame(index = np.arange(len(df)), columns = ['cotton', 'polyester', 'elastane', 'elasterell'])

# cotton
df_cotton = df_aux[0]
df_cotton.name = 'cotton'

df_ref = pd.concat([df_ref, df_cotton], axis = 1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep = 'last')]
df_ref = df_ref.fillna('Cotton 0%')

# polyester
df_polyester = df_aux.loc[df_aux[1].str.contains('Polyester', na = True), 1]
df_polyester.name = 'polyester'

df_ref = pd.concat([df_ref, df_polyester], axis = 1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep = 'last')]
df_ref = df_ref.fillna('Polyester 0%')

# elastane
df_elastane = df_aux.loc[df_aux[1].str.contains('Elastane', na = True), 1]
df_elastane.name = 'elastane'

# combine elastane from columns 1 and 2 from df_aux
df_elastane = df_elastane.combine_first(df_aux[2])

df_ref = pd.concat([df_ref, df_elastane], axis = 1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep = 'last')]
df_ref = df_ref.fillna('Elastane 0%')

# elasterell
df_elasterell = df_aux.loc[df_aux[1].str.contains('Elasterell', na = True), 1]
df_elasterell.name = 'elasterell'

df_ref = pd.concat([df_ref, df_elasterell], axis = 1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep = 'last')]
df_ref = df_ref.fillna('Elasterell-P 0%')

# final join
df = pd.concat([df, df_ref], axis = 1)

# format composition data
df['cotton'] = df['cotton'].apply(lambda x: int(re.search('\d+', x).group(0)) / 100 if pd.notnull(x) else x)
df['polyester'] = df['polyester'].apply(lambda x: int(re.search('\d+', x).group(0)) / 100 if pd.notnull(x) else x)
df['elastane'] = df['elastane'].apply(lambda x: int(re.search('\d+', x).group(0)) / 100 if pd.notnull(x) else x)
df['elasterell'] = df['elasterell'].apply(lambda x: int(re.search('\d+', x).group(0)) / 100 if pd.notnull(x) else x)

# drop columns
df = df.drop(columns = ['size', 'product safety', 'composition'])

# drop duplicates
df = df.drop_duplicates()
df.shape

(127, 15)

In [18]:
# notebook display: first rows of the cleaned table
df.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,fit,size_number,size_model,cotton,polyester,elastane,elasterell
0,690449051,men_jeans_ripped,skinny_jeans,39.99,2021-10-08 21:03:55,690449,43,light_denim_blue_trashed,skinny_fit,184,31/32,0.98,0.0,0.02,0.0
1,690449051,men_jeans_ripped,skinny_jeans,39.99,2021-10-08 21:03:55,690449,1,light_denim_blue_trashed,skinny_fit,187,32/32,0.98,0.0,0.02,0.0
2,690449051,men_jeans_ripped,skinny_jeans,39.99,2021-10-08 21:03:55,690449,2,denim_blue,skinny_fit,187,32/32,0.98,0.0,0.02,0.0
3,690449051,men_jeans_ripped,skinny_jeans,39.99,2021-10-08 21:03:55,690449,6,black_washed,skinny_fit,187,32/32,0.98,0.0,0.02,0.0
4,690449051,men_jeans_ripped,skinny_jeans,39.99,2021-10-08 21:03:55,690449,7,light_denim_blue,skinny_fit,187,32/32,0.98,0.0,0.02,0.0


In [19]:
# notebook display: column types after cleaning
df.dtypes

product_id                   int32
product_category            object
product_name                object
product_price              float64
scrapy_datetime     datetime64[ns]
style_id                     int32
color_id                     int32
color_name                  object
fit                         object
size_number                 object
size_model                  object
cotton                     float64
polyester                  float64
elastane                   float64
elasterell                 float64
dtype: object

In [20]:
# persist the cleaned table; index = False (as for data_raw.csv) keeps the
# DataFrame index out of the file, so reloading it does not produce a stray
# 'Unnamed: 0' column
df.to_csv('./data/data_clean.csv', index = False)

# 4. Save data in a database (LOAD)

In [21]:
# reload the cleaned dataset from disk
# NOTE(review): the insert cell below writes `df`, not `df_clean` - confirm
# which frame is meant to be loaded into the database
df_clean = pd.read_csv('./data/data_clean.csv')

In [30]:
# DDL for the `showroom` table, one column per field of the cleaned dataset.
# IF NOT EXISTS makes the statement safe to re-run: without it, executing this
# notebook a second time fails with "table showroom already exists".
query_showroom_schema = """
CREATE TABLE IF NOT EXISTS showroom (
    product_id INTEGER, 
    product_category TEXT, 
    product_name TEXT, 
    product_price REAL,
    scrapy_datetime TEXT, 
    style_id INTEGER, 
    color_id TEXT, 
    color_name TEXT, 
    fit TEXT,
    size_number REAL, 
    size_model TEXT, 
    cotton REAL,
    polyester REAL,
    elastane REAL,
    elasterell REAL
    )
"""

In [31]:
# connect to database and create the showroom table
conn = sqlite3.connect('./database/hm_db.sqlite')
try:
    cursor = conn.execute(query_showroom_schema)
    conn.commit()
finally:
    # always release the database file, even when the statement fails
    conn.close()

In [38]:
# create engine sqlalchemy
conn = create_engine('sqlite:///./database/hm_db.sqlite', echo = False)

In [44]:
# insert data to table
df.to_sql('showroom', con = conn, if_exists = 'append', index = False)

In [48]:
# extract data from database
query = """
SELECT
    *
FROM
    showroom
"""
df = pd.read_sql_query(query, conn)
df.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,fit,size_number,size_model,cotton,polyester,elastane,elasterell
0,690449051,tshirt,skinny_jeans,39.99,2021-10-08 21:03:55.000000,690449,43,light_denim_blue_trashed,skinny_fit,184.0,31/32,0.98,0.0,0.02,0.0
1,690449051,tshirt,skinny_jeans,39.99,2021-10-08 21:03:55.000000,690449,1,light_denim_blue_trashed,skinny_fit,187.0,32/32,0.98,0.0,0.02,0.0
2,690449051,tshirt,skinny_jeans,39.99,2021-10-08 21:03:55.000000,690449,2,denim_blue,skinny_fit,187.0,32/32,0.98,0.0,0.02,0.0
3,690449051,tshirt,skinny_jeans,39.99,2021-10-08 21:03:55.000000,690449,6,black_washed,skinny_fit,187.0,32/32,0.98,0.0,0.02,0.0
4,690449051,tshirt,skinny_jeans,39.99,2021-10-08 21:03:55.000000,690449,7,light_denim_blue,skinny_fit,187.0,32/32,0.98,0.0,0.02,0.0


In [47]:
# Reference SQL commands; only the last one (CREATE INDEX) is executed.

# # command - UPDATE
# query = """
# UPDATE showroom
# SET product_category = 'tshirt'
# WHERE product_id = 690449051
# """

#command - DROP
# query = """
# DROP TABLE showroom
# """

# command - ALTER TABLE (SQLite requires the TO keyword when renaming)
# query = """
# ALTER TABLE showroom
# RENAME TO showroom_two
# """

# command - CREATE INDEX
# IF NOT EXISTS keeps the cell re-runnable; without it a second run fails
# because the index already exists
query = """
CREATE INDEX IF NOT EXISTS idx_product_id
ON showroom (product_id)
"""

# `conn` is re-bound to a raw sqlite3 connection; it is closed in the next cell
conn = sqlite3.connect('./database/hm_db.sqlite')
cursor = conn.execute(query)
conn.commit()

In [49]:
# close the sqlite3 connection opened in the previous cell
conn.close()