# Verificação de Scripts


### Imports

In [1]:
import re
import requests
import pandas as pd
from numpy import int64
from bs4   import BeautifulSoup

### Collect Data - WOMEN

In [2]:
# parameters
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'}

# URL
url = 'https://www2.hm.com/en_us/search-results.html?q=t-shirt%20ladies&sort=stock&image-size=small&image=stillLife&offset=0&page-size=216'

# Request to URL
page = requests.get( url, headers=headers )

# Beautiful Soup object #
soup = BeautifulSoup( page.text, 'html.parser' )

#=========================== Products e columns ===========================
products = soup.find( 'ul', class_='products-listing small' )
product_list = soup.find_all( 'article', class_='hm-product-item' )

# id 
product_id = [p.get( 'data-articlecode' ) for p in product_list]

# category 
product_category = [p.get( 'data-category' ) for p in product_list]

# product name #
product_list = products.find_all( 'a', class_='link' )
product_name = [p.get_text() for p in product_list]

# price #
product_list = products.find_all( 'span', class_='price regular' )
product_price = [p.get_text() for p in product_list]

# Create dataframe 
data_women = pd.DataFrame( [product_id, product_category, product_name, product_price] ).T
data_women.columns = ['product_id', 'product_category', 'product_name', 'product_price']

# create columns gender
data_women['gender'] = None
data_women['gender'] = data_women['gender'].fillna('female')

In [3]:
data_women.head()

Unnamed: 0,product_id,product_category,product_name,product_price,gender
0,963662002,ladies_basics_tops_shortsleeve,Cotton T-shirt,$ 5.99,female
1,762558221,ladies_tops_printed_tshirts,Oversized Printed T-shirt,$ 17.99,female
2,1031652022,ladies_tops_printed_tshirts,Printed T-shirt,$ 17.99,female
3,762558229,ladies_tops_printed_tshirts,Oversized Printed T-shirt,$ 17.99,female
4,963662001,ladies_basics_tops_shortsleeve,Cotton T-shirt,$ 5.99,female


### Collect Data - MEN

In [4]:
# parameters
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'}

# URL
url = 'https://www2.hm.com/en_us/men/products/t-shirts-tank-tops.html?sort=stock&image-size=small&image=model&offset=0&page-size=367'

# Request to URL
page = requests.get( url, headers=headers )

# Beautiful Soup object #
soup = BeautifulSoup( page.text, 'html.parser' )

#=========================== Products e columns ===========================
products = soup.find( 'ul', class_='products-listing small' )
product_list = soup.find_all( 'article', class_='hm-product-item' )

# id 
product_id = [p.get( 'data-articlecode' ) for p in product_list]

# category 
product_category = [p.get( 'data-category' ) for p in product_list]

# product name #
product_list = products.find_all( 'a', class_='link' )
product_name = [p.get_text() for p in product_list]

# price #
product_list = products.find_all( 'span', class_='price regular' )
product_price = [p.get_text() for p in product_list]

# Create dataframe 
data_men = pd.DataFrame( [product_id, product_category, product_name, product_price] ).T
data_men.columns = ['product_id', 'product_category', 'product_name', 'product_price']

# create columns gender
data_men['gender'] = None
data_men['gender'] = data_men['gender'].fillna('male')

In [5]:
data_men.head()

Unnamed: 0,product_id,product_category,product_name,product_price,gender
0,685816053,men_tshirtstanks_shortsleeve,Regular Fit Crew-neck T-shirt,$ 5.99,male
1,624684003,men_tshirtstanks_multipacks,5-pack Slim Fit T-shirts,$ 34.99,male
2,598755001,men_tshirtstanks_shortsleeve,Long Fit T-shirt,$ 9.99,male
3,685816002,men_tshirtstanks_shortsleeve,Regular Fit Crew-neck T-shirt,$ 5.99,male
4,608945008,men_tshirtstanks_shortsleeve,Relaxed Fit T-shirt,$ 12.99,male


## Join of Tables 

In [6]:
data = pd.concat([data_women, data_men])
data

Unnamed: 0,product_id,product_category,product_name,product_price,gender
0,0963662002,ladies_basics_tops_shortsleeve,Cotton T-shirt,$ 5.99,female
1,0762558221,ladies_tops_printed_tshirts,Oversized Printed T-shirt,$ 17.99,female
2,1031652022,ladies_tops_printed_tshirts,Printed T-shirt,$ 17.99,female
3,0762558229,ladies_tops_printed_tshirts,Oversized Printed T-shirt,$ 17.99,female
4,0963662001,ladies_basics_tops_shortsleeve,Cotton T-shirt,$ 5.99,female
...,...,...,...,...,...
358,1084761001,men_tshirtstanks,Wool-blend Base-layer Top,$ 64.99,male
359,0653275070,men_tshirtstanks_shortsleeve,Sports Shirt in DryMove™,$ 12.99,male
360,0653706054,men_tshirtstanks_shortsleeve,Slim Fit Sports Shirt,$ 17.99,male
361,1025399019,men_tshirtstanks_shortsleeve,Running Shirt in DryMove™,$ 17.99,male


In [7]:
# Index reset
data.reset_index(drop=True, inplace=True)

### Data Cleaning

In [8]:
data.columns # view columns

Index(['product_id', 'product_category', 'product_name', 'product_price',
       'gender'],
      dtype='object')

In [13]:
data.dtypes # Types the data

product_id            int64
product_category     object
product_name         object
product_price       float64
gender               object
dtype: object

In [14]:
data.isna().sum() # There are no null values

product_id          0
product_category    0
product_name        0
product_price       0
gender              0
dtype: int64

In [9]:
# product_id
data['product_id'] = data['product_id'].astype('int64')


# product_name
data['product_name'] = data['product_name'].apply( lambda x: x.replace( ' ', '_' ).lower() )

# product_price 
data['product_price'] = data['product_price'].apply( lambda x: float( re.search( '\d+\.?\d+', x ).group(0)))

In [10]:
data

Unnamed: 0,product_id,product_category,product_name,product_price,gender
0,963662002,ladies_basics_tops_shortsleeve,cotton_t-shirt,5.99,female
1,762558221,ladies_tops_printed_tshirts,oversized_printed_t-shirt,17.99,female
2,1031652022,ladies_tops_printed_tshirts,printed_t-shirt,17.99,female
3,762558229,ladies_tops_printed_tshirts,oversized_printed_t-shirt,17.99,female
4,963662001,ladies_basics_tops_shortsleeve,cotton_t-shirt,5.99,female
...,...,...,...,...,...
574,1084761001,men_tshirtstanks,wool-blend_base-layer_top,64.99,male
575,653275070,men_tshirtstanks_shortsleeve,sports_shirt_in_drymove™,12.99,male
576,653706054,men_tshirtstanks_shortsleeve,slim_fit_sports_shirt,17.99,male
577,1025399019,men_tshirtstanks_shortsleeve,running_shirt_in_drymove™,17.99,male


## create file csv

In [16]:
data.to_csv('web_scraping', index=False)
