# Enviroment

## Installations

In [4]:
#!pip install beautifulsoup4
#!pip install pandas
#!pip3 install bs4



## Imports

In [1]:
# Data manipulation
import pandas as pd
import numpy as np
import math
import re

# Screaping
from   bs4 import BeautifulSoup
import requests

# Time
from datetime import datetime

# Visualization
from tabulate import tabulate

## Functions

In [22]:
def clean_html(input_html):
    '''
    Remove spaces and "/n" caracters
    
    :param input_html: BeautifulSoup object to clean
    
    return New object without /n and spaces
    '''
    return ' '.join(input_html.split()).replace('> <', '><')

# Web Scraping

## Beautifull Soup

### Step 1

Top page

In [43]:
# Request informations
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'}
page = requests.get(url, headers=headers)

# BeautifulSoup html page
soup = BeautifulSoup(page.text, 'html.parser')

# BeautifulSoup html products content
products = soup.find('ul', class_="products-listing small")
product_list = products.find_all('article', class_='hm-product-item')

# product id
product_id = [p.get('data-articlecode') for p in product_list]

# product category
product_category = [p.get('data-category') for p in product_list]

# product names
products_names = products.find_all('a', class_ = 'link')
products_names = [p.get_text() for p in products_names]

# product price
product_price = products.find_all('span', class_ = 'price regular')
product_price = [p.get_text() for p in product_price]

# DataFrame with scrapy data
data = pd.DataFrame([product_id, products_names, product_category, product_price]).T
data.columns = ['product_id', 'name', 'category', 'price']
data['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H-%M-%S')

In [45]:
data.head()

Unnamed: 0,product_id,name,category,price,scrapy_datetime
0,985197001,Slim Jeans,men_jeans_slim,$ 19.99,2021-11-17 10-11-07
1,985159001,Skinny Jeans,men_jeans_skinny,$ 19.99,2021-11-17 10-11-07
2,690449051,Skinny Jeans,men_jeans_ripped,$ 39.99,2021-11-17 10-11-07
3,690449022,Skinny Jeans,men_jeans_ripped,$ 39.99,2021-11-17 10-11-07
4,690449043,Skinny Jeans,men_jeans_ripped,$ 39.99,2021-11-17 10-11-07


In [11]:
# product color

In [12]:
# product composition

### Step 2

Paginations

In [16]:
# Request informations
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'}
page = requests.get(url, headers=headers)

# BeautifulSoup html page
soup = BeautifulSoup(page.text, 'html.parser')

In [5]:
total_items = int(soup.find('h2', class_ = 'load-more-heading').get('data-total'))
total_items

87

In [6]:
# How to round to upper number
page_number = math.ceil(total_items / 36)
page_number

3

In [7]:
url_2 = url + '?page-size=' + str(page_number * 36)
url_2

'https://www2.hm.com/en_us/men/products/jeans.html?page-size=108'

### Step 3

Specific product page (subpage)

#### One Product

In [17]:
#------------------------------API Request----------------------------
url = 'https://www2.hm.com/en_us/productpage.0690449051.html'
header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'}
page = requests.get(url, headers=header).text

#------------------------------BeautifulSoup Object--------------------
soup = BeautifulSoup(page)

#------------------------------Product Color---------------------------

product_detail = soup.find_all('a', class_ = 'filter-option miniature')
product_color = [p.get('data-color') for p in product_detail]
product_code = [p.get('data-articlecode') for p in product_detail]

df_color = pd.DataFrame({'product_code': product_code, 'product_color': product_color})
df_color['color_id'] = df_color['product_code'].apply(lambda x: x[-3:])
df_color['style_id'] = df_color['product_code'].apply(lambda x: x[:-3])
df_color.head()

Unnamed: 0,product_code,product_color,color_id,style_id
0,690449001,Light denim blue/trashed,1,690449
1,690449002,Denim blue,2,690449
2,690449006,Black/washed,6,690449
3,690449007,Light denim blue,7,690449
4,690449009,Black washed out,9,690449


In [18]:
#------------------------------Product Composition---------------------------
product_composition = [list(filter(None, p.get_text().split('\n'))) for p in soup.find_all('div', class_ = 'pdp-description-list-item')]

df_composition = pd.DataFrame(product_composition).T
df_composition.columns = df_composition.iloc[0]
df_composition.drop(index=0, inplace=True)

df_composition = df_composition[['Fit', 'Composition', 'Art. No.']]
df_composition.columns = df_composition.columns.str.lower()
df_composition.columns = ['fit', 'composition', 'product_code']
df_composition.fillna(method='ffill', inplace=True)
df_composition['style_id'] = df_composition['product_code'].apply(lambda x: x[:-3])
df_composition['color_id'] = df_composition['product_code'].apply(lambda x: x[-3:])
df_composition

Unnamed: 0,fit,composition,product_code,style_id,color_id
1,Skinny fit,"Cotton 98%, Spandex 2%",690449051,690449,51


In [4]:
#------------------------------SKU Product---------------------------
df_sku = pd.merge(df_color, df_composition[['style_id', 'fit', 'composition']], how='left', on='style_id')
df_sku

Unnamed: 0,product_code,product_color,color_id,style_id,fit,composition
0,690449001,Light denim blue/trashed,1,690449,Skinny fit,"Cotton 98%, Elastane 2%"
1,690449002,Denim blue,2,690449,Skinny fit,"Cotton 98%, Elastane 2%"
2,690449006,Black/washed,6,690449,Skinny fit,"Cotton 98%, Elastane 2%"
3,690449007,Light denim blue,7,690449,Skinny fit,"Cotton 98%, Elastane 2%"
4,690449009,Black washed out,9,690449,Skinny fit,"Cotton 98%, Elastane 2%"
5,690449011,White,11,690449,Skinny fit,"Cotton 98%, Elastane 2%"
6,690449013,Black/washed,13,690449,Skinny fit,"Cotton 98%, Elastane 2%"
7,690449021,Dark denim blue/trashed,21,690449,Skinny fit,"Cotton 98%, Elastane 2%"
8,690449022,Black/trashed,22,690449,Skinny fit,"Cotton 98%, Elastane 2%"
9,690449024,Dark blue/Trashed,24,690449,Skinny fit,"Cotton 98%, Elastane 2%"


#### All products

In [66]:
#------------------------------API Request----------------------------
header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'}
# URL model = 'https://www2.hm.com/en_us/productpage + product code + .html'

df_details = pd.DataFrame()
columns_available = list()
df_pattern = pd.DataFrame(columns=['Art. No.', 'Composition', 'Fit', 'More sustainable materials', 'Size'])

for i in range(len(data)):
    # URL composition and request
    url = f"https://www2.hm.com/en_us/productpage.{data.loc[i, 'product_id']}.html"
    page = requests.get(url, headers=header).text
    
    #------------------------------BeautifulSoup Object--------------------
    soup = BeautifulSoup(page)

    #------------------------------Product Color---------------------------

    product_detail = soup.find_all('a', class_ = 'filter-option miniature')
    product_color = [p.get('data-color') for p in product_detail]
    product_code = [p.get('data-articlecode') for p in product_detail]

    df_color = pd.DataFrame({'product_code': product_code, 'product_color': product_color})
    df_color['color_id'] = df_color['product_code'].apply(lambda x: x[-3:])
    df_color['style_id'] = df_color['product_code'].apply(lambda x: x[:-3])
    
    #------------------------------Product Composition---------------------------
    ## Get composition information
    product_composition = [list(filter(None, p.get_text().split('\n'))) for p in soup.find_all('div', class_ = 'pdp-description-list-item')]
    
    ## Composition to DataFrame
    df_composition = pd.DataFrame(product_composition).T
    df_composition.columns = df_composition.iloc[0]
    df_composition.drop(index=0, inplace=True)
    
    #columns_available = columns_available + df_composition.columns.to_list()
    
    ## Garantee the patters columns
    df_composition = pd.concat([df_pattern, df_composition])
    
    #df_composition = df_composition[['Fit', 'Composition', 'Art. No.']]
    
    # Rename columns and fill data below
    df_composition.columns = df_composition.columns.str.lower()
    df_composition.columns = ['product_code', 'composition', 'fit', 'sustainable_materials', 'size']
    df_composition.fillna(method='ffill', inplace=True)
    
    
    # Derivate new features
    df_composition['style_id'] = df_composition['product_code'].apply(lambda x: x[:-3])
    df_composition['color_id'] = df_composition['product_code'].apply(lambda x: x[-3:])
    
    
    #------------------------------SKU Product---------------------------
    df_sku = pd.merge(df_color, df_composition[['style_id', 'fit', 'composition', 'size', 'sustainable_materials']], 
                      how='left', on='style_id')
    
    
    #------------------------------All products--------------------------
    df_details = pd.concat([df_all_products, df_sku], axis=0)
    

# Join showroom data + details
data['style_id'] = data['product_id'].apply(lambda x: x[:-3])
data['color_id'] = data['product_id'].apply(lambda x: x[-3:])
data_raw = pd.merge(data, df_details[['style_id', 'product_color', 'fit', 'composition', 'size', 
                                      'sustainable_materials']], on='style_id', how='left')

In [75]:
# Save file
data_raw.to_csv('../data/products_hnm.csv', index=False)

## Exercises

### Collect the following information on the page: https://books.toscrape.com

- Catalog

    - Classics

    - Science Fiction

    - Humor

    - Business

- Collect the following information about the books:
    
    - Name of book
  
    - Price in pound
 
    - Rating
 
    - Stock information
 
### Deliverable
- A plan to all questions:
 
    - Output: The simulation of a table or final graph
 
    - Process: Step sequence organized by logical execution
 
    - Input: The link for data source
   
- A csv file with all collected information.

### Plan

<font size="3"><b>Output</b></font>

Columns in the table

Catalog | Name of Book | Price | Rating | Stocks Status

In [37]:
tab = [
    ['catalog', 'book_name', 'price', 'rating', 'stock_status'],
    ['Cassic', 'The Secret Garden', '£15.08', '4', 'In stock'],
    ['Science Fiction', 'Mesaerion', '£37.59', '1', 'In stock']
]

print(tabulate(tab, headers='firstrow'))

catalog          book_name          price      rating  stock_status
---------------  -----------------  -------  --------  --------------
Cassic           The Secret Garden  £15.08          4  In stock
Science Fiction  Mesaerion          £37.59          1  In stock


<font size="3"><b>Process</b></font>

Step 1: Create a soup object from the book collection page

Step 2: Identify book attribute tags for capture

Step 3: Get all the information from the book collection

Step 4: Repeat process for all book collections

Step 5: Create a DataFrame Pandas with all the information from the book collections

Step 6: Generate a csv file with the collected information

<font size="3"><b>Input</b></font>

<a href="https://books.toscrape.com/catalogue/category/books/classics_6/index.html">Classics</a> 

<a href="https://books.toscrape.com/catalogue/category/books/science-fiction_16/index.html">Science Fiction</a> 

<a href="https://books.toscrape.com/catalogue/category/books/humor_30/index.html">Humor</a> 

<a href="https://books.toscrape.com/catalogue/category/books/business_35/index.html">Business</a> 

#### Step 1 to 3

##### Top page

In [3]:
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'}

In [76]:
# Request information

url_classics = 'https://books.toscrape.com/catalogue/category/books/classics_6/index.html'
page = requests.get(url_classics, headers=headers)

In [77]:
# Clean html
page = clean_html(page.text)

In [78]:
# BeautifulSoup object
soup = BeautifulSoup(page, 'html.parser')

In [144]:
# Name of collection
catalog = soup.find('h1').text

In [69]:
# Name of book
book_name = soup.find_all('h3')
book_name_list = [name.get_text() for name in book_name]
book_name_list

['The Secret Garden',
 'The Metamorphosis',
 "The Pilgrim's Progress",
 'The Hound of the ...',
 'Little Women (Little Women ...',
 'Gone with the Wind',
 'Candide',
 'Animal Farm',
 'Wuthering Heights',
 'The Picture of Dorian ...',
 'The Complete Stories and ...',
 'Beowulf',
 'And Then There Were ...',
 'The Story of Hong ...',
 'The Little Prince',
 'Sense and Sensibility',
 'Of Mice and Men',
 'Emma',
 "Alice in Wonderland (Alice's ..."]

In [70]:
# Price of book
prices = soup.find_all('p', class_ = 'price_color')
prices_list = [price.get_text() for price in prices]
prices_list

['Â£15.08',
 'Â£28.58',
 'Â£50.26',
 'Â£14.82',
 'Â£28.07',
 'Â£32.49',
 'Â£58.63',
 'Â£57.22',
 'Â£17.73',
 'Â£29.70',
 'Â£26.78',
 'Â£38.35',
 'Â£35.01',
 'Â£43.19',
 'Â£45.42',
 'Â£37.46',
 'Â£47.11',
 'Â£32.93',
 'Â£55.53']

In [72]:
# Stock
stock_prod = soup.find_all('p', class_ = 'instock availability')
stock_status_list = [stock.get_text() for stock in stock_prod]
stock_status_list

[' In stock ',
 ' In stock ',
 ' In stock ',
 ' In stock ',
 ' In stock ',
 ' In stock ',
 ' In stock ',
 ' In stock ',
 ' In stock ',
 ' In stock ',
 ' In stock ',
 ' In stock ',
 ' In stock ',
 ' In stock ',
 ' In stock ',
 ' In stock ',
 ' In stock ',
 ' In stock ',
 ' In stock ']

In [121]:
soup.find_all('p', class_ = True)[3].attrs['class']

['star-rating', 'One']

In [129]:
# Rating
#soup.find_all('p', class_ = True)[0].attrs['class'][1]
ratings = soup.find_all('p', class_ = True)
rating_list = [r.attrs['class'] for r in ratings]
rating_list

[['star-rating', 'Four'],
 ['price_color'],
 ['instock', 'availability'],
 ['star-rating', 'One'],
 ['price_color'],
 ['instock', 'availability'],
 ['star-rating', 'Two'],
 ['price_color'],
 ['instock', 'availability'],
 ['star-rating', 'Two'],
 ['price_color'],
 ['instock', 'availability'],
 ['star-rating', 'Four'],
 ['price_color'],
 ['instock', 'availability'],
 ['star-rating', 'Three'],
 ['price_color'],
 ['instock', 'availability'],
 ['star-rating', 'Three'],
 ['price_color'],
 ['instock', 'availability'],
 ['star-rating', 'Three'],
 ['price_color'],
 ['instock', 'availability'],
 ['star-rating', 'Three'],
 ['price_color'],
 ['instock', 'availability'],
 ['star-rating', 'Two'],
 ['price_color'],
 ['instock', 'availability'],
 ['star-rating', 'Four'],
 ['price_color'],
 ['instock', 'availability'],
 ['star-rating', 'Two'],
 ['price_color'],
 ['instock', 'availability'],
 ['star-rating', 'Two'],
 ['price_color'],
 ['instock', 'availability'],
 ['star-rating', 'Four'],
 ['price_color

In [143]:
rating_list_clean = list()
for i in range(0, len(rating_list), 3):
    rating_list_clean.append(rating_list[i][1])
rating_list_clean

['Four',
 'One',
 'Two',
 'Two',
 'Four',
 'Three',
 'Three',
 'Three',
 'Three',
 'Two',
 'Four',
 'Two',
 'Two',
 'Four',
 'Two',
 'One',
 'Two',
 'Two',
 'One']

In [137]:
rating_list[1]

['price_color']

In [146]:
pd.DataFrame({
    'catalog': catalog, 
    'book_name': book_name_list, 
    'price': prices_list, 
    'rating': rating_list_clean, 
    'stock_status': stock_status_list
})

Unnamed: 0,catalog,book_name,price,rating,stock_status
0,Classics,The Secret Garden,Â£15.08,Four,In stock
1,Classics,The Metamorphosis,Â£28.58,One,In stock
2,Classics,The Pilgrim's Progress,Â£50.26,Two,In stock
3,Classics,The Hound of the ...,Â£14.82,Two,In stock
4,Classics,Little Women (Little Women ...,Â£28.07,Four,In stock
5,Classics,Gone with the Wind,Â£32.49,Three,In stock
6,Classics,Candide,Â£58.63,Three,In stock
7,Classics,Animal Farm,Â£57.22,Three,In stock
8,Classics,Wuthering Heights,Â£17.73,Three,In stock
9,Classics,The Picture of Dorian ...,Â£29.70,Two,In stock


##### Book Page

In [178]:
soup.find('article', class_ = 'product_pod')

<article class="product_pod"><div class="image_container"><a href="../../../the-dirty-little-secrets-of-getting-your-dream-job_994/index.html"><img alt="The Dirty Little Secrets of Getting Your Dream Job" class="thumbnail" src="../../../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg"/></a></div><p class="star-rating Four"><i class="icon-star"></i><i class="icon-star"></i><i class="icon-star"></i><i class="icon-star"></i><i class="icon-star"></i></p><h3><a href="../../../the-dirty-little-secrets-of-getting-your-dream-job_994/index.html" title="The Dirty Little Secrets of Getting Your Dream Job">The Dirty Little Secrets ...</a></h3><div class="product_price"><p class="price_color">Â£33.34</p><p class="instock availability"><i class="icon-ok"></i> In stock </p><form><button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button></form></div></article>

#### Step 4 to 6

In [23]:
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'}
df_books = pd.DataFrame()
html_collections = ['https://books.toscrape.com/catalogue/category/books/classics_6/index.html',
                   'https://books.toscrape.com/catalogue/category/books/science-fiction_16/index.html', 
                    'https://books.toscrape.com/catalogue/category/books/humor_30/index.html',
                    'https://books.toscrape.com/catalogue/category/books/business_35/index.html']

for html in html_collections:
    # Page request
    page = requests.get(html, headers=headers)
    
    # Clean html
    page = clean_html(page.text)
    
    # BeautifulSoup object
    soup = BeautifulSoup(page, 'html.parser')
    
    # Name of collection
    catalog = soup.find('h1').text
    
    # Name of book
    book_name = soup.find_all('h3')
    book_name_list = [name.get_text() for name in book_name]
    
    # Price of book
    prices = soup.find_all('p', class_ = 'price_color')
    prices_list = [price.get_text() for price in prices]
    
    # Rating
    ratings = soup.find_all('p', class_ = True)
    rating_list = [r.attrs['class'] for r in ratings]
    
    # Stock
    stock_prod = soup.find_all('p', class_ = 'instock availability')
    stock_status_list = [stock.get_text() for stock in stock_prod]
    rating_list_clean = list()
    for i in range(0, len(rating_list), 3):
        rating_list_clean.append(rating_list[i][1])
        
    df_catalog = pd.DataFrame({'catalog': catalog, 
                                'book_name': book_name_list, 
                                'price': prices_list, 
                                'rating': rating_list_clean, 
                                'stock_status': stock_status_list})
    
    df_books = pd.concat([df_books, df_catalog])

In [150]:
df_books

Unnamed: 0,catalog,book_name,price,rating,stock_status
0,Classics,The Secret Garden,Â£15.08,Four,In stock
1,Classics,The Metamorphosis,Â£28.58,One,In stock
2,Classics,The Pilgrim's Progress,Â£50.26,Two,In stock
3,Classics,The Hound of the ...,Â£14.82,Two,In stock
4,Classics,Little Women (Little Women ...,Â£28.07,Four,In stock
5,Classics,Gone with the Wind,Â£32.49,Three,In stock
6,Classics,Candide,Â£58.63,Three,In stock
7,Classics,Animal Farm,Â£57.22,Three,In stock
8,Classics,Wuthering Heights,Â£17.73,Three,In stock
9,Classics,The Picture of Dorian ...,Â£29.70,Two,In stock


The top page don't get the book title properly

# Python Avançado

## Lambda

Recommended to local an simply function, without replication purpose

In [2]:
# Traditional function
def calc(a, b):
    return a + b

In [4]:
calc(2, 5)

7

In [5]:
# Lambda
cal_ = lambda a, b: a + b

In [6]:
cal_(2, 5)

7

## Map

Call a function to operate over a data struture

In [None]:
#map(function, data_structure)

In [8]:
upper = ['MARCOS', 'GALVAO']

lower = lambda x: x.lower()

list(map(lower, upper))

['marcos', 'galvao']

## Apply

Work as same as Map, but specific to Pandas library

Map -> General use

Apply -> Pandas use

In [10]:
df = pd.DataFrame({
    'name': ['MARCOS', 'GALVAO'],
    'age': [30, 31]
})

In [12]:
df['name'].apply(lower)

0    marcos
1    galvao
Name: name, dtype: object

## Lambda, Map e Apply II

In [25]:
df_books.head()

Unnamed: 0,catalog,book_name,price,rating,stock_status
0,Classics,The Secret Garden,Â£15.08,Four,In stock
1,Classics,The Metamorphosis,Â£28.58,One,In stock
2,Classics,The Pilgrim's Progress,Â£50.26,Two,In stock
3,Classics,The Hound of the ...,Â£14.82,Two,In stock
4,Classics,Little Women (Little Women ...,Â£28.07,Four,In stock


In [41]:
# Remove Â£ from price
df_books['price'] = df_books['price'].apply(lambda x: float(x.replace('Â£', '')))
df_books.head()

Unnamed: 0,catalog,book_name,price,rating,stock_status
0,Classics,The Secret Garden,15.08,Four,In stock
1,Classics,The Metamorphosis,28.58,One,In stock
2,Classics,The Pilgrim's Progress,50.26,Two,In stock
3,Classics,The Hound of the ...,14.82,Two,In stock
4,Classics,Little Women (Little Women ...,28.07,Four,In stock


In [55]:
df_books[df_books['rating'] == 'Five'].head()

Unnamed: 0,catalog,book_name,price,rating,stock_status
1,Science Fiction,Join,35.67,Five,In stock
1,Humor,Old School (Diary of ...,11.83,Five,In stock
3,Humor,Hyperbole and a Half: ...,14.75,Five,In stock
6,Humor,When You Are Engulfed ...,30.89,Five,In stock
8,Humor,Lamb: The Gospel According ...,55.5,Five,In stock


In [54]:
# Increse price in 100% if the catalog is business and stock status is in 
 
df_books.apply(lambda x:  x['price'] * 2
               if (x['catalog'] == 'Business') & (x['rating'] == 'Five') 
               else x, axis=1).query('rating == "Five"').head()

Unnamed: 0,catalog,book_name,price,rating,stock_status
1,Science Fiction,Join,35.67,Five,In stock
1,Humor,Old School (Diary of ...,11.83,Five,In stock
3,Humor,Hyperbole and a Half: ...,14.75,Five,In stock
6,Humor,When You Are Engulfed ...,30.89,Five,In stock
8,Humor,Lamb: The Gospel According ...,55.5,Five,In stock


In [51]:
df_books['stock_status'][0]

0     In stock 
0     In stock 
0     In stock 
0     In stock 
Name: stock_status, dtype: object

## Regex - Regular Expression

<font size=5>**Marcadores de início e fim de linha (^ e $)**</font>


**^Python** – Encontra qualquer string que esteja em início de linha e contenham os caracteres “Python”. 

**Python$** – Encontra qualquer string que esteja em fim de linha e contenham os caracteres “Python”. 

<br/>
<br/>

<font size=5>**Marcadores de quantidade (* + ?  e {})**</font>
 

**Pyt*** – Encontra strings com a sequência de caracteres “Py” seguida de zero ou mais ocorrências do caractere “t”.

**Pyt+** – Encontra strings com a sequência de caracteres “Py” seguida de uma ou mais ocorrências do caractere “t”.

**Pyt?** –  Encontra strings com a sequência de caracteres “Py” seguida de zero ou uma ocorrência do caractere “t”.

**Pyt{3}** – Encontra strings com a sequência de caracteres “Py” seguida de exatamente 3 ocorrências do caractere “t”.

**Pyt{2, 4}** – Encontra strings com a sequência de caracteres “Py” seguida de 2 a 4 ocorrências do caractere “t”.

**Pyt{3,}** – Encontra strings com a sequência de caracteres “Py” seguida de 3 ou mais ocorrências do caractere “t”. 

**Py(th)*** – Encontra strings com a sequência de caracteres “Py” seguida de zero ou mais ocorrências do da sequência “th”. 
    
<br/>
<br/>

<font size=5>**Marcador de alternativa ou operador OR ( | )**</font>

**Pyt|hon** – Encontra strings com a sequência de caracteres “Pyt” ou strings com a sequência de caracteres “hon”. 

**Py(t|h)** – Encontra strings com a sequência de caracteres “Py” seguida do caractere “t” ou do caractere “h”. 

**Py(t|h|o)** – Encontra strings com a sequência de caracteres “Py” seguida do caractere “t” ou do caractere “h” ou do caractere “o”. 
    
<br/>
<br/>
    
<font size=5>**Marcador de definição de Conjuntos ([ ])**</font>

**Py[tho]** – Encontra strings com a sequência de caracteres “Py” seguida de um caractere pertencente ao conjunto (t, h, o). 

**Py[0-9]** – Encontra strings com a sequência de caracteres “Py” seguida de um caractere pertencente ao conjunto de dígitos de 0 a 9. 

**Py[a-z]** – Encontra strings com a sequência de caracteres “Py” seguida de um caractere pertencente ao conjunto de caracteres alfabéticos de “a” a “z”. 

<br/>
<br/>
    
<font size=5>**Marcadores de classes de caracteres (\w, \d, \s e .)**</font>
    
**\w** – Representa um caractere alfanumérico, incluindo ocorrências maiúsculas e minúsculas das letras e o caractere “_”.

**\d** – Representa um caractere numérico e equivale a definição do conjunto [0-9]

**\s** – Representa um espaço em branco, incluindo tabulações e quebras de linha

<br/>
<br/>
    
**.** – Como já mencionado, este caractere é utilizado como coringa e representa qualquer caractere.

A utilização de \W, \D e \S serve como negação de suas correspondentes descritas acima. \W vai encontrar, por exemplo, qualquer caractere que não seja relevante para \w.

### Match

    For begining of a string
    
    CPF 001.223.344-55
    
    sequency: number point number point number trace number
    padron: 3 numbers 1 point 3 numbers 1 point 3 numbers 1 trace 2 numbers 
    
    Regex mask colect the information based on the string padron

In [4]:
cpf = '001.223.344-55'
rg = '12.420.530-10'
cel = '+5522989527342'
address = 'Cecilia Chapman 711-2880 Nulla St. Mankato Mississippi 96522 (257) 563-7401'
address_2 = 'Calista Wise 7292 Dictum Av. San Antonio MI 47096 (492) 709-6392'

mask_cpf = '^\d{3}\.\d{3}\.\d{3}\-\d{2}$'

mask_rg = '^\d{2}\.\d{3}\.\d{3}\-\d{2}$'

mask_cel = '^\+\d{13}$'

mask_cel_inside = '\+\d{13}'

text = f'cpf: {cpf}, rg: {rg} and celphone is {cel}'

bool(re.match(mask_cel, text))

False

### Search

    Any text position

In [22]:
# Street

mask = '\w{2}\..*\d{5}'
mask_2 = '(\w{2}\..+)\s\('

print(re.search(mask_2, address_2).group(0))

Av. San Antonio MI 47096 (


In [21]:
# Name

mask = '^\w+\s\w+'

print(re.search(mask, address).group(0))

Cecilia Chapman


## Clean Data with Regex

In [79]:
data = pd.read_csv('../data/products_hnm.csv')
data = data[['product_id', 'name', 'category', 'price',
       'style_id', 'color_id', 'product_color', 'fit', 'composition', 'size',
       'sustainable_materials', 'scrapy_datetime']]
data.head(3)

Unnamed: 0,product_id,name,category,price,style_id,color_id,product_color,fit,composition,size,sustainable_materials,scrapy_datetime
0,985197001,Slim Jeans,men_jeans_slim,$ 19.99,985197,1,Midnight blue,Slim fit,Pocket lining: Cotton 100%,"The model is 189cm/6'2"" and wears a size 32/32",,2021-11-17 10-11-07
1,985197001,Slim Jeans,men_jeans_slim,$ 19.99,985197,1,Midnight blue,Slim fit,"Shell: Cotton 98%, Spandex 2%","The model is 189cm/6'2"" and wears a size 32/32",,2021-11-17 10-11-07
2,985197001,Slim Jeans,men_jeans_slim,$ 19.99,985197,1,Denim blue,Slim fit,Pocket lining: Cotton 100%,"The model is 189cm/6'2"" and wears a size 32/32",,2021-11-17 10-11-07


In [124]:
display(data.isna().sum())
data.info()

product_id                 0
name                       0
category                   0
price                      0
style_id                   0
color_id                   0
product_color              0
fit                        0
composition                0
size                     739
sustainable_materials    826
scrapy_datetime            0
dtype: int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype                                 
---  ------                 --------------  -----                                 
 0   product_id             1728 non-null   int64                                 
 1   name                   1728 non-null   object                                
 2   category               1728 non-null   object                                
 3   price                  1728 non-null   float64                               
 4   style_id               1728 non-null   int64                                 
 5   color_id               1728 non-null   int64                                 
 6   product_color          1728 non-null   object                                
 7   fit                    1728 non-null   object                                
 8   composition            1728 non-null   object             

In [101]:
# name
data['name'] = data['name'].str.lower().str.replace(' ', '_')

# price
data['price'] = data['price'].apply(lambda x: float(x.replace('$ ', '')))

# scrapy_datetime
data['scrapy_datetime'] = pd.to_datetime(data['scrapy_datetime'], format='%Y-%m-%d %H:%M:%S')

# fit
data['fit'] = data['fit'].str.lower().str.replace(' ', '_')

# product color
data['product_color'] = data['product_color'].apply(lambda x: x.replace(' ', '_').replace('/', '_').lower())

In [122]:
data.head(3)

Unnamed: 0,product_id,name,category,price,style_id,color_id,product_color,fit,composition,size,sustainable_materials,scrapy_datetime
0,985197001,slim_jeans,men_jeans_slim,19.99,985197,1,midnight_blue,slim_fit,Pocket lining: Cotton 100%,"The model is 189cm/6'2"" and wears a size 32/32",,2021-11-17 10:00:00-07:00
1,985197001,slim_jeans,men_jeans_slim,19.99,985197,1,midnight_blue,slim_fit,"Shell: Cotton 98%, Spandex 2%","The model is 189cm/6'2"" and wears a size 32/32",,2021-11-17 10:00:00-07:00
2,985197001,slim_jeans,men_jeans_slim,19.99,985197,1,denim_blue,slim_fit,Pocket lining: Cotton 100%,"The model is 189cm/6'2"" and wears a size 32/32",,2021-11-17 10:00:00-07:00


In [125]:
data['size'].unique()

array(['The model is\xa0189cm/6\'2"\xa0and wears a size\xa032/32',
       'The model is\xa0182cm/6\'0"\xa0and wears a size\xa031/32',
       'The model is\xa0187cm/6\'2"\xa0and wears a size\xa031/32', nan,
       'The model is\xa0187cm/6\'2"\xa0and wears a size\xa032/32',
       'The model is\xa0184cm/6\'0"\xa0and wears a size\xa031/32',
       'The model is\xa0187cm/6\'2"\xa0and wears a size\xa031/30',
       'The model is\xa0188cm/6\'2"\xa0and wears a size\xa031/32',
       'The model is\xa0186cm/6\'1"\xa0and wears a size\xa031/32'],
      dtype=object)