## Web Scraping Amazon
### Preparando ambiente

In [35]:
import urllib.request as urllib_request
import pandas as pd
from bs4 import BeautifulSoup

### Extract

In [36]:
# Download search-result pages 1 through 9 and accumulate every
# <div class="a-section a-spacing-base"> found on them; the cells below
# treat each of these divs as one product entry.
items_list = []
for i in range(1, 10):
    url = f'https://www.amazon.com.br/s?i=electronics&rh=n%3A16243822011%2Cp_72%3A17833786011&page={i}&content-id=amzn1.sym.578aa6a5-6bfa-4747-975c-cee0f889732e&pd_rd_r=7538e36b-adc4-4bb5-b22c-4244b566357f&pd_rd_w=QzjCE&pd_rd_wg=CzeX1&pf_rd_p=578aa6a5-6bfa-4747-975c-cee0f889732e&pf_rd_r=M5ST1MD384C6WAYW9P0H&qid=1692993545&ref=sr_pg_{i}'
    # The context manager closes the HTTP response even if read() raises,
    # so connections are not leaked across the nine requests.
    with urllib_request.urlopen(url) as response:
        html = response.read()

    soup = BeautifulSoup(html, 'html.parser')
    # extend() grows the list in place instead of rebuilding it on every page.
    items_list.extend(soup.find_all('div', {'class': 'a-section a-spacing-base'}))

### Transform

In [37]:
# Title of each scraped product: the text of its title <div>, with
# surrounding whitespace trimmed.
title_class = 'a-section a-spacing-none a-spacing-top-small s-title-instructions-style'
names = []
for product in items_list:
    title_div = product.find('div', {'class': title_class})
    names.append(title_div.get_text().strip())
names[0]

"Smart TV LED 32'' HD Samsung LH32BETBLGGXZD"

In [38]:
# Rating of each product: everything before the first space in the text of
# its rating <div> (the sample output below is '4,7'). The text is not
# stripped before splitting.
rating_class = 'a-section a-spacing-none a-spacing-top-micro'
rates = []
for product in items_list:
    rating_text = product.find('div', {'class': rating_class}).get_text()
    rates.append(rating_text.partition(' ')[0])
rates[0]

'4,7'

In [39]:
# Review count of each product: everything after the last space in the
# stripped text of its rating <div> (the sample output below is '3.140').
reviews_class = 'a-section a-spacing-none a-spacing-top-micro'
reviews_n = []
for product in items_list:
    reviews_text = product.find('div', {'class': reviews_class}).get_text().strip()
    reviews_n.append(reviews_text.rpartition(' ')[2])
reviews_n[0]

'3.140'

In [40]:
# Price text of each product (e.g. '1.038,06'), or None when the price
# <div>, its first <span>, or the 'R$' marker is missing. One entry is
# appended per item so the list stays aligned with items_list.
price_class = 'a-section a-spacing-none a-spacing-top-small s-price-instructions-style'
prices = []
for product in items_list:
    price_box = product.find('div', {'class': price_class})
    price_span = price_box.find('span') if price_box is not None else None
    if price_span is None:
        prices.append(None)
        continue

    # Drop non-breaking spaces, then keep what follows the currency marker.
    pieces = price_span.get_text().replace('\xa0', '').split('R$')
    prices.append(pieces[1] if len(pieces) > 1 else None)
prices[0]

'1.038,06'

In [41]:
# Preview the first (name, rate, price) triple before building the frame.
(names[0], rates[0], prices[0])

("Smart TV LED 32'' HD Samsung LH32BETBLGGXZD", '4,7', '1.038,06')

In [42]:
# Assemble one row per product from the three parallel lists.
column_names = ['product_desc', 'product_rate', 'product_price']
product_rows = list(zip(names, rates, prices))
df = pd.DataFrame(product_rows, columns=column_names)
df.head(15)

Unnamed: 0,product_desc,product_rate,product_price
0,Smart TV LED 32'' HD Samsung LH32BETBLGGXZD,47,"1.038,06"
1,"Samsung 60BU8000 - Smart TV LED 60' 4K UHD, Wi...",47,
2,"Smart TV Crystal 43"" 4K UHD Samsung CU7700 - A...",48,"1.996,70"
3,"Smart TV LED 55"" 4K UHD LG 55UQ801COSB.BWZ - I...",48,"2.449,00"
4,"PHILIPS Smart TV 50"" 4K Android Ambilight 50PU...",47,"2.279,00"
5,"Smart TV Crystal 50"" 4K UHD Samsung CU7700 - A...",44,"2.385,25"
6,Smart TV 32” Philco PTV32G7ER2CPBLH Dolby Audi...,45,99079
7,"Smart TV Crystal 55"" 4K UHD Samsung CU7700 - A...",47,"2.574,99"
8,"2022 Smart TV LG 32"" HD 32LQ620 WiFi Bluetooth...",47,"1.118,99"
9,"Smart TV 55"" 4K LG UHD ThinQ AI 55UR8750PSA HD...",41,


In [43]:
# Convert price strings such as '1.038,06' into floats (1038.06).
# regex=False forces literal replacement: as a regular expression '.' would
# match every character, and relying on the pandas default emits a
# FutureWarning on versions where that default is changing.
# Missing prices (None) pass through and become NaN.
df['product_price'] = pd.to_numeric(
    df['product_price']
    .str.replace('.', '', regex=False)   # remove the thousands separator
    .str.replace(',', '.', regex=False)  # decimal comma -> decimal point
)
df.head()

  df['product_price'] = df['product_price'].str.replace('.', '')


Unnamed: 0,product_desc,product_rate,product_price
0,Smart TV LED 32'' HD Samsung LH32BETBLGGXZD,47,1038.06
1,"Samsung 60BU8000 - Smart TV LED 60' 4K UHD, Wi...",47,
2,"Smart TV Crystal 43"" 4K UHD Samsung CU7700 - A...",48,1996.7
3,"Smart TV LED 55"" 4K UHD LG 55UQ801COSB.BWZ - I...",48,2449.0
4,"PHILIPS Smart TV 50"" 4K Android Ambilight 50PU...",47,2279.0


In [44]:
# Convert rating strings such as '4,7' into floats: swap the decimal comma
# for a point, then parse the result as a number.
rate_text = df['product_rate'].str.replace(',', '.')
df['product_rate'] = pd.to_numeric(rate_text)
df.head()

Unnamed: 0,product_desc,product_rate,product_price
0,Smart TV LED 32'' HD Samsung LH32BETBLGGXZD,4.7,1038.06
1,"Samsung 60BU8000 - Smart TV LED 60' 4K UHD, Wi...",4.7,
2,"Smart TV Crystal 43"" 4K UHD Samsung CU7700 - A...",4.8,1996.7
3,"Smart TV LED 55"" 4K UHD LG 55UQ801COSB.BWZ - I...",4.8,2449.0
4,"PHILIPS Smart TV 50"" 4K Android Ambilight 50PU...",4.7,2279.0


### Load

In [46]:
# Persist the cleaned table to disk; the DataFrame index is written as the
# first, unnamed column (pandas default).
output_path = 'dados_TVs.csv'
df.to_csv(output_path)