# Web Scraping con BeautifulSoup

## 1. Instalar e importar librerías

In [None]:
pip install requests

In [None]:
pip install beautifulsoup4

In [None]:
import requests
from bs4 import BeautifulSoup

## 2. Enviar petición HTTP para extraer código fuente de la página web

In [None]:
# Download the landing page of the demo bookshop.
pagina_web = requests.get('http://books.toscrape.com/')

# Parse the raw bytes instead of `pagina_web.text`: for text responses whose
# headers carry no explicit charset, requests falls back to ISO-8859-1, which
# is presumably why the pound sign shows up as "Â£" further down. Given
# bytes, BeautifulSoup works out the document's encoding by itself.
soup = BeautifulSoup(pagina_web.content, 'html.parser')

print(soup.prettify())

## 3. Analizar código fuente resultante

In [None]:
soup

## 4. Extraer la información relevante

In [None]:
# find() gives back a single match: the first <a> tag in the parsed page.
titulo_item = soup.find('a')

print(titulo_item)

In [None]:
# find_all() collects every <a> tag in the parsed page instead of just one.
titulo_items = soup.find_all('a')

print(titulo_items)

In [None]:
# Every <h3> wraps an <a> whose attributes hold the book title and its link.
titulo_items = soup.find_all('h3')
anclas = [encabezado.a for encabezado in titulo_items]

titulos = [ancla['title'] for ancla in anclas]
links = [ancla['href'] for ancla in anclas]

In [None]:
titulos

In [None]:
links

In [None]:
precio_items = soup.find_all('p', class_="price_color")

# lstrip() receives a set of characters: any leading "Â" or "£" is removed,
# leaving just the rest of the price text.
precios = [etiqueta.text.lstrip("Â£") for etiqueta in precio_items]

In [None]:
precios

In [None]:
existencia_items = soup.find_all('p', class_='instock availability')

# The availability text is padded with whitespace in the markup; keep only
# the trimmed text.
existencias = [etiqueta.text.strip() for etiqueta in existencia_items]

In [None]:
existencias

In [None]:
rating_items = soup.find_all('p', class_="star-rating")

# The second entry of each element's class list is kept as the rating
# (presumably a word such as "Three" -- confirm against the page markup).
ratings = [etiqueta['class'][1] for etiqueta in rating_items]

In [None]:
ratings

In [None]:
import pandas as pd

# Pair each column label with the list scraped for it; the order of the
# labels is the column order of the resulting table.
columnas = ['Titulo', 'Precio', 'Rating', 'En existencia', 'Links']
valores = [titulos, precios, ratings, existencias, links]

data = dict(zip(columnas, valores))

df = pd.DataFrame(data)

In [None]:
df

In [None]:
from urllib.parse import urljoin

# Root that the hrefs scraped from the home page are resolved against.
URL_SITIO = "http://books.toscrape.com/"

titulos = []
links = []

titulo_items = soup.find_all('h3')

for items in titulo_items:
    titulo = items.a
    titulos.append(titulo['title'])
    # urljoin resolves the href against the site root, so "../" segments or
    # a leading "/" produce a valid absolute URL, which plain string
    # concatenation would not.
    links.append(urljoin(URL_SITIO, titulo['href']))

# NOTE: the DataFrame `df` created in an earlier cell still holds the old
# link values; rebuild it (or assign the column) to pick these up.

In [None]:
# Turn the price strings into floats so numeric statistics can be computed.
df['Precio'] = df['Precio'].astype(float)

In [None]:
df['Precio'].mean()

In [None]:
df['Precio'].median()

## Paginación

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
def obtener_contenido_pagina(url, *, timeout=10):
    """Download *url* and return the raw response body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            it a stalled connection would block the whole crawl.

    Returns:
        The undecoded body of the response (``response.content``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            an error page is never parsed as if it were a listing.
        requests.RequestException: On connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content

In [None]:
def analizar_contenido_html(html):
    """Parse *html* with the ``html.parser`` backend and return the resulting tree."""
    arbol = BeautifulSoup(html, features='html.parser')
    return arbol

In [None]:
data = []


def procesar_pagina(soup):
    """Extract every book listed in *soup* and append one row per book to ``data``.

    Titles and links come from the ``<a>`` inside each ``<h3>``, ratings from
    the second class of each ``p.star-rating`` element, and availability from
    the trimmed text of each ``p`` whose class is ``instock availability``.
    The three sequences are matched up by position.

    Args:
        soup: Parsed page exposing ``find_all`` (a BeautifulSoup tree).

    Raises:
        IndexError: If the page yields fewer ratings or availability entries
            than titles. The check runs before anything is appended, so a
            malformed page never leaves partial rows in ``data``.
    """
    titulos_y_links = []
    for encabezado in soup.find_all('h3'):
        ancla = encabezado.a
        titulos_y_links.append((ancla['title'], ancla['href']))

    ratings = [p['class'][1] for p in soup.find_all('p', class_="star-rating")]
    existencias = [
        p.text.strip()
        for p in soup.find_all('p', class_='instock availability')
    ]

    # Rows are built by position, so every title needs a rating and an
    # availability entry at the same index.
    total = len(titulos_y_links)
    if len(ratings) < total or len(existencias) < total:
        raise IndexError(
            f"page has {total} titles but only {len(ratings)} ratings "
            f"and {len(existencias)} availability entries"
        )

    filas = [
        {
            "Título": titulo,
            "Enlace": enlace,
            "Rating": rating,
            "Existencia": existencia,
        }
        for (titulo, enlace), rating, existencia
        in zip(titulos_y_links, ratings, existencias)
    ]
    data.extend(filas)

In [None]:
def manejar_paginacion(url_base, num_paginas):
    """Fetch, parse and process pages 1..*num_paginas* of a paginated listing.

    Args:
        url_base: Address of the listing, with or without a trailing "/".
            Page *N* is requested as ``<url_base>/page-N.html``.
        num_paginas: Number of the last page to visit (inclusive).
    """
    # Drop trailing slashes first so the assembled address never contains
    # "//page-N.html" when the caller's base already ends in "/".
    raiz = url_base.rstrip('/')

    for numero in range(1, num_paginas + 1):
        url = f"{raiz}/page-{numero}.html"
        contenido_pagina = obtener_contenido_pagina(url)
        soup = analizar_contenido_html(contenido_pagina)
        procesar_pagina(soup)

In [None]:
# Listing to crawl and the number of its last page; replace both values to
# scrape a different catalogue.
url_base = 'http://books.toscrape.com/catalogue/category/books_1/'
num_paginas = 50

manejar_paginacion(url_base=url_base, num_paginas=num_paginas)

In [None]:
import pandas as pd

# `data` is the list of per-book dicts filled by procesar_pagina; each dict
# becomes one row of the table.
df = pd.DataFrame(data)

In [None]:
df

# Ejercicio

In [None]:
# Download the promotions listing used for the exercise and parse it.
url_promociones = 'https://www.cyberpuerta.mx/Promociones/'
page = requests.get(url_promociones)

soup = BeautifulSoup(page.text, 'html.parser')

# Dump the indented markup to look for the elements worth extracting.
print(soup.prettify())

In [None]:
# Take the first <div class="emproduct_right"> and show its first nested <div>.
lista_items = soup.find('div', class_="emproduct_right")

print(lista_items.div)

In [None]:
# Same element as before; this time list every attribute of its first <a> tag.
lista_items = soup.find('div', class_="emproduct_right")

print(lista_items.a.attrs)

In [None]:
# Read just the `title` attribute of that first <a> tag.
lista_items = soup.find('div', class_="emproduct_right")

print(lista_items.a['title'])

In [None]:
# Collect every <div class="emproduct_right"> on the page, not only the first.
lista_items = soup.find_all('div', class_="emproduct_right")

print(lista_items)

In [None]:
urls = []

# Probe pages 1-9 of the category and keep the address of each request that
# is answered with HTTP 200.
for i in range(1, 10):
    candidata = f'http://books.toscrape.com/catalogue/category/books/young-adult_21/page-{i}.html'
    response = requests.get(candidata)

    if response.status_code != 200:
        continue
    urls.append(response.url)

In [None]:
urls

In [None]:
titulos = []


def extraer_titulos(soup):
    """Append to ``titulos`` the ``title`` attribute of the ``<a>`` found in each ``<h3>``."""
    titulos.extend(
        encabezado.find('a')['title'] for encabezado in soup.find_all('h3')
    )

In [None]:
precios = []


def extraer_precios(soup):
    """Append to ``precios`` the text of every ``p.price_color`` element, unmodified."""
    etiquetas = soup.find_all('p', class_="price_color")
    precios.extend(etiqueta.text for etiqueta in etiquetas)
        

In [None]:
ratings = []


def extraer_ratings(soup):
    """Append to ``ratings`` the second class of every ``p.star-rating`` element."""
    etiquetas = soup.find_all('p', class_="star-rating")
    ratings.extend(etiqueta['class'][1] for etiqueta in etiquetas)

In [None]:
existencias = []


def extraer_existencias(soup):
    """Append to ``existencias`` the availability label of every listed book.

    The label is the text of each ``p`` element whose class is
    ``instock availability``, with surrounding whitespace removed so the
    stored values match what the earlier single-page cells produce
    (they also call ``.strip()``).

    Args:
        soup: Parsed page exposing ``find_all`` (a BeautifulSoup tree).
    """
    page_items = soup.find_all('p', class_="instock availability")
    existencias.extend(item.text.strip() for item in page_items)

In [None]:
def obtener_contenido_titulos(url):
    """Download *url*, parse it and feed the page to every ``extraer_*`` helper.

    Despite the name, titles, prices, ratings and availability are all
    collected; each helper appends to its own module-level list.

    Args:
        url: Address of the listing page to scrape.
    """
    page = requests.get(url)
    # Parse the raw bytes rather than `page.text`: requests decodes text
    # responses without an explicit charset as ISO-8859-1, which would garble
    # characters such as "£". BeautifulSoup detects the encoding from bytes.
    soup = BeautifulSoup(page.content, 'html.parser')
    extraer_titulos(soup)
    extraer_precios(soup)
    extraer_ratings(soup)
    extraer_existencias(soup)

In [None]:
# Scrape each page address collected in `urls`; the extracted values
# accumulate in the module-level lists filled by the extraer_* helpers.
for url in urls:
    obtener_contenido_titulos(url)

In [None]:
import pandas as pd

# One column per scraped list, in the order the labels are given.
columnas = ['Titulo', 'Precio', 'Rating', 'En existencia']
valores = [titulos, precios, ratings, existencias]

data = dict(zip(columnas, valores))

df = pd.DataFrame(data)

In [None]:
df