<a href="https://colab.research.google.com/github/LaraV15/NLP/blob/main/Creaci%C3%B3n_del_data_set_de_libros.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


En este notebook el objetivo es realizar web scraping para conformar un dataset con información sobre los 1000 libros más populares del Proyecto Gutenberg. El enlace a utilizar es el siguiente: <https://www.gutenberg.org/browse/scores/top1000.php#books-last1>

El archivo luego será subido a GitHub y utilizado en el notebook principal: 'TP1-Domingo-Valeri'

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the last cell of this notebook writes
# the resulting CSV to a path under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Placeholder stored whenever a field cannot be read from a book page.
_NO_DISPONIBLE = "N/A"

# Single-valued rows of the book page's metadata table, in output column order.
_CAMPOS_SIMPLES = ("Author", "Title", "Credits", "Summary", "Language")

# Seconds to wait for a response before giving up on a request.
_TIMEOUT_SEGUNDOS = 30


def _texto_campo(book_soup, etiqueta):
    """Return the text of the <td> that follows the <th> labelled `etiqueta`, or "N/A"."""
    # `string=` is the supported spelling of the deprecated `text=` argument.
    encabezado = book_soup.find("th", string=etiqueta)
    celda = encabezado.find_next_sibling("td") if encabezado is not None else None
    return celda.get_text(strip=True) if celda is not None else _NO_DISPONIBLE


def _categorias(book_soup):
    """Return every "Subject" value joined by ", ", or "N/A" if none can be read."""
    celdas = [
        encabezado.find_next_sibling("td")
        for encabezado in book_soup.find_all("th", string="Subject")
    ]
    # Same outcome as before: no subjects, or a subject row without a <td>, gives "N/A".
    if not celdas or any(celda is None for celda in celdas):
        return _NO_DISPONIBLE
    return ", ".join(celda.get_text(strip=True) for celda in celdas)


def obtener_datos_libros(url="https://www.gutenberg.org/browse/scores/top1000.php#books-last1", limite=1000):
    """Scrape a Project Gutenberg ranking page and each listed book's metadata.

    The ranking page is downloaded once; then one request is made per book to
    read its metadata table.

    Args:
        url: Address of the ranking page. The part after ``#`` is a client-side
            fragment and does not change what the server returns.
        limite: Maximum number of book links to follow, taken in page order.

    Returns:
        pandas.DataFrame with one row per book and the columns ``Title_Author``,
        ``Link``, ``Author``, ``Title``, ``Credits``, ``Summary``, ``Language``,
        ``Category`` and ``Release Date``. Fields that cannot be read hold "N/A".

    Raises:
        requests.HTTPError: If the ranking page answers with an error status.
        requests.RequestException: If the ranking page cannot be fetched at all.
    """
    books_data = []

    # One session reuses the underlying connection across all the requests.
    with requests.Session() as session:
        response = session.get(url, timeout=_TIMEOUT_SEGUNDOS)
        # Fail loudly here: parsing an error page would silently yield an empty dataset.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # "ol li a" matches the links of every ordered list on the page, so keep
        # only the ones that point at a book page.
        books = [
            enlace for enlace in soup.select("ol li a")
            if enlace.get("href", "").startswith("/ebooks/")
        ]

        for book in books[:limite]:
            link = "https://www.gutenberg.org" + book['href']
            book_info = {
                "Title_Author": book.get_text(),
                "Link": link
            }

            try:
                book_response = session.get(link, timeout=_TIMEOUT_SEGUNDOS)
            except requests.RequestException:
                # Best effort: one unreachable book must not discard the rows
                # already collected, so parse an empty page (every field -> "N/A").
                contenido = b""
            else:
                contenido = book_response.content
            book_soup = BeautifulSoup(contenido, 'html.parser')

            for campo in _CAMPOS_SIMPLES:
                book_info[campo] = _texto_campo(book_soup, campo)
            book_info['Category'] = _categorias(book_soup)
            book_info['Release Date'] = _texto_campo(book_soup, "Release Date")

            books_data.append(book_info)

    # Build the frame once from the list of records.
    df_books = pd.DataFrame(books_data)
    return df_books


In [None]:
# Run the scraper with its default arguments (top-1000 ranking URL, limite=1000).
# This makes one HTTP request per book in addition to the ranking page.
df_books = obtener_datos_libros()
# Last expression of the cell: displays the first rows as a quick visual check.
df_books.head()

  book_info['Author'] = book_soup.find("th", text="Author").find_next_sibling("td").get_text(strip=True)
  book_info['Title'] = book_soup.find("th", text="Title").find_next_sibling("td").get_text(strip=True)
  book_info['Credits'] = book_soup.find("th", text="Credits").find_next_sibling("td").get_text(strip=True)
  book_info['Summary'] = book_soup.find("th", text="Summary").find_next_sibling("td").get_text(strip=True)
  book_info['Language'] = book_soup.find("th", text="Language").find_next_sibling("td").get_text(strip=True)
  subjects = book_soup.find_all("th", text="Subject")
  book_info['Release Date'] = book_soup.find("th", text="Release Date").find_next_sibling("td").get_text(strip=True)


Unnamed: 0,Title_Author,Link,Author,Title,Credits,Summary,Language,Category,Release Date
0,"Frankenstein; Or, The Modern Prometheus by Mar...",https://www.gutenberg.org/ebooks/84,"Shelley, Mary Wollstonecraft, 1797-1851","Frankenstein; Or, The Modern Prometheus","Judith Boss, Christy Phillips, Lynn Hanninen a...","""Frankenstein; Or, The Modern Prometheus"" by M...",English,"Science fiction, Horror tales, Gothic fiction,...","Oct 1, 1993"
1,呻吟語 by Kun Lü (4096),https://www.gutenberg.org/ebooks/25558,"Lü, Kun, 1536-1618",呻吟語,Produced by Chu-Yu Huang,"""呻吟語"" by Kun Lü is a philosophical treatise wr...",Chinese,Conduct of life,"May 22, 2008"
2,Pride and Prejudice by Jane Austen (2513),https://www.gutenberg.org/ebooks/1342,"Austen, Jane, 1775-1817",Pride and Prejudice,Chuck Greif and the Online Distributed Proofre...,"""Pride and Prejudice"" by Jane Austen is a clas...",English,"England -- Fiction, Young women -- Fiction, Lo...","Jun 1, 1998"
3,"Moby Dick; Or, The Whale by Herman Melville (2...",https://www.gutenberg.org/ebooks/2701,"Melville, Herman, 1819-1891","Moby Dick; Or, The Whale","Daniel Lazarus, Jonesey, and David Widger","""Moby Dick; Or, The Whale"" by Herman Melville ...",English,"Whaling -- Fiction, Sea stories, Psychological...","Jul 1, 2001"
4,Romeo and Juliet by William Shakespeare (2424),https://www.gutenberg.org/ebooks/1513,"Shakespeare, William, 1564-1616",Romeo and Juliet,"the PG Shakespeare Team, a team of about twent...","""Romeo and Juliet"" by William Shakespeare is a...",English,"Vendetta -- Drama, Youth -- Drama, Verona (Ita...","Nov 1, 1998"


In [None]:
# Destination of the copy kept in the mounted Google Drive folder
file_path = '/content/drive/MyDrive/00-TUIA/4.2 Procesamiento-del-Lenguaje-Natural/TP-1/gutenberg_books_info.csv'

# Write the same CSV twice: first to the current working directory, then to Drive
for destino in ("gutenberg_books_info.csv", file_path):
    df_books.to_csv(destino, index=False)

print('Dataset creado exitosamente')

Dataset creado exitosamente
