# Data Scraping

In [None]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from math import ceil

In [None]:
# Root of the Project Gutenberg site; relative paths are appended to it to build request URLs.
BASE_URL = "https://www.gutenberg.org"
# Name of the parser handed to BeautifulSoup whenever a fetched page is parsed.
PARSER_NAME = 'lxml'

Code which creates a Pandas dataframe where:
  - Each row contains the information for one book; more specifically, the title, author, and link to the HTML version of the book are stored in the `title`, `author`, and `url` columns.
  - The order of the rows is the same as the order of the books' 'ranking' (by download count) on the Gutenberg website.
  - Books which do not have an HTML version (e.g. math textbooks or picture books which contain large amounts of content which isn't plain text) are ignored.
  - The created dataframe will contain exactly `num_2_get` books.

In [None]:
def process_booklinks(bl_element, book_id, num_2_take):
  """Collect author, title and HTML link for books that have an HTML version.

  Each book's description page is visited in order, and the book is kept only
  if that page has a "Read this book online: HTML (original)" link. Pages are
  fetched lazily: as soon as `num_2_take` books have been kept, no further
  requests are made.

  Args:
    bl_element: 'booklink' elements from a search results page, one per book.
    book_id: Ids of those books, in the same order as `bl_element`.
    num_2_take: Maximum number of books to return.

  Returns:
    A tuple `(authors, titles, urls)` of three equal-length lists of strings,
    in the same relative order as the input. Books without an author tag (or
    with an empty one) get the author "Anonymous".
  """
  authors, titles, urls = [], [], []
  # Iterate over each book together with its id:
  for element, current_id in zip(bl_element, book_id):
    # Stop *before* requesting another page once we have enough books:
    if len(authors) >= num_2_take:
      break
    # Fetch and parse this book's description page; the timeout stops a
    # stalled connection from hanging the whole scrape:
    book_url = BASE_URL + "/ebooks/" + str(current_id)
    soup = BeautifulSoup(requests.get(book_url, timeout=30).text, PARSER_NAME)
    # Find the link associated with the "HTML (original)" text on the page:
    ext = soup.find('a', string="Read this book online: HTML (original)")
    # Skip books which have no HTML version:
    if ext is None:
      continue
    # Get title from the booklink; some titles have a trailing \r character,
    # so strip it. str() stores a plain string rather than a node that keeps
    # the whole parsed page alive:
    title_i = str(element.find(class_="title").contents[0]).rstrip('\r')
    titles.append(title_i)
    # Get author from the booklink - need to check that the book has a
    # non-empty author tag:
    author_tag = element.find(class_="subtitle")
    if author_tag is not None and author_tag.contents:
      author_i = str(author_tag.contents[0])
    else:
      author_i = "Anonymous"
    authors.append(author_i)
    # Get URL of HTML version of text:
    urls.append(BASE_URL + str(ext['href']))
  # Return these lists:
  return (authors, titles, urls)

def get_book_info(num_2_get):
  """Build a dataframe of the most-downloaded Gutenberg books with HTML versions.

  Walks the search results (sorted by downloads) page by page, passing each
  page's books to `process_booklinks`, until `num_2_get` books with an HTML
  version have been collected.

  Args:
    num_2_get: Number of books to collect.

  Returns:
    A Pandas dataframe with columns 'title', 'author' and 'url', one row per
    book, in ranking order. It has `num_2_get` rows unless the search results
    run out first, in which case it holds every book found up to that point.
  """
  # URL of the first results page; later pages append a start index to it:
  start_url = BASE_URL + "/ebooks/search/?sort_order=downloads"
  current_url = start_url
  # Number of books looked at so far, whether or not they were kept:
  num_books_checked = 0
  # Initialise lists to store author, title, and url information for each book:
  author_list, title_list, url_list = [], [], []
  # Keep going until we have the number of books we've specified:
  while len(title_list) < num_2_get:
    # Get soup of current URL (timeout so a stalled connection cannot hang us):
    soup = BeautifulSoup(requests.get(current_url, timeout=30).text, PARSER_NAME)
    # Get booklink elements off this page:
    current_el = soup.find_all(class_='booklink')
    # A page with no books means there is nothing left to read (end of the
    # results, or an error page); without this check the loop never ends:
    if not current_el:
      break
    # Determine the maximum number of books we still need from this page:
    num_2_take = num_2_get - len(title_list)
    # Get book 'id' of each book on page:
    current_id = [x.a['href'].split('/')[-1] for x in current_el]
    # Get the Title, Author and URLs for each book which has an HTML version:
    (authors_i, titles_i, urls_i) = process_booklinks(current_el, current_id, num_2_take)
    # Append to our 'grand' lists:
    author_list += authors_i
    title_list += titles_i
    url_list += urls_i
    # The next page starts just after the last book we've looked at:
    num_books_checked += len(current_id)
    current_url = start_url + f"&start_index={num_books_checked+1}"
  # Convert these lists into a Pandas dataframe:
  book_df = pd.DataFrame.from_dict({'title': title_list,
                                    'author': author_list,
                                    'url': url_list})
  return book_df

In [None]:
# Number of books (with an HTML version) to collect:
num_2_get = 100
# Build the dataframe of titles, authors and HTML links (this performs the web requests):
book_df = get_book_info(num_2_get)

Now that we have a dataframe which contains links to the HTML version of each book we want to scrape, let's scrape the text of each book:

In [None]:
def get_text(book_df):
  """Scrape the paragraph text of every book and store it in a 'text' column.

  For each URL in `book_df['url']` the page is downloaded, the text of every
  <p> element is extracted and cleaned, and the cleaned paragraphs are joined
  with single spaces.

  Cleaning keeps only spaces, word characters and full stops. Runs of
  whitespace (including the line breaks inside hard-wrapped paragraphs) are
  first collapsed to one space, so that words on either side of a line break
  are not fused together when the remaining characters are stripped.

  Args:
    book_df: Dataframe with a 'url' column of pages to scrape.

  Returns:
    The same dataframe object, modified in place, with a new 'text' column
    holding one string per row.
  """
  # Regex to 'clean' the text we scrape. A raw string is used so that \w and
  # \. reach the regex engine without being treated as (invalid) string escapes.
  # NOTE(review): the explicit 'Â' looks like a mis-decoded non-breaking
  # space - confirm the page encoding is being detected correctly.
  clean_regex = re.compile(r'(<.*?>)|Â|([^ \w\.])')
  # Regex matching any run of whitespace (spaces, tabs, line breaks):
  whitespace_regex = re.compile(r'\s+')
  # Initialise list to store text:
  text_list = []
  # Iterate over urls in dataframe:
  for url in book_df['url']:
    # Get soup of page (timeout so a stalled connection cannot hang us):
    soup = BeautifulSoup(requests.get(url, timeout=30).text, PARSER_NAME)
    # Collect the cleaned text of each paragraph on the page:
    paragraphs = []
    for p in soup.find_all('p'):
      flattened = whitespace_regex.sub(' ', p.get_text())
      cleaned = clean_regex.sub('', flattened).strip()
      # Skip paragraphs with nothing left, to avoid runs of separators:
      if cleaned:
        paragraphs.append(cleaned)
    # Join paragraphs with a space so their boundary words stay separate:
    text_list.append(' '.join(paragraphs))
  # Add text list as new column to dataframe:
  book_df['text'] = text_list
  return book_df

# Scrape the text of every book; get_text adds a 'text' column to book_df and returns it:
text_df = get_text(book_df)

Save to CSV:

In [None]:
# Write the scraped dataframe to 'text.csv' (path is relative to the current working directory):
text_df.to_csv('text.csv')