In [1]:
# We start as always with importing the necessary libraries

import requests
from bs4 import BeautifulSoup #Submodule of bs4 (I don't need the entire package). 
import nltk #natural language processing (NLP)
from collections import Counter


In [3]:
# Download the HTML edition of the book from Project Gutenberg.

# A timeout keeps the cell from hanging forever if the server stops responding.
r = requests.get("https://www.gutenberg.org/files/2701/2701-h/2701-h.htm", timeout=30)
# Fail loudly on an HTTP error instead of analysing an error page as the book.
r.raise_for_status()
# Force UTF-8 decoding rather than relying on the encoding requests guesses.
r.encoding = 'utf-8'
html = r.text
# Preview the start of the raw markup.
print(html[0:2000])

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<meta http-equiv="Content-Style-Type" content="text/css" />
<title>The Project Gutenberg eBook of Moby Dick; Or the Whale, by Herman Melville</title>

<style type="text/css" xml:space="preserve">

    body {margin-left:15%; margin-right:15%; text-align:justify }
    p { text-indent: 1em; margin-top: .25em; margin-bottom: .25em; }
    H1,H2,H3,H4,H5,H6 { text-align: center; margin-left: 15%; margin-right: 15%; }
    hr  { width: 50%; text-align: center;}
    blockquote {font-size: 100%; margin-left: 0%; margin-right: 0%;}
    .mynote    {background-color: #DDE; color: #000; padding: .5em; margin-left: 10%; margin-right: 10%; font-family: sans-serif; font-size: 95%;}
    .toc       { margin-left: 10%; margin-bottom: .75em;

In [5]:
# Strip the markup: parse the page with the built-in HTML parser and keep
# only the text content of the document.

soup = BeautifulSoup(html, features="html.parser")
text = soup.get_text()
# Preview the first 2000 characters of the extracted text.
print(text[:2000])






The Project Gutenberg eBook of Moby Dick; Or the Whale, by Herman Melville



The Project Gutenberg eBook of Moby-Dick; or The Whale, by Herman Melville

This eBook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this eBook or online
at www.gutenberg.org. If you
are not located in the United States, you will have to check the laws of the
country where you are located before using this eBook.

Title: Moby-Dick; or The Whale
Author: Herman Melville
Release Date: June, 2001 [eBook #2701]
[Most recently updated: August 18, 2021]
Language: English
Character set encoding: UTF-8
Produced by: Daniel Lazarus, Jonesey, and David Widger
*** START OF THE PROJECT GUTENBERG EBOOK MOBY-DICK; OR THE WHALE ***

      MOBY-DICK;or, THE WHALE.
    




      By Herman Melville
    

 





In [7]:
# The document needs to be tokenized to be analyzed

# Raw string: '\w' is not a valid Python string escape, so the non-raw form
# triggers an invalid-escape-sequence warning on current Python versions.
# The pattern keeps runs of word characters and drops punctuation/whitespace.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(text)
# Preview the first few tokens.
print(tokens[0:8])

['The', 'Project', 'Gutenberg', 'eBook', 'of', 'Moby', 'Dick', 'Or']


In [9]:
# Normalise case so that e.g. 'The' and 'the' are counted as the same word.
words = list(map(str.lower, tokens))
print(words[:8])

['the', 'project', 'gutenberg', 'ebook', 'of', 'moby', 'dick', 'or']


In [10]:
# Load NLTK's English stop-word list.
# The corpus is not bundled with the nltk package; on a fresh environment the
# lookup raises LookupError, so fetch the corpus once and retry.
try:
    sw = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    sw = nltk.corpus.stopwords.words('english')
print(sw[0:8])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves']


In [11]:
# Remove stop words. Build a set once so each membership test is O(1);
# testing against the list would rescan it for every token in the book.
sw_set = set(sw)
words_ns = [word for word in words if word not in sw_set]
print(words_ns[:5])

['project', 'gutenberg', 'ebook', 'moby', 'dick']


In [12]:
# Tally how often each remaining word occurs and report the ten most frequent.
count = Counter()
count.update(words_ns)
top_ten = count.most_common(10)
print(top_ten)

[('whale', 1244), ('one', 925), ('like', 647), ('upon', 568), ('man', 527), ('ship', 519), ('ahab', 517), ('ye', 473), ('sea', 455), ('old', 452)]
