In [1]:
import random
import requests
from bs4 import BeautifulSoup
import nltk
# Fetch the Punkt tokenizer data used by nltk.sent_tokenize / nltk.word_tokenize,
# which the sampling loop below calls. Safe to re-run: NLTK skips the download
# when the resource is already present locally.
# NOTE(review): newer NLTK releases reportedly load 'punkt_tab' instead of
# 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')

# Selecting Data
# Each record below is (title, author, genre, url); the records are zipped with
# the field names so that `books` ends up as a list of dicts, one per book,
# with the keys in the order title, author, genre, url.
books = [
    dict(zip(("title", "author", "genre", "url"), record))
    for record in (
        (
            "Pride and Prejudice",
            "Jane Austen",
            "Romance",
            "https://www.gutenberg.org/files/1342/1342-h/1342-h.htm",
        ),
        (
            "The Adventures of Sherlock Holmes",
            "Arthur Conan Doyle",
            "Mystery",
            "https://www.gutenberg.org/files/1661/1661-h/1661-h.htm",
        ),
        (
            "Wuthering Heights",
            "Emily Bronte",
            "Gothic Fiction",
            "https://www.gutenberg.org/files/768/768-h/768-h.htm",
        ),
        (
            "Alice's Adventures in Wonderland",
            "Lewis Carroll",
            "Fantasy",
            "https://www.gutenberg.org/files/11/11-h/11-h.htm",
        ),
        (
            "Dracula",
            "Bram Stoker",
            "Horror",
            "https://www.gutenberg.org/files/345/345-h/345-h.htm",
        ),
    )
]

# Sampling Data
# For every book: download its HTML page, strip the markup, split the text into
# sentences, draw a random sample of sentences, and store each sampled sentence
# as a lower-cased token list together with a one-letter book label.
num_docs = 200        # sentences sampled per book
doc_size = 150        # maximum number of tokens kept per sampled sentence
random_seed = 42      # fixed seed so Restart & Run All draws the same sample
request_timeout = 30  # seconds to wait on the server before giving up

random.seed(random_seed)

data = []

for book in books:
    # Retrieve the webpage. The timeout stops the cell from hanging forever on
    # a stalled connection, and raise_for_status() keeps an HTTP error page
    # from being silently tokenized as if it were the book.
    response = requests.get(book["url"], timeout=request_timeout)
    response.raise_for_status()

    # Extract the text from the HTML
    soup = BeautifulSoup(response.text, "html.parser")
    text = soup.get_text()

    # Split the text into sentences
    sentences = nltk.sent_tokenize(text)

    # Sample the sentences randomly. random.sample raises ValueError when asked
    # for more items than the population holds, so clamp the request and report
    # the shortfall instead of aborting the whole run.
    sample_size = min(num_docs, len(sentences))
    if sample_size < num_docs:
        print("Warning: only {} sentences available for {!r}; requested {}".format(
            len(sentences), book["title"], num_docs))
    selected_sentences = random.sample(sentences, sample_size)

    # Preprocessing Data
    for sentence in selected_sentences:
        # Tokenize, keep at most the first doc_size tokens, and lowercase them.
        # Slicing is a no-op when the sentence is already short enough.
        words = [word.lower() for word in nltk.word_tokenize(sentence)[:doc_size]]

        # The label is the first character of the title; the titles listed in
        # `books` start with distinct letters (P, T, W, A, D).
        data.append({
            "words": words,
            "label": book["title"][0]
        })

print(data[:5])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jaysh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


[{'words': ['mr.', 'bennet', ',', 'in', 'equal', 'silence', ',', 'was', 'enjoying', 'the', 'scene', '.'], 'label': 'P'}, {'words': ['he', 'came', ',', 'and', 'in', 'such', 'very', 'good', 'time', ',', 'that', 'the', 'ladies', 'were', 'none', 'of', 'them', 'dressed', '.'], 'label': 'P'}, {'words': ['it', 'is', 'not', 'quite', 'a', 'week', 'since', 'they', 'left', 'brighton', '.'], 'label': 'P'}, {'words': ['she', 'certainly', 'did', 'not', 'hate', 'him', '.'], 'label': 'P'}, {'words': ['mr.', 'wickham', 'had', 'received', 'his', 'commission', 'before', 'he', 'left', 'london', ',', 'and', 'he', 'was', 'to', 'join', 'his', 'regiment', 'at', 'the', 'end', 'of', 'a', 'fortnight', '.'], 'label': 'P'}]


In [5]:
# Inspect the first sampled document: its token list and its book label.
data[0]

{'words': ['mr.',
  'bennet',
  ',',
  'in',
  'equal',
  'silence',
  ',',
  'was',
  'enjoying',
  'the',
  'scene',
  '.'],
 'label': 'P'}