# Key Noun Extraction Notebook

In [3]:
# Import necessary libraries
import ebooklib
from bs4 import BeautifulSoup
import os

## Chapter Extractor
The following code extracts the epub documents into their separate chapters

In [12]:
asset_dir = '../Assets'
book_dir = os.path.join(asset_dir, 'FlyingMachines')
epub_dir = os.path.join(book_dir, 'flying_machines.epub')

# Load corpus
book = ebooklib.epub.read_epub(epub_dir)
print(book)

# Output Directory for chapter text files
output_dir = os.path.join(book_dir, 'chapters')
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

chapter_counter = 1

# Function to split by heading tags
def split_by_heading(soup):
    """Split content by <h1>, <h2>, or <h3> tags, which usually represent chapters or sections."""
    chapters = []
    current_chapter = []

    for tag in soup.find_all(True):
        if tag.name in ['h1', 'h2', 'h3']:
            if current_chapter:
                chapters.append("\n".join(current_chapter))
                current_chapter = []
            current_chapter.append(tag.get_text().strip())  # Add the chapter title
        else:
            current_chapter.append(tag.get_text().strip())
    
    if current_chapter:
        chapters.append("\n".join(current_chapter))
    
    return chapters

# Extract and save each chapter as a separate text file
for item in book.get_items():
    if item.get_type() == ebooklib.ITEM_DOCUMENT:
        soup = BeautifulSoup(item.get_body_content(), 'html.parser')

        # Split content into chapters based on heading tags
        chapters = split_by_heading(soup)
            
        for chapter in chapters:
            chapter_text = chapter.strip()
            if chapter_text:
                # Save each chapter as a separate text file
                with open(f'{output_dir}/chapter_{chapter_counter}.txt', 'w', encoding='utf-8') as f:
                    f.write(chapter_text)
                print(f'Chapter {chapter_counter} saved.')
                chapter_counter += 1

print('All chapters extracted and saved!')


<ebooklib.epub.EpubBook object at 0x0000026A8B64AE40>
Chapter 1 saved.
Chapter 2 saved.
Chapter 3 saved.
Chapter 4 saved.
Chapter 5 saved.
Chapter 6 saved.
Chapter 7 saved.
Chapter 8 saved.
Chapter 9 saved.
Chapter 10 saved.
Chapter 11 saved.
Chapter 12 saved.
Chapter 13 saved.
Chapter 14 saved.
Chapter 15 saved.
Chapter 16 saved.
Chapter 17 saved.
Chapter 18 saved.
Chapter 19 saved.
Chapter 20 saved.
Chapter 21 saved.
Chapter 22 saved.
Chapter 23 saved.
Chapter 24 saved.
Chapter 25 saved.
Chapter 26 saved.
Chapter 27 saved.
Chapter 28 saved.
Chapter 29 saved.
Chapter 30 saved.
Chapter 31 saved.
Chapter 32 saved.
Chapter 33 saved.
Chapter 34 saved.
Chapter 35 saved.
Chapter 36 saved.
Chapter 37 saved.
Chapter 38 saved.
Chapter 39 saved.
Chapter 40 saved.
Chapter 41 saved.
Chapter 42 saved.
Chapter 43 saved.
Chapter 44 saved.
Chapter 45 saved.
Chapter 46 saved.
All chapters extracted and saved!


## Preprocessing
Remove the grammatical features in the text to support noun extraction
### Tokenisation
use NLTK


In [13]:
import nltk

In [None]:
# Download NLTK datasets (DO NOT RERUN THIS CELL)
nltk.download('all') 

In [28]:
corpus_txt_dir = os.path.join(book_dir, 'flying_machines.txt')

with open(corpus_txt_dir, 'r', encoding='utf-8') as file:
    corpus_text = file.read()

# Sentence tokenization
corpus_sentences = nltk.tokenize.sent_tokenize(corpus_text)
print("Sentences:", corpus_sentences)

# Word tokenization
corpus_tokenised_words =  nltk.tokenize.word_tokenize(corpus_text)
print("Words:", corpus_tokenised_words)

# Stop words removal and lemmatization
stop_words = set(nltk.corpus.stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()
corpus_filtered_words = [lemmatizer.lemmatize(word.lower()) for word in corpus_tokenised_words if word.lower() not in stop_words]
print("Filtered words:", corpus_filtered_words)

# Might need to change to Punkt for sentence and Penn Treebank for word (and do sentence then word)



### Part of Speech (POS) Tagging

In [29]:
corpus_pos_tags = nltk.pos_tag(corpus_filtered_words)
print("POS Tags:", corpus_pos_tags)



### Named Entity Recognition

In [30]:
named_entities = nltk.ne_chunk(corpus_pos_tags)
print("Named Entities:", named_entities)

Named Entities: (S
  ﻿the/NN
  project/NN
  gutenberg/NN
  ebook/NN
  flying/VBG
  machine/NN
  :/:
  construction/NN
  operation/NN
  ebook/NN
  use/NN
  anyone/NN
  anywhere/RB
  united/JJ
  state/NN
  part/NN
  world/NN
  cost/NN
  almost/RB
  restriction/NN
  whatsoever/NN
  ./.
  may/MD
  copy/VB
  ,/,
  give/VB
  away/RP
  re-use/JJ
  term/NN
  project/NN
  gutenberg/NN
  license/NN
  included/VBD
  ebook/JJ
  online/NN
  www.gutenberg.org/NN
  ./.
  located/VBN
  united/JJ
  state/NN
  ,/,
  check/NN
  law/NN
  country/NN
  located/VBD
  using/VBG
  ebook/NN
  ./.
  title/NN
  :/:
  flying/JJ
  machine/NN
  :/:
  construction/NN
  operation/NN
  author/NN
  :/:
  william/NN
  j./NN
  jackman/NN
  octave/VBP
  chanute/NN
  thomas/NN
  herbert/NN
  russell/NN
  release/NN
  date/NN
  :/:
  may/MD
  1/CD
  ,/,
  1997/CD
  [/NNP
  ebook/VBD
  #/#
  907/CD
  ]/NNS
  recently/RB
  updated/VBD
  :/:
  july/NN
  26/CD
  ,/,
  2008/CD
  language/NN
  :/:
  english/JJ
  credit/NN
  :/:
  