# Exploratory Text Analysis: Project Parsing

**Student:** Ian Yung  
**Class:** DS 5001

---

## Introduction

Welcome to my exploratory text analysis project! In this notebook, I'll be diving into the world of text data, analyzing, and extracting insights from various textual sources. This project serves as the culmination of my efforts in DS 5001, where I've learned the fundamentals of exploratory text analysis.

Let's embark on this journey together as we explore the fascinating realm of textual data!

---

## Table of Contents

1. [Introduction](#Introduction)
2. [Load Data](#Load-Data)
3. [Developing OHCO](#Developing-OHCO)
4. [LIB](#LIB)
5. [CORPUS](#CORPUS)
6. [VOCAB](#VOCAB)
7. [Save Files](#Save-Files)

Feel free to navigate through the sections using the links provided above.


In [1]:
# a little overkill with the imports but I'm taking no chances
import pandas as pd
import numpy as np
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
import plotly_express as px
import seaborn as sns
from IPython.display import display, HTML

import re

import nltk
from nltk import pos_tag

from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

import gensim
from scipy.linalg import norm, eigh
from gensim.corpora import Dictionary
from gensim.models import LdaModel, word2vec
from sklearn.manifold import TSNE as tsne

# Load Data

In [2]:
path = "The Complete Works of J.R.R Tolkien/Tolkien/The Fellowship of the Ring/The Fellowship of the Ring_ The - J. R. R. Tolkien.epub"

paths = [
    "The Complete Works of J.R.R Tolkien/Tolkien/The Fellowship of the Ring/The Fellowship of the Ring_ The - J. R. R. Tolkien.epub",
    'The Complete Works of J.R.R Tolkien/Tolkien/The Two Towers/The Two Towers_ The Lord of the - J. R. R. Tolkien.epub',
    "The Complete Works of J.R.R Tolkien/Tolkien/The Return of the King/The Return of the King_ The Lor - J. R. R. Tolkien.epub",
    "The Complete Works of J.R.R Tolkien/Tolkien/Bilbo's Last Song/Bilbo's Last Song - J. R. R. Tolkien.epub",
    "The Complete Works of J.R.R Tolkien/Tolkien/Tales From the Perilous Realm/Tales From the Perilous Realm - J. R. R. Tolkien.epub",
    'The Complete Works of J.R.R Tolkien/Tolkien/The Book of Lost Tales, Part 1/The Book of Lost Tales, Part 1 - J. R. R. Tolkien.epub',
    "The Complete Works of J.R.R Tolkien/Tolkien/The Book of Lost Tales, Part 2/The Book of Lost Tales, Part 2 - J. R. R. Tolkien.epub",
    "The Complete Works of J.R.R Tolkien/Tolkien/The Children of Hurin/The Children of Hurin - J. R. R. Tolkien.epub",
    "The Complete Works of J.R.R Tolkien/Tolkien/The Hobbit/The Hobbit - J. R. R. Tolkien.epub",
    "The Complete Works of J.R.R Tolkien/Tolkien/The Legend of Sigurd and Gudrun/The Legend of Sigurd and Gudrun - J. R. R. Tolkien.epub",
    "The Complete Works of J.R.R Tolkien/Tolkien/The Letters of J.R.R. Tolkien/The Letters of J.R.R. Tolkien - J. R. R. Tolkien.epub",
    "The Complete Works of J.R.R Tolkien/Tolkien/The Return of the Shadow/The Return of the Shadow_ The H - J. R. R. Tolkien.epub",
    "The Complete Works of J.R.R Tolkien/Tolkien/The Silmarillion/The Silmarillion - J. R. R. Tolkien.epub",
]

lewis_text_files = [
                "LewisCSNarnia3TheHorseAndHisBoy/Lewis_C_S_-_Narnia_2_-_The_Lion_The_Witch_and_The__djvu.txt",
                "LewisCSNarnia3TheHorseAndHisBoy/Lewis_C_S_-_Narnia_4_-_Prince_Caspian_djvu.txt",
                "LewisCSNarnia3TheHorseAndHisBoy/Lewis_C_S_-_Narnia_5_-_The_Voyage_of_the_Dawn_Trea_djvu.txt",
                "LewisCSNarnia3TheHorseAndHisBoy/Lewis_C_S_-_Narnia_6_-_The_Silver_Chair_djvu.txt",
                "LewisCSNarnia3TheHorseAndHisBoy/Lewis_C_S_-_Narnia_3_-_The_Horse_and_His_Boy_djvu.txt",
                "LewisCSNarnia3TheHorseAndHisBoy/Lewis_C_S_-_Narnia_1_-_The_Magician_s_Nephew_djvu.txt",
                "LewisCSNarnia3TheHorseAndHisBoy/Lewis_C_S_-_Narnia_7_-_The_Last_Battle_djvu.txt",
]

text_beginnings = [9, 9, 10, 6, 4, 5, 4, 4, 12, 11, 4, 2, 11] # either the preface, prologue, or the first chapter
text_endings = [33, 31, 29, 6, 33, 14, 9, 23, 30, 19, 358, 29, 58]

# contains all the regexes to splice the chapters of all the books
regs = [r"\n\s*Chapter\s*(\d+)\s*(.*?)\s*(.*?)\s*\n",
       'Chapter\s+\d+\s*\n.+\n',
       r"\n\s*Chapter\s*(\d+)\s*(.*?)\s*(.*?)\s*\n",
       '\n\s*\n\s*Chapter\s+[IVXLCDM]+\s*\n.*\n',
       "(?:\n\s*){3}([IVXLCDM]+)\s*(.*?)\s*(?:\n\s*){3}",
       "(?:\n\s*){3}([IVXLCDM]+)\s*(.*?)\s*(?:\n\s*){3}",
       '.*? \n ',]

'''
Helper function used to splice the chapters so only the content is left.
'''
def splice_string(string, regex_pattern):
    match = re.search(regex_pattern, string)
    if match:
        return string[match.end():]
    else:
        return string


In [3]:
blacklist = [   '[document]',   'noscript', 'header',   'html', 'meta', 'head','input', 'script',   ]

def epub2thtml(epub_path):
    book = epub.read_epub(epub_path)
    chapters = []
    for item in book.get_items():
        if item.get_type() == ebooklib.ITEM_DOCUMENT:
            chapters.append(item.get_content())
    return chapters

def chap2text(chap):
    output = ''
    soup = BeautifulSoup(chap, 'html.parser')
    text = soup.find_all(text=True)
    for t in text:
        if t.parent.name not in blacklist:
            output += '{} '.format(t)
    return output

def thtml2ttext(thtml):
    Output = []
    for html in thtml:
        text =  chap2text(html)
        Output.append(text)
    return Output

def epub2text(epub_path):
    chapters = epub2thtml(epub_path)
    ttext = thtml2ttext(chapters)
    return ttext

In [4]:
texts = []
for p in paths:
    text = epub2text(p)
    texts.append(text)

# clip the texts
clipped_texts = []
for text_index in range(len(texts)):
    text_start = text_beginnings[text_index]
    text_end = text_endings[text_index]

    clipped_text = texts[text_index][text_start:text_end+1]
    clipped_texts.append(clipped_text)

  for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}):
  text = soup.find_all(text=True)


# Developing OHCO

In [5]:
OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

def make_lewis_ohco(text_file):
    # Read in the File
    LINES = pd.DataFrame(open(text_file, 'r', encoding='utf-8-sig').readlines(), 
        columns=['line_str'])
    LINES.index.name = 'line_num'
    LINES.line_str = LINES.line_str.str.strip()

    # Match lines to their respective chapters
    chap_lines = LINES.line_str.str.match(r"^\s*(CHAPTER|letter)\s+", case=False)
    LINES.loc[chap_lines]
    
    chap_nums = [i+1 for i in range(LINES.loc[chap_lines].shape[0])]
    LINES.loc[chap_lines, 'chap_num'] = chap_nums
    
    # Forward Fill
    LINES.chap_num = LINES.chap_num.ffill()
    
    # Cleanup
    # LINES = LINES.loc[~LINES.chap_num.isna()] # Remove chapter heading lines
    LINES = LINES.dropna(subset=['chap_num'])
    LINES = LINES.loc[~chap_lines] # Remove everything before Chapter 1
    LINES.chap_num = LINES.chap_num.astype('int') # Convert chap_num from float to int

    pattern = '\\n\\n(.+?)\\n\\n'
    
    CHAPS = LINES.groupby(OHCO[:1]).line_str.apply(lambda x: '\n'.join(x)).to_frame('chap_str')
    CHAPS = CHAPS.map(lambda x: splice_string(x, pattern))

    # Make PARAS
    PARAS = CHAPS['chap_str'].str.split(r'\n\n+', expand=True).stack()\
    .to_frame('para_str')
    PARAS.index.names = OHCO[:2]
    PARAS['para_str'] = PARAS['para_str'].str.replace(r'\n', ' ', regex=True).str.strip()
    PARAS = PARAS[~PARAS['para_str'].str.match(r'^\s*$')] # Remove empty paragraphs

    # Make SENTS
    SENTS = PARAS['para_str'].str.split(r'[.?!;:"]+', expand=True).stack()\
        .to_frame().rename(columns={0:'sent_str'})
    SENTS.index.names = OHCO[:3]
    SENTS = SENTS[~SENTS['sent_str'].str.match(r'^\s*$')] # Remove empty paragraphs
    SENTS.sent_str = SENTS.sent_str.str.strip()

    # Make TOKENS
    TOKENS = SENTS['sent_str'].str.split(r"[\s',-]+", expand=True).stack()\
        .to_frame('token_str')
    TOKENS.index.names = OHCO[:4]
    TOKENS['term_str'] = TOKENS.token_str.str.replace(r"[\W_]+", '', regex=True).str.lower()


    return TOKENS

In [6]:
def make_ohco(df, reg, para_split=r'\r\n\s+\n'):
    # setup
    OHCO = ['chap_num', 'para_num', 'sent_num', 'token_num']

    # actually create the OHCO
    CHAPS = df.map(lambda x: splice_string(x, reg))
    
    PARAS = CHAPS['chapter'].str.split(para_split, expand=True).stack().to_frame('para_str')
    PARAS['para_str'] = PARAS['para_str'].str.replace(r'\r\n|\n', ' ', regex=True).str.strip() # remove the \r\n's.
    PARAS.index.names = OHCO[:2]

    SENTS = PARAS['para_str'].str.split(r'[.?!;:"]+', expand=True).stack()\
        .to_frame().rename(columns={0:'sent_str'})
    SENTS.index.names = OHCO[:3]
    SENTS = SENTS[~SENTS['sent_str'].str.match(r'^\s*$')] # Remove empty paragraphs
    SENTS.sent_str = SENTS.sent_str.str.strip()

    TOKENS = SENTS['sent_str'].str.split(r"[\s',-]+", expand=True).stack().to_frame('token_str')
    TOKENS.index.names = OHCO[:4]
    TOKENS['term_str'] = TOKENS.token_str.str.replace(r"[\W_]+", '', regex=True).str.lower()

    return TOKENS

In [7]:
fellowship_TEXT= pd.DataFrame({'chapter': clipped_texts[0][2:]}, index=range(0, len(clipped_texts[1])))
fellowship_TEXT.drop([12], inplace=True)

towers_TEXT = pd.DataFrame({'chapter': clipped_texts[1]}, index=range(1, len(clipped_texts[0]) - 1))
towers_TEXT.drop([13], inplace=True)

king_TEXT = pd.DataFrame({'chapter': clipped_texts[2]}, index=range(len(clipped_texts[2])))
king_TEXT.drop([10], inplace=True)

hobbit_TEXT = pd.DataFrame({'chapter': clipped_texts[8]}, index=range(0, len(clipped_texts[8])))

tales_one_TEXT = pd.DataFrame({'chapter': clipped_texts[5]}, index=range(0, len(clipped_texts[5])))
tales_two_TEXT = pd.DataFrame({'chapter': clipped_texts[6]}, index=range(0, len(clipped_texts[6])))

# silmarillion requires a little more work
silmarillion_TEXT = pd.DataFrame({'chapter': clipped_texts[12]}, index=range(0, len(clipped_texts[12])))
mask = silmarillion_TEXT.index % 2 == 0
odd_indices = ~mask
silmarillion_TEXT = silmarillion_TEXT[odd_indices]

# Make the OHCOs!
fellow = make_ohco(fellowship_TEXT, regs[0])
towers = make_ohco(towers_TEXT, regs[1])
king = make_ohco(king_TEXT, regs[2])
hobbit = make_ohco(hobbit_TEXT, regs[3])
talesONE = make_ohco(tales_one_TEXT, regs[4], para_split=r'\n')
talesTWO = make_ohco(tales_two_TEXT, regs[5], para_split=r'\n')
silmarillion = make_ohco(silmarillion_TEXT, regs[6], para_split=r'\n')

# For Lewis
lion = make_lewis_ohco(lewis_text_files[0])
caspian = make_lewis_ohco(lewis_text_files[1])
dawn = make_lewis_ohco(lewis_text_files[2])
silver = make_lewis_ohco(lewis_text_files[3])
horse = make_lewis_ohco(lewis_text_files[4])
magician = make_lewis_ohco(lewis_text_files[5])
battle = make_lewis_ohco(lewis_text_files[6])

# LIB

In [8]:
# Establish Book IDs
fellow['book_id'] = 1
towers['book_id'] = 2
king['book_id'] = 3
hobbit['book_id'] = 4
talesONE['book_id'] = 5
talesTWO['book_id'] = 6
silmarillion['book_id'] = 7

lion['book_id'] = 8
caspian['book_id'] = 9
dawn['book_id'] = 10
silver['book_id'] = 11
horse['book_id'] = 12
magician['book_id'] = 13
battle['book_id'] = 14


# Set OHCOs
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']

fellow = fellow.reset_index().set_index(OHCO)
towers = towers.reset_index().set_index(OHCO)
king = king.reset_index().set_index(OHCO)
hobbit = hobbit.reset_index().set_index(OHCO)
talesONE = talesONE.reset_index().set_index(OHCO)
talesTWO = talesTWO.reset_index().set_index(OHCO)
silmarillion = silmarillion.reset_index().set_index(OHCO)

lion = lion.reset_index().set_index(OHCO)
caspian = caspian.reset_index().set_index(OHCO)
dawn = dawn.reset_index().set_index(OHCO)
silver = silver.reset_index().set_index(OHCO)
horse = horse.reset_index().set_index(OHCO)
magician = magician.reset_index().set_index(OHCO)
battle = battle.reset_index().set_index(OHCO)

In [9]:
# %% TODO - add the "n_chaps" column to LIB %%
LIB = pd.DataFrame(columns=['book_title', 'book_author', 'date'], index=[1, 2, 3, 4])
LIB.loc[1, ['book_title', 'book_author', 'date', 'book_len']] = ("Fellowship of the Ring", "J.R.R. Tolkien", "1954", int(len(fellow)))
LIB.loc[2, ['book_title', 'book_author', 'date', 'book_len']] = ("The Two Towers", "J.R.R. Tolkien", "1954", int(len(towers)))
LIB.loc[3, ['book_title', 'book_author', 'date', 'book_len']] = ("Return of the King", "J.R.R. Tolkien", "1955", int(len(king)))
LIB.loc[4, ['book_title', 'book_author', 'date', 'book_len']] = ("The Hobbit", "J.R.R. Tolkien", "1937", int(len(hobbit)))
LIB.loc[5, ['book_title', 'book_author', 'date', 'book_len']] = ("The Book of Lost Tales, Part 1", "J.R.R. Tolkien", "1983", int(len(talesONE)))
LIB.loc[6, ['book_title', 'book_author', 'date', 'book_len']] = ("The Book of Lost Tales, Part 2", "J.R.R. Tolkien", "1984", int(len(talesTWO)))
LIB.loc[7, ['book_title', 'book_author', 'date', 'book_len']] = ("The Silmarillion", "J.R.R. Tolkien", "1977", int(len(silmarillion)))

LIB.loc[8, ['book_title', 'book_author', 'date', 'book_len']] = ("The Lion, the Witch, and the Wardrobe", "C.S. Lewis", "1950", int(len(lion)))
LIB.loc[9, ['book_title', 'book_author', 'date', 'book_len']] = ("Prince Caspian: The Return to Narnia ", "C.S. Lewis", "1951", int(len(caspian)))
LIB.loc[10, ['book_title', 'book_author', 'date', 'book_len']] = ("The Voyage of the Dawn Treader", "C.S. Lewis", "1952", int(len(dawn)))
LIB.loc[11, ['book_title', 'book_author', 'date', 'book_len']] = ("The Silver Chair", "C.S. Lewis", "1953", int(len(silver)))
LIB.loc[12, ['book_title', 'book_author', 'date', 'book_len']] = ("The Horse and His Boy", "C.S. Lewis", "1954", int(len(horse)))
LIB.loc[13, ['book_title', 'book_author', 'date', 'book_len']] = ("The Magician's Nephew", "C.S. Lewis", "1955", int(len(magician)))
LIB.loc[14, ['book_title', 'book_author', 'date', 'book_len']] = ("The Last Battle", "C.S. Lewis", "1956", int(len(battle)))

LIB.index.name = "book_id"

In [10]:
LIB

Unnamed: 0_level_0,book_title,book_author,date,book_len
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Fellowship of the Ring,J.R.R. Tolkien,1954,182850.0
2,The Two Towers,J.R.R. Tolkien,1954,160213.0
3,Return of the King,J.R.R. Tolkien,1955,138585.0
4,The Hobbit,J.R.R. Tolkien,1937,97251.0
5,"The Book of Lost Tales, Part 1",J.R.R. Tolkien,1983,108638.0
6,"The Book of Lost Tales, Part 2",J.R.R. Tolkien,1984,158964.0
7,The Silmarillion,J.R.R. Tolkien,1977,102262.0
8,"The Lion, the Witch, and the Wardrobe",C.S. Lewis,1950,38697.0
9,Prince Caspian: The Return to Narnia,C.S. Lewis,1951,46862.0
10,The Voyage of the Dawn Treader,C.S. Lewis,1952,54270.0


# CORPUS

In [11]:
CORPUS = pd.concat([fellow, towers, king, hobbit, talesONE, talesTWO, silmarillion, lion, caspian, dawn, silver, horse, magician, battle])

In [12]:
# create POS tags
pos_tags = pos_tag(CORPUS.token_str)

CORPUS['pos_tuple'] = pos_tags
CORPUS['pos'] = [j for i, j in pos_tags]
CORPUS['pos_group'] = CORPUS.pos.str[:2]

In [13]:
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str,pos_tuple,pos,pos_group
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,0,0,0,When,when,"(When, WRB)",WRB,WR
1,0,0,0,1,Mr,mr,"(Mr, NNP)",NNP,NN
1,0,0,1,0,Bilbo,bilbo,"(Bilbo, NNP)",NNP,NN
1,0,0,1,1,Baggins,baggins,"(Baggins, NNP)",NNP,NN
1,0,0,1,2,of,of,"(of, IN)",IN,IN
...,...,...,...,...,...,...,...,...,...
14,16,58,2,5,better,better,"(better, RBR)",RBR,RB
14,16,58,2,6,than,than,"(than, IN)",IN,IN
14,16,58,2,7,the,the,"(the, DT)",DT,DT
14,16,58,2,8,one,one,"(one, CD)",CD,CD


# VOCAB

In [14]:
VOCAB = CORPUS.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)
VOCAB['max_pos'] = CORPUS[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)
VOCAB['max_pos_group'] = CORPUS[['term_str','pos_group']].value_counts().unstack(fill_value=0).idxmax(1)

In [15]:
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
,23524,0,1.839343e-02,5.764666,NNP,NN
0,1,1,7.819006e-07,20.286512,CD,CD
1,135,1,1.055566e-04,13.209696,CD,CD
10,54,2,4.222263e-05,14.531624,CD,CD
100,8,3,6.255205e-06,17.286512,CD,CD
...,...,...,...,...,...,...
þa,1,2,7.819006e-07,20.286512,NNP,NN
þisses,1,6,7.819006e-07,20.286512,VBZ,VB
þunor,1,5,7.819006e-07,20.286512,NNP,NN
þá,1,2,7.819006e-07,20.286512,NNP,NN


## Add Stopwords

In [16]:
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1

VOCAB['stop'] = VOCAB.index.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

In [17]:
from nltk.stem.porter import PorterStemmer
stemmer1 = PorterStemmer()
VOCAB['porter_stem'] = VOCAB.apply(lambda x: stemmer1.stem(x.name), 1)

# Save Files

In [18]:
LIB.to_csv("data/LIB.csv")

In [19]:
CORPUS.to_csv("data/CORPUS.csv")

In [20]:
VOCAB.to_csv("data/VOCAB.csv")