# Parsing and Annotating Data

Parsing the raw data into the three core tables of your addition: the LIB, CORPUS, and VOCAB tables.

These tables will be stored as CSV files with header rows.

In [2]:
# importing libraries
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk
import configparser
import os

In [3]:
import sys
sys.path.append("/Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/analysis")

In [4]:
# importing parser module
from textparser import TextParser

In [5]:
# read in data
source_files = "/Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/woolf_novels/utf8"

# define OHCO
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']

In [6]:
# removing boiler plates
clip_pats = [
    r"(?m)^THE START\s*$",
    r"(?m)^THE END\s*$"
]

# chunk by chapter

ohco_pat_list = [
    ('BetweenTheActs', r'^###CHAPTER###$'),  # annotation for 5 blank lines
    ('Flush', r'^(CHAPTER\s+[A-Z]+)\s*$'), # CHAPTER X (blank line) chapter name
    ('JacobsRoom', r'^CHAPTER\s+[A-Z]+\s*$'), # CHAPTER X
    ('MrsDalloway', r'^###CHAPTER###$'),  # annotation for 5 blank lines
    ('NightAndDay', r'^CHAPTER\s+[IVXLCDM]+\s*$'),# CHAPTER ? (roman numeral)
    ('Orlando', r'^CHAPTER\s+\d+\.\s*$'), # CHAPTER X. 
    ('TheVoyageOut', r'^Chapter\s+[IVXLCDM]+\s*$'), # Chapter ? (roman numeral)
    ('TheWaves', r'^###CHAPTER###$'),  # annotation for 5 blank lines
    ('TheYears', r'^\s*(18|19)\d{2}\s*$'), # blank line, year, blank line
    ('ToTheLighthouse', r'^\s*\d+\s*$'), # blank line, number, blank line
]

## Creating LIB table

In [7]:
# register each file to a library
source_file_list = sorted(glob(f"{source_files}/*.*"))

book_data = []
for source_file_path in source_file_list:
    book_id = source_file_path.split('/')[-1].replace('.utf8.txt', '')
    book_title = source_file_path.split('/')[-1].replace('.utf8.txt', '')
    book_data.append((book_id, source_file_path, book_title))

In [8]:
# create LIB table
LIB = pd.DataFrame(book_data, columns=['book_id','source_file_path','title'])\
    .set_index('book_id').sort_index()

# add chapter regexes
LIB['chap_regex'] = LIB.index.map(pd.Series({x[0]:x[1] for x in ohco_pat_list}))

# add publication year
publication_years = {
    'TheVoyageOut': 1915,
    'NightAndDay': 1919,
    'JacobsRoom': 1922,
    'MrsDalloway': 1925,
    'ToTheLighthouse': 1927,
    'Orlando': 1928,
    'TheWaves': 1931,
    'Flush': 1933,
    'TheYears': 1937,
    'BetweenTheActs': 1941
}

LIB['year'] = LIB['title'].map(publication_years)

# add goodreads rating
goodreads = {
    'TheVoyageOut': 3.75,
    'NightAndDay': 3.75,
    'JacobsRoom': 3.69,
    'MrsDalloway': 3.73,
    'ToTheLighthouse': 3.78,
    'Orlando': 3.86,
    'TheWaves': 4.15,
    'Flush': 3.87,
    'TheYears': 3.77,
    'BetweenTheActs': 3.61
}

LIB['goodreads'] = LIB['title'].map(goodreads)

In [9]:
LIB

Unnamed: 0_level_0,source_file_path,title,chap_regex,year,goodreads
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BetweenTheActs,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,BetweenTheActs,^###CHAPTER###$,1941,3.61
Flush,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,Flush,^(CHAPTER\s+[A-Z]+)\s*$,1933,3.87
JacobsRoom,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,JacobsRoom,^CHAPTER\s+[A-Z]+\s*$,1922,3.69
MrsDalloway,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,MrsDalloway,^###CHAPTER###$,1925,3.73
NightAndDay,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,NightAndDay,^CHAPTER\s+[IVXLCDM]+\s*$,1919,3.75
Orlando,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,Orlando,^CHAPTER\s+\d+\.\s*$,1928,3.86
TheVoyageOut,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,TheVoyageOut,^Chapter\s+[IVXLCDM]+\s*$,1915,3.75
TheWaves,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,TheWaves,^###CHAPTER###$,1931,4.15
TheYears,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,TheYears,^\s*(18|19)\d{2}\s*$,1937,3.77
ToTheLighthouse,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,ToTheLighthouse,^\s*\d+\s*$,1927,3.78


In [10]:
# calculate lengths of each document in characters
length = LIB['source_file_path'].apply(lambda path: len(open(path, 'r', encoding='utf-8').read()))

# find average length
length.mean()

482375.0

In [9]:
# write to csv
LIB.to_csv("/Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/output/lib.csv")

## Creating CORPUS table

In [10]:
# creating chapter markers for books with sections divided by multiple blank lines
def insert_chapter_markers_exact(file_path):
    # read in books
    with open(file_path, encoding='utf-8') as f:
        text = f.read()

    # split text into two parts: before and after "THE START"
    start_match = re.search(r'(?m)^THE START\s*$', text)
    if not start_match:
        print(f"Chapter markers already applied in {file_path}")
        return "already applied"
    
    start_idx = start_match.end()
    header = text[:start_idx]
    body = text[start_idx:]

    # insert chapter marker immediately after "THE START"
    body = re.sub(r'^(\s*)', r'###CHAPTER###\n\1', body, count=1)

    # replace exactly 5 blank lines with chapter marker
    five_blank_pattern = r'(?m)(?:^[ \t]*\r?\n){5}(?=^[^\s])'
    body = re.sub(five_blank_pattern, '\n###CHAPTER###\n', body)

    # write back to file
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(header + body)

    # confirm with print statement
    print(f"✅ Inserted chapter markers after 'THE START' and 5 blank lines in {file_path}")


# apply to the books that need it
target_books = ['BetweenTheActs', 'MrsDalloway', 'TheWaves']
for book_id in target_books:
    file_path = LIB.loc[book_id].source_file_path
    insert_chapter_markers_exact(file_path)
    LIB.at[book_id, 'chap_regex'] = r'^###CHAPTER###$'

Chapter markers already applied in /Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/woolf_novels/utf8/BetweenTheActs.utf8.txt
Chapter markers already applied in /Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/woolf_novels/utf8/MrsDalloway.utf8.txt
Chapter markers already applied in /Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/woolf_novels/utf8/TheWaves.utf8.txt


In [11]:
# tokenizing function
def tokenize_collection(LIB):
    clip_pats = [
    r"(?m)^THE START\s*$",
    r"(?m)^THE END\s*$"
    ]
    
    books = []
    for book_id in LIB.index:
        try:
            print(f"Tokenizing {book_id} {LIB.loc[book_id].title}")
            
            chap_regex = LIB.loc[book_id].chap_regex
            ohco_pats = [('chap', chap_regex, 'm')]
            src_file_path = LIB.loc[book_id].source_file_path

            text = TextParser(src_file_path, ohco_pats=ohco_pats, clip_pats=clip_pats, use_nltk=True)
            text.verbose = True
            text.strip_hyphens = True
            text.strip_whitespace = True

            # debug: check if chapter regex is matching anything
            with open(src_file_path, 'r', encoding='utf-8') as f:
                lines = f.readlines()
            matching_lines = pd.DataFrame({'line': [line.strip() for line in lines]})
            num_matches = matching_lines["line"].str.contains(chap_regex, regex=True).sum()
            print(f"Found {num_matches} matching chapter headings for {book_id}")

            text.import_source().parse_tokens()
            text.TOKENS['book_id'] = book_id
            text.TOKENS = text.TOKENS.reset_index().set_index(['book_id'] + text.OHCO)
            books.append(text.TOKENS)

        except Exception as e:
            print(f"\n Failed on {book_id}: {LIB.loc[book_id].title}")
            print(f"Error: {e}\n")
    
    CORPUS = pd.concat(books).sort_index()
    print("Done")
    return CORPUS

In [12]:
CORPUS = tokenize_collection(LIB)

Tokenizing BetweenTheActs BetweenTheActs
Found 36 matching chapter headings for BetweenTheActs
Importing  /Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/woolf_novels/utf8/BetweenTheActs.utf8.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^###CHAPTER###$
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK sentence tokenizer
Parsing OHCO level 3 token_num by NLTK tokenization
Tokenizing Flush Flush
Found 6 matching chapter headings for Flush
Importing  /Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/woolf_novels/utf8/Flush.utf8.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^(CHAPTER\s+[A-Z]+)\s*$
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK sentence tokenizer
Parsing OHCO level 3 token_num by NLTK tokenization


  num_matches = matching_lines["line"].str.contains(chap_regex, regex=True).sum()
  div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)


Tokenizing JacobsRoom JacobsRoom
Found 14 matching chapter headings for JacobsRoom
Importing  /Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/woolf_novels/utf8/JacobsRoom.utf8.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^CHAPTER\s+[A-Z]+\s*$
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK sentence tokenizer
Parsing OHCO level 3 token_num by NLTK tokenization
Tokenizing MrsDalloway MrsDalloway
Found 10 matching chapter headings for MrsDalloway
Importing  /Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/woolf_novels/utf8/MrsDalloway.utf8.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^###CHAPTER###$
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK sentence tokenizer
Parsing OHCO level 3 token_num by NLTK tokenization
Tokenizing NightAndDay NightAndDay
Found 34 matching chapter headings for NightAndDay
Importing  /Users/lucyshichman/Documents/MSDS/DS

  num_matches = matching_lines["line"].str.contains(chap_regex, regex=True).sum()
  div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)


Tokenizing ToTheLighthouse ToTheLighthouse
Found 43 matching chapter headings for ToTheLighthouse
Importing  /Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/woolf_novels/utf8/ToTheLighthouse.utf8.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^\s*\d+\s*$
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK sentence tokenizer
Parsing OHCO level 3 token_num by NLTK tokenization
Done


In [13]:
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BetweenTheActs,1,0,0,0,"(It, PRP)",PRP,It,it
BetweenTheActs,1,0,0,1,"(was, VBD)",VBD,was,was
BetweenTheActs,1,0,0,2,"(a, DT)",DT,a,a
BetweenTheActs,1,0,0,3,"(summer's, JJ)",JJ,summer's,summers
BetweenTheActs,1,0,0,4,"(night, NN)",NN,night,night
...,...,...,...,...,...,...,...,...
ToTheLighthouse,43,3,9,10,"(I, PRP)",PRP,I,i
ToTheLighthouse,43,3,9,11,"(have, VBP)",VBP,have,have
ToTheLighthouse,43,3,9,12,"(had, VBN)",VBN,had,had
ToTheLighthouse,43,3,9,13,"(my, PRP$)",PRP$,my,my


In [14]:
CORPUS.groupby('book_id').size()

book_id
BetweenTheActs      47059
Flush               34610
JacobsRoom          55494
MrsDalloway         64389
NightAndDay        168036
Orlando             79225
TheVoyageOut       137843
TheWaves            78067
TheYears           130731
ToTheLighthouse     69913
dtype: int64

In [15]:
# add POS group (per assignment specifications)
CORPUS['pos_group'] = CORPUS.pos.str[:2]

In [16]:
CORPUS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BetweenTheActs,1,0,0,0,"(It, PRP)",PRP,It,it,PR
BetweenTheActs,1,0,0,1,"(was, VBD)",VBD,was,was,VB
BetweenTheActs,1,0,0,2,"(a, DT)",DT,a,a,DT
BetweenTheActs,1,0,0,3,"(summer's, JJ)",JJ,summer's,summers,JJ
BetweenTheActs,1,0,0,4,"(night, NN)",NN,night,night,NN


In [17]:
# removing anamolies
CORPUS = CORPUS[CORPUS.term_str != '']

In [18]:
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str,pos_group
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
BetweenTheActs,1,0,0,0,"(It, PRP)",PRP,It,it,PR
BetweenTheActs,1,0,0,1,"(was, VBD)",VBD,was,was,VB
BetweenTheActs,1,0,0,2,"(a, DT)",DT,a,a,DT
BetweenTheActs,1,0,0,3,"(summer's, JJ)",JJ,summer's,summers,JJ
BetweenTheActs,1,0,0,4,"(night, NN)",NN,night,night,NN
...,...,...,...,...,...,...,...,...,...
ToTheLighthouse,43,3,9,10,"(I, PRP)",PRP,I,i,PR
ToTheLighthouse,43,3,9,11,"(have, VBP)",VBP,have,have,VB
ToTheLighthouse,43,3,9,12,"(had, VBN)",VBN,had,had,VB
ToTheLighthouse,43,3,9,13,"(my, PRP$)",PRP$,my,my,PR


In [19]:
# write to csv
CORPUS.to_csv("/Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/output/corpus.csv")

## Creating VOCAB table

In [41]:
# building vocab table
VOCAB = CORPUS.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'

# length
VOCAB['n_chars'] = VOCAB.index.str.len()

# p and i
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)

# max_pos and pax_pos_group
VOCAB['max_pos'] = CORPUS[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)
VOCAB['max_pos_group'] = CORPUS[['term_str','pos_group']].value_counts().unstack(fill_value=0).idxmax(1)

In [42]:
# adding porter stems
from nltk.stem.porter import PorterStemmer
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = VOCAB.apply(lambda x: stemmer1.stem(x.name), 1)

# adding stopwords
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1

VOCAB['stop'] = VOCAB.index.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')

In [43]:
VOCAB.sample(10)

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,stem_porter,stop
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
driver,14,6,1.6e-05,15.908146,NN,NN,driver,0
snowing,1,7,1e-06,19.715501,VBG,VB,snow,0
unapprehended,1,13,1e-06,19.715501,JJ,JJ,unapprehend,0
amuse,8,5,9e-06,16.715501,VB,VB,amus,0
mumble,1,6,1e-06,19.715501,JJ,JJ,mumbl,0
arnold,4,6,5e-06,17.715501,NNP,NN,arnold,0
bawling,5,7,6e-06,17.393573,VBG,VB,bawl,0
wisher,1,6,1e-06,19.715501,NN,NN,wisher,0
massively,1,9,1e-06,19.715501,RB,RB,massiv,0
because,444,7,0.000516,10.921085,IN,IN,becaus,1


In [45]:
# write to csv
VOCAB.to_csv("/Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/output/vocab.csv")

In [46]:
VOCAB

Unnamed: 0_level_0,n,n_chars,p,i,max_pos,max_pos_group,stem_porter,stop
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,4,1,0.000005,17.715501,NN,NN,1,0
10,2,2,0.000002,18.715501,JJ,NN,10,0
1030,1,4,0.000001,19.715501,CD,CD,1030,0
10th,1,4,0.000001,19.715501,CD,CD,10th,0
112,1,3,0.000001,19.715501,CD,CD,112,0
...,...,...,...,...,...,...,...,...
zwinglers,2,9,0.000002,18.715501,NNP,NN,zwingler,0
à,1,1,0.000001,19.715501,NN,NN,à,0
éclair,2,6,0.000002,18.715501,NN,NN,éclair,0
éclairs,1,7,0.000001,19.715501,NNP,NN,éclair,0
