# Parsing and Annotating Data

Parsing the raw data into the three core tables of your addition: the LIB, CORPUS, and VOCAB tables.

These tables will be stored as CSV files with header rows.

In [1]:
# importing libraries
import pandas as pd
import numpy as np
from glob import glob
import re
import nltk
import configparser
import os

In [2]:
import sys
sys.path.append("/Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/analysis")

In [3]:
# importing parser module
from textparser import TextParser

In [4]:
# read in data
source_files = "/Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/woolf_novels/utf8"

# define OHCO
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']

In [5]:
# removing boiler plates
clip_pats = [
    r"(?m)^THE START\s*$",
    r"(?m)^THE END\s*$"
]

# chunk by chapter

ohco_pat_list = [
    ('BetweenTheActs', r'^###CHAPTER###$'),  # annotation for 5 blank lines
    ('Flush', r'^(CHAPTER\s+[A-Z]+)\s*$'), # CHAPTER X (blank line) chapter name
    ('JacobsRoom', r'^CHAPTER\s+[A-Z]+\s*$'), # CHAPTER X
    ('MrsDalloway', r'^###CHAPTER###$'),  # annotation for 5 blank lines
    ('NightAndDay', r'^CHAPTER\s+[IVXLCDM]+\s*$'),# CHAPTER ? (roman numeral)
    ('Orlando', r'^CHAPTER\s+\d+\.\s*$'), # CHAPTER X. 
    ('TheVoyageOut', r'^Chapter\s+[IVXLCDM]+\s*$'), # Chapter ? (roman numeral)
    ('TheWaves', r'^###CHAPTER###$'),  # annotation for 5 blank lines
    ('TheYears', r'^\s*(18|19)\d{2}\s*$'), # blank line, year, blank line
    ('ToTheLighthouse', r'^\s*\d+\s*$'), # blank line, number, blank line
]

In [6]:
# register each file to a library
source_file_list = sorted(glob(f"{source_files}/*.*"))

book_data = []
for source_file_path in source_file_list:
    book_id = source_file_path.split('/')[-1].replace('.utf8.txt', '')
    book_title = source_file_path.split('/')[-1].replace('.utf8.txt', '')
    book_data.append((book_id, source_file_path, book_title))

In [7]:
# create LIB table
LIB = pd.DataFrame(book_data, columns=['book_id','source_file_path','title'])\
    .set_index('book_id').sort_index()

# add chapter regexes
LIB['chap_regex'] = LIB.index.map(pd.Series({x[0]:x[1] for x in ohco_pat_list}))

# add publication year
publication_years = {
    'TheVoyageOut': 1915,
    'NightAndDay': 1919,
    'JacobsRoom': 1922,
    'MrsDalloway': 1925,
    'ToTheLighthouse': 1927,
    'Orlando': 1928,
    'TheWaves': 1931,
    'Flush': 1933,
    'TheYears': 1937,
    'BetweenTheActs': 1941
}

LIB['year'] = LIB['title'].map(publication_years)

In [8]:
LIB

Unnamed: 0_level_0,source_file_path,title,chap_regex,year
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BetweenTheActs,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,BetweenTheActs,^###CHAPTER###$,1941
Flush,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,Flush,^(CHAPTER\s+[A-Z]+)\s*$,1933
JacobsRoom,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,JacobsRoom,^CHAPTER\s+[A-Z]+\s*$,1922
MrsDalloway,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,MrsDalloway,^###CHAPTER###$,1925
NightAndDay,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,NightAndDay,^CHAPTER\s+[IVXLCDM]+\s*$,1919
Orlando,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,Orlando,^CHAPTER\s+\d+\.\s*$,1928
TheVoyageOut,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,TheVoyageOut,^Chapter\s+[IVXLCDM]+\s*$,1915
TheWaves,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,TheWaves,^###CHAPTER###$,1931
TheYears,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,TheYears,^\s*(18|19)\d{2}\s*$,1937
ToTheLighthouse,/Users/lucyshichman/Documents/MSDS/DS5001/fina...,ToTheLighthouse,^\s*\d+\s*$,1927


In [20]:
# write to csv
LIB.to_csv("/Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/output/lib.csv")

In [9]:
# creating chapter markers for books with sections divided by multiple blank lines
def insert_chapter_markers_exact(file_path):
    # read in books
    with open(file_path, encoding='utf-8') as f:
        text = f.read()

    # split text into two parts: before and after "THE START"
    start_match = re.search(r'(?m)^THE START\s*$', text)
    
    start_idx = start_match.end()
    header = text[:start_idx]
    body = text[start_idx:]

    # insert chapter marker immediately after "THE START"
    body = re.sub(r'^(\s*)', r'###CHAPTER###\n\1', body, count=1)

    # replace exactly 5 blank lines with chapter marker
    five_blank_pattern = r'(?m)(?:^[ \t]*\r?\n){5}(?=^[^\s])'
    body = re.sub(five_blank_pattern, '\n###CHAPTER###\n', body)

    # write back to file
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(header + body)

    # confirm with print statement
    print(f"✅ Inserted chapter markers after 'THE START' and 5 blank lines in {file_path}")


# apply to the books that need it
target_books = ['BetweenTheActs', 'MrsDalloway', 'TheWaves']
for book_id in target_books:
    file_path = LIB.loc[book_id].source_file_path
    insert_chapter_markers_exact(file_path)
    LIB.at[book_id, 'chap_regex'] = r'^###CHAPTER###$'

AttributeError: 'NoneType' object has no attribute 'end'

^ returns error if ran already

In [10]:
# tokenizing function
def tokenize_collection(LIB):
    clip_pats = [
    r"(?m)^THE START\s*$",
    r"(?m)^THE END\s*$"
    ]
    
    books = []
    for book_id in LIB.index:
        try:
            print(f"Tokenizing {book_id} {LIB.loc[book_id].title}")
            
            chap_regex = LIB.loc[book_id].chap_regex
            ohco_pats = [('chap', chap_regex, 'm')]
            src_file_path = LIB.loc[book_id].source_file_path

            text = TextParser(src_file_path, ohco_pats=ohco_pats, clip_pats=clip_pats, use_nltk=True)
            text.verbose = True
            text.strip_hyphens = True
            text.strip_whitespace = True

            # debug: check if chapter regex is matching anything
            with open(src_file_path, 'r', encoding='utf-8') as f:
                lines = f.readlines()
            matching_lines = pd.DataFrame({'line': [line.strip() for line in lines]})
            num_matches = matching_lines["line"].str.contains(chap_regex, regex=True).sum()
            print(f"Found {num_matches} matching chapter headings for {book_id}")

            text.import_source().parse_tokens()
            text.TOKENS['book_id'] = book_id
            text.TOKENS = text.TOKENS.reset_index().set_index(['book_id'] + text.OHCO)
            books.append(text.TOKENS)

        except Exception as e:
            print(f"\n Failed on {book_id}: {LIB.loc[book_id].title}")
            print(f"Error: {e}\n")
    
    CORPUS = pd.concat(books).sort_index()
    print("✅ Done")
    return CORPUS

In [11]:
CORPUS = tokenize_collection(LIB)

Tokenizing BetweenTheActs BetweenTheActs
Found 36 matching chapter headings for BetweenTheActs
Importing  /Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/woolf_novels/utf8/BetweenTheActs.utf8.txt
Clipping text

 Failed on BetweenTheActs: BetweenTheActs
Error: Clip start pattern not found.

Tokenizing Flush Flush
Found 6 matching chapter headings for Flush
Importing  /Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/woolf_novels/utf8/Flush.utf8.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^(CHAPTER\s+[A-Z]+)\s*$
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK sentence tokenizer
Parsing OHCO level 3 token_num by NLTK tokenization


  num_matches = matching_lines["line"].str.contains(chap_regex, regex=True).sum()
  div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)


Tokenizing JacobsRoom JacobsRoom
Found 14 matching chapter headings for JacobsRoom
Importing  /Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/woolf_novels/utf8/JacobsRoom.utf8.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^CHAPTER\s+[A-Z]+\s*$
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK sentence tokenizer
Parsing OHCO level 3 token_num by NLTK tokenization
Tokenizing MrsDalloway MrsDalloway
Found 10 matching chapter headings for MrsDalloway
Importing  /Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/woolf_novels/utf8/MrsDalloway.utf8.txt
Clipping text

 Failed on MrsDalloway: MrsDalloway
Error: Clip start pattern not found.

Tokenizing NightAndDay NightAndDay
Found 34 matching chapter headings for NightAndDay
Importing  /Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/woolf_novels/utf8/NightAndDay.utf8.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^CHAPTER\s+[IVXLC

  num_matches = matching_lines["line"].str.contains(chap_regex, regex=True).sum()
  div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)


Parsing OHCO level 3 token_num by NLTK tokenization
Tokenizing ToTheLighthouse ToTheLighthouse
Found 43 matching chapter headings for ToTheLighthouse
Importing  /Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/woolf_novels/utf8/ToTheLighthouse.utf8.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^\s*\d+\s*$
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK sentence tokenizer
Parsing OHCO level 3 token_num by NLTK tokenization
✅ Done


In [12]:
CORPUS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos_tuple,pos,token_str,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Flush,1,0,0,0,"(Three, CD)",CD,Three,three
Flush,1,0,0,1,"(Mile, NNP)",NNP,Mile,mile
Flush,1,0,0,2,"(Cross, NNP)",NNP,Cross,cross
Flush,1,1,0,0,"(It, PRP)",PRP,It,it
Flush,1,1,0,1,"(is, VBZ)",VBZ,is,is
...,...,...,...,...,...,...,...,...
ToTheLighthouse,43,3,9,10,"(I, PRP)",PRP,I,i
ToTheLighthouse,43,3,9,11,"(have, VBP)",VBP,have,have
ToTheLighthouse,43,3,9,12,"(had, VBN)",VBN,had,had
ToTheLighthouse,43,3,9,13,"(my, PRP$)",PRP$,my,my


In [13]:
CORPUS.groupby('book_id').size()

book_id
Flush               34610
JacobsRoom          55494
NightAndDay        168036
Orlando             79225
TheVoyageOut       137843
TheYears           130731
ToTheLighthouse     69913
dtype: int64

In [14]:
# write to csv
CORPUS.to_csv("/Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/output/corpus.csv")

In [15]:
# creating VOCAB table

# handling anomalies
CORPUS[CORPUS.term_str == ''].token_str.value_counts()

token_str
"        439
."       215
".        81
..."      34
?"        31
....       8
),         4
'          4
.'         3
'"         3
&          3
!"         3
";         3
"...       3
***        2
,"         2
.'"        2
"'         2
"'.        2
,'         1
...?"      1
.)         1
?'"        1
?)         1
Name: count, dtype: int64

In [16]:
# removing empty term_str (punctuation)
CORPUS = CORPUS[CORPUS.term_str != '']

In [17]:
# building vocab table
VOCAB = CORPUS.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()

# getting max POS (most frequently associated part-of-speech for each word)
VOCAB['max_pos'] = CORPUS[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)

In [19]:
VOCAB.sample(10)

Unnamed: 0_level_0,n,n_chars,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dispensed,2,9,VBN
rejoiced,1,8,NN
sped,3,4,VBD
feels,23,5,VBZ
inky,2,4,VB
insect,14,6,NN
happening,23,9,VBG
industries,2,10,NNS
blaring,1,7,NN
process,38,7,NN


In [21]:
# write to csv
VOCAB.to_csv("/Users/lucyshichman/Documents/MSDS/DS5001/final_project/woolf2vec/output/vocab.csv")