In [1]:
import re

class RowClassifier:
	"""Label a line of a book text file using indentation-based regex rules.

	Each rule in ``self.classifications`` is a dict with:

	* ``"class"``     -- the label returned when the rule matches;
	* ``"currRegex"`` -- pattern applied with ``re.match`` to the current row;
	  its first capture group is the extracted value;
	* ``"prevRegex"`` -- optional pattern applied with ``re.match`` to the
	  previous row; when the key is absent the previous row is not consulted.

	Rules are tried in list order and the first match wins. Order matters:
	e.g. the TEXT pattern (five leading spaces) also matches rows indented by
	ten or more spaces, so OTHER has to be tried before it.
	"""

	def __init__(self):
		# NOTE(review): the "prevRegex" values r"", r"(| {10,12}.*)" and
		# r"(| {5}.*)" can all match the empty string, and re.match only
		# anchors at the start of the row, so they accept every previous row.
		# Only CHAPTER_NAME actually constrains the previous row. Confirm
		# whether a full-line match was intended before tightening them.
		self.classifications = [
			{"class": "BOOK", "currRegex": r" {33}Book (\w+)", "prevRegex": r""},
			{"class": "CHAPTER", "currRegex": r" {27}_Chapter (\d+)_"},
			{"class": "CHAPTER_NAME", "currRegex": r" {12}(.+)", "prevRegex": r" {27}_Chapter (\d+)_"},
			{"class": "OTHER", "currRegex": r" {10,12}(.*)", "prevRegex": r"(| {10,12}.*)"},
			{"class": "TEXT", "currRegex": r" {5}(.*)", "prevRegex": r"(| {5}.*)"},
		]

	def classify_row(self, prev_row, curr_row):
		"""Return the ``(label, value)`` pair for ``curr_row``.

		Args:
			prev_row: The row preceding ``curr_row``. ``None`` (no previous
				row) is treated as an empty string.
			curr_row: The row to classify.

		Returns:
			The ``"class"`` of the first matching rule together with the
			stripped first capture group of its ``"currRegex"``, or
			``("UNKNOWN", curr_row.strip())`` when no rule matches.
		"""
		# re.match rejects None, so normalise a missing previous row up front.
		if prev_row is None:
			prev_row = ""

		for rule in self.classifications:
			prev_pattern = rule.get("prevRegex")
			if prev_pattern is not None and not re.match(prev_pattern, prev_row):
				continue
			# Match once and reuse the result for the captured value.
			found = re.match(rule["currRegex"], curr_row)
			if found:
				return rule["class"], found.group(1).strip()
		return "UNKNOWN", curr_row.strip()

def read_book(input):
	"""Read a book text file and label each of its non-blank lines.

	The first line of the file is always recorded as ``'TITLE'``. Every later
	non-blank line is classified with ``RowClassifier.classify_row`` against
	the line that physically precedes it (blank lines included). Consecutive
	``'OTHER'`` results are merged into a single value joined by one space;
	blank lines in between do not break the merge because they add no entry.

	Args:
		input: Path of the text file to read.

	Returns:
		A ``(classifications, values)`` pair of parallel lists. Both lists
		are empty when the file contains no lines.
	"""
	with open(input, "r") as file:
		lines = file.readlines()

	# An empty file has no title line to index, so return empty results.
	if not lines:
		return [], []

	classifier = RowClassifier()

	title = lines[0].strip()
	classifications = ['TITLE']
	values = [title]
	prev_row = title

	for curr_row in lines[1:]:
		if not curr_row.strip():
			# Blank lines produce no entry but still count as the previous row.
			prev_row = curr_row
			continue
		classification, value = classifier.classify_row(prev_row, curr_row)
		if classifications[-1] == classification == 'OTHER':
			values[-1] += ' ' + value
		else:
			classifications.append(classification)
			values.append(value)
		prev_row = curr_row

	return classifications, values


In [2]:
def write_to_file(output, classifications, values):
	"""Dump the labelled rows to ``output`` as ``<classification>: <value>`` lines.

	Args:
		output: Destination path; the file is created or truncated.
		classifications: Labels, one per output line.
		values: Values looked up by the same index as ``classifications``.
	"""
	with open(output, "w") as sink:
		sink.writelines(
			f"{label}: {values[position]}\n"
			for position, label in enumerate(classifications)
		)

In [3]:
def process_book(input):
	"""Turn a book text file into one record per ``'TEXT'`` row.

	The labelled rows returned by ``read_book`` are walked in order. TITLE,
	BOOK, CHAPTER and CHAPTER_NAME rows update the running context; each TEXT
	row emits a record stamped with the context current at that point; OTHER
	rows are appended to the ``"text_other"`` field of the latest record.
	Rows with any other label (e.g. ``'UNKNOWN'``) are ignored.

	Args:
		input: Path of the text file, passed through to ``read_book``.

	Returns:
		A list of dicts with the keys ``"title"``, ``"book"``, ``"chapter"``,
		``"chapter_name"``, ``"text"`` and ``"text_other"``.
	"""
	classifications, values = read_book(input)

	title = ''
	book = ''
	chapter = -1  # placeholder until the first 'CHAPTER' row is seen
	chapter_name = ''

	rows = []

	for clazz, value in zip(classifications, values):
		if clazz == 'TITLE':
			title = value
		elif clazz == 'BOOK':
			book = value
		elif clazz == 'CHAPTER':
			chapter = value
		elif clazz == 'CHAPTER_NAME':
			chapter_name = value
		elif clazz == 'TEXT':
			rows.append({
				"title": title,
				"book": book,
				"chapter": chapter,
				"chapter_name": chapter_name,
				"text": value,
				"text_other": ''
			})
		elif clazz == 'OTHER':
			# An 'OTHER' row that shows up before any 'TEXT' row has no record
			# to attach to; skip it instead of failing on rows[-1].
			if rows:
				rows[-1]["text_other"] += value

	return rows


In [4]:
import csv
def write_verbatims(output, verbatims):
	"""Write the records to ``output`` as CSV with a header row.

	Column names and their order are taken from the keys of the first record.

	Args:
		output: Destination path; the file is created or truncated.
		verbatims: Sequence of dicts to write. When it is empty the file is
			left empty, since there is no record to derive a header from.

	Raises:
		ValueError: Raised by ``csv.DictWriter`` when a later record contains
			a key that the first record does not have.
	"""
	with open(output, mode="w", newline="") as file:
		if not verbatims:
			return

		writer = csv.DictWriter(file, fieldnames=list(verbatims[0].keys()))
		writer.writeheader()
		writer.writerows(verbatims)

def write_text(output, verbatims):
	"""Write the ``"text"`` field of every record to ``output``, one per line.

	Lines are separated by ``"\\n"``; no newline follows the last one.

	Args:
		output: Destination path; the file is created or truncated.
		verbatims: Iterable of dicts that each have a ``"text"`` key.
	"""
	with open(output, 'w') as file:
		# write() emits the joined string in one call; passing a str to
		# writelines() would iterate it and write one character at a time.
		file.write("\n".join(r["text"] for r in verbatims))

In [5]:
from enrichments import characters_enrichment, location_enrichment, artifact_enrichment

# Enrichment callables, listed in the order they are applied to each row.
enrichments = [characters_enrichment, location_enrichment, artifact_enrichment]
def enrich_verbatims(verbatims):
	"""Run every callable in ``enrichments`` on every record, in list order.

	Args:
		verbatims: Iterable of records to enrich.

	Returns:
		The same ``verbatims`` object that was passed in.
	"""
	for record in verbatims:
		# Return values are discarded -- presumably each enrichment mutates
		# the record in place; confirm against the enrichments module.
		for apply_enrichment in enrichments:
			apply_enrichment(record)
	return verbatims

In [6]:
# Book 1: parse the text, enrich every row, then export the rows as CSV.
rows = enrich_verbatims(process_book('data/01.txt'))
write_verbatims('output/1.csv', rows)

In [7]:
# Book 2: parse the text, enrich every row, then export the rows as CSV.
rows = enrich_verbatims(process_book('data/02.txt'))
write_verbatims('output/2.csv', rows)

In [8]:
# Book 3: parse the text, enrich every row, then export the rows as CSV.
rows = enrich_verbatims(process_book('data/03.txt'))
write_verbatims('output/3.csv', rows)