# Imports

In [1]:
import string
import json
import os

from collections import Counter, OrderedDict
from number_parser import parse_ordinal

# Process Functions

In [2]:
def remove_punctuation(text):
    """Return *text* with ASCII punctuation stripped, keeping hyphens.

    Every character of ``string.punctuation`` except ``'-'`` is deleted;
    all other characters (including whitespace) pass through unchanged.
    """
    # Mapping a code point to None tells str.translate to drop it.
    deletions = {ord(ch): None for ch in string.punctuation if ch != '-'}
    return text.translate(deletions)

In [3]:
def remove_next_line(text):
    """Return *text* with every newline character turned into a single space."""
    # Splitting on "\n" and re-joining with " " swaps each newline one-for-one.
    segments = text.split("\n")
    return " ".join(segments)

In [4]:
def extract_numbers(text):
    """Extract the numbers mentioned in *text*, one entry per matching word.

    The text is split on whitespace and each word is examined on its own:

    * plural magnitude words ("tens", "hundreds", ...) map to a fixed value
      from the table below;
    * every other word is handed to ``parse_ordinal(word, language="en")``.

    Words that yield no number are skipped. The result keeps the order in
    which the words appear and may contain duplicates.

    NOTE(review): this assumes ``parse_ordinal`` returns an ``int`` for a
    recognised word and ``None`` otherwise - confirm against the
    number_parser documentation. Under that assumption a parsed value of 0
    is kept rather than being discarded as falsy.
    """
    # Plural magnitude words are resolved here instead of by the parser.
    magnitude_words = {
        "tens": 10,
        "hundreds": 100,
        "thousands": 1000,
        "millions": 1000000,
        "billions": 1000000000,
    }

    numbers = []
    for word in text.split():
        if word in magnitude_words:
            # The table takes precedence, so the parser call is not needed.
            num = magnitude_words[word]
        else:
            num = parse_ordinal(word, language="en")
        # Compare against None explicitly so that a parsed 0 is not dropped.
        if num is not None:
            numbers.append(num)
    return numbers

# Process texts

In [5]:
creation_myths_path = "./texts"
data_path = "./data.json"

# For every entry in data["creation_myths"]:
#   1. read "<title>.txt" from creation_myths_path,
#   2. normalise the text (strip punctuation, replace newlines, lowercase),
#   3. store a sorted number -> occurrence-count mapping under
#      entry["stats"]["numbers"],
#   4. overwrite the text file with the normalised text.
# The updated JSON document is written back to data_path once at the end.
#
# The JSON file is opened with 'r+' inside a context manager so the handle is
# closed even if processing one of the texts raises.
with open(data_path, 'r+') as data_json:
    data = json.load(data_json)

    for creation_myth in data["creation_myths"]:
        filename = creation_myth["title"] + ".txt"
        text_path = os.path.join(creation_myths_path, filename)

        # Close the read handle before the same file is reopened for writing.
        with open(text_path, 'r') as file:
            text = file.read()

        # Remove punctuation
        text = remove_punctuation(text)
        # Remove "\n" used for new lines
        text = remove_next_line(text)
        # Converts text to lowercase
        text = text.lower()

        numbers = extract_numbers(text)
        # Sort by number so the stored mapping has a stable, readable order.
        creation_myth["stats"] = {"numbers": OrderedDict(sorted(Counter(numbers).items()))}

        with open(text_path, 'w') as file:
            file.write(text)

    # Rewrite the document in place: rewind, dump, then cut off any bytes
    # left over from a longer previous version of the file.
    data_json.seek(0)
    json.dump(data, data_json, indent=4)
    data_json.truncate()