- text has been re-OCRed
- text has been split into context/list to maybe make it easier to parse and analyze the contexts


for topic modelling: use the whole document or just the segment?

maybe check whether any long-s variants (facſ etc.) occur in the texts but are not found by the search? make sure all instances are found

In [3]:
import os
import re
import csv

# Search terms for the fact/fiction word families. "facſ" and "ficſions" contain the
# long s (ſ) that appears in the OCR'd period texts.
# NOTE(review): "factious" relates to "faction" rather than "fact" — confirm it is
# intended here (perhaps "factitious" was meant?).
keywords = ["fact", "fiction", "facts", "facſ", "fictions", "ficſions", "factual", "fictional", "fictionally", "factually", "fictionality", "factuality", "fictionalized", "factualized", "fictive", "factive", "fictitious", "factious"]

def extract_text_from_txt(txt_path):
    """Read the UTF-8 text file at ``txt_path`` and return its whole content as one string."""
    with open(txt_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def find_contexts(text, keywords, context_size=1000, avg_line_length=80):
    """Collect the lines surrounding every whole-word keyword hit in ``text``.

    Matching is case-insensitive and done line by line, so a keyword that is
    split across a line break is not found. Every hit yields its own entry:
    a line containing a keyword twice produces two identical contexts.

    Args:
        text: Full document text; it is split on newline characters.
        keywords: Iterable of keyword strings. Surrounding whitespace is
            stripped; keywords that are empty after stripping are ignored.
        context_size: Approximate number of characters of context wanted on
            each side of the matching line.
        avg_line_length: Assumed number of characters per line, used to turn
            ``context_size`` into a number of lines.

    Returns:
        A list of ``(keyword, context)`` tuples, grouped by keyword in the
        order given and, within one keyword, in document order. ``context``
        is the matching line plus up to ``context_size // avg_line_length``
        lines before and after it, joined with newlines so the original line
        breaks are preserved.

    Raises:
        ValueError: If ``avg_line_length`` is not positive.
    """
    if avg_line_length <= 0:
        raise ValueError("avg_line_length must be a positive integer")

    lines = text.split('\n')
    # Number of lines to include on each side of a matching line.
    radius = context_size // avg_line_length
    contexts = []
    for raw_keyword in keywords:
        keyword = raw_keyword.strip()
        if not keyword:
            # An empty keyword would reduce the pattern to bare word boundaries,
            # which "match" on almost every line.
            continue
        # Compiled once per keyword instead of once per line.
        pattern = re.compile(
            r'\b' + re.escape(keyword) + r'\b[\s\.:;,!]*',
            flags=re.IGNORECASE,
        )
        for i, line in enumerate(lines):
            hits = sum(1 for _ in pattern.finditer(line))
            if not hits:
                continue
            # Keep whole lines so the original formatting stays readable.
            start_line = max(0, i - radius)
            end_line = min(len(lines), i + 1 + radius)
            context = '\n'.join(lines[start_line:end_line])
            contexts.extend([(keyword, context)] * hits)
    return contexts

def save_contexts_to_csv(contexts, output_path):
    """Write extracted context rows to ``output_path`` as a UTF-8 CSV file.

    Args:
        contexts: Iterable of rows, each a sequence of
            ``(filename, keyword, context, author, title, date)``.
        output_path: Destination CSV path. Missing parent directories are
            created; an existing file is overwritten.

    The file starts with the header row
    ``Filename, Keyword, Context, Author, Title, Date``.
    """
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        # Avoid FileNotFoundError when the output folder has not been created yet.
        os.makedirs(parent_dir, exist_ok=True)
    # newline='' lets the csv module control line endings, so multi-line
    # context cells are written correctly.
    with open(output_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Filename", "Keyword", "Context", "Author", "Title", "Date"])
        writer.writerows(contexts)

def load_metadata_csv(metadata_path):
    """Load a metadata CSV into a dictionary keyed by file name.

    Args:
        metadata_path: Path to a CSV file with a header row.

    Returns:
        A dict mapping a file name to that row's ``{column: value}`` dict.
        The key is the row's ``filename`` column when the CSV has one;
        otherwise it is the last ``/``-separated segment of the ``pdf_link``
        column (an empty string when that column is missing or empty).
        Later rows overwrite earlier rows that share a key.
    """
    metadata = {}
    # 'utf-8-sig' strips a leading byte-order mark if present (and otherwise
    # behaves like 'utf-8'), so the first header is read as 'filename' rather
    # than '\ufefffilename'.
    with open(metadata_path, mode='r', newline='', encoding='utf-8-sig') as file:
        reader = csv.DictReader(file)
        for row in reader:
            if 'filename' in row:
                file_location = row['filename']
            else:
                # Fallback for Royal Society format; `or ''` guards against
                # short rows, where DictReader fills missing cells with None.
                file_location = (row.get('pdf_link') or '').split('/')[-1]
            metadata[file_location] = row
    return metadata

def process_texts(folder_path, output_csv, keywords, metadata):
    """Extract keyword contexts from every ``.txt`` file in a folder into one CSV.

    Args:
        folder_path: Directory containing the ``.txt`` files to search.
        output_csv: Path of the CSV file to write.
        keywords: Keywords passed on to ``find_contexts``.
        metadata: Path to the metadata CSV (loaded with ``load_metadata_csv``).

    Each text file is matched to its metadata row by file name, trying the
    ``.pdf`` variant of the name first and then the ``.txt`` name itself.
    Files that contain matches but have no metadata row are reported once and
    left out of the output. Each output row is
    ``(filename, keyword, context, author, title, date)``.
    """
    all_contexts = []
    metadata_by_file = load_metadata_csv(metadata)
    # Debug dump of the loaded metadata (kept from the original behaviour).
    print(metadata_by_file)
    # os.listdir returns names in arbitrary order; sort so the CSV row order
    # is reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        txt_path = os.path.join(folder_path, filename)
        text = extract_text_from_txt(txt_path)
        contexts = find_contexts(text, keywords)
        if not contexts:
            continue

        # The metadata lookup depends only on the file, so do it once per file
        # rather than once per extracted context.
        pdf_filename = os.path.splitext(filename)[0] + ".pdf"
        file_metadata = metadata_by_file.get(pdf_filename)
        if file_metadata is None:
            file_metadata = metadata_by_file.get(filename)
        if file_metadata is None:
            print(f"No metadata found for {filename}")
            continue

        author = file_metadata.get('author', '')
        title = file_metadata.get('title', '')
        date = file_metadata.get('date', '')
        for keyword, context in contexts:
            all_contexts.append((filename, keyword, context, author, title, date))
    save_contexts_to_csv(all_contexts, output_csv)

In [4]:
# Royal Society: extract keyword contexts from the Royal Society text files,
# attaching author/title/date from the Royal Society metadata CSV.
folder_path = "D:/Fact_fiction_corpus/texts/royal society/txt"
output_csv = "data_for_viewing/extracted_raw/extracted_contexts_from_rs_text.csv"
metadata = "D:/Fact_fiction_corpus/texts/royal society/royalsociety_metadata.csv"

process_texts(folder_path, output_csv, keywords, metadata=metadata)

{'rstl_1665_0051.pdf': {'author': 'none', 'date': '1665-12-04', 'identifier': '10.1098/rstl.1665.0051', 'language': 'EN', 'og_url': 'https://royalsocietypublishing.org/doi/10.1098/rstl.1665.0051', 'pdf_download_link': 'https://royalsocietypublishing.org/doi/pdf/10.1098/rstl.1665.0051?download=true', 'pdf_link': '/doi/epdf/10.1098/rstl.1665.0051', 'publisher': 'The Royal SocietyLondon', 'title': 'Of Monsieur de Sons progress in working parabolar glasses', 'downloaded': 'TRUE', 'filename': 'rstl_1665_0051.pdf'}, 'rstl_1665_0052.pdf': {'author': 'Monsieur Auzout', 'date': '1665-12-04', 'identifier': '10.1098/rstl.1665.0052', 'language': 'EN', 'og_url': 'https://royalsocietypublishing.org/doi/10.1098/rstl.1665.0052', 'pdf_download_link': 'https://royalsocietypublishing.org/doi/pdf/10.1098/rstl.1665.0052?download=true', 'pdf_link': '/doi/epdf/10.1098/rstl.1665.0052', 'publisher': 'The Royal SocietyLondon', 'title': "Monsieur Auzout's speculations of the changes, likely to be discovered in t

In [5]:
# General Magazine of Arts and Sciences: same extraction as for the other corpora.
folder_path = "D:/Fact_fiction_corpus/texts/General Magazine of Arts and Sciences/txt"
output_csv = "data_for_viewing/extracted_raw/extracted_contexts_from_general_magazine.csv"
metadata = "D:/Fact_fiction_corpus/texts/General Magazine of Arts and Sciences/general_magazine_metadata.csv" # the text could not be split any further, so the metadata CSV has a single row covering everything

process_texts(folder_path, output_csv, keywords, metadata=metadata)
# NOTE: this run finds no contexts at all, and a manual search of the text found nothing either.
# NOTE(review): process_texts skips every .txt file whose name (as .pdf or .txt) is not a key
# in the metadata; with a single metadata row, confirm the file names in the folder match it.

{'general_magazine.pdf': {'filename': 'general_magazine.pdf', 'author': 'Benjamin Martin', 'title': 'General Magazine of Arts and Sciences', 'date': '1755'}}


In [6]:
# Spectator: extract keyword contexts from the Spectator text files,
# attaching author/title/date from the Spectator metadata CSV.
folder_path = "D:/Fact_fiction_corpus/texts/spectator/txt"
output_csv = "data_for_viewing/extracted_raw/extracted_contexts_from_spectator.csv"
metadata = "D:/Fact_fiction_corpus/texts/spectator/spectator_metadata.csv"

process_texts(folder_path, output_csv, keywords, metadata=metadata)

{'1.txt': {'filename': '1.txt', 'title': 'No. 1', 'author': 'Addison', 'date': '1711-03-01'}, '2.txt': {'filename': '2.txt', 'title': 'No. 2', 'author': 'Steele', 'date': '1711-03-02'}, '3.txt': {'filename': '3.txt', 'title': 'No. 3', 'author': 'Addison', 'date': '1711-03-01'}, '4.txt': {'filename': '4.txt', 'title': 'No. 4', 'author': 'Steele', 'date': '1711-03-05'}, '5.txt': {'filename': '5.txt', 'title': 'No. 5', 'author': 'Addison', 'date': '1711-03-06'}, '6.txt': {'filename': '6.txt', 'title': 'No. 6', 'author': 'Steele', 'date': '1711-03-07'}, '7.txt': {'filename': '7.txt', 'title': 'No. 7', 'author': 'Addison', 'date': '1711-03-08'}, '8.txt': {'filename': '8.txt', 'title': 'No. 8', 'author': 'Addison', 'date': '1711-03-09'}, '9.txt': {'filename': '9.txt', 'title': 'No. 9', 'author': 'Addison', 'date': '1711-03-10'}, '10.txt': {'filename': '10.txt', 'title': 'No. 10', 'author': 'Addison', 'date': '1711-03-12'}, '11.txt': {'filename': '11.txt', 'title': 'No. 11', 'author': 'Steele

In [7]:
# Sanity check: print the first ten rows of the Royal Society export.
import pandas as pd

df = pd.read_csv("data_for_viewing/extracted_raw/extracted_contexts_from_rs_text.csv")

for index, row in df.head(10).iterrows():
    # One "<Column>: <value>" line per field, in the same order as before.
    for column in ("Filename", "Keyword", "Author", "Title", "Context"):
        print(f"{column}: {row[column]}")

    print()

Filename: rstl_1666_0067.txt
Keyword: facts
Author: none
Title: An account on some books - I. Noveaux elemens de geometrie - II. Synopsis optica, auth. Honorato fabri, Soc. Jesu, Lugduni Gall, in 4. An, 1667. - III. Devi percussionis, Joh. Alphons. Borelli. Bononnix in 4. 1667. - IV. Nic. stenonis musculi descript io geometrica, Florentiæ in 40. An. 1667
Context: © cerning the Tuſinireneſs of the for: of Percuſſion, not having been yer demon-
S Orate by any, he hathin this Book, reſumed the whole matter concerning
= Percuſſion, and clearly demonſtrared the true and genuine Nature of it, its
= Cauſe, Proprieties and Effe&ts. 1n the doing of which, betaketh occaſion
S ro diſcourſe alſo of Gravity, Mageetiſme, Tremor of Badker, Pendulians, -c.
5 Allwhich, whileſtthe Readeyis conſidering, the Authour tells him, that he
S 3 making ready hisother Books concerning the Aſetiontof Animals,
Z 1V. NIC. STENONIS MUSCULI DESCRIPTIO GEOME-
en) TRICA, Florentie in 40, An, 1667.

= to r "ies 34, . .
A