The non-Royal Society files don't have a good metadata file, so this notebook creates one for them in a format that matches the Royal Society metadata.

In [1]:
#metadata for spectator that resembles the royal society metadata
import os
import re
import csv
from datetime import datetime
from fuzzywuzzy import process
from typing import Tuple

# Surnames of known authors in the documents (may be incomplete).
author_names = ['Addison', 'Steele', 'Budgell', 'Hughes', 'Pope', 'Tickell', 'Parnell', 'Francham', 'Grove', 'Byrom']

# Dates are usually written out in full, e.g. "Thursday, March 1, 1711".
_DATE_PATTERN = re.compile(r'\b[A-Za-z]+, [A-Za-z]+ \d{1,2}, \d{4}\b')


def extract_metadata(text: str) -> Tuple[str, str]:
    """Guess the author and publication date of one essay from its text.

    Author: every name in ``author_names`` is searched as an exact,
    case-sensitive substring; the name that occurs earliest in the text wins
    (imperfect heuristic). If no name occurs at all, a fuzzy match of the text
    against the name list is tried and accepted only above a score of 80.

    Date: each "Weekday, Month D, YYYY" candidate is tried in order of
    appearance and the first one that parses as a real date is used, so an
    unparsable look-alike earlier in the text no longer hides a valid date.

    Args:
        text: Body of the essay.

    Returns:
        ``(author, date)`` where ``author`` is an entry of ``author_names`` or
        ``"unclear"``, and ``date`` is formatted ``YYYY-MM-DD`` or is
        ``'unclear'``. "unclear" values may have to be set by hand.
    """
    # Exact pass: remember where each known name first occurs.
    first_positions = {}
    for name in author_names:
        index = text.find(name)
        if index != -1:
            first_positions[name] = index

    if first_positions:
        # The earliest name is generally the author, given the file formatting.
        author = min(first_positions, key=first_positions.get)
    else:
        # Fuzzy pass for slightly mistyped names.
        # NOTE(review): this scores the whole essay text against each short
        # surname; confirm that scores above 80 are reachable for long texts.
        best_guess = process.extractOne(text, author_names)
        if best_guess and best_guess[1] > 80:
            author = best_guess[0]
        else:
            author = "unclear"  # these might have to be set by hand

    formatted_date = 'unclear'  # these might have to be set by hand
    for candidate in _DATE_PATTERN.finditer(text):
        try:
            date_obj = datetime.strptime(candidate.group(), '%A, %B %d, %Y')
        except ValueError:
            # Looked like a date but is not one; try the next candidate.
            continue
        formatted_date = date_obj.strftime('%Y-%m-%d')
        break

    return author, formatted_date

def process_folder(folder_path: str, output_csv: str) -> None:
    """Write a metadata CSV describing every ``.txt`` file in a folder.

    For each file the first line (stripped) is taken as the title, or
    ``'unclear'`` if the file is empty; the remaining lines are passed to
    ``extract_metadata`` to obtain the author and date.

    Args:
        folder_path: Directory containing the UTF-8 text files.
        output_csv: Path of the CSV to create (overwritten if it exists).
            Columns: filename, title, author, date.
    """
    fieldnames = ['filename', 'title', 'author', 'date']
    rows = []

    # os.listdir returns entries in arbitrary order; sorting keeps the CSV
    # reproducible between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()

        title = lines[0].strip() if lines else 'unclear'
        # The title line is excluded from the author/date search.
        author, date = extract_metadata(''.join(lines[1:]))
        rows.append({'filename': filename, 'title': title, 'author': author, 'date': date})

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

# Build the Spectator metadata CSV from the local corpus folder.
# The output path is relative, so the CSV lands in the current working directory.
folder_path = "D:/Fact_fiction_corpus/texts/spectator/txt"
output_csv = "spectator_metadata.csv"
process_folder(folder_path, output_csv)




In [None]:
#cleaning up "contents" and "footnotes" of spectator texts (since they were added later)
import re
import os

# Folder whose .txt files are cleaned up by this cell.
folder_path = "D:/Fact_fiction_corpus/texts/spectator/txt"

# Regex patterns (both are meant to be matched line by line, see filter_lines)
# A line in which the word "Contents" occurs twice with at most nine
# characters in between.
contents_pattern = r'^.*\bContents\b.{0,9}\bContents\b.*$'
# A line that begins (after optional whitespace) with "footnote", whitespace,
# a number, and then a colon or whitespace.
footnote_pattern = r'^\s*footnote\s+\d+[:\s].*'

def filter_lines(text):
    """Cut the text off at the first "contents" or "footnote" marker line.

    Both patterns are searched case-insensitively. They are anchored with
    ``^``/``$``, so ``re.MULTILINE`` is required for them to match a marker on
    any line of the text rather than only on its very first line.

    Args:
        text: Full content of one text file.

    Returns:
        The text before the earliest marker, stripped of surrounding
        whitespace. If neither marker occurs, ``text`` is returned unchanged
        (not stripped).
    """
    flags = re.IGNORECASE | re.MULTILINE
    matches = (
        re.search(contents_pattern, text, flags),
        re.search(footnote_pattern, text, flags),
    )
    # Start offsets of whichever markers were found.
    marker_starts = [match.start() for match in matches if match]

    if not marker_starts:
        return text
    # Everything from the earliest marker onwards is dropped.
    return text[:min(marker_starts)].strip()

def process_files(folder_path):
    """Run filter_lines over every ``.txt`` file in the folder, in place.

    Each matching file is read as UTF-8, passed through ``filter_lines``, and
    the result is written back to the same path, overwriting the original.
    """
    for entry in os.listdir(folder_path):
        if not entry.endswith(".txt"):
            continue

        target = os.path.join(folder_path, entry)

        with open(target, 'r', encoding='utf-8') as source:
            raw_text = source.read()

        # Filter before reopening for writing, so the file is only truncated
        # once the replacement text is ready.
        cleaned_text = filter_lines(raw_text)

        with open(target, 'w', encoding='utf-8') as sink:
            sink.write(cleaned_text)

# Clean every .txt file in folder_path; the files are overwritten in place.
process_files(folder_path)


In [3]:
#metadata (one row) for general magazine
import csv

# One entry per source file: filename -> bibliographic fields.
metadata = {
    "general_magazine.pdf": {
        'author': 'Benjamin Martin',
        'title': 'General Magazine of Arts and Sciences',
        'date': '1755'
    },
}

csv_filename = "D:/Fact_fiction_corpus/texts/General Magazine of Arts and Sciences/general_magazine_metadata.csv"

# Write with an explicit UTF-8 encoding, like the Spectator metadata CSV, so
# the output does not depend on the platform's default encoding.
# NOTE(review): the column order here is filename, author, title, date, while
# the Spectator CSV uses filename, title, author, date - confirm which order
# downstream code expects.
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['filename', 'author', 'title', 'date'])
    for filename, details in metadata.items():
        writer.writerow([filename, details['author'], details['title'], details['date']])


In [1]:
import os
import re
import pandas as pd

def count_words_by_year(folder_path, file_name):
    """Sum whitespace-separated word counts per year and save them as CSV.

    The year of a file is the first run of four digits in its file name;
    ``.txt`` files without such a run are skipped, as are non-``.txt`` files.

    Args:
        folder_path: Directory containing the UTF-8 ``.txt`` files.
        file_name: Path of the CSV to write, with columns ``Year`` and
            ``Word Count``. Rows are sorted by year so the output does not
            depend on the arbitrary order returned by ``os.listdir``.
    """
    word_counts = {}

    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(".txt"):
            continue
        match = re.search(r'\d{4}', file)
        if match is None:
            continue

        year = match.group(0)
        with open(os.path.join(folder_path, file), 'r', encoding='utf-8') as f:
            word_count = len(f.read().split())
        word_counts[year] = word_counts.get(year, 0) + word_count

    # Years are four-digit strings, so sorting them as text is chronological.
    df = pd.DataFrame(sorted(word_counts.items()), columns=['Year', 'Word Count'])
    df.to_csv(file_name, index=False)

# Word counts per year for the txt_rsta folder, saved alongside the other visualization data.
count_words_by_year(r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta", r"C:\Users\Igiba\Documents\fact_fiction_project\data_for_viewing\visualizations\word_counts_rsta.csv")
# Same call for the txt_rstl folder (currently disabled).
#count_words_by_year(r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl", r"C:\Users\Igiba\Documents\fact_fiction_project\data_for_viewing\visualizations\word_counts_rstl.csv")
