In [1]:
import os
import re
from collections import defaultdict
from nltk.tokenize import word_tokenize
import nltk
from tqdm import tqdm

# Fetch the 'punkt' resource into the local nltk_data directory (no-op when it is
# already up to date). Presumably this is what word_tokenize() below relies on;
# NOTE(review): newer NLTK releases may ask for 'punkt_tab' instead — confirm
# against the installed NLTK version.
nltk.download('punkt')

def extract_year_from_filename(filename):
    """Pull the publication year out of a corpus filename.

    The year is the four digits that directly follow an ``rst_``, ``rstb_``,
    ``rstl_`` or ``rsta_`` marker anywhere in ``filename``.

    Args:
        filename: Name of the file to inspect.

    Returns:
        The year as an ``int``, or ``None`` when the marker is not present.
    """
    found = re.search(r'rst[bla]?_(\d{4})', filename)
    if found is None:
        return None
    return int(found.group(1))

def clean_text(text):
    """Apply a fixed list of literal substitutions to ``text``.

    The long s character and what appears to be its mis-decoded two-character
    form are both mapped to a plain ``s``; two specific misspellings are then
    corrected. Substitutions are applied in order, so the spelling fixes also
    see text produced by the earlier character replacements.

    Args:
        text: The raw string to clean.

    Returns:
        A new string with every substitution applied.
    """
    substitutions = (
        ('ſ', 's'),
        ('Å¿', 's'),
        ('obseryed', 'observed'),
        ('thab', 'that'),
    )
    for wrong, right in substitutions:
        text = text.replace(wrong, right)
    return text

def _period_label(year, start_year, window_size):
    """Return ``(period_start, "start-end")`` for the window that contains ``year``."""
    period_start = ((year - start_year) // window_size) * window_size + start_year
    period_end = period_start + window_size - 1
    return period_start, f"{period_start}-{period_end}"


def _tally_tokens(tokens):
    """Count alphabetic tokens and exact 'fact' / 'facts' tokens in a single pass."""
    total_words = fact = facts = 0
    for token in tokens:
        if token.isalpha():
            total_words += 1
            # 'fact' and 'facts' are purely alphabetic, so checking them here
            # is equivalent to counting them over the full token list.
            if token == 'fact':
                fact += 1
            elif token == 'facts':
                facts += 1
    return total_words, fact, facts


def _report_counts(period_counts, ordered_periods, output_filename):
    """Print the per-period table and write the same rows to ``output_filename`` as CSV."""
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write("Period,Total Words,Fact,Facts\n")

        print("\nFact Count Results:")
        print("-" * 60)
        print(f"{'Period':<15} {'Total Words':<12} {'Fact':<8} {'Facts':<8}")
        print("-" * 60)

        for period in ordered_periods:
            counts = period_counts[period]

            # Print to console
            print(f"{period:<15} {counts['total_words']:<12} {counts['fact']:<8} {counts['facts']:<8}")

            # Write to CSV
            f.write(f"{period},{counts['total_words']},{counts['fact']},{counts['facts']}\n")


def count_facts(text_dirs, start_year, end_year, window_size=10):
    """Count 'fact' / 'facts' tokens per time window across a set of text directories.

    Every ``.txt`` file in each directory whose filename yields a year (via
    ``extract_year_from_filename``) inside ``[start_year, end_year]`` is read as
    UTF-8, cleaned with ``clean_text``, lower-cased and tokenised with
    ``word_tokenize``. Counts are accumulated into windows of ``window_size``
    years aligned to ``start_year``. The results are printed as a table and
    written to ``fact_counts_{start_year}_{end_year}.csv`` in the current
    working directory.

    Note that window labels are not clamped to ``end_year``: the label of the
    last window can extend past it (e.g. ``"2020-2029"`` for ``end_year=2024``).

    Args:
        text_dirs: Iterable of directory paths to scan (non-recursively).
        start_year: First year (inclusive) to include; also the window origin.
        end_year: Last year (inclusive) to include.
        window_size: Width of each window in years. Must be at least 1.

    Returns:
        A ``defaultdict`` mapping each ``"start-end"`` period label to a dict
        with the integer keys ``'fact'``, ``'facts'`` and ``'total_words'``
        (``total_words`` counts purely alphabetic tokens only).

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
        OSError: If one of ``text_dirs`` cannot be listed or the CSV cannot be
            written. Errors on individual text files are reported and skipped.
    """
    if window_size < 1:
        # Fail early with a clear message instead of a ZeroDivisionError (0) or
        # silently meaningless buckets (negative values).
        raise ValueError(f"window_size must be at least 1, got {window_size!r}")

    period_counts = defaultdict(lambda: {'fact': 0, 'facts': 0, 'total_words': 0})
    # Numeric start year of every period seen, used to order the report
    # chronologically rather than lexicographically by label.
    period_starts = {}

    print("Processing files...")
    for dir_path in text_dirs:
        for filename in tqdm(os.listdir(dir_path)):
            if not filename.endswith('.txt'):
                continue

            year = extract_year_from_filename(filename)
            if not year or not (start_year <= year <= end_year):
                continue

            period_start, period = _period_label(year, start_year, window_size)

            try:
                file_path = os.path.join(dir_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()

                # The file handle is already closed; tokenising can be slow.
                tokens = word_tokenize(clean_text(text).lower())
                total_words, fact, facts = _tally_tokens(tokens)
            except Exception as e:
                # Deliberately best-effort: one unreadable or undecodable file
                # must not abort a run over thousands of files.
                print(f"Error processing file {filename}: {str(e)}")
                continue

            counts = period_counts[period]
            counts['total_words'] += total_words
            counts['fact'] += fact
            counts['facts'] += facts
            period_starts[period] = period_start

    # Print and save results
    output_filename = f"fact_counts_{start_year}_{end_year}.csv"
    print(f"\nSaving results to {output_filename}")

    sorted_periods = sorted(period_counts.keys(), key=period_starts.__getitem__)
    _report_counts(period_counts, sorted_periods, output_filename)

    return period_counts

if __name__ == "__main__":
    # Directories holding the text files to scan; later cells reuse this list.
    text_directories = [
        r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb",
        r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl",
        r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta",
    ]

    # Inclusive year range and the width (in years) of each reporting window.
    start_year, end_year, window_size = 1665, 1958, 10

    counts = count_facts(
        text_directories,
        start_year=start_year,
        end_year=end_year,
        window_size=window_size,
    )

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Igiba\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Processing files...


100%|██████████| 5284/5284 [01:23<00:00, 63.66it/s]
100%|██████████| 8520/8520 [03:29<00:00, 40.73it/s] 
100%|██████████| 4390/4390 [01:36<00:00, 45.50it/s]   


Saving results to fact_counts_1665_1958.csv

Fact Count Results:
------------------------------------------------------------
Period          Total Words  Fact     Facts   
------------------------------------------------------------
1665-1674       994663       11       3       
1675-1684       557431       8        0       
1685-1694       488977       12       1       
1695-1704       983738       11       0       
1705-1714       580926       13       1       
1715-1724       412012       3        3       
1725-1734       475327       4        1       
1735-1744       779613       12       10      
1745-1754       933817       27       32      
1755-1764       1067910      35       48      
1765-1774       1053412      54       24      
1775-1784       1410531      90       79      
1785-1794       1128687      82       52      
1795-1804       1320660      277      198     
1805-1814       1032616      230      297     
1815-1824       1077752      288      189     
1825-1834    




In [2]:
# Second run over the 2000-2024 range; relies on `text_directories` and
# `count_facts` having been defined by the first cell.
start_year, end_year, window_size = 2000, 2024, 10

counts = count_facts(
    text_directories,
    start_year=start_year,
    end_year=end_year,
    window_size=window_size,
)

Processing files...


100%|██████████| 5284/5284 [05:19<00:00, 16.56it/s] 
100%|██████████| 8520/8520 [00:00<00:00, 846892.36it/s]
100%|██████████| 4390/4390 [02:45<00:00, 26.49it/s] 


Saving results to fact_counts_2000_2024.csv

Fact Count Results:
------------------------------------------------------------
Period          Total Words  Fact     Facts   
------------------------------------------------------------
2000-2009       1449170      363      34      
2010-2019       38884895     8970     647     
2020-2029       14212380     2974     158     



