In [None]:
import os
import re
from collections import defaultdict
from nltk.tokenize import word_tokenize
import nltk
from tqdm import tqdm

# Fetch NLTK's 'punkt' tokenizer data once at start-up; presumably this is
# what the word_tokenize calls further down rely on — confirm against the
# installed NLTK version.
nltk.download('punkt')

def extract_year_from_filename(filename):
    """Return the four-digit year embedded in a corpus filename.

    The year is the first run of four digits that directly follows
    ``rst``, ``rstb``, ``rstl`` or ``rsta`` and an underscore, anywhere in
    *filename*.  ``None`` is returned when no such pattern is present.
    """
    found = re.search(r'rst[bla]?_(\d{4})', filename)
    if found is None:
        return None
    return int(found.group(1))

def clean_text(text):
    """Normalise the archaic long s to a plain ``s``.

    Two spellings are handled: the real character 'ſ', and the
    two-character sequence 'Å¿' (what the UTF-8 bytes of 'ſ' look like
    when they were decoded as Latin-1 somewhere upstream).
    """
    for long_s_form in ('ſ', 'Å¿'):
        text = text.replace(long_s_form, 's')
    return text

def _window_starts_containing(year, start_year, end_year, window_size):
    """Return the start years of all sliding windows that contain *year*.

    Windows are ``window_size`` years long, start on every year from
    *start_year*, and the last one ends exactly on *end_year*.
    """
    # The final window must END on end_year, so it starts window_size - 1
    # years earlier.
    last_start = end_year - window_size + 1
    first = max(start_year, year - window_size + 1)
    last = min(year, last_start)
    return range(first, last + 1)


def count_facts(text_dirs, start_year, end_year, window_size=10):
    """Count 'fact' / 'facts' tokens per sliding year window.

    Every ``.txt`` file in *text_dirs* whose filename yields a year in
    ``[start_year, end_year]`` is read, cleaned with ``clean_text``,
    lower-cased and tokenised.  Its counts are added to every
    ``window_size``-year window that contains the file's year.  Windows
    slide one year at a time and are labelled ``"<first>-<last>"``; the
    final window ends on *end_year*, so files dated *end_year* are counted.

    Args:
        text_dirs: Iterable of directory paths to scan (non-recursive).
        start_year: First year (inclusive) to include.
        end_year: Last year (inclusive) to include.
        window_size: Length of each window in years.

    Returns:
        A ``defaultdict`` mapping each period label to a dict with the keys
        ``'fact'``, ``'facts'`` and ``'total_words'`` (alphabetic tokens).

    Side effects:
        Prints progress and a results table, and writes
        ``fact_counts_<start_year>_<end_year>.csv`` relative to the current
        working directory, with rows sorted by total words ascending.
        A file that cannot be read or tokenised is reported and skipped.
    """
    period_counts = defaultdict(lambda: {'fact': 0, 'facts': 0, 'total_words': 0})

    print("Processing files...")

    for dir_path in text_dirs:
        for filename in tqdm(os.listdir(dir_path)):
            if not filename.endswith('.txt'):
                continue

            year = extract_year_from_filename(filename)
            if year is None or not (start_year <= year <= end_year):
                continue

            # Best-effort: one unreadable file must not abort the whole run.
            try:
                file_path = os.path.join(dir_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()
                tokens = word_tokenize(clean_text(text).lower())
            except Exception as e:
                print(f"Error processing file {filename}: {str(e)}")
                continue

            total_words = sum(1 for t in tokens if t.isalpha())
            fact_count = tokens.count('fact')
            facts_count = tokens.count('facts')

            # Add this file to each window containing its year, computed
            # directly instead of scanning every possible window.
            for window_start in _window_starts_containing(
                    year, start_year, end_year, window_size):
                window_end = window_start + window_size - 1
                bucket = period_counts[f"{window_start}-{window_end}"]
                bucket['total_words'] += total_words
                bucket['fact'] += fact_count
                bucket['facts'] += facts_count

    # Sort periods by total words (ascending); ties keep insertion order.
    sorted_periods = sorted(period_counts.items(), key=lambda item: item[1]['total_words'])

    # Print and save results
    output_filename = f"fact_counts_{start_year}_{end_year}.csv"
    print(f"\nSaving results to {output_filename}")

    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write("Period,Total Words,Fact,Facts\n")

        print("\nFact Count Results (Sorted by Total Words):")
        print("-" * 60)
        print(f"{'Period':<15} {'Total Words':<12} {'Fact':<8} {'Facts':<8}")
        print("-" * 60)

        for period, counts in sorted_periods:
            # Same row goes to the console table and to the CSV file.
            print(f"{period:<15} {counts['total_words']:<12} {counts['fact']:<8} {counts['facts']:<8}")
            f.write(f"{period},{counts['total_words']},{counts['fact']},{counts['facts']}\n")

    return period_counts

if __name__ == "__main__":
    # Corpus folders to scan, one per txt_rst* directory under
    # "royal society".
    text_directories = [
        r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb",
        r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl",
        r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta",
    ]

    # Inclusive year range to analyse, in 10-year sliding windows.
    start_year, end_year = 1665, 1958
    window_size = 10

    counts = count_facts(
        text_directories,
        start_year,
        end_year,
        window_size=window_size,
    )

# Technically, it would be well justified to rerun the analysis with the facts threshold set to 2.

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Igiba\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Processing files...


100%|██████████| 5284/5284 [01:44<00:00, 50.65it/s]
100%|██████████| 8520/8520 [03:15<00:00, 43.50it/s] 
100%|██████████| 4390/4390 [01:25<00:00, 51.46it/s]   


Saving results to fact_counts_1665_1958.csv

Fact Count Results (Sorted by Total Words):
------------------------------------------------------------
Period          Total Words  Fact     Facts   
------------------------------------------------------------
1711-1720       411208       6        2       
1712-1721       411208       6        2       
1715-1724       412012       3        3       
1716-1725       412012       3        3       
1687-1696       417588       7        0       
1688-1697       417588       7        0       
1718-1727       436911       1        3       
1719-1728       436911       1        3       
1714-1723       446511       6        3       
1725-1734       475327       4        1       
1717-1726       475972       3        4       
1723-1732       481152       2        1       
1721-1730       484272       1        2       
1685-1694       488977       12       1       
1707-1716       499112       12       1       
1709-1718       500100       7      




In [2]:
# Re-run the count for 2000-2024, reusing text_directories from the earlier cell.
start_year, end_year = 2000, 2024
window_size = 10

counts = count_facts(text_directories, start_year, end_year, window_size=window_size)

Processing files...


100%|██████████| 5284/5284 [05:19<00:00, 16.56it/s] 
100%|██████████| 8520/8520 [00:00<00:00, 846892.36it/s]
100%|██████████| 4390/4390 [02:45<00:00, 26.49it/s] 


Saving results to fact_counts_2000_2024.csv

Fact Count Results:
------------------------------------------------------------
Period          Total Words  Fact     Facts   
------------------------------------------------------------
2000-2009       1449170      363      34      
2010-2019       38884895     8970     647     
2020-2029       14212380     2974     158     



