Checking the distribution of fact/facts, how many different ways it can be written, and whether it's meaningful to add these variants to the collocation analysis; plots below.

fix/streamline this later

In [None]:
import os
import re
from collections import defaultdict
from nltk.tokenize import word_tokenize

def extract_year(filename):
    """Return the four-digit year embedded in *filename*, or None.

    The year is the four digits following ``rst``, an optional ``b``, ``l``
    or ``a``, and an underscore (for example ``rstl_1665``). The pattern is
    searched for anywhere in the name; None is returned when it is absent.
    """
    found = re.search(r'rst[bla]?_(\d{4})', filename)
    if found is None:
        return None
    return int(found.group(1))

def find_fact_variants_with_tokens(directories, start_year=1665, end_year=1958):
    """Find long-s ('ſ') spellings of fact/facts in tokenized corpus files.

    Every ``.txt`` file in *directories* whose filename year (as returned by
    ``extract_year``) lies in ``[start_year, end_year]`` is tokenized with
    ``word_tokenize`` and each token is tested against the variant patterns.

    Args:
        directories: Iterable of directory paths. Paths that do not exist are
            reported with a warning and skipped.
        start_year: First year to include (inclusive).
        end_year: Last year to include (inclusive).

    Returns:
        A ``(findings, yearly_counts)`` tuple. ``findings`` maps a variant
        label to a list of ``(year, filename)`` pairs, one pair per file that
        contains the variant. ``yearly_counts[year][variant]`` is the number
        of matching tokens found in that year.
    """
    findings = {}
    yearly_counts = defaultdict(lambda: defaultdict(int))

    # Patterns to match against tokenized text.
    # They are deliberately compiled WITHOUT re.IGNORECASE: under that flag
    # Python treats 'ſ' (U+017F) and 's' as the same letter, so '^ſacſ$' would
    # also count ordinary words such as "sacs". Case is folded with
    # str.lower() instead, which leaves 'ſ' untouched.
    patterns = {
        'ſact': re.compile(r'^ſact(?:\'s)?$'),
        'ſacts': re.compile(r'^ſacts(?:\')?$'),
        'ſacſ': re.compile(r'^ſacſ(?:\'s)?$'),  # Include the 'ſacſ' pattern
    }

    for directory in directories:
        if not os.path.exists(directory):
            print(f"Warning: Directory not found: {directory}")
            continue

        for filename in os.listdir(directory):
            if not filename.endswith('.txt'):
                continue

            year = extract_year(filename)
            if not year or year < start_year or year > end_year:
                continue

            file_path = os.path.join(directory, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()

                # Lower-case once per file so the patterns stay case-insensitive
                # for ordinary letters while 'ſ' still has to be a literal long s.
                tokens = [token.lower() for token in word_tokenize(text)]

                for variant, pattern in patterns.items():
                    hit_count = sum(1 for token in tokens if pattern.match(token))
                    if hit_count:
                        findings.setdefault(variant, []).append((year, filename))
                        yearly_counts[year][variant] += hit_count

            except Exception as e:
                # Best effort: report the file and carry on with the rest.
                print(f"Error processing {file_path}: {str(e)}")

    return findings, yearly_counts

def main():
    """Scan the Royal Society text directories and print long-s variant statistics.

    Prints, for each variant found, the total number of matching tokens and
    the number of files containing it, followed by the per-year token counts.
    Prints a short notice and returns early when nothing was found.
    """
    text_directories = [
        r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb",
        r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl",
        r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta"
    ]

    findings, yearly_counts = find_fact_variants_with_tokens(text_directories)

    if not findings:
        print("No long-s variants found.")
        return

    print("\nVariant Summary:")
    print("-" * 50)
    for variant in sorted(findings):
        # findings[variant] holds one entry per FILE, so its length is a file
        # count; the token total has to come from the per-year counters.
        token_total = sum(
            per_year.get(variant, 0) for per_year in yearly_counts.values()
        )
        print(f"\nVariant: {variant}")
        print(f"Total occurrences: {token_total}")
        print(f"Files containing variant: {len(findings[variant])}")

    print("\nYearly Distribution:")
    print("-" * 50)
    for year in sorted(yearly_counts):
        counts = yearly_counts[year]
        variants_str = ", ".join(f"{v}: {c}" for v, c in counts.items())
        print(f"{year}: {variants_str}")

if __name__ == "__main__":
    main()


In [None]:
import os
import re
from collections import defaultdict
from nltk.tokenize import word_tokenize

def extract_year(filename):
    """Pull the publication year out of a corpus filename.

    Looks for ``rst`` + optional ``b``/``l``/``a`` + ``_`` + four digits
    anywhere in *filename* and returns those digits as an int. Returns None
    when the filename does not contain that pattern.
    """
    hit = re.search(r'rst[bla]?_(\d{4})', filename)
    if not hit:
        return None
    year_digits = hit.group(1)
    return int(year_digits)

def find_fact_variants_with_tokens(directories, start_year=1665, end_year=1958):
    """Find occurrences of 'fact' and 'facts' spelling variants in tokenized text.

    Every ``.txt`` file in *directories* whose filename year (as returned by
    ``extract_year``) lies in ``[start_year, end_year]`` is tokenized with
    ``word_tokenize``; each token is tested against every pattern of the
    'fact' and 'facts' categories.

    Args:
        directories: Iterable of directory paths. Paths that do not exist are
            reported with a warning and skipped.
        start_year: First year to include (inclusive).
        end_year: Last year to include (inclusive).

    Returns:
        A ``(findings, yearly_counts)`` tuple. ``findings[category]`` is a
        list of ``(year, filename, match_count, pattern_text)`` tuples, one
        per file and pattern with at least one match.
        ``yearly_counts[year][category]`` is the number of matching tokens in
        that year.
    """
    findings = {
        'fact': [],
        'facts': []
    }

    yearly_counts = defaultdict(lambda: {'fact': 0, 'facts': 0})

    # Patterns for matching tokenized text.
    # They are deliberately compiled WITHOUT re.IGNORECASE: under that flag
    # Python treats 'ſ' (U+017F) and 's' as the same letter, so '^ſacſ$' would
    # also count ordinary words such as "sacs". Case is folded with
    # str.lower() instead, which leaves 'ſ' untouched.
    patterns = {
        'fact': [
            # No leading '^' here; pattern.match() anchors at the start anyway.
            re.compile(r'fact(?:\'s)?$'),
            re.compile(r'^ſact(?:\'s)?$')  # Include the long-s variant 'ſact'
        ],
        'facts': [
            re.compile(r'^facts(?:\')?$'),
            re.compile(r'^facſ(?:\')?$'),
            re.compile(r'^facſs(?:\')?$'),
            re.compile(r'^ſacts(?:\')?$'),  # Include 'ſacts' in facts category but not in the analysis
            re.compile(r'^ſacſ(?:\')?$'),
            re.compile(r'^ſacſs(?:\')?$')
        ]
    }

    for directory in directories:
        if not os.path.exists(directory):
            print(f"Warning: Directory not found: {directory}")
            continue

        for filename in os.listdir(directory):
            if not filename.endswith('.txt'):
                continue

            year = extract_year(filename)
            if not year or year < start_year or year > end_year:
                continue

            file_path = os.path.join(directory, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    text = file.read()

                # Lower-case once per file so the patterns stay case-insensitive
                # for ordinary letters while 'ſ' still has to be a literal long s.
                tokens = [token.lower() for token in word_tokenize(text)]

                for category in ['fact', 'facts']:
                    for pattern in patterns[category]:
                        match_count = sum(1 for token in tokens if pattern.match(token))
                        if match_count:
                            findings[category].append(
                                (year, filename, match_count, pattern.pattern)
                            )
                            yearly_counts[year][category] += match_count

            except Exception as e:
                # Best effort: report the file and carry on with the rest.
                print(f"Error processing {file_path}: {str(e)}")

    return findings, yearly_counts

def main():
    """Run the fact/facts variant search and print a summary.

    For each category the report shows the total number of matching tokens
    and a per-pattern breakdown; a per-year table for both categories follows.
    """
    text_directories = [
        r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb",
        r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl",
        r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta"
    ]

    findings, yearly_counts = find_fact_variants_with_tokens(text_directories)

    print("\nFindings by Category:")
    print("=" * 60)

    for category in ('fact', 'facts'):
        print(f"\n{category.upper()} VARIANTS:")
        print("-" * 60)

        records = findings[category]
        if not records:
            print(f"No {category} variants found.")
            continue

        # Ordering by (year, filename) fixes the order in which patterns are
        # first seen, and therefore the order of the breakdown printed below.
        ordered = sorted(records, key=lambda rec: (rec[0], rec[1]))

        total_occurrences = 0
        pattern_counts = defaultdict(int)
        for _year, _name, hits, regex_text in ordered:
            total_occurrences += hits
            pattern_counts[regex_text] += hits

        print(f"\nTotal occurrences of {category}: {total_occurrences}")
        print("\nDistribution by pattern:")
        for regex_text, hits in pattern_counts.items():
            print(f"  {regex_text}: {hits} occurrences")

    print("\nYearly Distribution:")
    print("=" * 60)
    for year in sorted(yearly_counts):
        per_year = yearly_counts[year]
        print(f"{year}: fact: {per_year['fact']}, facts: {per_year['facts']}")

if __name__ == "__main__":
    main()


some plots for the paper etc.

In [26]:
import pandas as pd
import plotly.graph_objects as go
from collections import defaultdict
import os
import re
from tqdm import tqdm
from nltk.tokenize import word_tokenize

def count_tokens_in_file(file_path):
    """Return how many tokens ``word_tokenize`` finds in a UTF-8 text file."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    tokens = word_tokenize(contents)
    return len(tokens)

def count_tokens_in_10_year_windows(text_dirs, start_year, end_year, split_year=1887):
    """Sum token counts per publication type over sliding 10-year windows.

    Each ``.txt`` file in *text_dirs* is assigned a year (from its filename)
    and a publication type (the last four characters of its directory name).
    Files whose year lies in ``[start_year, end_year]`` are tokenized and
    their token counts accumulated per type and year. A window starting at
    year ``y`` covers ``y`` to ``y + 9`` inclusive; one window is produced for
    every ``y`` from ``start_year`` up to and including ``end_year - 9``.

    Within a window, 'rstl' only contributes years before *split_year* and
    'rsta'/'rstb' only contribute years from *split_year* onwards.

    Args:
        text_dirs: Iterable of directory paths to scan.
        start_year: First year to include (inclusive).
        end_year: Last year to include (inclusive).
        split_year: First year counted for 'rsta'/'rstb'; 'rstl' is counted
            only up to the year before it.

    Returns:
        A dict with keys 'rsta', 'rstb' and 'rstl', each mapping a window
        start year to the total token count of that window.

    Raises:
        KeyError: If an in-range file sits in a directory whose name does not
            end in 'rsta', 'rstb' or 'rstl'.
    """
    def extract_year_from_filename(filename):
        match = re.search(r'rst[bla]?_?(\d{4})', filename)
        return int(match.group(1)) if match else None

    pub_types = ('rsta', 'rstb', 'rstl')
    token_counts = {pub_type: defaultdict(int) for pub_type in pub_types}

    file_paths = []
    for text_dir in text_dirs:
        for name in os.listdir(text_dir):
            if name.endswith('.txt'):
                file_paths.append(os.path.join(text_dir, name))

    for file_path in tqdm(file_paths):
        year = extract_year_from_filename(os.path.basename(file_path))
        if not year or not (start_year <= year <= end_year):
            continue
        pub_type = os.path.basename(os.path.dirname(file_path))[-4:]
        token_counts[pub_type][year] += count_tokens_in_file(file_path)

    tokens_per_window = {pub_type: defaultdict(int) for pub_type in pub_types}

    for pub_type in pub_types:
        yearly = token_counts[pub_type]
        # The last full window starts at end_year - 9; range() excludes its
        # stop value, hence "- 8". Stopping one earlier would leave end_year
        # itself out of every window.
        for window_start in range(start_year, end_year - 8):
            window_end = window_start + 9
            if pub_type == 'rstl':
                years = range(window_start, min(window_end, split_year - 1) + 1)
            else:
                years = range(max(window_start, split_year), window_end + 1)
            tokens_per_window[pub_type][window_start] = sum(
                yearly.get(y, 0) for y in years
            )

    return tokens_per_window

# Corpus directories, one per Royal Society publication series.
text_dirs = [
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl", 
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta"
]

# Token totals per publication type, keyed by window start year.
historical_token_counts = count_tokens_in_10_year_windows(
    text_dirs,
    start_year=1665,
    end_year=1958
)

# Colour, legend name and trace order for each publication type.
publication_types = {
    'rsta': {'color': 'red', 'name': 'RSTA', 'order': 3},
    'rstb': {'color': 'blue', 'name': 'RSTB', 'order': 2},
    'rstl': {'color': 'green', 'name': 'RSTL', 'order': 1}
}

fig = go.Figure()

# Every window start year present for any publication type, ascending.
all_windows = sorted(set().union(
    historical_token_counts['rsta'].keys(),
    historical_token_counts['rstb'].keys(),
    historical_token_counts['rstl'].keys()
))

sorted_pub_types = sorted(publication_types.items(), key=lambda x: x[1]['order'])

for pub_type, details in sorted_pub_types:
    # Only plot windows in which this publication type has any tokens.
    windows = [year for year in all_windows if historical_token_counts[pub_type].get(year, 0) > 0]
    counts = [historical_token_counts[pub_type].get(year, 0) for year in windows]
    
    if windows:
        first_year = windows[0]
        last_year = windows[-1]
        
        # Legend label for RSTA/RSTB never starts before 1887, even though
        # earlier windows can already contain 1887 data.
        if pub_type in ['rsta', 'rstb']:
            first_year = max(first_year, 1887)
        
        fig.add_trace(go.Bar(
            x=windows,
            y=counts,
            name=f'{details["name"]} ({first_year}-{last_year})',
            marker_color=details['color'],
            hovertemplate='%{y:,} tokens<extra></extra>'
        ))

# Axis limits and ticks follow the computed windows instead of hard-coded years,
# so the axis always spans exactly the windows that were produced.
fig.update_xaxes(
    range=[all_windows[0], all_windows[-1]],
    tickvals=list(range(all_windows[0], all_windows[-1], 10)), #year window starts
    tickfont=dict(size=12),
    title_font=dict(size=14),
    title_text='<b>Window Start Year</b>'
)

# Bold markup uses a proper closing tag (</b>); the original repeated <b>.
fig.update_layout(
    title='<b>Total Tokens per 10-Year Window by Royal Society Publication Type (1665-1958)</b>',
    xaxis_title='<b>Window Start Year</b>',
    yaxis_title='<b>Total Tokens</b>',
    barmode='stack',
    template='plotly_white',
    width=1200,
    height=600,
    font=dict(family='Arial, sans-serif'),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=1.02,
        xanchor='center',
        x=0.5,
        font=dict(size=14, weight='bold')
    ),
    plot_bgcolor='rgba(240,240,240,0.5)',
    margin=dict(t=100, b=50, l=50, r=50)
)

fig.update_yaxes(
    tickformat="~s",
    tickfont=dict(size=12),
    title_font=dict(size=14),
    tickvals=[1_000_000, 2_000_000, 3_000_000, 4_000_000, 5_000_000, 6_000_000, 7_000_000],
)

fig.show()
fig.write_html("C:/Users/Igiba/Documents/fact_fiction_project/extra_plots/royal_society_token_counts_cumulative_bar_plot.html")


100%|██████████| 18194/18194 [06:21<00:00, 47.70it/s]  


In [25]:
import os
import re
import plotly.graph_objects as go
from nltk.tokenize import word_tokenize
from tqdm import tqdm

def extract_year_from_filename(filename):
    """Return the year found in *filename* as an int, or None.

    Matches ``rst``, an optional ``b``/``l``/``a``, an optional underscore
    and then four digits, anywhere in the name.
    """
    found = re.search(r'rst[bla]?_?(\d{4})', filename)
    if found is None:
        return None
    return int(found.group(1))

def count_tokens_in_file(file_path):
    """Return the number of tokens in the UTF-8 text file at *file_path*.

    The text is lower-cased before being passed to ``word_tokenize``; every
    resulting token is counted, with no filtering.
    """
    # NOTE(review): the other count_tokens_in_file in this notebook tokenizes
    # without lower-casing; confirm whether the two are meant to give the
    # same counts.
    with open(file_path, 'r', encoding='utf-8') as handle:
        lowered = handle.read().lower()
    return len(word_tokenize(lowered))

def count_tokens_in_directory(directory, start_year, end_year):
    """Total the tokens of all in-range ``.txt`` files in *directory*.

    A file is counted when the year in its filename lies in
    ``[start_year, end_year]``. Files that raise while being read or
    tokenized are reported and left out of both totals.

    Returns:
        ``(total_tokens, file_count)`` for the files that were counted.
    """
    txt_names = [name for name in os.listdir(directory) if name.endswith('.txt')]

    total_tokens = 0
    file_count = 0

    # The progress bar runs over every .txt file, including those skipped below.
    for name in tqdm(txt_names, desc=f"Processing {os.path.basename(directory)}"):
        file_path = os.path.join(directory, name)

        year = extract_year_from_filename(name)
        in_range = year is not None and start_year <= year <= end_year
        if not in_range:
            continue

        try:
            token_count = count_tokens_in_file(file_path)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
        else:
            total_tokens += token_count
            file_count += 1

    return total_tokens, file_count

# Corpus directories, one per publication series
text_dirs = [
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta"
]

# Display name and colour per series, keyed by directory name
publication_types = dict(
    txt_rsta=dict(color='red', name='RSTA'),
    txt_rstb=dict(color='blue', name='RSTB'),
    txt_rstl=dict(color='green', name='RSTL'),
)

# Inclusive year range used to filter files
start_year = 1665
end_year = 1958

# Parallel lists, one entry per directory in text_dirs order
counts, file_counts, labels, colors = [], [], [], []

print("\nProcessing files:")
print("-" * 50)

for directory in text_dirs:
    token_count, file_count = count_tokens_in_directory(directory, start_year, end_year)
    dir_name = os.path.basename(directory)
    pub_info = publication_types[dir_name]

    counts.append(token_count)
    file_counts.append(file_count)
    labels.append(f"{pub_info['name']} ({token_count:,} tokens)")
    colors.append(pub_info['color'])

    print(f"{dir_name}:")
    print(f"  - Files processed: {file_count}")
    print(f"  - Token count: {token_count:,}")

# Grand totals over all directories
total_tokens = sum(counts)
total_files = sum(file_counts)

# Interactive pie chart: one slice per publication series
fig = go.Figure(data=[go.Pie(
    labels=labels,
    values=counts,
    marker=dict(colors=colors),
    textinfo='label+percent',
    hovertemplate=(
        "<b>%{label}</b><br>"
        "Token count: %{value:,}<br>"
        "Percentage: %{percent}<br>"
        "<extra></extra>"
    )
)])

# Title (with totals), size and theme
fig.update_layout(
    title=dict(
        text=f'<b>Token Distribution Across Royal Society Publication Types (1665-1958)<br>Total Tokens: {total_tokens:,} (from {total_files:,} files)</b>',
        y=0.95,
        x=0.5,
        xanchor='center',
        yanchor='top',
    ),
    showlegend=True,
    width=1200,
    height=600,
    template='plotly_white',
    font=dict(family='Arial, sans-serif'),
    margin=dict(t=100, b=50, l=50, r=50)
)

# Write the interactive HTML first, then display the figure
output_path = r"C:/Users/Igiba/Documents/fact_fiction_project/extra_plots/royal_society_token_distribution_pie_1665_1958.html"
fig.write_html(output_path)
fig.show()

print("\nFinal Summary:")
print("-" * 50)
for label, count, file_count in zip(labels, counts, file_counts):
    print(f"{label}")
    print(f"  - Files: {file_count}")
    print(f"  - Tokens: {count:,}")
print("-" * 50)
print(f"Total files processed: {total_files:,}")
print(f"Total tokens across all directories: {total_tokens:,}")
print(f"\nInteractive pie chart has been saved to: {output_path}")



Processing files:
--------------------------------------------------


Processing txt_rstb: 100%|██████████| 5284/5284 [01:25<00:00, 61.92it/s]


txt_rstb:
  - Files processed: 628
  - Token count: 15,719,527


Processing txt_rstl: 100%|██████████| 8520/8520 [04:17<00:00, 33.07it/s] 


txt_rstl:
  - Files processed: 8520
  - Token count: 42,079,421


Processing txt_rsta: 100%|██████████| 4390/4390 [01:23<00:00, 52.47it/s]   

txt_rsta:
  - Files processed: 864
  - Token count: 16,772,168






Final Summary:
--------------------------------------------------
RSTB (15,719,527 tokens)
  - Files: 628
  - Tokens: 15,719,527
RSTL (42,079,421 tokens)
  - Files: 8520
  - Tokens: 42,079,421
RSTA (16,772,168 tokens)
  - Files: 864
  - Tokens: 16,772,168
--------------------------------------------------
Total files processed: 10,012
Total tokens across all directories: 74,571,116

Interactive pie chart has been saved to: C:/Users/Igiba/Documents/fact_fiction_project/extra_plots/royal_society_token_distribution_pie_1665_1958.html
