The notation "MI(3), L3-R3, C5-NC5" indicates that the MI-score with a cut-off point of three was used as the association measure; the collocates were identified in a span of three words to the left (L3) and three words to the right (R3) of the node, and the frequency threshold was five for both the collocate (C5) and the collocation (NC5). (In other words, a frequency threshold is applied.)

Generally, the smaller the span, the greater the focus of the analysis on the most immediate lexico-grammatical patterns; a larger span captures looser associations.

TO DO
- make the freq threshold vary based on the amount of data for each year gap!!! (done, but might need adjusting)
- rerun with 3l3r and 5l5r for comparison (done)
- avoid lemmatization but tokenize (fact/facts) (done)
- include, as a side note, whether each context comes from rstl, rstb or rsta (done)
- different freq thresholds for fact/facts since they are not as common --> calculating the minimum subcorpora amount for the frequency (done, technically)

minimum subcorpora:
for every ~412k words in a window, the frequency threshold should be 3

Period          Total Words  Fact     Facts   
1715-1724       412012       3        3


However, the scaling should probably be different for "facts", since there are fewer occurrences of "facts" overall, even though the two counts align on the smallest subcorpus.

In [94]:
import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.tokenize import word_tokenize
import string
import csv
import os
import re
from collections import defaultdict
from tqdm.notebook import tqdm
import traceback
import concurrent.futures

nltk.download('punkt')

def extract_year_from_filename(filename):
    """Return the four-digit year embedded in a corpus filename.

    The year is the four digits that follow ``rst`` (optionally suffixed with
    ``b``, ``l`` or ``a``) and an underscore. ``None`` is returned when the
    filename does not contain that pattern.
    """
    found = re.search(r'rst[bla]?_(\d{4})', filename)
    if found is None:
        return None
    return int(found.group(1))

def clean_text(text):
    """Replace the long s with a plain ``s`` so spelling variants are merged.

    Two forms are handled: the character itself and the two-character
    sequence that appears to be its mis-decoded (mojibake) form. This step is
    optional and could be expanded upon or omitted.
    """
    for long_s_variant in ('ſ', 'Å¿'):
        text = text.replace(long_s_variant, 's')
    return text

def build_file_directory_map(text_dirs): #to get the directory and filename later, hopefully the maps match correctly
    """Map each file's cleaned text to ``(directory name, filename)``.

    Every ``.txt`` file in each directory of *text_dirs* is read as UTF-8 and
    passed through ``clean_text``; the cleaned text itself is the dictionary
    key. Files that cannot be read are reported with ``print`` and skipped.
    If two files have identical cleaned text, the later one overwrites the
    earlier entry.
    """
    text_to_source = {}
    for directory in text_dirs:
        source_label = os.path.basename(directory)
        txt_names = [name for name in os.listdir(directory) if name.endswith('.txt')]
        for name in txt_names:
            file_path = os.path.join(directory, name)
            try:
                with open(file_path, 'r', encoding='utf-8') as handle:
                    contents = handle.read()
                text_to_source[clean_text(contents)] = (source_label, name)
            except Exception as e:
                print(f"Error reading file {file_path}: {str(e)}")
    return text_to_source

def process_file(file_path):
    """Read one corpus file and pair its cleaned text with its year.

    Returns ``(year, cleaned_text)``. Returns ``None`` when the filename
    carries no recognisable year, or when reading fails (the error is
    printed rather than raised).
    """
    try:
        year = extract_year_from_filename(os.path.basename(file_path))
        if year:
            with open(file_path, 'r', encoding='utf-8') as handle:
                raw_text = handle.read()
            return year, clean_text(raw_text)
        return None
    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")
        return None

def process_tokens(text):
    """Tokenise the lower-cased *text* and keep only alphabetic tokens.

    Tokens containing digits or punctuation are dropped, so the result is a
    list of lower-case words in their original order (no lemmatisation).
    """
    kept = []
    for candidate in word_tokenize(text.lower()):
        if not candidate.isalpha():
            continue
        if candidate in string.punctuation:
            continue
        kept.append(candidate)
    return kept

def calculate_scaled_freq_filter(total_words, word_of_interest):
    """Return the collocation frequency threshold for a window of a given size.

    The threshold starts at a per-word base value chosen for the smallest
    subcorpus and grows linearly with the number of words beyond that size.

    Args:
        total_words: Number of tokens in the window being analysed.
        word_of_interest: Node word; "fact" or "facts" (case-insensitive, to
            match how the node word is compared elsewhere in the pipeline).

    Returns:
        int: The minimum bigram frequency to keep.

    Raises:
        ValueError: If no baseline is defined for *word_of_interest*.
            Previously such a word silently produced ``None``.
    """
    # word -> (base threshold, size in words of the smallest subcorpus it was set for)
    baselines = {
        "fact": (3, 412012),
        "facts": (2, 411208),  # theres much less of facts than fact (sometimes 0)
    }

    key = word_of_interest.lower()
    if key not in baselines:
        raise ValueError(
            f"No frequency-threshold baseline defined for {word_of_interest!r}; "
            f"expected one of {sorted(baselines)}"
        )

    base_threshold, base_words = baselines[key]
    if total_words <= base_words:
        return base_threshold

    # Increase the threshold by this factor for each extra word over the baseline size.
    scale_factor = base_threshold / base_words
    additional_threshold = (total_words - base_words) * scale_factor
    return base_threshold + round(additional_threshold)  # needs to be rounded as integer


def process_window(window_data, word_of_interest, collocate_window, file_directory_map):
    """Compute PMI-ranked collocates of *word_of_interest* for one year window.

    Args:
        window_data: ``(window_start, window_end, window_texts)`` where
            ``window_texts`` is the list of cleaned texts in the window.
        word_of_interest: Node word (compared case-insensitively).
        collocate_window: Passed to NLTK as ``window_size`` and also used as
            the number of whitespace-separated words kept on each side of the
            node in the context snippets.
        file_directory_map: Maps a cleaned text to ``(dir_name, filename)``;
            texts missing from it are reported as ``("Unknown", "Unknown")``.

    Returns:
        ``("<start>-<end>", result)`` where ``result`` has the keys
        ``collocates``, ``total_tokens``, ``total_texts`` and ``contexts``,
        or ``None`` if processing failed (the error and traceback are printed).
    """
    # Unpacked outside the ``try`` so the error message can always name the window.
    window_start, window_end, window_texts = window_data
    try:
        all_tokens = [token for text in window_texts for token in process_tokens(text)]

        total_tokens = len(all_tokens)
        total_texts = len(window_texts)

        freq_filter = calculate_scaled_freq_filter(total_tokens, word_of_interest)
        print(f"Window {window_start}-{window_end}: Using frequency filter of {freq_filter} for {total_tokens} total words")

        # NOTE(review): per the NLTK docs, window_size is the number of tokens a
        # collocation spans, so window_size=3 pairs words at most 2 positions
        # apart, while the contexts below use +/- collocate_window words.
        # Confirm whether window_size=collocate_window + 1 was intended.
        finder = BigramCollocationFinder.from_words(all_tokens, window_size=collocate_window)
        finder.apply_freq_filter(freq_filter)

        collocations = finder.score_ngrams(BigramAssocMeasures().pmi)

        word_of_interest_lower = word_of_interest.lower()
        word_of_interest_freq = sum(
            freq for word, freq in finder.word_fd.items() if word.lower() == word_of_interest_lower
        )

        text_sources = {text: file_directory_map.get(text, ("Unknown", "Unknown")) for text in window_texts}

        # Whitespace-token vocabulary of every text, built once instead of
        # re-splitting each text for every candidate collocate.
        text_vocabularies = [set(text.lower().split()) for text in window_texts]
        texts_containing = {}  # lower-cased collocate -> number of texts containing it

        collocate_stats = []
        for bigram, pmi in collocations:
            if word_of_interest_lower not in (word.lower() for word in bigram):
                continue
            other_word = bigram[0] if bigram[1].lower() == word_of_interest_lower else bigram[1]
            other_lower = other_word.lower()

            if other_lower not in texts_containing:
                texts_containing[other_lower] = sum(
                    1 for vocabulary in text_vocabularies if other_lower in vocabulary
                )
            num_texts = texts_containing[other_lower]
            if num_texts <= 1:
                continue  # keep only collocates attested in more than one text

            word_freq = finder.word_fd[other_word]
            collocate_stats.append({
                'word': other_word,
                'total_corpus': word_freq,
                'expected_freq': (word_of_interest_freq * word_freq) / total_tokens,
                'observed_freq': finder.ngram_fd[bigram],
                'num_texts': num_texts,
                'pmi': pmi
            })

        # A collocate can qualify twice (once per bigram order). Map each one
        # to a single key so its contexts are collected exactly once.
        collocate_keys = {}
        for stat in collocate_stats:
            collocate_keys.setdefault(stat['word'].lower(), stat['word'])

        # Single pass over the texts: for each occurrence of the node word,
        # record the snippet under every qualifying collocate found in the span.
        collocate_contexts = defaultdict(list)
        for text in window_texts:
            dir_name, filename = text_sources[text]
            words = text.split()
            for i, word in enumerate(words):
                if word.lower() != word_of_interest_lower:
                    continue
                span = words[max(0, i - collocate_window):i + collocate_window + 1]
                matched = [key for key in {w.lower() for w in span} if key in collocate_keys]
                if not matched:
                    continue
                context_info = ' '.join(span)  # this is whats shown as context in the csv
                for key in matched:
                    other_word = collocate_keys[key]
                    collocate_contexts[other_word].append(
                        (context_info, other_word, filename, dir_name, freq_filter)
                    )

        collocate_stats.sort(key=lambda x: x['pmi'], reverse=True)

        # Emit contexts in PMI order, once per collocate. Repeating them for
        # each stats entry is what used to duplicate rows in the contexts CSV.
        filtered_contexts = []
        emitted = set()
        for stat in collocate_stats:
            if stat['word'] in emitted:
                continue
            emitted.add(stat['word'])
            filtered_contexts.extend(collocate_contexts.get(stat['word'], []))

        return f"{window_start}-{window_end}", {
            'collocates': collocate_stats,
            'total_tokens': total_tokens,
            'total_texts': total_texts,
            'contexts': filtered_contexts
        }
    except Exception as e:
        print(f"Error processing window {window_start}-{window_end}: {str(e)}")
        traceback.print_exc()
        return None

def analyze_collocates(text_dirs, output_dir, word_of_interest, start_year, end_year, collocate_window, window_size=10):
    """Run the collocation analysis over sliding year windows and save results.

    Texts from *text_dirs* whose filename year lies in
    ``[start_year, end_year]`` are grouped by year. Every window of
    *window_size* consecutive years (advancing one year at a time, the last
    one ending at *end_year*) is handed to ``process_window``. For each
    window that returns a result, a collocate CSV, a contexts CSV and a text
    report are written under *output_dir*.
    """
    def read_texts_by_year(directories):
        candidate_paths = []
        for directory in directories:
            for name in os.listdir(directory):
                if name.endswith('.txt'):
                    candidate_paths.append(os.path.join(directory, name))

        print("Reading files...")
        with concurrent.futures.ThreadPoolExecutor() as pool:
            outcomes = list(tqdm(pool.map(process_file, candidate_paths), total=len(candidate_paths)))

        grouped = defaultdict(list)
        for outcome in outcomes:
            if not outcome:
                continue
            year, text = outcome
            if start_year <= year <= end_year:
                grouped[year].append(text)

        return grouped

    def run_window(data):
        return process_window(data, word_of_interest, collocate_window, file_directory_map)

    print(f"Starting text processing for years {start_year}-{end_year}...")
    texts_by_year = read_texts_by_year(text_dirs)
    file_directory_map = build_file_directory_map(text_dirs)

    if not texts_by_year:
        print(f"No texts found in the specified year range {start_year}-{end_year}")
        return

    # One entry per window: (first year, last year, all texts from those years).
    window_data = []
    for first_year in range(start_year, end_year - window_size + 2):
        last_year = first_year + window_size - 1
        texts_in_window = []
        for year in range(first_year, last_year + 1):
            texts_in_window.extend(texts_by_year.get(year, []))
        window_data.append((first_year, last_year, texts_in_window))

    print("Processing windows...")
    with concurrent.futures.ThreadPoolExecutor() as pool:
        outcomes = list(tqdm(pool.map(run_window, window_data), total=len(window_data)))

    # Windows that failed return None and are left out.
    collocates_by_window = {}
    for outcome in outcomes:
        if outcome:
            label, window_result = outcome
            collocates_by_window[label] = window_result

    os.makedirs(output_dir, exist_ok=True)
    print("Saving results to CSV and TXT files...")

    for label, window_result in collocates_by_window.items():
        save_collocates_to_csv(output_dir, label, window_result)
        save_contexts_to_csv(output_dir, label, window_result)
        save_detailed_report(output_dir, label, window_result, word_of_interest)

    print(f"Collocate analysis completed. Results saved in the '{output_dir}' directory.")


#data saving functions    
def save_collocates_to_csv(output_dir, window, data):
    """Write the collocate table of one window to ``collocates_<window>.csv``.

    Rows are numbered from 1 in the order of ``data['collocates']``; the
    expected frequency and the MI value are formatted to two decimals.
    """
    csv_filename = os.path.join(output_dir, f'collocates_{window}.csv')
    header = ['No.', 'Word', 'Total no. in window corpus', 'Expected collocate frequency',
              'Observed collocate frequency', 'In no. of texts', 'Mutual Information value']

    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows(
            [
                rank,
                entry['word'],
                entry['total_corpus'],
                f"{entry['expected_freq']:.2f}",
                entry['observed_freq'],
                entry['num_texts'],
                f"{entry['pmi']:.2f}",
            ]
            for rank, entry in enumerate(data['collocates'], 1)
        )
    print(f"Saved collocates to {csv_filename}")

def save_contexts_to_csv(output_dir, window, data): #for later viewing
    """Write the context snippets of one window to ``contexts/contexts_<window>.csv``.

    Args:
        output_dir: Base output directory; a ``contexts`` subfolder is created.
        window: Window label used in the filename (e.g. ``"1715-1724"``).
        data: Window result whose ``'contexts'`` entry is a list of
            ``(context, collocate, filename, dir_name, freq_filter)`` tuples.

    The ``Year`` column is the four digits found between underscores in the
    filename, or ``'Unknown'`` when there are none.
    """
    subfolder_path = os.path.join(output_dir, 'contexts')
    os.makedirs(subfolder_path, exist_ok=True)
    csv_filename = os.path.join(subfolder_path, f'contexts_{window}.csv')

    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Context', 'Collocate', 'File', 'Directory', 'Year', 'Frequency Filter'])
        # The rows must be written while the file is still open; writing
        # after the ``with`` block raises "I/O operation on closed file".
        for context_info, other_word, filename, dir_name, freq_filter in data['contexts']:
            match = re.search(r'_(\d{4})_', filename)
            year = match.group(1) if match else 'Unknown'
            writer.writerow([context_info, other_word, filename, dir_name, year, freq_filter])

    print(f"Saved contexts to {csv_filename}")

def save_detailed_report(output_dir, window, data, word_of_interest): #this is for UFA, important to keep the formatting as-is just in case
    """Write the tab-separated text report for one window.

    The file goes to ``collocation_txt/collocates_txt_<window>.txt`` under
    *output_dir*. It starts with a fixed header block, followed by one
    tab-separated line per collocate in the order of ``data['collocates']``.
    """
    subfolder_path = os.path.join(output_dir, 'collocation_txt')
    os.makedirs(subfolder_path, exist_ok=True)
    txt_filename = os.path.join(subfolder_path, f'collocates_txt_{window}.txt')

    column_titles = ("No.\tWord\tTotal no. in whole corpus\tExpected collocate frequency\t"
                     "Observed collocate frequency\tIn no. of texts\tMutual information value\n\n")

    with open(txt_filename, 'w', encoding='utf-8') as txtfile:
        #spoofing the formatting of what UFA expects (on the website), kind of arbitrary by now but its okay
        txtfile.write("https://example.com/collocation-analysis\n")
        txtfile.write(f"There are {data['total_tokens']} different words in your collocation database for \"{word_of_interest}\". ")
        total_matches = sum(entry['observed_freq'] for entry in data['collocates'])
        txtfile.write(f"(Your query returned {total_matches} matches in {data['total_texts']} different texts)\n")
        txtfile.write("__________________\n\n")
        txtfile.write(column_titles)

        for rank, entry in enumerate(data['collocates'], 1):
            txtfile.write(
                f"{rank}\t{entry['word']}\t{entry['total_corpus']}\t"
                f"{entry['expected_freq']:.2f}\t{entry['observed_freq']}\t"
                f"{entry['num_texts']}\t{entry['pmi']:.2f}\n"
            )

    print(f"Saved detailed report to {txt_filename}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Igiba\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import concurrent.futures

# Each task is the positional argument tuple for analyze_collocates:
# (text_dirs, output_dir, word_of_interest, start_year, end_year, collocate_window).
# Commented-out entries are the "fact" runs; only the "facts" runs are active.
tasks = [
    #([r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb", 
      #r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl", 
      #r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta"], 
     #"collocation_results/FACT/collocate_results_1665-1958_FACT_css3_w3", "fact", 1665, 1958, 3), #collocate_window = 3/5
    
    ([r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb", 
      r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl", 
      r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta"], 
     "collocation_results/FACTS/collocate_results_1665-1958_FACTS_css2_w3", "facts", 1665, 1958, 3), #scaling is more unforgiving for facts (imperfect) BUT SETTING IT BELOW 3 IS TOO FORGIVING
    
    #([r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb", 
      #r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl", 
      #r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta"], 
     #"collocation_results/FACT/collocate_results_1665-1958_FACT_css3_w5", "fact", 1665, 1958, 5),
    
    ([r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb", 
      r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl", 
      r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta"], 
     "collocation_results/FACTS/collocate_results_1665-1958_FACTS_css2_w5", "facts", 1665, 1958, 5) #css = custom subcorpora scaling
]

def run_analyze_collocates(args):
    """Unpack one task tuple into the positional arguments of analyze_collocates."""
    analyze_collocates(*args)

# Run all tasks concurrently. An exception raised by a task is printed
# rather than propagated, so one failing task does not stop the others.
# NOTE: the cell that defines analyze_collocates must have been run first,
# otherwise every task fails with a NameError.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(run_analyze_collocates, task) for task in tasks]
    for future in concurrent.futures.as_completed(futures):
        try:
            future.result()
        except Exception as e:
            print(f"Error occurred: {e}")

Error occurred: name 'analyze_collocates' is not defined
Error occurred: name 'analyze_collocates' is not defined
Error occurred: name 'analyze_collocates' is not defined
Error occurred: name 'analyze_collocates' is not defined


---
extra stuff
---

ungodly messy, could use some clean up

In [None]:
#so my pc turns off eventually
# WARNING: running this cell powers the machine off. It waits 300 seconds
# and then issues the Windows `shutdown` command (/s = shut down,
# /t 1 = one-second delay). Save all work before running it.
import os
import time

time.sleep(300)
os.system("shutdown /s /t 1")

0

In [96]:
import pandas as pd
import plotly.graph_objects as go
from collections import defaultdict
import os
import re
from tqdm import tqdm

def count_files_in_year_windows(text_dirs, window_size=10):
    """Count corpus files per journal, accumulated over trailing year windows.

    The journal type is the last four characters of each directory name and
    must be ``rstb``, ``rsta`` or ``rstl``. The result maps a year to
    ``{'rstb': n, 'rsta': n, 'rstl': n}``. For each journal, ``n`` is the
    number of files dated in that year or the ``window_size - 1`` years
    before it. It is only accumulated when the journal has at least one file
    in that year itself; otherwise it stays 0.
    """
    def extract_year_from_filename(filename):
        found = re.search(r'rst[bla]?_?(\d{4})', filename)
        return None if found is None else int(found.group(1))

    # One per-year counter for each journal type
    files_per_year = {journal: defaultdict(int) for journal in ('rstb', 'rsta', 'rstl')}

    print("Counting files in year windows...")

    for text_dir in text_dirs:
        journal_type = os.path.basename(text_dir)[-4:]  # e.g. "txt_rstb" -> "rstb"
        txt_paths = [os.path.join(text_dir, name) for name in os.listdir(text_dir) if name.endswith('.txt')]

        for txt_path in tqdm(txt_paths):
            year = extract_year_from_filename(os.path.basename(txt_path))
            if not year:
                continue
            files_per_year[journal_type][year] += 1

    # Spread each year's count over the window_size years starting at that year
    combined_year_windows = defaultdict(lambda: {'rstb': 0, 'rsta': 0, 'rstl': 0})

    for journal_type, counts in files_per_year.items():
        for year in sorted(counts):
            for window_year in range(year, year + window_size):
                increment = counts[year] if window_year in counts else 0
                combined_year_windows[window_year][journal_type] += increment

    return combined_year_windows

# Example usage
year_counts = count_files_in_year_windows(
    [r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb",
     r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl",
     r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta"]
)

# Prepare the data for plotting. RSTB/RSTA are only shown from 1887 on and
# RSTL only up to 1886; outside those ranges the series is forced to 0.
years = list(range(1665, 1959))
df = pd.DataFrame({
    'Year': years,
    'RSTB Count': [year_counts.get(year, {'rstb': 0})['rstb'] if year >= 1887 else 0 for year in years],
    'RSTA Count': [year_counts.get(year, {'rsta': 0})['rsta'] if year >= 1887 else 0 for year in years],
    'RSTL Count': [year_counts.get(year, {'rstl': 0})['rstl'] if year <= 1886 else 0 for year in years]
})

# Create the interactive plot
fig = go.Figure()

# One trace per journal type: (data column, legend label, colour, year mask).
# The same mask is used for x, y and the hover text so the three always have
# the same length (the RSTL hover text used to stop one year early).
rstl_years = df['Year'] <= 1887
split_years = df['Year'] >= 1887
trace_specs = [
    ('RSTL Count', 'RSTL (1665-1887)', 'green', rstl_years),
    ('RSTB Count', 'RSTB (1887-1958)', 'blue', split_years),
    ('RSTA Count', 'RSTA (1887-1958)', 'red', split_years),
]

for column, label, colour, mask in trace_specs:
    fig.add_trace(go.Scatter(
        x=df['Year'][mask],
        y=df[column][mask],
        mode='markers+lines',
        name=f'<b>{label}</b>',  # bold tag is now properly closed
        marker=dict(
            size=10,
            color=colour,
            line=dict(width=1, color='black')
        ),
        text=[f'Year: {year}<br>{column}: {count}' for year, count in zip(df['Year'][mask], df[column][mask])],
        hoverinfo='text'
    ))

# Update layout
# NOTE(review): the values come from count_files_in_year_windows, which
# accumulates over a 10-year window, although the title says "per Year".
fig.update_layout(
    title='<b>Number of Articles per Year by Journal Type (1665-1958)</b>',
    xaxis_title='<b>Year</b>',
    yaxis_title='<b>Number of Articles</b>',
    xaxis=dict(
        tickmode='linear',
        tick0=1665,
        dtick=10,
        range=[1665, 1958]
    ),
    yaxis=dict(range=[0, max(df['RSTB Count'].max(), df['RSTA Count'].max(), df['RSTL Count'].max()) + 50]),
    template='plotly_white',
    width=1200,  # Reduce width
    height=500,  # Reduce height
)

# Show the plot
fig.show()
os.makedirs("../extra_plots", exist_ok=True)  # write_html does not create missing directories
fig.write_html("../extra_plots/royal_society_publications.html")

Counting files in year windows...


  0%|          | 0/5284 [00:00<?, ?it/s]

100%|██████████| 5284/5284 [00:00<00:00, 290376.58it/s]
100%|██████████| 8520/8520 [00:00<00:00, 230663.03it/s]
100%|██████████| 4390/4390 [00:00<00:00, 278921.37it/s]


In [97]:
import pandas as pd
import plotly.graph_objects as go
from collections import defaultdict
import os
import re
from tqdm import tqdm

def count_word_in_files(text_dirs, window_size=10):
    """Sum whitespace-separated word counts per journal over trailing year windows.

    The journal type is the last four characters of each directory name and
    must be ``rstb``, ``rsta`` or ``rstl``. The result maps a year to
    ``{'rstb': n, 'rsta': n, 'rstl': n}``. For each journal, ``n`` is the
    number of words in files dated in that year or the ``window_size - 1``
    years before it. It is only accumulated when the journal has data for
    that year itself; otherwise it stays 0.
    """
    def extract_year_from_filename(filename):
        found = re.search(r'rst[bla]?_?(\d{4})', filename)
        return None if found is None else int(found.group(1))

    # One per-year word counter for each journal type
    words_per_year = {journal: defaultdict(int) for journal in ('rstb', 'rsta', 'rstl')}

    print("Counting word counts in year windows...")

    for text_dir in text_dirs:
        journal_type = os.path.basename(text_dir)[-4:]  # e.g. "txt_rstb" -> "rstb"
        txt_paths = [os.path.join(text_dir, name) for name in os.listdir(text_dir) if name.endswith('.txt')]

        for txt_path in tqdm(txt_paths):
            year = extract_year_from_filename(os.path.basename(txt_path))
            if not year:
                continue
            with open(txt_path, 'r', encoding='utf-8') as handle:
                contents = handle.read()
            words_per_year[journal_type][year] += len(contents.split())

    # Spread each year's total over the window_size years starting at that year
    combined_year_windows = defaultdict(lambda: {'rstb': 0, 'rsta': 0, 'rstl': 0})

    for journal_type, counts in words_per_year.items():
        for year in sorted(counts):
            for window_year in range(year, year + window_size):
                increment = counts[year] if window_year in counts else 0
                combined_year_windows[window_year][journal_type] += increment

    return combined_year_windows

# Example usage
year_word_counts = count_word_in_files(
    [r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb",
     r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl",
     r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta"]
)

# Prepare the data for plotting. RSTB/RSTA are only shown from 1887 on and
# RSTL only up to 1886; outside those ranges the series is forced to 0.
years = list(range(1665, 1959))
df = pd.DataFrame({
    'Year': years,
    'RSTB Word Count': [year_word_counts.get(year, {'rstb': 0})['rstb'] if year >= 1887 else 0 for year in years],
    'RSTA Word Count': [year_word_counts.get(year, {'rsta': 0})['rsta'] if year >= 1887 else 0 for year in years],
    'RSTL Word Count': [year_word_counts.get(year, {'rstl': 0})['rstl'] if year <= 1886 else 0 for year in years]
})

# Create the interactive plot
fig = go.Figure()

# One trace per journal type: (data column, legend label, colour, year mask).
# The same mask is used for x, y and the hover text so the three always have
# the same length (the RSTL hover text used to stop one year early).
rstl_years = df['Year'] <= 1887
split_years = df['Year'] >= 1887
trace_specs = [
    ('RSTL Word Count', 'RSTL (1665-1887)', 'green', rstl_years),
    ('RSTB Word Count', 'RSTB (1887-1958)', 'blue', split_years),
    ('RSTA Word Count', 'RSTA (1887-1958)', 'red', split_years),
]

for column, label, colour, mask in trace_specs:
    fig.add_trace(go.Scatter(
        x=df['Year'][mask],
        y=df[column][mask],
        mode='markers+lines',
        name=f'<b>{label}</b>',  # bold tag is now properly closed
        marker=dict(
            size=10,
            color=colour,
            line=dict(width=1, color='black')
        ),
        text=[f'Year: {year}<br>{column}: {count}' for year, count in zip(df['Year'][mask], df[column][mask])],
        hoverinfo='text'
    ))

# Update layout
# NOTE(review): the values come from count_word_in_files, which accumulates
# over a 10-year window, although the title says "per Year".
fig.update_layout(
    title='<b>Word Count per Year by Journal Type (1665-1958)</b>',
    xaxis_title='<b>Year</b>',
    yaxis_title='<b>Word Count</b>',
    xaxis=dict(
        tickmode='linear',
        tick0=1665,
        dtick=10,
        range=[1665, 1958]
    ),
    yaxis=dict(range=[0, max(df['RSTB Word Count'].max(), df['RSTA Word Count'].max(), df['RSTL Word Count'].max()) + 50000]),
    template='plotly_white',
    width=1200,  # Reduce width
    height=500,  # Reduce height
)

# Show the plot
fig.show()
os.makedirs("../extra_plots", exist_ok=True)  # write_html does not create missing directories
fig.write_html("../extra_plots/royal_society_word_counts.html")


Counting word counts in year windows...


100%|██████████| 5284/5284 [00:03<00:00, 1531.15it/s]
100%|██████████| 8520/8520 [00:02<00:00, 3172.80it/s]
100%|██████████| 4390/4390 [00:02<00:00, 1730.34it/s]


In [99]:
import pandas as pd
import plotly.graph_objects as go
from collections import defaultdict
import os
import re
from tqdm import tqdm
import string
from nltk.tokenize import word_tokenize

def count_words_in_file(file_path):
    """
    Count the alphabetic word tokens in a UTF-8 text file.

    The text is lower-cased and tokenised with ``word_tokenize``; tokens
    that are not purely alphabetic (numbers, punctuation) are not counted.

    Args:
        file_path (str): Path to the text file

    Returns:
        int: Number of alphabetic tokens in the file
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lowered = f.read().lower()
    return sum(
        1
        for token in word_tokenize(lowered)
        if token.isalpha() and token not in string.punctuation
    )

def count_words_in_10_year_windows(text_dirs, start_year, end_year):
    """
    Count words in 10-year windows across multiple directories, per publication type.

    The publication type is the last four characters of each file's directory
    name and must be ``rsta``, ``rstb`` or ``rstl``. Windows advance one year
    at a time; the first starts at ``start_year`` and the last ends at
    ``end_year`` (the same windows that ``analyze_collocates`` processes).

    Args:
        text_dirs (list): List of directory paths
        start_year (int): Starting year of analysis
        end_year (int): Ending year of analysis

    Returns:
        dict: ``{pub_type: {window_start_year: total_words}}`` with separate
              counts for each publication type
    """
    def extract_year_from_filename(filename):
        """Extract year from filename"""
        match = re.search(r'rst[bla]?_?(\d{4})', filename)
        return int(match.group(1)) if match else None

    window_years = 10

    # Separate per-year word counts for each publication type
    word_counts = {
        'rsta': defaultdict(int),
        'rstb': defaultdict(int),
        'rstl': defaultdict(int)
    }

    print("Counting words in year windows...")

    # Collect file paths
    file_paths = [
        os.path.join(directory, name)
        for directory in text_dirs
        for name in os.listdir(directory)
        if name.endswith('.txt')
    ]

    # Count words for each file
    for file_path in tqdm(file_paths):
        year = extract_year_from_filename(os.path.basename(file_path))

        # Determine publication type from directory
        pub_type = os.path.basename(os.path.dirname(file_path))[-4:]

        if year and start_year <= year <= end_year:
            word_counts[pub_type][year] += count_words_in_file(file_path)

    # Aggregate words into 10-year windows
    words_per_window = {
        'rsta': defaultdict(int),
        'rstb': defaultdict(int),
        'rstl': defaultdict(int)
    }

    for pub_type in ['rsta', 'rstb', 'rstl']:
        # "+ 2" so that the final window ends exactly at end_year. The old
        # bound (end_year - 9) stopped one window early, so the last year
        # was never included in any window.
        for window_start in range(start_year, end_year - window_years + 2):
            window_end = window_start + window_years - 1
            # RSTA and RSTB only contribute from 1887 on, RSTL only up to 1886
            total_words = sum(
                word_counts[pub_type].get(y, 0)
                for y in range(window_start, window_end + 1)
                if (pub_type != 'rstl' and y >= 1887) or (pub_type == 'rstl' and y <= 1886)
            )
            words_per_window[pub_type][window_start] = total_words

    return words_per_window

# Directories for Royal Society publications
text_dirs = [
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl", 
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta"
]

# Count words for the historical period
historical_word_counts = count_words_in_10_year_windows(
    text_dirs,
    start_year=1665,
    end_year=1958
)

# Per-publication styling. The 'hoverinfo' descriptions are kept for
# reference but are not used by the plot below.
publication_types = {
    'rsta': {
        'color': 'red',
        'name': 'RSTA',
        'hoverinfo': 'Physical, mathematical, and engineering sciences'
    },
    'rstb': {
        'color': 'blue', 
        'name': 'RSTB',
        'hoverinfo': 'Biological research and life sciences'
    },
    'rstl': {
        'color': 'green',
        'name': 'RSTL',
        'hoverinfo': 'Scientific letters and short communications'
    }
}

# Create the interactive plot with enhanced styling
fig = go.Figure()

# Add traces for each publication type with line style
for pub_type, details in publication_types.items():
    # Only plot windows with a non-zero word count
    non_zero = [(window, count) for window, count in historical_word_counts[pub_type].items() if count > 0]
    if not non_zero:
        continue

    non_zero_windows = [window for window, _ in non_zero]
    non_zero_counts = [count for _, count in non_zero]

    # Determine the actual year range for the legend label
    first_year = non_zero_windows[0]
    last_year = non_zero_windows[-1]
    # RSTA and RSTB labels never start before 1887, even when an earlier
    # window already reaches into 1887
    if pub_type in ('rsta', 'rstb'):
        first_year = max(first_year, 1887)

    fig.add_trace(go.Scatter(
        x=non_zero_windows,
        y=non_zero_counts,
        mode='markers+lines',
        name=f'<b>{details["name"]} ({first_year}-{last_year})</b>',
        marker=dict(
            size=10,
            color=details['color'],
            line=dict(width=2, color='black')
        ),
        text=[f'Window Start: {window}<br>Total Words: {count}' for window, count in non_zero],
        hoverinfo='text'
    ))

# Update layout with enhanced design (bold tags are now properly closed)
fig.update_layout(
    title='<b>Total Words per 10-Year Window by Royal Society Publication Type (1665-1958)</b>',
    xaxis_title='<b>Window Start Year</b>',
    yaxis_title='<b>Total Words</b>',
    template='plotly_white',
    width=1200,  
    height=600,
    font=dict(family='Arial, sans-serif'),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=1.02,
        xanchor='center',
        x=0.5,
        font=dict(size=14, weight='bold')
    ),
    plot_bgcolor='rgba(240,240,240,0.5)',  # Light gray background
    margin=dict(t=100, b=50, l=50, r=50)
)

# Customize x-axis to show only integer ticks
fig.update_xaxes(
    tickmode='linear',
    tick0=1665,
    dtick=10,
    tickfont=dict(size=14)
)

# Customize y-axis with comma-separated thousands
fig.update_yaxes(
    tickformat=',',
    title_font=dict(size=14),  # Increase the font size here
    title_text='<b>Total Words</b>'
)

# Show the plot
fig.show()
os.makedirs("../extra_plots", exist_ok=True)  # write_html does not create missing directories
fig.write_html("../extra_plots/royal_society_word_counts_10_year_windows_split.html")


Counting words in year windows...


100%|██████████| 18194/18194 [06:09<00:00, 49.28it/s]  


In [100]:
import pandas as pd
import plotly.graph_objects as go
from collections import defaultdict
import os
import re
from tqdm import tqdm
import string
from nltk.tokenize import word_tokenize

def count_words_in_file(file_path):
    """
    Count the alphabetic word tokens in a UTF-8 text file.

    The text is lower-cased and tokenised with ``word_tokenize``; tokens
    that are not purely alphabetic (numbers, punctuation) are not counted.

    Args:
        file_path (str): Path to the text file

    Returns:
        int: Number of alphabetic tokens in the file
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lowered = f.read().lower()
    return sum(
        1
        for token in word_tokenize(lowered)
        if token.isalpha() and token not in string.punctuation
    )

def count_words_in_10_year_windows(text_dirs, start_year, end_year):
    """
    Count words in sliding 10-year windows across multiple directories.

    Every ``.txt`` file whose name carries a year between ``start_year`` and
    ``end_year`` (both inclusive) is counted with ``count_words_in_file``.
    The per-year totals are then summed into overlapping windows
    ``[w, w + 9]``, one window for each start year ``w`` whose window lies
    entirely inside the analysed range.

    Args:
        text_dirs (list): List of directory paths
        start_year (int): First year of the analysis (inclusive)
        end_year (int): Last year of the analysis (inclusive)

    Returns:
        dict: Maps each window start year to the total word count of that
        window (all directories combined). Empty when the range spans fewer
        than 10 years.
    """
    # Compiled once instead of on every filename
    year_pattern = re.compile(r'rst[bla]?_?(\d{4})')

    def extract_year_from_filename(filename):
        """Return the four-digit year found in the filename, or None."""
        match = year_pattern.search(filename)
        return int(match.group(1)) if match else None

    # Total word counts per year (all publication types combined)
    word_counts = defaultdict(int)

    print("Counting words in year windows...")

    # Collect the .txt files of every directory (non-recursive)
    file_paths = [
        os.path.join(dir_path, name)
        for dir_path in text_dirs
        for name in os.listdir(dir_path)
        if name.endswith('.txt')
    ]

    # Count words only for files dated inside the requested range
    for file_path in tqdm(file_paths):
        year = extract_year_from_filename(os.path.basename(file_path))

        if year is not None and start_year <= year <= end_year:
            word_counts[year] += count_words_in_file(file_path)

    # Aggregate words into 10-year windows
    words_per_window = defaultdict(int)

    # The last full window starts at end_year - 9 (it covers end_year itself),
    # so the exclusive range bound is end_year - 8. The previous bound of
    # end_year - 9 dropped that window and with it every word from end_year.
    for window_start in range(start_year, end_year - 8):
        words_per_window[window_start] = sum(
            word_counts.get(y, 0)
            for y in range(window_start, window_start + 10)
        )

    return words_per_window

# Directories for Royal Society publications
text_dirs = [
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta"
]

# Count words for the historical period
historical_word_counts = count_words_in_10_year_windows(
    text_dirs,
    start_year=1665,
    end_year=1958
)

# Create the interactive plot with combined word counts
fig = go.Figure()

# Prepare data for the single plot (combined word counts across all publications)
windows = list(historical_word_counts.keys())
counts = list(historical_word_counts.values())

# Only plot non-zero word counts
non_zero_windows = [window for window, count in zip(windows, counts) if count > 0]
non_zero_counts = [count for count in counts if count > 0]

if non_zero_windows:  # Check if there are any non-zero word counts
    # First and last window START years that contain data; these label the
    # legend entry (they are window starts, not the last year covered).
    first_year = non_zero_windows[0]
    last_year = non_zero_windows[-1]

    fig.add_trace(go.Scatter(
        x=non_zero_windows,
        y=non_zero_counts,
        mode='markers+lines',
        name=f'<b>Total Word Count ({first_year}-{last_year})</b>',
        marker=dict(
            size=10,
            color='purple',  # Color for the combined line
            line=dict(width=2, color='black')
        ),
        text=[f'Window Start: {window}<br>Total Words: {count}' for window, count in zip(non_zero_windows, non_zero_counts)],
        hoverinfo='text'
    ))

# Figure-wide layout: titles, size, fonts and a horizontal legend above the plot.
# Bold text in Plotly titles must be closed with </b> (the previous strings
# ended with a second opening <b> tag).
fig.update_layout(
    title='<b>Total Words per 10-Year Window for All Royal Society Publications (1665-1958)</b>',
    xaxis_title='<b>Window Start Year</b>',
    yaxis_title='<b>Total Words</b>',
    template='plotly_white',
    width=1200,
    height=600,
    font=dict(family='Arial, sans-serif'),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=1.02,
        xanchor='center',
        x=0.5,
        font=dict(size=14, weight='bold')
    ),
    plot_bgcolor='rgba(240,240,240,0.5)',  # Light gray background
    margin=dict(t=100, b=50, l=50, r=50)
)

# Place one x-axis tick every 10 years, starting at the first window (1665)
fig.update_xaxes(
    tickmode='linear',
    tick0=1665,
    dtick=10,
    tickfont=dict(size=14)
)

# Customize y-axis with comma-separated thousands
fig.update_yaxes(
    tickformat=',',
    title_font=dict(size=14),  # Larger axis-title font
    title_text='<b>Total Words</b>'
)

# Render the figure in the notebook, then save a standalone HTML copy
fig.show()
fig.write_html("../extra_plots/royal_society_combined_word_counts_10_year_windows.html")


Counting words in year windows...


  0%|          | 67/18194 [00:04<21:15, 14.22it/s]


KeyboardInterrupt: 

In [4]:
import pandas as pd
import plotly.graph_objects as go
import os

def find_highest_numbers_in_directory(directory_path):
    """
    Find the largest first-column integer in every ``.txt`` file of a directory.

    Each file is read as UTF-8, tab-separated text. Its first line is treated
    as a header and skipped; rows whose first column is not an integer are
    ignored. The scan is not recursive.

    Args:
        directory_path (str): Directory containing the ``.txt`` files

    Returns:
        dict: Maps each filename to the highest integer found in its first
        column. The value is 0 when the file has no row with a positive
        integer. Files that cannot be opened, read or decoded are left out
        (a warning is issued for each).
    """
    import warnings  # local import: only used to report skipped files

    highest_numbers = {}

    for filename in os.listdir(directory_path):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(directory_path, filename)
        highest_number = 0  # Running maximum for the current file
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                next(f, None)  # Skip the header line (no-op on an empty file)
                for line in f:
                    first_column = line.split('\t', 1)[0]
                    try:
                        highest_number = max(highest_number, int(first_column))
                    except ValueError:
                        continue  # Ignore non-integer values
        except (OSError, UnicodeError) as exc:
            # Best effort: an unreadable file must not abort the whole scan,
            # but it should not disappear silently either.
            warnings.warn(f"Skipping {file_path}: {exc}")
            continue

        # Recorded only when the whole file was read successfully
        highest_numbers[filename] = highest_number

    return highest_numbers

# Scan the collocate result tables and keep the largest first-column number per file
collocate_directory_path = r"collocate_results_combined_fact_10_window"
highest_numbers = find_highest_numbers_in_directory(collocate_directory_path)

# One row per result file: (filename, highest number)
df = pd.DataFrame(
    [(file_name, top_number) for file_name, top_number in highest_numbers.items()],
    columns=['File', 'Highest Number'],
)

# Label each row with the "YYYY-YYYY" span found in its filename,
# falling back to the full filename when no such span is present
extracted_ranges = df['File'].str.extract(r'(\d{4}-\d{4})')[0]
df['Year Range'] = extracted_ranges.fillna(df['File'])

# Bar trace: one bar per file, labelled with its value
bar_trace = go.Bar(
    x=df['Year Range'],
    y=df['Highest Number'],
    marker=dict(color='blue'),
    text=df['Highest Number'],
    textposition='auto',
    hovertemplate='Year Range: %{x}<br>%{y}<extra></extra>',
)

# Assemble the interactive figure
fig = go.Figure()
fig.add_trace(bar_trace)

# Titles, slanted x labels for readability, and the white template
layout_options = dict(
    title='Amount of Collocates per Year Window',
    xaxis_title='Year Range',
    yaxis_title='Highest Number',
    xaxis_tickangle=-45,
    template='plotly_white',
)
fig.update_layout(**layout_options)

# Render in the notebook
fig.show()
