C5-NC5 indicates that the MI-score with the cut-off point three was used as the association measure; the collocates were identified in the span of three words to the left (L3) and three words to the right (R3) of the node and the frequency threshold was five for both the collocate (C5) and the collocation (NC5). (frequency threshold exists)

Generally, the smaller the span, the greater the focus of the analysis on the most immediate lexico-grammatical patterns; a larger span captures looser associations.

TO DO
- make the freq threshold vary based on the amount of data for each year gap!!! (done, but might need adjusting)
- rerun with 3l3r and 5l5r for comparison (done)
- avoid lemmatization but tokenize (fact/facts) (done)
- include, as a side note, whether the text is from rstl, rstb or rsta (done)
- different freq thresholds for fact/facts since they are not as common --> calculating the minimum subcorpora amount for the frequency (done, technically)

minimum subcorpora:
for every 412k words of corpus, the frequency threshold should be 3

Period          Total Words  Fact     Facts   
1715-1724       412012       3        3


however, the scaling should perhaps be different since there are fewer occurrences of "facts" overall? though the two words align on the smallest subcorpus

In [None]:
import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.tokenize import word_tokenize
import string
import csv
import os
import re
from collections import defaultdict
from tqdm.notebook import tqdm
import traceback
import concurrent.futures

nltk.download('punkt')

def extract_year_from_filename(filename):
    """Return the four-digit year embedded in a corpus filename, or None.

    The year is the run of four digits that directly follows ``rst``,
    ``rstb``, ``rstl`` or ``rsta`` and an underscore (e.g. ``rstl_1700``).
    """
    found = re.search(r'rst[bla]?_(\d{4})', filename)
    if not found:
        return None
    return int(found.group(1))

def clean_text(text):
    """Normalise the long s to a plain ``s``.

    Both the real character and its two-character mis-decoded form are
    replaced. This step is optional and could be expanded upon or omitted.
    """
    for long_s_form in ('ſ', 'Å¿'):
        text = text.replace(long_s_form, 's')
    return text

def build_file_directory_map(text_dirs):
    """Map each cleaned file text to the ``(directory name, filename)`` it came from.

    Only ``.txt`` files directly inside each directory are read. The dictionary
    is keyed by the full cleaned text, so the directory and filename can be
    looked up later from the text alone; two files with identical cleaned
    content share one entry (the one read last wins). Files that cannot be
    read are reported and skipped.
    """
    lookup = {}
    for directory in text_dirs:
        source_name = os.path.basename(directory)
        txt_names = [name for name in os.listdir(directory) if name.endswith('.txt')]
        for txt_name in txt_names:
            file_path = os.path.join(directory, txt_name)
            try:
                with open(file_path, 'r', encoding='utf-8') as handle:
                    raw = handle.read()
                    lookup[clean_text(raw)] = (source_name, txt_name)
            except Exception as e:
                print(f"Error reading file {file_path}: {str(e)}")
    return lookup

def process_file(file_path):
    """Read one corpus file and return ``(year, cleaned_text)``.

    Returns None when no year can be extracted from the filename or when
    reading fails (the error is printed rather than raised).
    """
    try:
        year = extract_year_from_filename(os.path.basename(file_path))
        if year:
            with open(file_path, 'r', encoding='utf-8') as handle:
                raw = handle.read()
            return year, clean_text(raw)
        return None
    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")
        return None

def process_tokens(text):
    """Tokenize lower-cased ``text`` and keep only purely alphabetic tokens."""
    kept = []
    for tok in word_tokenize(text.lower()):
        if not tok.isalpha():
            continue
        if tok in string.punctuation:
            continue
        kept.append(tok)
    return kept

def calculate_scaled_freq_filter(total_words, word_of_interest, base_threshold=3, base_corpus_size=412012):
    """Return the collocation frequency threshold scaled to the corpus size.

    The threshold is ``base_threshold`` for corpora of up to
    ``base_corpus_size`` words (412,012 is the smallest subcorpus) and grows
    linearly by ``base_threshold / base_corpus_size`` for every word beyond
    that, rounded to an integer.

    Args:
        total_words: Number of tokens in the window corpus.
        word_of_interest: The node word. It does not influence the result:
            the previous check ``word_of_interest == "fact" or "facts"`` was
            always true, so every word already received this scaling. That
            behaviour is kept, but made explicit, so the function can never
            fall through and return None.
        base_threshold: Threshold for the smallest subcorpus.
        base_corpus_size: Size in words of the smallest subcorpus.

    Returns:
        int: The minimum frequency to pass to the collocation finder.
    """
    # A laxer variant for "facts" was tried and rejected (it did not make a
    # meaningful enough difference and may have been too lax). It can be
    # reproduced with base_threshold=2, base_corpus_size=411208.
    if total_words <= base_corpus_size:
        return base_threshold

    scale_factor = base_threshold / base_corpus_size  # threshold increase per extra word
    additional_threshold = (total_words - base_corpus_size) * scale_factor
    return base_threshold + round(additional_threshold)  # needs to be an integer


def _count_texts_containing(window_texts, lowered_words):
    """Count, for each word in the set ``lowered_words``, how many texts contain it.

    A text contains a word when the word equals one of the text's lower-cased,
    whitespace-separated tokens. Every text is split exactly once.
    """
    counts = defaultdict(int)
    if not lowered_words:
        return counts
    for text in window_texts:
        for found in lowered_words & set(text.lower().split()):
            counts[found] += 1
    return counts


def _collect_contexts(window_texts, node_lower, collocate_words, collocate_window, file_directory_map, freq_filter):
    """Gather context snippets for all collocates in a single pass over the texts.

    Returns a dict mapping each collocate to a list of
    ``(context, collocate, filename, dir_name, freq_filter)`` tuples, ordered
    by text and then by position of the node within the text.
    """
    contexts = defaultdict(list)
    words_by_lower = defaultdict(list)
    for collocate in collocate_words:
        words_by_lower[collocate.lower()].append(collocate)
    if not words_by_lower:
        return contexts

    for text in window_texts:
        words = text.split()
        dir_name, filename = file_directory_map.get(text, ("Unknown", "Unknown"))
        for i, word in enumerate(words):
            if word.lower() != node_lower:
                continue
            span = words[max(0, i - collocate_window):i + collocate_window + 1]
            context = ' '.join(span)  # this is what is shown as context in the csv
            for lowered in {w.lower() for w in span}:
                for collocate in words_by_lower.get(lowered, ()):
                    contexts[collocate].append((context, collocate, filename, dir_name, freq_filter))
    return contexts


def process_window(window_data, word_of_interest, collocate_window, file_directory_map):
    """Score the collocates of ``word_of_interest`` for one window of years.

    Args:
        window_data: ``(window_start, window_end, window_texts)`` tuple.
        word_of_interest: The node word whose collocates are wanted.
        collocate_window: Passed to NLTK as ``window_size`` and used as the
            number of words shown on each side of the node in the contexts.
        file_directory_map: Maps a cleaned text to ``(dir_name, filename)``.

    Returns:
        ``("<start>-<end>", result)`` where ``result`` has the keys
        ``collocates`` (stat dicts sorted by PMI, highest first),
        ``total_tokens``, ``total_texts`` and ``contexts``; or None if
        anything fails (the error and traceback are printed).

    Changes from the previous version:
        * A failure while unpacking ``window_data`` no longer raises from
          inside the error handler (the window bounds were unbound there).
        * Texts are split once per window instead of once per collocate.
        * Contexts are collected once per collocate word. Previously a
          collocate found on both sides of the node (two ordered bigrams)
          had every context row written several times.
    """
    window_label = "unknown"  # fallback for the error message if unpacking fails
    try:
        window_start, window_end, window_texts = window_data
        window_label = f"{window_start}-{window_end}"
        all_tokens = [token for text in window_texts for token in process_tokens(text)]

        total_tokens = len(all_tokens)
        total_texts = len(window_texts)

        freq_filter = calculate_scaled_freq_filter(total_tokens, word_of_interest)
        print(f"Window {window_label}: Using frequency filter of {freq_filter} for {total_tokens} total words")

        # NOTE(review): NLTK's window_size appears to count the tokens spanned
        # by a pair, so window_size=3 would pair the node with words at most
        # two positions away, while the contexts below show collocate_window
        # words on each side. If a true 3L-3R span is intended this may need
        # collocate_window + 1 - confirm against the NLTK documentation.
        finder = BigramCollocationFinder.from_words(all_tokens, window_size=collocate_window)
        finder.apply_freq_filter(freq_filter)

        collocations = finder.score_ngrams(BigramAssocMeasures().pmi)

        word_of_interest_lower = word_of_interest.lower()
        word_of_interest_freq = sum(
            freq for word, freq in finder.word_fd.items() if word.lower() == word_of_interest_lower
        )

        # Keep only the scored pairs that involve the node word.
        node_pairs = []
        for bigram, pmi in collocations:
            if word_of_interest_lower in (word.lower() for word in bigram):
                other_word = bigram[0] if bigram[1].lower() == word_of_interest_lower else bigram[1]
                node_pairs.append((bigram, other_word, pmi))

        # NOTE: text counts and contexts work on raw whitespace-split words,
        # whereas the scores above use the tokenized, alphabetic-only tokens,
        # so a word with attached punctuation (e.g. "fact,") is not matched here.
        texts_containing = _count_texts_containing(
            window_texts, {other_word.lower() for _, other_word, _ in node_pairs}
        )

        collocate_stats = []
        for bigram, other_word, pmi in node_pairs:
            num_texts = texts_containing[other_word.lower()]
            if num_texts > 1:  # the collocate must occur in more than one text
                word_freq = finder.word_fd[other_word]
                collocate_stats.append({
                    'word': other_word,
                    'total_corpus': word_freq,
                    'expected_freq': (word_of_interest_freq * word_freq) / total_tokens,
                    'observed_freq': finder.ngram_fd[bigram],
                    'num_texts': num_texts,
                    'pmi': pmi
                })

        collocate_stats.sort(key=lambda x: x['pmi'], reverse=True)

        # One context block per distinct collocate, in order of first appearance
        # in the sorted stats, so the contexts line up with the collocate table.
        unique_collocates = list(dict.fromkeys(entry['word'] for entry in collocate_stats))
        collocate_contexts = _collect_contexts(
            window_texts, word_of_interest_lower, unique_collocates,
            collocate_window, file_directory_map, freq_filter
        )
        filtered_contexts = [
            context for word in unique_collocates for context in collocate_contexts.get(word, [])
        ]

        return window_label, {
            'collocates': collocate_stats,
            'total_tokens': total_tokens,
            'total_texts': total_texts,
            'contexts': filtered_contexts
        }
    except Exception as e:
        print(f"Error processing window {window_label}: {str(e)}")
        traceback.print_exc()
        return None

def analyze_collocates(text_dirs, output_dir, word_of_interest, start_year, end_year, collocate_window, window_size=10):
    """Run the collocate analysis over sliding year windows and save the results.

    Texts are read from ``text_dirs`` and grouped by year. Every window of
    ``window_size`` consecutive years starting between ``start_year`` and
    ``end_year - window_size + 1`` is processed with ``process_window``.
    For each window that produced a result, a collocate CSV, a context CSV
    and a text report are written under ``output_dir``.
    """
    def read_texts_by_year(directories):
        """Read all .txt files and group the cleaned texts by year within range."""
        file_paths = []
        for directory in directories:
            for name in os.listdir(directory):
                if name.endswith('.txt'):
                    file_paths.append(os.path.join(directory, name))

        print("Reading files...")
        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = list(tqdm(executor.map(process_file, file_paths), total=len(file_paths)))

        grouped = defaultdict(list)
        for result in results:
            if not result:
                continue  # unreadable file or no year in the filename
            year, text = result
            if start_year <= year <= end_year:
                grouped[year].append(text)
        return grouped

    def run_window(data):
        """Process one window with the settings of this analysis."""
        return process_window(data, word_of_interest, collocate_window, file_directory_map)

    print(f"Starting text processing for years {start_year}-{end_year}...")
    texts_by_year = read_texts_by_year(text_dirs)
    file_directory_map = build_file_directory_map(text_dirs)

    if not texts_by_year:
        print(f"No texts found in the specified year range {start_year}-{end_year}")
        return

    # One entry per window: (first year, last year, all texts of those years).
    window_data = []
    for first_year in range(start_year, end_year - window_size + 2):
        last_year = first_year + window_size - 1
        texts = []
        for year in range(first_year, last_year + 1):
            texts.extend(texts_by_year.get(year, []))
        window_data.append((first_year, last_year, texts))

    print("Processing windows...")
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(run_window, window_data), total=len(window_data)))

    collocates_by_window = {}
    for outcome in results:
        if outcome:  # failed windows come back as None
            label, payload = outcome
            collocates_by_window[label] = payload

    os.makedirs(output_dir, exist_ok=True)
    print("Saving results to CSV and TXT files...")

    for label, payload in collocates_by_window.items():
        save_collocates_to_csv(output_dir, label, payload)
        save_contexts_to_csv(output_dir, label, payload)
        save_detailed_report(output_dir, label, payload, word_of_interest)

    print(f"Collocate analysis completed. Results saved in the '{output_dir}' directory.")


#data saving functions    
def save_collocates_to_csv(output_dir, window, data):
    """Write the collocate table of one window to ``collocates_<window>.csv``.

    ``data['collocates']`` is a list of stat dicts; rows are numbered from 1
    and the expected frequency and PMI are formatted with two decimals.
    Errors while writing are printed, not raised.
    """
    import csv
    import os

    csv_filename = os.path.join(output_dir, f'collocates_{window}.csv')
    os.makedirs(os.path.dirname(csv_filename), exist_ok=True)

    header = ['No.', 'Word', 'Total no. in window corpus', 'Expected collocate frequency',
              'Observed collocate frequency', 'In no. of texts', 'Mutual Information value']

    try:
        with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(header)
            writer.writerows(
                [
                    rank,
                    entry['word'],
                    entry['total_corpus'],
                    f"{entry['expected_freq']:.2f}",
                    entry['observed_freq'],
                    entry['num_texts'],
                    f"{entry['pmi']:.2f}",
                ]
                for rank, entry in enumerate(data['collocates'], 1)
            )
        print(f"Saved collocates to {csv_filename}")
    except Exception as e:
        print(f"Error saving collocates to {csv_filename}: {e}")

def save_contexts_to_csv(output_dir, window, data):
    """Write the context rows of one window to ``contexts/contexts_<window>.csv``.

    Each entry of ``data['contexts']`` is a
    ``(context, collocate, filename, dir_name, freq_filter)`` tuple. The Year
    column is the four digits found between underscores in the filename, or
    ``'Unknown'`` when there are none. Errors while writing are printed, not raised.
    """
    import csv
    import os
    import re

    contexts_dir = os.path.join(output_dir, 'contexts')
    os.makedirs(contexts_dir, exist_ok=True)
    csv_filename = os.path.join(contexts_dir, f'contexts_{window}.csv')

    def year_of(name):
        found = re.search(r'_(\d{4})_', name)
        if found:
            return found.group(1)
        return 'Unknown'

    try:
        with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['Context', 'Collocate', 'File', 'Directory', 'Year', 'Frequency Filter'])

            for row in data['contexts']:
                context_info, other_word, filename, dir_name, freq_filter = row
                writer.writerow([context_info, other_word, filename, dir_name, year_of(filename), freq_filter])

        print(f"Saved contexts to {csv_filename}")
    except Exception as e:
        print(f"Error saving contexts to {csv_filename}: {e}")

def save_detailed_report(output_dir, window, data, word_of_interest):
    """Write a tab-separated text report to ``collocation_txt/collocates_txt_<window>.txt``.

    The report starts with a URL line and a summary line built from
    ``data['total_tokens']``, the sum of the observed frequencies and
    ``data['total_texts']``, followed by a column header and one numbered
    line per collocate. Errors while writing are printed, not raised.
    """
    import os

    report_dir = os.path.join(output_dir, 'collocation_txt')
    os.makedirs(report_dir, exist_ok=True)
    txt_filename = os.path.join(report_dir, f'collocates_txt_{window}.txt')

    try:
        with open(txt_filename, 'w', encoding='utf-8') as txtfile:
            txtfile.write("https://example.com/collocation-analysis\n")
            txtfile.write(
                f"There are {data['total_tokens']} different words in your collocation database for \"{word_of_interest}\". "
                f"(Your query returned {sum(c['observed_freq'] for c in data['collocates'])} matches in {data['total_texts']} different texts)\n"
            )
            txtfile.write("__________________\n\n")

            txtfile.write(
                "No.\tWord\tTotal no. in whole corpus\tExpected collocate frequency\t"
                "Observed collocate frequency\tIn no. of texts\tMutual information value\n\n"
            )

            for rank, entry in enumerate(data['collocates'], 1):
                txtfile.write(
                    f"{rank}\t{entry['word']}\t{entry['total_corpus']}\t"
                    f"{entry['expected_freq']:.2f}\t{entry['observed_freq']}\t"
                    f"{entry['num_texts']}\t{entry['pmi']:.2f}\n"
                )

        print(f"Saved detailed report to {txt_filename}")
    except Exception as e:
        print(f"Error saving detailed report to {txt_filename}: {e}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Igiba\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import concurrent.futures

# Every run reads the same three Royal Society directories.
royal_society_dirs = [
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta",
]

# Each task is the positional argument tuple for analyze_collocates:
# (text_dirs, output_dir, word_of_interest, start_year, end_year, collocate_window)
tasks = [
    # collocate_window = 3/5
    (list(royal_society_dirs),
     "collocation_results/FACT/collocate_results_1665-1958_FACT_css3_w3", "fact", 1665, 1958, 3),

    # scaling is more unforgiving for facts (imperfect) BUT SETTING IT BELOW 3 IS TOO FORGIVING
    (list(royal_society_dirs),
     "collocation_results/FACTS/collocate_results_1665-1958_FACTS_css3_w3", "facts", 1665, 1958, 3),

    (list(royal_society_dirs),
     "collocation_results/FACT/collocate_results_1665-1958_FACT_css3_w5", "fact", 1665, 1958, 5),

    (list(royal_society_dirs),
     "collocation_results/FACTS/collocate_results_1665-1958_FACTS_css3_w5", "facts", 1665, 1958, 5),

    # Tested smaller scaling for "facts" with this one (changing scaling is in the function, not here):
    # (list(royal_society_dirs),
    #  "collocation_results/FACTS/collocate_results_1665-1958_FACTS_css2_w5", "facts", 1665, 1958, 5)
]


def run_analyze_collocates(args):
    """Unpack one task tuple into an analyze_collocates call."""
    analyze_collocates(*args)


# Submit all tasks to one thread pool and report failures as they finish.
with concurrent.futures.ThreadPoolExecutor() as executor:
    pending = []
    for task in tasks:
        pending.append(executor.submit(run_analyze_collocates, task))
    for finished in concurrent.futures.as_completed(pending):
        try:
            finished.result()
        except Exception as e:
            print(f"Error occurred: {e}")

Starting text processing for years 1665-1958...
Reading files...


  0%|          | 0/18194 [00:00<?, ?it/s]

Processing windows...


  0%|          | 0/285 [00:00<?, ?it/s]

Window 1673-1682: Using frequency filter of 3 for 514030 total words
Window 1674-1683: Using frequency filter of 3 for 544329 total words
Window 1679-1688: Using frequency filter of 3 for 523496 total words
Window 1680-1689: Using frequency filter of 3 for 523496 total words
Window 1678-1687: Using frequency filter of 3 for 523496 total words
Window 1676-1685: Using frequency filter of 3 for 572824 total words
Window 1672-1681: Using frequency filter of 3 for 618800 total words
Window 1675-1684: Using frequency filter of 3 for 557431 total words
Window 1677-1686: Using frequency filter of 3 for 624937 total words
Window 1671-1680: Using frequency filter of 4 for 732745 total words
Window 1666-1675: Using frequency filter of 5 for 941124 total words
Window 1667-1676: Using frequency filter of 5 for 951536 total words
Window 1670-1679: Using frequency filter of 4 for 862250 total words
Window 1669-1678: Using frequency filter of 5 for 952403 total words
Window 1668-1677: Using frequency 

---
extra stuff
---

ungodly messy, could use some clean up

In [104]:
#so my pc turns off eventually
# WARNING: running this cell asks the operating system to shut the machine down.
# Do not run it as part of "run all" unless that is intended.
import os
import time

time.sleep(300)  # wait 300 seconds (5 minutes) before issuing the command
os.system("shutdown /s /t 1")  # presumably the Windows shutdown command (/s = shut down, /t 1 = 1 s delay) - confirm on other platforms

0

In [99]:
import pandas as pd
import plotly.graph_objects as go
from collections import defaultdict
import os
import re
from tqdm import tqdm
import string
from nltk.tokenize import word_tokenize

def count_words_in_file(file_path):
    """
    Count the alphabetic word tokens in a UTF-8 text file.

    The text is lower-cased and tokenized with ``word_tokenize``; only tokens
    made up entirely of letters are counted.

    Args:
        file_path (str): Path to the text file

    Returns:
        int: Number of words in the file
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lowered = f.read().lower()
    return sum(
        1
        for token in word_tokenize(lowered)
        if token.isalpha() and token not in string.punctuation
    )

def count_words_in_10_year_windows(text_dirs, start_year, end_year):
    """
    Count words in 10-year windows across multiple directories.

    The publication type of a file is the last four characters of its
    directory name (``rsta``, ``rstb`` or ``rstl``); a directory with any
    other suffix raises KeyError. Within a window, RSTL words are only
    counted for years up to 1886 and RSTA/RSTB words only for years from
    1887 onwards.

    Args:
        text_dirs (list): List of directory paths
        start_year (int): Starting year of analysis
        end_year (int): Ending year of analysis

    Returns:
        dict: Dictionary of word counts per 10-year window, keyed by
              publication type and then by window start year
    """
    def extract_year_from_filename(filename):
        """Extract year from filename"""
        match = re.search(r'rst[bla]?_?(\d{4})', filename)
        return int(match.group(1)) if match else None

    pub_types = ('rsta', 'rstb', 'rstl')

    # Separate per-year word counts for each publication type
    word_counts = {pub_type: defaultdict(int) for pub_type in pub_types}

    print("Counting words in year windows...")

    # Collect file paths
    file_paths = [
        os.path.join(directory, name)
        for directory in text_dirs
        for name in os.listdir(directory)
        if name.endswith('.txt')
    ]

    # Count words for each file
    for file_path in tqdm(file_paths):
        year = extract_year_from_filename(os.path.basename(file_path))

        # Determine publication type from directory
        pub_type = os.path.basename(os.path.dirname(file_path))[-4:]

        if year and start_year <= year <= end_year:
            word_counts[pub_type][year] += count_words_in_file(file_path)

    # Aggregate words into 10-year windows
    words_per_window = {pub_type: defaultdict(int) for pub_type in pub_types}

    for pub_type in pub_types:
        yearly = word_counts[pub_type]
        for window_start in range(start_year, end_year - 9):
            years = range(window_start, window_start + 10)
            if pub_type == 'rstl':
                # RSTL only before 1887
                total_words = sum(yearly.get(y, 0) for y in years if y <= 1886)
            else:
                # RSTA and RSTB start in 1887
                total_words = sum(yearly.get(y, 0) for y in years if y >= 1887)
            words_per_window[pub_type][window_start] = total_words

    return words_per_window

# Directories for Royal Society publications
# (the last four characters of each directory name are used as the publication type)
text_dirs = [
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl", 
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta"
]

# Count words for the historical period
# Result: {publication type: {window start year: total words}}
historical_word_counts = count_words_in_10_year_windows(
    text_dirs,
    start_year=1665,
    end_year=1958
)

# Prepare data for plotting with enhanced visual styling
# NOTE: the plotting code below only reads 'color' and 'name'; the 'hoverinfo'
# descriptions are not used by it.
publication_types = {
    'rsta': {
        'color': 'red',
        'name': 'RSTA',
        'hoverinfo': 'Physical, mathematical, and engineering sciences'
    },
    'rstb': {
        'color': 'blue', 
        'name': 'RSTB',
        'hoverinfo': 'Biological research and life sciences'
    },
    'rstl': {
        'color': 'green',
        'name': 'RSTL',
        'hoverinfo': 'Scientific letters and short communications'
    }
}

# Create the interactive plot with enhanced styling
fig = go.Figure()

# Add one markers+lines trace per publication type
for pub_type, details in publication_types.items():
    windows = list(historical_word_counts[pub_type].keys())
    counts = list(historical_word_counts[pub_type].values())

    # Only plot non-zero word counts
    non_zero_windows = [window for window, count in zip(windows, counts) if count > 0]
    non_zero_counts = [count for count in counts if count > 0]

    if non_zero_windows:  # Check if there are any non-zero word counts
        # Determine the year range shown in the legend label
        first_year = non_zero_windows[0]
        last_year = non_zero_windows[-1]
        # RSTA and RSTB: the label should not start before 1887, even though
        # earlier window starts already overlap that year
        if pub_type in ('rsta', 'rstb'):
            first_year = max(first_year, 1887)

        fig.add_trace(go.Scatter(
            x=non_zero_windows,
            y=non_zero_counts,
            mode='markers+lines',
            name=f'<b>{details["name"]} ({first_year}-{last_year})</b>',
            marker=dict(
                size=10,
                color=details['color'],
                line=dict(width=2, color='black')
            ),
            text=[f'Window Start: {window}<br>Total Words: {count}' for window, count in zip(non_zero_windows, non_zero_counts)],
            hoverinfo='text'
        ))

# Update layout with enhanced design
# (the bold tags in the titles are now properly closed with </b>)
fig.update_layout(
    title='<b>Total Words per 10-Year Window by Royal Society Publication Type (1665-1958)</b>',
    xaxis_title='<b>Window Start Year</b>',
    yaxis_title='<b>Total Words</b>',
    template='plotly_white',
    width=1200,
    height=600,
    font=dict(family='Arial, sans-serif'),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=1.02,
        xanchor='center',
        x=0.5,
        font=dict(size=14, weight='bold')
    ),
    plot_bgcolor='rgba(240,240,240,0.5)',  # Light gray background
    margin=dict(t=100, b=50, l=50, r=50)
)

# Customize x-axis: one tick every 10 years, starting at 1665
fig.update_xaxes(
    tickmode='linear',
    tick0=1665,
    dtick=10,
    tickfont=dict(size=14)
)

# Customize y-axis with comma-separated thousands
fig.update_yaxes(
    tickformat=',',
    title_font=dict(size=14),  # Increase the font size here
    title_text='<b>Total Words</b>'
)

# Show the plot
fig.show()
fig.write_html("../extra_plots/royal_society_word_counts_10_year_windows_split.html")


Counting words in year windows...


100%|██████████| 18194/18194 [06:09<00:00, 49.28it/s]  


In [5]:
import pandas as pd
import plotly.graph_objects as go
from collections import defaultdict
import os
import re
from tqdm import tqdm
import string
from nltk.tokenize import word_tokenize

def count_words_in_file(file_path):
    """Count the alphabetic word tokens in a UTF-8 text file (lower-cased, tokenized)."""
    with open(file_path, 'r', encoding='utf-8') as f:
        lowered = f.read().lower()
    return sum(
        1
        for token in word_tokenize(lowered)
        if token.isalpha() and token not in string.punctuation
    )

def count_words_in_10_year_windows(text_dirs, start_year, end_year):
    """Count words per 10-year window, separately for each publication type.

    The publication type of a file is the last four characters of its
    directory name (``rsta``, ``rstb`` or ``rstl``); a directory with any
    other suffix raises KeyError. Within a window, RSTL words are only
    counted for years up to 1886 and RSTA/RSTB words only for years from
    1887 onwards.

    Returns:
        dict: ``{publication type: {window start year: total words}}``
    """
    def extract_year_from_filename(filename):
        """Extract year from filename"""
        match = re.search(r'rst[bla]?_?(\d{4})', filename)
        return int(match.group(1)) if match else None

    pub_types = ('rsta', 'rstb', 'rstl')

    # Per-year word counts for each publication type
    word_counts = {pub_type: defaultdict(int) for pub_type in pub_types}

    file_paths = [
        os.path.join(directory, name)
        for directory in text_dirs
        for name in os.listdir(directory)
        if name.endswith('.txt')
    ]

    for file_path in tqdm(file_paths):
        year = extract_year_from_filename(os.path.basename(file_path))
        pub_type = os.path.basename(os.path.dirname(file_path))[-4:]

        if year and start_year <= year <= end_year:
            word_counts[pub_type][year] += count_words_in_file(file_path)

    # Aggregate the yearly counts into 10-year windows
    words_per_window = {pub_type: defaultdict(int) for pub_type in pub_types}

    for pub_type in pub_types:
        yearly = word_counts[pub_type]
        for window_start in range(start_year, end_year - 9):
            years = range(window_start, window_start + 10)
            if pub_type == 'rstl':
                # RSTL only before 1887
                total_words = sum(yearly.get(y, 0) for y in years if y <= 1886)
            else:
                # RSTA and RSTB start in 1887
                total_words = sum(yearly.get(y, 0) for y in years if y >= 1887)
            words_per_window[pub_type][window_start] = total_words

    return words_per_window

# Directories for Royal Society publications
# (the last four characters of each directory name are used as the publication type)
text_dirs = [
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl", 
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta"
]

# Count words for the historical period
# Result: {publication type: {window start year: total words}}
historical_word_counts = count_words_in_10_year_windows(
    text_dirs,
    start_year=1665,
    end_year=1958
)

# Prepare data for plotting
# 'order' decides the stacking order of the bars (lowest value is added first)
publication_types = {
    'rsta': {'color': 'red', 'name': 'RSTA', 'order': 3},
    'rstb': {'color': 'blue', 'name': 'RSTB', 'order': 2},
    'rstl': {'color': 'green', 'name': 'RSTL', 'order': 1}
}

# Create the interactive stacked bar plot
fig = go.Figure()

# Generate list of all unique 10-year window start years
all_windows = sorted(set().union(
    historical_word_counts['rsta'].keys(),
    historical_word_counts['rstb'].keys(),
    historical_word_counts['rstl'].keys()
))

# Sort publication types by order for proper stacking
sorted_pub_types = sorted(publication_types.items(), key=lambda x: x[1]['order'])

# Add stacked bars for each publication type
for pub_type, details in sorted_pub_types:
    # Filter out zero-value windows to reduce noise
    windows = [year for year in all_windows if historical_word_counts[pub_type].get(year, 0) > 0]
    counts = [historical_word_counts[pub_type].get(year, 0) for year in windows]

    if windows:  # Ensure there are non-zero counts
        first_year = windows[0]
        last_year = windows[-1]

        # RSTA and RSTB: the legend label should not start before 1887
        if pub_type in ['rsta', 'rstb']:
            first_year = max(first_year, 1887)

        fig.add_trace(go.Bar(
            x=windows,
            y=counts,
            name=f'{details["name"]} ({first_year}-{last_year})',
            marker_color=details['color'],
            hovertemplate='%{y:,} words<extra></extra>'
        ))

# Update layout to create stacked bars
# (the bold tags in the titles are now properly closed with </b>; the size is
# the 1200x600 that previously overrode an initial 1400x700)
fig.update_layout(
    title='<b>Total Words per 10-Year Window by Royal Society Publication Type (1665-1958)</b>',
    xaxis_title='<b>Window Start Year</b>',
    yaxis_title='<b>Total Words</b>',
    barmode='stack',  # Properly stack the bars
    template='plotly_white',
    width=1200,
    height=600,
    font=dict(family='Arial, sans-serif'),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=1.02,
        xanchor='center',
        x=0.5,
        font=dict(size=14, weight='bold')
    ),
    plot_bgcolor='rgba(240,240,240,0.5)',
    margin=dict(t=100, b=50, l=50, r=50)
)

# Customize y-axis: SI-prefixed tick labels at fixed positions of 1M to 5M words
fig.update_yaxes(
    tickformat="~s",
    tickfont=dict(size=12),
    title_font=dict(size=14),
    tickvals=[1_000_000, 2_000_000, 3_000_000, 4_000_000, 5_000_000]  # Set specific tick values
)

# Show the plot
fig.show()
fig.write_html("C:/Users/Igiba/Documents/fact_fiction_project/extra_plots/royal_society_word_counts_cumulative_bar_plot.html")

100%|██████████| 18194/18194 [06:01<00:00, 50.32it/s]  


In [6]:
import os
import re
import plotly.graph_objects as go
import string
from nltk.tokenize import word_tokenize
from tqdm import tqdm

def extract_year_from_filename(filename):
    """Return the four-digit year that follows the ``rst``/``rstb``/``rstl``/``rsta``
    prefix (with or without an underscore) in the filename, or None."""
    found = re.search(r'rst[bla]?_?(\d{4})', filename)
    if not found:
        return None
    return int(found.group(1))

def count_words_in_file(file_path):
    """Count the alphabetic word tokens in a single UTF-8 text file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        lowered = f.read().lower()
    return sum(
        1
        for token in word_tokenize(lowered)
        if token.isalpha() and token not in string.punctuation
    )

def count_words_in_directory(directory, start_year, end_year):
    """Sum the word counts of the ``.txt`` files in *directory* for a year range.

    A file is counted only when a year can be extracted from its name and
    that year lies in ``[start_year, end_year]`` (both ends inclusive).
    Files that fail to read are reported on stdout and left out of both totals.

    Returns:
        A ``(total_words, file_count)`` tuple covering the counted files.
    """
    txt_names = [name for name in os.listdir(directory) if name.endswith('.txt')]

    word_sum = 0
    processed = 0

    # The progress bar advances over every .txt file, including skipped ones.
    for name in tqdm(txt_names, desc=f"Processing {os.path.basename(directory)}"):
        file_path = os.path.join(directory, name)

        file_year = extract_year_from_filename(name)
        in_range = file_year is not None and start_year <= file_year <= end_year
        if not in_range:
            continue

        try:
            word_sum += count_words_in_file(file_path)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
        else:
            # Only files that were read successfully are counted.
            processed += 1

    return word_sum, processed

# Corpus directories, one per Royal Society publication series
text_dirs = [
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta"
]

# Display name and colour per publication type, keyed by the directory's
# base name (looked up via os.path.basename in the loop below)
publication_types = {
    'txt_rsta': {'color': 'red', 'name': 'RSTA'},
    'txt_rstb': {'color': 'blue', 'name': 'RSTB'},
    'txt_rstl': {'color': 'green', 'name': 'RSTL'}
}

# Inclusive year range; files whose year falls outside it are not counted
start_year = 1665
end_year = 1958

# Parallel lists, one entry per directory in text_dirs order:
# word totals, counted-file totals, pie-slice labels and slice colours
counts = []
file_counts = []
labels = []
colors = []

print("\nProcessing files:")
print("-" * 50)

# Count each directory, record its label/colour and print a per-directory summary
for directory in text_dirs:
    word_count, file_count = count_words_in_directory(directory, start_year, end_year)
    counts.append(word_count)
    file_counts.append(file_count)
    dir_name = os.path.basename(directory)
    pub_info = publication_types[dir_name]
    labels.append(f"{pub_info['name']} ({word_count:,} words)")
    colors.append(pub_info['color'])
    print(f"{dir_name}:")
    print(f"  - Files processed: {file_count}")
    print(f"  - Word count: {word_count:,}")

# Grand totals across all directories (shown in the chart title and final summary)
total_words = sum(counts)
total_files = sum(file_counts)

# Create interactive pie chart with Plotly: one slice per publication type,
# sized by word count, with a custom hover box (<extra></extra> hides the trace name)
fig = go.Figure(data=[go.Pie(
    labels=labels,
    values=counts,
    marker=dict(colors=colors),
    textinfo='label+percent',
    hovertemplate="<b>%{label}</b><br>" +
                  "Word count: %{value:,}<br>" +
                  "Percentage: %{percent}<br>" +
                  "<extra></extra>"
)])

# Update layout: centred two-line title, fixed figure size, white template
fig.update_layout(
    title={
        'text': f'<b>Word Distribution Across Royal Society Publication Types (1665-1958)<br>Total Words: {total_words:,} (from {total_files:,} files)</b>',
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    },
    showlegend=True,
    width=1200,
    height=600,
    template='plotly_white',
    font=dict(family='Arial, sans-serif'),
    margin=dict(t=100, b=50, l=50, r=50)
)

# Save as interactive HTML, then display the figure
output_path = r"C:/Users/Igiba/Documents/fact_fiction_project/extra_plots/royal_society_word_distribution_pie_1665_1958.html"
fig.write_html(output_path)
fig.show()

# Recap of the per-directory figures followed by the grand totals
print("\nFinal Summary:")
print("-" * 50)
for label, count, file_count in zip(labels, counts, file_counts):
    print(f"{label}")
    print(f"  - Files: {file_count}")
    print(f"  - Words: {count:,}")
print("-" * 50)
print(f"Total files processed: {total_files:,}")
print(f"Total words across all directories: {total_words:,}")
print(f"\nInteractive pie chart has been saved to: {output_path}")



Processing files:
--------------------------------------------------


Processing txt_rstb: 100%|██████████| 5284/5284 [01:16<00:00, 68.73it/s]


txt_rstb:
  - Files processed: 628
  - Word count: 11,985,649


Processing txt_rstl: 100%|██████████| 8520/8520 [03:12<00:00, 44.15it/s] 


txt_rstl:
  - Files processed: 8520
  - Word count: 29,445,246


Processing txt_rsta: 100%|██████████| 4390/4390 [01:42<00:00, 42.87it/s]   

txt_rsta:
  - Files processed: 864
  - Word count: 10,989,711






Final Summary:
--------------------------------------------------
RSTB (11,985,649 words)
  - Files: 628
  - Words: 11,985,649
RSTL (29,445,246 words)
  - Files: 8520
  - Words: 29,445,246
RSTA (10,989,711 words)
  - Files: 864
  - Words: 10,989,711
--------------------------------------------------
Total files processed: 10,012
Total words across all directories: 52,420,606

Interactive pie chart has been saved to: C:/Users/Igiba/Documents/fact_fiction_project/extra_plots/royal_society_word_distribution_pie_1665_1958.html


In [None]:
import pandas as pd
import plotly.graph_objects as go
from collections import defaultdict
import os
import re
from tqdm import tqdm
import string
from nltk.tokenize import word_tokenize

def count_words_in_file(file_path):
    """Count the alphabetic word tokens in the UTF-8 text file at *file_path*.

    The contents are lower-cased, split with NLTK's ``word_tokenize`` and
    every token that is fully alphabetic (and not punctuation) is counted.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        contents = source.read()

    alphabetic = 0
    for piece in word_tokenize(contents.lower()):
        if piece.isalpha() and piece not in string.punctuation:
            alphabetic += 1
    return alphabetic

def count_words_per_year(text_dirs, start_year, end_year):
    """Aggregate word counts per year for each publication type.

    Every ``.txt`` file in *text_dirs* is considered. The publication type is
    the last four characters of the file's parent directory name, and the
    year is parsed from the file name. A file is counted when its year lies
    in ``[start_year, end_year]`` and either the type is ``'rstl'`` or the
    type is ``'rsta'``/``'rstb'`` and the year is 1887 or later.

    Returns:
        A dict with keys ``'rsta'``, ``'rstb'`` and ``'rstl'``, each mapping
        to a ``defaultdict(int)`` of ``year -> total words``.
    """
    def extract_year_from_filename(filename):
        found = re.search(r'rst[bla]?_?(\d{4})', filename)
        return int(found.group(1)) if found else None

    word_counts = {series: defaultdict(int) for series in ('rsta', 'rstb', 'rstl')}

    # Collect every .txt path up front so the progress bar knows the total.
    file_paths = [
        os.path.join(folder, name)
        for folder in text_dirs
        for name in os.listdir(folder)
        if name.endswith('.txt')
    ]

    for file_path in tqdm(file_paths):
        year = extract_year_from_filename(os.path.basename(file_path))
        if not year or not (start_year <= year <= end_year):
            continue

        pub_type = os.path.basename(os.path.dirname(file_path))[-4:]

        # RSTL is counted for every year; RSTA and RSTB only from 1887 onwards.
        if pub_type == 'rstl' or (pub_type in ('rsta', 'rstb') and year >= 1887):
            word_counts[pub_type][year] += count_words_in_file(file_path)

    return word_counts

# Directories for Royal Society publications. count_words_per_year derives the
# publication type from the last four characters of each directory name.
text_dirs = [
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta"
]

# Count words per year for the historical period
historical_word_counts = count_words_per_year(
    text_dirs,
    start_year=1665,
    end_year=1958
)

# Prepare data for plotting: colour, legend name and stacking order per type
# (traces are added in ascending 'order', so RSTL forms the bottom of each stack)
publication_types = {
    'rsta': {'color': 'red', 'name': 'RSTA', 'order': 3},
    'rstb': {'color': 'blue', 'name': 'RSTB', 'order': 2},
    'rstl': {'color': 'green', 'name': 'RSTL', 'order': 1}
}

# Create the interactive stacked bar plot
fig = go.Figure()

# Generate list of all unique years across the three publication types
all_years = sorted(set().union(
    historical_word_counts['rsta'].keys(),
    historical_word_counts['rstb'].keys(),
    historical_word_counts['rstl'].keys()
))

# Sort publication types by order for proper stacking
sorted_pub_types = sorted(publication_types.items(), key=lambda x: x[1]['order'])

# Add stacked bars for each publication type
for pub_type, details in sorted_pub_types:
    # Filter out zero-value years to reduce noise
    years = [year for year in all_years if historical_word_counts[pub_type].get(year, 0) > 0]
    counts = [historical_word_counts[pub_type].get(year, 0) for year in years]

    if years:  # Ensure there are non-zero counts
        # First/last year with data, shown in the legend entry
        first_year = years[0]
        last_year = years[-1]

        # Adjust for RSTA and RSTB: ensure the legend range starts from 1887
        if pub_type in ['rsta', 'rstb']:
            first_year = max(first_year, 1887)

        fig.add_trace(go.Bar(
            x=years,
            y=counts,
            name=f'{details["name"]} ({first_year}-{last_year})',
            marker_color=details['color'],
            hovertemplate='%{y:,} words<extra></extra>'
        ))

# Configure the stacked-bar layout. Plotly text fields accept a small HTML
# subset, so bold spans must be closed with </b> (the original used <b> twice).
fig.update_layout(
    title='<b>Total Words per Year by Royal Society Publication Type (1665-1958)</b>',
    xaxis_title='<b>Year</b>',
    yaxis_title='<b>Total Words</b>',
    barmode='stack',  # stack the publication-type traces on top of each other
    template='plotly_white',
    # Final figure size. Previously 1400x700 was set here and then overridden
    # by a second update_layout call; only the effective values are kept.
    width=1200,
    height=600,
    font=dict(family='Arial, sans-serif'),
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=1.02,
        xanchor='center',
        x=0.5,
        font=dict(size=14, weight='bold')
    ),
    plot_bgcolor='rgba(240,240,240,0.5)',
    margin=dict(t=100, b=50, l=50, r=50)
)

# Customize y-axis: "~s" is d3's SI-prefix format with trailing zeros trimmed
# (e.g. 1,000,000 -> "1M"), with ticks pinned to fixed positions.
fig.update_yaxes(
    tickformat="~s",
    tickfont=dict(size=12),
    title_font=dict(size=14),
    tickvals=[500_000, 1_000_000, 1_500_000, 2_000_000]  # Set specific tick values
)

# Show the plot and save HTML
fig.show()
fig.write_html("C:/Users/Igiba/Documents/fact_fiction_project/extra_plots/royal_society_word_counts_annual_bar_plot.html")

In [4]:
import pandas as pd
import plotly.graph_objects as go
import os

def find_highest_numbers_in_directory(directory_path):
    """Return the largest first-column integer of each ``.txt`` file in a directory.

    Each file is read as UTF-8, tab-separated text. The first line is treated
    as a header and skipped; on every remaining line the first field is parsed
    as an integer and lines whose first field is not an integer are ignored.

    Args:
        directory_path: Directory whose ``.txt`` files are scanned
            (non-recursive).

    Returns:
        A dict mapping file name to the highest integer found, or ``0`` when
        the file has no parsable number (the running maximum starts at 0).
        Keys are inserted in sorted file-name order. Files that cannot be
        opened or decoded are reported on stdout and omitted from the result.
    """
    highest_numbers = {}

    # os.listdir() returns entries in arbitrary order; sort so the result
    # (and any plot built from it) is deterministic.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(directory_path, filename)
        highest_number = 0  # running maximum for the current file

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                next(f, None)  # skip the header line (no-op for an empty file)
                for line in f:
                    first_field = line.split('\t', 1)[0]
                    try:
                        number = int(first_field)
                    except ValueError:
                        continue  # ignore non-integer values
                    highest_number = max(highest_number, number)
        except (OSError, UnicodeDecodeError) as exc:
            # Best effort: keep going, but say which file was skipped and why
            # instead of silently dropping it.
            print(f"Skipping {file_path}: {exc}")
            continue

        highest_numbers[filename] = highest_number

    return highest_numbers

# Scan the collocate result files (path is relative to the working directory)
# and get the highest first-column number per file
collocate_directory_path = r"collocate_results_combined_fact_10_window"
highest_numbers = find_highest_numbers_in_directory(collocate_directory_path)

# Prepare the data for plotting: one row per file
df = pd.DataFrame(list(highest_numbers.items()), columns=['File', 'Highest Number'])

# Derive a 'Year Range' column by pulling a "YYYY-YYYY" span out of the file name
df['Year Range'] = df['File'].str.extract(r'(\d{4}-\d{4})')[0]  # Extract year range using regex
df['Year Range'] = df['Year Range'].fillna(df['File'])  # In case of no match, keep the original filename

# Create the interactive plot
fig = go.Figure()

# Add one bar per file, labelled with that file's highest number
fig.add_trace(go.Bar(
    x=df['Year Range'],  # Use the derived Year Range for x-axis
    y=df['Highest Number'],
    marker=dict(color='blue'),
    text=df['Highest Number'],  # Show counts on bars
    textposition='auto',  # Let Plotly choose the text position on each bar
    hovertemplate='Year Range: %{x}<br>%{y}<extra></extra>'  # Customize hover text
))

# Update layout: titles, rotated x labels, white template
fig.update_layout(
    title='Amount of Collocates per Year Window',
    xaxis_title='Year Range',
    yaxis_title='Highest Number',
    xaxis_tickangle=-45,  # Rotate x-axis labels for better visibility
    template='plotly_white'
)

# Show the plot
fig.show()
