C5-NC5 indicates that the MI-score with the cut-off point three was used as the association measure; the collocates were identified in the span of three words to the left (L3) and three words to the right (R3) of the node and the frequency threshold was five for both the collocate (C5) and the collocation (NC5). (frequency threshold exists)

Generally, the smaller the span, the greater the focus of the analysis on the most immediate lexico-grammatical patterns; a larger span captures looser associations.

TO DO
- make the freq threshold vary based on the amount of data for each year gap!!! (done, but might need adjusting)
- rerun with 3l3r and 5l5r for comparison (done)
- avoid lemmatization but tokenize (fact/facts) (done)
- include, as a side note, whether each text is from rstl, rstb or rsta (done)
- different freq thresholds for fact/facts since they are not as common --> calculating the minimum subcorpora amount for the frequency (done)

minimum subcorpora:
for every 412k words, the frequency threshold should be 3

Period          Total Words  Fact     Facts   
1715-1724       412012       3        3


However, should the scaling be different, since there are fewer occurrences of "facts" overall? Both words do align on the smallest subcorpus, though.

In [1]:
import nltk
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.tokenize import word_tokenize
import string
import csv
import os
import re
from collections import defaultdict
from tqdm.notebook import tqdm
import traceback
import concurrent.futures

# word_tokenize (used below) needs NLTK's Punkt tokenizer data, so fetch it once up front.
# NOTE(review): recent NLTK releases may look for the 'punkt_tab' resource instead — confirm against the installed version.
nltk.download('punkt')

def extract_year_from_filename(filename):
    """Return the four-digit year embedded in a corpus filename, or None.

    The year is taken from the leftmost place where ``rst`` (optionally
    followed by ``b``, ``l`` or ``a``) and an underscore are directly followed
    by four digits.
    """
    found = re.search(r'rst[bla]?_(\d{4})', filename)
    if found is None:
        return None
    return int(found.group(1))

def clean_text(text):
    """Return *text* with a few known OCR/encoding artefacts replaced.

    The substitutions are optional; the table can be extended or dropped.
    """
    # Applied in this order: the long s, its mis-decoded two-character form,
    # then two recurring OCR misspellings.
    substitutions = (
        ('ſ', 's'),
        ('Å¿', 's'),
        ('obseryed', 'observed'),
        ('thab', 'that'),
    )
    for wrong, right in substitutions:
        text = text.replace(wrong, right)
    return text

def build_file_directory_map(text_dirs):
    """Map each file's cleaned text to ``(directory name, filename)``.

    Every ``.txt`` file in each directory of *text_dirs* is read as UTF-8 and
    passed through ``clean_text``; the cleaned text itself is the dictionary
    key, so that a text can later be traced back to its source. Files that
    cannot be read are reported and skipped. Files whose cleaned text is
    identical share one key, and the one read last wins.
    """
    sources_by_text = {}
    for directory in text_dirs:
        directory_label = os.path.basename(directory)
        for entry in os.listdir(directory):
            if not entry.endswith('.txt'):
                continue
            full_path = os.path.join(directory, entry)
            try:
                with open(full_path, 'r', encoding='utf-8') as handle:
                    cleaned = clean_text(handle.read())
                    sources_by_text[cleaned] = (directory_label, entry)
            except Exception as e:
                print(f"Error reading file {full_path}: {str(e)}")
    return sources_by_text

def process_file(file_path):
    """Read one corpus file and return ``(year, cleaned_text)``.

    Returns None when the filename yields no year, or when anything goes wrong
    while reading (the error is printed rather than raised).
    """
    try:
        year = extract_year_from_filename(os.path.basename(file_path))
        if year:
            with open(file_path, 'r', encoding='utf-8') as handle:
                raw = handle.read()
            return year, clean_text(raw)
        return None
    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")
        return None

def process_tokens(text):
    """Lower-case and tokenize *text*, keeping only purely alphabetic tokens."""
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha():
            continue
        if token in string.punctuation:
            continue
        kept.append(token)
    return kept

def calculate_scaled_freq_filter(total_words, word_of_interest):
    """Return the minimum collocation frequency for a window of *total_words*.

    The threshold starts at a per-word base value for windows of up to
    412,012 words (the size of the smallest subcorpus) and grows linearly by
    ``base / 412012`` for every word beyond that, rounded to an integer.

    Args:
        total_words: number of tokens in the window.
        word_of_interest: the node word; matched case-insensitively, in line
            with how ``process_window`` compares it.

    Returns:
        The integer frequency threshold.

    Raises:
        ValueError: if no base threshold is defined for *word_of_interest*.
            (Previously the function fell through and returned None, which
            only failed later inside the frequency filter.)
    """
    reference_size = 412012  # tokens in the smallest subcorpus
    # "facts" is much rarer than "fact" (sometimes absent), so its base is lower;
    # this is possibly too forgiving.
    base_thresholds = {"fact": 3, "facts": 1}

    key = word_of_interest.lower()
    if key not in base_thresholds:
        raise ValueError(
            f"No frequency threshold defined for {word_of_interest!r}; "
            f"expected one of {sorted(base_thresholds)}"
        )

    base_threshold = base_thresholds[key]
    if total_words <= reference_size:
        return base_threshold

    scale_factor = base_threshold / reference_size
    additional_threshold = (total_words - reference_size) * scale_factor
    return base_threshold + round(additional_threshold)  # the filter needs an integer


def process_window(window_data, word_of_interest, collocate_window, file_directory_map):
    """Find collocates of *word_of_interest* within one window of years.

    Args:
        window_data: ``(window_start, window_end, window_texts)`` where
            ``window_texts`` is the list of cleaned texts in the window.
        word_of_interest: the node word; compared case-insensitively.
        collocate_window: how many words on each side of the node count as
            its span (3 means three to the left and three to the right).
        file_directory_map: mapping from cleaned text to
            ``(directory name, filename)``; texts missing from it are
            reported as ``("Unknown", "Unknown")``.

    Returns:
        ``("start-end", result)`` where ``result`` has the keys
        ``'collocates'`` (stat dicts sorted by PMI, highest first),
        ``'total_tokens'``, ``'total_texts'`` and ``'contexts'`` (tuples of
        ``(context, collocate, filename, directory)``). Returns None if any
        step fails; the error and traceback are printed.
    """
    # Defined before the try so the error message works even if unpacking fails.
    window_label = "unknown"
    try:
        window_start, window_end, window_texts = window_data
        window_label = f"{window_start}-{window_end}"

        # Tokens of all texts are concatenated into one sequence, so a pair can
        # straddle the boundary between two texts.
        all_tokens = [token for text in window_texts for token in process_tokens(text)]

        total_tokens = len(all_tokens)
        total_texts = len(window_texts)

        freq_filter = calculate_scaled_freq_filter(total_tokens, word_of_interest)
        print(f"Window {window_label}: Using frequency filter of {freq_filter} for {total_tokens} total words")

        # NLTK's window_size includes the first word of the pair: window_size=2
        # means adjacent words only. A span of N words on each side therefore
        # needs window_size=N+1, which also matches the +/-N context slice below.
        finder = BigramCollocationFinder.from_words(all_tokens, window_size=collocate_window + 1)
        finder.apply_freq_filter(freq_filter)

        collocations = finder.score_ngrams(BigramAssocMeasures().pmi)

        collocate_stats = []
        collocate_contexts = defaultdict(list)

        word_of_interest_lower = word_of_interest.lower()
        word_of_interest_freq = sum(finder.word_fd[word] for word in finder.word_fd if word.lower() == word_of_interest_lower)

        text_sources = {text: file_directory_map.get(text, ("Unknown", "Unknown")) for text in window_texts}

        # Lower-cased whitespace-split vocabulary of each text, built once
        # instead of once per collocate. Words with punctuation attached stay
        # as they are, exactly as in the per-collocate check this replaces.
        text_vocabularies = [set(text.lower().split()) for text in window_texts]

        # Words whose contexts were already gathered; a word can be scored
        # twice, once for each side of the node.
        words_with_contexts = set()

        for bigram, pmi in collocations:
            if word_of_interest_lower not in (word.lower() for word in bigram):
                continue

            other_word = bigram[0] if bigram[1].lower() == word_of_interest_lower else bigram[1]
            other_word_lower = other_word.lower()
            observed_freq = finder.ngram_fd[bigram]
            word_freq = finder.word_fd[other_word]
            expected_freq = (word_of_interest_freq * word_freq) / total_tokens
            num_texts = sum(1 for vocabulary in text_vocabularies if other_word_lower in vocabulary)

            # Keep only collocates that occur in more than one text.
            if num_texts <= 1:
                continue

            collocate_stats.append({
                'word': other_word,
                'total_corpus': word_freq,
                'expected_freq': expected_freq,
                'observed_freq': observed_freq,
                'num_texts': num_texts,
                'pmi': pmi
            })

            if other_word in words_with_contexts:
                continue  # contexts depend only on the word, not on the side
            words_with_contexts.add(other_word)

            for text, vocabulary in zip(window_texts, text_vocabularies):
                # Skip texts that cannot contain both words.
                if word_of_interest_lower not in vocabulary or other_word_lower not in vocabulary:
                    continue
                words = text.split()
                for i, word in enumerate(words):
                    if word.lower() != word_of_interest_lower:
                        continue
                    span = words[max(0, i - collocate_window):i + collocate_window + 1]
                    if other_word_lower in (w.lower() for w in span):
                        dir_name, filename = text_sources[text]
                        context_info = ' '.join(span)  # this is what the contexts CSV shows
                        collocate_contexts[other_word].append((context_info, other_word, filename, dir_name))

        collocate_stats.sort(key=lambda x: x['pmi'], reverse=True)

        # Flatten contexts in PMI order, emitting each word's contexts once.
        filtered_contexts = []
        emitted_words = set()
        for collocate in collocate_stats:
            collocate_word = collocate['word']
            if collocate_word in emitted_words:
                continue
            emitted_words.add(collocate_word)
            filtered_contexts.extend(collocate_contexts.get(collocate_word, []))

        return window_label, {
            'collocates': collocate_stats,
            'total_tokens': total_tokens,
            'total_texts': total_texts,
            'contexts': filtered_contexts
        }
    except Exception as e:
        print(f"Error processing window {window_label}: {str(e)}")
        traceback.print_exc()
        return None

def analyze_collocates(text_dirs, output_dir, word_of_interest, start_year, end_year, collocate_window, window_size=10):
    """Run the sliding-window collocate analysis and write the result files.

    Texts from *text_dirs* dated between *start_year* and *end_year*
    (inclusive) are grouped by year. Every run of *window_size* consecutive
    years that fits in that range is handed to ``process_window``, and each
    window that yields a result gets a collocates CSV, a contexts CSV and a
    detailed TXT report under *output_dir*. Nothing is written if no text
    falls in the year range.
    """
    def read_texts_by_year(directories):
        """Load all .txt files and group the cleaned in-range texts by year."""
        candidate_paths = []
        for directory in directories:
            for name in os.listdir(directory):
                if name.endswith('.txt'):
                    candidate_paths.append(os.path.join(directory, name))

        print("Reading files...")
        with concurrent.futures.ThreadPoolExecutor() as pool:
            loaded = list(tqdm(pool.map(process_file, candidate_paths), total=len(candidate_paths)))

        grouped = defaultdict(list)
        for item in loaded:
            if not item:
                continue
            year, text = item
            if start_year <= year <= end_year:
                grouped[year].append(text)
        return grouped

    def run_window(data):
        """Bind the per-run arguments so the executor only passes the window."""
        return process_window(data, word_of_interest, collocate_window, file_directory_map)

    print(f"Starting text processing for years {start_year}-{end_year}...")
    texts_by_year = read_texts_by_year(text_dirs)
    file_directory_map = build_file_directory_map(text_dirs)

    if not texts_by_year:
        print(f"No texts found in the specified year range {start_year}-{end_year}")
        return

    # One entry per window: (first year, last year, all texts of those years).
    window_data = []
    for first_year in range(start_year, end_year - window_size + 2):
        last_year = first_year + window_size - 1
        texts_in_window = []
        for year in range(first_year, last_year + 1):
            texts_in_window.extend(texts_by_year.get(year, []))
        window_data.append((first_year, last_year, texts_in_window))

    print("Processing windows...")
    with concurrent.futures.ThreadPoolExecutor() as pool:
        outcomes = list(tqdm(pool.map(run_window, window_data), total=len(window_data)))

    # Failed windows come back as None and are left out.
    collocates_by_window = dict(outcome for outcome in outcomes if outcome)

    os.makedirs(output_dir, exist_ok=True)
    print("Saving results to CSV and TXT files...")

    for window, data in collocates_by_window.items():
        save_collocates_to_csv(output_dir, window, data)
        save_contexts_to_csv(output_dir, window, data)
        save_detailed_report(output_dir, window, data, word_of_interest)

    print(f"Collocate analysis completed. Results saved in the '{output_dir}' directory.")

def save_collocates_to_csv(output_dir, window, data):
    """Write the collocate table of one window to ``collocates_<window>.csv``.

    Rows follow the order of ``data['collocates']`` and are numbered from 1;
    expected frequency and MI value are written with two decimals.
    """
    header = ['No.', 'Word', 'Total no. in window corpus', 'Expected collocate frequency',
              'Observed collocate frequency', 'In no. of texts', 'Mutual Information value']
    csv_filename = os.path.join(output_dir, f'collocates_{window}.csv')

    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows(
            [
                rank,
                entry['word'],
                entry['total_corpus'],
                f"{entry['expected_freq']:.2f}",
                entry['observed_freq'],
                entry['num_texts'],
                f"{entry['pmi']:.2f}",
            ]
            for rank, entry in enumerate(data['collocates'], 1)
        )
    print(f"Saved collocates to {csv_filename}")

def save_contexts_to_csv(output_dir, window, data):
    """Write the context rows of one window to ``contexts/contexts_<window>.csv``.

    The ``contexts`` subfolder is created if needed. Each item of
    ``data['contexts']`` must be a ``(context, word, filename, directory)``
    tuple. The file is meant for later manual viewing.
    """
    contexts_dir = os.path.join(output_dir, 'contexts')
    os.makedirs(contexts_dir, exist_ok=True)
    csv_filename = os.path.join(contexts_dir, f'contexts_{window}.csv')

    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Context', 'Word', 'File', 'Directory'])
        writer.writerows(
            [snippet, collocate, source_file, source_dir]
            for snippet, collocate, source_file, source_dir in data['contexts']
        )

    print(f"Saved contexts to {csv_filename}")

def save_detailed_report(output_dir, window, data, word_of_interest):
    """Write the tab-separated report of one window for UFA.

    The file goes to ``collocation_txt/collocates_txt_<window>.txt``. The
    layout imitates what UFA expects, so the exact text and spacing must be
    kept as they are.
    """
    report_dir = os.path.join(output_dir, 'collocation_txt')
    os.makedirs(report_dir, exist_ok=True)
    txt_filename = os.path.join(report_dir, f'collocates_txt_{window}.txt')

    with open(txt_filename, 'w', encoding='utf-8') as txtfile:
        collocates = data['collocates']
        match_total = sum(entry['observed_freq'] for entry in collocates)

        # Preamble and column headings, in the shape UFA expects.
        pieces = [
            "https://example.com/collocation-analysis\n",
            f"There are {data['total_tokens']} different words in your collocation database for \"{word_of_interest}\". ",
            f"(Your query returned {match_total} matches in {data['total_texts']} different texts)\n",
            "__________________\n\n",
            "No.\tWord\tTotal no. in whole corpus\tExpected collocate frequency\t",
            "Observed collocate frequency\tIn no. of texts\tMutual information value\n\n",
        ]

        # One numbered row per collocate.
        for rank, entry in enumerate(collocates, 1):
            pieces.append(
                f"{rank}\t{entry['word']}\t{entry['total_corpus']}\t"
                f"{entry['expected_freq']:.2f}\t{entry['observed_freq']}\t"
                f"{entry['num_texts']}\t{entry['pmi']:.2f}\n"
            )

        txtfile.writelines(pieces)

    print(f"Saved detailed report to {txt_filename}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Igiba\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import concurrent.futures

# All four runs read the same three Royal Society directories.
CORPUS_DIRS = [
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta",
]

# Each task: (text_dirs, output_dir, word_of_interest, start_year, end_year, collocate_window).
# In the output names, "css" = custom subcorpora scaling and "w3"/"w5" is the collocate window.
# The scaling is more forgiving for "facts" (imperfect).
tasks = [
    (list(CORPUS_DIRS), "fact/collocate_results_1665-1958_FACT_css3_w3", "fact", 1665, 1958, 3),
    (list(CORPUS_DIRS), "facts/collocate_results_1665-1958_FACTS_css1_w3", "facts", 1665, 1958, 3),
    (list(CORPUS_DIRS), "fact/collocate_results_1665-1958_FACT_css3_w5", "fact", 1665, 1958, 5),
    (list(CORPUS_DIRS), "facts/collocate_results_1665-1958_FACTS_css1_w5", "facts", 1665, 1958, 5),
]


def run_analyze_collocates(args):
    """Unpack one task tuple into the arguments of ``analyze_collocates``."""
    analyze_collocates(*args)


# Run the four analyses side by side; a failure in one is printed and does not stop the others.
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(run_analyze_collocates, task) for task in tasks]
    for future in concurrent.futures.as_completed(futures):
        try:
            future.result()
        except Exception as e:
            print(f"Error occurred: {e}")

Starting text processing for years 1665-1958...
Starting text processing for years 1665-1958...
Starting text processing for years 1665-1958...
Starting text processing for years 1665-1958...
Reading files...
Reading files...
Reading files...
Reading files...


  0%|          | 0/18194 [00:00<?, ?it/s]

  0%|          | 0/18194 [00:00<?, ?it/s]

  0%|          | 0/18194 [00:00<?, ?it/s]

  0%|          | 0/18194 [00:00<?, ?it/s]

---
extra stuff
---

In [7]:
#so my pc turns off eventually
import os
import time

# Wait five minutes, then shut the machine down.
time.sleep(300)
# Windows "shutdown" command: /s powers off, /t 1 sets a one-second delay.
os.system("shutdown /s /t 1")

0

: 

In [3]:
import pandas as pd
import plotly.graph_objects as go
from collections import defaultdict
import os
import re
from tqdm import tqdm

def count_files_in_year_windows(text_dirs, window_size=5):
    """Count corpus files per year, summed over a trailing window of years.

    For a year Y that has files of its own, the result holds the number of
    files dated from ``Y - window_size + 1`` through Y. A year without files
    that lies within ``window_size - 1`` years after a year with files gets
    an entry of 0; other years are absent from the result.

    Returns:
        A plain dict mapping year to the windowed file count.
    """
    def extract_year_from_filename(filename):
        found = re.search(r'rst[bla]?_?(\d{4})', filename)
        return None if found is None else int(found.group(1))

    print("Counting files in year windows...")
    file_paths = []
    for directory in text_dirs:
        for name in os.listdir(directory):
            if name.endswith('.txt'):
                file_paths.append(os.path.join(directory, name))

    files_per_year = defaultdict(int)
    for path in tqdm(file_paths):
        year = extract_year_from_filename(os.path.basename(path))
        if year:
            files_per_year[year] += 1

    windowed = defaultdict(int)
    for year in sorted(files_per_year):
        for offset in range(window_size):
            target_year = year + offset
            if target_year in files_per_year:
                windowed[target_year] += files_per_year[year]
            else:
                # Still creates the entry, so the year shows up with a count of 0.
                windowed[target_year] += 0

    return dict(windowed)

# Windowed file counts for the three Royal Society directories.
corpus_dirs = [
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta",
]
year_counts = count_files_in_year_windows(corpus_dirs, window_size=5)

# One row per year from 1665 to 2024; years missing from the counts get 0.
years = list(range(1665, 2025))
counts = [year_counts.get(year, 0) for year in years]
df = pd.DataFrame({'Year': years, 'Article Count': counts})


def _marker_color(count):
    """Red for no articles, orange for fewer than 100, blue otherwise."""
    if count == 0:
        return 'red'
    if count < 100:
        return 'orange'
    return 'blue'


marker_colors = [_marker_color(count) for count in df['Article Count']]
hover_labels = [
    f'Year: {year}<br>Count: {count}'
    for year, count in zip(df['Year'], df['Article Count'])
]

# Interactive scatter-and-line plot; the hover shows only the label text.
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df['Year'],
    y=df['Article Count'],
    mode='markers+lines',
    marker=dict(
        size=10,
        color=marker_colors,
        line=dict(width=1, color='black')
    ),
    text=hover_labels,
    hoverinfo='text'
))

fig.update_layout(
    title='Number of Articles per Year (1665-2024) (without restricteds)',
    xaxis_title='Year',
    yaxis_title='Number of Articles',
    xaxis=dict(
        tickmode='linear',
        tick0=1665,
        dtick=5,
        range=[1665, 2024]  # fix the x-axis to the full period
    ),
    yaxis=dict(range=[0, df['Article Count'].max() + 500]),  # headroom above the tallest point
    template='plotly_white'
)

fig.show()


Counting files in year windows...


100%|██████████| 18194/18194 [00:00<00:00, 184788.91it/s]


In [57]:
import pandas as pd
import plotly.graph_objects as go
from collections import defaultdict
import os
import re
from tqdm import tqdm
import string
from nltk.tokenize import word_tokenize

def count_words_in_file(file_path):
    """Return the number of purely alphabetic tokens in a UTF-8 text file.

    The text is lower-cased and tokenized with ``word_tokenize`` first.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        lowered = handle.read().lower()
    return sum(
        1
        for token in word_tokenize(lowered)
        if token.isalpha() and token not in string.punctuation
    )

def count_words_in_10_year_windows(text_dirs, start_year, end_year):
    """Total the word counts of every full 10-year window in a year range.

    Only files dated between *start_year* and *end_year* (inclusive) are
    counted. Windows slide one year at a time, and only those that fit
    entirely inside the range are produced.

    Returns:
        A ``defaultdict(int)`` mapping ``"<first>-<last>"`` labels to word totals.
    """
    def extract_year_from_filename(filename):
        found = re.search(r'rst[bla]?_?(\d{4})', filename)
        return None if found is None else int(found.group(1))

    print("Counting words in year windows...")
    file_paths = []
    for directory in text_dirs:
        for name in os.listdir(directory):
            if name.endswith('.txt'):
                file_paths.append(os.path.join(directory, name))

    words_per_year = defaultdict(int)
    for path in tqdm(file_paths):
        year = extract_year_from_filename(os.path.basename(path))
        if not year or not (start_year <= year <= end_year):
            continue
        words_per_year[year] += count_words_in_file(path)

    words_per_window = defaultdict(int)
    # The last window must still end on or before end_year.
    for first_year in range(start_year, end_year - 9):
        last_year = first_year + 9
        window_total = sum(
            words_per_year.get(y, 0) for y in range(first_year, last_year + 1)
        )
        words_per_window[f"{first_year}-{last_year}"] = window_total

    return words_per_window

# Both periods are counted over the same three Royal Society directories.
corpus_dirs = [
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstb",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rstl",
    r"D:\Fact_fiction_corpus\texts\royal society\txt_rsta",
]

# Word totals per 10-year window: historical (1665-1958) and modern (2000-2024).
historical_word_counts = count_words_in_10_year_windows(
    corpus_dirs,
    start_year=1665,
    end_year=1958
)
modern_word_counts = count_words_in_10_year_windows(
    corpus_dirs,
    start_year=2000,
    end_year=2024
)


def _window_bar_figure(windows, counts, trace_name, bar_color, title):
    """Build a bar chart of word totals per window, with the totals on the bars."""
    figure = go.Figure()
    figure.add_trace(go.Bar(
        x=windows,
        y=counts,
        name=trace_name,
        marker=dict(color=bar_color),
        text=counts,
        textposition='auto',
        hovertemplate='Window: %{x}<br>Words: %{y}<extra></extra>'
    ))
    figure.update_layout(
        title=title,
        xaxis_title='10-Year Window',
        yaxis_title='Total Words',
        template='plotly_white'
    )
    return figure


# Historical period plot.
historical_windows = list(historical_word_counts.keys())
historical_counts = list(historical_word_counts.values())
fig_historical = _window_bar_figure(
    historical_windows,
    historical_counts,
    'Historical Period (1665-1958)',
    'blue',
    'Total Words per 10-Year Window (1665-1958)',
)
fig_historical.show()

# Modern period plot.
modern_windows = list(modern_word_counts.keys())
modern_counts = list(modern_word_counts.values())
fig_modern = _window_bar_figure(
    modern_windows,
    modern_counts,
    'Modern Period (2000-2024)',
    'orange',
    'Total Words per 10-Year Window (2000-2024)',
)
fig_modern.show()


Counting words in year windows...


100%|██████████| 18194/18194 [06:46<00:00, 44.72it/s]  


Counting words in year windows...


100%|██████████| 18194/18194 [09:36<00:00, 31.54it/s]   


In [4]:
import pandas as pd
import plotly.graph_objects as go
import os

def find_highest_numbers_in_directory(directory_path):
    """Return the largest first-column integer found in each ``.txt`` file.

    The first line of every file is skipped. On the remaining lines the text
    before the first tab is parsed as an integer; lines where that fails are
    ignored. A file with no such integer (or only negative ones) maps to 0.

    Files that cannot be opened or decoded as UTF-8 are reported and left out
    of the result, so one bad file does not abort the scan.

    Args:
        directory_path: directory to scan (not recursive).

    Returns:
        Dict mapping filename to the highest number found in that file.
    """
    highest_numbers = {}

    for filename in os.listdir(directory_path):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(directory_path, filename)
        highest_number = 0
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                next(f, None)  # skip the first line without loading the whole file
                for line in f:
                    first_field = line.split('\t')[0]
                    try:
                        highest_number = max(highest_number, int(first_field))
                    except ValueError:
                        continue  # headings, separators and blank lines
        except (OSError, UnicodeError) as e:
            # Best effort: say which file was skipped instead of hiding it.
            print(f"Skipping {file_path}: {e}")
            continue

        highest_numbers[filename] = highest_number

    return highest_numbers

# Highest row number per report file, i.e. how many collocates each window produced.
collocate_directory_path = r"collocate_results_combined_fact_10_window"
highest_numbers = find_highest_numbers_in_directory(collocate_directory_path)

df = pd.DataFrame(list(highest_numbers.items()), columns=['File', 'Highest Number'])

# Label each bar with the "YYYY-YYYY" part of its filename, or the full filename if there is none.
extracted_range = df['File'].str.extract(r'(\d{4}-\d{4})')[0]
df['Year Range'] = extracted_range.fillna(df['File'])

collocate_bars = go.Bar(
    x=df['Year Range'],
    y=df['Highest Number'],
    marker=dict(color='blue'),
    text=df['Highest Number'],  # value shown on each bar
    textposition='auto',
    hovertemplate='Year Range: %{x}<br>%{y}<extra></extra>'
)

fig = go.Figure()
fig.add_trace(collocate_bars)

fig.update_layout(
    title='Amount of Collocates per Year Window',
    xaxis_title='Year Range',
    yaxis_title='Highest Number',
    xaxis_tickangle=-45,  # tilt the labels so neighbouring ranges stay readable
    template='plotly_white'
)

fig.show()


In [None]:
import os
import pandas as pd

folder_path = "collocate_results_combined_fact_10_window/csv_ver"

csv_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]

# Parse every CSV once and reuse the frames for both the combined view and the keyword search.
frames_by_file = {path: pd.read_csv(path) for path in csv_files}

if frames_by_file:
    combined_data = pd.concat(list(frames_by_file.values()), ignore_index=True)
else:
    # pd.concat raises ValueError when given nothing to concatenate.
    combined_data = pd.DataFrame(columns=['Word'])

# Every distinct collocate across all windows.
unique_words = combined_data['Word'].unique()
print(unique_words)

# Find the window files in which a given collocate appears.
keyword = "lsw"
keyword_docs = []

for f, df in frames_by_file.items():
    if keyword in df['Word'].values:
        keyword_docs.append(f)

if keyword_docs:
    print(f"The keyword '{keyword}' is found in the following document(s):")
    for doc in keyword_docs:
        print(doc)
else:
    print(f"The keyword '{keyword}' is not found in any document.")