Goal: match each context passage to the collocate it was found for, so the contexts can be read later for qualitative analysis.

The code is slightly messy, verbose, and heavily AI-assisted, but it works.

TODO: add the frequency threshold to this step to see how it affects the results.

Also remove duplicate rows that differ only in `Year` (keep the row with the earlier year).

In [3]:
import pandas as pd
import os
import re

def process_collocates(input_csv, context_directory, output_csv=None, debug=False):
    """
    Merge collocate results with their context files and remove duplicate rows.

    Each distinct ``Year`` in the input CSV is looked up among the context
    files named ``contexts_<start>-<end>.csv``; a file is used when its
    ``<start>`` equals that year exactly. The rows for that year are then
    left-joined to the file on ``Collocate``. Rows whose ``Year`` has no
    matching context file are left out of the result.

    Rows that agree on every column other than ``Year`` and ``Context_Year``
    count as duplicates. Of each duplicate group, the row with the earliest
    ``Year`` (then the earliest ``Context_Year``) is kept, regardless of the
    row order of the input file. The result is ordered by ``Context_Year``,
    with ties ordered by ``Year``.

    Parameters:
    -----------
    input_csv : str
        Path to the input CSV file containing collocate results. It must
        provide ``Year`` and ``Collocate`` columns.
    context_directory : str
        Path to the directory containing context CSV files. Each file must
        provide a ``Collocate`` column; its ``Year`` column, if present, is
        renamed to ``Context_Year``.
    output_csv : str, optional
        Path where the output CSV should be saved. If None, no file is saved
    debug : bool, optional
        If True, print debugging information during processing

    Returns:
    --------
    pandas.DataFrame
        Processed DataFrame with merged context data and duplicates removed.
        If no year has a matching context file, an empty DataFrame with the
        input file's columns is returned.
    """
    def print_debug(message, df=None):
        if debug:
            print(message)
            if df is not None:
                print(df.head(), "\n")

    # Step 1: Load the primary DataFrame and drop "Unnamed" columns
    # (these appear when a CSV was written together with its index).
    r1 = pd.read_csv(input_csv)
    r1 = r1.loc[:, ~r1.columns.str.contains('^Unnamed')]
    print_debug("Initial data loaded:", r1)

    # Step 2: Map each context file by its starting year. Names are visited
    # in sorted order so the winner is deterministic if two files share a
    # starting year, and the whole name must match the pattern.
    context_pattern = re.compile(r'contexts_(\d+)-\d+\.csv')
    year_to_file_map = {}
    for file in sorted(os.listdir(context_directory)):
        match = context_pattern.fullmatch(file)
        if match:
            year_to_file_map[int(match.group(1))] = file

    # Step 3: Merge each year's rows with its context file. The pieces are
    # collected in a list and concatenated once, rather than growing a
    # DataFrame inside the loop.
    merged_frames = []
    for year in r1['Year'].unique():
        context_file = year_to_file_map.get(year)
        if context_file is None:
            print_debug(f"No context file found for year {year}. Skipping.")
            continue

        context_temp = pd.read_csv(os.path.join(context_directory, context_file))
        context_temp = context_temp.loc[:, ~context_temp.columns.str.contains('^Unnamed')]

        # Rename the context file's 'Year' so it does not clash with r1's 'Year'.
        context_temp = context_temp.rename(columns={'Year': 'Context_Year'})

        year_data = r1[r1['Year'] == year]
        merged_frames.append(
            pd.merge(year_data, context_temp, on='Collocate', how='left')
        )

    if merged_frames:
        merged_data = pd.concat(merged_frames, ignore_index=True)
    else:
        # Nothing matched: return an empty frame that still has the input's
        # columns instead of failing later on a missing 'Year' column.
        merged_data = pd.DataFrame(columns=r1.columns)

    print_debug("After merging all relevant context files based on year:", merged_data)

    if not merged_data.empty:
        year_columns = ('Year', 'Context_Year')

        # Step 4: Order rows so the earliest Year (then earliest Context_Year)
        # comes first; keep='first' below then really keeps the earliest one.
        sort_keys = [col for col in year_columns if col in merged_data.columns]
        merged_data = merged_data.sort_values(by=sort_keys, kind='stable')

        # Step 5: Drop rows that differ only in 'Year' and/or 'Context_Year'.
        # This single pass also covers "only Year differs" and "only
        # Context_Year differs", since those are special cases of it.
        columns_to_compare = [col for col in merged_data.columns if col not in year_columns]
        merged_data = merged_data.drop_duplicates(subset=columns_to_compare, keep='first')
        print_debug("After removing duplicates (ignoring 'Year' and 'Context_Year'):", merged_data)

        # Step 6: Final ordering by Context_Year; the stable sort keeps the
        # Year ordering from step 4 among equal Context_Year values.
        if 'Context_Year' in merged_data.columns:
            merged_data = merged_data.sort_values(by='Context_Year', kind='stable')

    # Step 7: Reset index
    merged_data = merged_data.reset_index(drop=True)

    # Save to CSV if output path is provided
    if output_csv:
        merged_data.to_csv(output_csv, index=False)
        print_debug(f"Saved to {output_csv}")

    return merged_data



# Build the window-5 context table for the singular form "fact".
result_df2 = process_collocates(
    input_csv='collocate_results/dfs/fact5.csv',
    context_directory="../collocation_results/FACT/collocate_results_1665-1958_FACT_css3_w5/contexts",
    output_csv='collocate_results/contexts/window5/fact5_contexts.csv',
    debug=True,
)

# Build the window-5 context table for the plural form "facts".
result_df4 = process_collocates(
    input_csv='collocate_results/dfs/facts5.csv',
    context_directory="../collocation_results/FACTS/collocate_results_1665-1958_FACTS_css3_w5/contexts",
    output_csv='collocate_results/contexts/window5/facts5_contexts.csv',
    debug=True,
)


Initial data loaded:
   Year Collocate  MI Score
0  1670    matter      7.99
1  1671    matter      7.83
2  1673    matter      7.81
3  1675    matter      8.09
4  1676    matter      7.71 

After merging all relevant context files based on year:
   Year Collocate  MI Score  \
0  1670    matter      7.99   
1  1671    matter      7.83   
2  1673    matter      7.81   
3  1675    matter      8.09   
4  1676    matter      7.71   

                                             Context                File  \
0  irrational, if the matter of fact be true ; fo...  rstl_1675_0037.txt   
1  irrational, if the matter of fact be true ; fo...  rstl_1675_0037.txt   
2  irrational, if the matter of fact be true ; fo...  rstl_1675_0037.txt   
3  irrational, if the matter of fact be true ; fo...  rstl_1675_0037.txt   
4  Philoſophy , trill matter of fact and experime...  rstl_1685_0041.txt   

  Directory  Context_Year  Frequency Filter  
0  txt_rstl          1675                 6  
1  txt_rstl      

In [4]:
def get_unique_collocates(file):
    """
    Print the distinct values of the ``Collocate`` column of a CSV file.

    The values are listed in order of first appearance. Nothing is returned;
    the file path and the list are written to standard output.

    Parameters:
    -----------
    file : str
        Path to a CSV file that has a ``Collocate`` column
    """
    frame = pd.read_csv(file)
    distinct = frame['Collocate'].unique().tolist()
    # One call emits the same two lines (preceded by a blank line).
    print(f"\nFile: {file}", f"Collocates: {distinct}", sep="\n")

# List the distinct collocates of each window-5 context table.
get_unique_collocates(file='collocate_results/contexts/window5/fact5_contexts.csv')
get_unique_collocates(file='collocate_results/contexts/window5/facts5_contexts.csv')


File: collocate_results/contexts/window5/fact5_contexts.csv
Collocates: ['matter', 'of', 'this', 'has', 'ascertained', 'general', 'curious', 'new', 'interesting', 'remarkable', 'important', 'that', 'due', 'attention', 'owing', 'lies', 'view', 'spite', 'do']

File: collocate_results/contexts/window5/facts5_contexts.csv
Collocates: ['there', 'i', 'theſe', 'have', 'these', 'some', 'prove', 'new', 'stated', 'appear', 'many', 'general', 'preceding', 'ascertained', 'number', 'chemical', 'few', 'respecting', 'curious', 'following', 'propriety', 'paper', 'such', 'appears', 'recorded', 'connected', 'observed', 'known', 'now', 'all', 'certainty']


In [5]:
import pandas as pd

def get_unique_collocates(file1, file2):
    """
    Print the collocates that occur in both of two CSV files.

    The ``Collocate`` column of each file is reduced to its distinct values,
    and the values common to both files are printed in sorted order. Nothing
    is returned.

    Parameters:
    -----------
    file1 : str
        Path to the first CSV file (must have a ``Collocate`` column)
    file2 : str
        Path to the second CSV file (must have a ``Collocate`` column)
    """
    # Read file1 first, then file2, turning each column into a set.
    first, second = (
        set(pd.read_csv(path)['Collocate'].unique()) for path in (file1, file2)
    )

    # Set intersection: collocates present in both files.
    shared = first & second

    print(
        "\nCollocates that appear with both 'fact' and 'facts':",
        sorted(shared),
        sep="\n",
    )

# Compare the "fact" and "facts" context tables and print their shared collocates.
get_unique_collocates(
    file1='collocate_results/contexts/window5/fact5_contexts.csv',
    file2='collocate_results/contexts/window5/facts5_contexts.csv',
)


Collocates that appear with both 'fact' and 'facts':
['ascertained', 'curious', 'general', 'new']
