In [None]:
# Mount Google Drive so that the files under /content/drive used by the cells
# below can be read and written.
# force_remount=True is passed to drive.mount; presumably this refreshes a drive
# that is already mounted - confirm against the google.colab documentation.
from google.colab import drive
drive.mount('/content/drive',force_remount = True)

The original DBLP Discovery Dataset (D3), described at https://github.com/jpwahle/lrec22-d3-dataset/blob/main/README.md, was downloaded from Zenodo: https://zenodo.org/records/7071698

In [None]:
import gzip
import shutil

# Location of the compressed download and of the decompressed copy
input_file_path = '/content/drive/My Drive/Experiment/2022-11-30-papers.jsonl.gz'
output_file_path = '/content/drive/My Drive/Experiment/2022-11-30-papers.jsonl'

# Stream the archive to disk; copyfileobj copies in blocks, so the whole file
# is never held in memory at once
with gzip.open(input_file_path, 'rb') as compressed, open(output_file_path, 'wb') as unpacked:
    shutil.copyfileobj(compressed, unpacked)

print(f"File has been unpacked and saved as {output_file_path}")

Titles and abstracts were retrieved and preprocessed

In [None]:
import json
import nltk
from nltk.tokenize import word_tokenize

# Download necessary NLTK resources
nltk.download('punkt')

# Text normalisation shared by titles and abstracts
def preprocess(txt):
    """Return the NLTK word tokens of ``txt`` after lowercasing it.

    Args:
        txt: The string to normalise.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    lowered = txt.lower()
    return word_tokenize(lowered)

# Paths to input and output files
input_file_path = '/content/drive/My Drive/Experiment/2022-11-30-papers.jsonl'
output_file_path = '/content/drive/My Drive/Experiment/corpus/titles_and_abstracts.txt'
chunk_size = 10000  # Number of input lines between two progress reports

# First, calculate the total number of lines for progress tracking
with open(input_file_path, 'r', encoding='utf-8') as f:
    total_lines = sum(1 for line in f)


def _text_field(record, key):
    """Return ``record[key]`` stripped, or '' when it is missing, null or not a string."""
    value = record.get(key) if isinstance(record, dict) else None
    return value.strip() if isinstance(value, str) else ''


# Stream the JSONL file line by line. Iterating over the file object stops
# exactly at end-of-file, so no empty pseudo-lines are parsed (and reported as
# JSON errors) after the last record.
try:
    with open(output_file_path, 'w', encoding='utf-8') as output_file, \
            open(input_file_path, 'r', encoding='utf-8') as input_file:
        for line_number, line in enumerate(input_file, start=1):
            stripped = line.strip()
            if stripped:  # Blank lines carry no record; skip them silently
                try:
                    # Parse the JSON line
                    data = json.loads(stripped)
                except json.JSONDecodeError:
                    # Skip the line if there is a JSON error
                    print(f"Error decoding JSON on line {line_number}: {stripped}")
                else:
                    # A JSON null would make a direct .strip() raise and abort
                    # the whole run, so null/missing fields are treated as ''
                    processed_title = ' '.join(preprocess(_text_field(data, 'title')))
                    processed_abstract = ' '.join(preprocess(_text_field(data, 'abstract')))

                    # Write the processed title and abstract, only if they are not empty
                    if processed_title:
                        output_file.write(processed_title + " ")
                    if processed_abstract:
                        output_file.write(processed_abstract + " ")

            # Display progress percentage every chunk_size lines and at the end
            if line_number % chunk_size == 0 or line_number == total_lines:
                progress = min(line_number / total_lines * 100, 100)
                print(f"Processing: {progress:.2f}% complete")

    print(f"Filtered and processed titles and abstracts have been saved to {output_file_path}")

except OSError as e:
    # Only genuine file-system failures are reported here; any other exception
    # propagates with its traceback instead of being mislabelled as a write error
    print(f"An error occurred while writing to the file: {e}")

Bigrams were retrieved (to form a list of hypernyms)

In [None]:
import re
import math
import pandas as pd
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.util import bigrams
from nltk.corpus import stopwords
from nltk import download

# Download required NLTK resources
download('punkt')
download('stopwords')

# Define constants
MIN_FREQUENCY = 50  # Minimum bigram count for a bigram to be kept by the filter below
MIN_LOG_DICE = 6  # Minimum logDice score for a bigram to be kept by the filter below
# Size passed to f.read() in the reading loop. The corpus is opened in text mode,
# so this is a number of characters rather than bytes ("10MB" is approximate).
CHUNK_SIZE = 10 * 1024 * 1024  # Process in 10MB chunks

# Function to flag words that contain any character outside the allowed set
# (English/Greek letters, digits, dashes, and underscores)
def contains_invalid_characters(word):
    """Return True when ``word`` is not a string or holds a disallowed character.

    Allowed characters are '-', '_', anything for which ``str.isdigit`` is true,
    and alphabetic characters in A-Z, a-z or the range U+0370..U+03FF.
    An empty string has no disallowed character and yields False.
    """
    if not isinstance(word, str):
        return True

    def _allowed(ch):
        if ch in ('-', '_') or ch.isdigit():
            return True
        in_accepted_range = (
            'A' <= ch <= 'Z' or 'a' <= ch <= 'z' or '\u0370' <= ch <= '\u03FF'
        )
        return ch.isalpha() and in_accepted_range

    return any(not _allowed(ch) for ch in word)

# Function to detect words made up solely of digits or of non-alphanumeric symbols
def is_only_numbers_or_symbols(word):
    """Return True if ``word``, ignoring '-' and '_', is all digits or has no alphanumeric character.

    A word that becomes empty once dashes and underscores are removed also
    yields True, because it contains no alphanumeric character.
    """
    stripped = word.replace("-", "").replace("_", "")
    if stripped.isdigit():
        return True
    return not any(ch.isalnum() for ch in stripped)

# Function to detect a leading or trailing dash/underscore
def starts_or_ends_with_dash_or_underscore(word):
    """Return True if the first or the last character of ``word`` is '-' or '_'."""
    edge_characters = ('-', '_')
    return word.startswith(edge_characters) or word.endswith(edge_characters)

# Set file paths
file_path = '/content/drive/My Drive/Experiment/corpus/titles_and_abstracts.txt'
output_file_path = '/content/drive/My Drive/Experiment/filtered_bigrams.csv'

# Initialize stopwords and counters
stop_words = set(stopwords.words('english'))

# Initialize counters for tokens and bigrams
token_freq = Counter()
bigram_freq = Counter()

# Track chunk number
chunk_number = 1

# State carried from one chunk to the next. A fixed-size read can stop in the
# middle of a word, which would create two bogus tokens and lose the bigram
# spanning the border; both pieces of state below prevent that.
pending_text = ''      # Text after the last whitespace of the previous read
previous_token = None  # Last token of the previously processed chunk

# Read and process the file in chunks
with open(file_path, 'r', encoding='utf-8') as f:
    while True:
        chunk = f.read(CHUNK_SIZE)  # Read a chunk of text ('' at end of file)
        text = pending_text + chunk

        if chunk:
            # Hold back everything after the last whitespace: it may be the
            # first half of a word that continues in the next read
            cut = max(text.rfind(' '), text.rfind('\n'), text.rfind('\t'))
            if cut == -1:
                pending_text = text  # No word border yet; keep accumulating
                continue
            text, pending_text = text[:cut], text[cut + 1:]
        else:
            pending_text = ''  # End of file: the held-back text is complete

        if text.strip():
            print(f"Processing chunk {chunk_number}...")

            # Tokenize the chunk
            tokens = word_tokenize(text.lower())

            # Update token frequencies
            token_freq.update(tokens)

            # Count the bigram that links the previous chunk to this one
            if previous_token is not None and tokens:
                if previous_token not in stop_words and tokens[0] not in stop_words:
                    bigram_freq[(previous_token, tokens[0])] += 1

            # Generate and count bigrams, filtering out stopwords
            bigram_freq.update(
                bigram for bigram in bigrams(tokens)
                if bigram[0] not in stop_words and bigram[1] not in stop_words
            )

            if tokens:
                previous_token = tokens[-1]

            # Move to the next chunk
            chunk_number += 1

        if not chunk:
            break

# Keep bigrams that are frequent, made of well-formed words, and strongly
# associated according to the logDice score
def _is_well_formed(word):
    """Return True when ``word`` passes all three character-level checks."""
    return not (
        contains_invalid_characters(word)
        or is_only_numbers_or_symbols(word)
        or starts_or_ends_with_dash_or_underscore(word)
    )


all_filtered_bigrams = []
for (first_word, second_word), freq in bigram_freq.items():
    if freq < MIN_FREQUENCY:
        continue
    if not (_is_well_formed(first_word) and _is_well_formed(second_word)):
        continue

    freq_first_word = token_freq[first_word]
    freq_second_word = token_freq[second_word]
    if not (freq_first_word > 0 and freq_second_word > 0):  # Avoid division by zero
        continue

    # logDice = 14 + log2(2 * f(xy) / (f(x) + f(y)))
    log_dice = 14 + math.log2(2 * freq / (freq_first_word + freq_second_word))
    if log_dice < MIN_LOG_DICE:
        continue

    all_filtered_bigrams.append({
        'Bigram': f"{first_word} {second_word}",
        'Frequency': freq,
        'Log Dice': log_dice
    })

# Build the result table with a fixed column order and write it out as CSV
filtered_bigrams_df = pd.DataFrame(all_filtered_bigrams, columns=['Bigram', 'Frequency', 'Log Dice'])
filtered_bigrams_df.to_csv(output_file_path, index=False)

print(f"Filtered bigrams have been saved to {output_file_path}")

Sequences were retrieved (to extract candidate MWTs)

In [None]:
import pandas as pd
import re
import csv
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict

# Download NLTK's tokenizer if not already downloaded
nltk.download('punkt')

# Corpus, hypernym control list, and output locations
file_path = '/content/drive/My Drive/Experiment/corpus/titles_and_abstracts.txt'
control_list_path = '/content/drive/My Drive/Experiment/Hypernym.xlsx'
output_path = '/content/drive/My Drive/Experiment/Sequencies.csv'

# Read the non-empty entries of the "Hypernym" column
control_list_df = pd.read_excel(control_list_path)
expressions_to_find = control_list_df['Hypernym'].dropna().tolist()

# Map each expression's lowercased, space-joined token form to its original
# spelling; when two expressions share a token form, the later one wins
tokenized_expressions = {}
for expression in expressions_to_find:
    token_form = " ".join(word_tokenize(expression.lower()))
    tokenized_expressions[token_form] = expression

# List to store found contexts
all_found_contexts = []

# Define chunk size: 10MB
CHUNK_SIZE = 10 * 1024 * 1024  # Characters per read (the file is opened in text mode)

# Number of tokens of left context kept for every match
CONTEXT_WINDOW = 10

# Split every expression once, and drop expressions that have no token at all
# (indexing their first token would raise IndexError)
expression_entries = [
    (tokenized_expression.split(), tokenized_expression, original_expression)
    for tokenized_expression, original_expression in tokenized_expressions.items()
    if tokenized_expression.split()
]
longest_expression = max((len(entry[0]) for entry in expression_entries), default=1)

# Tokens re-used at the start of the next chunk: enough for a full left context
# plus an expression that begins at the very end of the current chunk
overlap_size = CONTEXT_WINDOW + longest_expression - 1

# Process the corpus in chunks
chunk_number = 0
pending_text = ''    # Text after the last whitespace of the previous read
carried_tokens = []  # Tail of the previous chunk's tokens

with open(file_path, 'r', encoding='utf-8') as f:
    while True:
        # Read the next chunk ('' at end of file)
        chunk = f.read(CHUNK_SIZE)
        text = pending_text + chunk

        if chunk:
            # A fixed-size read may stop inside a word; hold back the text
            # after the last whitespace so that no token is cut in two
            cut = max(text.rfind(' '), text.rfind('\n'), text.rfind('\t'))
            if cut == -1:
                pending_text = text
                continue
            text, pending_text = text[:cut], text[cut + 1:]
        else:
            pending_text = ''

        if text.strip():
            chunk_number += 1
            print(f"Processing chunk {chunk_number}...")

            # Tokenize the chunk and prepend the tail of the previous one
            carried_count = len(carried_tokens)
            tokens = carried_tokens + word_tokenize(text.lower())

            # Index each token's position in the chunk
            token_positions = defaultdict(list)
            for idx, token in enumerate(tokens):
                token_positions[token].append(idx)

            # Search for each expression in the tokenized chunk
            for expression_tokens, tokenized_expression, original_expression in expression_entries:
                expression_length = len(expression_tokens)
                for j in token_positions.get(expression_tokens[0], ()):
                    match_end = j + expression_length
                    # A match lying entirely in the carried tail was already
                    # reported while processing the previous chunk
                    if match_end <= carried_count:
                        continue
                    # Check if the entire expression matches
                    if tokens[j:match_end] == expression_tokens:
                        # Extract context: up to CONTEXT_WINDOW tokens before the expression
                        before_context = " ".join(tokens[max(0, j - CONTEXT_WINDOW):j])
                        context = f"{before_context} {tokenized_expression}".strip()
                        all_found_contexts.append((original_expression, context))

            carried_tokens = tokens[-overlap_size:]

        if not chunk:
            break  # End of file

# Summarise how many contexts were collected
if all_found_contexts:
    print(f"Total contexts found: {len(all_found_contexts)}")
else:
    print("No expressions were found in the text.")

# Dump the (expression, context) pairs under a header row
with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
    context_writer = csv.writer(csvfile)
    context_writer.writerow(['Expression', 'Context'])
    context_writer.writerows(all_found_contexts)

print(f'Successfully saved contexts to {output_path}')

Statistical evaluation of MWTs and Hypernyms (Part I. Absolute frequency)

In [None]:
import pandas as pd
import re
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor
import time

# Define paths and load expressions
controllist_path = '/content/drive/My Drive/Experiment/full_list.xlsx'
text_path = '/content/drive/My Drive/Experiment/corpus/titles_and_abstracts.txt'
controllist_df = pd.read_excel(controllist_path)
# NOTE(review): unlike the hypernym cell, empty cells are not dropped here, so a
# blank "Terms" cell would reach count_occurrences as a non-string - confirm the
# spreadsheet has no gaps.
expressions = controllist_df['Terms'].tolist()

# Chunk size and batch size
# CHUNK_SIZE is the argument of file.read() on a text-mode file, i.e. a number
# of characters per chunk.
CHUNK_SIZE = 10 * 1024 * 1024  # 10MB
# Number of submitted chunks after which the reading loop collects the results.
BATCH_SIZE = 50  # Process 50 chunks at a time

# Frequency function
def count_occurrences(text, multiword):
    """Return how many non-overlapping times ``multiword`` occurs literally in ``text``.

    The match is a plain substring match: no word boundaries are enforced, so
    "network" is also counted inside "networks".
    """
    return text.count(multiword)

# Process a single chunk
def process_chunk(chunk, expressions):
    """Return the occurrence count of every expression in ``chunk``.

    The result is a list aligned with ``expressions``: element ``i`` is the
    count returned by ``count_occurrences`` for ``expressions[i]``.
    """
    chunk_frequencies = []
    for expr in expressions:
        chunk_frequencies.append(count_occurrences(chunk, expr))
    return chunk_frequencies

# Initialize frequency array
frequencies = [0] * len(expressions)


def _collect_results(pending):
    """Add the per-expression counts of every finished future to ``frequencies``."""
    for future in concurrent.futures.as_completed(pending, timeout=300):
        for i, freq in enumerate(future.result()):
            frequencies[i] += freq


def _count_boundary_matches(left_tail, right_chunk):
    """Count occurrences that start in the previous chunk and end in the current one.

    For an expression of length k the window is the last k-1 characters of the
    previous chunk plus the first k-1 characters of the current chunk. Neither
    half is long enough to hold the expression alone, so every match found in
    the window crosses the border and has not been counted by a worker.
    """
    for i, expr in enumerate(expressions):
        if not isinstance(expr, str):
            continue
        reach = len(expr) - 1
        if reach > 0:  # A one-character expression cannot cross a border
            window = left_tail[-reach:] + right_chunk[:reach]
            frequencies[i] += count_occurrences(window, expr)


# Longest tail of a chunk that any expression may need at the next border
max_reach = max((len(expr) - 1 for expr in expressions if isinstance(expr, str)), default=0)

# Start chunk processing
chunk_counter = 0
futures = []
previous_tail = ''  # Last max_reach characters of the previous chunk

with open(text_path, 'r', encoding='utf-8') as file, ProcessPoolExecutor() as executor:
    while True:
        chunk = file.read(CHUNK_SIZE)
        if not chunk:
            break
        chunk_counter += 1
        print(f"Submitting chunk {chunk_counter} for processing...")
        futures.append(executor.submit(process_chunk, chunk, expressions))

        # Workers only see one chunk each, so matches split across two reads
        # are counted here in the parent process
        if previous_tail:
            _count_boundary_matches(previous_tail, chunk)
        previous_tail = chunk[-max_reach:] if max_reach > 0 else ''

        # Process futures in batches to reduce memory usage
        if len(futures) >= BATCH_SIZE:
            print(f"Processing batch of {BATCH_SIZE} chunks...")
            _collect_results(futures)
            futures.clear()  # Clear processed futures

# Process remaining futures (all finished once the executor context has exited)
_collect_results(futures)

# Attach the counts to the control list (one value per row, in row order) and
# write the table to a new workbook
output_path = '/content/drive/My Drive/Experiment/statistics.xlsx'
controllist_df['Absolute Frequency'] = frequencies
controllist_df.to_excel(output_path, index=False)

print(f"Results have been saved to {output_path}")

Statistical evaluation of MWTs and Hypernyms (Part II. C-value)

In [None]:
import pandas as pd
import numpy as np

# Load the data from the Excel file
# (the workbook written by the absolute-frequency cell above)
file_path = '/content/drive/My Drive/Experiment/statistics.xlsx'
df = pd.read_excel(file_path)

# Initialize a new column for C-values
df['C-value'] = 0.0

# Create a dictionary to store the frequency of each term
# If a term appears more than once in "Terms", dict() keeps the frequency of its
# last occurrence only.
term_frequencies = dict(zip(df['Terms'], df['Absolute Frequency']))

# Function to calculate C-value for a term
def calculate_c_value(term, freq_a, candidates):
    """Compute the C-value of ``term``.

    Args:
        term: The candidate term; its length is its number of whitespace-separated words.
        freq_a: Frequency of ``term``.
        candidates: Longer terms that contain ``term``; their frequencies are
            looked up in the module-level ``term_frequencies`` (0 if absent).

    Returns:
        ``log2(length) * freq_a`` when there are no candidates (0 if ``freq_a``
        is not positive), otherwise ``log2(length) * (freq_a - mean candidate
        frequency)``. A one-word term therefore always scores 0.
    """
    word_count = len(term.split())

    # Term that is not nested in any longer candidate
    if not candidates:
        if freq_a > 0:
            return np.log2(word_count) * freq_a
        return 0

    # Nested term: subtract the average frequency of the containing candidates
    p_t_a = len(candidates)
    sum_f_b = sum(term_frequencies.get(b, 0) for b in candidates)
    if p_t_a == 0:
        return 0
    return np.log2(word_count) * (freq_a - (sum_f_b / p_t_a))

# Compute one C-value per row, in row order
all_terms = list(term_frequencies.keys())
c_values = []
for _, row in df.iterrows():
    expression = row['Terms']
    freq_a = term_frequencies[expression]  # Frequency of the term
    # Longer terms that contain the expression as a plain substring
    candidates = [exp for exp in all_terms if expression in exp and exp != expression]
    c_values.append(calculate_c_value(expression, freq_a, candidates))

# Store the scores in the column prepared for them
df['C-value'] = c_values

# Write the table with its C-values to a new workbook
output_file_path = '/content/drive/My Drive/Experiment/statistics_with_cvalues.xlsx'
df.to_excel(output_file_path, index=False)

print(f"C-values have been calculated and saved to {output_file_path}")

Syntactic analysis of MWTs

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
import pandas as pd
import spacy
import re

# Load SpaCy's large model for dependency parsing
# (en_core_web_lg is the model downloaded by the preceding cell)
nlp = spacy.load("en_core_web_lg")

# Define the path to your Excel file
input_path = '/content/drive/My Drive/Experiment/MWT.xlsx'

# Load the data from the Excel file
df = pd.read_excel(input_path)

# Initialize a new column for syntactic structure representation
# (placeholder only: the column is overwritten by the .apply() call further down)
df['Syntactic structure representation'] = ""

# Protect intra-word hyphens so that hyphenated words stay single tokens
def preprocess_expression(expression):
    """Return ``expression`` with every hyphen that sits between two word characters replaced by '_'.

    Hyphens at the start or end of a word, or surrounded by spaces, are kept.
    """
    intra_word_hyphen = re.compile(r'(?<=\w)-(?=\w)')
    return intra_word_hyphen.sub('_', expression)

# Function to describe an expression by its (dependent, head) word pairs
def extract_dependency_pairs(expression):
    """Return the dependency pairs of ``expression`` as one comma-separated string.

    The expression is parsed with the module-level ``nlp`` pipeline after its
    intra-word hyphens have been replaced by underscores. Every token whose
    dependency label is not 'ROOT' contributes "<token> <head>", with all
    underscores in both texts turned back into hyphens.
    """
    doc = nlp(preprocess_expression(expression))

    dependency_pairs = []
    for token in doc:
        if token.dep_ == 'ROOT':
            continue
        dependent_text = token.text.replace('_', '-')
        head_text = token.head.text.replace('_', '-')
        dependency_pairs.append(f"{dependent_text} {head_text}")

    return ", ".join(dependency_pairs)

# Process each expression in the "MWT" column and store the dependency pairs
df['Syntactic structure representation'] = df['MWT'].apply(extract_dependency_pairs)

# Save the updated DataFrame to a new Excel file (MWT_syntax.xlsx); the input
# workbook MWT.xlsx is left untouched
output_path = '/content/drive/My Drive/Experiment/MWT_syntax.xlsx'
df.to_excel(output_path, index=False)

print(f"Syntactic structure representation saved to {output_path}")

Statistical analysis of syntactic structure representations

In [None]:
!pip install transformers

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# Load the SciBERT model and tokenizer
# Both are fetched by name with from_pretrained and are used as module-level
# globals by get_embedding below.
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to average token embeddings while ignoring masked-out positions
def mean_pooling(embeddings, attention_mask):
    """Return the attention-weighted mean of ``embeddings`` over dimension 1.

    The mask is broadcast to the shape of ``embeddings``; the denominator is
    clamped to at least 1e-9 so that an all-zero mask does not divide by zero.
    """
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_sum = (embeddings * mask).sum(1)
    weight_total = mask.sum(1).clamp(min=1e-9)
    return weighted_sum / weight_total

def get_embedding(text):
    """Return the mean-pooled last-layer embedding of ``text``.

    Uses the module-level ``tokenizer`` and ``model``; gradients are disabled
    during the forward pass and the pooled result is squeezed before returning.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = mean_pooling(hidden_states, encoded['attention_mask'])
    return pooled.squeeze()

# Read the workbook holding the MWTs and their syntactic structure representations
file_path = '/content/drive/My Drive/Experiment/MWT_syntax.xlsx'
df = pd.read_excel(file_path, sheet_name='Sheet1')

# One cosine similarity per row, between the "MWT" text and its
# "Syntactic structure representation" text
cosine_similarities = []
for _, row in df.iterrows():
    mwt_text = str(row['MWT'])
    structure_text = str(row['Syntactic structure representation'])

    # Embed both texts, then compare them as 1 x dim matrices
    mwt_vector = get_embedding(mwt_text).unsqueeze(0)
    structure_vector = get_embedding(structure_text).unsqueeze(0)
    similarity_matrix = cosine_similarity(mwt_vector, structure_vector)
    cosine_similarities.append(similarity_matrix[0][0])

# Store the scores in a new column "cosim"
df['cosim'] = cosine_similarities

# Report the arithmetic mean of the scores
mean_cosine_similarity = sum(cosine_similarities) / len(cosine_similarities)
print(f"Mean cosine similarity: {mean_cosine_similarity}")

# Write the table, including the new column, to a separate workbook
output_file_path = '/content/drive/My Drive/Experiment/MWT_cosine_similarities.xlsx'
df.to_excel(output_file_path, index=False)

print("Cosine similarities calculated and saved in the new file.")

Statistical analysis of triplets

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# Load the SciBERT model and tokenizer
# Same checkpoint as in the previous section; reloaded here so that this cell
# does not depend on that one having been run.
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to pool token embeddings into one vector per input
def mean_pooling(embeddings, attention_mask):
    """Average ``embeddings`` along dimension 1, counting only positions where the mask is non-zero.

    The divisor is clamped to a minimum of 1e-9, so a fully masked input
    produces zeros instead of a division by zero.
    """
    expanded_mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_total = torch.sum(embeddings * expanded_mask, 1)
    position_count = torch.clamp(expanded_mask.sum(1), min=1e-9)
    return masked_total / position_count

def get_embedding(text):
    """Encode ``text`` with the module-level tokenizer and model and return its mean-pooled vector.

    The forward pass runs under ``torch.no_grad()``; the last hidden state is
    pooled with ``mean_pooling`` using the tokenizer's attention mask, and the
    result is squeezed.
    """
    model_inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        model_outputs = model(**model_inputs)
    token_embeddings = model_outputs.last_hidden_state
    sentence_embedding = mean_pooling(token_embeddings, model_inputs['attention_mask'])
    return sentence_embedding.squeeze()

# Read the triplets from sheet "Sheet1" of the workbook
file_path = '/content/drive/My Drive/Experiment/Triplets.xlsx'
df = pd.read_excel(file_path, sheet_name='Sheet1')


def _similarity(first_embedding, second_embedding):
    """Return the cosine similarity of two 1-D embeddings as a scalar."""
    return cosine_similarity(first_embedding.unsqueeze(0), second_embedding.unsqueeze(0))[0][0]


# Per-row results: anchor vs. close term, anchor vs. distant term, and their gap
cosim1_values = []
cosim2_values = []
differences = []

for _, row in df.iterrows():
    # Embed the three terms of the triplet
    anchor_embedding = get_embedding(str(row['Anchor term']))
    closely_related_embedding = get_embedding(str(row['Closely related term']))
    distantly_related_embedding = get_embedding(str(row['Distantly related term']))

    # Similarity of the anchor to each of the two other terms
    cosim1 = _similarity(anchor_embedding, closely_related_embedding)
    cosim2 = _similarity(anchor_embedding, distantly_related_embedding)

    cosim1_values.append(cosim1)
    cosim2_values.append(cosim2)
    differences.append(cosim1 - cosim2)

# Attach the three result series as new columns
df['cosim1'] = cosim1_values
df['cosim2'] = cosim2_values
df['difference (cosim1-cosim2)'] = differences

# Write the table to a new workbook, in a sheet named "Triplet"
output_file_path = '/content/drive/My Drive/Experiment/triplet_cosine_similarities.xlsx'
df.to_excel(output_file_path, sheet_name='Triplet', index=False)

# Report the arithmetic mean of each result series
print(f"Mean cosim1: {sum(cosim1_values) / len(cosim1_values)}")
print(f"Mean cosim2: {sum(cosim2_values) / len(cosim2_values)}")
print(f"Mean difference (cosim1 - cosim2): {sum(differences) / len(differences)}")

print("Cosine similarities and differences calculated and saved in the new file.")


In [None]:
import pandas as pd
import scipy.stats as stats
from scipy.stats import shapiro

# Read the per-triplet similarities from the "Triplet" sheet
file_path = '/content/drive/My Drive/Experiment/triplet_cosine_similarities.xlsx'
df = pd.read_excel(file_path, sheet_name='Triplet')

# Paired samples and their stored difference
cosim1 = df['cosim1']
cosim2 = df['cosim2']
cosim_diff = df['difference (cosim1-cosim2)']

# Shapiro-Wilk normality test on the paired differences
shapiro_stat, shapiro_pvalue = shapiro(cosim_diff)
print("Shapiro-Wilk Test Statistic:", shapiro_stat)
print("Shapiro-Wilk Test p-value:", shapiro_pvalue)

# Choose the paired test: t-test when normality is not rejected at 0.05,
# Wilcoxon signed-rank test otherwise
if shapiro_pvalue >= 0.05:
    print("Data is normally distributed. Proceeding with paired t-tests.")
    test_label, paired_test = "paired T-test", stats.ttest_rel
else:
    print("Data is not normally distributed. Proceeding with Wilcoxon signed-rank tests.")
    test_label, paired_test = "Wilcoxon signed-rank test", stats.wilcoxon

# Run the chosen test two-tailed, then one-tailed with the alternative
# hypothesis cosim1 > cosim2
for tail_label, test_options in (("Two-tailed", {}), ("One-tailed", {"alternative": "greater"})):
    statistic, pvalue = paired_test(cosim1, cosim2, **test_options)
    print(f"{tail_label} {test_label} Statistic:", statistic)
    print(f"{tail_label} {test_label} p-value:", pvalue)


Practical implementation of the EEMWT dataset (example)

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load SciBERT model and tokenizer
# (module-level globals used by get_embedding below)
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")
model = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")

# Load the positive triplets
# No sheet_name is given, so pandas reads the first sheet of the workbook.
positive_triplets_df = pd.read_excel('/content/drive/My Drive/Experiment/Triplets.xlsx')

# Function to embed a text with a selectable pooling strategy
def get_embedding(text, pooling_method='mean'):
    """Return a NumPy embedding of ``text`` pooled over the token dimension.

    Args:
        text: The text to encode with the module-level tokenizer and model.
        pooling_method: 'mean', 'max' or 'min' over dimension 1 of the last
            hidden state, or 'cls' for the vector at position 0.

    Raises:
        ValueError: If ``pooling_method`` is not one of the four names above
            (raised after the forward pass has run).
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state

    # Reduce the token dimension according to the requested strategy
    if pooling_method == 'mean':
        pooled = hidden.mean(dim=1)
    elif pooling_method == 'max':
        pooled = hidden.max(dim=1).values
    elif pooling_method == 'min':
        pooled = hidden.min(dim=1).values
    elif pooling_method == 'cls':
        pooled = hidden[:, 0, :]
    else:
        raise ValueError("Pooling method must be one of ['mean', 'max', 'min', 'cls']")

    return pooled.numpy()

# Counters for the metrics (reset again at the start of every pooling method)
TP = FP = 0
total_triplets = len(positive_triplets_df)

# Pooling strategies to compare
pooling_methods = ['mean', 'max', 'min', 'cls']

# Columns holding the three terms of a triplet, in embedding order
triplet_columns = ('Anchor term', 'Closely related term', 'Distantly related term')

for pooling_method in pooling_methods:
    TP = 0  # Anchor/close pairs scored above the "close" threshold
    FP = 0  # Anchor/distant pairs scored above the "distant" threshold

    print(f"\nEvaluating with {pooling_method} pooling...")

    for _, row in positive_triplets_df.iterrows():
        # Embed anchor, close and distant term with the current pooling method
        anchor_emb, close_emb, dist_emb = (
            get_embedding(row[column], pooling_method) for column in triplet_columns
        )

        sim_close = cosine_similarity(anchor_emb, close_emb)[0][0]
        sim_dist = cosine_similarity(anchor_emb, dist_emb)[0][0]

        # NOTE(review): the two decisions use different cut-offs (0.85 vs 0.9);
        # confirm that this asymmetry is intended.
        if sim_close > 0.85:
            TP += 1
        if sim_dist > 0.9:
            FP += 1

    # Each triplet contributes one positive pair and one negative pair
    FN = total_triplets - TP
    TN = total_triplets - FP

    # Metrics, each falling back to 0 when its denominator is not positive
    predicted_positive = TP + FP
    actual_positive = TP + FN
    all_pairs = TP + FP + TN + FN
    precision = TP / predicted_positive if predicted_positive > 0 else 0
    recall = TP / actual_positive if actual_positive > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    accuracy = (TP + TN) / all_pairs if all_pairs > 0 else 0

    # Results for the current pooling method
    print("Results:")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-score: {f1:.4f}")
    print(f"  Accuracy: {accuracy:.4f}")

    # Raw confusion-matrix counts
    print(f"\nTP: {TP}, FP: {FP}, FN: {FN}, TN: {TN}")