In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [None]:
path = '/home/kebl7383/claim-matching-robustness/experiments/mitigation/gpt4o/ood-dataset/orig_ood_normalised.jsonl'

In [None]:
parsed_df = pd.read_json(path, lines=True)

In [None]:
def parse_normalised_claim(row):
    """
    Extracts the claim text from a "Normalised Claim: ..." string.

    The value in row['normalised'] is expected to start with the prefix
    "Normalised Claim:" (leading/trailing whitespace is tolerated), optionally
    followed by the claim wrapped in one pair of double quotes. Whenever no
    usable claim can be extracted, the original query is returned instead.

    Args:
        row (pd.Series): A row of the DataFrame containing 'normalised' and 'query'.

    Returns:
        str: The extracted claim content, or row['query'] if 'normalised' is
            not a string, lacks the prefix, or yields an empty claim.
    """
    prefix = "Normalised Claim:"
    raw = row['normalised']

    # Non-string values (e.g. a missing generation) cannot be parsed.
    if not isinstance(raw, str):
        return row['query']

    text = raw.strip()
    if not text.startswith(prefix):
        return row['query']

    # Remove the prefix and trim whitespace
    claim = text[len(prefix):].strip()

    # Remove one pair of surrounding quotes; the length guard stops a lone '"'
    # from being treated as both the opening and the closing quote.
    if len(claim) >= 2 and claim.startswith('"') and claim.endswith('"'):
        claim = claim[1:-1]

    # An empty claim is a failed parse: fall back to the original query.
    return claim if claim.strip() else row['query']

# Apply function using DataFrame row-wise processing
parsed_df['normalised_claim'] = parsed_df.apply(parse_normalised_claim, axis=1)

In [None]:
parsed_df.head()

In [None]:
# Save the normalised claims as a TSV of (query_id, query) pairs: the
# 'normalised_claim' column is written under the header name "query".
pd.DataFrame(parsed_df[['query_id', 'normalised_claim']]).to_csv(
    '/home/kebl7383/claim-matching-robustness/experiments/ood/ood-dataset/ood_normalised_queries.tsv',
    index=False,
    header=["query_id", "query"],
    sep="\t",
)

In [None]:
import matplotlib.pyplot as plt

# Prepare the data for plotting: one entry per perturbation type, one column
# per (condition, system) pair. plot_all_comparisons draws these values on an
# axis labelled 'MAP@20'.
# NOTE: the "Unpertubed-" spelling is looked up verbatim by
# plot_all_comparisons, so the keys cannot be corrected here in isolation.
data = {
    "Perturbation": ["Typos (Most)", "Dialect (Pidgin)", "Entity Replacement (All)"],
    "Unpertubed-LASER": [0.3925368375, 0.3762761346, 0.383623902],
    "Unpertubed-LASER+CN": [0.4692155375, 0.4272919467, 0.4487253053],
    "Perturbed-LASER": [0.327248736, 0.3307935937, 0.3221515615],
    "Perturbed-LASER+CN": [0.481794082, 0.4600243046, 0.3968193758],
    "Unpertubed-RoLASER": [0.4048508585, 0.3883688138, 0.4164909542],
    "Unpertubed-RoLASER+CN": [0.4239301394, 0.3980654122, 0.4168946832],
    "Perturbed-RoLASER": [0.3511871528, 0.3336198569, 0.3129087056],
    "Perturbed-RoLASER+CN": [0.4797898815, 0.4129593157, 0.3616379534]
}

df = pd.DataFrame(data)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import cmcrameri.cm as cmc
import matplotlib

# Set font globally to Helvetica Neue
matplotlib.rcParams['font.family'] = 'Helvetica Neue'
plt.rcParams.update({'font.size': 16})  # Global font size


def plot_all_comparisons(df, output_file='comparison_plot.pdf'):
    """
    Plots perturbed vs. unperturbed scores for every perturbation, side by side.

    One subplot is drawn per unique value of df["Perturbation"]. Within each
    subplot a bar shows the perturbed score of each system and a marker shows
    the matching unperturbed score. The figure is saved as a PDF and shown.

    Args:
        df (pd.DataFrame): One row per perturbation with a "Perturbation"
            column plus "Unpertubed-<system>" (spelled as in the data) and
            "Perturbed-<system>" columns for each system in
            LASER, LASER+CN, RoLASER, RoLASER+CN.
        output_file (str): Path of the PDF file to write.

    Raises:
        ValueError: If df contains no perturbations.
    """
    x_labels = ['LASER', 'LASER+CN', 'RoLASER', 'RoLASER+CN']
    x_positions = np.arange(len(x_labels))
    bar_width = 0.50

    perturbations = df["Perturbation"].unique()
    num_perturbations = len(perturbations)
    if num_perturbations == 0:
        raise ValueError("df contains no perturbations to plot")

    # squeeze=False always returns a 2-D array of axes, so the loop below also
    # works when there is a single perturbation (a bare Axes is not iterable).
    fig, axes_grid = plt.subplots(
        1,
        num_perturbations,
        figsize=(6 * num_perturbations, 4),
        sharey=True,
        squeeze=False,
    )
    axes = axes_grid[0]

    # One colour/marker per system, in x_labels order
    bar_colors = ['#5DADE2', '#1B4F72', '#28B463', '#117A65']  # Light blue, navy, green, dark green
    scatter_colors = ['#21618C', '#5499C7', '#82E0AA', '#48C9B0']  # Contrasting colors for scatter markers
    markers = ['s', '^', 'h', 'o']  # Square, triangle, hexagon, circle
    y_ticks = np.arange(0.30, 0.56, 0.05)  # Adjust start, end, and step size as needed

    # Loop through each perturbation and plot
    for ax, perturbation in zip(axes, perturbations):
        row = df[df["Perturbation"] == perturbation].iloc[0]

        # Column names follow the "<condition>-<system>" pattern of the data
        unperturbed = [row[f"Unpertubed-{name}"] for name in x_labels]
        perturbed = [row[f"Perturbed-{name}"] for name in x_labels]

        for i, x in enumerate(x_positions):
            # Bar: perturbed score
            ax.bar(x, perturbed[i], width=bar_width, color=bar_colors[i], alpha=0.8)

            # Marker: unperturbed score of the same system
            ax.scatter(
                x,
                unperturbed[i],
                color=scatter_colors[i],
                marker=markers[i],
                s=100,
                edgecolors='white',
                linewidth=1,
                label=f'{x_labels[i]} Unperturbed',
            )

        # Light dashed guide lines at every tick position
        for y in y_ticks:
            ax.axhline(y=y, color='lightgray', linestyle='--', linewidth=0.3)
        for x in x_positions:
            ax.axvline(x=x, color='lightgray', linestyle='--', linewidth=0.3)

        # Set labels and title
        ax.set_title(f'{perturbation}')
        ax.set_xticks(x_positions)
        ax.set_xticklabels(x_labels)
        ax.set_yticks(y_ticks)
        ax.set_ylim(0.30, 0.55)

        # Set the box outline (spines) to grey
        for spine in ax.spines.values():
            spine.set_edgecolor('grey')

        ax.tick_params(axis='both', length=0)

    # Add a common y-axis label
    fig.text(-0.005, 0.5, 'MAP@20', va='center', rotation='vertical')

    # Every subplot carries the same marker labels, so one axis is enough to
    # build the shared legend.
    handles, labels = axes[-1].get_legend_handles_labels()
    fig.legend(handles, labels, loc='lower center', ncol=len(x_labels), fontsize='small', bbox_to_anchor=(0.5, -0.1), frameon=False)

    # Adjust layout to prevent overlap
    plt.tight_layout(rect=[0, 0, 1, 0.95])

    plt.savefig(output_file, format='pdf', bbox_inches='tight', dpi=300)

    # Show the combined plot
    plt.show()

# Call the function to plot all perturbations
plot_all_comparisons(df)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Sample CSV data (simulated based on the description)
# One row per model; the last four rows are the "+ CN" variants of the first
# four. Each remaining column holds the scores for one perturbation type.
# NOTE: this rebinds `data` and `df`, replacing the LASER/RoLASER results
# defined in the earlier plotting cells.
data = {
    "Model": [
        "all-mpnet-v2", "all-mpnet-basev2-robust", "all-mpnet-base-v2-ft", "all-mpnet-basev2-robust-ft",
        "all-mpnet-v2 + CN", "all-mpnet-basev2-robust + CN", "all-mpnet-base-v2-ft + CN", "all-mpnet-basev2-robust-ft + CN"
    ],
    "Typos": [0.6777792947, 0.7299715423, 0.7738389936, 0.7778453705, 0.7802357695, 0.8297682036, 0.8864141811, 0.8815278637],
    "Entity Replacement": [0.6634501966, 0.7741837164, 0.8048978672, 0.8113413255, 0.7465073234, 0.8383203952, 0.8653778943, 0.8863768749],
    "Dialect-Pidgin": [0.651213211, 0.7962632275, 0.798595063, 0.8428588505, 0.7289621542, 0.8162026703, 0.8615482391, 0.865947589],
    "Negation": [0.7365962491, 0.8122539556, 0.8538558851, 0.8422887173, 0.7930470993, 0.8236963299, 0.8748762499, 0.8531178844]
}

# Convert to DataFrame
df = pd.DataFrame(data)

def plot_perturbations_no_xlabels(df, output_file='perturbations_no_xlabels.pdf'):
    """
    Plots one bar per model for each perturbation type, without x-axis labels.

    A subplot is created for each of the four perturbation columns; models are
    distinguished by colour only and identified through a shared legend.

    Args:
        df (pd.DataFrame): One row per model with a "Model" column and the
            score columns "Typos", "Entity Replacement", "Dialect-Pidgin"
            and "Negation".
        output_file (str): Intended PDF path. Currently unused because the
            savefig call below is commented out.
    """
    categories = ["Typos", "Entity Replacement", "Dialect-Pidgin", "Negation"]
    num_categories = len(categories)
    model_names = df["Model"].tolist()
    num_models = len(model_names)

    # Define color map for the models. plt.get_cmap is used because no `cm`
    # module is imported in this notebook.
    cmap = plt.get_cmap('plasma', num_models)
    colors = [cmap(i) for i in range(num_models)]

    # Create subplots for each perturbation type
    fig, axes = plt.subplots(1, num_categories, figsize=(6 * num_categories, 5), sharey=True)
    bar_width = 1.0  # Full-width bars for no spacing
    x_positions = np.arange(num_models)  # X positions for each bar

    for ax, category in zip(axes, categories):
        # Positional access so a non-default DataFrame index cannot mis-align bars
        scores = df[category].tolist()
        for i, model in enumerate(model_names):
            ax.bar(x_positions[i], scores[i], color=colors[i], width=bar_width, align='center', label=model if ax is axes[0] else "")
        ax.set_title(category, fontsize=14)
        ax.set_xticks([])  # Remove x-axis labels
        # ax.set_ylabel("Score", fontsize=12)
        ax.set_ylim(0.5, 1)
        ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Add a single legend built from colour swatches, one per model
    handles = [plt.Rectangle((0, 0), 1, 1, color=colors[i]) for i in range(num_models)]
    fig.legend(handles, model_names, loc='upper center', ncol=num_models, bbox_to_anchor=(0.5, 1.15), fontsize=14)

    # Adjust layout and save
    plt.tight_layout()
    # plt.savefig(output_file, format='pdf', bbox_inches='tight', dpi=300)
    plt.show()

# Call the function to create the plot without x-axis labels
plot_perturbations_no_xlabels(df)

In [None]:
def plot_perturbations_no_spacing_in_groups(df, output_file='perturbations_no_spacing_in_groups.pdf'):
    """
    Plots model scores per perturbation type as two gap-separated bar groups.

    The first four rows of df are treated as the models without CN and the
    remaining rows as the models with CN. Bars inside a group touch each
    other; the two groups are separated by a gap.

    Args:
        df (pd.DataFrame): One row per model with a "Model" column and the
            score columns "Typos", "Entity Replacement", "Dialect-Pidgin"
            and "Negation".
        output_file (str): Intended PDF path. Currently unused because the
            savefig call below is commented out.
    """
    categories = ["Typos", "Entity Replacement", "Dialect-Pidgin", "Negation"]
    num_categories = len(categories)

    # Split the data into without CN and with CN
    df_no_cn = df.iloc[:4]
    df_with_cn = df.iloc[4:]
    num_no_cn = len(df_no_cn)
    num_with_cn = len(df_with_cn)

    # Define color map for the models. plt.get_cmap is used because no `cm`
    # module is imported in this notebook; the colormap name is 'viridis'.
    cmap = plt.get_cmap('viridis', len(df))
    # The "with CN" models take the first colours and the "without CN" models
    # the following ones; each list is sized from its own group.
    colors_with_cn = [cmap(i) for i in range(num_with_cn)]
    colors_no_cn = [cmap(num_with_cn + i) for i in range(num_no_cn)]

    # Create subplots for each perturbation type
    fig, axes = plt.subplots(1, num_categories, figsize=(6 * num_categories, 4), sharey=True)
    bar_width = 1.0  # Bar width
    x_spacing = 2.0  # Spacing between groups

    # Bar positions are identical in every subplot, so compute them once
    x_positions_no_cn = np.arange(num_no_cn)
    x_positions_with_cn = (num_no_cn - 1) + x_spacing + np.arange(num_with_cn)

    for ax, category in zip(axes, categories):
        # Add bars for models without CN
        for i, model in enumerate(df_no_cn["Model"]):
            ax.bar(x_positions_no_cn[i], df_no_cn[category].iloc[i], color=colors_no_cn[i], width=bar_width, align='center', label=model if ax is axes[0] else "")

        # Add bars for models with CN, placed after a gap
        for i, model in enumerate(df_with_cn["Model"]):
            ax.bar(x_positions_with_cn[i], df_with_cn[category].iloc[i], color=colors_with_cn[i], width=bar_width, align='center', label=model if ax is axes[0] else "")

        # Add titles and format axes
        ax.set_title(category, fontsize=20)
        ax.set_ylim(0.5, 1)
        ax.grid(axis='y', linestyle='--', alpha=0.7)
        ax.set_xticks([])

    # Same order as the rows of df: without-CN models first, then with-CN
    all_colors = colors_no_cn + colors_with_cn

    # Add a single legend
    handles = [plt.Rectangle((0, 0), 1, 1, color=color) for color in all_colors]
    fig.legend(handles, df["Model"].tolist(), loc='lower center', ncol=len(df), bbox_to_anchor=(0.5, -0.1), fontsize=14, frameon=False)
    # Add a common y-axis label
    fig.text(-0.007, 0.5, 'MAP@20', va='center', rotation='vertical', fontdict={'fontsize': 20})

    # Adjust layout and save
    plt.tight_layout()
    # plt.savefig(output_file, format='pdf', bbox_inches='tight', dpi=300)
    plt.show()

# Call the function to create the plot with no spacing within each group
plot_perturbations_no_spacing_in_groups(df)


In [None]:
# Implement logic to create parallel dataset sentences for each perturbation type
# Load the generations from GPT4o
perturbation_path = '/home/kebl7383/claim-matching-robustness/experiments/named_entity_replacement/gpt4o/clef2021-checkthat-task2a--english/train_worstcase_named_entity_replacements_verified.jsonl'

verified_df = pd.read_json(perturbation_path, lines=True)

def parse_rewritten_tweets(text):
    """
    Parses a given string of rewritten tweets into a list of individual tweets.

    The text is split on the literal marker "Rewritten Tweet", so tweets are
    found even when they are not separated by proper newlines. For each
    marker, everything after the first colon (i.e. after the tweet number) is
    kept, with real and escaped newlines removed.

    Anything preceding the first marker (e.g. an introductory sentence) is
    ignored, so that the position of each tweet in the returned list matches
    its marker order.

    Args:
        text (str): The input string containing rewritten tweets.

    Returns:
        list: A list of individual rewritten tweets, in order of appearance.
    """
    marker = "Rewritten Tweet"
    tweets = []
    # Element 0 of the split is whatever precedes the first marker (a preamble
    # or the empty string) and is never a tweet, so it is skipped.
    for segment in text.split(marker)[1:]:
        # Ignore empty segments
        if not segment.strip():
            continue
        # A segment looks like " <number>: <tweet>"; keep what follows the colon
        _, colon, body = segment.partition(":")
        if colon:
            tweets.append(body.replace("\n", "").replace("\\n", "").strip())
    return tweets

In [None]:
import json 
import random

In [None]:
sample_response = verified_df.sample(n=1)['rewrites'].values[0]
parse_rewritten_tweets(sample_response)

In [None]:
# Pair each original claim with every rewrite whose verification label is 1.
perturbed_claims = []
for _, row in verified_df.iterrows():
    rewrites = parse_rewritten_tweets(str(row["rewrites"]))
    # Skip the row if the verification payload is not valid JSON, is not a
    # string, or has no "labels" entry.
    try:
        verified_labels = json.loads(row["verification"])["labels"]
    except (ValueError, TypeError, KeyError):
        continue
    # Labels are matched to rewrites by position.
    for position, label in enumerate(verified_labels):
        # Keep only label 1, and ignore labels that point past the parsed
        # rewrites instead of aborting the whole cell with an IndexError.
        if label != 1 or position >= len(rewrites):
            continue
        perturbed_claims.append(
            {
                "query_id": row["query_id"],
                "original_query": row["query"],
                "perturbed_query": rewrites[position],
            }
        )

In [None]:
len(perturbed_claims)

In [None]:
entity_replaced_df = pd.DataFrame(perturbed_claims)

In [None]:
entity_replaced_df.head()

In [None]:
# Load the dialect changes
perturbation_path = "/home/kebl7383/claim-matching-robustness/experiments/dialect/gpt4o/clef2021-checkthat-task2a--english/train_dialect_rewrites_verified.jsonl"

In [None]:
dialect_verified_df = pd.read_json(perturbation_path, lines=True)

In [None]:
def parse_rewritten_old_tweets(text):
    """
    Collects the rewritten tweets found in a newline-separated response.

    Only lines that (after trimming) begin with "Rewritten Tweet" and contain
    the separator ": " contribute a tweet; the tweet is whatever follows the
    first ": " on that line. All other lines are ignored.

    Args:
        text (str): The input string containing rewritten tweets.

    Returns:
        list: A list of individual rewritten tweets, in line order.
    """
    extracted = []
    for raw_line in text.split("\n"):
        candidate = raw_line.strip()
        # Blank lines and lines without the marker carry no tweet
        if not candidate.startswith("Rewritten Tweet"):
            continue
        # Split once on the first ": "; an empty separator means it was absent
        _, separator, remainder = candidate.partition(": ")
        if separator:
            extracted.append(remainder)
    return extracted

In [None]:
# Pair each original claim with every dialect rewrite whose verification label is 1.
dialect_perturbed_claims = []
for _, row in dialect_verified_df.iterrows():
    rewrites = parse_rewritten_old_tweets(str(row["rewrites"]))
    # Skip the row if the verification payload is not valid JSON, is not a
    # string, or has no "labels" entry.
    try:
        verified_labels = json.loads(row["verification"])["labels"]
    except (ValueError, TypeError, KeyError):
        continue
    # Labels are matched to rewrites by position.
    for position, label in enumerate(verified_labels):
        # Keep only label 1, and ignore labels that point past the parsed
        # rewrites instead of aborting the whole cell with an IndexError.
        if label != 1 or position >= len(rewrites):
            continue
        dialect_perturbed_claims.append(
            {
                "query_id": row["query_id"],
                "original_query": row["query"],
                "perturbed_query": rewrites[position],
            }
        )

In [None]:
dialect_perturbed_df = pd.DataFrame(dialect_perturbed_claims)

In [None]:
dialect_perturbed_df.tail()

In [None]:
dialect_perturbed_df.shape, entity_replaced_df.shape

In [None]:
# Load typos
typos_perturbation_path = "/home/kebl7383/claim-matching-robustness/experiments/typos/gpt4o/clef2021-checkthat-task2a--english/train_llm_typos_verified.jsonl"

In [None]:
typos_verified_df = pd.read_json(typos_perturbation_path, lines=True)

In [None]:
# Pair each original claim with every typo rewrite whose verification label is 1.
typos_perturbed_claims = []
for _, row in typos_verified_df.iterrows():
    rewrites = parse_rewritten_old_tweets(str(row["rewrites"]))
    # Skip the row if the verification payload is not valid JSON, is not a
    # string, or has no "labels" entry.
    try:
        verified_labels = json.loads(row["verification"])["labels"]
    except (ValueError, TypeError, KeyError):
        continue
    # Labels are matched to rewrites by position.
    for position, label in enumerate(verified_labels):
        # Keep only label 1, and ignore labels that point past the parsed
        # rewrites instead of aborting the whole cell with an IndexError.
        if label != 1 or position >= len(rewrites):
            continue
        typos_perturbed_claims.append(
            {
                "query_id": row["query_id"],
                "original_query": row["query"],
                "perturbed_query": rewrites[position],
            }
        )

In [None]:
typos_perturbed_df = pd.DataFrame(typos_perturbed_claims) 

In [None]:
typos_perturbed_df.shape

In [None]:
(dialect_perturbed_df.shape[0] + entity_replaced_df.shape[0] + typos_perturbed_df.shape[0])

In [None]:
# Load the negation changes
negation_perturbation_path = "/home/kebl7383/claim-matching-robustness/experiments/negation/gpt4o/clef2021-checkthat-task2a--english/train_worstcase_negation_verified.jsonl"

In [None]:
negation_verified_df = pd.read_json(negation_perturbation_path, lines=True)

In [None]:
import re

In [None]:
def parse_claims(markdown_json_string):
    """
    Parses a JSON string formatted with Markdown-style backticks and returns the list of claims.

    If the input contains a fenced block (``` or ```json), the content of the
    first such block is parsed, so text before or after the fence does not
    break parsing. Otherwise any stray fence markers are removed and the
    remaining text is parsed as JSON.

    Args:
        markdown_json_string (str): A string containing JSON, optionally
            wrapped in Markdown backticks.

    Returns:
        list: The value of "negated_claims" from the JSON object, or an empty
            list if the input cannot be parsed, is not a JSON object, or does
            not hold a list under that key.
    """
    try:
        text = markdown_json_string.strip()

        # Prefer the body of a complete fenced block, wherever it appears
        fenced = re.search(r"```(?:json)?\s*\n(.*?)```", text, flags=re.DOTALL)
        if fenced:
            cleaned_json_string = fenced.group(1)
        else:
            # No complete fence: remove Markdown formatting (backticks and
            # optional language labels) and parse what is left
            cleaned_json_string = re.sub(r"```(?:json)?\n", "", text).strip("`")

        # Parse the cleaned JSON string
        parsed_data = json.loads(cleaned_json_string)

        # Return the list of claims; anything that is not a list (e.g. null)
        # is reported as "no claims" so callers can always index the result
        claims = parsed_data.get("negated_claims", [])
        return claims if isinstance(claims, list) else []
    except (json.JSONDecodeError, AttributeError) as e:
        # Covers invalid JSON, non-string input (no .strip) and JSON values
        # that are not objects (no .get)
        print(f"Error parsing JSON: {e}")
        return []

In [None]:
test = negation_verified_df.sample(n=1)['rewrites'].values[0]
parse_claims(test)

In [None]:
# Pair each original claim with every negated rewrite whose verification label is 1.
negation_perturbed_claims = []
for _, row in negation_verified_df.iterrows():
    rewrites = parse_claims(row["rewrites"])
    # Skip the row if the verification payload is not valid JSON, is not a
    # string, or has no "labels" entry.
    try:
        verified_labels = json.loads(row["verification"])["labels"]
    except (ValueError, TypeError, KeyError):
        continue
    # Labels are matched to rewrites by position.
    for position, label in enumerate(verified_labels):
        if label != 1:
            continue
        # Skip labels with no matching rewrite (fewer rewrites parsed than
        # labels, or rewrites not indexable by position).
        try:
            perturbed_query = rewrites[position]
        except (IndexError, TypeError, KeyError):
            continue
        negation_perturbed_claims.append(
            {
                "query_id": row["query_id"],
                "original_query": row["query"],
                "perturbed_query": perturbed_query,
            }
        )

In [None]:
len(negation_perturbed_claims)

In [None]:
negation_perturbed_df = pd.DataFrame(negation_perturbed_claims)

In [None]:
(dialect_perturbed_df.shape[0] + entity_replaced_df.shape[0] + typos_perturbed_df.shape[0] + negation_perturbed_df.shape[0])

In [None]:
all_dfs = pd.concat([dialect_perturbed_df, entity_replaced_df, typos_perturbed_df, negation_perturbed_df])

In [None]:
all_dfs.shape

In [None]:
all_dfs.head()

In [None]:
all_dfs.to_csv('train_perturbed_queries_lite.csv', index=False)

In [None]:
from itertools import combinations

In [None]:
# Function to generate combinatorial pairs
def generate_combinatorial_pairs(df):
    """
    Builds every unordered pair of perturbed queries that share a query_id.

    Args:
        df (pd.DataFrame): Frame with 'query_id' and 'perturbed_query' columns.

    Returns:
        pd.DataFrame: One row per pair with columns 'query_id',
            'original_query' (first member of the pair) and
            'perturbation_query' (second member). Groups with fewer than two
            perturbed queries contribute no rows.
    """
    records = [
        {
            "query_id": group_id,
            "original_query": first,
            "perturbation_query": second,
        }
        for group_id, members in df.groupby("query_id")
        for first, second in combinations(members["perturbed_query"].tolist(), 2)
    ]
    return pd.DataFrame(records)

# Generate the test file
test_df = generate_combinatorial_pairs(all_dfs)

In [None]:
test_df.shape

In [None]:
test_df[test_df["query_id"] == "tweet-sno-4"]  

In [None]:
# Save output as CSV
test_df.to_csv('train_perturbed_queries_full.csv', index=False)

In [None]:
import pandas as pd

In [None]:
# Load the lite file
lite_df = pd.read_csv('train_perturbed_queries_lite.csv')

In [None]:
lite_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Split by query_id rather than by row, so that all rows of one query end up
# on the same side of the train/eval split.

# Step 1: Get unique query IDs
unique_query_ids = lite_df['query_id'].unique()

# Step 2: Split the unique IDs into train and evaluation sets
# (10% of the IDs go to evaluation; random_state fixes the shuffle)
train_ids, eval_ids = train_test_split(unique_query_ids, test_size=0.10, random_state=42)

# Step 3: Split the original DataFrame based on the IDs
train_df = lite_df[lite_df['query_id'].isin(train_ids)]
eval_df = lite_df[lite_df['query_id'].isin(eval_ids)]

In [None]:
# Display the resulting DataFrames
train_df.shape, eval_df.shape

In [None]:
eval_df.head()

In [None]:
# Save lite train and eval files
train_df.to_csv('train_perturbed_queries_lite_train.csv', index=False)
eval_df.to_csv('train_perturbed_queries_lite_eval.csv', index=False)

In [None]:
full_df = pd.read_csv('train_perturbed_queries_full.csv')

In [None]:
from sklearn.model_selection import train_test_split

# Same ID-level split as for the lite file, applied to the full (pairwise) file.
# NOTE: this rebinds unique_query_ids, train_ids and eval_ids from the lite split.
# NOTE(review): the IDs are re-drawn from full_df rather than reused from the
# lite split, so a query_id may be "train" in one file and "eval" in the
# other -- confirm this is intended.

# Step 1: Get unique query IDs
unique_query_ids = full_df['query_id'].unique()

# Step 2: Split the unique IDs into train and evaluation sets
train_ids, eval_ids = train_test_split(unique_query_ids, test_size=0.10, random_state=42)

# Step 3: Split the original DataFrame based on the IDs
full_train_df = full_df[full_df['query_id'].isin(train_ids)]
full_eval_df = full_df[full_df['query_id'].isin(eval_ids)]

In [None]:
full_train_df.shape, full_eval_df.shape

In [None]:
# Save full train and eval files (the full split, not the lite one)
full_train_df.to_csv('train_perturbed_queries_full_train.csv', index=False)
full_eval_df.to_csv('train_perturbed_queries_full_eval.csv', index=False)