In [None]:
# Ensure nltk is set up (run this once)
import nltk

# nltk.download() expects bare package ids. A path-style id such as
# 'taggers/averaged_perceptron_tagger' is not in the NLTK index, so that
# download fails and the tagger is never installed.
NLTK_RESOURCES = [
    'punkt',
    'punkt_tab',  # tokenizer tables looked up by word_tokenize on newer NLTK releases
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
    'maxent_ne_chunker_tab',
    'words',
]

# nltk.download returns False when a package could not be fetched; surface
# that here instead of failing later with a LookupError inside another cell.
failed_resources = [name for name in NLTK_RESOURCES if not nltk.download(name)]
if failed_resources:
    print(f"WARNING: could not download NLTK resources: {failed_resources}")

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np

#file_path = "results/results_gradients/val-text-preds.csv"
#file_path = "results/results_gradients/combined_val_text_preds.csv"
root_file_path = "results/results_gradients/"

# Every evaluated dataset lives in "<root><name>_eval_output.csv".
# (Disabled: "one_million_instructions_train", stored as
#  "one_million_instructions_train_1000_eval_output.csv".)
dataset_names = [
    "ag_news",
    "anthropic_toxic_prompts",
    "arxiv",
    "one_million_instructions_train_10k",
    "one_million_instructions_val",
    "wikibio",
    "xsum_doc",
    "xsum_summ",
    "python_code_alpaca",
]
file_path_dict = {
    name: f"{root_file_path}{name}_eval_output.csv" for name in dataset_names
}

# Load one DataFrame per dataset, keyed by dataset name.
df_dict = {}
for key in file_path_dict:
    print(f"reading {key}'s file", flush=True)
    df_dict[key] = pd.read_csv(file_path_dict[key])

# Function to calculate matching token length
def calculate_matching_length(row):
    """Count position-wise token matches between a row's original and decoded text.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing 'Original'
            and 'Decoded' entries.

    Returns:
        tuple[int, int]: (number of aligned positions whose tokens are equal,
        number of tokens in the original text).
    """
    def _tokens(text):
        # Empty CSV cells are read back as NaN (a float), which word_tokenize
        # cannot handle; treat any non-string value as empty text.
        return word_tokenize(text) if isinstance(text, str) else []

    original_tokens = _tokens(row['Original'])
    decoded_tokens = _tokens(row['Decoded'])
    # zip stops at the shorter sequence, so only aligned positions are compared.
    correct_count = sum(
        1 for tok, dec_tok in zip(original_tokens, decoded_tokens) if tok == dec_tok
    )
    return correct_count, len(original_tokens)

# Later cells read 'original_token_length' and 'correct_token_length', so these
# columns have to exist after a fresh "Restart & Run All".
for key, df in df_dict.items():
    # Skip frames that already carry the columns (cell re-run, or columns
    # shipped with the CSV) so the tokenization is not repeated.
    if {'correct_token_length', 'original_token_length'}.issubset(df.columns):
        continue

    # Apply function to each row
    token_counts = [calculate_matching_length(row) for _, row in df.iterrows()]
    df['correct_token_length'] = [correct for correct, _ in token_counts]
    df['original_token_length'] = [total for _, total in token_counts]

    # Calculate additional stats if needed
    df['match_ratio'] = df['correct_token_length'] / df['original_token_length']

    # Optional export, left disabled so running the notebook writes no extra files:
    # output_path = key + "_comparison_results.csv"
    # df.to_csv(output_path, index=False)
    # print(df.head())

In [None]:
import pandas as pd
from nltk import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.tree import Tree

# Function to extract person names using NER
def extract_person_names(sentence):
    """Return the set of PERSON entity names found in ``sentence``.

    The sentence is tokenized, POS-tagged and chunked; every chunk labelled
    'PERSON' contributes its tokens joined by single spaces.

    Args:
        sentence: Text to analyse. Non-string or whitespace-only values are
            accepted and produce an empty result.

    Returns:
        set[str]: Distinct person names (empty when nothing is found).
    """
    # Guard clause: nothing to analyse for missing or blank text.
    if not (isinstance(sentence, str) and sentence.strip() != ""):
        return set()

    tagged_tokens = pos_tag(word_tokenize(sentence))
    # Named-entity chunks are Tree nodes; plain tokens stay as (word, tag) tuples.
    return {
        " ".join(leaf[0] for leaf in subtree)
        for subtree in ne_chunk(tagged_tokens)
        if isinstance(subtree, Tree) and subtree.label() == 'PERSON'
    }


# for key, data in df_dict.items():
# # Extract person names from both columns
#     output_file_path = key + "_person_instances.csv"
#     data['Original_Names'] = data['Original'].apply(extract_person_names)
#     data['Decoded_Names'] = data['Decoded'].apply(extract_person_names)
#     data[['Original', 'Decoded', 'Original_Names', 'Decoded_Names']].to_csv(output_file_path, index=False)
    
#     # Calculate precision and recall
#     true_positive = 0
#     total_original_names = 0
#     total_decoded_names = 0
    
#     for _, row in data.iterrows():
#         original_names = row['Original_Names']
#         decoded_names = row['Decoded_Names']
        
#         true_positive += len(original_names & decoded_names)
#         total_original_names += len(original_names)
#         total_decoded_names += len(decoded_names)
    
#     precision = true_positive / total_decoded_names if total_decoded_names > 0 else 0
#     recall = true_positive / total_original_names if total_original_names > 0 else 0
    
#     # Output the results
#     print(f"Precision: {precision:.2f}")
#     print(f"Recall: {recall:.2f}")

In [6]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Paraphrases and near-paraphrases used to sanity-check the embedding model
sentences = [
    "Is Bob coming to the meeting today?",
    "Bob’s coming to the meeting today?",
    "The meeting attendance scheduled for today includes Bob, correct?",
    "Is Bob going to the concert today?",
    "Bob's coming to the concert today?",
    "Bob will be playing today, right?"
]

# Pre-trained Sentence-BERT model (other Sentence-BERT checkpoints work as well)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed every sentence, then compare each one against all the others
embeddings = model.encode(sentences)
similarity_matrix = cosine_similarity(embeddings)

# Raw matrix first
print("Cosine Similarity Matrix:", similarity_matrix, sep="\n")

# Same matrix rounded to two decimals for easier reading
formatted_matrix = similarity_matrix.round(2)
print("\nFormatted Cosine Similarity Matrix:", formatted_matrix, sep="\n")



Cosine Similarity Matrix:
[[1.         0.9315515  0.78193676 0.80356836 0.73167723 0.67972594]
 [0.9315515  1.         0.7944467  0.72048306 0.7646935  0.62692446]
 [0.78193676 0.7944467  1.0000001  0.6352874  0.60976624 0.5672598 ]
 [0.80356836 0.72048306 0.6352874  1.0000001  0.9181043  0.7095541 ]
 [0.73167723 0.7646935  0.60976624 0.9181043  1.0000002  0.6342138 ]
 [0.67972594 0.62692446 0.5672598  0.7095541  0.6342138  0.9999999 ]]

Formatted Cosine Similarity Matrix:
[[1.   0.93 0.78 0.8  0.73 0.68]
 [0.93 1.   0.79 0.72 0.76 0.63]
 [0.78 0.79 1.   0.64 0.61 0.57]
 [0.8  0.72 0.64 1.   0.92 0.71]
 [0.73 0.76 0.61 0.92 1.   0.63]
 [0.68 0.63 0.57 0.71 0.63 1.  ]]


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# for key, df in df_dict.items():
#     print(f"KEY: {key}")
#     plt.figure(figsize=(10, 6))
#     sns.regplot(x='original_token_length', y='correct_token_length', data=df, scatter_kws={'alpha':0.6}, line_kws={'color':'red'})
#     plt.title('Trend of Original vs Correct Token Lengths', fontsize=14)
#     plt.xlabel('Original Token Length', fontsize=12)
#     plt.ylabel('Correct Token Length', fontsize=12)
#     plt.grid(True)
#     plt.show()

In [None]:
# One scatter plot per dataset: correctly decoded tokens against original tokens,
# with the y = x diagonal marking a perfect reconstruction.
for key, df in df_dict.items():
    print(f"KEY: {key}")
    fig, ax = plt.subplots(figsize=(10, 6))

    original_lengths = df['original_token_length']
    diagonal_end = max(original_lengths)

    ax.scatter(original_lengths, df['correct_token_length'], alpha=0.6, color='b', label="Correct Tokens")
    ax.plot([0, diagonal_end], [0, diagonal_end], color='r', linestyle='--', label="Perfect Match")

    ax.set_title('Comparison of Original and Correctly Decoded Token Lengths', fontsize=14)
    ax.set_xlabel('Original Token Length', fontsize=12)
    ax.set_ylabel('Correct Token Length', fontsize=12)
    ax.legend()
    ax.grid(True)
    plt.show()


In [None]:
# Overlaid histograms of original vs. correctly decoded token counts, per dataset
histogram_series = (
    ('original_token_length', 'Original Token Length', 'blue'),
    ('correct_token_length', 'Correct Token Length', 'orange'),
)

for key, df in df_dict.items():
    print(f"KEY: {key}")
    fig, ax = plt.subplots(figsize=(10, 6))
    for column, legend_label, bar_color in histogram_series:
        ax.hist(df[column], bins=20, alpha=0.6, label=legend_label, color=bar_color)

    # Titles, axis labels and legend
    ax.set_title('Distribution of Token Lengths', fontsize=14)
    ax.set_xlabel('Token Length', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.legend()
    ax.grid(True)
    plt.show()


In [None]:
# Per-sentence ratio of correctly decoded tokens to original tokens, drawn as a
# single-row heatmap for each dataset
for key, df in df_dict.items():
    print(f"KEY: {key}")
    df['ratio'] = df['correct_token_length'].div(df['original_token_length'])

    # seaborn expects 2-D input: add a leading axis so the ratios form one row
    heatmap_data = df['ratio'].values[np.newaxis, :]

    # Wide and short figure keeps the strip compact
    fig, ax = plt.subplots(figsize=(15, 2))
    sns.heatmap(
        heatmap_data,
        cmap="coolwarm",
        cbar=True,
        annot=False,
        xticklabels=False,
        yticklabels=['Ratio'],
        ax=ax,
    )

    # Titles and axis labels
    ax.set_title('Heatmap of Correct to Original Token Length Ratios', fontsize=14)
    ax.set_xlabel('Sentence Index', fontsize=12)
    ax.set_ylabel('Correctness Ratio', fontsize=12)
    plt.show()

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# file_path = "results/results-gradients/val-text-preds.csv"
# df = pd.read_csv(file_path)

# Load a pre-trained model from SentenceTransformers
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can choose other models as needed
tokenizer = model.tokenizer

# For every dataset: embed each (Original, Decoded) pair, record the model-token
# length of both texts and their cosine similarity, then save the enriched frame.
for key, df in df_dict.items():
    print(f"KEY: {key}")
    # Empty CSV cells are read back as NaN; coerce to plain strings so neither
    # the encoder nor the tokenizer receives a float.
    original_texts = df["Original"].fillna("").astype(str).tolist()
    decoded_texts = df["Decoded"].fillna("").astype(str).tolist()

    # Compute embeddings
    embeddings1 = model.encode(original_texts, convert_to_tensor=True)
    embeddings2 = model.encode(decoded_texts, convert_to_tensor=True)

    # Compute token lengths for each sentence (model tokenizer, i.e. subword tokens)
    df["tokens_original"] = [len(tokenizer.tokenize(text)) for text in original_texts]
    df["tokens_decoded"] = [len(tokenizer.tokenize(text)) for text in decoded_texts]

    # Row-wise cosine similarity: only pair i-vs-i is needed, so avoid building
    # the full N x N matrix just to read its diagonal.
    pair_similarities = F.cosine_similarity(embeddings1, embeddings2, dim=1)

    # Add similarity scores to the DataFrame
    df["similarity"] = pair_similarities.cpu().numpy()

    # Sort the DataFrame by similarity in descending order
    df_sorted = df.sort_values(by="similarity", ascending=False)

    # Display results
    print("Sentence pairs sorted by similarity (most to least):")
    for _, row in df_sorted.iterrows():
        print(f"Pair: ({row['Original']}, {row['Decoded']})\nSimilarity: {row['similarity']:.4f}\n")

    df.to_csv("cos_sim_" + key + "_eval.csv")

In [None]:
# import matplotlib.pyplot as plt

# # Calculate length difference and add to the DataFrame
# for key, df in df_dict.items():
#     print(f"KEY: {key}")
#     df_sorted = df.sort_values(by="similarity", ascending=False)
#     df_sorted["length"] = df_sorted["tokens_decoded"]

#     # Plot similarity vs. length difference
#     plt.figure(figsize=(10, 6))
#     plt.scatter(
#         df_sorted["length"], 
#         df_sorted["similarity"], 
#         c=df_sorted["similarity"], 
#         cmap="viridis", 
#         edgecolor="k", 
#         s=100
#     )

#     # Add labels and title
#     plt.title("Similarity vs. Decoded Token Length", fontsize=14)
#     plt.xlabel("Token Length", fontsize=12)
#     plt.ylabel("Similarity Score", fontsize=12)

#     # Add color bar for similarity
#     cbar = plt.colorbar()
#     cbar.set_label("Similarity Score", fontsize=12)

#     # Show grid and plot
#     plt.grid(True, linestyle="--", alpha=0.6)
#     plt.show()


In [None]:
import matplotlib.pyplot as plt

# Average cosine similarity per decoded-token-length bin, one bar chart per dataset.
# NOTE(review): the data is binned on 'tokens_decoded'; the title and x-label
# previously said "Original". Labels now describe what is plotted — switch the
# column to 'tokens_original' instead if that was the intended analysis.
for key, df in df_dict.items():
    print(f"KEY: {key}")
    df['length_bin'] = pd.cut(df['tokens_decoded'], bins=20)
    # observed=False keeps empty bins in the result (the long-standing default),
    # stated explicitly so the output does not change with the pandas version.
    binned_data = df.groupby('length_bin', observed=False)['similarity'].mean().reset_index()

    # Plot
    plt.figure(figsize=(10, 6))
    sns.barplot(
        x=binned_data['length_bin'].astype(str), 
        y=binned_data['similarity'], 
        palette='Blues'
    )

    plt.title('Binned Cosine Similarity vs Decoded Token Length', fontsize=14)
    plt.xlabel('Decoded Token Length (Binned)', fontsize=12)
    plt.ylabel('Average Cosine Similarity', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Determine the global range of 'tokens_original'.
# The edges must come from the same column that is binned in Step 3; otherwise
# rows outside the range silently get a NaN bin.
all_lengths = pd.concat([df['tokens_original'] for df in df_dict.values()])
global_min, global_max = all_lengths.min(), all_lengths.max()

# Step 2: Create consistent bins shared by every dataset
bins = np.linspace(global_min, global_max, 20)  # 20 edges -> 19 equal-width bins
bin_labels = [f"{int(bins[i])}-{int(bins[i+1])}" for i in range(len(bins)-1)]

# Step 3: Apply binning to each DataFrame and compute aggregates
binned_data_cosine = []
binned_data_decoded = []

for dataset_name, df in df_dict.items():
    # Binning
    df['length_bin'] = pd.cut(df['tokens_original'], bins=bins, labels=bin_labels, include_lowest=True)

    # observed=False keeps empty bins so every dataset shares the same x-axis.
    grouped = df.groupby('length_bin', observed=False)

    # Aggregating cosine similarity
    binned_cosine = grouped['similarity'].mean().reset_index()
    binned_cosine['dataset'] = dataset_name
    binned_data_cosine.append(binned_cosine)

    # Aggregating decoded token length
    binned_decoded = grouped['tokens_decoded'].mean().reset_index()
    binned_decoded['dataset'] = dataset_name
    binned_data_decoded.append(binned_decoded)

# Combine the results
combined_cosine_data = pd.concat(binned_data_cosine, ignore_index=True)
combined_decoded_data = pd.concat(binned_data_decoded, ignore_index=True)

# Step 4: Plot the trends

# Cosine Similarity Trends
plt.figure(figsize=(12, 6))
sns.lineplot(
    data=combined_cosine_data,
    x='length_bin', y='similarity', hue='dataset',
    marker='o'
)
plt.title('Cosine Similarity Trends Across Token Length Bins', fontsize=14)
plt.xlabel('Original Token Length (Binned)', fontsize=12)
plt.ylabel('Average Cosine Similarity', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.legend(title='Dataset')
plt.tight_layout()
plt.show()

# Decoded Token Length Trends
plt.figure(figsize=(12, 6))
sns.lineplot(
    data=combined_decoded_data,
    x='length_bin', y='tokens_decoded', hue='dataset',
    marker='o'
)
plt.title('Decoded Token Length Trends Across Original Token Length Bins', fontsize=14)
plt.xlabel('Original Token Length (Binned)', fontsize=12)
plt.ylabel('Average Decoded Token Length', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.legend(title='Dataset')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.translate.bleu_score import sentence_bleu

# Imported here so the BLEU computation does not depend on an earlier cell's import.
from nltk.tokenize import word_tokenize


def _bleu1(row):
    """Return the unigram BLEU score of a row's 'Decoded' text against its 'Original' text."""
    def _tokens(value):
        # Missing cells (NaN/None) count as empty text rather than the literal token 'nan'.
        return [] if pd.isna(value) else word_tokenize(str(value))

    # sentence_bleu expects token lists: a list of reference token lists plus one
    # hypothesis token list. Passing each whole string as a single "token" would
    # reduce the score to an exact-match 0/1.
    return sentence_bleu(
        [_tokens(row['Original'])],  # Reference sequences
        _tokens(row['Decoded']),     # Candidate sequence
        weights=(1.0,)               # BLEU-1
    )


# Step 1: Determine the global range of 'tokens_original'
all_lengths = pd.concat([df['tokens_original'] for df in df_dict.values()])
global_min, global_max = all_lengths.min(), all_lengths.max()

# Step 2: Create consistent bins shared by every dataset
bins = np.linspace(global_min, global_max, 20)  # 20 edges -> 19 equal-width bins
bin_labels = [f"{int(bins[i])}-{int(bins[i+1])}" for i in range(len(bins)-1)]

# Step 3: Add BLEU scores and apply binning
binned_data_bleu = []

for dataset_name, df in df_dict.items():
    # Adding BLEU scores
    df['bleu_score'] = df.apply(_bleu1, axis=1)

    # Binning
    df['length_bin'] = pd.cut(df['tokens_original'], bins=bins, labels=bin_labels, include_lowest=True)

    # Aggregating BLEU scores (observed=False keeps empty bins on the shared x-axis)
    binned_bleu = df.groupby('length_bin', observed=False)['bleu_score'].mean().reset_index()
    binned_bleu['dataset'] = dataset_name
    binned_data_bleu.append(binned_bleu)

# Combine the results
combined_bleu_data = pd.concat(binned_data_bleu, ignore_index=True)
# Step 4: Plot the trends

# BLEU Score Trends
plt.figure(figsize=(12, 6))
sns.lineplot(
    data=combined_bleu_data,
    x='length_bin', y='bleu_score', hue='dataset',
    marker='o'
)
plt.title('BLEU Score Trends Across Token Length Bins', fontsize=14)
plt.xlabel('Original Token Length (Binned)', fontsize=12)
plt.ylabel('Average BLEU Score', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.legend(title='Dataset')
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One single-row heatmap per dataset: mean cosine similarity for every distinct
# original token length.
for key, df in df_dict.items():
    print(f"KEY: {key}")
    df_sorted = df.sort_values(by="similarity", ascending=False).assign(
        length=lambda frame: frame["tokens_original"]
    )

    # Mean similarity for each exact token length; missing means become 0
    heatmap_data = (
        df_sorted.groupby("length")["similarity"]
        .mean()
        .reset_index()
        .fillna({"similarity": 0})
    )

    # Lengths become columns so they run along the horizontal axis
    heatmap_pivot = heatmap_data.pivot_table(columns="length", values="similarity")

    # Draw the heatmap
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.heatmap(
        heatmap_pivot,
        cmap="viridis",
        cbar_kws={"label": "Average Similarity"},
        linewidths=0.5,
        fmt=".2f",
        ax=ax,
    )

    # Titles and axis labels
    ax.set_title("Heatmap of Similarity vs. Token Length", fontsize=14)
    ax.set_xlabel("Token Length", fontsize=12)
    ax.set_ylabel("Average Similarity", fontsize=12)

    # Tilt the x tick labels so they stay legible
    plt.xticks(rotation=45)
    plt.show()

In [None]:
import matplotlib.pyplot as plt

# Scatter of decoded vs. original model-token length per dataset, with the
# y = x line marking decodings whose length matches the original.
for key, df in df_dict.items():
    print(f"KEY: {key}")
    # Token lengths were computed earlier with the embedding model's tokenizer
    original_lengths = df["tokens_original"]
    decoded_lengths = df["tokens_decoded"]

    # Largest length on either axis. Adding the two Series would sum them
    # element-wise and stretch the reference line (and the axes) up to the
    # largest *sum* instead of the largest length.
    axis_limit = max(original_lengths.max(), decoded_lengths.max())

    # Plotting
    plt.figure(figsize=(10, 6))
    plt.scatter(original_lengths, decoded_lengths, alpha=0.6, edgecolor='k', label="Token Lengths")
    plt.plot([0, axis_limit], [0, axis_limit],
             color="red", linestyle="--", label="y=x (ideal)")

    plt.title("Original Token Length vs. Decoded Token Length")
    plt.xlabel("Original Token Length")
    plt.ylabel("Decoded Token Length")
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.show()
