In [1]:
from datasets import load_dataset, concatenate_datasets

from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, DataCollatorForLanguageModeling
import random
import numpy as np
from utilities import evaluate_gpt2_classification as evaluate_gpt2_classification, mask_range_gpt,compute_masks, reset_gpt, compute_mask_probe, mask_gpt2
import torch  
from tqdm import tqdm

dataset_name = "fancyzhx/ag_news"

text_tag = "text"

# Load dataset and tokenizer


tables = []
layer = 6
# for i in tqdm(range(1, 21)):
per = 0.3
print("Percentage: ", per)
num_classes = 4

# tao = 2.5

lab = "label"
# tao = torch.inf

dataset = load_dataset(dataset_name)

print(dataset)

# print(dataset['train'].features)






#######################Filter dataset####################
from datasets import DatasetDict, Dataset, Features, ClassLabel, Value
import pandas as pd

def sample_balanced_dataset(dataset_dict, max_train_per_class=800, max_test_per_class=200):
    """
    Sample a balanced subset while preserving the original feature structure including ClassLabel.
    """
    # Store original features
    original_features = dataset_dict['train'].features
    
    # Convert to pandas for sampling
    train_df = dataset_dict['train'].to_pandas()
    test_df = dataset_dict['test'].to_pandas()
    
    # Group by label
    train_groups = train_df.groupby('label')
    test_groups = test_df.groupby('label')
    
    sampled_train_dfs = []
    sampled_test_dfs = []
    
    print("\nClass distribution:")
    print("\nLabel | Label Name | Train Samples | Test Samples | Final Train | Final Test")
    print("-" * 85)
    
    label_names = original_features['label'].names
    for idx, label_name in enumerate(label_names):
        train_group = train_groups.get_group(idx)
        test_group = test_groups.get_group(idx) if idx in test_groups.groups else pd.DataFrame()
        
        # Sample with replacement if needed
        train_replace = len(train_group) < max_train_per_class
        test_replace = len(test_group) < max_test_per_class
        
        sampled_train = train_group.sample(
            n=min(len(train_group), max_train_per_class),
            replace=train_replace,
            random_state=42
        )
        
        if not test_group.empty:
            sampled_test = test_group.sample(
                n=min(len(test_group), max_test_per_class),
                replace=test_replace,
                random_state=42
            )
        else:
            sampled_test = pd.DataFrame(columns=test_df.columns)
        
        sampled_train_dfs.append(sampled_train)
        sampled_test_dfs.append(sampled_test)
        
        print(f"{idx:5d} | {label_name:10s} | {len(train_group):12d} | "
              f"{len(test_group):11d} | {len(sampled_train):10d} | {len(sampled_test):9d}")
    
    # Concatenate all sampled dataframes
    final_train_df = pd.concat(sampled_train_dfs, ignore_index=True)
    final_test_df = pd.concat(sampled_test_dfs, ignore_index=True)
    
    # Convert back to datasets while preserving the original features
    final_train_dataset = Dataset.from_pandas(final_train_df, features=original_features)
    final_test_dataset = Dataset.from_pandas(final_test_df, features=original_features)
    
    # Create new DatasetDict
    sampled_dataset = DatasetDict({
        'train': final_train_dataset,
        'test': final_test_dataset
    })
    
    print("\nFinal dataset sizes:")
    print(f"Train: {len(final_train_dataset)} samples")
    print(f"Test: {len(final_test_dataset)} samples")
    
    # Verify feature structure is preserved
    print("\nVerifying feature structure:")
    print(sampled_dataset['train'].features)
    
    return sampled_dataset

# dataset = sample_balanced_dataset(dataset, max_train_per_class=800, max_test_per_class=200)

###########################################



# Set random seed
seed_value = 42  # or any other integer

random.seed(seed_value)
np.random.seed(seed_value)

if torch.cuda.is_available():  # PyTorch-specific
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

import torch

torch.autograd.set_detect_anomaly(True)
# Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


special_tokens_dict = {}
new_tokens = []
label2text = dataset['train'].features[lab].names

for label in label2text:
    # Create special token format (with and without space)
    special_token = f'{label}'
    
    # Check if the label is already a single token in the tokenizer
    label_tokens = tokenizer.encode(label, add_special_tokens=False)
    is_single_token = len(label_tokens) == 1
    
    if is_single_token:
        print(f"'{label}' is already a single token (ID: {label_tokens[0]})")
    
    # Add both versions to new tokens list
    new_tokens.extend([special_token])

# Add the tokens to the tokenizer
num_added_tokens = tokenizer.add_tokens(new_tokens)
print(f"\nAdded {num_added_tokens} new tokens to the tokenizer")

special_tokens = {
    'pad_token': '<|pad|>',
    'sep_token': '<|sep|>',
    'eos_token': '<|eos|>'
}
tokenizer.add_special_tokens(special_tokens)

def format_data(examples):
    formatted_texts = []
    for text, label in zip(examples[text_tag], examples[lab]):
        # Convert label to string
        
        tok_text = tokenizer.encode(text, max_length=400, truncation=True)
        text = tokenizer.decode(tok_text)
        label_str = dataset['train'].features[lab].int2str(label)
        formatted_text = f"Classify emotion: {text}{tokenizer.sep_token}"#{label_str}{tokenizer.eos_token}"
        formatted_texts.append(formatted_text)
    return {'formatted_text': formatted_texts}

def tokenize_and_prepare(examples):

    # Tokenize with batch processing
    tokenized = tokenizer(
        examples['formatted_text'],
        padding='max_length',
        max_length=408,
        truncation=True,
        return_tensors="pt"
    )
    
    # Clone input_ids to create labels
    labels = tokenized['input_ids'].clone()
    
    # Find the position of sep_token
    sep_token_id = tokenizer.convert_tokens_to_ids(tokenizer.sep_token)
    sep_positions = (labels == sep_token_id).nonzero(as_tuple=True)
    
    # Mask all tokens with -100 except for the token right after sep_token
    labels[:] = -100  # Mask all initially
    for batch_idx, sep_pos in zip(*sep_positions):
        if sep_pos + 1 < labels.size(1):
            labels[batch_idx, sep_pos + 1] = tokenized['input_ids'][batch_idx, sep_pos + 1]
    
    # Set padding tokens to -100
    labels[labels == tokenizer.pad_token_id] = -100
    
    return {
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask'],
        'labels': labels
    }
    
dataset = dataset.filter(lambda x: x[lab] != -1)
# Process the dataset
formatted_dataset = dataset.map(format_data, batched=True)
tokenized_dataset = formatted_dataset.map(
    tokenize_and_prepare, 
    batched=True,
)

from transformers import GPT2LMHeadModel as gt
from models.gpt2 import GPT2LMHeadModel
# Load pre-trained GPT-2 model
model1 = gt.from_pretrained('gpt2')

model1.resize_token_embeddings(len(tokenizer))

model1.config.m_layer = layer
import os

base_path = os.path.join("model_weights", dataset_name)
if not os.path.exists(base_path):
    os.makedirs(base_path)

weights_path = os.path.join(base_path, "weights.pth")

model = GPT2LMHeadModel(model1.config)


model.load_state_dict(torch.load(weights_path))






Percentage:  0.3
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})
'World' is already a single token (ID: 10603)
'Sports' is already a single token (ID: 18153)
'Business' is already a single token (ID: 24749)

Added 1 new tokens to the tokenizer


  model.load_state_dict(torch.load(weights_path))


<All keys matched successfully>

In [2]:
from prettytable import PrettyTable
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="torch.tensor")

batch_size = 128
# mask_layer = 5
compliment = True
results_table = PrettyTable()
if(compliment):
    results_table.field_names = results_table.field_names = ["Class", "Base Accuracy", "Base Confidence", "Base Complement Acc", "Base Compliment Conf", "STD Accuracy", "STD Confidence", "STD compliment ACC", "STD compliment Conf", "MAX Accuracy", "MAX Confidence", "Max compliment acc", "Max compliment conf", "Total Masked", "Intersection"]#, "Same as Max"]#"MAX Accuracy", "MAX Confidence", "Max compliment acc", "Max compliment conf"

class_labels = []
base_accuracies = []
base_confidences = []
base_comp_acc = []
base_comp_conf = []
std_masked_counts = []
std_accuracies = []
std_confidences = []
std_comp_acc = []
std_comp_conf = []
max_masked_counts = []
max_accuracies = []
max_confidences = []
max_comp_acc = []
max_comp_conf = []
diff_from_max = []
total_masked = []

#merge test and train set and then shuffle and make splits

# First merge and shuffle
# tokenized_dataset = concatenate_datasets([tokenized_dataset['train'], tokenized_dataset['test']]).shuffle(seed=42)#.select(range(100))

# Get the total length
# dataset_length = len(tokenized_dataset)


# Calculate split index
# split_index = int(dataset_length * 0.2)  # 80% for training

# Create the splits using dataset slicing
tokenized_dataset1 = tokenized_dataset['test']#.shuffle().select(range(200))
recording_dataset = tokenized_dataset['train']#.shuffle().select(range(200))

    



all_fc_vals = []
base_accuracies = []
base_confidences = []
base_comp_acc = []
base_comp_conf = []
print("Recording activations...")
for j in range(0,num_classes):
    dataset_recording = recording_dataset.filter(lambda x: x[lab] in [j])
    dataset = tokenized_dataset1.filter(lambda x: x[lab] in [j])
    dataset_complement = tokenized_dataset1.filter(lambda x: x[lab] not in [j])
    fc_vals = evaluate_gpt2_classification(lab, model, dataset_recording, tokenizer, batch_size)
    fc_vals = fc_vals[2]
    all_fc_vals.append(np.array(fc_vals))
    
    
    
    acc = evaluate_gpt2_classification(lab, model, dataset, tokenizer, batch_size)
    
    base_accuracies.append(acc[0])
    base_confidences.append(acc[1])
    
    print("Class ",j, "base accuracy: ", acc[0], acc[1])
    
    acc = evaluate_gpt2_classification(lab, model, dataset_complement, tokenizer)
    
    base_comp_acc.append(acc[0])
    base_comp_conf.append(acc[1])
    
    print("Class ",j, "complement base accuracy: ", acc[0], acc[1])
    
    
results_table = PrettyTable()
if(compliment):
    results_table.field_names = results_table.field_names = ["Class", "Base Accuracy", "Base Confidence", "Base Complement Acc", "Base Compliment Conf", "STD Accuracy", "STD Confidence", "STD compliment ACC", "STD compliment Conf", "MAX Accuracy", "MAX Confidence", "Max compliment acc", "Max compliment conf", "Total Masked", "Intersection"]#, "Same as Max"]#"MAX Accuracy", "MAX Confidence", "Max compliment acc", "Max compliment conf"

class_labels = []
# base_accuracies = []
# base_confidences = []
# base_comp_acc = []
# base_comp_conf = []
std_masked_counts = []
std_accuracies = []
std_confidences = []
std_comp_acc = []
std_comp_conf = []
max_masked_counts = []
max_accuracies = []
max_confidences = []
max_comp_acc = []
max_comp_conf = []
diff_from_max = []
total_masked = []
    



Recording activations...


Evaluating:   0%|          | 0/235 [00:00<?, ?it/s]

  input_ids = torch.tensor(item['input_ids']).to(device)
  attention_mask = torch.tensor(item['attention_mask']).to(device)


Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

Class  0 base accuracy:  0.9547 0.9509


Evaluating:   0%|          | 0/45 [00:00<?, ?it/s]

Class  0 complement base accuracy:  0.9412 0.9276


Evaluating:   0%|          | 0/235 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

Class  1 base accuracy:  0.9863 0.9811


Evaluating:   0%|          | 0/45 [00:00<?, ?it/s]

Class  1 complement base accuracy:  0.9307 0.9175


Evaluating:   0%|          | 0/235 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

Class  2 base accuracy:  0.8974 0.8858


Evaluating:   0%|          | 0/45 [00:00<?, ?it/s]

Class  2 complement base accuracy:  0.9604 0.9493


Evaluating:   0%|          | 0/235 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

Class  3 base accuracy:  0.94 0.9159


Evaluating:   0%|          | 0/45 [00:00<?, ?it/s]

Class  3 complement base accuracy:  0.9461 0.9392


In [3]:
import utilities
import importlib
importlib.reload(utilities)
from utilities import compute_masks

compliment = True
results_table = PrettyTable()
if(compliment):
    results_table.field_names = results_table.field_names = ["Class", "Base Accuracy", "Base Confidence", "Base Complement Acc", "Base Compliment Conf", "STD Accuracy", "STD Confidence", "STD compliment ACC", "STD compliment Conf", "MAX Accuracy", "MAX Confidence", "Max compliment acc", "Max compliment conf"]#, "Same as Max"]#"MAX Accuracy", "MAX Confidence", "Max compliment acc", "Max compliment conf"

per = 0.5
for j in range(0,num_classes):
    fc_vals = all_fc_vals[j]
    model = reset_gpt(model)
    model = mask_gpt2(model, torch.ones(768).to('cuda'))
    dataset = tokenized_dataset1.filter(lambda x: x[lab] in [j])
    dataset_recording = recording_dataset.filter(lambda x: x[lab] in [j])
    dataset_complement = tokenized_dataset1.filter(lambda x: x[lab] not in [j])
    

    class_labels.append(f"Class {j}")
    # acc = evaluate_gpt2_classification(lab, model, dataset, tokenizer)
    print("Class ",j, "base accuracy: ", base_accuracies[j], base_confidences[j])
    if(compliment):
        print("Class ",j, "complement base accuracy: ", base_comp_acc[j], base_comp_conf[j])

        
    mask_max, mask_std, mask_intersection, mask_max_low_std, mask_max_high_std, mask_std_high_max, mask_max_random_off, random_mask = compute_masks(fc_vals,per)
    mask_max2, mask_std, mask_intersection, mask_max_low_std, mask_max_high_std, mask_std_high_max, mask_max_random_off, random_mask = compute_masks(fc_vals,1)
    
    all_fc_vals_pass = all_fc_vals.copy()
    # all_fc_vals_pass.pop(j)
    
    
    
    
    tao = 3.5
    model = mask_range_gpt(model, mask_max2, fc_vals, tao, all_fc_vals_pass)        
    t = int(mask_std.shape[0]-torch.count_nonzero(mask_max))
    print("Total Masked :", t)
    # total_masked.append(t)
    
    
    acc = evaluate_gpt2_classification(lab, model, dataset, tokenizer) 
    print("accuracy after masking STD: ", acc[0], acc[1])
    std_accuracies.append(acc[0])
    std_confidences.append(acc[1])
    if(compliment):
        acc = evaluate_gpt2_classification(lab, model, dataset_complement, tokenizer)
        print("accuracy after masking STD on complement: ", acc[0], acc[1])
        std_comp_acc.append(acc[0])
        std_comp_conf.append(acc[1])
    model = reset_gpt(model)

    print("Masking MAX...")
    tao = torch.inf
    
    # model = mask_distillbert(model,mask_max) 
    # model = mask_range_gpt(model, mask_max, fc_vals, tao, all_fc_vals_pass)
    
    # model = mask_gpt2(model, mask_max)
    model = mask_range_gpt(model, mask_max, fc_vals, tao, all_fc_vals_pass)    
    t = int(mask_max.shape[0]-torch.count_nonzero(mask_max))
    print("Total Masked :", t)
    acc = evaluate_gpt2_classification(lab, model, dataset, tokenizer)
    print("accuracy after masking MAX: ", acc[0], acc[1])
    max_accuracies.append(acc[0])
    max_confidences.append(acc[1])
    acc = evaluate_gpt2_classification(lab, model, dataset_complement, tokenizer)
    print("accuracy after masking MAX on complement: ", acc[0], acc[1])
    max_comp_acc.append(acc[0])
    max_comp_conf.append(acc[1])
    if(compliment):
        results_table.add_row([
            class_labels[j],
            base_accuracies[j],
            base_confidences[j],
            base_comp_acc[j],
            base_comp_conf[j],
            std_accuracies[j],
            std_confidences[j],
            std_comp_acc[j],
            std_comp_conf[j],
            max_accuracies[j],
            max_confidences[j],
            max_comp_acc[j],
            max_comp_conf[j],
            # total_masked[j],
            # diff_from_max[j]
        ])            
# print("Layer ", mask_layer)
print(results_table)
#     tables.append(results_table)
#     # print("Layer ", mask_layer)
#     print("Average Base Accuracy: ",round(sum(base_accuracies)/len(base_accuracies), 4))
#     print("Average Base Confidence: ", round(sum(base_confidences)/len(base_confidences), 4))
#     print("Average MAX Accuracy: ", round(sum(max_accuracies)/len(max_accuracies), 4))
#     print("Average MAX Confidence: ", round(sum(max_confidences)/len(max_confidences), 4))
#     print("Average MAX Complement Accuracy: ", round(sum(max_comp_acc)/len(max_comp_acc), 4))
#     print("Average MAX Complement Confidence: ", round(sum(max_comp_conf)/len(max_comp_conf), 4))

# per = 0.1
# for table in tables:
    
#     print(f"\nAnalysis for Percentage: {per:.2f}")
#     print(table)
#     print("\n")
#     per += 0.1

Class  0 base accuracy:  0.9547 0.9509
Class  0 complement base accuracy:  0.9412 0.9276
Total Masked : 384


Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

accuracy after masking STD:  0.1532 0.0969


Evaluating:   0%|          | 0/45 [00:00<?, ?it/s]

accuracy after masking STD on complement:  0.3591 0.3181
Masking MAX...
Total Masked : 384


Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

accuracy after masking MAX:  0.0005 0.0006


Evaluating:   0%|          | 0/45 [00:00<?, ?it/s]

accuracy after masking MAX on complement:  0.2879 0.272
Class  1 base accuracy:  0.9863 0.9811
Class  1 complement base accuracy:  0.9307 0.9175
Total Masked : 384


Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

accuracy after masking STD:  0.0442 0.0342


Evaluating:   0%|          | 0/45 [00:00<?, ?it/s]

accuracy after masking STD on complement:  0.3802 0.3377
Masking MAX...
Total Masked : 384


Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

accuracy after masking MAX:  0.0016 0.0006


Evaluating:   0%|          | 0/45 [00:00<?, ?it/s]

accuracy after masking MAX on complement:  0.2186 0.1749
Class  2 base accuracy:  0.8974 0.8858
Class  2 complement base accuracy:  0.9604 0.9493
Total Masked : 384


Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

accuracy after masking STD:  0.0747 0.0555


Evaluating:   0%|          | 0/45 [00:00<?, ?it/s]

accuracy after masking STD on complement:  0.3539 0.3224
Masking MAX...
Total Masked : 384


Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

accuracy after masking MAX:  0.0063 0.0029


Evaluating:   0%|          | 0/45 [00:00<?, ?it/s]

accuracy after masking MAX on complement:  0.2496 0.2468
Class  3 base accuracy:  0.94 0.9159
Class  3 complement base accuracy:  0.9461 0.9392
Total Masked : 384


Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

accuracy after masking STD:  0.6679 0.6526


Evaluating:   0%|          | 0/45 [00:00<?, ?it/s]

accuracy after masking STD on complement:  0.1242 0.0838
Masking MAX...
Total Masked : 384


Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

accuracy after masking MAX:  0.7589 0.7403


Evaluating:   0%|          | 0/45 [00:00<?, ?it/s]

accuracy after masking MAX on complement:  0.0125 0.0063
+---------+---------------+-----------------+---------------------+----------------------+--------------+----------------+--------------------+---------------------+--------------+----------------+--------------------+---------------------+
|  Class  | Base Accuracy | Base Confidence | Base Complement Acc | Base Compliment Conf | STD Accuracy | STD Confidence | STD compliment ACC | STD compliment Conf | MAX Accuracy | MAX Confidence | Max compliment acc | Max compliment conf |
+---------+---------------+-----------------+---------------------+----------------------+--------------+----------------+--------------------+---------------------+--------------+----------------+--------------------+---------------------+
| Class 0 |     0.9547    |      0.9509     |        0.9412       |        0.9276        |    0.1532    |     0.0969     |       0.3591       |        0.3181       |    0.0005    |     0.0006     |       0.2879       |  

In [None]:
import torch

# Clear PyTorch GPU cache
torch.cuda.empty_cache()


In [2]:
import pandas as pd
import numpy as np
import re

def parse_table(file_path):
    # Read the content from the file
    with open(file_path, 'r') as file:
        content = file.read()

    # Extract the table rows
    lines = content.strip().split('\n')

    # Find header and data rows
    header_row = None
    data_rows = []
    
    for line in lines:
        if '+--' in line:
            continue
        if header_row is None and '|' in line:
            header_row = line
        elif header_row is not None and '|' in line:
            data_rows.append(line)

    # Parse header
    header_parts = [part.strip() for part in re.split(r'\s*\|\s*', header_row) if part.strip()]

    # Parse data rows
    parsed_rows = []
    for row in data_rows:
        parts = [part.strip() for part in re.split(r'\s*\|\s*', row) if part.strip()]
        if len(parts) == len(header_parts):
            parsed_rows.append(parts)

    # Create DataFrame
    df = pd.DataFrame(parsed_rows, columns=header_parts)

    # Convert numeric columns to float
    numeric_columns = df.columns[1:]  # All columns except 'Class'
    for col in numeric_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    return df

def trimmed_mean(data, proportion=0.1):
    """Calculate the trimmed mean by removing the specified proportion from both ends."""
    data = [x for x in data if pd.notna(x)]  # Remove NaN values
    if not data:
        return np.nan
    
    n = len(data)
    k = int(round(proportion * n))
    return np.mean(sorted(data)[k:n-k])

def main():
    # Parse the table
    df = parse_table('paste.txt')
    
    # Get metrics (all columns except 'Class')
    metrics = [col for col in df.columns if col != 'Class']
    
    # Calculate 10% trimmed mean for each metric across all classes
    print("10% Trimmed Mean across all classes for each metric:")
    for metric in metrics:
        mean_value = trimmed_mean(df[metric].values)
        print(f"{metric}: {mean_value:.4f}")
    
    # Print the sorted classes by Base Accuracy for reference
    print("\nClasses sorted by Base Accuracy:")
    sorted_df = df.sort_values(by='Base Accuracy', ascending=False)
    print(sorted_df[['Class', 'Base Accuracy']].head(10).to_string(index=False))
    
    # Calculate mean comparison between Base and MAX metrics
    print("\nComparison of Base vs MAX metrics (10% trimmed means):")
    print(f"Base Accuracy vs MAX Accuracy: {trimmed_mean(df['Base Accuracy'].values):.4f} vs {trimmed_mean(df['MAX Accuracy'].values):.4f}")
    print(f"Base Confidence vs MAX Confidence: {trimmed_mean(df['Base Confidence'].values):.4f} vs {trimmed_mean(df['MAX Confidence'].values):.4f}")
    
    # Calculate the difference between Base and MAX for each class
    df['Accuracy_Diff'] = df['Base Accuracy'] - df['MAX Accuracy']
    
    # Print classes with biggest difference in accuracy
    print("\nClasses with biggest difference between Base and MAX Accuracy:")
    diff_df = df.sort_values(by='Accuracy_Diff', ascending=False)
    print(diff_df[['Class', 'Base Accuracy', 'MAX Accuracy', 'Accuracy_Diff']].head(5).to_string(index=False))

if __name__ == "__main__":
    main()

10% Trimmed Mean across all classes for each metric:
Base Accuracy: 0.9872
Base Confidence: 0.9842
Base Complement Acc: 0.9926
Base Compliment Conf: 0.9811
MAX Accuracy: 0.9381
MAX Confidence: 0.1577
Max compliment acc: 0.9929
Max compliment conf: 0.9919

Classes sorted by Base Accuracy:
Class  Base Accuracy
  WRB            1.0
   ``            1.0
    .            1.0
    #            1.0
   ''            1.0
-LRB-            1.0
   EX            1.0
  WP$            1.0
  PDT            1.0
 PRP$            1.0

Comparison of Base vs MAX metrics (10% trimmed means):
Base Accuracy vs MAX Accuracy: 0.9872 vs 0.9381
Base Confidence vs MAX Confidence: 0.9842 vs 0.1577

Classes with biggest difference between Base and MAX Accuracy:
Class  Base Accuracy  MAX Accuracy  Accuracy_Diff
 NNPS         0.9262        0.2746         0.6516
  RBR         0.9191        0.5662         0.3529
   UH         0.6667        0.3333         0.3334
  RBS         0.8857        0.6000         0.2857
  JJS     

In [10]:
import numpy as np
import pandas as pd
from tabulate import tabulate

def calculate_trimmed_means(data_text, trim_percent=0.1):
    # Parse the data from the text
    tau_values = []
    metrics_data = {}
    
    lines = data_text.strip().split('\n')
    current_tau = None
    
    for line in lines:
        if line.startswith('For Tau:'):
            current_tau = float(line.split(':')[1].strip())
            tau_values.append(current_tau)
            metrics_data[current_tau] = []
        elif line.startswith('|  Class  | Base Accuracy') and current_tau is not None:
            # This is the header line, extract column names
            headers = [h.strip() for h in line.split('|')[1:-1]]
        elif line.startswith('|  Class') and current_tau is not None:
            # This is a data line
            values = line.split('|')[1:-1]
            class_name = values[0].strip()
            metrics = [float(v.strip()) for v in values[1:]]
            metrics_data[current_tau].append([class_name] + metrics)
    
    # Convert to DataFrame for easier manipulation
    all_data = []
    
    for tau in tau_values:
        for row in metrics_data[tau]:
            all_data.append([tau] + row)
    
    df = pd.DataFrame(all_data, columns=['Tau', 'Class'] + headers[1:])
    
    # Calculate trimmed means for each tau value and class
    tau_classes = df.groupby(['Tau', 'Class'])
    
    # List of metric columns
    metric_columns = df.columns[2:]
    
    # Create a new dataframe for trimmed means
    trimmed_means = []
    
    for tau in tau_values:
        tau_data = df[df['Tau'] == tau]
        
        for class_name in tau_data['Class'].unique():
            class_data = tau_data[tau_data['Class'] == class_name]
            
            row = {'Tau': tau, 'Class': class_name}
            
            for metric in metric_columns:
                # Calculate trimmed mean (removing 10% from each end)
                values = class_data[metric].values
                trimmed_mean = np.mean(np.sort(values)[int(len(values)*trim_percent):int(len(values)*(1-trim_percent))])
                row[metric] = trimmed_mean
            
            trimmed_means.append(row)
    
    trimmed_df = pd.DataFrame(trimmed_means)
    
    # Format the results by tau value
    results = {}
    
    for tau in tau_values:
        tau_results = trimmed_df[trimmed_df['Tau'] == tau].sort_values('Class')
        tau_results = tau_results.drop('Tau', axis=1)
        
        # Convert to formatted table
        table = tabulate(tau_results, headers='keys', tablefmt='grid', floatfmt='.4f')
        results[tau] = table
    
    return results

# Example usage
with open('tao_abiliation.txt', 'r') as f:
    data_text = f.read()

trimmed_means = calculate_trimmed_means(data_text)

# Print results for each tau value
for tau, table in trimmed_means.items():
    print(f"\nFor Tau: {tau}")
    print(table)

KeyError: 'Tau'

In [11]:
import pandas as pd
import numpy as np
import re

def parse_data(text):
    # Find all tau sections
    tau_sections = re.split(r'Tao:\s+', text)[1:]  # Skip the first empty element
    
    # Initialize dictionary to store results
    results = {}
    
    for section in tau_sections:
        # Extract tau value
        tau_match = re.match(r'(\d+\.?\d*)', section)
        if not tau_match:
            continue
        
        tau = float(tau_match.group(1))
        
        # Extract class data
        class_data = []
        for class_idx in range(4):
            pattern = r'Class ' + str(class_idx) + r'\s+\|\s+([\d\.]+)\s+\|\s+([\d\.]+)\s+\|\s+([\d\.]+)\s+\|\s+([\d\.]+)\s+\|\s+([\d\.]+)\s+\|\s+([\d\.]+)\s+\|\s+([\d\.]+)\s+\|\s+([\d\.]+)\s+'
            match = re.search(pattern, section)
            
            if match:
                row_data = [float(match.group(i)) for i in range(1, 9)]
                class_data.append(row_data)
        
        # Convert to numpy array
        if class_data:
            results[tau] = np.array(class_data)
    
    return results

def create_averages_table(parsed_data):
    # Column names
    columns = [
        'Base Accuracy', 'Base Confidence', 'Base Complement Acc', 'Base Compliment Conf',
        'STD Accuracy', 'STD Confidence', 'STD compliment ACC', 'STD compliment Conf'
    ]
    
    # Calculate averages for each tau value
    results = []
    for tau, data in sorted(parsed_data.items()):
        row = [tau]
        row.extend([np.mean(data[:, i]) for i in range(data.shape[1])])
        results.append(row)
    
    # Create DataFrame
    df = pd.DataFrame(results, columns=['Tau'] + columns)
    return df.round(4)

# Read data from file
with open('tao_abiliation.txt', 'r') as file:
    data = file.read()

# Parse the data
parsed_data = parse_data(data)

# Create and display the averages table
result_table = create_averages_table(parsed_data)
print(result_table)

# Save to CSV
result_table.to_csv('tau_averages.csv', index=False)

   Tau  Base Accuracy  Base Confidence  Base Complement Acc  \
0  1.0          0.945           0.9337                0.945   
1  1.3          0.945           0.9337                0.945   
2  1.6          0.945           0.9337                0.945   
3  1.9          0.945           0.9337                0.945   
4  2.2          0.945           0.9337                0.945   
5  2.5          0.945           0.9337                0.945   
6  2.8          0.945           0.9337                0.945   
7  3.1          0.945           0.9337                0.945   
8  3.4          0.945           0.9337                0.945   
9  3.7          0.945           0.9337                0.945   

   Base Compliment Conf  STD Accuracy  STD Confidence  STD compliment ACC  \
0                0.9337        0.7632          0.7281              0.9375   
1                0.9337        0.7124          0.6767              0.9338   
2                0.9337        0.6684          0.6350              0.9337  