In [None]:
import pandas as pd
import torch
import numpy as np
import matplotlib.pyplot as plt

import pickle

def load_pickle(filename,path):
    """Load and return the object stored in ``<path>/<filename>.pkl``.

    Args:
        filename: Base name of the pickle file, without the ``.pkl`` suffix.
        path: Directory that contains the file. A trailing path separator is
            optional, and both ``str`` and ``os.PathLike`` values are accepted.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.

    Raises:
        FileNotFoundError: If the resolved file does not exist.
    """
    import os  # local import so this helper does not rely on other cells

    # os.path.join inserts the separator only when it is missing, so
    # "dir" and "dir/" resolve to the same file.
    full_path = os.path.join(os.fspath(path), filename + ".pkl")

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(full_path, "rb") as file:
        return pickle.load(file)
        

# Directories holding the saved logit-lens artefacts of the two prompt sets.
# NOTE(review): absolute, machine-specific paths — consider a configurable base directory.
original_dir = "/home/knowledgeconflict/home/martin/MasterThesis/data/logit_lens/original/"
transformed_dir = "/home/knowledgeconflict/home/martin/MasterThesis/data/logit_lens/fake/"

# All tensors are mapped onto the CPU while loading.
cpu_device = torch.device('cpu')

# ---- original set ----
original_final_answers = load_pickle(path=original_dir, filename="original_answers")
original_labeled_true_scores = torch.load(
    original_dir + "labeled_true_scores" + ".pt", map_location=cpu_device
)
original_labeled_false_scores = torch.load(
    original_dir + "labeled_false_scores" + ".pt", map_location=cpu_device
)
original_topk_tokens = load_pickle(path=original_dir, filename="topk_tokens")
original_topk_scores = torch.load(
    original_dir + "topk_scores" + ".pt", map_location=cpu_device
)

# ---- transformed ("fake") set ----
# NOTE(review): the answers file is named "original_answers" in the fake
# directory as well — confirm that this is the intended file.
transformed_final_answers = load_pickle(path=transformed_dir, filename="original_answers")
transformed_labeled_true_scores = torch.load(
    transformed_dir + "labeled_true_scores" + ".pt", map_location=cpu_device
)
transformed_labeled_false_scores = torch.load(
    transformed_dir + "labeled_false_scores" + ".pt", map_location=cpu_device
)
transformed_topk_tokens = load_pickle(path=transformed_dir, filename="topk_tokens")
transformed_topk_scores = torch.load(
    transformed_dir + "topk_scores" + ".pt", map_location=cpu_device
)

In [None]:
# Show the raw answers and record which positions count as hallucinated:
# an answer of "FALSE" in the original set, an answer of "TRUE" in the transformed set.
print(original_final_answers)
original_hallucinated_indices = [
    position
    for position, answer in enumerate(original_final_answers)
    if answer == "FALSE"
]
print(transformed_final_answers)
transformed_hallucinated_indices = [
    position
    for position, answer in enumerate(transformed_final_answers)
    if answer == "TRUE"
]


### Accuracy

In [None]:
# Share of samples, across both sets, that were not recorded as hallucinated.
length_inputs = sum(
    len(answers) for answers in (original_final_answers, transformed_final_answers)
)
failures = sum(
    len(flagged)
    for flagged in (original_hallucinated_indices, transformed_hallucinated_indices)
)
print(f"Accuracy {(length_inputs-failures)/length_inputs*100:.2f}%")


In [None]:
#shapes 

In [None]:
# Print the layer-wise token development.
# Only the first sample of each set is shown (the `[:1]` slices); the original
# sample is printed first, then the transformed one.
print("For one sample only")
# Top-k tokens of the first sample from each set.
for o_topk,t_topk in zip (original_topk_tokens[:1],transformed_topk_tokens[:1]):
        print(o_topk)
        print(t_topk)
        
# Matching top-k scores; `[:,:,:1]` keeps only the first entry along the third
# axis (presumably the top-1 score per position — TODO confirm the axis layout).
for o_topk,t_topk in zip (original_topk_scores[:1],transformed_topk_scores[:1]):
        print(o_topk[:,:,:1].tolist())
        print(t_topk[:,:,:1].tolist())

In [None]:
# Development of the scores for the labeled TRUE and FALSE answers.
# Only the first sample of each set is shown (the `[:1]` slices); the original
# sample is printed first, then the transformed one.
# NOTE(review): the loop variables are called o_topk / t_topk but hold labeled
# scores here, not top-k values.
print("For one sample only")
print("TRUE")
for o_topk,t_topk in zip (original_labeled_true_scores[:1],transformed_labeled_true_scores[:1]):
        print(o_topk.tolist())
        print(t_topk.tolist())
print("FALSE")
for o_topk,t_topk in zip (original_labeled_false_scores[:1],transformed_labeled_false_scores[:1]):
        print(o_topk.tolist())
        print(t_topk.tolist())

In [None]:
# Element-wise margin between the TRUE-label and FALSE-label scores for each set.
original_dif = original_labeled_true_scores - original_labeled_false_scores
transformed_dif = transformed_labeled_true_scores - transformed_labeled_false_scores

# Show the margins of the first sample: original first, then transformed.
print(original_dif[0].tolist())
print(transformed_dif[0].tolist())

In [None]:
# Average the true-minus-false differences over the first axis (dim 0) of each set.
layer_avg_original_dif = original_dif.mean(dim=0)
layer_avg_transformed_dif = transformed_dif.mean(dim=0)

In [None]:
def plot_tensor(tensor, title="Tensor Plot"):
    """Plot the values of a tensor (or array-like) as a line chart.

    Args:
        tensor: A PyTorch tensor, NumPy array or sequence of numbers. PyTorch
            tensors are detached and copied to the CPU before plotting, so
            tensors on an accelerator or with ``requires_grad=True`` work too.
        title: Title shown above the plot.
    """
    if hasattr(tensor, "detach"):
        # PyTorch: drop the autograd graph and move to host memory first,
        # because Tensor.numpy() only works for CPU tensors.
        data = tensor.detach().cpu().numpy()
    else:
        data = np.asarray(tensor)

    # Plot the data
    plt.plot(data)
    plt.title(title)
    plt.xlabel("Index")
    plt.ylabel("Value")
    plt.grid(True)
    plt.show()

In [None]:
import matplotlib.pyplot as plt

def four_subplot(original_true_tensor, original_false_tensor, transformed_true_tensor, transformed_false_tensor):
    """Draw four 1-D series on a 2x2 grid that shares one y-range.

    Args:
        original_true_tensor: Series for the top-left panel.
        original_false_tensor: Series for the top-right panel.
        transformed_true_tensor: Series for the bottom-left panel.
        transformed_false_tensor: Series for the bottom-right panel.

    Each argument may be a PyTorch tensor (detached and moved to the CPU
    before plotting), a NumPy array or a sequence of numbers.

    The y-axis of every panel runs from ``min(0, smallest value)`` to the
    largest value over all four series, so the panels are directly comparable
    and negative values are not cut off.
    """
    # (grid position, data, title, colour) for each panel, in drawing order.
    panels = (
        ((0, 0), original_true_tensor, 'Original True Tensor', 'blue'),
        ((0, 1), original_false_tensor, 'Original False Tensor', 'orange'),
        ((1, 0), transformed_true_tensor, 'Transformed True Tensor', 'green'),
        ((1, 1), transformed_false_tensor, 'Transformed False Tensor', 'red'),
    )

    # Convert once to NumPy: works for tensors that require grad or live on
    # an accelerator, and lets NumPy compute the extrema in one pass.
    series = [
        values.detach().cpu().numpy() if hasattr(values, "detach") else np.asarray(values)
        for _, values, _, _ in panels
    ]

    upper = max(float(np.max(data)) for data in series)
    # Keep 0 as the baseline unless some value lies below it.
    lower = min(0.0, min(float(np.min(data)) for data in series))

    # Create a 2x2 subplot grid
    fig, axs = plt.subplots(2, 2, figsize=(5, 5))

    for (position, _, panel_title, colour), data in zip(panels, series):
        ax = axs[position]
        ax.plot(data, color=colour)
        ax.set_title(panel_title)
        ax.set_ylim(lower, upper)  # identical limits on every panel
        ax.grid(True)

    # Adjust layout
    plt.tight_layout()
    plt.show()


In [None]:
import matplotlib.pyplot as plt

def plot_combined_tensors(original_tensor, transformed_tensor, original_label="Original Difference", transformed_label="Transformed Difference", title="Comparison of Original and Transformed Differences"):
    """Plot two 1-D series in one figure for direct comparison.

    Args:
        original_tensor: First series (drawn in blue).
        transformed_tensor: Second series (drawn in orange).
        original_label: Legend entry for the first series.
        transformed_label: Legend entry for the second series.
        title: Figure title.

    Both series may be PyTorch tensors (detached and moved to the CPU before
    plotting), NumPy arrays or sequences of numbers. The y-axis is limited to
    the smallest and largest value over both series.
    """
    # Convert once to NumPy: works for tensors that require grad or live on
    # an accelerator, and avoids iterating over tensor elements in Python.
    original_data, transformed_data = (
        values.detach().cpu().numpy() if hasattr(values, "detach") else np.asarray(values)
        for values in (original_tensor, transformed_tensor)
    )

    upper = max(float(np.max(original_data)), float(np.max(transformed_data)))
    lower = min(float(np.min(original_data)), float(np.min(transformed_data)))

    plt.figure(figsize=(10, 6))

    # Plot the original series, then the transformed one
    plt.plot(original_data, label=original_label, color='blue')
    plt.plot(transformed_data, label=transformed_label, color='orange')

    # Add titles and labels
    plt.title(title)
    plt.xlabel("Index")
    plt.ylabel("Value")
    if lower < upper:
        # Identical limits would be a degenerate range; in that case let
        # matplotlib pick the limits itself.
        plt.ylim(lower, upper)

    # Add a legend
    plt.legend()

    # Show grid for better readability
    plt.grid(True)

    # Show the plot
    plt.tight_layout()
    plt.show()


In [None]:
# Compare the averaged true-minus-false differences of both sets, then list
# the plotted values as (original, transformed) pairs.
plot_combined_tensors(
    layer_avg_original_dif,
    layer_avg_transformed_dif,
)
print([
    (original_value, transformed_value)
    for original_value, transformed_value in zip(layer_avg_original_dif, layer_avg_transformed_dif)
])

In [None]:
# NOTE(review): this cell repeats the preceding cell (same plot, same print);
# one of the two can likely be dropped.
plot_combined_tensors(layer_avg_original_dif,layer_avg_transformed_dif)
print(list(zip(layer_avg_original_dif,layer_avg_transformed_dif)))

In [None]:
# Indices of all samples that are not in the hallucinated lists.
original_other_indices = list(set(range(len(original_dif))) - set(original_hallucinated_indices))
transformed_other_indices = list(set(range(len(transformed_dif))) - set(transformed_hallucinated_indices))

# Mean true-minus-false difference over the samples of each group (dim 0).
original_hallucinated_mean = original_dif[original_hallucinated_indices, :].mean(dim=0)
transformed_hallucinated_mean = transformed_dif[transformed_hallucinated_indices, :].mean(dim=0)
original_non_hallucinated_mean = original_dif[original_other_indices, :].mean(dim=0)
transformed_non_hallucinated_mean = transformed_dif[transformed_other_indices, :].mean(dim=0)

# One figure with four curves: dashed = hallucinated, solid = non-hallucinated.
plt.figure(figsize=(12, 8))

x = range(original_hallucinated_mean.shape[0])
curves = (
    (original_hallucinated_mean, 'Original Hallucinated Mean', 'blue', '--'),
    (transformed_hallucinated_mean, 'Transformed Hallucinated Mean', 'orange', '--'),
    (original_non_hallucinated_mean, 'Original Non-Hallucinated Mean', 'green', '-'),
    (transformed_non_hallucinated_mean, 'Transformed Non-Hallucinated Mean', 'red', '-'),
)
for mean_curve, curve_label, curve_color, curve_style in curves:
    plt.plot(
        x,
        mean_curve.numpy(),
        label=curve_label,
        color=curve_color,
        linestyle=curve_style,
        linewidth=2,
    )

# Axis labels, title and legend
plt.xlabel('Layer')
plt.ylabel('Difference of True and False Logits')
plt.title('Comparison of Logit Development for Hallucinated and Non-Hallucinated')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Mean TRUE / FALSE label scores over all samples (dim 0) of each set.
four_subplot(
    torch.mean(original_labeled_true_scores, dim=0),
    torch.mean(original_labeled_false_scores, dim=0),
    torch.mean(transformed_labeled_true_scores, dim=0),
    torch.mean(transformed_labeled_false_scores, dim=0),
)

### Hallucinated Values

In [None]:
# Same four panels, restricted to the samples listed as hallucinated in each set.
four_subplot(
    torch.mean(original_labeled_true_scores[original_hallucinated_indices, :], dim=0),
    torch.mean(original_labeled_false_scores[original_hallucinated_indices, :], dim=0),
    torch.mean(transformed_labeled_true_scores[transformed_hallucinated_indices, :], dim=0),
    torch.mean(transformed_labeled_false_scores[transformed_hallucinated_indices, :], dim=0),
)

### Train on Early Exit

In [None]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier


In [None]:
def calculate_metrics(preds, labels):
    """Score predictions against reference labels.

    Args:
        preds: Predicted labels (passed to each scorer as ``y_pred``).
        labels: Reference labels (passed to each scorer as ``y_true``).

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    # Scorers are evaluated in the order of the returned tuple.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_true=labels, y_pred=preds) for scorer in scorers)

In [None]:
# Prepare the classifier data: the true-minus-false differences are the
# features, the label is 1 for hallucinated samples and 0 for all others.
original_other_indices = list(set(range(len(original_dif))) - set(original_hallucinated_indices))
transformed_other_indices = list(set(range(len(transformed_dif))) - set(transformed_hallucinated_indices))

original_hallucinated = original_dif[original_hallucinated_indices, :]
transformed_hallucinated = transformed_dif[transformed_hallucinated_indices, :]
original_non_hallucinated = original_dif[original_other_indices, :]
transformed_non_hallucinated = transformed_dif[transformed_other_indices, :]

# Hallucinated rows come first so the label vector is a run of ones
# followed by a run of zeros.
inputs = torch.cat([
    original_hallucinated,
    transformed_hallucinated,
    original_non_hallucinated,
    transformed_non_hallucinated,
])
num_of_hallucinated = len(original_hallucinated) + len(transformed_hallucinated)
num_of_non_hallucinated = len(original_non_hallucinated) + len(transformed_non_hallucinated)
positive_labels = torch.Tensor([1] * num_of_hallucinated)
negative_labels = torch.Tensor([0] * num_of_non_hallucinated)
labels = torch.cat([positive_labels, negative_labels]).unsqueeze(1)

# Convert to NumPy arrays before the split
inputs = inputs.cpu().numpy()
labels = labels.cpu().numpy()

# Hold out 20% of the samples; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    inputs,
    labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Display the shapes of the training features and training labels.
X_train.shape, y_train.shape

In [None]:
# Train a logistic-regression classifier on the true-minus-false differences.
log_reg = LogisticRegression(max_iter=1000)
# The feature axis is the last one; shape[-2] would be the number of samples.
num_layers = X_train.shape[-1]

# The labels are stored as column vectors of shape (n, 1); scikit-learn expects
# a 1-D target, so flatten them to avoid its DataConversionWarning.
y_train_flat = y_train.ravel()
y_test_flat = y_test.ravel()

log_reg.fit(X_train, y_train_flat)

acc, prec, rec, f1 = calculate_metrics(log_reg.predict(X_train), y_train_flat)
print({"train_acc": acc, "train_precision": prec, "train_recall": rec, "train_f1": f1})

acc, prec, rec, f1 = calculate_metrics(log_reg.predict(X_test), y_test_flat)
# "val_f1" matches the naming of the other validation metrics.
print({"val_acc": acc, "val_precision": prec, "val_recall": rec, "val_f1": f1})


In [None]:
# Train a gradient-boosting classifier on the same features.
# A fixed random_state makes the fitted model reproducible across runs,
# matching the seeded train/test split.
grad_boost = GradientBoostingClassifier(n_estimators=150, random_state=42)
# The feature axis is the last one; shape[-2] would be the number of samples.
num_layers = X_train.shape[-1]

# The labels are stored as column vectors of shape (n, 1); scikit-learn expects
# a 1-D target, so flatten them to avoid its DataConversionWarning.
y_train_flat = y_train.ravel()
y_test_flat = y_test.ravel()

grad_boost.fit(X_train, y_train_flat)

acc, prec, rec, f1 = calculate_metrics(grad_boost.predict(X_train), y_train_flat)
print({"train_acc": acc, "train_precision": prec, "train_recall": rec, "train_f1": f1})

acc, prec, rec, f1 = calculate_metrics(grad_boost.predict(X_test), y_test_flat)
# "val_f1" matches the naming of the other validation metrics.
print({"val_acc": acc, "val_precision": prec, "val_recall": rec, "val_f1": f1})
