In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2Model
from transformers import GPT2Model, GPT2Config
from datasets import load_dataset
import numpy as np
np.random.seed(42)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
#model = GPT2Model.from_pretrained('gpt2')

config = GPT2Config()
config.output_hidden_states=True

model = GPT2Model(config)

In [3]:
dataset = load_dataset("sst")
train_dataset = dataset["train"]
test_dataset = dataset["test"]
validation_dataset = dataset["validation"]

In [4]:
trXt = [tokenizer(string, return_tensors="pt") for string in train_dataset['sentence'][0:400]]

In [5]:
trX_hs = [model(**string).hidden_states[1:] for string in trXt]

In [6]:
layers = np.zeros((12, 400, 768))

for i, sentence_hs in enumerate(trX_hs):
    for layer, hs in enumerate(sentence_hs): 
        layers[layer][i] = torch.mean(hs.squeeze(0), dim=0).detach().numpy()  

In [7]:
trY = train_dataset['label'][0:400]

In [8]:
trY_reshaped = trY

# Calculate correlations for each layer
correlation_results = []
for layer_data in layers:
    # Calculate correlation for each feature in the layer
    correlations = np.corrcoef(layer_data.T, trY_reshaped)
    # Take only the correlations of interest (with the scores), remove the last row
    correlations = correlations[:-1, -1]
    correlation_results.append(correlations)
correlation_matrix = np.array(correlation_results)

In [9]:
np.max(correlation_matrix)

0.17951969857437397

In [10]:
flat_indices = np.argpartition(correlation_matrix.flatten(), -10)[-10:]

# Get top 10 values and their indices
top_10_values = correlation_matrix.flatten()[flat_indices]
top_10_indices = np.unravel_index(flat_indices, correlation_matrix.shape)

# Display top 10 values and their positions
for value, index in zip(top_10_values, zip(*top_10_indices)):
    print(f"Value: {value}, Index: {index}")

Value: 0.15632846599207814, Index: (9, 738)
Value: 0.15639723094903513, Index: (7, 578)
Value: 0.16728559001046517, Index: (2, 666)
Value: 0.16177474247587048, Index: (5, 578)
Value: 0.166897703680477, Index: (10, 578)
Value: 0.1635813251680548, Index: (3, 763)
Value: 0.16212361101586942, Index: (7, 596)
Value: 0.1684591691974711, Index: (5, 738)
Value: 0.17951969857437397, Index: (0, 467)
Value: 0.16874805516684346, Index: (9, 578)


## Computer correlation with only last hidden values

In [11]:
trXls = [model(**string).last_hidden_state for string in trXt[0:400]]

In [12]:
last_hidden_mean_states = [torch.mean(tensor.squeeze(0), dim=0) for tensor in trXls]

In [13]:
last_hidden_mean_states = torch.stack(last_hidden_mean_states, 0)

In [15]:
transposed_tensor = last_hidden_mean_states.t()

In [16]:
def corrcoef(x):
    numpy_array = x.detach().numpy()
    python_list = numpy_array.tolist()

    correlation_matrix = np.corrcoef(python_list, trY)

    return correlation_matrix[0, 1]

coeff = list(map(corrcoef, transposed_tensor))

In [17]:
import pandas as pd
numbers_series = pd.Series(coeff)

# Getting the indices that would sort the numbers in descending order
sorted_indices = (-numbers_series).argsort()

# Creating a DataFrame with columns 'Number' (sorted) and 'Original_Index'
df = pd.DataFrame({'Number': numbers_series[sorted_indices].values, 'Original_Index': sorted_indices})

df

Unnamed: 0,Number,Original_Index
0,0.191403,212
1,0.136741,634
2,0.134816,578
3,0.131846,346
4,0.115368,126
...,...,...
763,-0.123792,482
764,-0.135119,564
765,-0.136690,711
766,-0.141952,302
