In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report

import numpy as np
from pos_tagger import PosTagger
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression


In [None]:
# Hugging Face Hub identifier of the checkpoint to load (a repo id, not a URL).
model_url = 'roneneldan/TinyStories-3M'

# Load the causal-LM weights and the tokenizer from the same checkpoint id.
model = AutoModelForCausalLM.from_pretrained(model_url)
tokenizer = AutoTokenizer.from_pretrained(model_url)

In [None]:
# Read the whole validation file and split it into individual stories.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open('data/tinystories_val.txt', 'r', encoding='utf-8') as file:
    stories = file.read()
# Stories are separated by the end-of-text marker followed by a newline; the
# split runs after the file is closed because it no longer needs the handle.
stories = stories.split("<|endoftext|>\n")

In [None]:
class miav():
    """Collects value-projection outputs from a model through forward hooks.

    The hook is attached to the submodule named
    ``transformer.h.{layer}.attn.attention.v_proj``, so the model must expose
    a submodule under that path.
    """

    def __init__(self, model, tokenizer, dataset):
        """Store the model, tokenizer and dataset on the instance.

        The methods below take ``model`` and ``tokenizer`` as explicit
        arguments and do not read these attributes.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.dataset = dataset

    def attach_value_hook(self, model, layer):
        """Register a forward hook on the ``v_proj`` submodule of ``layer``.

        Args:
            model: Module exposing ``get_submodule``.
            layer: Index interpolated into the submodule path.

        Returns:
            ``(handle, values)``: the removable hook handle and the list the
            hook appends every captured output to. The caller is responsible
            for calling ``handle.remove()``.
        """
        values = []

        def extract_value_hook(module, input, output):
            values.append(output)

        value_hookpoint = f'transformer.h.{layer}.attn.attention.v_proj'
        value_hook = model.get_submodule(value_hookpoint).register_forward_hook(extract_value_hook)
        return value_hook, values

    def get_values(self, model, tokenizer, layer, head_index,  inputs:list[str]):
        """Run ``inputs`` through ``model`` and return the captured ``v_proj`` outputs.

        Args:
            model: Model to run; called once per input string.
            tokenizer: Object whose ``encode(text, return_tensors='pt')``
                produces the model input.
            layer: Layer index whose value projection is hooked.
            head_index: Only validated here (must be in 0..15); the returned
                outputs are not sliced per head.
            inputs: Strings to encode and feed to the model one at a time.

        Returns:
            List with one captured output per forward pass, in input order.

        Raises:
            ValueError: If ``head_index`` is outside 0..15.
        """
        # Reject negative indices as well, matching the error message.
        if not 0 <= head_index <= 15:
            raise ValueError('Head index must be between 0 and 15')
        value_hook, values = self.attach_value_hook(model, layer)
        try:
            # Activations are only read, never back-propagated through, so
            # skip building autograd graphs.
            with torch.no_grad():
                for input_ in inputs:
                    tokenized = tokenizer.encode(input_, return_tensors='pt')
                    # Call the module rather than .forward() so PyTorch's
                    # normal call machinery (including hooks) runs.
                    model(tokenized)
        finally:
            # Always detach the hook, even if a forward pass fails; otherwise
            # it would stay on the shared model and keep accumulating outputs.
            value_hook.remove()
        return values

    
def get_value_attention_token_level(attention_values, sentence_index, token_index, head_index, head_dim=8):
    """Return one head's slice of the value vector for a single token.

    Mainly documents how the captured data is nested:
    ``attention_values[sentence][0][token]`` is the full value vector of one
    token, and each head owns a contiguous chunk of ``head_dim`` features in it
    (assumes the GPT-Neo layout where the hidden dimension is split into
    consecutive per-head chunks).

    Args:
        attention_values: Sequence with one captured output per sentence.
        sentence_index: Which sentence to read.
        token_index: Which token of that sentence to read.
        head_index: Which head's chunk to return.
        head_dim: Number of features per head (8 by default).

    Returns:
        ``token[head_index * head_dim : (head_index + 1) * head_dim]``.
    """
    sentence = attention_values[sentence_index]
    batch = sentence[0]
    token = batch[token_index]
    # Head h starts at h * head_dim; slicing from h itself would return
    # overlapping windows that straddle neighbouring heads.
    start = head_index * head_dim
    return token[start:start + head_dim]
    

def get_value_attention_sentence_level(attention_matrix, head_index, head_dim=8):
    """Extract one head's value slice for every token of every sentence.

    Each head owns a contiguous chunk of ``head_dim`` features in a token's
    value vector (assumes the GPT-Neo layout where the hidden dimension is
    split into consecutive per-head chunks).

    Args:
        attention_matrix: Iterable of per-sentence outputs; each is iterated
            as batches, and each batch as token vectors.
        head_index: Which head's chunk to extract.
        head_dim: Number of features per head (8 by default).

    Returns:
        A list with one entry per sentence; each entry is a flat list of the
        per-token slices, with all batches of that sentence concatenated.
    """
    # Head h covers [h * head_dim, (h + 1) * head_dim); starting at h itself
    # would produce overlapping windows that mix neighbouring heads.
    start = head_index * head_dim
    stop = start + head_dim
    return [
        [token[start:stop] for batch in sentence for token in batch]
        for sentence in attention_matrix
    ]
    


# Shared extractor instance; create_data() calls idk.get_values(...) on it,
# so this cell must run before create_data() is used.
idk = miav(model, tokenizer, stories)

In [None]:
def create_data(model, tokenizer, layer, head_index, sentences):
    """Build a train/test split of per-token value slices labelled with tags.

    For every sentence the value-projection output of ``layer`` is captured
    through the module-level ``idk`` extractor, reduced to the slice selected
    by ``head_index``, and paired token-by-token with the tags returned by
    ``PosTagger.tag_input``.

    Args:
        model: Model passed through to ``idk.get_values``.
        tokenizer: Tokenizer used by both the tagger and the extractor.
        layer: Layer index to hook.
        head_index: Head whose slice is extracted.
        sentences: Iterable of input strings.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` from ``train_test_split`` with
        ``test_size=0.33`` and ``random_state=42``; labels are integer-encoded
        with a ``LabelEncoder``.
    """
    pos_tagger = PosTagger(tokenizer)
    features, labels = [], []

    for text in sentences:
        captured = idk.get_values(model, tokenizer, layer, head_index, [text])
        _, tags = pos_tagger.tag_input(text)
        # Only one sentence was fed in, so its per-token slices are entry 0.
        per_token = get_value_attention_sentence_level(captured, head_index)[0]
        # zip() stops at the shorter sequence, so surplus slices or tags are
        # silently ignored.
        pairs = list(zip(per_token, tags))
        features.extend(vector.detach().numpy() for vector, _ in pairs)
        labels.extend(tag for _, tag in pairs)

    # Map the tag labels to integer class ids.
    encoded_labels = LabelEncoder().fit_transform(labels)
    splits = train_test_split(
        features, encoded_labels, test_size=0.33, random_state=42)
    return tuple(splits)
    

def train_model(X_train, X_test, y_train, y_test):
    """Fit a logistic-regression probe and evaluate it on the test split.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        ``(score, report)``: the value of ``clf.score`` on the test split and
        the text produced by ``classification_report`` (``zero_division=0``).
    """
    classifier = LogisticRegression(random_state=0)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    accuracy = classifier.score(X_test, y_test)
    return accuracy, classification_report(y_test, predictions, zero_division=0)

In [None]:
# layer_list =[]
# layers = range(8)
# heads = range(16)
# for layer in layers:
#     head_list = []
#     for head in heads:
#         X_train, X_test, y_train, y_test = create_data(model, tokenizer, layer, head, stories[0:10])
#         score = train_model(X_train, X_test, y_train, y_test)
#         head_list.append(score)
#     layer_list.append(head_list)

In [None]:
from attention_extraction import extract_all_attention

In [None]:
# Run the project-local extractor on the first 10 stories. It returns three
# objects, unpacked here as keys, q and v (presumably key / query / value
# activations — TODO confirm against attention_extraction).
keys, q, v = extract_all_attention(model, tokenizer, stories[0:10])

In [None]:
# Display the shape of `v` as a quick sanity check of the extraction.
v.shape

In [None]:
def generate_scores(model, tokenizer, stories, layers=range(8), heads=range(16)):
    """Yield probe scores and reports for every head, one layer at a time.

    For each (layer, head) pair the dataset is rebuilt with ``create_data`` and
    a classifier is trained with ``train_model``. This is expensive: the model
    is re-run on ``stories`` once per pair.

    Args:
        model: Model passed through to ``create_data``.
        tokenizer: Tokenizer passed through to ``create_data``.
        stories: Input strings to build the dataset from.
        layers: Layer indices to probe (defaults to the 8 previously
            hard-coded).
        heads: Head indices to probe (defaults to the 16 previously
            hard-coded).

    Yields:
        ``(head_list, report_list)`` per layer: the scores and the
        classification reports, each with one entry per head in ``heads``.
    """
    for layer in layers:
        head_list = []
        report_list = []
        for head in heads:
            X_train, X_test, y_train, y_test = create_data(model, tokenizer, layer, head, stories)
            # Train and record the probe; without this step the generator
            # would yield empty lists after doing all the extraction work.
            score, report = train_model(X_train, X_test, y_train, y_test)
            head_list.append(score)
            report_list.append(report)
        yield head_list, report_list


# Collect the per-layer results from the generator. Intended layout: one row
# per layer and one column per head, i.e. layer_list[layer][head] and
# report_list[layer][head].
layer_list, report_list = [], []
for layer_scores, reports in generate_scores(model, tokenizer, stories[:10]):
    # Each printed line is the row for one layer.
    print(layer_scores)
    layer_list.append(layer_scores)
    report_list.append(reports)

In [None]:
# Scratch arithmetic (evaluates to 25600); its purpose is not documented in
# this notebook — TODO(review): explain or delete this cell.
200*128