In [1]:
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
import tensorflow as tf

In [78]:
#Loading in my finetuned model and tokenizer
model = TFGPT2LMHeadModel.from_pretrained('fine_tuned_model_test')
tokenizer = GPT2Tokenizer.from_pretrained('fine_tuned_pre')

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at fine_tuned_model_test.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [44]:
#Or you could load in the base GPT-2
model = TFGPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [3]:
#imports and loading in my training data
import numpy as np
import pandas as pd
training_data = pd.read_csv("Training_Data.csv")
training_data = training_data.dropna()

In [None]:

#Function to calcualte avg attention values for each token in a specific attn head. Expects nxn matrix
def calculate_average_attention(attention_head):
    # Calculate the row-wise and column-wise sums
    row_sums = np.sum(attention_head, axis=1)
    col_sums = np.sum(attention_head, axis=0)
    
    # Calculate the sum of attention values on the main diagonal
    diagonal_sum = np.diag(attention_head)
    
    # Calculate the average attention values for each token, substract diagonal for double counted and divsion by seq_length
    #bc zeros in top half of arr
    num_tokens = attention_head.shape[0]
    average_attention = (row_sums + col_sums - diagonal_sum) / (num_tokens)
    
    return average_attention

In [7]:
#Training loop to create weights_dic, every layer and attention head is looped and the avarge attn scores for each
#token is saved in a list in dic. In my code I only used 20K samples to fine tune model, and those same samples are
#used to generate the avg attn scores for each token

weights_dic = {}
for q in list(training_data["Questions"])[0:20000]:
    input_ids = tokenizer.encode(q , return_tensors='tf', truncation=True,max_length = 1048)
    outputs = model(input_ids, output_attentions=True)
    attentions = outputs.attentions
    arr = None
    for layer in attentions:
        for head in layer[0]:
            if arr is None:
                arr = ((calculate_average_attention(head)))
            else:
                arr = arr + ((calculate_average_attention(head)))
    arr = arr/144
    for idx,token_val in enumerate(input_ids[0].numpy()):
        if token_val not in weights_dic.keys():
            weights_dic[token_val] = [arr[idx]]
        else:
            weights_dic[token_val].append(arr[idx])

In [9]:
#Average the attn scores for each token
for key in weights_dic.keys():
    weights_dic[key] = sum(weights_dic[key])/len(weights_dic[key]) 

In [93]:


#function expects lists
def attn_truncate(sequence,dic, max_length):
    #Conditional to return already short enough sequence, maybe raise error if expected to padd before
    if len(sequence) <= max_length:
        return sequence
    #Set return arr and temp_dic and avg
    return_arr = None
    temp_dic = {}
    avg = sum(dic.values())/len(dic)
    
    #convert array to matching attention values, TODO add exception for tokens not in weights_dic
    val_arr = [dic[index] for index in sequence]
    
    #temp_dic becomes reverse dictionary, this could be moved outside of function and passed as arg but keeping simple for now,
    #Possible error with values being same when converted to keys, unlikely in my testing
    for index in sequence:
        temp_dic[dic[index]] = index
    
    val_arr.sort()
    
    #convert sorted attention rankings back to token_ids for removal
    sorted_ids = [temp_dic[val] for val in val_arr]
    
    #Remove least valuable information until seq is required length
    
    for removal_index in range(len(sequence)-max_length):
        token_to_remove = sorted_ids[removal_index]
        sequence = [x for x in sequence if x != token_to_remove]
        
        if len(sequence) == max_length:
            break
    return sequence

In [105]:
#Example of LVIT 
input_ids = tokenizer.encode(list(training_data["Questions"])[15], truncation=True, max_length = 1024) #, return_tensors='tf'
tokenizer.decode(attn_truncate(input_ids,weights_dic,50))

'<Text> Python works on can used for desktop web I conclude that there is way to compile it into an executable for Mac Windows Linux </Text> <Text> The I have where to start or how to write a GUI with it can anybody on this in please? </Text>'

In [104]:
list(training_data["Questions"])[15]

'<Text>Python works on multiple platforms and can be used for desktop and web applications, thus I conclude that there is some way to compile it into an executable for Mac, Windows and Linux.</Text>\r \r <Text>The problem being I have no idea where to start or how to write a GUI with it, can anybody shed some light on this and point me in the right direction please?</Text>'