In [1]:
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
import torch
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer, RobertaModel
import math
import pandas as pd
from torch import optim

In [2]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [3]:
#GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl")
head_model = GPT2LMHeadModel.from_pretrained('gpt2-xl').to(device)
for param in head_model.parameters():
    
   param.requires_grad = False


lm_head = head_model.lm_head

# head_transformer = head_model.transformer

In [4]:
R_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
Roberta_model = RobertaModel.from_pretrained("roberta-base")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
class CrossAttentionSingle(nn.Module):
    # def __init__(self, max_length):
    def __init__(self, encoder_dim, decoder_dim, attention_dim = None):
        """
        Single head cross attention block scaled
        """
        super().__init__()
        self.e_dim = encoder_dim
        self.d_dim = decoder_dim
        if attention_dim is None:
            self.attention_dim = decoder_dim
        else:
            self.attention_dim = attention_dim
        
        self.WQ = torch.randn((self.d_dim, self.attention_dim), requires_grad=True).to(device)
        self.WK = torch.randn((self.e_dim, self.attention_dim), requires_grad=True).to(device)
        self.WV = torch.randn((self.e_dim, self.attention_dim), requires_grad=True).to(device)
        self.softmax = nn.Softmax(dim=1).to(device)
        

    def forward(self, encoder_x, decoder_x):
        
        #print(f"self.WQ: {self.WQ}")
        Q = torch.mm(decoder_x.to(device), self.WQ ).to(device)
        #print(f"Q shape {Q.shape}")
        #print(f"Q {Q}")
        K = torch.mm(encoder_x.to(device), self.WK ).to(device)
        #print(f"K shape {K.shape}")
        #print(f"K {K}")
        V = torch.mm(encoder_x.to(device), self.WV ) .to(device)
        #print(f"V shape {V.shape}")
        #print(f"V {V}")
        QKT = torch.mm(Q, K.t()).to(device)
        #print(f"QKT shape {QKT.shape}")
        #print(f"QKT  {QKT}")
      
        # Q d_lenXd_dim
        # K e_lenXd_dim
        # V e_lenXd_dim
        QKT_div = torch.div(QKT,math.sqrt(self.d_dim))
        
        SM = self.softmax(QKT_div).to(device) # may need the div from my earlier transformer
        #print(f"SM  {SM}")
        
        attention = torch.mm(SM, V).to(device) 
        #print(f"attention shape {attention.shape}")
        return attention


In [6]:
class ProposedModel(nn.Module):
    # def __init__(self, max_length):
    def __init__(self, encoder_dim, decoder_dim, attention_dim = None):
        """
        Part by part feed forward
        """
        super().__init__()
        self.e_dim = encoder_dim
        self.d_dim = decoder_dim
        if attention_dim is None:
            self.attention_dim = decoder_dim
        else:
            self.attention_dim = attention_dim
        self.cross_a = CrossAttentionSingle(self.e_dim, self.d_dim, self.attention_dim).to(device)
        self.FF = nn.Linear(self.attention_dim, self.d_dim).to(device)
        self.lm_head = lm_head
        
    def forward(self, encoder_x, decoder_x):
        attention = self.cross_a(encoder_x, decoder_x)
        adjustment = self.FF(attention)
        adjusted_output = adjustment + decoder_x
        # ######
        # adjusted_output = decoder_x
        # ######
        output = self.lm_head(adjusted_output)
        # print(attention.shape)
        # print(adjusted_output.shape)
        # print(output.shape)
        return output
        

In [7]:
text = "I work as a data scientist"
text_ids = tokenizer.encode(text, return_tensors = 'pt').to(device)
print(text_ids)
# logits = head_transformer(text_ids).last_hidden_state.squeeze()
logits = head_model(text_ids).logits
logits_shape = logits.shape
print(f"decoder logits shape {logits_shape}")
print(f"decoder logits sum {torch.sum(logits, dim = 1)}")

tensor([[   40,   670,   355,   257,  1366, 11444]], device='cuda:0')
decoder logits shape torch.Size([1, 6, 50257])
decoder logits sum tensor([[ 12.1287,  15.6366,  -9.6813,  ..., -31.3261, -35.7890,   6.1169]],
       device='cuda:0')


In [8]:
# lm_head(logits).shape

In [9]:
R_tokenized = R_tokenizer(text, return_tensors = 'pt')
R_embed = Roberta_model(**R_tokenized).last_hidden_state.squeeze()
R_embed_shape = R_embed.shape
print(f"Roberta shape {R_embed_shape}")

Roberta shape torch.Size([8, 768])


In [10]:
test_model = ProposedModel(R_embed_shape[1], logits_shape[1], attention_dim = None)
# test_model.forward(R_embed, logits)

In [11]:
go_emotions_train = pd.read_csv('train.tsv.txt', sep='\t', header = None)

In [12]:
go_emotions_train

Unnamed: 0,0,1,2
0,My favourite food is anything I didn't have to...,27,eebbqej
1,"Now if he does off himself, everyone will thin...",27,ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj
3,To make her feel threatened,14,ed7ypvh
4,Dirty Southern Wankers,3,ed0bdzj
...,...,...,...
43405,Added you mate well I’ve just got the bow and ...,18,edsb738
43406,Always thought that was funny but is it a refe...,6,ee7fdou
43407,What are you talking about? Anything bad that ...,3,efgbhks
43408,"More like a baptism, with sexy results!",13,ed1naf8


In [13]:
go_emotions_train.values[1][0]

'Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead'

In [14]:
emotions_dict_emoToidx = {} # emo -> idx
emotions_dict_idxToemo = {} # idx -> emo
for idx, val in enumerate(go_emotions_train.values):
    for emotion in val[1].split(','):
        if emotion not in emotions_dict_emoToidx:
            emotions_dict_emoToidx[emotion] = []
        emotions_dict_emoToidx[emotion].append(idx)
        
        if idx not in emotions_dict_idxToemo:
            emotions_dict_idxToemo[idx] = []
        emotions_dict_idxToemo[idx].append(emotion)


    

In [15]:
# go_emotions_train.values[emotions_dict['27']]
#print(go_emotions_train.values[emotions_dict_emoToidx['6']])
print(emotions_dict_idxToemo[7])

['8', '20']


In [16]:
# torch.load(emo_gpt2-xl.pt, 
emo_gpt_embed = torch.load('emo_gpt2-xl.pt', map_location=lambda storage, loc: storage.cuda(0))

In [17]:
emo_roberta_embed = torch.load('emo_Roberta.pt', map_location=lambda storage, loc: storage.cuda(0))

RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 39.41 GiB total capacity; 12.23 GiB already allocated; 4.56 MiB free; 12.25 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
test_model = ProposedModel(emo_roberta_embed[0].shape[1],emo_gpt_embed[0].shape[1], attention_dim = None)

In [None]:
print(emo_roberta_embed[0].shape[1])
print(emo_gpt_embed[0].shape[1])

In [None]:
# test_model.forward(emo_roberta_embed[0], emo_gpt_embed[0])

In [None]:
import random

In [None]:
def prepare_train_data(gpt_embeddings):
    train_data_tuples = []
    count = 0
    for example in range(len(gpt_embeddings)):
        emotion_list = emotions_dict_idxToemo[example]
        for emotion in emotion_list:
            text = go_emotions_train.values[example][0]

                
            text_ids = tokenizer.encode(text, return_tensors = 'pt', truncation=True).to(device)
            
            # text_id_shape = text_ids.shape
            
            # if count == 3487:
            #     print(text_id_shape)
            #     print(gpt_embeddings[example].shape)
            # if count == 1:
            #     print(text_id_shape)
            #     print(gpt_embeddings[example].shape)
            
            if len(gpt_embeddings[example].shape) == 1:
                print(gpt_embeddings[example].shape)
                gpt_embeddings[example] = torch.reshape(gpt_embeddings[example], (1, len(gpt_embeddings[example])))
                print(gpt_embeddings[example].shape)
                
            train_data_tuples.append((gpt_embeddings[example], emotion, text_ids))
            count += 1
    return train_data_tuples
                
            
    

In [None]:
gpt_embeddings_emotion_tuples = prepare_train_data(emo_gpt_embed)

In [None]:
def train(model, optimizer, context_embeddings, gpt_embeddings_emotion_tuples, num_context_samples, epochs):
    model.train()
    CELoss = nn.CrossEntropyLoss()
    random.shuffle(gpt_embeddings_emotion_tuples)
    # gpt_embeddings_emotion_tuples = gpt_embeddings_emotion_tuples[:100]
    print(f"Num examples: {len(gpt_embeddings_emotion_tuples)}")
    total_example_count = 0
    for epoch in range(epochs):
        
        random.shuffle(gpt_embeddings_emotion_tuples)
        count = 0
        ag_loss = 0
        ag_loss_epoch = 0
        for gpt_idx_emo_tup in gpt_embeddings_emotion_tuples:
            emotion = gpt_idx_emo_tup[1]
            
            #print(emotion) 
            
            emotion_idxs = emotions_dict_emoToidx[emotion]

            # for idx in emotion_idxs:
            context_sample_list = []
            for context_doc in range(num_context_samples): # without network training takes 23 seconds
                # sample average and stack document samples from a particular emotion
                context_sample_idx = random.sample(emotion_idxs,1)
                #print(context_sample_idx)
                single_context_sample = context_embeddings[context_sample_idx[0]]
                mean_of_sample = torch.mean(single_context_sample, 0)
                #
                # mean_of_sample = torch.randn(mean_of_sample.size())
                # mean_of_sample = torch.zeros(mean_of_sample.size())
                #
                context_sample_list.append(mean_of_sample)
            agregated_stacked_context_sample = torch.stack(context_sample_list, dim = 0)
            
            # print(agregated_stacked_context_sample.shape)
            # print(gpt_idx_emo_tups[0].shape)
#             if count == 3487:
#                 print(f"Count: {count} Text ids: {gpt_idx_emo_tup[2]}")
            
            optimizer.zero_grad()
            network_output = model(agregated_stacked_context_sample, gpt_idx_emo_tup[0])
            true_output = lm_head(gpt_idx_emo_tup[0])
            
            # https://huggingface.co/transformers/v3.5.1/_modules/transformers/modeling_gpt2.html referenced from here
            #print(gpt_idx_emo_tup[2].shape[1])
            if gpt_idx_emo_tup[2].shape[1] == 1:
                #print("ONE text id?")
                #print(gpt_idx_emo_tup[2].shape[1])
                continue
            shifted_network_output = network_output[..., :-1, :].contiguous()
            shifted_text_ids = gpt_idx_emo_tup[2][..., 1:].contiguous()
            loss = CELoss(shifted_network_output.view(-1, shifted_network_output.size(-1)), shifted_text_ids.view(-1))
            ag_loss += loss
            ag_loss_epoch += loss
            total_example_count += 1
            ## extra stuff from before
            # print(f"True output: {torch.sum(true_output,dim =1)}")
            # print(f"network_output: {network_output.shape}")
            # print(f"True output: {true_output.shape}")
            # print(f"network_output: {network_output.squeeze().shape}")
            # print(f"True output: {true_output.squeeze().shape}")
            # print(f"network_output: {torch.sum(network_output,dim =1)}")
            
            
            loss.backward()
            optimizer.step()
            if count%1000 == 0:
                # print(f"For Epoch: {epoch}, Example: {count}")
                # print(f"TRAIN LOSS: {ag_loss/1000}")
                print(".")
                # ag_loss = 0
            count+=1
        if epoch % 1 == 0:
            if epoch == 0:
                print(f"FIRST epoch: {epoch}, Total Examples: {total_example_count}")
                print(f"TRAIN LOSS: {ag_loss_epoch/len(gpt_embeddings_emotion_tuples)}")
                print("----------------------------------------")
            else:
                print(f"For Epoch: {epoch}, Total Examples: {total_example_count}")
                print(f"TRAIN LOSS: {ag_loss_epoch/len(gpt_embeddings_emotion_tuples)}")
                print("----------------------------------------")


            

In [None]:
test_model = ProposedModel(emo_roberta_embed[0].shape[1],emo_gpt_embed[0].shape[1], attention_dim = None)
optimizer = optim.Adam(test_model.parameters(), lr=0.00001,  weight_decay=0.001)


In [None]:
train(test_model, optimizer, emo_roberta_embed, gpt_embeddings_emotion_tuples, 10, 10)

In [None]:
text = "I work as a data scientist"
text_ids = tokenizer.encode(text, return_tensors = 'pt').to(device)
print(text_ids)
# logits = head_transformer(text_ids).last_hidden_state.squeeze()
logits = head_model(text_ids).logits
logits_shape = logits.shape
print(f"decoder logits shape {logits_shape}")
print(f"decoder logits sum {torch.sum(logits, dim = 1)}")

In [None]:
logits = head_model(text_ids).logits

In [None]:
# for each emotion
    # list_ofidx for that emoution = emotion_list
    # for each shuffle(emotion_list):
        # randomly samle from emotion_list to get context embed
        # model(random_sample_context, current_Gpt_embed) 

In [None]:
# for each emotion
    # list_ofidx for that emoution = emotion_list
    # for each shuffle(emotion_list):
        # randomly samle from emotion_list to get context embed
        # model(random_sample_context, current_Gpt_embed) 