In [None]:
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel
import torch
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer, RobertaModel
import math
import pandas as pd
from torch import optim

In [None]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
#GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl")
head_model = GPT2LMHeadModel.from_pretrained('gpt2-xl').to(device)
for param in head_model.parameters():
    
   param.requires_grad = False


lm_head = head_model.lm_head

# head_transformer = head_model.transformer

In [None]:
# R_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# Roberta_model = RobertaModel.from_pretrained("roberta-base")

In [None]:
class CrossAttentionSingle(nn.Module):
    # def __init__(self, max_length):
    def __init__(self, encoder_dim, decoder_dim, attention_dim = None):
        """
        Single head cross attention block scaled
        """
        super().__init__()
        self.e_dim = encoder_dim
        self.d_dim = decoder_dim
        if attention_dim is None:
            self.attention_dim = decoder_dim
        else:
            self.attention_dim = attention_dim
        
        self.WQ = torch.randn((self.d_dim, self.attention_dim), requires_grad=True).to(device)
        self.WK = torch.randn((self.e_dim, self.attention_dim), requires_grad=True).to(device)
        self.WV = torch.randn((self.e_dim, self.attention_dim), requires_grad=True).to(device)
        self.softmax = nn.Softmax(dim=1).to(device)
        

    def forward(self, encoder_x, decoder_x):
        
        #print(f"self.WQ: {self.WQ}")
        Q = torch.mm(decoder_x.to(device), self.WQ ).to(device)
        #print(f"Q shape {Q.shape}")
        #print(f"Q {Q}")
        K = torch.mm(encoder_x.to(device), self.WK ).to(device)
        #print(f"K shape {K.shape}")
        #print(f"K {K}")
        V = torch.mm(encoder_x.to(device), self.WV ) .to(device)
        #print(f"V shape {V.shape}")
        #print(f"V {V}")
        QKT = torch.mm(Q, K.t()).to(device)
        #print(f"QKT shape {QKT.shape}")
        #print(f"QKT  {QKT}")
      
        # Q d_lenXd_dim
        # K e_lenXd_dim
        # V e_lenXd_dim
        QKT_div = torch.div(QKT,math.sqrt(self.d_dim))
        
        SM = self.softmax(QKT_div).to(device) # may need the div from my earlier transformer
        #print(f"SM  {SM}")
        
        attention = torch.mm(SM, V).to(device) 
        #print(f"attention shape {attention.shape}")
        return attention


In [None]:
class ProposedModel(nn.Module):
    # def __init__(self, max_length):
    def __init__(self, encoder_dim, decoder_dim, attention_dim = None):
        """
        Part by part feed forward
        """
        super().__init__()
        self.e_dim = encoder_dim
        self.d_dim = decoder_dim
        if attention_dim is None:
            self.attention_dim = decoder_dim
        else:
            self.attention_dim = attention_dim
        self.cross_a = CrossAttentionSingle(self.e_dim, self.d_dim, self.attention_dim).to(device)
        self.FF = nn.Linear(self.attention_dim, self.d_dim).to(device)
        self.Relu = nn.ReLU().to(device)
        self.FF2 = nn.Linear(self.d_dim, self.d_dim).to(device)
        self.lm_head = lm_head
        
    def forward(self, encoder_x, decoder_x):
        attention = self.cross_a(encoder_x, decoder_x)
        adjustment = self.FF(attention)
        non_lin_adjustment = self.Relu(adjustment)
        adjustment = self.FF2(non_lin_adjustment)
        adjusted_output = adjustment + decoder_x
        # ######
        # adjusted_output = decoder_x
        # ######
        output = self.lm_head(adjusted_output)
        # print(attention.shape)
        # print(adjusted_output.shape)
        # print(output.shape)
        return output
        

In [None]:
# text = "I work as a data scientist"
# text_ids = tokenizer.encode(text, return_tensors = 'pt').to(device)
# print(text_ids)
# # logits = head_transformer(text_ids).last_hidden_state.squeeze()
# logits = head_model(text_ids).logits
# logits_shape = logits.shape
# print(f"decoder logits shape {logits_shape}")
# print(f"decoder logits sum {torch.sum(logits, dim = 1)}")

In [None]:
# lm_head(logits).shape

In [None]:
# R_tokenized = R_tokenizer(text, return_tensors = 'pt')
# R_embed = Roberta_model(**R_tokenized).last_hidden_state.squeeze()
# R_embed_shape = R_embed.shape
# print(f"Roberta shape {R_embed_shape}")

In [None]:
# test_model = ProposedModel(R_embed_shape[1], logits_shape[1], attention_dim = None)
# # test_model.forward(R_embed, logits)

In [None]:
go_emotions_train = pd.read_csv('train.tsv.txt', sep='\t', header = None)

In [None]:
go_emotions_train

Unnamed: 0,0,1,2
0,My favourite food is anything I didn't have to...,27,eebbqej
1,"Now if he does off himself, everyone will thin...",27,ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,2,eezlygj
3,To make her feel threatened,14,ed7ypvh
4,Dirty Southern Wankers,3,ed0bdzj
...,...,...,...
43405,Added you mate well I’ve just got the bow and ...,18,edsb738
43406,Always thought that was funny but is it a refe...,6,ee7fdou
43407,What are you talking about? Anything bad that ...,3,efgbhks
43408,"More like a baptism, with sexy results!",13,ed1naf8


In [None]:
go_emotions_train.values[1][0]

'Now if he does off himself, everyone will think hes having a laugh screwing with people instead of actually dead'

In [None]:
# go_emotions_train[1].unique()

In [None]:

emotions_dict_emoToidx = {} # emo -> idx
emotions_dict_idxToemo = {} # idx -> emo
for idx, val in enumerate(go_emotions_train.values):
    for emotion in val[1].split(','):
        if emotion not in emotions_dict_emoToidx:
            emotions_dict_emoToidx[emotion] = []
        emotions_dict_emoToidx[emotion].append(idx)
        
        if idx not in emotions_dict_idxToemo:
            emotions_dict_idxToemo[idx] = []
        emotions_dict_idxToemo[idx].append(emotion)


    

In [None]:
# go_emotions_train.values[emotions_dict['27']]
#print(go_emotions_train.values[emotions_dict_emoToidx['6']])
# print(emotions_dict_idxToemo[7])

In [None]:
# torch.load(emo_gpt2-xl.pt, 
# emo_gpt_embed = torch.load('emo_gpt2-xl.pt', map_location=lambda storage, loc: storage.cuda(0))

In [None]:
# emo_roberta_embed = torch.load('emo_Roberta.pt', map_location=lambda storage, loc: storage.cuda(0))

In [None]:
t = torch.cuda.get_device_properties(0).total_memory
r = torch.cuda.memory_reserved(0)
a = torch.cuda.memory_allocated(0)
f = r-a  # free inside reserved
print(t/1000000000)
print(r/1000000000)
print(a/1000000000)
print(f/1000000000)

42.314694656
6.43825664
6.421954048
0.016302592


In [None]:
def load_model_inputs(context_tensors, generator_logits):
    context = torch.load(context_tensors, map_location=lambda storage, loc: storage.cuda(0))
    logits = torch.load(generator_logits, map_location=lambda storage, loc: storage.cuda(0))
    return context, logits

In [None]:
emo_roberta_embed, emo_gpt_embed = load_model_inputs('emo_Roberta.pt', 'emo_gpt2-xl.pt')

In [None]:
t = torch.cuda.get_device_properties(0).total_memory
r = torch.cuda.memory_reserved(0)
a = torch.cuda.memory_allocated(0)
f = r-a  # free inside reserved
print(t/1000000000)
print(r/1000000000)
print(a/1000000000)
print(f/1000000000)

42.314694656
14.506000384
14.483106304
0.02289408


In [None]:
# test_model = ProposedModel(emo_roberta_embed[0].shape[1],emo_gpt_embed[0].shape[1], attention_dim = None)

In [None]:
print(emo_roberta_embed[0].shape[1])
print(emo_gpt_embed[0].shape[1])

1024
1600


In [None]:
# test_model.forward(emo_roberta_embed[0], emo_gpt_embed[0])

In [None]:
import random

In [None]:
def prepare_train_data(gpt_embeddings, tokenizer, go_emotions_train):
    emotions_dict_emoToidx = {} # emo -> idx
    emotions_dict_idxToemo = {} # idx -> emo
    for idx, val in enumerate(go_emotions_train.values):
        for emotion in val[1].split(','):
            if emotion not in emotions_dict_emoToidx:
                emotions_dict_emoToidx[emotion] = []
            emotions_dict_emoToidx[emotion].append(idx)

            if idx not in emotions_dict_idxToemo:
                emotions_dict_idxToemo[idx] = []
            emotions_dict_idxToemo[idx].append(emotion)

    
    
    
    train_data_tuples = []
    count = 0
    for example in range(len(gpt_embeddings)):
        emotion_list = emotions_dict_idxToemo[example]
        for emotion in emotion_list:
            text = go_emotions_train.values[example][0]

                
            text_ids = tokenizer.encode(text, return_tensors = 'pt', truncation=True).to(device) ### GPU USAGE THAT CAN BE MADE MORE EFFICIENT
            
            # text_id_shape = text_ids.shape
            
            # if count == 3487:
            #     print(text_id_shape)
            #     print(gpt_embeddings[example].shape)
            # if count == 1:
            #     print(text_id_shape)
            #     print(gpt_embeddings[example].shape)
            
            if len(gpt_embeddings[example].shape) == 1:
                print(gpt_embeddings[example].shape)
                gpt_embeddings[example] = torch.reshape(gpt_embeddings[example], (1, len(gpt_embeddings[example])))
                print(gpt_embeddings[example].shape)
                
            train_data_tuples.append((gpt_embeddings[example], emotion, text_ids))
            count += 1
    return train_data_tuples
                
            
    

In [None]:
t = torch.cuda.get_device_properties(0).total_memory
r = torch.cuda.memory_reserved(0)
a = torch.cuda.memory_allocated(0)
f = r-a  # free inside reserved
print(t/1000000000)
print(r/1000000000)
print(a/1000000000)
print(f/1000000000)

42.314694656
14.506000384
14.483106304
0.02289408


In [None]:
val_context, val_logits = load_model_inputs('emo_roberta_dev.pt', 'emo_gpt2_dev.pt')
dev_emo_DF = pd.read_csv('dev.tsv.txt', sep='\t', header = None)

In [None]:
t = torch.cuda.get_device_properties(0).total_memory
r = torch.cuda.memory_reserved(0)
a = torch.cuda.memory_allocated(0)
f = r-a  # free inside reserved
print(t/1000000000)
print(r/1000000000)
print(a/1000000000)
print(f/1000000000)

42.314694656
15.512633344
15.486588416
0.026044928


In [None]:
def prepare_test_data(gpt_embeddings, tokenizer, data, num_context_samples, context_embeddings):
    emotions_dict_emoToidx = {} # emo -> idx
    emotions_dict_idxToemo = {} # idx -> emo
    for idx, val in enumerate(data.values):
        for emotion in val[1].split(','):
            if emotion not in emotions_dict_emoToidx:
                emotions_dict_emoToidx[emotion] = []
            emotions_dict_emoToidx[emotion].append(idx)

            if idx not in emotions_dict_idxToemo:
                emotions_dict_idxToemo[idx] = []
            emotions_dict_idxToemo[idx].append(emotion)

    
    
    
    test_data_tuples = []
    count = 0
    for example in range(len(gpt_embeddings)):
        emotion_list = emotions_dict_idxToemo[example]
        for emotion in emotion_list:
            text = data.values[example][0]

                
            text_ids = tokenizer.encode(text, return_tensors = 'pt', truncation=True).to(device) # ### GPU USAGE THAT CAN BE MADE MORE EFFICIENT
            
            emotion_idxs = emotions_dict_emoToidx[emotion]
            context_sample_list = []
            for context_doc in range(num_context_samples): # without network  takes 23 seconds
                # sample average and stack document samples from a particular emotion
                context_sample_idx = random.sample(emotion_idxs,1)
                #print(context_sample_idx)
                single_context_sample = context_embeddings[context_sample_idx[0]]
                mean_of_sample = torch.mean(single_context_sample, 0)
                #
                # mean_of_sample = torch.randn(mean_of_sample.size())
                # mean_of_sample = torch.zeros(mean_of_sample.size())
                #
                context_sample_list.append(mean_of_sample)
            agregated_stacked_context_sample = torch.stack(context_sample_list, dim = 0)
            
            if len(gpt_embeddings[example].shape) == 1:
                print(gpt_embeddings[example].shape)
                gpt_embeddings[example] = torch.reshape(gpt_embeddings[example], (1, len(gpt_embeddings[example])))
                print(gpt_embeddings[example].shape)
                
            test_data_tuples.append((gpt_embeddings[example], emotion, text_ids, agregated_stacked_context_sample))
            count += 1
    return test_data_tuples
                
            
    

In [None]:
gpt_embeddings_emotion_tuples = prepare_train_data(emo_gpt_embed, tokenizer, go_emotions_train)

torch.Size([1600])
torch.Size([1, 1600])
torch.Size([1600])
torch.Size([1, 1600])
torch.Size([1600])
torch.Size([1, 1600])
torch.Size([1600])
torch.Size([1, 1600])
torch.Size([1600])
torch.Size([1, 1600])
torch.Size([1600])
torch.Size([1, 1600])
torch.Size([1600])
torch.Size([1, 1600])
torch.Size([1600])
torch.Size([1, 1600])


In [None]:
t = torch.cuda.get_device_properties(0).total_memory
r = torch.cuda.memory_reserved(0)
a = torch.cuda.memory_allocated(0)
f = r-a  # free inside reserved
print(t/1000000000)
print(r/1000000000)
print(a/1000000000)
print(f/1000000000)

42.314694656
15.514730496
15.512767488
0.001963008


In [None]:
t = torch.cuda.get_device_properties(0).total_memory
r = torch.cuda.memory_reserved(0)
a = torch.cuda.memory_allocated(0)
f = r-a  # free inside reserved
print(t/1000000000)
print(r/1000000000)
print(a/1000000000)
print(f/1000000000)

42.314694656
15.514730496
15.512767488
0.001963008


In [None]:
loss = 0
gpt_dev_logits = []
for sen in dev_emo_DF.values:
    gpt_tokenized = tokenizer.encode(sen[0], return_tensors = 'pt').to(device)
    if gpt_tokenized.shape[1] == 1:
        continue
    base_output = head_model(gpt_tokenized, labels = gpt_tokenized)
    loss += base_output.loss.to('cpu')
    #gpt_dev_logits.append(head_model.transformer(gpt_tokenized))
    
    del gpt_tokenized
    del base_output
    
    print(".", end='')
average_loss = loss/len(dev_emo_DF.values)

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [None]:
print(average_loss)

tensor(4.4947)


In [None]:
import gc

gc.collect()
torch.cuda.empty_cache()

In [None]:
t = torch.cuda.get_device_properties(0).total_memory
r = torch.cuda.memory_reserved(0)
a = torch.cuda.memory_allocated(0)
f = r-a  # free inside reserved
print(t/1000000000)
print(r/1000000000)
print(a/1000000000)
print(f/1000000000)

42.314694656
15.514730496
15.512767488
0.001963008


In [None]:
print(average_loss)

tensor(4.4947)


In [None]:
dev_prepared = prepare_test_data(val_logits, tokenizer, dev_emo_DF, 100, val_context)

torch.Size([1600])
torch.Size([1, 1600])


In [None]:
t = torch.cuda.get_device_properties(0).total_memory
r = torch.cuda.memory_reserved(0)
a = torch.cuda.memory_allocated(0)
f = r-a  # free inside reserved
print(t/1000000000)
print(r/1000000000)
print(a/1000000000)
print(f/1000000000)

42.314694656
18.190696448
18.12928256
0.061413888


In [None]:
def train(model, optimizer, context_embeddings, gpt_embeddings_emotion_tuples, num_context_samples, epochs, dev_tuples, num_examples=None):
    
    CELoss = nn.CrossEntropyLoss()
    random.shuffle(gpt_embeddings_emotion_tuples)
    if num_examples is not None:
        gpt_embeddings_emotion_tuples = gpt_embeddings_emotion_tuples[:num_examples]
    print(f"Num examples: {len(gpt_embeddings_emotion_tuples)}")
    total_example_count = 0
    for epoch in range(epochs):
        model.train()
        random.shuffle(gpt_embeddings_emotion_tuples)
        count = 0
        ag_loss = 0
        ag_loss_epoch = 0
        for gpt_idx_emo_tup in gpt_embeddings_emotion_tuples:
            emotion = gpt_idx_emo_tup[1]
            
            #print(emotion) 
            
            emotion_idxs = emotions_dict_emoToidx[emotion]

            # for idx in emotion_idxs:
            context_sample_list = []
            for context_doc in range(num_context_samples): # without network  takes 23 seconds
                # sample average and stack document samples from a particular emotion
                context_sample_idx = random.sample(emotion_idxs,1)
                #print(context_sample_idx)
                single_context_sample = context_embeddings[context_sample_idx[0]]
                mean_of_sample = torch.mean(single_context_sample, 0)
                #
                # mean_of_sample = torch.randn(mean_of_sample.size())
                # mean_of_sample = torch.zeros(mean_of_sample.size())
                #
                context_sample_list.append(mean_of_sample)
            agregated_stacked_context_sample = torch.stack(context_sample_list, dim = 0)
            
            # print(agregated_stacked_context_sample.shape)
            # print(gpt_idx_emo_tups[0].shape)
#             if count == 3487:
#                 print(f"Count: {count} Text ids: {gpt_idx_emo_tup[2]}")
            
            optimizer.zero_grad()
            network_output = model(agregated_stacked_context_sample, gpt_idx_emo_tup[0])
            
            # https://huggingface.co/transformers/v3.5.1/_modules/transformers/modeling_gpt2.html referenced from here
            #print(gpt_idx_emo_tup[2].shape[1])
            if gpt_idx_emo_tup[2].shape[1] == 1:
                #print("ONE text id?")
                #print(gpt_idx_emo_tup[2].shape[1])
                continue
            shifted_network_output = network_output[..., :-1, :].contiguous()
            shifted_text_ids = gpt_idx_emo_tup[2][..., 1:].contiguous()
            loss = CELoss(shifted_network_output.view(-1, shifted_network_output.size(-1)), shifted_text_ids.view(-1))
            ag_loss += loss
            ag_loss_epoch += loss
            total_example_count += 1
            ## extra stuff from before
            # print(f"True output: {torch.sum(true_output,dim =1)}")
            # print(f"network_output: {network_output.shape}")
            # print(f"True output: {true_output.shape}")
            # print(f"network_output: {network_output.squeeze().shape}")
            # print(f"True output: {true_output.squeeze().shape}")
            # print(f"network_output: {torch.sum(network_output,dim =1)}")
            
            
            loss.backward()
            optimizer.step()
            if count%1000 == 0:
                # print(f"For Epoch: {epoch}, Example: {count}")
                # print(f"TRAIN LOSS: {ag_loss/1000}")
                print(".", end='')
                # ag_loss = 0
            count+=1
            
        model.eval()
        CELoss_dev = nn.CrossEntropyLoss()
        dev_loss_acum = 0
        for dev_example in dev_tuples:
            # print(dev_example[3].shape)
            # print(dev_example[0].shape)
            # print(gpt_idx_emo_tup[0].shape)
            
            dev_network_output = model(dev_example[3], dev_example[0])
            
#             # https://huggingface.co/transformers/v3.5.1/_modules/transformers/modeling_gpt2.html referenced from here
#             #print(gpt_idx_emo_tup[2].shape[1])
            if dev_example[2].shape[1] == 1:
                #print("ONE text id?")
                #print(gpt_idx_emo_tup[2].shape[1])
                continue
            shifted_network_output_dev = dev_network_output[..., :-1, :].contiguous()
            shifted_text_ids_dev = dev_example[2][..., 1:].contiguous()
            dev_loss = CELoss_dev(shifted_network_output_dev.view(-1, shifted_network_output_dev.size(-1)), shifted_text_ids_dev.view(-1))
            # print(dev_loss.float())
            dev_loss_acum += dev_loss.item()
        full_dev_loss = dev_loss_acum / len(dev_tuples)
        
        if epoch % 1 == 0:
            if epoch == 0:
                print(f"FIRST epoch: {epoch}, Total Examples: {total_example_count}")
                print(f"TRAIN LOSS: {ag_loss_epoch/len(gpt_embeddings_emotion_tuples)}")
                print(f"DEV LOSS: {full_dev_loss}")
                print("----------------------------------------")
            else:
                print(f"For Epoch: {epoch}, Total Examples: {total_example_count}")
                print(f"TRAIN LOSS: {ag_loss_epoch/len(gpt_embeddings_emotion_tuples)}")
                print(f"DEV LOSS: {full_dev_loss}")
                print("----------------------------------------")


            

In [None]:
test_model = ProposedModel(emo_roberta_embed[0].shape[1],emo_gpt_embed[0].shape[1], attention_dim = None)
# optimizer = optim.Adam(test_model.parameters(), lr=0.00001,  weight_decay=0.001)
optimizer = optim.Adam(test_model.parameters(), lr=0.00001,  weight_decay=0.001)


In [None]:
t = torch.cuda.get_device_properties(0).total_memory
r = torch.cuda.memory_reserved(0)
a = torch.cuda.memory_allocated(0)
f = r-a  # free inside reserved
print(t/1000000000)
print(r/1000000000)
print(a/1000000000)
print(f/1000000000)

42.314694656
18.253611008
18.173123072
0.080487936


In [None]:
# train(test_model, optimizer, emo_roberta_embed, gpt_embeddings_emotion_tuples, 100, 100, dev_prepared)

In [None]:
!pwd
!ls

/home/ubuntu
20B_waiting.ipynb      SimpleFineTune.ipynb  history.ipynb
Emot_get_embeds.ipynb  Split_emo_data.ipynb  neg.zip
Extension.ipynb        Training_ouput	     notebook_file.ipynb
FineTuneText	       __pycache__	     pos.zip
GPT-2_parts.ipynb      data		     pos_finetune_lam_try.ipynb
Generation.ipynb       dev.tsv.txt	     pytorch-transformers
IMDB_train	       emo_Roberta.pt	     records.txt
K2-Extension.ipynb     emo_gpt2-xl.pt	     records_extended.txt
Model_Import.py        emo_gpt2_dev.pt	     test.tsv.txt
Models		       emo_neo1_3.pt	     train.tsv.txt
README.md	       emo_neo2_7.pt
RoBERTa_test.ipynb     emo_roberta_dev.pt


In [None]:
torch.save(test_model.state_dict(), '/home/ubuntu/Models/Emo_100_100')

In [None]:
# text = "I work as a data scientist"
# text_ids = tokenizer.encode(text, return_tensors = 'pt').to(device)
# print(text_ids)
# # logits = head_transformer(text_ids).last_hidden_state.squeeze()
# logits = head_model(text_ids).logits
# logits_shape = logits.shape
# print(f"decoder logits shape {logits_shape}")
# print(f"decoder logits sum {torch.sum(logits, dim = 1)}")

In [None]:
t = torch.cuda.get_device_properties(0).total_memory
r = torch.cuda.memory_reserved(0)
a = torch.cuda.memory_allocated(0)
f = r-a  # free inside reserved
print(t/1000000000)
print(r/1000000000)
print(a/1000000000)
print(f/1000000000)

42.314694656
18.253611008
18.173123072
0.080487936


In [None]:
# print(len(dev_emo_DF.values))

In [None]:
def train_ones(model, optimizer, context_embeddings, gpt_embeddings_emotion_tuples, num_context_samples, epochs, dev_tuples, num_examples=None):
    
    CELoss = nn.CrossEntropyLoss()
    random.shuffle(gpt_embeddings_emotion_tuples)
    if num_examples is not None:
        gpt_embeddings_emotion_tuples = gpt_embeddings_emotion_tuples[:num_examples]
    print(f"Num examples: {len(gpt_embeddings_emotion_tuples)}")
    total_example_count = 0
    for epoch in range(epochs):
        model.train()
        random.shuffle(gpt_embeddings_emotion_tuples)
        count = 0
        ag_loss = 0
        ag_loss_epoch = 0
        for gpt_idx_emo_tup in gpt_embeddings_emotion_tuples:
            emotion = gpt_idx_emo_tup[1]
            
            #print(emotion) 
            
            emotion_idxs = emotions_dict_emoToidx[emotion]

            # for idx in emotion_idxs:
            context_sample_list = []
            for context_doc in range(num_context_samples): # without network  takes 23 seconds
                # sample average and stack document samples from a particular emotion
                context_sample_idx = random.sample(emotion_idxs,1)
                #print(context_sample_idx)
                single_context_sample = context_embeddings[context_sample_idx[0]]
                mean_of_sample = torch.mean(single_context_sample, 0)
                #
                # mean_of_sample = torch.randn(mean_of_sample.size())
                mean_of_sample = torch.ones(mean_of_sample.size())
                #
                context_sample_list.append(mean_of_sample)
            agregated_stacked_context_sample = torch.stack(context_sample_list, dim = 0)
            
            # print(agregated_stacked_context_sample.shape)
            # print(gpt_idx_emo_tups[0].shape)
#             if count == 3487:
#                 print(f"Count: {count} Text ids: {gpt_idx_emo_tup[2]}")
            
            optimizer.zero_grad()
            network_output = model(agregated_stacked_context_sample, gpt_idx_emo_tup[0])
            
            # https://huggingface.co/transformers/v3.5.1/_modules/transformers/modeling_gpt2.html referenced from here
            #print(gpt_idx_emo_tup[2].shape[1])
            if gpt_idx_emo_tup[2].shape[1] == 1:
                #print("ONE text id?")
                #print(gpt_idx_emo_tup[2].shape[1])
                continue
            shifted_network_output = network_output[..., :-1, :].contiguous()
            shifted_text_ids = gpt_idx_emo_tup[2][..., 1:].contiguous()
            loss = CELoss(shifted_network_output.view(-1, shifted_network_output.size(-1)), shifted_text_ids.view(-1))
            ag_loss += loss
            ag_loss_epoch += loss
            total_example_count += 1
            ## extra stuff from before
            # print(f"True output: {torch.sum(true_output,dim =1)}")
            # print(f"network_output: {network_output.shape}")
            # print(f"True output: {true_output.shape}")
            # print(f"network_output: {network_output.squeeze().shape}")
            # print(f"True output: {true_output.squeeze().shape}")
            # print(f"network_output: {torch.sum(network_output,dim =1)}")
            
            
            loss.backward()
            optimizer.step()
            if count%1000 == 0:
                # print(f"For Epoch: {epoch}, Example: {count}")
                # print(f"TRAIN LOSS: {ag_loss/1000}")
                print(".", end='')
                # ag_loss = 0
            count+=1
            
        model.eval()
        CELoss_dev = nn.CrossEntropyLoss()
        dev_loss_acum = 0
        for dev_example in dev_tuples:
            # print(dev_example[3].shape)
            # print(dev_example[0].shape)
            # print(gpt_idx_emo_tup[0].shape)
            
            dev_network_output = model(dev_example[3], dev_example[0])
            
#             # https://huggingface.co/transformers/v3.5.1/_modules/transformers/modeling_gpt2.html referenced from here
#             #print(gpt_idx_emo_tup[2].shape[1])
            if dev_example[2].shape[1] == 1:
                #print("ONE text id?")
                #print(gpt_idx_emo_tup[2].shape[1])
                continue
            shifted_network_output_dev = dev_network_output[..., :-1, :].contiguous()
            shifted_text_ids_dev = dev_example[2][..., 1:].contiguous()
            dev_loss = CELoss_dev(shifted_network_output_dev.view(-1, shifted_network_output_dev.size(-1)), shifted_text_ids_dev.view(-1))
            # print(dev_loss.float())
            dev_loss_acum += dev_loss.item()
        full_dev_loss = dev_loss_acum / len(dev_tuples)
        
        if epoch % 1 == 0:
            if epoch == 0:
                print(f"FIRST epoch: {epoch}, Total Examples: {total_example_count}")
                print(f"TRAIN LOSS: {ag_loss_epoch/len(gpt_embeddings_emotion_tuples)}")
                print(f"DEV LOSS: {full_dev_loss}")
                print("----------------------------------------")
            else:
                print(f"For Epoch: {epoch}, Total Examples: {total_example_count}")
                print(f"TRAIN LOSS: {ag_loss_epoch/len(gpt_embeddings_emotion_tuples)}")
                print(f"DEV LOSS: {full_dev_loss}")
                print("----------------------------------------")


            

In [None]:
test_model2 = ProposedModel(emo_roberta_embed[0].shape[1],emo_gpt_embed[0].shape[1], attention_dim = None)
# optimizer = optim.Adam(test_model.parameters(), lr=0.00001,  weight_decay=0.001)
optimizer = optim.Adam(test_model2.parameters(), lr=0.00001,  weight_decay=0.001)


In [None]:
# train_ones(test_model2, optimizer, emo_roberta_embed, gpt_embeddings_emotion_tuples, 100, 100, dev_prepared)

In [None]:
t = torch.cuda.get_device_properties(0).total_memory
r = torch.cuda.memory_reserved(0)
a = torch.cuda.memory_allocated(0)
f = r-a  # free inside reserved
print(t/1000000000)
print(r/1000000000)
print(a/1000000000)
print(f/1000000000)

42.314694656
18.295554048
18.217946624
0.077607424


In [None]:
test_model_dif_num_context = ProposedModel(emo_roberta_embed[0].shape[1],emo_gpt_embed[0].shape[1], attention_dim = None)

optimizer = optim.Adam(test_model_dif_num_context.parameters(), lr=0.00001,  weight_decay=0.001)

In [None]:
dev_prepared_10 = prepare_test_data(val_logits, tokenizer, dev_emo_DF, 10, val_context)

In [None]:
dev_prepared_1 = prepare_test_data(val_logits, tokenizer, dev_emo_DF, 1, val_context)

In [None]:
train(test_model_dif_num_context, optimizer, emo_roberta_embed, gpt_embeddings_emotion_tuples, 100, 100, dev_prepared_1)

Num examples: 51103
....................................................FIRST epoch: 0, Total Examples: 51094
TRAIN LOSS: 4.272182941436768
DEV LOSS: 4.186780326129127
----------------------------------------
....................................................For Epoch: 1, Total Examples: 102188
TRAIN LOSS: 4.196245193481445
DEV LOSS: 4.195555310357701
----------------------------------------
....................................................For Epoch: 2, Total Examples: 153282
TRAIN LOSS: 4.188021183013916
DEV LOSS: 4.199080481377889
----------------------------------------
....................................................For Epoch: 3, Total Examples: 204376
TRAIN LOSS: 4.18240213394165
DEV LOSS: 4.195852897935152
----------------------------------------
....................................................For Epoch: 4, Total Examples: 255470
TRAIN LOSS: 4.17851448059082
DEV LOSS: 4.197913621725707
----------------------------------------
.........................................

In [None]:
print(torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |   17756 MB |   18567 MB |  629802 GB |  629785 GB |
|       from large pool |    6274 MB |    7083 MB |  603443 GB |  603437 GB |
|       from small pool |   11481 MB |   11487 MB |   26359 GB |   26348 GB |
|---------------------------------------------------------------------------|
| Active memory         |   17756 MB |   18567 MB |  629802 GB |  629785 GB |
|       from large pool |    6274 MB |    7083 MB |  603443 GB |  603437 GB |
|       from small pool |   11481 MB |   11487 MB |   26359 GB |   26348 GB |
|---------------------------------------------------------------

In [None]:
# import torch
# import gc
# for obj in gc.get_objects():
#     try:
#         if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
#             print(type(obj), obj.size())
#     except:
#         pass