In [1]:
!python -m pip install transformers



In [2]:
import traceback
import csv

import pandas as pd


def write_tsv_dataframe(filepath, dataframe):
    """
        Stores `DataFrame` as tsv file

        Parameters
        ----------
        filepath : str
            Path to tsv file
        dataframe : pd.DataFrame
            DataFrame to store

        Raises
        ------
        IOError
            if the file can't be opened
    """
    try:
        dataframe.to_csv(filepath, encoding='utf-8', sep='\t', index=False, header=True, quoting=csv.QUOTE_NONE)
    except IOError:
        traceback.print_exc()


  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [3]:
def combine_columns(df_arguments, df_labels):
    """Combines the two `DataFrames` on column `Argument ID`"""
    return pd.merge(df_arguments, df_labels, on='Argument ID')


In [4]:
def split_arguments(df_arguments):
    """Splits `DataFrame` by column `Usage` into `train`-, `validation`-, and `test`-arguments"""
    train_arguments = df_arguments.loc[df_arguments['Usage'] == 'train'].drop(['Usage'], axis=1).reset_index(drop=True)
    valid_arguments = df_arguments.loc[df_arguments['Usage'] == 'validation'].drop(['Usage'], axis=1).reset_index(drop=True)
    test_arguments = df_arguments.loc[df_arguments['Usage'] == 'test'].drop(['Usage'], axis=1).reset_index(drop=True)
    
    return train_arguments, valid_arguments, test_arguments


In [5]:
def create_dataframe_head(argument_ids, model_name):
    """
        Creates `DataFrame` usable to append predictions to it

        Parameters
        ----------
        argument_ids : list[str]
            First column of the resulting DataFrame
        model_name : str
            Second column of DataFrame will contain the given model name

        Returns
        -------
        pd.DataFrame
            prepared DataFrame
    """
    df_model_head = pd.DataFrame(argument_ids, columns=['Argument ID'])
    df_model_head['Method'] = [model_name] * len(argument_ids)

    return df_model_head


In [6]:
import json
class MissingColumnError(AttributeError):
    """Error indicating that an imported DataFrame lacks necessary columns"""
    pass


In [7]:
def load_json_file(filepath):
    """Load content of json-file from `filepath`"""
    with open(filepath, 'r') as  json_file:
        return json.load(json_file)


In [8]:
def load_values_from_json(filepath):
    """Load values per level from json-file from `filepath`"""
    json_values = load_json_file(filepath)
    values = { "1":set(), "2":set(), "3":set(), "4a":set(), "4b":set() }
    for value in json_values["values"]:
        values["1"].add(value["name"])
        values["2"].add(value["level2"])
        for valueLevel3 in value["level3"]:
            values["3"].add(valueLevel3)
        for valueLevel4a in value["level4a"]:
            values["4a"].add(valueLevel4a)
        for valueLevel4b in value["level4b"]:
            values["4b"].add(valueLevel4b)
    values["1"] = sorted(values["1"])
    values["2"] = sorted(values["2"])
    values["3"] = sorted(values["3"])
    values["4a"] = sorted(values["4a"])
    values["4b"] = sorted(values["4b"])
    return values


In [9]:
def load_arguments_from_tsv(filepath, default_usage='test'):
    """
        Reads arguments from tsv file

        Parameters
        ----------
        filepath : str
            The path to the tsv file
        default_usage : str, optional
            The default value if the column "Usage" is missing

        Returns
        -------
        pd.DataFrame
            the DataFrame with all arguments

        Raises
        ------
        MissingColumnError
            if the required columns "Argument ID" or "Premise" are missing in the read data
        IOError
            if the file can't be read
        """
    try:
        dataframe = pd.read_csv(filepath, encoding='utf-8', sep='\t', header=0)
        if not {'Argument ID', 'Premise'}.issubset(set(dataframe.columns.values)):
            raise MissingColumnError('The argument "%s" file does not contain the minimum required columns [Argument ID, Premise].' % filepath)
        if 'Usage' not in dataframe.columns.values:
            dataframe['Usage'] = [default_usage] * len(dataframe)
        return dataframe
    except IOError:
        traceback.print_exc()
        raise


In [10]:
def load_labels_from_tsv(filepath, label_order):
    """
        Reads label annotations from tsv file

        Parameters
        ----------
        filepath : str
            The path to the tsv file
        label_order : list[str]
            The listing and order of the labels to use from the read data

        Returns
        -------
        pd.DataFrame
            the DataFrame with the annotations

        Raises
        ------
        MissingColumnError
            if the required columns "Argument ID" or names from `label_order` are missing in the read data
        IOError
            if the file can't be read
        """
    try:
        dataframe = pd.read_csv(filepath, encoding='utf-8', sep='\t', header=0)
        dataframe = dataframe[['Argument ID'] + label_order]
        return dataframe
    except IOError:
        traceback.print_exc()
        raise
    except KeyError:
        raise MissingColumnError('The file "%s" does not contain the required columns for its level.' % filepath)


In [11]:
import sys
import getopt
import os

In [12]:
model_dir = 'models'
data_dir = 'data'

In [13]:
if not os.path.exists(model_dir):
    os.makedirs(model_dir)


In [14]:
argument_filepath = os.path.join(data_dir, 'arguments.tsv')
value_json_filepath = os.path.join(data_dir, 'values.json')


In [15]:
df_arguments = load_arguments_from_tsv(argument_filepath, default_usage='train')

In [16]:
values = load_values_from_json(value_json_filepath)
num_labels_Lv2 = len(values['2'])


In [17]:
df_arguments.keys()

Index(['Argument ID', 'Part', 'Usage', 'Conclusion', 'Stance', 'Premise'], dtype='object')

In [18]:
# for ip in df_arguments['Argument ID']:
#   #print(df_arguments['Stance'][ip])
#   print(ip)

In [19]:
level =2
label_filepath = os.path.join(data_dir, 'labels-level{}.tsv'.format(str(level)))
df_labels = load_labels_from_tsv(label_filepath, values[str(level)])

In [20]:
a = df_labels.keys()
for key in df_labels.keys():
  print(len(df_labels[key]),key)

5270 Argument ID
5270 Achievement
5270 Benevolence: caring
5270 Benevolence: dependability
5270 Conformity: interpersonal
5270 Conformity: rules
5270 Face
5270 Hedonism
5270 Humility
5270 Power: dominance
5270 Power: resources
5270 Security: personal
5270 Security: societal
5270 Self-direction: action
5270 Self-direction: thought
5270 Stimulation
5270 Tradition
5270 Universalism: concern
5270 Universalism: nature
5270 Universalism: objectivity
5270 Universalism: tolerance


In [21]:
df_labels['Achievement'][0]

0

In [22]:
# from typing import Dict, List
# #def generate_pairwise_input(dataset: Dict[List], labels: Dict[List]) -> (List[str], List[str], List[str], List[int]):
# def generate_pairwise_input(dataset, labels):
#     """
#     TODO: group all premises and corresponding hypotheses and labels of the datapoints
#     a datapoint as seen earlier is a dict of premis, hypothesis and label
#     """
#     #raise NotImplementedError
#     premise=[]
#     conclusion=[]
#     stance=[]
#     n_labels =labels.keys()
#     n_labels = n_labels[1:]
#     print(n_labels)
#     label=[]
    
#     n = len(dataset['Argument ID'])
#     m = len(labels['Argument ID'])
#     print(n,m)
#     for i in range(n):
#         premise.append(dataset['Premise'][i])
#         conclusion.append(dataset['Conclusion'][i])
#         stance.append(dataset['Stance'][i])
#     for l in range(len(n_labels)):
#         label_id = []
#         #print(i)
#         for i in range(m):
#             #print(n_labels[l])
#             label_id.append(int(labels[n_labels[l]][i]))
#         label.append(label_id)

#     return premise, conclusion, stance, label

In [23]:
from typing import Dict, List
#def generate_pairwise_input(dataset: Dict[List], labels: Dict[List]) -> (List[str], List[str], List[str], List[int]):
def generate_pairwise_input(dataset, labels):
    """
    TODO: group all premises and corresponding hypotheses and labels of the datapoints
    a datapoint as seen earlier is a dict of premis, hypothesis and label
    """
    #raise NotImplementedError
    premise=[]
    conclusion=[]
    stance=[]
    n_labels =labels.keys()
    n_labels = n_labels[1:]
    print(n_labels)
    label=[]
    
    n = len(dataset['Argument ID'])
    m = len(labels['Argument ID'])
    print(n,m)
    for i in range(n):
        premise.append(dataset['Premise'][i])
        conclusion.append(dataset['Conclusion'][i])
        stance.append(dataset['Stance'][i])
    for i in range(m):
        sent_label = []
        #print(i)
        for l in range(len(n_labels)):
            #print(n_labels[l])
            sent_label.append(int(labels[n_labels[l]][i]))
        label.append(sent_label)

    return premise, conclusion, stance, label

In [24]:
#Randomize them first
train_premises, train_conclusion, train_stance, train_labels = generate_pairwise_input(df_arguments, df_labels)


Index(['Achievement', 'Benevolence: caring', 'Benevolence: dependability',
       'Conformity: interpersonal', 'Conformity: rules', 'Face', 'Hedonism',
       'Humility', 'Power: dominance', 'Power: resources',
       'Security: personal', 'Security: societal', 'Self-direction: action',
       'Self-direction: thought', 'Stimulation', 'Tradition',
       'Universalism: concern', 'Universalism: nature',
       'Universalism: objectivity', 'Universalism: tolerance'],
      dtype='object')
5270 5270


In [25]:
import random 
random.seed(42)
def randomize_data(premises, conclusion, stance, labels):
  n = len(premises)
  data = list(range(n))
  random.shuffle(data)
  train_premises = []
  train_conclusion = []
  train_stance = []
  train_labels = []
  for i in data:
    train_premises.append(premises[i])
    train_conclusion.append(conclusion[i])
    train_stance.append(stance[i])
    train_labels.append(labels[:][i])
  return train_premises, train_conclusion, train_stance, train_labels


In [26]:
# a = train_labels[:][-5:]
# len(a)

In [27]:
train_premises, train_conclusion, train_stance, train_labels = randomize_data(train_premises, train_conclusion, train_stance, train_labels)

In [28]:
val_premises = train_premises[-500:]
val_conclusion = train_conclusion[-500:]
val_stance = train_stance[-500:]
val_labels = train_labels[:][-500:]

In [29]:
train_premises = train_premises[:-500]
train_conclusion = train_conclusion[:-500]
train_stance = train_stance[:-500]
train_labels = train_labels[:][:-500]

In [30]:
# Nothing to do for this class!
import torch
from transformers import BertModel
from transformers import AutoTokenizer
from typing import Dict, List

class BatchTokenizer:
    """Tokenizes and pads a batch of input sentences."""

    def __init__(self):
        """Initializes the tokenizer

        Args:
            pad_symbol (Optional[str], optional): The symbol for a pad. Defaults to "<P>".
        """
        self.hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    
    def get_sep_token(self,):
        return self.hf_tokenizer.sep_token
    
    def __call__(self, prem_batch: List[str], hyp_batch: List[str], stance_batch: List[str]) -> List[List[str]]:
        """Uses the huggingface tokenizer to tokenize and pad a batch.

        We return a dictionary of tensors per the huggingface model specification.

        Args:
            batch (List[str]): A List of sentence strings

        Returns:
            Dict: The dictionary of token specifications provided by HuggingFace
        """
        # The HF tokenizer will PAD for us, and additionally combine 
        # The two sentences deimited by the [SEP] token.
        batch_len = len(prem_batch)
        #spaces = [" "]*batch_len
        conc_batch = [stance_batch[i]+" "+hyp_batch[i] for i in range(batch_len)]
        enc = self.hf_tokenizer(
            prem_batch,
            conc_batch,
            padding=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )

        return enc
    

# HERE IS AN EXAMPLE OF HOW TO USE THE BATCH TOKENIZER
tokenizer = BatchTokenizer()
a = [["this is the premise.", "This is also a premise"], ["this is the hypothesis", "This is a second hypothesis"],["in favour of", "against"]]
x = tokenizer(*a)
print(x)
tokenizer.hf_tokenizer.batch_decode(x["input_ids"])



{'input_ids': tensor([[  101,  2023,  2003,  1996, 18458,  1012,   102,  1999,  7927,  1997,
          2023,  2003,  1996, 10744,   102],
        [  101,  2023,  2003,  2036,  1037, 18458,   102,  2114,  2023,  2003,
          1037,  2117, 10744,   102,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}


['[CLS] this is the premise. [SEP] in favour of this is the hypothesis [SEP]',
 '[CLS] this is also a premise [SEP] against this is a second hypothesis [SEP] [PAD]']

In [31]:
def chunk(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[:][i:i + n]

def chunk_multi(lst1, lst2, lst3, n):
    for i in range(0, len(lst1), n):
        yield lst1[i: i + n], lst2[i: i + n], lst3[i: i + n]
        


In [32]:
sum=0
import numpy as np
# for i in range(5270):
#   sum += np.sum(np.array(train_labels[:][i]))
print(np.sum(np.array(train_labels)))
#print(sum)

16277


In [33]:
against=0
infavour = 0
for i in range(4770):
  if(train_stance[i]=='against'):
    against +=1
  elif(train_stance[i]=='in favor of'):
    infavour += 1
  else:
    print(train_stance[i])


In [34]:
# Notice that since we use huggingface, we tokenize and
# encode in all at once!
batch_size=64
tokenizer = BatchTokenizer()
train_input_batches = [b for b in chunk_multi(train_premises, train_conclusion, train_stance, batch_size)]
# Tokenize + encode
train_input_batches = [tokenizer(*batch) for batch in train_input_batches]

In [35]:
val_input_batches = [b for b in chunk_multi(val_premises, val_conclusion, val_stance, batch_size)]
# Tokenize + encode
val_input_batches = [tokenizer(*batch) for batch in val_input_batches]


In [36]:
len(val_labels[0])

20

In [37]:
def encode_labels(labels: List[List[int]]) -> List[torch.FloatTensor]:
    """Turns the batch of labels into a tensor

    Args:
        labels (List[List[int]]): List of all labels in the batch

    Returns:
        List[torch.FloatTensor]: List of Tensors of all labels in the batch
    """
    
    return torch.LongTensor(labels)


In [38]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")
device

device(type='cpu')

In [39]:
train_label_batches = [b for b in chunk(train_labels, batch_size)]
train_label_batches = [encode_labels(batch) for batch in train_label_batches]

In [40]:
val_label_batches = [b for b in chunk(val_labels, batch_size)]
val_label_batches = [encode_labels(batch) for batch in val_label_batches]

In [41]:
val_label_batches[0][0]

tensor([1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0])

In [42]:
# class NLIClassifier(torch.nn.Module):
#     def __init__(self, output_size: int, hidden_size: int):
#         super().__init__()
#         self.output_size = output_size
#         self.hidden_size = hidden_size
#         # Initialize BERT, which we use instead of a single embedding layer.
#         self.bert = BertModel.from_pretrained("bert-base-uncased")
#         # TODO [OPTIONAL]: Updating all BERT parameters can be slow and memory intensive. 
#         # Freeze them if training is too slow. Notice that the learning
#         # rate should probably be smaller in this case.
#         # Uncommenting out the below 2 lines means only our classification layer will be updated.
#         for param in self.bert.parameters():
#             param.requires_grad = False
#         self.bert_hidden_dimension = self.bert.config.hidden_size
#         print(self.bert_hidden_dimension)
#         # TODO: Add an extra hidden layer in the classifier, projecting
#         #      from the BERT hidden dimension to hidden size.
#         # TODO: Add a relu nonlinearity to be used in the forward method
#         #      https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html
#         self.hidden_layer1 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer2 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer3 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer4 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer5 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer6 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer7 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer8 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer9 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer10 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer11 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer12 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer13 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer14 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer15 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer16 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer17 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer18 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer19 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer20 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layers = [torch.nn.Linear(self.bert_hidden_dimension, 32).to(device) for i in range(self.output_size)]
#         self.classifiers = [torch.nn.Linear(32, 1).to(device) for i in range(self.output_size)]
#         #self.hidden_layer2 = torch.nn.Linear(self.hidden_size, 32)
#         #self.hidden_layer3 = torch.nn.Linear(128, 32)
#         #self.hidden_layer4 = torch.nn.Linear(32, 8)
#         self.relu = torch.nn.ReLU()
#         self.classifier1 = torch.nn.Linear(32, 1)
#         self.classifier2 = torch.nn.Linear(32, 1)
#         self.classifier3 = torch.nn.Linear(32, 1)
#         self.classifier4 = torch.nn.Linear(32, 1)
#         self.classifier5 = torch.nn.Linear(32, 1)
#         self.classifier6 = torch.nn.Linear(32, 1)
#         self.classifier7 = torch.nn.Linear(32, 1)
#         self.classifier8 = torch.nn.Linear(32, 1)
#         self.classifier9 = torch.nn.Linear(32, 1)
#         self.classifier10 = torch.nn.Linear(32, 1)
#         self.classifier11 = torch.nn.Linear(32, 1)
#         self.classifier12 = torch.nn.Linear(32, 1)
#         self.classifier13 = torch.nn.Linear(32, 1)
#         self.classifier14 = torch.nn.Linear(32, 1)
#         self.classifier15 = torch.nn.Linear(32, 1)
#         self.classifier16 = torch.nn.Linear(32, 1)
#         self.classifier17 = torch.nn.Linear(32, 1)
#         self.classifier18 = torch.nn.Linear(32, 1)
#         self.classifier19 = torch.nn.Linear(32, 1)
#         self.classifier20 = torch.nn.Linear(32, 1)
#         #self.classifier = torch.nn.Linear(self.hidden_size, self.output_size)
#         self.log_softmax = torch.nn.LogSoftmax(dim=2)

#     def encode_text(
#         self,
#         symbols: Dict
#     ) -> torch.Tensor:
#         """Encode the (batch of) sequence(s) of token symbols with an LSTM.
#             Then, get the last (non-padded) hidden state for each symbol and return that.

#         Args:
#             symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

#         Returns:
#             torch.Tensor: The final hiddens tate of the LSTM, which represents an encoding of
#                 the entire sentence
#         """
#         # First we get the contextualized embedding for each input symbol
#         # We no longer need an LSTM, since BERT encodes context and 
#         # gives us a single vector describing the sequence in the form of the [CLS] token.
#         embedded = self.bert(**symbols)
#         #print(embedded)
#         #print("Embedded", embedded.pooler_output.shape, embedded.last_hidden_state.shape)
#         # TODO: Get the [CLS] token using the `pooler_output` from 
#         #      The BertModel output. See here: https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
#         #      and check the returns for the forward method.
#         # We want to return a tensor of the form batch_size x 1 x bert_hidden_dimension
#         #raise NotImplementedError
        
#         #pool_output_shape = embedded.pooler_output.shape
#         #return torch.reshape(embedded.pooler_output,(pool_output_shape[0],1,pool_output_shape[1]) )
#         last_hidden_state = embedded.last_hidden_state[:,0,:]
#         hidden_shape = last_hidden_state.shape
#         return torch.reshape(last_hidden_state,(hidden_shape[0],1,hidden_shape[1]) )

#     def forward(
#         self,
#         symbols: Dict,
#     ) -> torch.Tensor:
#         """_summary_

#         Args:
#             symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

#         Returns:
#             torch.Tensor: _description_
#         """
#         encoded_sents = self.encode_text(symbols)
#         #output = self.hidden_layer1(encoded_sents)
#         #output = self.relu(output)
#         #outputs = [self.hidden_layers[i](encoded_sents) for i in range(self.output_size)]
#         #outputs = [self.relu(outputs[i].to(device)) for i in range(self.output_size)]
#         #outputs = [self.classifiers[i](outputs[i].to(device)) for i in range(self.output_size)]
#         # outputs = []
#         # for i in range(self.output_size):
#         #     output = self.hidden_layers[i](encoded_sents)
#         #     output = self.relu(output)
#         #     output = self.classifiers[i](output)
#         #     output = torch.nn.Sigmoid()(output)
#         #     outputs.append(output)
#         output = self.hidden_layer1(encoded_sents)
#         output = self.relu(output)
#         output = self.classifier1(output)
#         output = torch.nn.Sigmoid()(output)
        
#         output2 = self.hidden_layer2(encoded_sents)
#         output2 = self.relu(output2)
#         output2 = self.classifier2(output2)
#         output2 = torch.nn.Sigmoid()(output2)
        
#         output3 = self.hidden_layer1(encoded_sents)
#         output3 = self.relu(output3)
#         output3 = self.classifier1(output3)
#         output3 = torch.nn.Sigmoid()(output3)
        
#         output4 = self.hidden_layer4(encoded_sents)
#         output4 = self.relu(output4)
#         output4 = self.classifier4(output4)
#         output4 = torch.nn.Sigmoid()(output4)
        
#         output5 = self.hidden_layer5(encoded_sents)
#         output5 = self.relu(output5)
#         output5 = self.classifier5(output5)
#         output5 = torch.nn.Sigmoid()(output5)
        
#         output6 = self.hidden_layer6(encoded_sents)
#         output6 = self.relu(output6)
#         output6 = self.classifier6(output6)
#         output6 = torch.nn.Sigmoid()(output6)
        
#         output7 = self.hidden_layer7(encoded_sents)
#         output7 = self.relu(output7)
#         output7 = self.classifier7(output7)
#         output7 = torch.nn.Sigmoid()(output7)
        
#         output8 = self.hidden_layer8(encoded_sents)
#         output8 = self.relu(output8)
#         output8 = self.classifier8(output8)
#         output8 = torch.nn.Sigmoid()(output8)
        
#         output9 = self.hidden_layer9(encoded_sents)
#         output9 = self.relu(output9)
#         output9 = self.classifier9(output9)
#         output9 = torch.nn.Sigmoid()(output9)
        
#         output10 = self.hidden_layer10(encoded_sents)
#         output10 = self.relu(output10)
#         output10 = self.classifier10(output10)
#         output10 = torch.nn.Sigmoid()(output10)
        
#         output11 = self.hidden_layer11(encoded_sents)
#         output11 = self.relu(output11)
#         output11 = self.classifier11(output11)
#         output11 = torch.nn.Sigmoid()(output11)
        
#         output12 = self.hidden_layer12(encoded_sents)
#         output12 = self.relu(output12)
#         output12 = self.classifier12(output12)
#         output12 = torch.nn.Sigmoid()(output12)
        
#         output13 = self.hidden_layer13(encoded_sents)
#         output13 = self.relu(output13)
#         output13 = self.classifier13(output13)
#         output13 = torch.nn.Sigmoid()(output13)
        
#         output14 = self.hidden_layer14(encoded_sents)
#         output14 = self.relu(output14)
#         output14 = self.classifier14(output14)
#         output14 = torch.nn.Sigmoid()(output14)

#         output15 = self.hidden_layer15(encoded_sents)
#         output15 = self.relu(output15)
#         output15 = self.classifier15(output15)
#         output15 = torch.nn.Sigmoid()(output15)
        
#         output16 = self.hidden_layer16(encoded_sents)
#         output16 = self.relu(output16)
#         output16 = self.classifier16(output16)
#         output16 = torch.nn.Sigmoid()(output16)

#         output17 = self.hidden_layer17(encoded_sents)
#         output17 = self.relu(output17)
#         output17 = self.classifier17(output17)
#         output17 = torch.nn.Sigmoid()(output17)
        
#         output18 = self.hidden_layer18(encoded_sents)
#         output18 = self.relu(output18)
#         output18 = self.classifier18(output18)
#         output18 = torch.nn.Sigmoid()(output18)

#         output19 = self.hidden_layer19(encoded_sents)
#         output19 = self.relu(output19)
#         output19 = self.classifier19(output19)
#         output19 = torch.nn.Sigmoid()(output19)
        
#         output20 = self.hidden_layer20(encoded_sents)
#         output20 = self.relu(output20)
#         output20 = self.classifier20(output20)
#         output20 = torch.nn.Sigmoid()(output20)
#         #output = self.hidden_layer2(output)
#         #output = self.relu(output)
#         #output = self.hidden_layer3(output)
#         #output = self.relu(output)
#         #output = self.hidden_layer4(output)
#         #output = self.relu(output)
#         #output = self.classifier(output)
#         #return self.log_softmax(output)
#         return output, output2, output3, output4, output5, output6, output7, output8, output9, output10, output11, output12, output13, output14, output15, output16, output17, output18, output19, output20

In [43]:
# class NLIClassifier(torch.nn.Module):
#     def __init__(self, output_size: int, hidden_size: int):
#         super().__init__()
#         self.output_size = output_size
#         self.hidden_size = hidden_size
#         # Initialize BERT, which we use instead of a single embedding layer.
#         self.bert = BertModel.from_pretrained("bert-base-uncased")
#         # TODO [OPTIONAL]: Updating all BERT parameters can be slow and memory intensive. 
#         # Freeze them if training is too slow. Notice that the learning
#         # rate should probably be smaller in this case.
#         # Uncommenting out the below 2 lines means only our classification layer will be updated.
#         for param in self.bert.parameters():
#             param.requires_grad = False
#         self.bert_hidden_dimension = self.bert.config.hidden_size
#         print(self.bert_hidden_dimension)
#         # TODO: Add an extra hidden layer in the classifier, projecting
#         #      from the BERT hidden dimension to hidden size.
#         # TODO: Add a relu nonlinearity to be used in the forward method
#         #      https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html
#         self.hidden_layer1 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer2 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer3 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer4 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer5 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer6 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer7 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer8 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer9 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer10 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer11 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer12 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer13 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer14 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer15 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer16 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer17 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer18 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer19 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer20 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         # self.hidden_layers = [torch.nn.Linear(self.bert_hidden_dimension, 32).to(device) for i in range(self.output_size)]
#         # self.classifiers = [torch.nn.Linear(32, 1).to(device) for i in range(self.output_size)]
#         # #self.hidden_layer2 = torch.nn.Linear(self.hidden_size, 32)
#         #self.hidden_layer3 = torch.nn.Linear(128, 32)
#         #self.hidden_layer4 = torch.nn.Linear(32, 8)
#         self.hidden_layers = []
#         for i in range(20):
#             self.hidden_layers.append(torch.nn.Linear(self.bert_hidden_dimension, 32))
#         self.relu = torch.nn.ReLU()
#         self.classifier1 = torch.nn.Linear(32, 1)
#         self.classifier2 = torch.nn.Linear(32, 1)
#         self.classifier3 = torch.nn.Linear(32, 1)
#         self.classifier4 = torch.nn.Linear(32, 1)
#         self.classifier5 = torch.nn.Linear(32, 1)
#         self.classifier6 = torch.nn.Linear(32, 1)
#         self.classifier7 = torch.nn.Linear(32, 1)
#         self.classifier8 = torch.nn.Linear(32, 1)
#         self.classifier9 = torch.nn.Linear(32, 1)
#         self.classifier10 = torch.nn.Linear(32, 1)
#         self.classifier11 = torch.nn.Linear(32, 1)
#         self.classifier12 = torch.nn.Linear(32, 1)
#         self.classifier13 = torch.nn.Linear(32, 1)
#         self.classifier14 = torch.nn.Linear(32, 1)
#         self.classifier15 = torch.nn.Linear(32, 1)
#         self.classifier16 = torch.nn.Linear(32, 1)
#         self.classifier17 = torch.nn.Linear(32, 1)
#         self.classifier18 = torch.nn.Linear(32, 1)
#         self.classifier19 = torch.nn.Linear(32, 1)
#         self.classifier20 = torch.nn.Linear(32, 1)
#         self.classifiers = []
#         for i in range(20):
#           self.classifiers.append(torch.nn.Linear(32, 1))
#         #self.classifier = torch.nn.Linear(self.hidden_size, self.output_size)
#         self.log_softmax = torch.nn.LogSoftmax(dim=2)

#     def encode_text(
#         self,
#         symbols: Dict
#     ) -> torch.Tensor:
#         """Encode the (batch of) sequence(s) of token symbols with an LSTM.
#             Then, get the last (non-padded) hidden state for each symbol and return that.

#         Args:
#             symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

#         Returns:
#             torch.Tensor: The final hiddens tate of the LSTM, which represents an encoding of
#                 the entire sentence
#         """
#         # First we get the contextualized embedding for each input symbol
#         # We no longer need an LSTM, since BERT encodes context and 
#         # gives us a single vector describing the sequence in the form of the [CLS] token.
#         embedded = self.bert(**symbols)
#         #print(embedded)
#         #print("Embedded", embedded.pooler_output.shape, embedded.last_hidden_state.shape)
#         # TODO: Get the [CLS] token using the `pooler_output` from 
#         #      The BertModel output. See here: https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
#         #      and check the returns for the forward method.
#         # We want to return a tensor of the form batch_size x 1 x bert_hidden_dimension
#         #raise NotImplementedError
        
#         #pool_output_shape = embedded.pooler_output.shape
#         #return torch.reshape(embedded.pooler_output,(pool_output_shape[0],1,pool_output_shape[1]) )
#         last_hidden_state = embedded.last_hidden_state[:,0,:]
#         hidden_shape = last_hidden_state.shape
#         return torch.reshape(last_hidden_state,(hidden_shape[0],1,hidden_shape[1]) )

#     def forward(
#         self,
#         symbols: Dict,
#     ) -> torch.Tensor:
#         """_summary_

#         Args:
#             symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

#         Returns:
#             torch.Tensor: _description_
#         """
#         encoded_sents = self.encode_text(symbols)
#         #output = self.hidden_layer1(encoded_sents)
#         #output = self.relu(output)
#         #outputs = [self.hidden_layers[i](encoded_sents) for i in range(self.output_size)]
#         #outputs = [self.relu(outputs[i].to(device)) for i in range(self.output_size)]
#         #outputs = [self.classifiers[i](outputs[i].to(device)) for i in range(self.output_size)]
#         # outputs = []
#         # for i in range(self.output_size):
#         #     output = self.hidden_layers[i](encoded_sents)
#         #     output = self.relu(output)
#         #     output = self.classifiers[i](output)
#         #     output = torch.nn.Sigmoid()(output)
#         #     outputs.append(output)
#         # output = self.hidden_layer1(encoded_sents)
#         # output = self.relu(output)
#         # output = self.classifier1(output)
#         # output = torch.nn.Sigmoid()(output)
        
#         # output2 = self.hidden_layer2(encoded_sents)
#         # output2 = self.relu(output2)
#         # output2 = self.classifier2(output2)
#         # output2 = torch.nn.Sigmoid()(output2)
        
#         # output3 = self.hidden_layer1(encoded_sents)
#         # output3 = self.relu(output3)
#         # output3 = self.classifier1(output3)
#         # output3 = torch.nn.Sigmoid()(output3)
        
#         # output4 = self.hidden_layer4(encoded_sents)
#         # output4 = self.relu(output4)
#         # output4 = self.classifier4(output4)
#         # output4 = torch.nn.Sigmoid()(output4)
        
#         # output5 = self.hidden_layer5(encoded_sents)
#         # output5 = self.relu(output5)
#         # output5 = self.classifier5(output5)
#         # output5 = torch.nn.Sigmoid()(output5)
        
#         # output6 = self.hidden_layer6(encoded_sents)
#         # output6 = self.relu(output6)
#         # output6 = self.classifier6(output6)
#         # output6 = torch.nn.Sigmoid()(output6)
        
#         # output7 = self.hidden_layer7(encoded_sents)
#         # output7 = self.relu(output7)
#         # output7 = self.classifier7(output7)
#         # output7 = torch.nn.Sigmoid()(output7)
        
#         # output8 = self.hidden_layer8(encoded_sents)
#         # output8 = self.relu(output8)
#         # output8 = self.classifier8(output8)
#         # output8 = torch.nn.Sigmoid()(output8)
        
#         # output9 = self.hidden_layer9(encoded_sents)
#         # output9 = self.relu(output9)
#         # output9 = self.classifier9(output9)
#         # output9 = torch.nn.Sigmoid()(output9)
        
#         # output10 = self.hidden_layer10(encoded_sents)
#         # output10 = self.relu(output10)
#         # output10 = self.classifier10(output10)
#         # output10 = torch.nn.Sigmoid()(output10)
        
#         # output11 = self.hidden_layer11(encoded_sents)
#         # output11 = self.relu(output11)
#         # output11 = self.classifier11(output11)
#         # output11 = torch.nn.Sigmoid()(output11)
        
#         # output12 = self.hidden_layer12(encoded_sents)
#         # output12 = self.relu(output12)
#         # output12 = self.classifier12(output12)
#         # output12 = torch.nn.Sigmoid()(output12)
        
#         # output13 = self.hidden_layer13(encoded_sents)
#         # output13 = self.relu(output13)
#         # output13 = self.classifier13(output13)
#         # output13 = torch.nn.Sigmoid()(output13)
        
#         # output14 = self.hidden_layer14(encoded_sents)
#         # output14 = self.relu(output14)
#         # output14 = self.classifier14(output14)
#         # output14 = torch.nn.Sigmoid()(output14)

#         # output15 = self.hidden_layer15(encoded_sents)
#         # output15 = self.relu(output15)
#         # output15 = self.classifier15(output15)
#         # output15 = torch.nn.Sigmoid()(output15)
        
#         # output16 = self.hidden_layer16(encoded_sents)
#         # output16 = self.relu(output16)
#         # output16 = self.classifier16(output16)
#         # output16 = torch.nn.Sigmoid()(output16)

#         # output17 = self.hidden_layer17(encoded_sents)
#         # output17 = self.relu(output17)
#         # output17 = self.classifier17(output17)
#         # output17 = torch.nn.Sigmoid()(output17)
        
#         # output18 = self.hidden_layer18(encoded_sents)
#         # output18 = self.relu(output18)
#         # output18 = self.classifier18(output18)
#         # output18 = torch.nn.Sigmoid()(output18)

#         # output19 = self.hidden_layer19(encoded_sents)
#         # output19 = self.relu(output19)
#         # output19 = self.classifier19(output19)
#         # output19 = torch.nn.Sigmoid()(output19)
        
#         # output20 = self.hidden_layer20(encoded_sents)
#         # output20 = self.relu(output20)
#         # output20 = self.classifier20(output20)
#         # output20 = torch.nn.Sigmoid()(output20)
#         # #output = self.hidden_layer2(output)
#         # #output = self.relu(output)
#         # #output = self.hidden_layer3(output)
#         # #output = self.relu(output)
#         # #output = self.hidden_layer4(output)
#         # #output = self.relu(output)
#         # #output = self.classifier(output)
#         # #return self.log_softmax(output)
#         # return output, output2, output3, output4, output5, output6, output7, output8, output9, output10, output11, output12, output13, output14, output15, output16, output17, output18, output19, output20
#         outputs = []
#         for i in range(20):
#             output = self.hidden_layers[i](encoded_sents)
#             output = self.relu(output)
#             output = self.classifiers[i](output)
#             output = torch.nn.Sigmoid()(output)
#             outputs.append(output)
#         return outputs


In [47]:
# For making predictions at test time TODO: Multi-label
def predict(model: torch.nn.Module, sents: torch.Tensor) -> List:
    sents = sents.to(device)
    logits = model(sents)
    res = []
    logitslen = logits[0].shape[0]
    for i in range(logitslen):
        datares = []
        for j in range(20):
            datares.append(logits[j][i][0][0] > 0.5)
        res.append(datares)
    return res
    #return list(torch.argmax(logits, axis=2).squeeze().numpy())
    #print(torch.max(logits), torch.min(logits))
    #return list((logits>0).squeeze())

In [48]:
import numpy as np

from numpy import logical_and, sum as t_sum
def precision(predicted_labels, true_labels, which_label=1):
    """
    Precision is True Positives / All Positives Predictions
    """
    pred_which = np.array([pred == which_label for pred in predicted_labels])
    true_which = np.array([lab == which_label for lab in true_labels])
    denominator = t_sum(pred_which)
    if denominator:
        return t_sum(logical_and(pred_which, true_which))/denominator
    else:
        return 0.


def recall(predicted_labels, true_labels, which_label=1):
    """
    Recall is True Positives / All Positive Labels
    """
    pred_which = np.array([pred == which_label for pred in predicted_labels])
    true_which = np.array([lab == which_label for lab in true_labels])
    denominator = t_sum(true_which)
    if denominator:
        return t_sum(logical_and(pred_which, true_which))/denominator
    else:
        return 0.


def f1_score(
    predicted_labels: List[int],
    true_labels: List[int],
    which_label: int
):
    """
    F1 score is the harmonic mean of precision and recall
    """
    P = precision(predicted_labels, true_labels, which_label=which_label)
    R = recall(predicted_labels, true_labels, which_label=which_label)
    if P and R:
        return 2*P*R/(P+R)
    else:
        return 0.


def macro_f1(
    predicted_labels: List[int],
    true_labels: List[int],
    possible_labels: List[int]
):
    scores = [f1_score(predicted_labels, true_labels, l) for l in possible_labels]
    # Macro, so we take the uniform avg.
    print(scores)
    return sum(scores) / len(scores)

In [42]:
def f1Score_multiLabel(preds, labels):
    nLabels = 20
    relevants = [0]*20
    positives = [0]*20
    truePositives = [0]*20
    for i in range(len(preds)):
        for j in range(nLabels):
            if(preds[i][j]==1):
                positives[j] += 1
                if(labels[i][j]==1):
                    truePositives[j] += 1
    
    for i in range(len(labels)):
        for j in range(nLabels):
            if(labels[i][j]==1):
                relevants[j] += 1
    
    precisions = []*nLabels
    recalls = []*nLabels
    f1Scores = []*nLabels
    precision =0
    recall = 0
    f1 = 0
    #print(truePositives, positives, relevants)
    for i in range(nLabels):
        if(positives[i]>0):
            precision = truePositives[i]/positives[i]
        precisions.append(precision)
        if(relevants[i]>0):
            recall = truePositives[i]/relevants[i]
        recalls.append(recall)
        #print(precision,recall,i)
        if(precision>0 and recall>0):
            f1 = 2 * precision * recall / (precision + recall)
        f1Scores.append(f1)
    precision_mean = np.mean(precisions)
    recall_mean = np.mean(recalls)
    f1_mean = np.mean(f1Scores)
    return f1_mean, precision_mean, recall_mean
    


In [213]:
# import random
# from tqdm import tqdm_notebook as tqdm
# def training_loop(
#     num_epochs,
#     train_features,
#     train_labels,
#     dev_sents,
#     dev_labels,
#     optimizer,
#     #scheduler,
#     model,
# ):
#     print("Training...")
#     loss_func = torch.nn.BCELoss()
#     batches = list(zip(train_features, train_labels))
#     random.shuffle(batches)
#     for i in range(num_epochs):
#         losses = []
#         for features, labels in tqdm(batches):
#             # Empty the dynamic computation graph
#             features = features.to(device)
#             labels = labels.float()
#             labels = labels.to(device)
#             optimizer.zero_grad()
#             #preds0, preds1, preds2, preds3, preds4, preds5, preds6, preds7, preds8, preds9, preds10, preds11, preds12, preds13, preds14, preds15, preds16, preds17, preds18, preds19  = model(features)
#             preds = model(features)

#             #print(preds[0].shape)
#             #featlen = preds[0].shape[0]
#             #preds_temp = torch.empty((featlen, 20), dtype=torch.float)
#             #for k in range(featlen):
#             #    for j in range(20):
#             #        preds_temp[k][j] = preds[j][k][0][0]
#             #preds = preds.squeeze(1)
#             #print("Preds ",preds.shape)
#             #print("Labels ", labels.shape)
#             #preds_temp = preds_temp.to(device)
#             #print(preds_temp.is_cuda, labels.is_cuda)
#             #print(preds_temp.shape, labels.shape)
#             #loss = loss_func(preds_temp, labels)
#             #losses = [0]*20
#             #for i in range(20):
#             #    loss1 = loss_func(preds[i],labels[:,i])
#             #    losses.append(loss1)
#             #print(preds[0].squeeze(1).squeeze(1).shape, labels[:,0].shape)
#             loss0 = loss_func(preds[0].squeeze(1).squeeze(1), labels[:,0])
#             #loss = loss_func(preds0.squeeze(1), labels)
#             loss1 = loss_func(preds[1].squeeze(1).squeeze(1), labels[:,1]) 
#             loss2 = loss_func(preds[2].squeeze(1).squeeze(1), labels[:,2]) 
#             loss3 = loss_func(preds[3].squeeze(1).squeeze(1), labels[:,3]) 
#             loss4 = loss_func(preds[4].squeeze(1).squeeze(1), labels[:,4]) 
#             loss5 = loss_func(preds[5].squeeze(1).squeeze(1), labels[:,5]) 
#             loss6 = loss_func(preds[6].squeeze(1).squeeze(1), labels[:,6]) 
#             loss7 = loss_func(preds[7].squeeze(1).squeeze(1), labels[:,7]) 
#             loss8 = loss_func(preds[8].squeeze(1).squeeze(1), labels[:,8]) 
#             loss9 = loss_func(preds[9].squeeze(1).squeeze(1), labels[:,9]) 
#             loss10 = loss_func(preds[10].squeeze(1).squeeze(1), labels[:,10])
#             loss11 = loss_func(preds[11].squeeze(1).squeeze(1), labels[:,11]) 
#             loss12 = loss_func(preds[12].squeeze(1).squeeze(1), labels[:,12]) 
#             loss13 = loss_func(preds[13].squeeze(1).squeeze(1), labels[:,13]) 
#             loss14 = loss_func(preds[14].squeeze(1).squeeze(1), labels[:,14]) 
#             loss15 = loss_func(preds[15].squeeze(1).squeeze(1), labels[:,15]) 
#             loss16 = loss_func(preds[16].squeeze(1).squeeze(1), labels[:,16]) 
#             loss17 = loss_func(preds[17].squeeze(1).squeeze(1), labels[:,17]) 
#             loss18 = loss_func(preds[18].squeeze(1).squeeze(1), labels[:,18]) 
#             loss19 = loss_func(preds[19].squeeze(1).squeeze(1), labels[:,19])  
#             loss = loss0 + loss1 + loss2 + loss3 + loss4 + loss5 + loss6 + loss7 + loss8 + loss9 + loss10 + loss11 + loss12 + loss13 + loss14 + loss15 + loss16 + loss17 + loss18 + loss19
#             # Backpropogate the loss through our model
#             #loss.register_hook(lambda grad: print(grad))
#             #print(model.hidden_layers[0].weight.grad)
#             #print(loss.grad)
#             loss = loss*1000
#             #print(i,model.hidden_layers[0].weight)
#             loss.backward()
#             #print(model.hidden_layers[0].weight.grad)
#             #print(loss.grad)
#             optimizer.step()
#             #print("After",i, model.hidden_layers[0].weight)
#             losses.append(loss.item())
        
#         print(f"epoch {i}, loss: {np.sum(losses)/len(losses)}")
#         # Estimate the f1 score for the development set
#         print("Evaluating dev...")
#         all_preds = []
#         all_labels = []
#         for sents, labels in tqdm(zip(dev_sents, dev_labels), total=len(dev_sents)):
#             sents = sents.to(device)
#             pred = predict(model, sents)
#             all_preds.extend(pred)
#             all_labels.extend(list(labels))
#         # #print(range(len(set(train_labels))))

#         dev_f1, dev_P, dev_R = f1Score_multiLabel(all_preds, all_labels)
#         print(f"Dev F1 {dev_f1},  Dev Precision {dev_P}, Dev Recall {dev_R}")
#         # # #scheduler.step()
#         #print(optimizer)
#     # Return the trained model
#     return model

In [214]:
# import random
# from tqdm import tqdm_notebook as tqdm
# def training_loop(
#     num_epochs,
#     train_features,
#     train_labels,
#     dev_sents,
#     dev_labels,
#     optimizer,
#     #scheduler,
#     model,
# ):
#     print("Training...")
#     loss_func = torch.nn.BCELoss()
#     batches = list(zip(train_features, train_labels))
#     random.shuffle(batches)
#     for i in range(num_epochs):
#         losses = []
#         for features, labels in tqdm(batches):
#             # Empty the dynamic computation graph
#             features = features.to(device)
#             labels = labels.float()
#             labels = labels.to(device)
#             optimizer.zero_grad()
#             #preds0, preds1, preds2, preds3, preds4, preds5, preds6, preds7, preds8, preds9, preds10, preds11, preds12, preds13, preds14, preds15, preds16, preds17, preds18, preds19  = model(features)
#             preds = model(features)

#             #print(preds[0].shape)
#             #featlen = preds[0].shape[0]
#             #preds_temp = torch.empty((featlen, 20), dtype=torch.float)
#             #for k in range(featlen):
#             #    for j in range(20):
#             #        preds_temp[k][j] = preds[j][k][0][0]
#             #preds = preds.squeeze(1)
#             #print("Preds ",preds.shape)
#             #print("Labels ", labels.shape)
#             #preds_temp = preds_temp.to(device)
#             #print(preds_temp.is_cuda, labels.is_cuda)
#             #print(preds_temp.shape, labels.shape)
#             #loss = loss_func(preds_temp, labels)
#             #losses = [0]*20
#             #for i in range(20):
#             #    loss1 = loss_func(preds[i],labels[:,i])
#             #    losses.append(loss1)
#             #print(preds[0].squeeze(1).squeeze(1).shape, labels[:,0].shape)
#             loss0 = loss_func(preds[0].squeeze(1).squeeze(1), labels[:,0])
#             #loss = loss_func(preds0.squeeze(1), labels)
#             loss1 = loss_func(preds[1].squeeze(1).squeeze(1), labels[:,1]) 
#             loss2 = loss_func(preds[2].squeeze(1).squeeze(1), labels[:,2]) 
#             loss3 = loss_func(preds[3].squeeze(1).squeeze(1), labels[:,3]) 
#             loss4 = loss_func(preds[4].squeeze(1).squeeze(1), labels[:,4]) 
#             loss5 = loss_func(preds[5].squeeze(1).squeeze(1), labels[:,5]) 
#             loss6 = loss_func(preds[6].squeeze(1).squeeze(1), labels[:,6]) 
#             loss7 = loss_func(preds[7].squeeze(1).squeeze(1), labels[:,7]) 
#             loss8 = loss_func(preds[8].squeeze(1).squeeze(1), labels[:,8]) 
#             loss9 = loss_func(preds[9].squeeze(1).squeeze(1), labels[:,9]) 
#             loss10 = loss_func(preds[10].squeeze(1).squeeze(1), labels[:,10])
#             loss11 = loss_func(preds[11].squeeze(1).squeeze(1), labels[:,11]) 
#             loss12 = loss_func(preds[12].squeeze(1).squeeze(1), labels[:,12]) 
#             loss13 = loss_func(preds[13].squeeze(1).squeeze(1), labels[:,13]) 
#             loss14 = loss_func(preds[14].squeeze(1).squeeze(1), labels[:,14]) 
#             loss15 = loss_func(preds[15].squeeze(1).squeeze(1), labels[:,15]) 
#             loss16 = loss_func(preds[16].squeeze(1).squeeze(1), labels[:,16]) 
#             loss17 = loss_func(preds[17].squeeze(1).squeeze(1), labels[:,17]) 
#             loss18 = loss_func(preds[18].squeeze(1).squeeze(1), labels[:,18]) 
#             loss19 = loss_func(preds[19].squeeze(1).squeeze(1), labels[:,19])  
#             loss = loss0 + loss1 + loss2 + loss3 + loss4 + loss5 + loss6 + loss7 + loss8 + loss9 + loss10 + loss11 + loss12 + loss13 + loss14 + loss15 + loss16 + loss17 + loss18 + loss19
#             # Backpropogate the loss through our model
#             #loss.register_hook(lambda grad: print(grad))
#             #print(model.hidden_layers[0].weight.grad)
#             #print(loss.grad)
#             loss = loss*1000
#             #print(i,model.hidden_layers[0].weight)
#             loss.backward()
#             #print(model.hidden_layers[0].weight.grad)
#             #print(loss.grad)
#             optimizer.step()
#             #print("After",i, model.hidden_layers[0].weight)
#             losses.append(loss.item())
        
#         print(f"epoch {i}, loss: {np.sum(losses)/len(losses)}")
#         # Estimate the f1 score for the development set
#         print("Evaluating dev...")
#         all_preds = []
#         all_labels = []
#         for sents, labels in tqdm(zip(dev_sents, dev_labels), total=len(dev_sents)):
#             sents = sents.to(device)
#             pred = predict(model, sents)
#             all_preds.extend(pred)
#             all_labels.extend(list(labels))
#         # #print(range(len(set(train_labels))))

#         dev_f1, dev_P, dev_R = f1Score_multiLabel(all_preds, all_labels)
#         print(f"Dev F1 {dev_f1},  Dev Precision {dev_P}, Dev Recall {dev_R}")
#         # # #scheduler.step()
#         #print(optimizer)
#     # Return the trained model
#     return model

In [215]:
# epochs = 50
# # TODO: Find a good learning rate
# LR = 1e-5

# possible_labels = 20
# model = NLIClassifier(output_size=possible_labels, hidden_size=512)

# optimizer = torch.optim.AdamW(model.parameters(), LR)
# model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


768


In [216]:
# LR = 1e-4
# optimizer = torch.optim.AdamW(model.parameters(), LR)

In [217]:
# model =training_loop(
#     epochs,
#     train_input_batches,
#     train_label_batches,
#     val_input_batches,
#     val_label_batches,
#     optimizer,
#     #scheduler,
#     model,
# )

Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for features, labels in tqdm(batches):


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 0, loss: 10096.968411458334
Evaluating dev...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sents, labels in tqdm(zip(dev_sents, dev_labels), total=len(dev_sents)):


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.0049261083743842365,  Dev Precision 0.25, Dev Recall 0.0002487562189054726


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 1, loss: 8293.17400390625
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.1168773521181125,  Dev Precision 0.4, Dev Recall 0.013223356899711967


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 2, loss: 8067.354036458333
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.12090278576646678,  Dev Precision 0.37081168831168837, Dev Recall 0.030975551953755544


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 3, loss: 7909.934381510417
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.21754909134541425,  Dev Precision 0.6849062472229627, Dev Recall 0.04902560446474307


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 4, loss: 7768.724010416667
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.2638121954568129,  Dev Precision 0.7149518053855569, Dev Recall 0.05877032138290435


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 5, loss: 7639.946666666667
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.2941093005366055,  Dev Precision 0.7162752124754229, Dev Recall 0.06359745066502016


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 6, loss: 7521.068046875
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3209315090723286,  Dev Precision 0.4809999027691732, Dev Recall 0.06819520088436373


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 7, loss: 7418.4901171875
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.23686049744869378,  Dev Precision 0.6481945487500751, Dev Recall 0.07354940514596664


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 8, loss: 7329.204095052083
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.15203691380695183,  Dev Precision 0.6272634773811244, Dev Recall 0.08070905786287973


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 9, loss: 7250.412141927083
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.1770663287582855,  Dev Precision 0.623450777653433, Dev Recall 0.09004532938635298


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 10, loss: 7180.087864583334
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.21654671670560693,  Dev Precision 0.6809531399972577, Dev Recall 0.10936103647922246


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 11, loss: 7117.231087239584
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.24323340253503067,  Dev Precision 0.6679034594955694, Dev Recall 0.12525765325499236


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 12, loss: 7060.667369791667
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.2608049729270882,  Dev Precision 0.6535534511605823, Dev Recall 0.13309008980557366


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 13, loss: 7009.623489583333
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.24893278923077555,  Dev Precision 0.6953447371125934, Dev Recall 0.1388248972248218


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 14, loss: 6963.0047265625
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.25962745093462214,  Dev Precision 0.6874514119515853, Dev Recall 0.14525065108622376


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 15, loss: 6919.97751953125
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.269685509006842,  Dev Precision 0.6805106172579696, Dev Recall 0.15105381189268338


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 16, loss: 6880.33947265625
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.27508363963700727,  Dev Precision 0.6843568354314005, Dev Recall 0.15513606722993475


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 17, loss: 6843.68640625
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.2895034291807901,  Dev Precision 0.6675391251069618, Dev Recall 0.1657595403603947


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 18, loss: 6809.58732421875
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.2974485907781345,  Dev Precision 0.6745024140809706, Dev Recall 0.16846436330250739


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 19, loss: 6777.720631510417
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.30532952258826485,  Dev Precision 0.6753317654371753, Dev Recall 0.1744298375296022


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 20, loss: 6747.65576171875
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3075259477350162,  Dev Precision 0.6599254201168494, Dev Recall 0.17673783790624867


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 21, loss: 6719.274368489583
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.31085600952374504,  Dev Precision 0.6503159492426741, Dev Recall 0.1781054405407621


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 22, loss: 6692.445377604166
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.32212443702875804,  Dev Precision 0.661657180185645, Dev Recall 0.18543055129589248


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 23, loss: 6666.865572916667
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.32332395378975043,  Dev Precision 0.6635929071741491, Dev Recall 0.1862118012958925


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 24, loss: 6642.251041666666
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3258641129528803,  Dev Precision 0.6617793962532732, Dev Recall 0.18904237602368848


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 25, loss: 6618.813424479166
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.33234568693085087,  Dev Precision 0.6611589171823882, Dev Recall 0.19150273390845313


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 26, loss: 6596.1173697916665
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3348626915203359,  Dev Precision 0.6593965563925394, Dev Recall 0.1946823128702928


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 27, loss: 6574.322415364583
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3430842323654767,  Dev Precision 0.6291717545267173, Dev Recall 0.19958906447125913


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 28, loss: 6553.46763671875
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.35638874799391307,  Dev Precision 0.6397003361084574, Dev Recall 0.2052487152155981


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 29, loss: 6533.250768229167
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.35954707605470737,  Dev Precision 0.6418437639664873, Dev Recall 0.20642576037993296


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 30, loss: 6513.739375
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.36287896152470556,  Dev Precision 0.6379282884108721, Dev Recall 0.20755113184475654


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 31, loss: 6494.866399739583
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3650862461259137,  Dev Precision 0.6390075305774648, Dev Recall 0.20986136252430349


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 32, loss: 6476.727109375
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.36319210644656824,  Dev Precision 0.6350326885804828, Dev Recall 0.20928664988062531


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 33, loss: 6459.11853515625
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3650805960929114,  Dev Precision 0.6211851689976691, Dev Recall 0.21090577041904032


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 34, loss: 6442.006380208333
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.36744333198910656,  Dev Precision 0.6217578168865026, Dev Recall 0.21353550705318974


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 35, loss: 6425.491295572917
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3673877591321776,  Dev Precision 0.6199120851814357, Dev Recall 0.2137842632720952


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 36, loss: 6409.480638020833
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.36659682661472626,  Dev Precision 0.6138570631594327, Dev Recall 0.21346780757589268


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 37, loss: 6393.84005859375
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3704555459114608,  Dev Precision 0.6007916363072671, Dev Recall 0.21669469601672736


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 38, loss: 6378.661178385417
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.37365213873314546,  Dev Precision 0.6036347390939109, Dev Recall 0.21944443271763953


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 39, loss: 6363.865872395833
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.37980551901757476,  Dev Precision 0.6113064304009338, Dev Recall 0.22289736731999005


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 40, loss: 6349.396223958333
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3824496092687632,  Dev Precision 0.6101170759101447, Dev Recall 0.22631995499250915


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 41, loss: 6335.1900846354165
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.34254406349849525,  Dev Precision 0.6482770534620419, Dev Recall 0.22947712285126337


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 42, loss: 6321.3806640625
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.34422756446381947,  Dev Precision 0.6490094388416792, Dev Recall 0.23006452713876327


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 43, loss: 6307.579348958333
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3483023228452903,  Dev Precision 0.6521618753134252, Dev Recall 0.23247169194346745


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 44, loss: 6294.166263020833
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3501033060823969,  Dev Precision 0.6494997294131838, Dev Recall 0.23343165232830057


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 45, loss: 6281.0606901041665
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3515262908955171,  Dev Precision 0.6495705602316897, Dev Recall 0.23493984669954887


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 46, loss: 6268.1725065104165
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3541640978792725,  Dev Precision 0.6485421737755839, Dev Recall 0.23679635268310775


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 47, loss: 6255.5157291666665
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.35399607793973875,  Dev Precision 0.6451151653242879, Dev Recall 0.2375225481529064


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 48, loss: 6243.066399739583
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3493076552205716,  Dev Precision 0.6458980308327631, Dev Recall 0.23973788579658456


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 49, loss: 6230.8771875
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.348030067737655,  Dev Precision 0.6450152869495663, Dev Recall 0.23921822768310777


In [141]:
# a = (1,2,3)
# a[1]

2

In [None]:
# print("Evaluating dev...")
# all_preds = []
# all_labels = []
# for sents, labels in tqdm(zip(val_input_batches, val_label_batches), total=len(val_input_batches)):
#     pred = predict(model, sents)
#     all_preds.extend(pred)
#     all_labels.extend(list(labels.numpy()))
# # #print(range(len(set(train_labels))))

# dev_f1, dev_P, dev_R = f1Score_multiLabel(all_preds, all_labels)
# print(f"Dev F1 {dev_f1},  Dev Precision {dev_P}, Dev Recall {dev_R}")


Evaluating dev...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sents, labels in tqdm(zip(val_input_batches, val_label_batches), total=len(val_input_batches)):


  0%|          | 0/8 [00:00<?, ?it/s]

[140, 114, 2, 0, 34, 0, 5, 34, 31, 37, 185, 0, 91, 0, 13, 0, 0, 0, 10, 50] [500, 500, 12, 9, 153, 0, 500, 500, 500, 500, 500, 0, 394, 0, 493, 0, 1, 0, 82, 482] [140, 114, 53, 6, 108, 22, 5, 34, 31, 37, 185, 145, 105, 62, 13, 39, 171, 26, 99, 52]
0.28 1.0 0
0.228 1.0 1
0.16666666666666666 0.03773584905660377 2
0.0 0.0 3
0.2222222222222222 0.3148148148148148 4
0.2222222222222222 0.0 5
0.01 1.0 6
0.068 1.0 7
0.062 1.0 8
0.074 1.0 9
0.37 1.0 10
0.37 0.0 11
0.23096446700507614 0.8666666666666667 12
0.23096446700507614 0.0 13
0.02636916835699797 1.0 14
0.02636916835699797 0.0 15
0.0 0.0 16
0.0 0.0 17
0.12195121951219512 0.10101010101010101 18
0.1037344398340249 0.9615384615384616 19
Dev F1 0.5140882946543324,  Dev Precision 0.140673202059074, Dev Recall 0.5140882946543324


In [None]:
# np.sum(all_preds)

0

In [None]:
# def f1Score_multiLabel(preds, labels):
#     nLabels = 20
#     relevants = [0]*20
#     positives = [0]*20
#     truePositives = [0]*20
#     for i in range(len(preds)):
#         for j in range(nLabels):
#             if(preds[i][j]==1):
#                 positives[j] += 1
#                 if(labels[i][j]==1):
#                     truePositives[j] += 1
    
#     for i in range(len(labels)):
#         for j in range(nLabels):
#             if(labels[i][j]==1):
#                 relevants[j] += 1
    
#     precisions = []*nLabels
#     recalls = []*nLabels
#     f1Scores = []*nLabels
#     precision =0
#     recall = 0
#     f1 = 0
#     print(truePositives, positives, relevants)
#     for i in range(nLabels):
#         if(positives[i]>0):
#             precision = truePositives[i]/positives[i]
#         precisions.append(precision)
#         if(relevants[i]>0):
#             recall = truePositives[i]/relevants[i]
#         recalls.append(recall)
#         print(precision,recall,i)
#         if(positives[i]>0 and relevants[i]>0):
#             f1 = 2 * precision * recall / (precision + recall)
#         f1Scores.append(f1)
#     precision_mean = np.mean(precisions)
#     recall_mean = np.mean(recalls)
#     f1_mean = np.mean(recalls)
#     return f1_mean, precision_mean, recall_mean
    


In [None]:
# train_label_batches[0][0][10]

tensor(0)

In [None]:
# from torch import nn
# loss = nn.CrossEntropyLoss()
# input = torch.randn(5, requires_grad=True)
# #target = torch.empty(3, dtype=torch.long).random_(5)
# target = torch.randn(5).softmax(dim=0)
# print(input, target)


tensor([-0.0589,  0.2156, -1.1975, -0.6472, -0.6455], requires_grad=True) tensor([0.2369, 0.0586, 0.4756, 0.1383, 0.0906])


In [None]:
# # aoutput = loss(input, target)
# print(output)

tensor(1.9810, grad_fn=<DivBackward1>)


In [None]:
# a = torch.log(input)
# a

tensor([[-0.1021,  0.0259,     nan, -0.8798,  0.3323]], grad_fn=<LogBackward0>)

In [None]:
# a*target

tensor([[-0.0537,  0.0045,     nan, -0.0182,  0.0056]], grad_fn=<MulBackward0>)

In [None]:
# target = torch.empty(1, dtype=torch.long)
# target[0] = 1
# output = loss(input, target)
# print(output)

tensor(1.5930, grad_fn=<NllLossBackward0>)


In [None]:
# i = 1
# losses=[10]
# print(f"epoch {i}, loss: {np.sum(losses)/len(losses)}")

epoch 1, loss: 10.0


In [None]:
# y.dtype

torch.float32

In [None]:
# from torch import nn
# loss = nn.CrossEntropyLoss()
# input = torch.randn(2, 5, requires_grad=True)
# input

tensor([[-0.4920, -0.2966, -0.1552,  0.1589,  0.5150],
        [-0.3642,  0.2618,  0.9885,  0.9222, -0.0693]], requires_grad=True)

In [None]:
# a = input[:,2]
# a

tensor([-0.1552,  0.9885], grad_fn=<SelectBackward0>)

In [43]:
class ValEvalClassifier(torch.nn.Module):
    def __init__(self, output_size: int, hidden_size: int):
        super().__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size
        # Initialize BERT, which we use instead of a single embedding layer.
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # TODO [OPTIONAL]: Updating all BERT parameters can be slow and memory intensive. 
        # Freeze them if training is too slow. Notice that the learning
        # rate should probably be smaller in this case.
        # Uncommenting out the below 2 lines means only our classification layer will be updated.
        for param in self.bert.parameters():
            param.requires_grad = False
        self.bert_hidden_dimension = self.bert.config.hidden_size
        print(self.bert_hidden_dimension)
        # TODO: Add an extra hidden layer in the classifier, projecting
        #      from the BERT hidden dimension to hidden size.
        # TODO: Add a relu nonlinearity to be used in the forward method
        #      https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html
        self.hidden_layer = torch.nn.Linear(self.bert_hidden_dimension, 512)
        hidden_dimension = 512
        hidden_dimension_2 = 128
        hidden_dimension_3 = 32
        
        self.hidden_layer1 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer2 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer3 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer4 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer5 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer6 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer7 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer8 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer9 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer10 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer11 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer12 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer13 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer14 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer15 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer16 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer17 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer18 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer19 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        self.hidden_layer20 = torch.nn.Linear(hidden_dimension, hidden_dimension_2)
        
        
        self.hidden_layer1_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer2_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer3_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer4_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer5_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer6_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer7_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer8_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer9_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer10_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer11_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer12_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer13_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer14_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer15_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer16_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer17_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer18_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer19_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        self.hidden_layer20_2 = torch.nn.Linear(hidden_dimension_2, hidden_dimension_3)
        
        self.relu = torch.nn.ReLU()
        
        self.classifier1 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier2 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier3 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier4 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier5 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier6 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier7 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier8 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier9 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier10 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier11 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier12 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier13 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier14 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier15 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier16 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier17 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier18 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier19 = torch.nn.Linear(hidden_dimension_3, 1)
        self.classifier20 = torch.nn.Linear(hidden_dimension_3, 1)
        #self.classifier = torch.nn.Linear(self.hidden_size, self.output_size)
        self.log_softmax = torch.nn.LogSoftmax(dim=2)

    def encode_text(
        self,
        symbols: Dict
    ) -> torch.Tensor:
        """Encode the (batch of) sequence(s) of token symbols with an LSTM.
            Then, get the last (non-padded) hidden state for each symbol and return that.

        Args:
            symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

        Returns:
            torch.Tensor: The final hiddens tate of the LSTM, which represents an encoding of
                the entire sentence
        """
        # First we get the contextualized embedding for each input symbol
        # We no longer need an LSTM, since BERT encodes context and 
        # gives us a single vector describing the sequence in the form of the [CLS] token.
        embedded = self.bert(**symbols)
        #print(embedded)
        #print("Embedded", embedded.pooler_output.shape, embedded.last_hidden_state.shape)
        # TODO: Get the [CLS] token using the `pooler_output` from 
        #      The BertModel output. See here: https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
        #      and check the returns for the forward method.
        # We want to return a tensor of the form batch_size x 1 x bert_hidden_dimension
        #raise NotImplementedError
        
        #pool_output_shape = embedded.pooler_output.shape
        #return torch.reshape(embedded.pooler_output,(pool_output_shape[0],1,pool_output_shape[1]) )
        last_hidden_state = embedded.last_hidden_state[:,0,:]
        hidden_shape = last_hidden_state.shape
        return torch.reshape(last_hidden_state,(hidden_shape[0],1,hidden_shape[1]) )

    def forward(
        self,
        symbols: Dict,
    ) -> torch.Tensor:
        """_summary_

        Args:
            symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

        Returns:
            torch.Tensor: _description_
        """
        encoded_sents = self.encode_text(symbols)
        hidden_output = self.hidden_layer(encoded_sents)
        output = self.hidden_layer1(hidden_output)
        output = self.relu(output)
        
        output2 = self.hidden_layer2(hidden_output)
        output2 = self.relu(output2)
        
        output3 = self.hidden_layer3(hidden_output)
        output3 = self.relu(output3)
        
        output4 = self.hidden_layer4(hidden_output)
        output4 = self.relu(output4)
        
        output5 = self.hidden_layer5(hidden_output)
        output5 = self.relu(output5)
        
        output6 = self.hidden_layer6(hidden_output)
        output6 = self.relu(output6)
        
        output7 = self.hidden_layer7(hidden_output)
        output7 = self.relu(output7)
        
        output8 = self.hidden_layer8(hidden_output)
        output8 = self.relu(output8)
        
        output9 = self.hidden_layer9(hidden_output)
        output9 = self.relu(output9)
        
        output10 = self.hidden_layer10(hidden_output)
        output10 = self.relu(output10)
        
        output11 = self.hidden_layer11(hidden_output)
        output11 = self.relu(output11)
        
        output12 = self.hidden_layer12(hidden_output)
        output12 = self.relu(output12)
        
        output13 = self.hidden_layer13(hidden_output)
        output13 = self.relu(output13)
        
        output14 = self.hidden_layer14(hidden_output)
        output14 = self.relu(output14)
        
        output15 = self.hidden_layer15(hidden_output)
        output15 = self.relu(output15)
        
        output16 = self.hidden_layer16(hidden_output)
        output16 = self.relu(output16)
        
        output17 = self.hidden_layer17(hidden_output)
        output17 = self.relu(output17)
        
        output18 = self.hidden_layer18(hidden_output)
        output18 = self.relu(output18)
        
        output19 = self.hidden_layer19(hidden_output)
        output19 = self.relu(output19)
        
        output20 = self.hidden_layer20(hidden_output)
        output20 = self.relu(output20)
        
        #Add layers
        output_add1 = torch.add(output, output2)
        output_add2 = torch.add(output3, output4)
        output_add3 = torch.add(output5, output6)
        output_add4 = torch.add(output7, output8)
        output_add5 = torch.add(output9, output10)
        output_add6 = torch.add(output11, output12)
        output_add7 = torch.add(output13, output14)
        output_add8 = torch.add(output15, output16)
        output_add9 = torch.add(output17, output18)
        output_add10 = torch.add(output19, output20)

        output_add11 = torch.add(output_add1, output_add2)
        output_add12 = torch.add(output_add3, output_add4)
        output_add13 = torch.add(output_add5, output_add6)
        output_add14 = torch.add(output_add7, output_add8)
        output_add15 = torch.add(output_add9, output_add10)

        output_add21 = torch.add(output_add11, output_add12)
        output_add22 = torch.add(output_add13, output_add14)
        output_add23 = torch.add(output_add22, output_add15)

        output_add31 =torch.add(output_add21, output_add23)
        
        
        #Second Hidden Layer
        
        # output = self.hidden_layer1_2(output)
        output = self.hidden_layer1_2(output_add31)
        output = self.relu(output)
        
        # output2 = self.hidden_layer2_2(output2)
        output2 = self.hidden_layer2_2(output_add31)
        output2 = self.relu(output2)
        
        # output3 = self.hidden_layer3_2(output3)
        output3 = self.hidden_layer3_2(output_add31)
        output3 = self.relu(output3)
        
        # output4 = self.hidden_layer4_2(output4)
        output4 = self.hidden_layer4_2(output_add31)
        output4 = self.relu(output4)
        
        # output5 = self.hidden_layer5_2(output5)
        output5 = self.hidden_layer5_2(output_add31)
        output5 = self.relu(output5)
        
        # output6 = self.hidden_layer6_2(output6)
        output6 = self.hidden_layer6_2(output_add31)
        output6 = self.relu(output6)
        
        # output7 = self.hidden_layer7_2(output7)
        output7 = self.hidden_layer7_2(output_add31)
        output7 = self.relu(output7)
        
        # output8 = self.hidden_layer8_2(output8)
        output8 = self.hidden_layer8_2(output_add31)
        output8 = self.relu(output8)
        
        # output9 = self.hidden_layer9_2(output9)
        output9 = self.hidden_layer9_2(output_add31)
        output9 = self.relu(output9)
        
        # output10 = self.hidden_layer10_2(output10)
        output10 = self.hidden_layer10_2(output_add31)
        output10 = self.relu(output10)
        
        # output11 = self.hidden_layer11_2(output11)
        output11 = self.hidden_layer11_2(output_add31)
        output11 = self.relu(output11)
        
        # output12 = self.hidden_layer12_2(output12)
        output12 = self.hidden_layer12_2(output_add31)
        output12 = self.relu(output12)
        
        # output13 = self.hidden_layer13_2(output13)
        output13 = self.hidden_layer13_2(output_add31)
        output13 = self.relu(output13)
        
        # output14 = self.hidden_layer14_2(output14)
        output14 = self.hidden_layer14_2(output_add31)
        output14 = self.relu(output14)
        
        # output15 = self.hidden_layer15_2(output15)
        output15 = self.hidden_layer15_2(output_add31)
        output15 = self.relu(output15)
        
        # output16 = self.hidden_layer16_2(output16)
        output16 = self.hidden_layer16_2(output_add31)
        output16 = self.relu(output16)
        
        # output17 = self.hidden_layer17_2(output17)
        output17 = self.hidden_layer17_2(output_add31)
        output17 = self.relu(output17)
        
        # output18 = self.hidden_layer18_2(output18)
        output18 = self.hidden_layer18_2(output_add31)
        output18 = self.relu(output18)

        # output19 = self.hidden_layer19_2(output19)
        output19 = self.hidden_layer19_2(output_add31)
        output19 = self.relu(output19)

        # output20 = self.hidden_layer20_2(output20)
        output20 = self.hidden_layer20_2(output_add31)
        output20 = self.relu(output20)
        

        output = self.classifier1(output)
        output = torch.nn.Sigmoid()(output)
        
        output2 = self.classifier2(output2)
        output2 = torch.nn.Sigmoid()(output2)
        
        output3 = self.classifier1(output3)
        output3 = torch.nn.Sigmoid()(output3)
        
        output4 = self.classifier4(output4)
        output4 = torch.nn.Sigmoid()(output4)
        
        output5 = self.classifier5(output5)
        output5 = torch.nn.Sigmoid()(output5)
        
        output6 = self.classifier6(output6)
        output6 = torch.nn.Sigmoid()(output6)
        
        output7 = self.classifier7(output7)
        output7 = torch.nn.Sigmoid()(output7)
        
        output8 = self.classifier8(output8)
        output8 = torch.nn.Sigmoid()(output8)
        
        output9 = self.classifier9(output9)
        output9 = torch.nn.Sigmoid()(output9)
        
        output10 = self.classifier10(output10)
        output10 = torch.nn.Sigmoid()(output10)
        
        output11 = self.classifier11(output11)
        output11 = torch.nn.Sigmoid()(output11)
        
        output12 = self.classifier12(output12)
        output12 = torch.nn.Sigmoid()(output12)
        
        output13 = self.classifier13(output13)
        output13 = torch.nn.Sigmoid()(output13)
        
        output14 = self.classifier14(output14)
        output14 = torch.nn.Sigmoid()(output14)

        output15 = self.classifier15(output15)
        output15 = torch.nn.Sigmoid()(output15)
        
        output16 = self.classifier16(output16)
        output16 = torch.nn.Sigmoid()(output16)

        output17 = self.classifier17(output17)
        output17 = torch.nn.Sigmoid()(output17)
        
        output18 = self.classifier18(output18)
        output18 = torch.nn.Sigmoid()(output18)

        output19 = self.classifier19(output19)
        output19 = torch.nn.Sigmoid()(output19)

        output20 = self.classifier20(output20)
        output20 = torch.nn.Sigmoid()(output20)
        
        return output, output2, output3, output4, output5, output6, output7, output8, output9, output10, output11, output12, output13, output14, output15, output16, output17, output18, output19, output20

In [44]:
# For making predictions at test time TODO: Multi-label
def predict(model: torch.nn.Module, sents: torch.Tensor) -> List:
    sents = sents.to(device)
    logits = model(sents)
    res = []
    logitslen = logits[0].shape[0]
    for i in range(logitslen):
        datares = []
        for j in range(20):
            datares.append(logits[j][i][0][0] > 0.5)
        res.append(datares)
    return res
    #return list(torch.argmax(logits, axis=2).squeeze().numpy())
    #print(torch.max(logits), torch.min(logits))
    #return list((logits>0).squeeze())

In [45]:
import random
from tqdm import tqdm_notebook as tqdm
def training_loop(
    num_epochs,
    train_features,
    train_labels,
    dev_sents,
    dev_labels,
    optimizer,
    #scheduler,
    model,
):
    print("Training...")
    loss_func = torch.nn.BCELoss()
    batches = list(zip(train_features, train_labels))
    random.shuffle(batches)
    for i in range(num_epochs):
        losses = []
        for features, labels in tqdm(batches):
            # Empty the dynamic computation graph
            features = features.to(device)
            labels = labels.float()
            labels = labels.to(device)
            optimizer.zero_grad()
            #preds0, preds1, preds2, preds3, preds4, preds5, preds6, preds7, preds8, preds9, preds10, preds11, preds12, preds13, preds14, preds15, preds16, preds17, preds18, preds19  = model(features)
            preds = model(features)

            #print(preds[0].shape)
            #featlen = preds[0].shape[0]
            #preds_temp = torch.empty((featlen, 20), dtype=torch.float)
            #for k in range(featlen):
            #    for j in range(20):
            #        preds_temp[k][j] = preds[j][k][0][0]
            #preds = preds.squeeze(1)
            #print("Preds ",preds.shape)
            #print("Labels ", labels.shape)
            #preds_temp = preds_temp.to(device)
            #print(preds_temp.is_cuda, labels.is_cuda)
            #print(preds_temp.shape, labels.shape)
            #loss = loss_func(preds_temp, labels)
            #losses = [0]*20
            #for i in range(20):
            #    loss1 = loss_func(preds[i],labels[:,i])
            #    losses.append(loss1)
            #print(preds[0].squeeze(1).squeeze(1).shape, labels[:,0].shape)
            loss0 = loss_func(preds[0].squeeze(1).squeeze(1), labels[:,0])
            #loss = loss_func(preds0.squeeze(1), labels)
            loss1 = loss_func(preds[1].squeeze(1).squeeze(1), labels[:,1]) 
            loss2 = loss_func(preds[2].squeeze(1).squeeze(1), labels[:,2]) 
            loss3 = loss_func(preds[3].squeeze(1).squeeze(1), labels[:,3]) 
            loss4 = loss_func(preds[4].squeeze(1).squeeze(1), labels[:,4]) 
            loss5 = loss_func(preds[5].squeeze(1).squeeze(1), labels[:,5]) 
            loss6 = loss_func(preds[6].squeeze(1).squeeze(1), labels[:,6]) 
            loss7 = loss_func(preds[7].squeeze(1).squeeze(1), labels[:,7]) 
            loss8 = loss_func(preds[8].squeeze(1).squeeze(1), labels[:,8]) 
            loss9 = loss_func(preds[9].squeeze(1).squeeze(1), labels[:,9]) 
            loss10 = loss_func(preds[10].squeeze(1).squeeze(1), labels[:,10])
            loss11 = loss_func(preds[11].squeeze(1).squeeze(1), labels[:,11]) 
            loss12 = loss_func(preds[12].squeeze(1).squeeze(1), labels[:,12]) 
            loss13 = loss_func(preds[13].squeeze(1).squeeze(1), labels[:,13]) 
            loss14 = loss_func(preds[14].squeeze(1).squeeze(1), labels[:,14]) 
            loss15 = loss_func(preds[15].squeeze(1).squeeze(1), labels[:,15]) 
            loss16 = loss_func(preds[16].squeeze(1).squeeze(1), labels[:,16]) 
            loss17 = loss_func(preds[17].squeeze(1).squeeze(1), labels[:,17]) 
            loss18 = loss_func(preds[18].squeeze(1).squeeze(1), labels[:,18]) 
            loss19 = loss_func(preds[19].squeeze(1).squeeze(1), labels[:,19])  
            loss = loss0 + loss1 + loss2 + loss3 + loss4 + loss5 + loss6 + loss7 + loss8 + loss9 + loss10 + loss11 + loss12 + loss13 + loss14 + loss15 + loss16 + loss17 + loss18 + loss19
            # Backpropogate the loss through our model
            #loss.register_hook(lambda grad: print(grad))
            #print(model.hidden_layers[0].weight.grad)
            #print(loss.grad)
            loss = loss*1000
            #print(i,model.hidden_layers[0].weight)
            loss.backward()
            #print(model.hidden_layers[0].weight.grad)
            #print(loss.grad)
            optimizer.step()
            #print("After",i, model.hidden_layers[0].weight)
            losses.append(loss.item())
        
        print(f"epoch {i}, loss: {np.sum(losses)/len(losses)}")
        # Estimate the f1 score for the development set
        print("Evaluating dev...")
        all_preds = []
        all_labels = []
        for sents, labels in tqdm(zip(dev_sents, dev_labels), total=len(dev_sents)):
            sents = sents.to(device)
            pred = predict(model, sents)
            all_preds.extend(pred)
            all_labels.extend(list(labels))
        # #print(range(len(set(train_labels))))

        dev_f1, dev_P, dev_R = f1Score_multiLabel(all_preds, all_labels)
        print(f"Dev F1 {dev_f1},  Dev Precision {dev_P}, Dev Recall {dev_R}")
        # # #scheduler.step()
        #print(optimizer)
    # Return the trained model
    return model

In [46]:
epochs = 50
# TODO: Find a good learning rate
LR = 1e-4

possible_labels = 20
model = ValEvalClassifier(output_size=possible_labels, hidden_size=512)

optimizer = torch.optim.AdamW(model.parameters(), LR)
model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


768


In [48]:
epochs = 100

In [None]:
model =training_loop(
    epochs,
    train_input_batches,
    train_label_batches,
    val_input_batches,
    val_label_batches,
    optimizer,
    #scheduler,
    model,
)

Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for features, labels in tqdm(batches):


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 0, loss: 7601.568098958333
Evaluating dev...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sents, labels in tqdm(zip(dev_sents, dev_labels), total=len(dev_sents)):


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.2794986258543002,  Dev Precision 0.5394983567982038, Dev Recall 0.09604256407283462


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 1, loss: 7228.3250520833335
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.2821512696541436,  Dev Precision 0.6650822305415504, Dev Recall 0.14608110306730868


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 2, loss: 6984.906979166667
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3384893675627717,  Dev Precision 0.6741434234107596, Dev Recall 0.17881344558658013


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 3, loss: 6814.83185546875
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3657534750934789,  Dev Precision 0.6619101964795061, Dev Recall 0.19950396359591177


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 4, loss: 6683.08853515625
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3906908025156662,  Dev Precision 0.6462512368070099, Dev Recall 0.21807146662068094


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 5, loss: 6573.493424479167
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4192399994935556,  Dev Precision 0.645536349726706, Dev Recall 0.23380978894031118


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 6, loss: 6480.4767578125
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3899465906829485,  Dev Precision 0.7111839553629117, Dev Recall 0.23935702144726295


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 7, loss: 6399.486861979167
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3798076224101272,  Dev Precision 0.72596210445239, Dev Recall 0.24916814326745773


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 8, loss: 6326.248515625
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.40566853567021505,  Dev Precision 0.7053834016468379, Dev Recall 0.2613036945880832


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 9, loss: 6259.997727864583
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.39566840249149143,  Dev Precision 0.6897046272322508, Dev Recall 0.26790706591611396


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 10, loss: 6198.67916015625
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4028547417573451,  Dev Precision 0.7072173872383881, Dev Recall 0.27286232300488944


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 11, loss: 6141.706783854167
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.3937891764961841,  Dev Precision 0.7257162169215851, Dev Recall 0.2807114378814628


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 12, loss: 6087.969694010417
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.40346213798309744,  Dev Precision 0.7257660756571153, Dev Recall 0.28865856015740776


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 13, loss: 6035.76833984375
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.40601539594179104,  Dev Precision 0.7259678310926644, Dev Recall 0.29045913580784494


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 14, loss: 5986.64427734375
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.41219006620720977,  Dev Precision 0.723962165724285, Dev Recall 0.2982209031619556


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 15, loss: 5939.1905859375
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.42119531793497017,  Dev Precision 0.6995371766152159, Dev Recall 0.3042702480323436


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 16, loss: 5892.119856770833
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4215164373627812,  Dev Precision 0.6983338929333796, Dev Recall 0.3044899620331568


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 17, loss: 5847.6488671875
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4274032232076522,  Dev Precision 0.6735722927726341, Dev Recall 0.30708318223332076


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 18, loss: 5803.308346354167
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.42981107830012455,  Dev Precision 0.6759915055769015, Dev Recall 0.30929973434513125


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 19, loss: 5760.344791666666
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.413006561529721,  Dev Precision 0.7054289799709512, Dev Recall 0.31375367620783756


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 20, loss: 5718.178313802084
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.41372864956863625,  Dev Precision 0.651256697361934, Dev Recall 0.3149879658453094


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 21, loss: 5676.804446614583
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.41750391503331646,  Dev Precision 0.6182751059893266, Dev Recall 0.3169695179283044


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 22, loss: 5635.116953125
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.41765297570067456,  Dev Precision 0.6188035592683445, Dev Recall 0.3165062270156701


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 23, loss: 5594.2184375
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4268186356593947,  Dev Precision 0.6169222533458162, Dev Recall 0.3214760266375353


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 24, loss: 5553.539440104167
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.427895954497271,  Dev Precision 0.6256209353366687, Dev Recall 0.3215783707212036


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 25, loss: 5513.118776041667
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4280930762638954,  Dev Precision 0.6203078321192644, Dev Recall 0.32204686160014223


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 26, loss: 5472.9885546875
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.42997679856317006,  Dev Precision 0.6157563730303408, Dev Recall 0.32512150578566223


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 27, loss: 5432.611647135417
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4343768643725213,  Dev Precision 0.6051187691795256, Dev Recall 0.32982550190359744


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 28, loss: 5393.568932291667
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.43509122131530625,  Dev Precision 0.6068782622394558, Dev Recall 0.33092872499107806


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 29, loss: 5354.402428385417
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.43942588149998996,  Dev Precision 0.6027980267826616, Dev Recall 0.3352894690641556


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 30, loss: 5314.828235677083
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.44297173113525357,  Dev Precision 0.6080701494564493, Dev Recall 0.3392480665117553


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 31, loss: 5275.13189453125
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.44472647267907506,  Dev Precision 0.6050028844978445, Dev Recall 0.3410148611306794


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 32, loss: 5236.592688802083
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4488125112425495,  Dev Precision 0.6054311134997923, Dev Recall 0.34508588514201366


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 33, loss: 5196.990419921875
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4257808184831705,  Dev Precision 0.6293878362054264, Dev Recall 0.34798654059020795


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 34, loss: 5157.906175130208
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.42391395849393526,  Dev Precision 0.6143278239272796, Dev Recall 0.3483979789987991


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 35, loss: 5118.020849609375
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.42505859854639744,  Dev Precision 0.6129556741892691, Dev Recall 0.3503672339797811


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 36, loss: 5077.7184049479165
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4251176668008155,  Dev Precision 0.6107530046365666, Dev Recall 0.34937232108925176


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 37, loss: 5038.567666015625
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4254831795941708,  Dev Precision 0.608571665589665, Dev Recall 0.35066048603053585


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 38, loss: 4998.881461588541
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4193250457022534,  Dev Precision 0.6323537683472542, Dev Recall 0.3555209694292136


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 39, loss: 4959.336985677083
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4213728730127591,  Dev Precision 0.6303192703814764, Dev Recall 0.3567103351008554


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 40, loss: 4918.692213541667
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.42120308121373357,  Dev Precision 0.6305141666718667, Dev Recall 0.35721248375134046


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 41, loss: 4879.806178385416
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4213237775711426,  Dev Precision 0.6259587932200549, Dev Recall 0.35913433694407976


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 42, loss: 4840.188759765625
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.42071394717016075,  Dev Precision 0.6228745769000341, Dev Recall 0.35984474037723163


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 43, loss: 4800.426194661458
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4221493986339099,  Dev Precision 0.623568376840236, Dev Recall 0.36247706481422715


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 44, loss: 4760.840387369792
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4219450485752695,  Dev Precision 0.617205151124432, Dev Recall 0.3633796694159454


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 45, loss: 4721.095654296875
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4228376669192782,  Dev Precision 0.6176097304232067, Dev Recall 0.3647467466247873


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 46, loss: 4681.961477864584
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.42686479970790847,  Dev Precision 0.6121770600770035, Dev Recall 0.3677010696145021


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 47, loss: 4642.761155598958
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.42619210271296265,  Dev Precision 0.6091927853160557, Dev Recall 0.3671590198821223


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 48, loss: 4603.07962890625
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4260990077124223,  Dev Precision 0.6031432496148579, Dev Recall 0.36825401928539236


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 49, loss: 4562.644251302083
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.4270711137368658,  Dev Precision 0.6010814572589597, Dev Recall 0.36951832924767947


  0%|          | 0/75 [00:00<?, ?it/s]

In [223]:
input1 = torch.randn(64, 1, 32, requires_grad=True)
input2 = torch.randn(64, 1, 32, requires_grad=True)
a = torch.cat((input1, input2), dim=2)
print(a.shape)

torch.Size([64, 1, 64])
