In [1]:
!python -m pip install transformers



In [2]:
import traceback
import csv

import pandas as pd


def write_tsv_dataframe(filepath, dataframe):
    """
        Stores `DataFrame` as tsv file

        Parameters
        ----------
        filepath : str
            Path to tsv file
        dataframe : pd.DataFrame
            DataFrame to store

        Raises
        ------
        IOError
            if the file can't be opened
    """
    try:
        dataframe.to_csv(filepath, encoding='utf-8', sep='\t', index=False, header=True, quoting=csv.QUOTE_NONE)
    except IOError:
        traceback.print_exc()


  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [3]:
def combine_columns(df_arguments, df_labels):
    """Combines the two `DataFrames` on column `Argument ID`"""
    return pd.merge(df_arguments, df_labels, on='Argument ID')


In [4]:
def split_arguments(df_arguments):
    """Splits `DataFrame` by column `Usage` into `train`-, `validation`-, and `test`-arguments"""
    train_arguments = df_arguments.loc[df_arguments['Usage'] == 'train'].drop(['Usage'], axis=1).reset_index(drop=True)
    valid_arguments = df_arguments.loc[df_arguments['Usage'] == 'validation'].drop(['Usage'], axis=1).reset_index(drop=True)
    test_arguments = df_arguments.loc[df_arguments['Usage'] == 'test'].drop(['Usage'], axis=1).reset_index(drop=True)
    
    return train_arguments, valid_arguments, test_arguments


In [5]:
def create_dataframe_head(argument_ids, model_name):
    """
        Creates `DataFrame` usable to append predictions to it

        Parameters
        ----------
        argument_ids : list[str]
            First column of the resulting DataFrame
        model_name : str
            Second column of DataFrame will contain the given model name

        Returns
        -------
        pd.DataFrame
            prepared DataFrame
    """
    df_model_head = pd.DataFrame(argument_ids, columns=['Argument ID'])
    df_model_head['Method'] = [model_name] * len(argument_ids)

    return df_model_head


In [6]:
import json
class MissingColumnError(AttributeError):
    """Error indicating that an imported DataFrame lacks necessary columns"""
    pass


In [7]:
def load_json_file(filepath):
    """Load content of json-file from `filepath`"""
    with open(filepath, 'r') as  json_file:
        return json.load(json_file)


In [8]:
def load_values_from_json(filepath):
    """Load values per level from json-file from `filepath`"""
    json_values = load_json_file(filepath)
    values = { "1":set(), "2":set(), "3":set(), "4a":set(), "4b":set() }
    for value in json_values["values"]:
        values["1"].add(value["name"])
        values["2"].add(value["level2"])
        for valueLevel3 in value["level3"]:
            values["3"].add(valueLevel3)
        for valueLevel4a in value["level4a"]:
            values["4a"].add(valueLevel4a)
        for valueLevel4b in value["level4b"]:
            values["4b"].add(valueLevel4b)
    values["1"] = sorted(values["1"])
    values["2"] = sorted(values["2"])
    values["3"] = sorted(values["3"])
    values["4a"] = sorted(values["4a"])
    values["4b"] = sorted(values["4b"])
    return values


In [9]:
def load_arguments_from_tsv(filepath, default_usage='test'):
    """
        Reads arguments from tsv file

        Parameters
        ----------
        filepath : str
            The path to the tsv file
        default_usage : str, optional
            The default value if the column "Usage" is missing

        Returns
        -------
        pd.DataFrame
            the DataFrame with all arguments

        Raises
        ------
        MissingColumnError
            if the required columns "Argument ID" or "Premise" are missing in the read data
        IOError
            if the file can't be read
        """
    try:
        dataframe = pd.read_csv(filepath, encoding='utf-8', sep='\t', header=0)
        if not {'Argument ID', 'Premise'}.issubset(set(dataframe.columns.values)):
            raise MissingColumnError('The argument "%s" file does not contain the minimum required columns [Argument ID, Premise].' % filepath)
        if 'Usage' not in dataframe.columns.values:
            dataframe['Usage'] = [default_usage] * len(dataframe)
        return dataframe
    except IOError:
        traceback.print_exc()
        raise


In [10]:
def load_labels_from_tsv(filepath, label_order):
    """
        Reads label annotations from tsv file

        Parameters
        ----------
        filepath : str
            The path to the tsv file
        label_order : list[str]
            The listing and order of the labels to use from the read data

        Returns
        -------
        pd.DataFrame
            the DataFrame with the annotations

        Raises
        ------
        MissingColumnError
            if the required columns "Argument ID" or names from `label_order` are missing in the read data
        IOError
            if the file can't be read
        """
    try:
        dataframe = pd.read_csv(filepath, encoding='utf-8', sep='\t', header=0)
        dataframe = dataframe[['Argument ID'] + label_order]
        return dataframe
    except IOError:
        traceback.print_exc()
        raise
    except KeyError:
        raise MissingColumnError('The file "%s" does not contain the required columns for its level.' % filepath)


In [11]:
import sys
import getopt
import os

In [12]:
model_dir = 'models'
data_dir = 'data'

In [13]:
if not os.path.exists(model_dir):
    os.makedirs(model_dir)


In [70]:
argument_filepath = os.path.join(data_dir, 'arguments.tsv')
value_json_filepath = os.path.join(data_dir, 'values.json')


In [71]:
df_arguments = load_arguments_from_tsv(argument_filepath, default_usage='train')

In [72]:
values = load_values_from_json(value_json_filepath)
num_labels_Lv2 = len(values['2'])


In [73]:
df_arguments.keys()

Index(['Argument ID', 'Part', 'Usage', 'Conclusion', 'Stance', 'Premise'], dtype='object')

In [18]:
# for ip in df_arguments['Argument ID']:
#   #print(df_arguments['Stance'][ip])
#   print(ip)

In [74]:
level =2
label_filepath = os.path.join(data_dir, 'labels-level{}.tsv'.format(str(level)))
df_labels = load_labels_from_tsv(label_filepath, values[str(level)])

In [75]:
a = df_labels.keys()
for key in df_labels.keys():
  print(len(df_labels[key]),key)

5270 Argument ID
5270 Achievement
5270 Benevolence: caring
5270 Benevolence: dependability
5270 Conformity: interpersonal
5270 Conformity: rules
5270 Face
5270 Hedonism
5270 Humility
5270 Power: dominance
5270 Power: resources
5270 Security: personal
5270 Security: societal
5270 Self-direction: action
5270 Self-direction: thought
5270 Stimulation
5270 Tradition
5270 Universalism: concern
5270 Universalism: nature
5270 Universalism: objectivity
5270 Universalism: tolerance


In [76]:
df_labels['Achievement'][0]

0

In [22]:
# from typing import Dict, List
# #def generate_pairwise_input(dataset: Dict[List], labels: Dict[List]) -> (List[str], List[str], List[str], List[int]):
# def generate_pairwise_input(dataset, labels):
#     """
#     TODO: group all premises and corresponding hypotheses and labels of the datapoints
#     a datapoint as seen earlier is a dict of premis, hypothesis and label
#     """
#     #raise NotImplementedError
#     premise=[]
#     conclusion=[]
#     stance=[]
#     n_labels =labels.keys()
#     n_labels = n_labels[1:]
#     print(n_labels)
#     label=[]
    
#     n = len(dataset['Argument ID'])
#     m = len(labels['Argument ID'])
#     print(n,m)
#     for i in range(n):
#         premise.append(dataset['Premise'][i])
#         conclusion.append(dataset['Conclusion'][i])
#         stance.append(dataset['Stance'][i])
#     for l in range(len(n_labels)):
#         label_id = []
#         #print(i)
#         for i in range(m):
#             #print(n_labels[l])
#             label_id.append(int(labels[n_labels[l]][i]))
#         label.append(label_id)

#     return premise, conclusion, stance, label

In [77]:
from typing import Dict, List
#def generate_pairwise_input(dataset: Dict[List], labels: Dict[List]) -> (List[str], List[str], List[str], List[int]):
def generate_pairwise_input(dataset, labels):
    """
    TODO: group all premises and corresponding hypotheses and labels of the datapoints
    a datapoint as seen earlier is a dict of premis, hypothesis and label
    """
    #raise NotImplementedError
    premise=[]
    conclusion=[]
    stance=[]
    n_labels =labels.keys()
    n_labels = n_labels[1:]
    print(n_labels)
    label=[]
    
    n = len(dataset['Argument ID'])
    m = len(labels['Argument ID'])
    print(n,m)
    for i in range(n):
        premise.append(dataset['Premise'][i])
        conclusion.append(dataset['Conclusion'][i])
        stance.append(dataset['Stance'][i])
    for i in range(m):
        sent_label = []
        #print(i)
        for l in range(len(n_labels)):
            #print(n_labels[l])
            sent_label.append(int(labels[n_labels[l]][i]))
        label.append(sent_label)

    return premise, conclusion, stance, label

In [78]:
#Randomize them first
train_premises, train_conclusion, train_stance, train_labels = generate_pairwise_input(df_arguments, df_labels)


Index(['Achievement', 'Benevolence: caring', 'Benevolence: dependability',
       'Conformity: interpersonal', 'Conformity: rules', 'Face', 'Hedonism',
       'Humility', 'Power: dominance', 'Power: resources',
       'Security: personal', 'Security: societal', 'Self-direction: action',
       'Self-direction: thought', 'Stimulation', 'Tradition',
       'Universalism: concern', 'Universalism: nature',
       'Universalism: objectivity', 'Universalism: tolerance'],
      dtype='object')
5270 5270


In [79]:
import random 
random.seed(42)
def randomize_data(premises, conclusion, stance, labels):
  n = len(premises)
  data = list(range(n))
  random.shuffle(data)
  train_premises = []
  train_conclusion = []
  train_stance = []
  train_labels = []
  for i in data:
    train_premises.append(premises[i])
    train_conclusion.append(conclusion[i])
    train_stance.append(stance[i])
    train_labels.append(labels[:][i])
  return train_premises, train_conclusion, train_stance, train_labels


In [26]:
# a = train_labels[:][-5:]
# len(a)

In [80]:
train_premises, train_conclusion, train_stance, train_labels = randomize_data(train_premises, train_conclusion, train_stance, train_labels)

In [81]:
val_premises = train_premises[-500:]
val_conclusion = train_conclusion[-500:]
val_stance = train_stance[-500:]
val_labels = train_labels[:][-500:]

In [82]:
train_premises = train_premises[:-500]
train_conclusion = train_conclusion[:-500]
train_stance = train_stance[:-500]
train_labels = train_labels[:][:-500]

In [83]:
# Nothing to do for this class!
import torch
from transformers import BertModel
from transformers import AutoTokenizer
from typing import Dict, List

class BatchTokenizer:
    """Tokenizes and pads a batch of input sentences."""

    def __init__(self):
        """Initializes the tokenizer

        Args:
            pad_symbol (Optional[str], optional): The symbol for a pad. Defaults to "<P>".
        """
        self.hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    
    def get_sep_token(self,):
        return self.hf_tokenizer.sep_token
    
    def __call__(self, prem_batch: List[str], hyp_batch: List[str], stance_batch: List[str]) -> List[List[str]]:
        """Uses the huggingface tokenizer to tokenize and pad a batch.

        We return a dictionary of tensors per the huggingface model specification.

        Args:
            batch (List[str]): A List of sentence strings

        Returns:
            Dict: The dictionary of token specifications provided by HuggingFace
        """
        # The HF tokenizer will PAD for us, and additionally combine 
        # The two sentences deimited by the [SEP] token.
        batch_len = len(prem_batch)
        #spaces = [" "]*batch_len
        conc_batch = [stance_batch[i]+" "+hyp_batch[i] for i in range(batch_len)]
        enc = self.hf_tokenizer(
            prem_batch,
            conc_batch,
            padding=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )

        return enc
    

# HERE IS AN EXAMPLE OF HOW TO USE THE BATCH TOKENIZER
tokenizer = BatchTokenizer()
a = [["this is the premise.", "This is also a premise"], ["this is the hypothesis", "This is a second hypothesis"],["in favour of", "against"]]
x = tokenizer(*a)
print(x)
tokenizer.hf_tokenizer.batch_decode(x["input_ids"])



{'input_ids': tensor([[  101,  2023,  2003,  1996, 18458,  1012,   102,  1999,  7927,  1997,
          2023,  2003,  1996, 10744,   102],
        [  101,  2023,  2003,  2036,  1037, 18458,   102,  2114,  2023,  2003,
          1037,  2117, 10744,   102,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}


['[CLS] this is the premise. [SEP] in favour of this is the hypothesis [SEP]',
 '[CLS] this is also a premise [SEP] against this is a second hypothesis [SEP] [PAD]']

In [84]:
def chunk(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[:][i:i + n]

def chunk_multi(lst1, lst2, lst3, n):
    for i in range(0, len(lst1), n):
        yield lst1[i: i + n], lst2[i: i + n], lst3[i: i + n]
        


In [85]:
sum=0
import numpy as np
# for i in range(5270):
#   sum += np.sum(np.array(train_labels[:][i]))
print(np.sum(np.array(train_labels)))
#print(sum)

16277


In [86]:
against=0
infavour = 0
for i in range(4770):
  if(train_stance[i]=='against'):
    against +=1
  elif(train_stance[i]=='in favor of'):
    infavour += 1
  else:
    print(train_stance[i])


In [87]:
# Notice that since we use huggingface, we tokenize and
# encode in all at once!
batch_size=64
tokenizer = BatchTokenizer()
train_input_batches = [b for b in chunk_multi(train_premises, train_conclusion, train_stance, batch_size)]
# Tokenize + encode
train_input_batches = [tokenizer(*batch) for batch in train_input_batches]

In [88]:
val_input_batches = [b for b in chunk_multi(val_premises, val_conclusion, val_stance, batch_size)]
# Tokenize + encode
val_input_batches = [tokenizer(*batch) for batch in val_input_batches]


In [89]:
len(val_labels[0])

20

In [90]:
def encode_labels(labels: List[List[int]]) -> List[torch.FloatTensor]:
    """Turns the batch of labels into a tensor

    Args:
        labels (List[List[int]]): List of all labels in the batch

    Returns:
        List[torch.FloatTensor]: List of Tensors of all labels in the batch
    """
    
    return torch.LongTensor(labels)


In [91]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")
device

device(type='cpu')

In [92]:
print(len(train_labels), len(val_labels))


4770 500


In [93]:
def group_labels(labels):
    #JUst Grouping 17,18,19,20
    extended_labels = []
    for i in range(len(labels)):
        elabels = labels[i]
        #Add group labels
        if (elabels[16]==1 or elabels[17]==1 or elabels[18]==1 or elabels[19]==1):
            elabels.append(1)
        else:
            elabels.append(0)
        extended_labels.append(elabels)
    return extended_labels
train_labels = group_labels(train_labels)
val_labels = group_labels(val_labels)

In [94]:
print(len(train_labels), len(val_labels))


4770 500


In [95]:
len(train_labels[0])

21

In [96]:
train_label_batches = [b for b in chunk(train_labels, batch_size)]
train_label_batches = [encode_labels(batch) for batch in train_label_batches]

In [97]:
val_label_batches = [b for b in chunk(val_labels, batch_size)]
val_label_batches = [encode_labels(batch) for batch in val_label_batches]

In [98]:
val_label_batches[0][0]

tensor([1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1])

In [99]:
class NLIClassifier(torch.nn.Module):
    def __init__(self, output_size: int, hidden_size: int):
        super().__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size
        # Initialize BERT, which we use instead of a single embedding layer.
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # TODO [OPTIONAL]: Updating all BERT parameters can be slow and memory intensive. 
        # Freeze them if training is too slow. Notice that the learning
        # rate should probably be smaller in this case.
        # Uncommenting out the below 2 lines means only our classification layer will be updated.
        for param in self.bert.parameters():
            param.requires_grad = False
        self.bert_hidden_dimension = self.bert.config.hidden_size
        print(self.bert_hidden_dimension)
        # TODO: Add an extra hidden layer in the classifier, projecting
        #      from the BERT hidden dimension to hidden size.
        # TODO: Add a relu nonlinearity to be used in the forward method
        #      https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html
        self.middle_layer1 = torch.nn.Linear(self.bert_hidden_dimension, 64)
        self.middle_layer2 = torch.nn.Linear(self.bert_hidden_dimension, 64)
        self.middle_layer3 = torch.nn.Linear(self.bert_hidden_dimension, 64)
        self.middle_layer4 = torch.nn.Linear(self.bert_hidden_dimension, 64)
        self.middle_layer5 = torch.nn.Linear(self.bert_hidden_dimension, 64)
        self.middle_layer6 = torch.nn.Linear(self.bert_hidden_dimension, 64)
        
        
        self.hidden_layer1 = torch.nn.Linear(64, 32)
        self.hidden_layer2 = torch.nn.Linear(64, 32)
        self.hidden_layer3 = torch.nn.Linear(self.bert_hidden_dimension, 32)
        self.hidden_layer4 = torch.nn.Linear(self.bert_hidden_dimension, 32)
        self.hidden_layer5 = torch.nn.Linear(self.bert_hidden_dimension, 32)
        self.hidden_layer6 = torch.nn.Linear(64, 32)
        self.hidden_layer7 = torch.nn.Linear(64, 32)
        self.hidden_layer8 = torch.nn.Linear(self.bert_hidden_dimension, 32)
        self.hidden_layer9 = torch.nn.Linear(64, 32)
        self.hidden_layer10 = torch.nn.Linear(64, 32)
        self.hidden_layer11 = torch.nn.Linear(self.bert_hidden_dimension, 32)
        self.hidden_layer12 = torch.nn.Linear(64, 32)
        self.hidden_layer13 = torch.nn.Linear(64, 32)
        self.hidden_layer14 = torch.nn.Linear(self.bert_hidden_dimension, 32)
        self.hidden_layer15 = torch.nn.Linear(64, 32)
        self.hidden_layer16 = torch.nn.Linear(64, 32)
        self.hidden_layer17 = torch.nn.Linear(64, 32)
        self.hidden_layer18 = torch.nn.Linear(64, 32)
        self.hidden_layer19 = torch.nn.Linear(64, 32)
        self.hidden_layer20 = torch.nn.Linear(64, 32)
        self.hidden_layers = [torch.nn.Linear(self.bert_hidden_dimension, 32).to(device) for i in range(self.output_size)]
        self.classifiers = [torch.nn.Linear(32, 1).to(device) for i in range(self.output_size)]
        #self.hidden_layer2 = torch.nn.Linear(self.hidden_size, 32)
        #self.hidden_layer3 = torch.nn.Linear(128, 32)
        #self.hidden_layer4 = torch.nn.Linear(32, 8)
        self.relu = torch.nn.ReLU()
        self.classifier1 = torch.nn.Linear(32, 1)
        self.classifier2 = torch.nn.Linear(32, 1)
        self.classifier3 = torch.nn.Linear(32, 1)
        self.classifier4 = torch.nn.Linear(32, 1)
        self.classifier5 = torch.nn.Linear(32, 1)
        self.classifier6 = torch.nn.Linear(32, 1)
        self.classifier7 = torch.nn.Linear(32, 1)
        self.classifier8 = torch.nn.Linear(32, 1)
        self.classifier9 = torch.nn.Linear(32, 1)
        self.classifier10 = torch.nn.Linear(32, 1)
        self.classifier11 = torch.nn.Linear(32, 1)
        self.classifier12 = torch.nn.Linear(32, 1)
        self.classifier13 = torch.nn.Linear(32, 1)
        self.classifier14 = torch.nn.Linear(32, 1)
        self.classifier15 = torch.nn.Linear(32, 1)
        self.classifier16 = torch.nn.Linear(32, 1)
        self.classifier17 = torch.nn.Linear(32, 1)
        self.classifier18 = torch.nn.Linear(32, 1)
        self.classifier19 = torch.nn.Linear(32, 1)
        self.classifier20 = torch.nn.Linear(32, 1)
        
        self.classifier_middle6 = torch.nn.Linear(64, 1)
        #self.classifier = torch.nn.Linear(self.hidden_size, self.output_size)
        self.log_softmax = torch.nn.LogSoftmax(dim=2)

    def encode_text(
        self,
        symbols: Dict
    ) -> torch.Tensor:
        """Encode the (batch of) sequence(s) of token symbols with an LSTM.
            Then, get the last (non-padded) hidden state for each symbol and return that.

        Args:
            symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

        Returns:
            torch.Tensor: The final hiddens tate of the LSTM, which represents an encoding of
                the entire sentence
        """
        # First we get the contextualized embedding for each input symbol
        # We no longer need an LSTM, since BERT encodes context and 
        # gives us a single vector describing the sequence in the form of the [CLS] token.
        embedded = self.bert(**symbols)
        #print(embedded)
        #print("Embedded", embedded.pooler_output.shape, embedded.last_hidden_state.shape)
        # TODO: Get the [CLS] token using the `pooler_output` from 
        #      The BertModel output. See here: https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
        #      and check the returns for the forward method.
        # We want to return a tensor of the form batch_size x 1 x bert_hidden_dimension
        #raise NotImplementedError
        
        #pool_output_shape = embedded.pooler_output.shape
        #return torch.reshape(embedded.pooler_output,(pool_output_shape[0],1,pool_output_shape[1]) )
        last_hidden_state = embedded.last_hidden_state[:,0,:]
        hidden_shape = last_hidden_state.shape
        return torch.reshape(last_hidden_state,(hidden_shape[0],1,hidden_shape[1]) )

    def forward(
        self,
        symbols: Dict,
    ) -> torch.Tensor:
        """_summary_

        Args:
            symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

        Returns:
            torch.Tensor: _description_
        """
        encoded_sents = self.encode_text(symbols)
        #output = self.hidden_layer1(encoded_sents)
        #output = self.relu(output)
        #outputs = [self.hidden_layers[i](encoded_sents) for i in range(self.output_size)]
        #outputs = [self.relu(outputs[i].to(device)) for i in range(self.output_size)]
        #outputs = [self.classifiers[i](outputs[i].to(device)) for i in range(self.output_size)]
        # outputs = []
        # for i in range(self.output_size):
        #     output = self.hidden_layers[i](encoded_sents)
        #     output = self.relu(output)
        #     output = self.classifiers[i](output)
        #     output = torch.nn.Sigmoid()(output)
        #     outputs.append(output)
        
        middle = self.middle_layer1(encoded_sents)
        middle = self.relu(middle)
        
        #Self-direction: thought
        output = self.hidden_layer1(middle)
        output = self.relu(output)
        output = self.classifier1(output)
        output = torch.nn.Sigmoid()(output)
        
        #Self-direction: action
        output2 = self.hidden_layer2(middle)
        output2 = self.relu(output2)
        output2 = self.classifier2(output2)
        output2 = torch.nn.Sigmoid()(output2)
        
        #Stimulation
        output3 = self.hidden_layer3(encoded_sents)
        output3 = self.relu(output3)
        output3 = self.classifier1(output3)
        output3 = torch.nn.Sigmoid()(output3)
        
        #Hedonism
        output4 = self.hidden_layer4(encoded_sents)
        output4 = self.relu(output4)
        output4 = self.classifier4(output4)
        output4 = torch.nn.Sigmoid()(output4)
        
        #Achievement
        output5 = self.hidden_layer5(encoded_sents)
        output5 = self.relu(output5)
        output5 = self.classifier5(output5)
        output5 = torch.nn.Sigmoid()(output5)
        
        middle2 = self.middle_layer2(encoded_sents)
        middle2 = self.relu(middle2)
        
        #Power: dominance
        output6 = self.hidden_layer6(middle2)
        output6 = self.relu(output6)
        output6 = self.classifier6(output6)
        output6 = torch.nn.Sigmoid()(output6)
        
        #Power: resources
        output7 = self.hidden_layer7(middle2)
        output7 = self.relu(output7)
        output7 = self.classifier7(output7)
        output7 = torch.nn.Sigmoid()(output7)
        
        #Face
        output8 = self.hidden_layer8(encoded_sents)
        output8 = self.relu(output8)
        output8 = self.classifier8(output8)
        output8 = torch.nn.Sigmoid()(output8)
        
        middle3 = self.middle_layer3(encoded_sents)
        middle3 = self.relu(middle3)
        
        #Security: personal
        output9 = self.hidden_layer9(middle3)
        output9 = self.relu(output9)
        output9 = self.classifier9(output9)
        output9 = torch.nn.Sigmoid()(output9)
        
        #Security: societal
        output10 = self.hidden_layer10(middle3)
        output10 = self.relu(output10)
        output10 = self.classifier10(output10)
        output10 = torch.nn.Sigmoid()(output10)
        
        #Tradition
        output11 = self.hidden_layer11(encoded_sents)
        output11 = self.relu(output11)
        output11 = self.classifier11(output11)
        output11 = torch.nn.Sigmoid()(output11)
        
        middle4 = self.middle_layer4(encoded_sents)
        middle4 = self.relu(middle4)
        
        #Conformity: rules
        output12 = self.hidden_layer12(middle4)
        output12 = self.relu(output12)
        output12 = self.classifier12(output12)
        output12 = torch.nn.Sigmoid()(output12)
        
        #Conformity: interpersonal
        output13 = self.hidden_layer13(middle4)
        output13 = self.relu(output13)
        output13 = self.classifier13(output13)
        output13 = torch.nn.Sigmoid()(output13)
        
        #Humility
        output14 = self.hidden_layer14(encoded_sents)
        output14 = self.relu(output14)
        output14 = self.classifier14(output14)
        output14 = torch.nn.Sigmoid()(output14)
        
        middle5 = self.middle_layer5(encoded_sents)
        middle5 = self.relu(middle5)
        
        #Benevolence: caring
        output15 = self.hidden_layer15(middle5)
        output15 = self.relu(output15)
        output15 = self.classifier15(output15)
        output15 = torch.nn.Sigmoid()(output15)
        
        #Benevolence: dependability
        output16 = self.hidden_layer16(middle5)
        output16 = self.relu(output16)
        output16 = self.classifier16(output16)
        output16 = torch.nn.Sigmoid()(output16)
        
        middle6 = self.middle_layer6(encoded_sents)
        middle6 = self.relu(middle6)
        output_middle6 = self.classifier_middle6(middle6)
        output_middle6 = torch.nn.Sigmoid()(output_middle6)
        
        #Universalism: concern
        output17 = self.hidden_layer17(middle6)
        output17 = self.relu(output17)
        output17 = self.classifier17(output17)
        output17 = torch.nn.Sigmoid()(output17)
        
        #Universalism: nature
        output18 = self.hidden_layer18(middle6)
        output18 = self.relu(output18)
        output18 = self.classifier18(output18)
        output18 = torch.nn.Sigmoid()(output18)
        
        #Universalism: tolerance
        output19 = self.hidden_layer19(middle6)
        output19 = self.relu(output19)
        output19 = self.classifier19(output19)
        output19 = torch.nn.Sigmoid()(output19)
        
        #Universalism: objectivity
        output20 = self.hidden_layer20(middle6)
        output20 = self.relu(output20)
        output20 = self.classifier20(output20)
        output20 = torch.nn.Sigmoid()(output20)
        #output = self.hidden_layer2(output)
        #output = self.relu(output)
        #output = self.hidden_layer3(output)
        #output = self.relu(output)
        #output = self.hidden_layer4(output)
        #output = self.relu(output)
        #output = self.classifier(output)
        #return self.log_softmax(output)
        return output, output2, output3, output4, output5, output6, output7, output8, output9, output10, output11, output12, output13, output14, output15, output16, output17, output18, output19, output20, output_middle6

In [100]:
# class NLIClassifier(torch.nn.Module):
#     def __init__(self, output_size: int, hidden_size: int):
#         super().__init__()
#         self.output_size = output_size
#         self.hidden_size = hidden_size
#         # Initialize BERT, which we use instead of a single embedding layer.
#         self.bert = BertModel.from_pretrained("bert-base-uncased")
#         # TODO [OPTIONAL]: Updating all BERT parameters can be slow and memory intensive. 
#         # Freeze them if training is too slow. Notice that the learning
#         # rate should probably be smaller in this case.
#         # Uncommenting out the below 2 lines means only our classification layer will be updated.
#         for param in self.bert.parameters():
#             param.requires_grad = False
#         self.bert_hidden_dimension = self.bert.config.hidden_size
#         print(self.bert_hidden_dimension)
#         # TODO: Add an extra hidden layer in the classifier, projecting
#         #      from the BERT hidden dimension to hidden size.
#         # TODO: Add a relu nonlinearity to be used in the forward method
#         #      https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html
#         self.hidden_layer1 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer2 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer3 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer4 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer5 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer6 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer7 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer8 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer9 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer10 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer11 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer12 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer13 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer14 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer15 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer16 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer17 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer18 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer19 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         self.hidden_layer20 = torch.nn.Linear(self.bert_hidden_dimension, 32)
#         # self.hidden_layers = [torch.nn.Linear(self.bert_hidden_dimension, 32).to(device) for i in range(self.output_size)]
#         # self.classifiers = [torch.nn.Linear(32, 1).to(device) for i in range(self.output_size)]
#         # #self.hidden_layer2 = torch.nn.Linear(self.hidden_size, 32)
#         #self.hidden_layer3 = torch.nn.Linear(128, 32)
#         #self.hidden_layer4 = torch.nn.Linear(32, 8)
#         self.hidden_layers = []
#         for i in range(20):
#             self.hidden_layers.append(torch.nn.Linear(self.bert_hidden_dimension, 32))
#         self.relu = torch.nn.ReLU()
#         self.classifier1 = torch.nn.Linear(32, 1)
#         self.classifier2 = torch.nn.Linear(32, 1)
#         self.classifier3 = torch.nn.Linear(32, 1)
#         self.classifier4 = torch.nn.Linear(32, 1)
#         self.classifier5 = torch.nn.Linear(32, 1)
#         self.classifier6 = torch.nn.Linear(32, 1)
#         self.classifier7 = torch.nn.Linear(32, 1)
#         self.classifier8 = torch.nn.Linear(32, 1)
#         self.classifier9 = torch.nn.Linear(32, 1)
#         self.classifier10 = torch.nn.Linear(32, 1)
#         self.classifier11 = torch.nn.Linear(32, 1)
#         self.classifier12 = torch.nn.Linear(32, 1)
#         self.classifier13 = torch.nn.Linear(32, 1)
#         self.classifier14 = torch.nn.Linear(32, 1)
#         self.classifier15 = torch.nn.Linear(32, 1)
#         self.classifier16 = torch.nn.Linear(32, 1)
#         self.classifier17 = torch.nn.Linear(32, 1)
#         self.classifier18 = torch.nn.Linear(32, 1)
#         self.classifier19 = torch.nn.Linear(32, 1)
#         self.classifier20 = torch.nn.Linear(32, 1)
#         self.classifiers = []
#         for i in range(20):
#           self.classifiers.append(torch.nn.Linear(32, 1))
#         #self.classifier = torch.nn.Linear(self.hidden_size, self.output_size)
#         self.log_softmax = torch.nn.LogSoftmax(dim=2)

#     def encode_text(
#         self,
#         symbols: Dict
#     ) -> torch.Tensor:
#         """Encode the (batch of) sequence(s) of token symbols with an LSTM.
#             Then, get the last (non-padded) hidden state for each symbol and return that.

#         Args:
#             symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

#         Returns:
#             torch.Tensor: The final hiddens tate of the LSTM, which represents an encoding of
#                 the entire sentence
#         """
#         # First we get the contextualized embedding for each input symbol
#         # We no longer need an LSTM, since BERT encodes context and 
#         # gives us a single vector describing the sequence in the form of the [CLS] token.
#         embedded = self.bert(**symbols)
#         #print(embedded)
#         #print("Embedded", embedded.pooler_output.shape, embedded.last_hidden_state.shape)
#         # TODO: Get the [CLS] token using the `pooler_output` from 
#         #      The BertModel output. See here: https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
#         #      and check the returns for the forward method.
#         # We want to return a tensor of the form batch_size x 1 x bert_hidden_dimension
#         #raise NotImplementedError
        
#         #pool_output_shape = embedded.pooler_output.shape
#         #return torch.reshape(embedded.pooler_output,(pool_output_shape[0],1,pool_output_shape[1]) )
#         last_hidden_state = embedded.last_hidden_state[:,0,:]
#         hidden_shape = last_hidden_state.shape
#         return torch.reshape(last_hidden_state,(hidden_shape[0],1,hidden_shape[1]) )

#     def forward(
#         self,
#         symbols: Dict,
#     ) -> torch.Tensor:
#         """_summary_

#         Args:
#             symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

#         Returns:
#             torch.Tensor: _description_
#         """
#         encoded_sents = self.encode_text(symbols)
#         #output = self.hidden_layer1(encoded_sents)
#         #output = self.relu(output)
#         #outputs = [self.hidden_layers[i](encoded_sents) for i in range(self.output_size)]
#         #outputs = [self.relu(outputs[i].to(device)) for i in range(self.output_size)]
#         #outputs = [self.classifiers[i](outputs[i].to(device)) for i in range(self.output_size)]
#         # outputs = []
#         # for i in range(self.output_size):
#         #     output = self.hidden_layers[i](encoded_sents)
#         #     output = self.relu(output)
#         #     output = self.classifiers[i](output)
#         #     output = torch.nn.Sigmoid()(output)
#         #     outputs.append(output)
#         # output = self.hidden_layer1(encoded_sents)
#         # output = self.relu(output)
#         # output = self.classifier1(output)
#         # output = torch.nn.Sigmoid()(output)
        
#         # output2 = self.hidden_layer2(encoded_sents)
#         # output2 = self.relu(output2)
#         # output2 = self.classifier2(output2)
#         # output2 = torch.nn.Sigmoid()(output2)
        
#         # output3 = self.hidden_layer1(encoded_sents)
#         # output3 = self.relu(output3)
#         # output3 = self.classifier1(output3)
#         # output3 = torch.nn.Sigmoid()(output3)
        
#         # output4 = self.hidden_layer4(encoded_sents)
#         # output4 = self.relu(output4)
#         # output4 = self.classifier4(output4)
#         # output4 = torch.nn.Sigmoid()(output4)
        
#         # output5 = self.hidden_layer5(encoded_sents)
#         # output5 = self.relu(output5)
#         # output5 = self.classifier5(output5)
#         # output5 = torch.nn.Sigmoid()(output5)
        
#         # output6 = self.hidden_layer6(encoded_sents)
#         # output6 = self.relu(output6)
#         # output6 = self.classifier6(output6)
#         # output6 = torch.nn.Sigmoid()(output6)
        
#         # output7 = self.hidden_layer7(encoded_sents)
#         # output7 = self.relu(output7)
#         # output7 = self.classifier7(output7)
#         # output7 = torch.nn.Sigmoid()(output7)
        
#         # output8 = self.hidden_layer8(encoded_sents)
#         # output8 = self.relu(output8)
#         # output8 = self.classifier8(output8)
#         # output8 = torch.nn.Sigmoid()(output8)
        
#         # output9 = self.hidden_layer9(encoded_sents)
#         # output9 = self.relu(output9)
#         # output9 = self.classifier9(output9)
#         # output9 = torch.nn.Sigmoid()(output9)
        
#         # output10 = self.hidden_layer10(encoded_sents)
#         # output10 = self.relu(output10)
#         # output10 = self.classifier10(output10)
#         # output10 = torch.nn.Sigmoid()(output10)
        
#         # output11 = self.hidden_layer11(encoded_sents)
#         # output11 = self.relu(output11)
#         # output11 = self.classifier11(output11)
#         # output11 = torch.nn.Sigmoid()(output11)
        
#         # output12 = self.hidden_layer12(encoded_sents)
#         # output12 = self.relu(output12)
#         # output12 = self.classifier12(output12)
#         # output12 = torch.nn.Sigmoid()(output12)
        
#         # output13 = self.hidden_layer13(encoded_sents)
#         # output13 = self.relu(output13)
#         # output13 = self.classifier13(output13)
#         # output13 = torch.nn.Sigmoid()(output13)
        
#         # output14 = self.hidden_layer14(encoded_sents)
#         # output14 = self.relu(output14)
#         # output14 = self.classifier14(output14)
#         # output14 = torch.nn.Sigmoid()(output14)

#         # output15 = self.hidden_layer15(encoded_sents)
#         # output15 = self.relu(output15)
#         # output15 = self.classifier15(output15)
#         # output15 = torch.nn.Sigmoid()(output15)
        
#         # output16 = self.hidden_layer16(encoded_sents)
#         # output16 = self.relu(output16)
#         # output16 = self.classifier16(output16)
#         # output16 = torch.nn.Sigmoid()(output16)

#         # output17 = self.hidden_layer17(encoded_sents)
#         # output17 = self.relu(output17)
#         # output17 = self.classifier17(output17)
#         # output17 = torch.nn.Sigmoid()(output17)
        
#         # output18 = self.hidden_layer18(encoded_sents)
#         # output18 = self.relu(output18)
#         # output18 = self.classifier18(output18)
#         # output18 = torch.nn.Sigmoid()(output18)

#         # output19 = self.hidden_layer19(encoded_sents)
#         # output19 = self.relu(output19)
#         # output19 = self.classifier19(output19)
#         # output19 = torch.nn.Sigmoid()(output19)
        
#         # output20 = self.hidden_layer20(encoded_sents)
#         # output20 = self.relu(output20)
#         # output20 = self.classifier20(output20)
#         # output20 = torch.nn.Sigmoid()(output20)
#         # #output = self.hidden_layer2(output)
#         # #output = self.relu(output)
#         # #output = self.hidden_layer3(output)
#         # #output = self.relu(output)
#         # #output = self.hidden_layer4(output)
#         # #output = self.relu(output)
#         # #output = self.classifier(output)
#         # #return self.log_softmax(output)
#         # return output, output2, output3, output4, output5, output6, output7, output8, output9, output10, output11, output12, output13, output14, output15, output16, output17, output18, output19, output20
#         outputs = []
#         for i in range(20):
#             output = self.hidden_layers[i](encoded_sents)
#             output = self.relu(output)
#             output = self.classifiers[i](output)
#             output = torch.nn.Sigmoid()(output)
#             outputs.append(output)
#         return outputs


In [101]:
# For making predictions at test time TODO: Multi-label
def predict(model: torch.nn.Module, sents: torch.Tensor) -> List:
    sents = sents.to(device)
    logits = model(sents)
    res = []
    logitslen = logits[0].shape[0]
    for i in range(logitslen):
        datares = []
        for j in range(20):
            datares.append(logits[j][i][0][0] > 0.5)
        res.append(datares)
    return res
    #return list(torch.argmax(logits, axis=2).squeeze().numpy())
    #print(torch.max(logits), torch.min(logits))
    #return list((logits>0).squeeze())

In [102]:
import numpy as np

from numpy import logical_and, sum as t_sum
def precision(predicted_labels, true_labels, which_label=1):
    """
    Precision is True Positives / All Positives Predictions
    """
    pred_which = np.array([pred == which_label for pred in predicted_labels])
    true_which = np.array([lab == which_label for lab in true_labels])
    denominator = t_sum(pred_which)
    if denominator:
        return t_sum(logical_and(pred_which, true_which))/denominator
    else:
        return 0.


def recall(predicted_labels, true_labels, which_label=1):
    """
    Recall is True Positives / All Positive Labels
    """
    pred_which = np.array([pred == which_label for pred in predicted_labels])
    true_which = np.array([lab == which_label for lab in true_labels])
    denominator = t_sum(true_which)
    if denominator:
        return t_sum(logical_and(pred_which, true_which))/denominator
    else:
        return 0.


def f1_score(
    predicted_labels: List[int],
    true_labels: List[int],
    which_label: int
):
    """
    F1 score is the harmonic mean of precision and recall
    """
    P = precision(predicted_labels, true_labels, which_label=which_label)
    R = recall(predicted_labels, true_labels, which_label=which_label)
    if P and R:
        return 2*P*R/(P+R)
    else:
        return 0.


def macro_f1(
    predicted_labels: List[int],
    true_labels: List[int],
    possible_labels: List[int]
):
    scores = [f1_score(predicted_labels, true_labels, l) for l in possible_labels]
    # Macro, so we take the uniform avg.
    print(scores)
    return sum(scores) / len(scores)

In [103]:
def f1Score_multiLabel(preds, labels):
    nLabels = 20
    relevants = [0]*20
    positives = [0]*20
    truePositives = [0]*20
    for i in range(len(preds)):
        for j in range(nLabels):
            if(preds[i][j]==1):
                positives[j] += 1
                if(labels[i][j]==1):
                    truePositives[j] += 1
    
    for i in range(len(labels)):
        for j in range(nLabels):
            if(labels[i][j]==1):
                relevants[j] += 1
    
    precisions = []*nLabels
    recalls = []*nLabels
    f1Scores = []*nLabels
    precision =0
    recall = 0
    f1 = 0
    #print(truePositives, positives, relevants)
    for i in range(nLabels):
        if(positives[i]>0):
            precision = truePositives[i]/positives[i]
        precisions.append(precision)
        if(relevants[i]>0):
            recall = truePositives[i]/relevants[i]
        recalls.append(recall)
        #print(precision,recall,i)
        if(precision>0 and recall>0):
            f1 = 2 * precision * recall / (precision + recall)
        f1Scores.append(f1)
    precision_mean = np.mean(precisions)
    recall_mean = np.mean(recalls)
    f1_mean = np.mean(f1Scores)
    return f1_mean, precision_mean, recall_mean
    


In [104]:
import random
from tqdm import tqdm_notebook as tqdm
def training_loop(
    num_epochs,
    train_features,
    train_labels,
    dev_sents,
    dev_labels,
    optimizer,
    scheduler,
    model,
):
    print("Training...")
    loss_func = torch.nn.BCELoss()
    batches = list(zip(train_features, train_labels))
    random.shuffle(batches)
    for i in range(num_epochs):
        losses = []
        for features, labels in tqdm(batches):
            # Empty the dynamic computation graph
            features = features.to(device)
            labels = labels.float()
            labels = labels.to(device)
            optimizer.zero_grad()
            #preds0, preds1, preds2, preds3, preds4, preds5, preds6, preds7, preds8, preds9, preds10, preds11, preds12, preds13, preds14, preds15, preds16, preds17, preds18, preds19  = model(features)
            preds = model(features)

            #print(preds[0].shape)
            #featlen = preds[0].shape[0]
            #preds_temp = torch.empty((featlen, 20), dtype=torch.float)
            #for k in range(featlen):
            #    for j in range(20):
            #        preds_temp[k][j] = preds[j][k][0][0]
            #preds = preds.squeeze(1)
            #print("Preds ",preds.shape)
            #print("Labels ", labels.shape)
            #preds_temp = preds_temp.to(device)
            #print(preds_temp.is_cuda, labels.is_cuda)
            #print(preds_temp.shape, labels.shape)
            #loss = loss_func(preds_temp, labels)
            #losses = [0]*20
            #for i in range(20):
            #    loss1 = loss_func(preds[i],labels[:,i])
            #    losses.append(loss1)
            #print(preds[0].squeeze(1).squeeze(1).shape, labels[:,0].shape)
            loss0 = loss_func(preds[0].squeeze(1).squeeze(1), labels[:,0])
            #loss = loss_func(preds0.squeeze(1), labels)
            loss1 = loss_func(preds[1].squeeze(1).squeeze(1), labels[:,1]) 
            loss2 = loss_func(preds[2].squeeze(1).squeeze(1), labels[:,2]) 
            loss3 = loss_func(preds[3].squeeze(1).squeeze(1), labels[:,3]) 
            loss4 = loss_func(preds[4].squeeze(1).squeeze(1), labels[:,4]) 
            loss5 = loss_func(preds[5].squeeze(1).squeeze(1), labels[:,5]) 
            loss6 = loss_func(preds[6].squeeze(1).squeeze(1), labels[:,6]) 
            loss7 = loss_func(preds[7].squeeze(1).squeeze(1), labels[:,7]) 
            loss8 = loss_func(preds[8].squeeze(1).squeeze(1), labels[:,8]) 
            loss9 = loss_func(preds[9].squeeze(1).squeeze(1), labels[:,9]) 
            loss10 = loss_func(preds[10].squeeze(1).squeeze(1), labels[:,10])
            loss11 = loss_func(preds[11].squeeze(1).squeeze(1), labels[:,11]) 
            loss12 = loss_func(preds[12].squeeze(1).squeeze(1), labels[:,12]) 
            loss13 = loss_func(preds[13].squeeze(1).squeeze(1), labels[:,13]) 
            loss14 = loss_func(preds[14].squeeze(1).squeeze(1), labels[:,14]) 
            loss15 = loss_func(preds[15].squeeze(1).squeeze(1), labels[:,15]) 
            loss16 = loss_func(preds[16].squeeze(1).squeeze(1), labels[:,16]) 
            loss17 = loss_func(preds[17].squeeze(1).squeeze(1), labels[:,17]) 
            loss18 = loss_func(preds[18].squeeze(1).squeeze(1), labels[:,18]) 
            loss19 = loss_func(preds[19].squeeze(1).squeeze(1), labels[:,19])
            group6_loss = loss_func(preds[20].squeeze(1).squeeze(1), labels[:,20])
            #Multiply group loss by number of elements in group and add all groups
            global_loss = group6_loss*4.0 
            local_loss = loss0 + loss1 + loss2 + loss3 + loss4 + loss5 + loss6 + loss7 + loss8 + loss9 + loss10 + loss11 + loss12 + loss13 + loss14 + loss15 + loss16 + loss17 + loss18 + loss19
            loss = 1.0*global_loss + local_loss
            # Backpropogate the loss through our model
            #loss.register_hook(lambda grad: print(grad))
            #print(model.hidden_layers[0].weight.grad)
            #print(loss.grad)
            loss = loss*1000
            #print(i,model.hidden_layers[0].weight)
            loss.backward()
            #print(model.hidden_layers[0].weight.grad)
            #print(loss.grad)
            optimizer.step()
            #print("After",i, model.hidden_layers[0].weight)
            losses.append(loss.item())
        
        print(f"epoch {i}, loss: {np.sum(losses)/len(losses)}")
        # Estimate the f1 score for the development set
        print("Evaluating dev...")
        all_preds = []
        all_labels = []
        for sents, labels in tqdm(zip(dev_sents, dev_labels), total=len(dev_sents)):
            sents = sents.to(device)
            pred = predict(model, sents)
            all_preds.extend(pred)
            all_labels.extend(list(labels))
        # #print(range(len(set(train_labels))))

        dev_f1, dev_P, dev_R = f1Score_multiLabel(all_preds, all_labels)
        print(f"Dev F1 {dev_f1},  Dev Precision {dev_P}, Dev Recall {dev_R}")
        scheduler.step()
        #print(optimizer)
    # Return the trained model
    return model

In [105]:
# import random
# from tqdm import tqdm_notebook as tqdm
# def training_loop(
#     num_epochs,
#     train_features,
#     train_labels,
#     dev_sents,
#     dev_labels,
#     optimizer,
#     #scheduler,
#     model,
# ):
#     print("Training...")
#     loss_func = torch.nn.BCELoss()
#     batches = list(zip(train_features, train_labels))
#     random.shuffle(batches)
#     for i in range(num_epochs):
#         losses = []
#         for features, labels in tqdm(batches):
#             # Empty the dynamic computation graph
#             features = features.to(device)
#             labels = labels.float()
#             labels = labels.to(device)
#             optimizer.zero_grad()
#             #preds0, preds1, preds2, preds3, preds4, preds5, preds6, preds7, preds8, preds9, preds10, preds11, preds12, preds13, preds14, preds15, preds16, preds17, preds18, preds19  = model(features)
#             preds = model(features)

#             #print(preds[0].shape)
#             #featlen = preds[0].shape[0]
#             #preds_temp = torch.empty((featlen, 20), dtype=torch.float)
#             #for k in range(featlen):
#             #    for j in range(20):
#             #        preds_temp[k][j] = preds[j][k][0][0]
#             #preds = preds.squeeze(1)
#             #print("Preds ",preds.shape)
#             #print("Labels ", labels.shape)
#             #preds_temp = preds_temp.to(device)
#             #print(preds_temp.is_cuda, labels.is_cuda)
#             #print(preds_temp.shape, labels.shape)
#             #loss = loss_func(preds_temp, labels)
#             #losses = [0]*20
#             #for i in range(20):
#             #    loss1 = loss_func(preds[i],labels[:,i])
#             #    losses.append(loss1)
#             #print(preds[0].squeeze(1).squeeze(1).shape, labels[:,0].shape)
#             loss0 = loss_func(preds[0].squeeze(1).squeeze(1), labels[:,0])
#             #loss = loss_func(preds0.squeeze(1), labels)
#             loss1 = loss_func(preds[1].squeeze(1).squeeze(1), labels[:,1]) 
#             loss2 = loss_func(preds[2].squeeze(1).squeeze(1), labels[:,2]) 
#             loss3 = loss_func(preds[3].squeeze(1).squeeze(1), labels[:,3]) 
#             loss4 = loss_func(preds[4].squeeze(1).squeeze(1), labels[:,4]) 
#             loss5 = loss_func(preds[5].squeeze(1).squeeze(1), labels[:,5]) 
#             loss6 = loss_func(preds[6].squeeze(1).squeeze(1), labels[:,6]) 
#             loss7 = loss_func(preds[7].squeeze(1).squeeze(1), labels[:,7]) 
#             loss8 = loss_func(preds[8].squeeze(1).squeeze(1), labels[:,8]) 
#             loss9 = loss_func(preds[9].squeeze(1).squeeze(1), labels[:,9]) 
#             loss10 = loss_func(preds[10].squeeze(1).squeeze(1), labels[:,10])
#             loss11 = loss_func(preds[11].squeeze(1).squeeze(1), labels[:,11]) 
#             loss12 = loss_func(preds[12].squeeze(1).squeeze(1), labels[:,12]) 
#             loss13 = loss_func(preds[13].squeeze(1).squeeze(1), labels[:,13]) 
#             loss14 = loss_func(preds[14].squeeze(1).squeeze(1), labels[:,14]) 
#             loss15 = loss_func(preds[15].squeeze(1).squeeze(1), labels[:,15]) 
#             loss16 = loss_func(preds[16].squeeze(1).squeeze(1), labels[:,16]) 
#             loss17 = loss_func(preds[17].squeeze(1).squeeze(1), labels[:,17]) 
#             loss18 = loss_func(preds[18].squeeze(1).squeeze(1), labels[:,18]) 
#             loss19 = loss_func(preds[19].squeeze(1).squeeze(1), labels[:,19])  
#             loss = loss0 + loss1 + loss2 + loss3 + loss4 + loss5 + loss6 + loss7 + loss8 + loss9 + loss10 + loss11 + loss12 + loss13 + loss14 + loss15 + loss16 + loss17 + loss18 + loss19
#             # Backpropogate the loss through our model
#             #loss.register_hook(lambda grad: print(grad))
#             #print(model.hidden_layers[0].weight.grad)
#             #print(loss.grad)
#             loss = loss*1000
#             #print(i,model.hidden_layers[0].weight)
#             loss.backward()
#             #print(model.hidden_layers[0].weight.grad)
#             #print(loss.grad)
#             optimizer.step()
#             #print("After",i, model.hidden_layers[0].weight)
#             losses.append(loss.item())
        
#         print(f"epoch {i}, loss: {np.sum(losses)/len(losses)}")
#         # Estimate the f1 score for the development set
#         print("Evaluating dev...")
#         all_preds = []
#         all_labels = []
#         for sents, labels in tqdm(zip(dev_sents, dev_labels), total=len(dev_sents)):
#             sents = sents.to(device)
#             pred = predict(model, sents)
#             all_preds.extend(pred)
#             all_labels.extend(list(labels))
#         # #print(range(len(set(train_labels))))

#         dev_f1, dev_P, dev_R = f1Score_multiLabel(all_preds, all_labels)
#         print(f"Dev F1 {dev_f1},  Dev Precision {dev_P}, Dev Recall {dev_R}")
#         # # #scheduler.step()
#         #print(optimizer)
#     # Return the trained model
#     return model

In [108]:
from transformers.optimization import get_linear_schedule_with_warmup
epochs = 10
epoch_warmup = 5 #Ideally 1/6 i.e 50 for 300 epochs
# TODO: Find a good learning rate
LR = 1e-4

possible_labels = 20
model = NLIClassifier(output_size=possible_labels, hidden_size=512)
scheduler = get_linear_schedule_with_warmup(optimizer, 20,epochs)
optimizer = torch.optim.AdamW(model.parameters(), LR)

model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


768


In [107]:
# LR = 1e-5
# optimizer = torch.optim.AdamW(model.parameters(), LR)

In [62]:
epochs = 100

In [109]:
model =training_loop(
    epochs,
    train_input_batches,
    train_label_batches,
    val_input_batches,
    val_label_batches,
    optimizer,
    scheduler,
    model,
)

Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for features, labels in tqdm(batches):


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 0, loss: 12011.919674479166
Evaluating dev...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sents, labels in tqdm(zip(dev_sents, dev_labels), total=len(dev_sents)):


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.009803921568627449,  Dev Precision 0.33333333333333337, Dev Recall 0.0004975124378109452




  0%|          | 0/75 [00:00<?, ?it/s]

epoch 1, loss: 9282.385182291666
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.11965811965811965,  Dev Precision 0.4242424242424242, Dev Recall 0.006965174129353234


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 2, loss: 8821.266979166667
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.029431818181818187,  Dev Precision 0.48809523809523814, Dev Recall 0.012573209899867748


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 3, loss: 8655.963209635416
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.1697761457524186,  Dev Precision 0.41644736842105257, Dev Recall 0.026544492726242208


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 4, loss: 8512.527083333332
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.12204436938832668,  Dev Precision 0.4253459119496855, Dev Recall 0.032075614598559496


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 5, loss: 8375.685266927083
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.1920064361991879,  Dev Precision 0.7891829442244992, Dev Recall 0.04147200373880099


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 6, loss: 8241.64322265625
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.1611828348374378,  Dev Precision 0.6682172557172559, Dev Recall 0.0637017956011701


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 7, loss: 8114.855735677083
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.20688082762190727,  Dev Precision 0.6857666877641898, Dev Recall 0.07896024593793663


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 8, loss: 7996.27130859375
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.24735997048949945,  Dev Precision 0.604939833092617, Dev Recall 0.09071974651666308


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 9, loss: 7886.575227864583
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.29002459696447475,  Dev Precision 0.6882396470828096, Dev Recall 0.10180415812714702


In [53]:
torch.save(model, 'Lclassifiers-grouped.pt')

In [54]:
model = torch.load('Lclassifiers-grouped.pt')

In [67]:
train_input_batches

[{'input_ids': tensor([[  101,  2302,  7040,  ...,     0,     0,     0],
         [  101,  3510,  2003,  ...,     0,     0,     0],
         [  101,  5970,  2064,  ...,     0,     0,     0],
         ...,
         [  101,  2065,  3008,  ...,     0,     0,     0],
         [  101,  2296,  2775,  ...,     0,     0,     0],
         [  101,  3478, 16012,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])},
 {'input_ids': tensor([[  101,  2529, 18856,  ...,     0,     0,     0],
         [  101, 26572, 22864,  ...,     0,     0,     0],
         [  101,  2591,  2865,  ...,     0,     0,     0],
         ...,
         [  101,  2000,  2108,  ...,     0,     0,     0],
         [  101,  5907,  8699,  ...,     0,     0,     0],
         [  101,  2529,  4424,  ...,     0,     0

In [68]:
for i in range(len(train_label_batches)):
    print(train_label_batches[i].shape, train_input_batches[i]['input_ids'].shape)

torch.Size([64, 21]) torch.Size([64, 59])
torch.Size([64, 21]) torch.Size([64, 69])
torch.Size([64, 21]) torch.Size([64, 57])
torch.Size([64, 21]) torch.Size([64, 71])
torch.Size([64, 21]) torch.Size([64, 72])
torch.Size([64, 21]) torch.Size([64, 52])
torch.Size([64, 21]) torch.Size([64, 101])
torch.Size([64, 21]) torch.Size([64, 56])
torch.Size([64, 21]) torch.Size([64, 61])
torch.Size([64, 21]) torch.Size([64, 84])
torch.Size([64, 21]) torch.Size([64, 53])
torch.Size([64, 21]) torch.Size([64, 81])
torch.Size([64, 21]) torch.Size([64, 111])
torch.Size([64, 21]) torch.Size([64, 98])
torch.Size([64, 21]) torch.Size([64, 53])
torch.Size([64, 21]) torch.Size([64, 50])
torch.Size([64, 21]) torch.Size([64, 60])
torch.Size([64, 21]) torch.Size([64, 140])
torch.Size([64, 21]) torch.Size([64, 56])
torch.Size([64, 21]) torch.Size([64, 60])
torch.Size([64, 21]) torch.Size([64, 55])
torch.Size([64, 21]) torch.Size([64, 58])
torch.Size([64, 21]) torch.Size([64, 50])
torch.Size([64, 21]) torch.Size

IndexError: list index out of range

In [None]:
print("Evaluating dev...")
all_preds = []
all_labels = []
for sents, labels in tqdm(zip(val_input_batches, val_label_batches), total=len(val_input_batches)):
    pred = predict(model, sents)
    all_preds.extend(pred)
    all_labels.extend(list(labels.numpy()))
# #print(range(len(set(train_labels))))

dev_f1, dev_P, dev_R = f1Score_multiLabel(all_preds, all_labels)
print(f"Dev F1 {dev_f1},  Dev Precision {dev_P}, Dev Recall {dev_R}")


Evaluating dev...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sents, labels in tqdm(zip(val_input_batches, val_label_batches), total=len(val_input_batches)):


  0%|          | 0/8 [00:00<?, ?it/s]

[140, 114, 2, 0, 34, 0, 5, 34, 31, 37, 185, 0, 91, 0, 13, 0, 0, 0, 10, 50] [500, 500, 12, 9, 153, 0, 500, 500, 500, 500, 500, 0, 394, 0, 493, 0, 1, 0, 82, 482] [140, 114, 53, 6, 108, 22, 5, 34, 31, 37, 185, 145, 105, 62, 13, 39, 171, 26, 99, 52]
0.28 1.0 0
0.228 1.0 1
0.16666666666666666 0.03773584905660377 2
0.0 0.0 3
0.2222222222222222 0.3148148148148148 4
0.2222222222222222 0.0 5
0.01 1.0 6
0.068 1.0 7
0.062 1.0 8
0.074 1.0 9
0.37 1.0 10
0.37 0.0 11
0.23096446700507614 0.8666666666666667 12
0.23096446700507614 0.0 13
0.02636916835699797 1.0 14
0.02636916835699797 0.0 15
0.0 0.0 16
0.0 0.0 17
0.12195121951219512 0.10101010101010101 18
0.1037344398340249 0.9615384615384616 19
Dev F1 0.5140882946543324,  Dev Precision 0.140673202059074, Dev Recall 0.5140882946543324


In [None]:
np.sum(all_preds)

0

In [None]:
def f1Score_multiLabel(preds, labels):
    nLabels = 20
    relevants = [0]*20
    positives = [0]*20
    truePositives = [0]*20
    for i in range(len(preds)):
        for j in range(nLabels):
            if(preds[i][j]==1):
                positives[j] += 1
                if(labels[i][j]==1):
                    truePositives[j] += 1
    
    for i in range(len(labels)):
        for j in range(nLabels):
            if(labels[i][j]==1):
                relevants[j] += 1
    
    precisions = []*nLabels
    recalls = []*nLabels
    f1Scores = []*nLabels
    precision =0
    recall = 0
    f1 = 0
    print(truePositives, positives, relevants)
    for i in range(nLabels):
        if(positives[i]>0):
            precision = truePositives[i]/positives[i]
        precisions.append(precision)
        if(relevants[i]>0):
            recall = truePositives[i]/relevants[i]
        recalls.append(recall)
        print(precision,recall,i)
        if(positives[i]>0 and relevants[i]>0):
            f1 = 2 * precision * recall / (precision + recall)
        f1Scores.append(f1)
    precision_mean = np.mean(precisions)
    recall_mean = np.mean(recalls)
    f1_mean = np.mean(recalls)
    return f1_mean, precision_mean, recall_mean
    


In [None]:
train_label_batches[0][0][10]

tensor(0)

In [None]:
from torch import nn
loss = nn.CrossEntropyLoss()
input = torch.randn(5, requires_grad=True)
#target = torch.empty(3, dtype=torch.long).random_(5)
target = torch.randn(5).softmax(dim=0)
print(input, target)


tensor([-0.0589,  0.2156, -1.1975, -0.6472, -0.6455], requires_grad=True) tensor([0.2369, 0.0586, 0.4756, 0.1383, 0.0906])


In [None]:
output = loss(input, target)
print(output)

tensor(1.9810, grad_fn=<DivBackward1>)


In [None]:
a = torch.log(input)
a

tensor([[-0.1021,  0.0259,     nan, -0.8798,  0.3323]], grad_fn=<LogBackward0>)

In [None]:
a*target

tensor([[-0.0537,  0.0045,     nan, -0.0182,  0.0056]], grad_fn=<MulBackward0>)

In [None]:
target = torch.empty(1, dtype=torch.long)
target[0] = 1
output = loss(input, target)
print(output)

tensor(1.5930, grad_fn=<NllLossBackward0>)


In [None]:
i = 1
losses=[10]
print(f"epoch {i}, loss: {np.sum(losses)/len(losses)}")

epoch 1, loss: 10.0


In [None]:
y.dtype

torch.float32

In [None]:
from torch import nn
loss = nn.CrossEntropyLoss()
input = torch.randn(2, 5, requires_grad=True)
input

tensor([[-0.4920, -0.2966, -0.1552,  0.1589,  0.5150],
        [-0.3642,  0.2618,  0.9885,  0.9222, -0.0693]], requires_grad=True)

In [None]:
a = input[:,2]
a

tensor([-0.1552,  0.9885], grad_fn=<SelectBackward0>)