In [1]:
!python -m pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 7.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 66.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 44.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [1]:
import traceback
import csv

import pandas as pd


def write_tsv_dataframe(filepath, dataframe):
    """
        Stores `DataFrame` as tsv file

        Parameters
        ----------
        filepath : str
            Path to tsv file
        dataframe : pd.DataFrame
            DataFrame to store

        Raises
        ------
        IOError
            if the file can't be opened
    """
    try:
        dataframe.to_csv(filepath, encoding='utf-8', sep='\t', index=False, header=True, quoting=csv.QUOTE_NONE)
    except IOError:
        traceback.print_exc()


In [2]:
def combine_columns(df_arguments, df_labels):
    """Combines the two `DataFrames` on column `Argument ID`"""
    return pd.merge(df_arguments, df_labels, on='Argument ID')


In [3]:
def split_arguments(df_arguments):
    """Splits `DataFrame` by column `Usage` into `train`-, `validation`-, and `test`-arguments"""
    train_arguments = df_arguments.loc[df_arguments['Usage'] == 'train'].drop(['Usage'], axis=1).reset_index(drop=True)
    valid_arguments = df_arguments.loc[df_arguments['Usage'] == 'validation'].drop(['Usage'], axis=1).reset_index(drop=True)
    test_arguments = df_arguments.loc[df_arguments['Usage'] == 'test'].drop(['Usage'], axis=1).reset_index(drop=True)
    
    return train_arguments, valid_arguments, test_arguments


In [4]:
def create_dataframe_head(argument_ids, model_name):
    """
        Creates `DataFrame` usable to append predictions to it

        Parameters
        ----------
        argument_ids : list[str]
            First column of the resulting DataFrame
        model_name : str
            Second column of DataFrame will contain the given model name

        Returns
        -------
        pd.DataFrame
            prepared DataFrame
    """
    df_model_head = pd.DataFrame(argument_ids, columns=['Argument ID'])
    df_model_head['Method'] = [model_name] * len(argument_ids)

    return df_model_head


In [5]:
import json
class MissingColumnError(AttributeError):
    """Error indicating that an imported DataFrame lacks necessary columns"""
    pass


In [6]:
def load_json_file(filepath):
    """Load content of json-file from `filepath`"""
    with open(filepath, 'r') as  json_file:
        return json.load(json_file)


In [7]:
def load_values_from_json(filepath):
    """Load values per level from json-file from `filepath`"""
    json_values = load_json_file(filepath)
    values = { "1":set(), "2":set(), "3":set(), "4a":set(), "4b":set() }
    for value in json_values["values"]:
        values["1"].add(value["name"])
        values["2"].add(value["level2"])
        for valueLevel3 in value["level3"]:
            values["3"].add(valueLevel3)
        for valueLevel4a in value["level4a"]:
            values["4a"].add(valueLevel4a)
        for valueLevel4b in value["level4b"]:
            values["4b"].add(valueLevel4b)
    values["1"] = sorted(values["1"])
    values["2"] = sorted(values["2"])
    values["3"] = sorted(values["3"])
    values["4a"] = sorted(values["4a"])
    values["4b"] = sorted(values["4b"])
    return values


In [8]:
def load_arguments_from_tsv(filepath, default_usage='test'):
    """
        Reads arguments from tsv file

        Parameters
        ----------
        filepath : str
            The path to the tsv file
        default_usage : str, optional
            The default value if the column "Usage" is missing

        Returns
        -------
        pd.DataFrame
            the DataFrame with all arguments

        Raises
        ------
        MissingColumnError
            if the required columns "Argument ID" or "Premise" are missing in the read data
        IOError
            if the file can't be read
        """
    try:
        dataframe = pd.read_csv(filepath, encoding='utf-8', sep='\t', header=0)
        if not {'Argument ID', 'Premise'}.issubset(set(dataframe.columns.values)):
            raise MissingColumnError('The argument "%s" file does not contain the minimum required columns [Argument ID, Premise].' % filepath)
        if 'Usage' not in dataframe.columns.values:
            dataframe['Usage'] = [default_usage] * len(dataframe)
        return dataframe
    except IOError:
        traceback.print_exc()
        raise


In [9]:
def load_labels_from_tsv(filepath, label_order):
    """
        Reads label annotations from tsv file

        Parameters
        ----------
        filepath : str
            The path to the tsv file
        label_order : list[str]
            The listing and order of the labels to use from the read data

        Returns
        -------
        pd.DataFrame
            the DataFrame with the annotations

        Raises
        ------
        MissingColumnError
            if the required columns "Argument ID" or names from `label_order` are missing in the read data
        IOError
            if the file can't be read
        """
    try:
        dataframe = pd.read_csv(filepath, encoding='utf-8', sep='\t', header=0)
        dataframe = dataframe[['Argument ID'] + label_order]
        return dataframe
    except IOError:
        traceback.print_exc()
        raise
    except KeyError:
        raise MissingColumnError('The file "%s" does not contain the required columns for its level.' % filepath)


In [10]:
import sys
import getopt
import os

In [11]:
model_dir = 'models'
data_dir = 'data'

In [12]:
if not os.path.exists(model_dir):
    os.makedirs(model_dir)


In [13]:
argument_filepath = os.path.join(data_dir, 'arguments.tsv')
value_json_filepath = os.path.join(data_dir, 'values.json')


In [14]:
df_arguments = load_arguments_from_tsv(argument_filepath, default_usage='train')

In [15]:
values = load_values_from_json(value_json_filepath)
num_labels_Lv2 = len(values['2'])


In [16]:
df_arguments.keys()

Index(['Argument ID', 'Part', 'Usage', 'Conclusion', 'Stance', 'Premise'], dtype='object')

In [17]:
for ip in df_arguments['Argument ID']:
  #print(df_arguments['Stance'][ip])
  print(ip)

A01001
A01002
A01003
A01004
A01005
A01006
A01007
A01008
A01009
A01010
A01011
A01012
A01013
A01014
A01015
A01016
A01017
A01018
A01019
A01020
A02001
A02002
A02003
A02004
A02005
A02006
A02007
A02008
A02009
A02010
A02011
A02012
A02013
A02014
A02015
A02016
A02017
A02018
A02019
A02020
A03001
A03002
A03003
A03004
A03005
A03006
A03007
A03008
A03009
A03010
A03011
A03012
A03013
A03014
A03015
A03016
A03017
A03018
A03019
A03020
A04001
A04002
A04003
A04004
A04005
A04006
A04007
A04008
A04009
A04010
A04011
A04012
A04013
A04014
A04015
A04016
A04017
A04018
A04019
A04020
A05001
A05002
A05003
A05004
A05005
A05006
A05007
A05008
A05009
A05010
A05011
A05012
A05013
A05014
A05015
A05016
A05017
A05018
A05019
A05020
A05021
A05022
A05023
A05024
A05025
A05026
A05027
A05028
A05029
A05030
A05031
A05032
A05033
A05034
A05035
A05036
A05037
A05038
A05043
A05045
A05048
A05049
A05050
A05052
A05053
A05054
A05055
A05056
A05057
A05058
A05059
A05062
A05063
A05064
A05065
A05066
A05067
A05069
A05071
A05072
A05073
A05074
A05075

A19210
A19211
A19212
A19213
A19214
A19215
A19216
A19217
A19218
A19219
A19220
A19221
A19222
A19225
A19228
A19229
A19230
A19231
A19232
A19233
A19234
A19235
A19236
A19238
A19239
A19241
A19242
A19243
A19244
A19245
A19246
A19247
A19248
A19250
A19252
A19253
A19254
A19255
A19256
A19258
A19259
A19261
A19262
A19263
A19264
A19265
A19266
A19267
A19268
A19269
A19270
A19271
A19272
A19273
A19274
A19276
A19277
A19278
A19279
A19280
A19281
A19282
A19284
A19285
A19287
A19288
A19289
A19290
A19291
A19292
A19293
A19294
A19295
A19296
A19297
A19298
A19299
A19301
A19302
A19303
A19304
A19305
A19306
A19307
A19308
A19309
A19310
A19311
A19312
A19313
A19314
A19315
A19316
A19317
A19318
A19319
A19320
A19321
A19322
A19323
A19324
A19325
A19326
A19328
A19331
A19332
A19333
A19334
A19335
A19336
A19338
A19339
A19342
A19343
A19344
A19345
A19346
A19347
A19348
A19349
A19350
A19351
A19352
A19353
A19355
A19356
A19358
A19359
A19362
A19363
A19365
A19366
A19367
A19369
A19370
A19371
A19372
A19373
A19375
A19376
A19378
A19379
A19383

A22327
A22328
A22329
A22330
A22331
A22332
A22335
A22336
A22337
A22338
A22339
A22340
A22341
A22342
A22343
A22345
A22347
A22348
A22349
A22350
A22351
A22353
A22354
A22355
A22356
A22357
A22358
A22359
A22360
A22361
A22362
A22363
A22364
A22365
A22367
A22368
A22370
A22371
A22372
A22374
A22375
A22376
A22377
A22378
A22379
A22380
A22381
A22382
A22383
A22384
A22385
A22386
A22387
A22388
A22389
A22390
A22391
A22392
A22393
A22394
A22396
A22397
A22398
A22400
A22401
A22402
A22404
A22405
A22408
A22409
A22410
A22411
A22412
A22413
A22414
A22415
A22416
A22417
A22418
A22419
A22420
A22421
A22423
A22424
A22425
A22426
A22427
A22428
A22429
A22430
A22431
A22432
A22433
A22434
A22435
A22436
A22437
A22439
A22440
A22443
A22444
A22445
A22446
A22447
A22448
A22449
A22450
A22453
A22455
A22457
A22458
A22459
A22460
A22461
A22463
A22464
A22467
A22468
A22469
A22470
A22471
A22472
A22473
A22474
A22475
A22477
A22480
A22481
A22482
A22484
A22485
A22486
A22487
A22488
A22489
A22490
A22491
A22492
A22493
A22494
A22495
A22496
A22497

A25369
A25370
A25371
A25372
A25373
A25374
A25375
A25377
A25378
A25379
A25380
A25381
A25382
A25383
A25384
A25386
A25387
A25388
A25389
A25391
A25392
A25393
A25394
A25395
A25396
A25397
A25399
A25400
A25401
A25402
A25403
A25404
A25405
A25406
A25407
A25408
A25409
A25410
A25411
A25412
A25413
A25414
A25415
A25416
A25417
A25418
A25420
A25421
A25422
A25423
A25424
A25425
A25427
A25428
A25429
A25431
A25433
A25434
A25435
A25438
A25439
A25440
A25441
A25442
A25443
A25444
A25445
A25446
A25447
A25448
A25449
A25450
A25451
A25452
A25453
A25454
A25455
A25456
A25457
A25458
A25459
A25460
A25461
A25462
A25463
A25464
A25467
A25468
A25470
A25471
A25472
A25473
A25474
A25475
A25477
A25478
A25479
A25481
A25482
A25483
A25484
A25485
A25487
A25488
A25490
A25491
A25492
A25495
A25496
A25497
A25498
A25499
A25500
A05060
A05061
A05068
A05070
A06018
A07005
A07007
A07064
A07091
A08015
A08017
A09018
A09061
A09088
A09098
A09099
A09100
A12012
A12018
A12044
A12061
A12079
A12149
A12154
A12186
A12196
A12199
A12200
A12203
A12207

In [18]:
level =2
label_filepath = os.path.join(data_dir, 'labels-level{}.tsv'.format(str(level)))
df_labels = load_labels_from_tsv(label_filepath, values[str(level)])

In [19]:
a = df_labels.keys()
for key in df_labels.keys():
  print(len(df_labels[key]),key)

5270 Argument ID
5270 Achievement
5270 Benevolence: caring
5270 Benevolence: dependability
5270 Conformity: interpersonal
5270 Conformity: rules
5270 Face
5270 Hedonism
5270 Humility
5270 Power: dominance
5270 Power: resources
5270 Security: personal
5270 Security: societal
5270 Self-direction: action
5270 Self-direction: thought
5270 Stimulation
5270 Tradition
5270 Universalism: concern
5270 Universalism: nature
5270 Universalism: objectivity
5270 Universalism: tolerance


In [20]:
df_labels['Achievement'][0]

0

In [21]:
from typing import Dict, List
#def generate_pairwise_input(dataset: Dict[List], labels: Dict[List]) -> (List[str], List[str], List[str], List[int]):
def generate_pairwise_input(dataset, labels):
    """
    TODO: group all premises and corresponding hypotheses and labels of the datapoints
    a datapoint as seen earlier is a dict of premis, hypothesis and label
    """
    #raise NotImplementedError
    premise=[]
    conclusion=[]
    stance=[]
    n_labels =labels.keys()
    n_labels = n_labels[1:]
    print(n_labels)
    label=[]
    
    n = len(dataset['Argument ID'])
    m = len(labels['Argument ID'])
    print(n,m)
    for i in range(n):
        premise.append(dataset['Premise'][i])
        conclusion.append(dataset['Conclusion'][i])
        stance.append(dataset['Stance'][i])
    for i in range(m):
        sent_label = []
        #print(i)
        for l in range(len(n_labels)):
            #print(n_labels[l])
            sent_label.append(int(labels[n_labels[l]][i]))
        label.append(sent_label)

    return premise, conclusion, stance, label

In [22]:
#Randomize them first
train_premises, train_conclusion, train_stance, train_labels = generate_pairwise_input(df_arguments, df_labels)


Index(['Achievement', 'Benevolence: caring', 'Benevolence: dependability',
       'Conformity: interpersonal', 'Conformity: rules', 'Face', 'Hedonism',
       'Humility', 'Power: dominance', 'Power: resources',
       'Security: personal', 'Security: societal', 'Self-direction: action',
       'Self-direction: thought', 'Stimulation', 'Tradition',
       'Universalism: concern', 'Universalism: nature',
       'Universalism: objectivity', 'Universalism: tolerance'],
      dtype='object')
5270 5270


In [None]:
import random 
random.seed(42)
def randomize_data(premises, conclusion, stance, labels):
  n = len(premises)
  data = random.shuffle(range(n))
  train_premises = []
  train_conclusion = []
  train_stance = []
  train_labels = []
  for i in data:
    train_premises.append(premises[i])
    train_conclusion.append(conclusion[i])
    train_stance.append(stance[i])
    train_labels.append(labels[:][i])
  return train_premises, train_conclusion, train_stance, train_labels


In [None]:
train_premises, train_conclusion, train_stance, train_labels = randomize_data(train_premises, train_conclusion, train_stance, train_labels)

In [23]:
val_premises = train_premises[-500:]
val_conclusion = train_conclusion[-500:]
val_stance = train_stance[-500:]
val_labels = train_labels[:][-500:]

In [24]:
train_premises = train_premises[:-500]
train_conclusion = train_conclusion[:-500]
train_stance = train_stance[:-500]
train_labels = train_labels[:][:-500]

In [25]:
# Nothing to do for this class!
import torch
from transformers import BertModel
from transformers import AutoTokenizer
from typing import Dict, List

class BatchTokenizer:
    """Tokenizes and pads a batch of input sentences."""

    def __init__(self):
        """Initializes the tokenizer

        Args:
            pad_symbol (Optional[str], optional): The symbol for a pad. Defaults to "<P>".
        """
        self.hf_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    
    def get_sep_token(self,):
        return self.hf_tokenizer.sep_token
    
    def __call__(self, prem_batch: List[str], hyp_batch: List[str], stance_batch: List[str]) -> List[List[str]]:
        """Uses the huggingface tokenizer to tokenize and pad a batch.

        We return a dictionary of tensors per the huggingface model specification.

        Args:
            batch (List[str]): A List of sentence strings

        Returns:
            Dict: The dictionary of token specifications provided by HuggingFace
        """
        # The HF tokenizer will PAD for us, and additionally combine 
        # The two sentences deimited by the [SEP] token.
        batch_len = len(prem_batch)
        #spaces = [" "]*batch_len
        conc_batch = [stance_batch[i]+" "+hyp_batch[i] for i in range(batch_len)]
        enc = self.hf_tokenizer(
            prem_batch,
            conc_batch,
            padding=True,
            return_token_type_ids=False,
            return_tensors='pt'
        )

        return enc
    

# HERE IS AN EXAMPLE OF HOW TO USE THE BATCH TOKENIZER
tokenizer = BatchTokenizer()
a = [["this is the premise.", "This is also a premise"], ["this is the hypothesis", "This is a second hypothesis"],["in favour of", "against"]]
x = tokenizer(*a)
print(x)
tokenizer.hf_tokenizer.batch_decode(x["input_ids"])



{'input_ids': tensor([[  101,  2023,  2003,  1996, 18458,  1012,   102,  1999,  7927,  1997,
          2023,  2003,  1996, 10744,   102],
        [  101,  2023,  2003,  2036,  1037, 18458,   102,  2114,  2023,  2003,
          1037,  2117, 10744,   102,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}


['[CLS] this is the premise. [SEP] in favour of this is the hypothesis [SEP]',
 '[CLS] this is also a premise [SEP] against this is a second hypothesis [SEP] [PAD]']

In [26]:
def chunk(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[:][i:i + n]

def chunk_multi(lst1, lst2, lst3, n):
    for i in range(0, len(lst1), n):
        yield lst1[i: i + n], lst2[i: i + n], lst3[i: i + n]
        


In [27]:
sum=0
import numpy as np
# for i in range(5270):
#   sum += np.sum(np.array(train_labels[:][i]))
print(np.sum(np.array(train_labels)))
#print(sum)

16563


In [28]:
against=0
infavour = 0
for i in range(4770):
  if(train_stance[i]=='against'):
    against +=1
  elif(train_stance[i]=='in favor of'):
    infavour += 1
  else:
    print(train_stance[i])


In [29]:
# Notice that since we use huggingface, we tokenize and
# encode in all at once!
batch_size=64
tokenizer = BatchTokenizer()
train_input_batches = [b for b in chunk_multi(train_premises, train_conclusion, train_stance, batch_size)]
# Tokenize + encode
train_input_batches = [tokenizer(*batch) for batch in train_input_batches]

In [30]:
val_input_batches = [b for b in chunk_multi(val_premises, val_conclusion, val_stance, batch_size)]
# Tokenize + encode
val_input_batches = [tokenizer(*batch) for batch in val_input_batches]


In [31]:
len(val_labels[:][0])

20

In [32]:
def encode_labels(labels: List[List[int]]) -> torch.FloatTensor:
    """Turns the batch of labels into a tensor

    Args:
        labels (List[List[int]]): List of all labels in the batch

    Returns:
        torch.FloatTensor: Tensor of all labels in the batch
    """
    
    return torch.LongTensor(labels)


In [80]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")
device

device(type='cuda', index=0)

In [34]:
train_label_batches = [b for b in chunk(train_labels, batch_size)]
train_label_batches = [encode_labels(batch) for batch in train_label_batches]

In [35]:
val_label_batches = [b for b in chunk(val_labels, batch_size)]
val_label_batches = [encode_labels(batch) for batch in val_label_batches]

In [36]:
val_label_batches[0][0]

tensor([1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [110]:
class NLIClassifier(torch.nn.Module):
    def __init__(self, output_size: int, hidden_size: int):
        super().__init__()
        self.output_size = output_size
        self.hidden_size = hidden_size
        # Initialize BERT, which we use instead of a single embedding layer.
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # TODO [OPTIONAL]: Updating all BERT parameters can be slow and memory intensive. 
        # Freeze them if training is too slow. Notice that the learning
        # rate should probably be smaller in this case.
        # Uncommenting out the below 2 lines means only our classification layer will be updated.
        for param in self.bert.parameters():
            param.requires_grad = False
        self.bert_hidden_dimension = self.bert.config.hidden_size
        print(self.bert_hidden_dimension)
        # TODO: Add an extra hidden layer in the classifier, projecting
        #      from the BERT hidden dimension to hidden size.
        # TODO: Add a relu nonlinearity to be used in the forward method
        #      https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html
        self.hidden_layer1 = torch.nn.Linear(self.bert_hidden_dimension, 32)
        self.hidden_layers = [torch.nn.Linear(self.bert_hidden_dimension, 32).to(device) for i in range(self.output_size)]
        self.classifiers = [torch.nn.Linear(32, 1).to(device) for i in range(self.output_size)]
        #self.hidden_layer2 = torch.nn.Linear(self.hidden_size, 32)
        #self.hidden_layer3 = torch.nn.Linear(128, 32)
        #self.hidden_layer4 = torch.nn.Linear(32, 8)
        self.relu = torch.nn.ReLU()
        self.classifier = torch.nn.Linear(32, self.output_size)
        #self.classifier = torch.nn.Linear(self.hidden_size, self.output_size)
        self.log_softmax = torch.nn.LogSoftmax(dim=2)

    def encode_text(
        self,
        symbols: Dict
    ) -> torch.Tensor:
        """Encode the (batch of) sequence(s) of token symbols with an LSTM.
            Then, get the last (non-padded) hidden state for each symbol and return that.

        Args:
            symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

        Returns:
            torch.Tensor: The final hiddens tate of the LSTM, which represents an encoding of
                the entire sentence
        """
        # First we get the contextualized embedding for each input symbol
        # We no longer need an LSTM, since BERT encodes context and 
        # gives us a single vector describing the sequence in the form of the [CLS] token.
        embedded = self.bert(**symbols)
        #print(embedded)
        #print("Embedded", embedded.pooler_output.shape, embedded.last_hidden_state.shape)
        # TODO: Get the [CLS] token using the `pooler_output` from 
        #      The BertModel output. See here: https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
        #      and check the returns for the forward method.
        # We want to return a tensor of the form batch_size x 1 x bert_hidden_dimension
        #raise NotImplementedError
        
        #pool_output_shape = embedded.pooler_output.shape
        #return torch.reshape(embedded.pooler_output,(pool_output_shape[0],1,pool_output_shape[1]) )
        last_hidden_state = embedded.last_hidden_state[:,0,:]
        hidden_shape = last_hidden_state.shape
        return torch.reshape(last_hidden_state,(hidden_shape[0],1,hidden_shape[1]) )

    def forward(
        self,
        symbols: Dict,
    ) -> torch.Tensor:
        """_summary_

        Args:
            symbols (Dict): The Dict of token specifications provided by the HuggingFace tokenizer

        Returns:
            torch.Tensor: _description_
        """
        encoded_sents = self.encode_text(symbols)
        #output = self.hidden_layer1(encoded_sents)
        #output = self.relu(output)
        outputs = [self.hidden_layers[i](encoded_sents) for i in range(self.output_size)]
        outputs = [self.relu(outputs[i].to(device)) for i in range(self.output_size)]
        outputs = [self.classifiers[i](outputs[i].to(device)) for i in range(self.output_size)]
        #output = self.hidden_layer2(output)
        #output = self.relu(output)
        #output = self.hidden_layer3(output)
        #output = self.relu(output)
        #output = self.hidden_layer4(output)
        #output = self.relu(output)
        #output = self.classifier(output)
        #return self.log_softmax(output)
        return outputs

In [135]:
# For making predictions at test time TODO: Multi-label
def predict(model: torch.nn.Module, sents: torch.Tensor) -> List:
    sents = sents.to(device)
    logits = model(sents)
    res = []
    logitslen = logits[0].shape[0]
    for i in range(logitslen):
        datares = []
        for j in range(len(logits)):
            datares.append(logits[j][i][0][0] > 0.5)
        res.append(datares)
    return res
    #return list(torch.argmax(logits, axis=2).squeeze().numpy())
    #print(torch.max(logits), torch.min(logits))
    #return list((logits>0).squeeze())

In [136]:
import numpy as np

from numpy import logical_and, sum as t_sum
def precision(predicted_labels, true_labels, which_label=1):
    """
    Precision is True Positives / All Positives Predictions
    """
    pred_which = np.array([pred == which_label for pred in predicted_labels])
    true_which = np.array([lab == which_label for lab in true_labels])
    denominator = t_sum(pred_which)
    if denominator:
        return t_sum(logical_and(pred_which, true_which))/denominator
    else:
        return 0.


def recall(predicted_labels, true_labels, which_label=1):
    """
    Recall is True Positives / All Positive Labels
    """
    pred_which = np.array([pred == which_label for pred in predicted_labels])
    true_which = np.array([lab == which_label for lab in true_labels])
    denominator = t_sum(true_which)
    if denominator:
        return t_sum(logical_and(pred_which, true_which))/denominator
    else:
        return 0.


def f1_score(
    predicted_labels: List[int],
    true_labels: List[int],
    which_label: int
):
    """
    F1 score is the harmonic mean of precision and recall
    """
    P = precision(predicted_labels, true_labels, which_label=which_label)
    R = recall(predicted_labels, true_labels, which_label=which_label)
    if P and R:
        return 2*P*R/(P+R)
    else:
        return 0.


def macro_f1(
    predicted_labels: List[int],
    true_labels: List[int],
    possible_labels: List[int]
):
    scores = [f1_score(predicted_labels, true_labels, l) for l in possible_labels]
    # Macro, so we take the uniform avg.
    print(scores)
    return sum(scores) / len(scores)

In [137]:
def f1Score_multiLabel(preds, labels):
    nLabels = 20
    relevants = [0]*20
    positives = [0]*20
    truePositives = [0]*20
    for i in range(len(preds)):
        for j in range(nLabels):
            if(preds[i][j]==1):
                positives[j] += 1
                if(labels[i][j]==1):
                    truePositives[j] += 1
    
    for i in range(len(labels)):
        for j in range(nLabels):
            if(labels[i][j]==1):
                relevants[j] += 1
    
    precisions = []*nLabels
    recalls = []*nLabels
    f1Scores = []*nLabels
    precision =0
    recall = 0
    f1 = 0
    #print(truePositives, positives, relevants)
    for i in range(nLabels):
        if(positives[i]>0):
            precision = truePositives[i]/positives[i]
        precisions.append(precision)
        if(relevants[i]>0):
            recall = truePositives[i]/relevants[i]
        recalls.append(recall)
        #print(precision,recall,i)
        if(precision>0 and recall>0):
            f1 = 2 * precision * recall / (precision + recall)
        f1Scores.append(f1)
    precision_mean = np.mean(precisions)
    recall_mean = np.mean(recalls)
    f1_mean = np.mean(recalls)
    return f1_mean, precision_mean, recall_mean
    


In [138]:
import random
from tqdm import tqdm_notebook as tqdm
def training_loop(
    num_epochs,
    train_features,
    train_labels,
    dev_sents,
    dev_labels,
    optimizer,
    #scheduler,
    model,
):
    print("Training...")
    loss_func = torch.nn.CrossEntropyLoss()
    batches = list(zip(train_features, train_labels))
    random.shuffle(batches)
    for i in range(num_epochs):
        losses = []
        for features, labels in tqdm(batches):
            # Empty the dynamic computation graph
            features = features.to(device)
            labels = labels.float()
            labels = labels.to(device)
            optimizer.zero_grad()
            preds = model(features)
            #print(preds[0].shape)
            featlen = preds[0].shape[0]
            preds_temp = torch.empty((featlen, 20), dtype=torch.float)
            for k in range(featlen):
                for j in range(20):
                    preds_temp[k][j] = preds[j][k][0][0]
            #preds = preds.squeeze(1)
            #print("Preds ",preds.shape)
            #print("Labels ", labels.shape)
            preds_temp = preds_temp.to(device)
            #print(preds_temp.is_cuda, labels.is_cuda)
            #print(preds_temp.shape, labels.shape)
            loss = loss_func(preds_temp, labels)
            # Backpropogate the loss through our model
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        
        print(f"epoch {i}, loss: {np.sum(losses)/len(losses)}")
        # Estimate the f1 score for the development set
        print("Evaluating dev...")
        all_preds = []
        all_labels = []
        for sents, labels in tqdm(zip(dev_sents, dev_labels), total=len(dev_sents)):
            sents = sents.to(device)
            pred = predict(model, sents)
            all_preds.extend(pred)
            all_labels.extend(list(labels))
        # #print(range(len(set(train_labels))))

        dev_f1, dev_P, dev_R = f1Score_multiLabel(all_preds, all_labels)
        print(f"Dev F1 {dev_f1},  Dev Precision {dev_P}, Dev Recall {dev_R}")
        # #scheduler.step()
        #print(optimizer)
    # Return the trained model
    return model

In [139]:
epochs = 50
# TODO: Find a good learning rate
LR = 1e-5

possible_labels = 20
model = NLIClassifier(output_size=possible_labels, hidden_size=512)
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), LR)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


768


In [None]:
model =training_loop(
    epochs,
    train_input_batches,
    train_label_batches,
    val_input_batches,
    val_label_batches,
    optimizer,
    #scheduler,
    model,
)

Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for features, labels in tqdm(batches):


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 63, loss: 10.399799480438233
Evaluating dev...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sents, labels in tqdm(zip(dev_sents, dev_labels), total=len(dev_sents)):


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.0,  Dev Precision 0.0, Dev Recall 0.0


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 63, loss: 10.399799480438233
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.0,  Dev Precision 0.0, Dev Recall 0.0


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 63, loss: 10.399799480438233
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.0,  Dev Precision 0.0, Dev Recall 0.0


  0%|          | 0/75 [00:00<?, ?it/s]

epoch 63, loss: 10.399799480438233
Evaluating dev...


  0%|          | 0/8 [00:00<?, ?it/s]

Dev F1 0.0,  Dev Precision 0.0, Dev Recall 0.0


  0%|          | 0/75 [00:00<?, ?it/s]

In [124]:
print("Evaluating dev...")
all_preds = []
all_labels = []
for sents, labels in tqdm(zip(val_input_batches, val_label_batches), total=len(val_input_batches)):
    pred = predict(model, sents)
    all_preds.extend(pred)
    all_labels.extend(list(labels.numpy()))
# #print(range(len(set(train_labels))))

dev_f1, dev_P, dev_R = f1Score_multiLabel(all_preds, all_labels)
print(f"Dev F1 {dev_f1},  Dev Precision {dev_P}, Dev Recall {dev_R}")


Evaluating dev...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for sents, labels in tqdm(zip(val_input_batches, val_label_batches), total=len(val_input_batches)):


  0%|          | 0/8 [00:00<?, ?it/s]

[140, 114, 2, 0, 34, 0, 5, 34, 31, 37, 185, 0, 91, 0, 13, 0, 0, 0, 10, 50] [500, 500, 12, 9, 153, 0, 500, 500, 500, 500, 500, 0, 394, 0, 493, 0, 1, 0, 82, 482] [140, 114, 53, 6, 108, 22, 5, 34, 31, 37, 185, 145, 105, 62, 13, 39, 171, 26, 99, 52]
0.28 1.0 0
0.228 1.0 1
0.16666666666666666 0.03773584905660377 2
0.0 0.0 3
0.2222222222222222 0.3148148148148148 4
0.2222222222222222 0.0 5
0.01 1.0 6
0.068 1.0 7
0.062 1.0 8
0.074 1.0 9
0.37 1.0 10
0.37 0.0 11
0.23096446700507614 0.8666666666666667 12
0.23096446700507614 0.0 13
0.02636916835699797 1.0 14
0.02636916835699797 0.0 15
0.0 0.0 16
0.0 0.0 17
0.12195121951219512 0.10101010101010101 18
0.1037344398340249 0.9615384615384616 19
Dev F1 0.5140882946543324,  Dev Precision 0.140673202059074, Dev Recall 0.5140882946543324


In [113]:
np.sum(all_preds)

0

In [111]:
def f1Score_multiLabel(preds, labels):
    nLabels = 20
    relevants = [0]*20
    positives = [0]*20
    truePositives = [0]*20
    for i in range(len(preds)):
        for j in range(nLabels):
            if(preds[i][j]==1):
                positives[j] += 1
                if(labels[i][j]==1):
                    truePositives[j] += 1
    
    for i in range(len(labels)):
        for j in range(nLabels):
            if(labels[i][j]==1):
                relevants[j] += 1
    
    precisions = []*nLabels
    recalls = []*nLabels
    f1Scores = []*nLabels
    precision =0
    recall = 0
    f1 = 0
    print(truePositives, positives, relevants)
    for i in range(nLabels):
        if(positives[i]>0):
            precision = truePositives[i]/positives[i]
        precisions.append(precision)
        if(relevants[i]>0):
            recall = truePositives[i]/relevants[i]
        recalls.append(recall)
        print(precision,recall,i)
        if(positives[i]>0 and relevants[i]>0):
            f1 = 2 * precision * recall / (precision + recall)
        f1Scores.append(f1)
    precision_mean = np.mean(precisions)
    recall_mean = np.mean(recalls)
    f1_mean = np.mean(recalls)
    return f1_mean, precision_mean, recall_mean
    


In [103]:
train_label_batches[0][0][10]

tensor(0)

In [85]:
from torch import nn
loss = nn.CrossEntropyLoss()
input = torch.randn(1, 5, requires_grad=True)
#target = torch.empty(3, dtype=torch.long).random_(5)
target = torch.randn(1, 5).softmax(dim=1)
print(input, target)


tensor([[ 0.9030,  1.0262, -0.0431,  0.4149,  1.3941]], requires_grad=True) tensor([[0.5257, 0.1721, 0.2647, 0.0207, 0.0167]])


In [86]:
output = loss(input, target)
print(output)

tensor(1.7929, grad_fn=<DivBackward1>)


In [88]:
a = torch.log(input)
a

tensor([[-0.1021,  0.0259,     nan, -0.8798,  0.3323]], grad_fn=<LogBackward0>)

In [89]:
a*target

tensor([[-0.0537,  0.0045,     nan, -0.0182,  0.0056]], grad_fn=<MulBackward0>)

In [84]:
target = torch.empty(1, dtype=torch.long)
target[0] = 1
output = loss(input, target)
print(output)

tensor(1.5930, grad_fn=<NllLossBackward0>)


In [77]:
i = 1
losses=[10]
print(f"epoch {i}, loss: {np.sum(losses)/len(losses)}")

epoch 1, loss: 10.0


In [70]:
y.dtype

torch.float32