In [43]:
import torch
from pytorch_metric_learning import losses
import data_handler
from siamese_network import SiameseNetwork, train
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
from custom_losses import ContrastiveLoss

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
torch.zeros(1).cuda()
#print(f"torch version: {torch.__version__}")

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print(f"torch cuda available: {torch.cuda.is_available()}")

torch cuda available: True


In [44]:
df_data, _ = data_handler.load(path="dataset/", filename_train="train.csv", sep_char='#')

In [45]:
df_train, df_val = data_handler.split_train_data(df_data, perc_split=0.8)

zero_train:  13100
one_train:  3408
zero_val:  3275
one_val:  852


In [46]:
df_train = data_handler.concatenate_topics(df_train)
df_val = data_handler.concatenate_topics(df_val)

In [47]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16508 entries, 0 to 16507
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   args        16508 non-null  object 
 1   key_points  16508 non-null  object 
 2   labels      16508 non-null  float64
dtypes: float64(1), object(2)
memory usage: 387.0+ KB


In [52]:
df_train = df_train[13200:13500]

In [56]:
df_train = df_train.reset_index()

In [57]:
df_train[:100]

Unnamed: 0,index,args,key_points,labels
0,13200,prostitution happens; legalizing it would make...,We should legalize prostitution Legalizing sex...,1.0
1,13201,somebody was working hard to make something an...,We should abolish intellectual property rights...,1.0
2,13202,"urbanization destroys natural habits, impactin...",We should fight urbanization Urbanization harm...,1.0
3,13203,we should not adopt atheism as people have the...,We should adopt atheism Atheism discriminates ...,1.0
4,13204,cannabis is a wonder drug for people who suffe...,We should legalize cannabis Cannabis is safe/h...,1.0
...,...,...,...,...
95,13295,journalism must reach all citizens and the cen...,We should subsidize journalism Journalism is i...,1.0
96,13296,Journalism is the heart of a nation developmen...,We should subsidize journalism Journalism is i...,1.0
97,13297,compulsory voting does not promote democratic ...,We should introduce compulsory voting Compulso...,1.0
98,13298,parents should be able to utilize any technolo...,We should legalize sex selection It is within ...,1.0


In [59]:
tokenized = data_handler.tokenize_df(df_train[:100], BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


<class 'pandas.core.series.Series'>
(100,)


In [70]:
model = SiameseNetwork(bert_type=BertModel.from_pretrained('bert-base-uncased'))

train_loader = DataLoader(tokenized, shuffle=False, batch_size=32)

#train_loss = ContrastiveLoss()
train_loss = losses.ContrastiveLoss()

optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

# Batch size: 16, 32
# Learning rate (Adam): 5e-5, 3e-5, 2e-5
# Number of epochs: 2, 3, 4

# The BERT authors recommend between 2 and 4.
epochs = 1

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_loader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TypeError: super(type, obj): obj must be an instance or subtype of type

In [81]:
for epoch in range(1, epochs + 1):
    train(model, None, train_loader, ContrastiveLoss, optimizer, epoch, scheduler)
    #test(model, device, test_loader)


0
Train Epoch: 1 batch: 0 / 100 loss: -36.10004806518555
1
2
3


In [94]:
for i, encodings in enumerate(train_loader):
    args = encodings['arg']
    kps = encodings['kp']
    labels = encodings['label']
    if i==0:
        break
    

In [95]:
output1, output2 = model(args, kps, labels)

In [87]:
ContrastiveLoss(output1, output2, labels)

tensor(-35.8793, grad_fn=<AddBackward0>)

In [97]:
output1[0].size()

torch.Size([512, 768])

In [150]:
a = torch.tensor([[ [1],[3]], [[3],[5]]]).float()
e = torch.tensor([[[1], [2]], [[3],[3]]]).float()
l = torch.tensor([[1],[0]]).float()
a[0].size()

torch.Size([2, 1])

In [151]:
compute_contrastive_loss(a[0],e[0],l[0], 0.0)

tensor(nan, dtype=torch.float64)

In [179]:
def compute_contrastive_loss(left_feature, right_feature, label, margin):

    """
    Compute the contrastive loss as in


    L = 0.5 * Y * D^2 + 0.5 * (Y-1) * {max(0, margin - D)}^2

    **Parameters**
     left_feature: First element of the pair
     right_feature: Second element of the pair
     label: Label of the pair (0 or 1)
     margin: Contrastive margin

    **Returns**
     Return the loss operation

    """

    label = label.float()
    one = 1.0

    d = compute_euclidean_distance(left_feature, right_feature)
    d_sqrt = torch.sqrt(compute_euclidean_distance(left_feature, right_feature))
    first_part = torch.matmul(one-label, d)# (Y-1)*(d)

    max_margin = torch.maximum(margin-d_sqrt, torch.tensor(0))
    
    max_part = torch.square(max_margin)
    second_part = torch.matmul(label, max_part)  # (Y) * max(margin - d, 0)

    loss = 0.5 * torch.mean(first_part + second_part)

    return loss

def compute_euclidean_distance(x, y):
    """
    Computes the euclidean distance between two tensorflow variables
    """

    d = torch.sum(torch.square(torch.sub(x, y)),1)
    return d

In [12]:
def compute_contrastive_loss(left_feature, right_feature, label, margin):

    """
    Compute the contrastive loss as in


    L = 0.5 * Y * D^2 + 0.5 * (Y-1) * {max(0, margin - D)}^2

    **Parameters**
     left_feature: First element of the pair
     right_feature: Second element of the pair
     label: Label of the pair (0 or 1)
     margin: Contrastive margin

    **Returns**
     Return the loss operation

    """
    
    # -y * log(sim) + (1-y)*log(1-sim)

    label = label.float()
    
    cosine = torch.nn.CosineSimilarity()
    
    sim = torch.mean(torch.square(cosine(left_feature, right_feature))).resize(1)
    one = 1.0
   
    loss = torch.matmul(-label, torch.log(sim)) + torch.matmul((one-label).double(), torch.log(one-sim).double())
    
    return loss

In [140]:
def ContrastiveLoss(output1, output2, labels):
    
    loss = torch.tensor(0.0)
    
    for i in range(output1.size(0)):
    
        loss += compute_contrastive_loss(output1[i], output2[i], labels[i].resize(1), 0.1)
        print(loss)
    
    return loss

In [275]:
ContrastiveLoss(output1, output2, labels)

tensor(-16.8239, grad_fn=<AddBackward0>)

In [18]:
df_train[df_train['labels'] == 1].index

Int64Index([13100, 13101, 13102, 13103, 13104, 13105, 13106, 13107, 13108,
            13109,
            ...
            16498, 16499, 16500, 16501, 16502, 16503, 16504, 16505, 16506,
            16507],
           dtype='int64', length=3408)