# Assessing the Funniness of Edited News Headlines

# Importing Required Libraries

In [None]:
# Installing HuggingFace transformers library
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/b0/9e/5b80becd952d5f7250eaf8fc64b957077b12ccfe73e9c03d37146ab29712/transformers-4.6.0-py3-none-any.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 24.0MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 51.8MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 49.0MB/s 
Installing collec

In [None]:
import random
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup

In [None]:
# Show the complete headline text in DataFrame displays.
# `None` means "no truncation"; the old `-1` sentinel is deprecated in pandas 1.x
# and is not accepted by newer pandas releases.
pd.set_option("display.max_colwidth", None)

# Data Collection and Preprocessing

## Data Collection

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the dataset CSVs are read from it below.
drive.mount(mountpoint="/content/drive/")

Mounted at /content/drive/


In [None]:
# Folder on the mounted Drive that holds the subtask-1 CSV files
data_dir = "/content/drive/My Drive/DataSets/subtask-1"

# Training split and development (validation) split
train_set = pd.read_csv(f"{data_dir}/train.csv")
valid_set = pd.read_csv(f"{data_dir}/dev.csv")

## Visualization and Normal Preprocessing

In [None]:
# Display the training DataFrame (pandas elides the middle rows of a long frame)
train_set

Unnamed: 0,id,original,edit,grades,meanGrade
0,14530,France is ‘ hunting down its citizens who joined <Isis/> ’ without trial in Iraq,twins,10000,0.2
1,13034,"Pentagon claims 2,000 % increase in Russian trolls after <Syria/> strikes . What does that mean ?",bowling,33110,1.6
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor Crashes <Coalition/>,party,22100,1.0
3,76,"In an apparent first , Iran and Israel <engage/> each other militarily",slap,20000,0.4
4,6164,Trump was told weeks ago that Flynn misled <Vice/> President .,school,0,0.0
...,...,...,...,...,...
9647,10899,State officials blast ' unprecedented ' DHS <move/> to secure electoral system,idea,0,0.0
9648,1781,Protesters Rally for <Refugees/> Detained at JFK Airport After Trump Ban,stewardesses,20000,0.4
9649,5628,Cruise line Carnival Corp. joins the fight against Bermuda 's same-sex <marriage/> ban,raisin,21000,0.6
9650,14483,Columbia police hunt woman seen with <gun/> near University of Missouri campus,cake,32200,1.4


In [None]:
# Display the validation DataFrame (pandas elides the middle rows of a long frame)
valid_set

Unnamed: 0,id,original,edit,grades,meanGrade
0,1723,Thousands of gay and bisexual <men/> convicted of long-abolished sexual offences are posthumously pardoned,swans,22100,1.0
1,12736,Special <prosecutor/> appointed to Trump Russia,chef,21100,0.8
2,12274,Spanish police detain man and search Ripoll addresses in hunt for terror <suspects/>,squad,21000,0.6
3,8823,N.Y. Times <reprimands/> reporter for sharing ' unfounded rumor ' about Melania Trump,applauds,32210,1.6
4,5087,Vladimir Putin Releases Video Simulation Of Russian <Missile/> striking Florida conveniently right on top of USSOCOM headquarters at MacDill AFB .,balloon,11000,0.4
...,...,...,...,...,...
2414,1202,Supreme <Court/> Once Again Strikes Down Racial Gerrymandering In North Carolina,leaders,10000,0.2
2415,14764,Trump Mocks Schumer ’s Tears ; Vows to ‘ Make America <Safe/> Again ’,Insane,33333,3.0
2416,12595,US government memo on the <danger/> of leaking to media has been leaked,amusement,22111,1.4
2417,70,Newt Gingrich : Join Me in Supporting Judge Roy Moore to <Advance/> the President ’s Agenda,Molest,32110,1.4


In [None]:
# Build the edited headline: the `<word/>` tag in `original` is swapped for `edit`

def substitute_edit(row):
    """Return row["original"] with its `<.../>` tag replaced by row["edit"]."""
    headline = row["original"]
    tag_start = headline.find('<')
    tag_end = headline.find(">") + 1
    tagged_word = headline[tag_start:tag_end]
    return headline.replace(tagged_word, row["edit"])

train_set["new"] = train_set.apply(substitute_edit, axis=1)
valid_set["new"] = valid_set.apply(substitute_edit, axis=1)

In [None]:
train_set.head(10)

Unnamed: 0,id,original,edit,grades,meanGrade,new
0,14530,France is ‘ hunting down its citizens who joined <Isis/> ’ without trial in Iraq,twins,10000,0.2,France is ‘ hunting down its citizens who joined twins ’ without trial in Iraq
1,13034,"Pentagon claims 2,000 % increase in Russian trolls after <Syria/> strikes . What does that mean ?",bowling,33110,1.6,"Pentagon claims 2,000 % increase in Russian trolls after bowling strikes . What does that mean ?"
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor Crashes <Coalition/>,party,22100,1.0,Iceland PM Calls Snap Vote as Pedophile Furor Crashes party
3,76,"In an apparent first , Iran and Israel <engage/> each other militarily",slap,20000,0.4,"In an apparent first , Iran and Israel slap each other militarily"
4,6164,Trump was told weeks ago that Flynn misled <Vice/> President .,school,0,0.0,Trump was told weeks ago that Flynn misled school President .
5,8832,"All 22 <promises/> Trump made in his speech to Congress , in one chart",sounds,22200,1.2,"All 22 sounds Trump made in his speech to Congress , in one chart"
6,12174,New DOJ alert system will flag <crimes/> against police,laughter,32100,1.2,New DOJ alert system will flag laughter against police
7,3731,"As Someone Who Grew Up Among Fundamentalist <Christians/> In The US , I 'm Surprised Anyone 's Surprised About Roy Moore",morons,21110,1.0,"As Someone Who Grew Up Among Fundamentalist morons In The US , I 'm Surprised Anyone 's Surprised About Roy Moore"
8,6554,"Canadians may pay more taxes than Americans , but here 's what they get for their <money/>",loonies,10000,0.2,"Canadians may pay more taxes than Americans , but here 's what they get for their loonies"
9,14191,Dutch minister resigns in drug baron <row/>,blow,0,0.0,Dutch minister resigns in drug baron blow


In [None]:
# Extract the word that was replaced, i.e. the text between '<' and '/>' in `original`

def extract_tagged_word(row):
    """Return the word wrapped by the `<word/>` tag of row["original"]."""
    headline = row["original"]
    word_start = headline.find('<') + 1
    # '>' is preceded by '/', so stop one character before it
    word_end = headline.find('>') - 1
    return headline[word_start:word_end]

train_set["old_words"] = train_set.apply(extract_tagged_word, axis=1)
valid_set["old_words"] = valid_set.apply(extract_tagged_word, axis=1)

In [None]:
train_set.head(10)

Unnamed: 0,id,original,edit,grades,meanGrade,new,old_words
0,14530,France is ‘ hunting down its citizens who joined <Isis/> ’ without trial in Iraq,twins,10000,0.2,France is ‘ hunting down its citizens who joined twins ’ without trial in Iraq,Isis
1,13034,"Pentagon claims 2,000 % increase in Russian trolls after <Syria/> strikes . What does that mean ?",bowling,33110,1.6,"Pentagon claims 2,000 % increase in Russian trolls after bowling strikes . What does that mean ?",Syria
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor Crashes <Coalition/>,party,22100,1.0,Iceland PM Calls Snap Vote as Pedophile Furor Crashes party,Coalition
3,76,"In an apparent first , Iran and Israel <engage/> each other militarily",slap,20000,0.4,"In an apparent first , Iran and Israel slap each other militarily",engage
4,6164,Trump was told weeks ago that Flynn misled <Vice/> President .,school,0,0.0,Trump was told weeks ago that Flynn misled school President .,Vice
5,8832,"All 22 <promises/> Trump made in his speech to Congress , in one chart",sounds,22200,1.2,"All 22 sounds Trump made in his speech to Congress , in one chart",promises
6,12174,New DOJ alert system will flag <crimes/> against police,laughter,32100,1.2,New DOJ alert system will flag laughter against police,crimes
7,3731,"As Someone Who Grew Up Among Fundamentalist <Christians/> In The US , I 'm Surprised Anyone 's Surprised About Roy Moore",morons,21110,1.0,"As Someone Who Grew Up Among Fundamentalist morons In The US , I 'm Surprised Anyone 's Surprised About Roy Moore",Christians
8,6554,"Canadians may pay more taxes than Americans , but here 's what they get for their <money/>",loonies,10000,0.2,"Canadians may pay more taxes than Americans , but here 's what they get for their loonies",money
9,14191,Dutch minister resigns in drug baron <row/>,blow,0,0.0,Dutch minister resigns in drug baron blow,row


In [None]:
valid_set.head(10)

Unnamed: 0,id,original,edit,grades,meanGrade,new,old_words
0,1723,Thousands of gay and bisexual <men/> convicted of long-abolished sexual offences are posthumously pardoned,swans,22100,1.0,Thousands of gay and bisexual swans convicted of long-abolished sexual offences are posthumously pardoned,men
1,12736,Special <prosecutor/> appointed to Trump Russia,chef,21100,0.8,Special chef appointed to Trump Russia,prosecutor
2,12274,Spanish police detain man and search Ripoll addresses in hunt for terror <suspects/>,squad,21000,0.6,Spanish police detain man and search Ripoll addresses in hunt for terror squad,suspects
3,8823,N.Y. Times <reprimands/> reporter for sharing ' unfounded rumor ' about Melania Trump,applauds,32210,1.6,N.Y. Times applauds reporter for sharing ' unfounded rumor ' about Melania Trump,reprimands
4,5087,Vladimir Putin Releases Video Simulation Of Russian <Missile/> striking Florida conveniently right on top of USSOCOM headquarters at MacDill AFB .,balloon,11000,0.4,Vladimir Putin Releases Video Simulation Of Russian balloon striking Florida conveniently right on top of USSOCOM headquarters at MacDill AFB .,Missile
5,13178,"Ex-Goldman Sachs boss , Obama ambassador Murphy wins Democratic primary in bid to <replace/> New Jersey GOP Gov. Christie",chase,11000,0.4,"Ex-Goldman Sachs boss , Obama ambassador Murphy wins Democratic primary in bid to chase New Jersey GOP Gov. Christie",replace
6,11799,Trump ’s next military <scapegoat/> : Foreign-born service members targeted by Pentagon,assassinations,21100,0.8,Trump ’s next military assassinations : Foreign-born service members targeted by Pentagon,scapegoat
7,13425,President Trump ’s Golden Age of <Trolling/>,Skydiving,21100,0.8,President Trump ’s Golden Age of Skydiving,Trolling
8,12497,"US urges UN to <punish/> Iran , but Russia says no sanctions",tickle,21110,1.0,"US urges UN to tickle Iran , but Russia says no sanctions",punish
9,1185,Taliban <kill/> 95 with ambulance bomb,bores,11000,0.4,Taliban bores 95 with ambulance bomb,kill


In [None]:
# Model input text: the edited headline, a [SEP] token, then a short description
# of the edit ("From <old word> to <new word>"), since humour is a relative quantity

def compose_model_text(row):
    """Return '<edited headline> [SEP] From <old word> to <edit word>' for one row."""
    pieces = [row["new"], ' [SEP] From ', row["old_words"], ' to ', row["edit"]]
    return "".join(pieces)

train_set["new_text"] = train_set.apply(compose_model_text, axis=1)
valid_set["new_text"] = valid_set.apply(compose_model_text, axis=1)

In [None]:
train_set.head(10)

Unnamed: 0,id,original,edit,grades,meanGrade,new,old_words,new_text
0,14530,France is ‘ hunting down its citizens who joined <Isis/> ’ without trial in Iraq,twins,10000,0.2,France is ‘ hunting down its citizens who joined twins ’ without trial in Iraq,Isis,France is ‘ hunting down its citizens who joined twins ’ without trial in Iraq [SEP] From Isis to twins
1,13034,"Pentagon claims 2,000 % increase in Russian trolls after <Syria/> strikes . What does that mean ?",bowling,33110,1.6,"Pentagon claims 2,000 % increase in Russian trolls after bowling strikes . What does that mean ?",Syria,"Pentagon claims 2,000 % increase in Russian trolls after bowling strikes . What does that mean ? [SEP] From Syria to bowling"
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor Crashes <Coalition/>,party,22100,1.0,Iceland PM Calls Snap Vote as Pedophile Furor Crashes party,Coalition,Iceland PM Calls Snap Vote as Pedophile Furor Crashes party [SEP] From Coalition to party
3,76,"In an apparent first , Iran and Israel <engage/> each other militarily",slap,20000,0.4,"In an apparent first , Iran and Israel slap each other militarily",engage,"In an apparent first , Iran and Israel slap each other militarily [SEP] From engage to slap"
4,6164,Trump was told weeks ago that Flynn misled <Vice/> President .,school,0,0.0,Trump was told weeks ago that Flynn misled school President .,Vice,Trump was told weeks ago that Flynn misled school President . [SEP] From Vice to school
5,8832,"All 22 <promises/> Trump made in his speech to Congress , in one chart",sounds,22200,1.2,"All 22 sounds Trump made in his speech to Congress , in one chart",promises,"All 22 sounds Trump made in his speech to Congress , in one chart [SEP] From promises to sounds"
6,12174,New DOJ alert system will flag <crimes/> against police,laughter,32100,1.2,New DOJ alert system will flag laughter against police,crimes,New DOJ alert system will flag laughter against police [SEP] From crimes to laughter
7,3731,"As Someone Who Grew Up Among Fundamentalist <Christians/> In The US , I 'm Surprised Anyone 's Surprised About Roy Moore",morons,21110,1.0,"As Someone Who Grew Up Among Fundamentalist morons In The US , I 'm Surprised Anyone 's Surprised About Roy Moore",Christians,"As Someone Who Grew Up Among Fundamentalist morons In The US , I 'm Surprised Anyone 's Surprised About Roy Moore [SEP] From Christians to morons"
8,6554,"Canadians may pay more taxes than Americans , but here 's what they get for their <money/>",loonies,10000,0.2,"Canadians may pay more taxes than Americans , but here 's what they get for their loonies",money,"Canadians may pay more taxes than Americans , but here 's what they get for their loonies [SEP] From money to loonies"
9,14191,Dutch minister resigns in drug baron <row/>,blow,0,0.0,Dutch minister resigns in drug baron blow,row,Dutch minister resigns in drug baron blow [SEP] From row to blow


In [None]:
valid_set.head(10)

Unnamed: 0,id,original,edit,grades,meanGrade,new,old_words,new_text
0,1723,Thousands of gay and bisexual <men/> convicted of long-abolished sexual offences are posthumously pardoned,swans,22100,1.0,Thousands of gay and bisexual swans convicted of long-abolished sexual offences are posthumously pardoned,men,Thousands of gay and bisexual swans convicted of long-abolished sexual offences are posthumously pardoned [SEP] From men to swans
1,12736,Special <prosecutor/> appointed to Trump Russia,chef,21100,0.8,Special chef appointed to Trump Russia,prosecutor,Special chef appointed to Trump Russia [SEP] From prosecutor to chef
2,12274,Spanish police detain man and search Ripoll addresses in hunt for terror <suspects/>,squad,21000,0.6,Spanish police detain man and search Ripoll addresses in hunt for terror squad,suspects,Spanish police detain man and search Ripoll addresses in hunt for terror squad [SEP] From suspects to squad
3,8823,N.Y. Times <reprimands/> reporter for sharing ' unfounded rumor ' about Melania Trump,applauds,32210,1.6,N.Y. Times applauds reporter for sharing ' unfounded rumor ' about Melania Trump,reprimands,N.Y. Times applauds reporter for sharing ' unfounded rumor ' about Melania Trump [SEP] From reprimands to applauds
4,5087,Vladimir Putin Releases Video Simulation Of Russian <Missile/> striking Florida conveniently right on top of USSOCOM headquarters at MacDill AFB .,balloon,11000,0.4,Vladimir Putin Releases Video Simulation Of Russian balloon striking Florida conveniently right on top of USSOCOM headquarters at MacDill AFB .,Missile,Vladimir Putin Releases Video Simulation Of Russian balloon striking Florida conveniently right on top of USSOCOM headquarters at MacDill AFB . [SEP] From Missile to balloon
5,13178,"Ex-Goldman Sachs boss , Obama ambassador Murphy wins Democratic primary in bid to <replace/> New Jersey GOP Gov. Christie",chase,11000,0.4,"Ex-Goldman Sachs boss , Obama ambassador Murphy wins Democratic primary in bid to chase New Jersey GOP Gov. Christie",replace,"Ex-Goldman Sachs boss , Obama ambassador Murphy wins Democratic primary in bid to chase New Jersey GOP Gov. Christie [SEP] From replace to chase"
6,11799,Trump ’s next military <scapegoat/> : Foreign-born service members targeted by Pentagon,assassinations,21100,0.8,Trump ’s next military assassinations : Foreign-born service members targeted by Pentagon,scapegoat,Trump ’s next military assassinations : Foreign-born service members targeted by Pentagon [SEP] From scapegoat to assassinations
7,13425,President Trump ’s Golden Age of <Trolling/>,Skydiving,21100,0.8,President Trump ’s Golden Age of Skydiving,Trolling,President Trump ’s Golden Age of Skydiving [SEP] From Trolling to Skydiving
8,12497,"US urges UN to <punish/> Iran , but Russia says no sanctions",tickle,21110,1.0,"US urges UN to tickle Iran , but Russia says no sanctions",punish,"US urges UN to tickle Iran , but Russia says no sanctions [SEP] From punish to tickle"
9,1185,Taliban <kill/> 95 with ambulance bomb,bores,11000,0.4,Taliban bores 95 with ambulance bomb,kill,Taliban bores 95 with ambulance bomb [SEP] From kill to bores


## Tokenizing and Formatting data according to BERT standards

In [None]:
# Tokenizer that goes with the uncased BERT-base checkpoint used for the model below
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [None]:
# Train data
# Encode the combined texts (edited headline + [SEP] + edit description) for BERT:
# the tokenizer adds [CLS] at the start and [SEP] at the end, truncates/pads every
# sequence to exactly 32 tokens, builds the attention masks and returns PyTorch tensors.
# NOTE(review): texts longer than 32 tokens are cut on the right, which can drop the
# trailing "From <old> to <new>" part -- confirm that 32 is long enough for this data.

sentences = train_set["new_text"].values
labels = train_set["meanGrade"].values

# One batched call replaces the per-sentence loop + torch.cat.
# `padding="max_length"` is the supported spelling of the deprecated `pad_to_max_length=True`.
encoded_train = tokenizer(
                    sentences.tolist(),
                    add_special_tokens = True,  # [CLS] and [SEP]
                    truncation = True,
                    max_length = 32,
                    padding = "max_length",
                    return_attention_mask = True,
                    return_tensors = "pt"  # PyTorch Tensors
                )

# Both tensors have one row per sentence and 32 columns
input_ids = encoded_train["input_ids"]
attention_masks = encoded_train["attention_mask"]

# Regression targets; keeps the dtype of the meanGrade column
labels = torch.tensor(labels)

In [None]:
# Token-id tensor of the training texts, one row per text
input_ids

tensor([[  101,  2605,  2003,  ...,     0,     0,     0],
        [  101, 20864,  4447,  ...,     0,     0,     0],
        [  101, 10399,  7610,  ...,     0,     0,     0],
        ...,
        [  101,  8592,  2240,  ...,     0,     0,     0],
        [  101,  3996,  2610,  ...,     0,     0,     0],
        [  101,  2182,  1005,  ...,     0,     0,     0]])

In [None]:
# Attention-mask tensor returned by the tokenizer for the training texts
attention_masks

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

In [None]:
# Validation data
# Same encoding as for the training data: [CLS]/[SEP] added by the tokenizer,
# every sequence truncated/padded to exactly 32 tokens, attention masks built,
# PyTorch tensors returned.
# NOTE(review): texts longer than 32 tokens are cut on the right, which can drop the
# trailing "From <old> to <new>" part -- confirm that 32 is long enough for this data.

sentences_valid = valid_set["new_text"].values
labels_valid = valid_set["meanGrade"].values

# One batched call replaces the per-sentence loop + torch.cat.
# `padding="max_length"` is the supported spelling of the deprecated `pad_to_max_length=True`.
encoded_valid = tokenizer(
                    sentences_valid.tolist(),
                    add_special_tokens = True,  # [CLS] and [SEP]
                    truncation = True,
                    max_length = 32,
                    padding = "max_length",
                    return_attention_mask = True,
                    return_tensors = "pt"  # PyTorch Tensors
                )

# Both tensors have one row per sentence and 32 columns
input_ids_valid = encoded_valid["input_ids"]
attention_masks_valid = encoded_valid["attention_mask"]

# Regression targets; keeps the dtype of the meanGrade column
labels_valid = torch.tensor(labels_valid)

In [None]:
# Token-id tensor of the validation texts, one row per text
input_ids_valid

tensor([[  101,  5190,  1997,  ...,     0,     0,     0],
        [  101,  2569, 10026,  ...,     0,     0,     0],
        [  101,  3009,  2610,  ...,     0,     0,     0],
        ...,
        [  101,  2149,  2231,  ...,     0,     0,     0],
        [  101, 25597, 18353,  ...,     0,     0,     0],
        [  101,  1999,  3945,  ...,     0,     0,     0]])

In [None]:
# Attention-mask tensor returned by the tokenizer for the validation texts
attention_masks_valid

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])

## Creating DataLoaders for BERT

In [None]:
batch_size = 64

# Training split: batches are drawn in random order
train_dataset = TensorDataset(input_ids, attention_masks, labels)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=RandomSampler(train_dataset),
)

# Validation split: batches are read in their original order
valid_dataset = TensorDataset(input_ids_valid, attention_masks_valid, labels_valid)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(valid_dataset),
)

# Modelling: Fine-Tuning BERT for this task

## Defining our BERT model

In [None]:
# BertForSequenceClassification with a single output unit (num_labels=1), so the
# classification head acts as a regressor for the mean funniness grade.
# Checkpoint: the uncased, base version of BERT (12 layers).
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=1,
    return_dict=False,
    output_hidden_states=False,
    output_attentions=False,
)

# Move the model to the GPU, then switch its parameters to double precision so it
# can work on double type values
model.cuda().double()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## Defining training parameters

In [None]:
# Number of passes over the training data and the resulting number of optimizer steps
epochs = 5
total_steps = epochs * len(train_loader)

# Adam algorithm with weight decay
optimizer = AdamW(model.parameters(), eps=1e-8, lr=9e-6)

# Linear learning-rate schedule over all training steps, without warm-up
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)

## Defining training loop

In [None]:
# Fine-tuning loop: every epoch runs one optimisation pass over train_loader,
# then one evaluation pass over valid_loader, and records the metrics of both.

seed_value = 99

random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)

training_history = []

for e in range(epochs):

    print("")
    print(f"------------- Epoch {e+1} / {epochs} ----------------")
    print("Training........")

    total_train_loss = 0

    # Training mode
    model.train()

    for batch in train_loader:

        # Unpacking data from loader and moving to gpu.
        # Batch-local names are used on purpose: reusing `input_ids`, `attention_masks`
        # and `labels` would overwrite the full-dataset tensors that the DataLoader
        # cell reads, so re-running that cell afterwards would silently use one batch.
        batch_input_ids = batch[0].cuda()
        batch_attention_masks = batch[1].cuda()
        batch_labels = batch[2].cuda()

        # Clearing previous gradients
        model.zero_grad()

        # Forward pass without labels; the first element of the output is the logits
        logits = model(
                     batch_input_ids,
                     token_type_ids = None,
                     attention_mask = batch_attention_masks
                 )[0]

        # The MSE loss is computed here on flattened tensors instead of by the model.
        # Logits are (batch, 1) while labels are (batch,); a loss that compares them
        # without reshaping broadcasts to (batch, batch) and drives every prediction
        # towards the label mean.
        # NOTE(review): some transformers releases compute the built-in regression loss
        # that way, which would explain a validation RMSE that never improves -- confirm.
        loss = torch.nn.functional.mse_loss(logits.view(-1), batch_labels.view(-1))

        # Accumulating training loss over all batches to calculate average later
        total_train_loss += loss.item()

        # Calculating gradients
        loss.backward()

        # Gradient clipping to deal with exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Updating the weights
        optimizer.step()

        # Updating learning rate with scheduler
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_loader)

    print("")
    print(f"..... Average training loss: {avg_train_loss:.2f} ......")

    print("\nValidation........")

    # Evaluation Phase
    model.eval()

    total_eval_loss = 0

    # Per-batch predictions and targets, concatenated once after the loop
    batch_preds = []
    batch_targets = []

    for batch in valid_loader:

        batch_input_ids = batch[0].cuda()
        batch_attention_masks = batch[1].cuda()
        batch_labels = batch[2].cuda()

        # No gradients to be calculated
        with torch.no_grad():

            logits = model(
                         batch_input_ids,
                         token_type_ids = None,
                         attention_mask = batch_attention_masks
                     )[0]

            # Same explicit, shape-safe loss as in the training pass
            loss = torch.nn.functional.mse_loss(logits.view(-1), batch_labels.view(-1))

        total_eval_loss += loss.item()

        # Moving flattened predictions and labels to cpu
        batch_preds.append(logits.view(-1).cpu().numpy())
        batch_targets.append(batch_labels.view(-1).cpu().numpy())

    y_pred = np.concatenate(batch_preds)
    y_true = np.concatenate(batch_targets)

    # Using RMSE as our evaluation metric, since it is a regression task.
    # The square root is taken explicitly because the `squared=False` argument of
    # mean_squared_error is deprecated in newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))
    print(f"---- RMSE: {rmse:.4f} ----")

    # Calculate average loss over all the batches
    avg_val_loss = total_eval_loss / len(valid_loader)

    print(f"..... Average validation loss: {avg_val_loss:.2f} ......")

    training_history.append(
        {
            "epoch": e+1,
            "Training Loss": avg_train_loss,
            "Validation Loss": avg_val_loss,
            "RMSE": rmse,
        }
    )

print("\n Training Finished!!")


------------- Epoch 1 / 5 ----------------
Training........

..... Average training loss: 0.36 ......

Validation........
---- RMSE: 0.5785 ----
..... Average validation loss: 0.34 ......

------------- Epoch 2 / 5 ----------------
Training........

..... Average training loss: 0.35 ......

Validation........
---- RMSE: 0.5795 ----
..... Average validation loss: 0.34 ......

------------- Epoch 3 / 5 ----------------
Training........

..... Average training loss: 0.34 ......

Validation........
---- RMSE: 0.5786 ----
..... Average validation loss: 0.34 ......

------------- Epoch 4 / 5 ----------------
Training........

..... Average training loss: 0.34 ......

Validation........
---- RMSE: 0.5780 ----
..... Average validation loss: 0.34 ......

------------- Epoch 5 / 5 ----------------
Training........

..... Average training loss: 0.34 ......

Validation........
---- RMSE: 0.5781 ----
..... Average validation loss: 0.34 ......

 Training Finished!!


Training for 5 epochs took around an hour.

## Tuning Hyperparameters for better results

In [None]:
# Start this run from the pretrained checkpoint again. Without this, the new
# hyperparameters would be applied to the model already fine-tuned by the previous
# run, so the two runs could not be compared.
model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels = 1,
        output_attentions = False,
        output_hidden_states = False,
        return_dict=False
    )

# Same device and precision as the first run
model.cuda()
model.double()

# Adam Algorithm with Weight Decay (higher learning rate than the first run)
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8)

epochs = 10
total_steps = len(train_loader) * epochs

# Learning rate scheduler: linear schedule over all training steps, without warm-up
scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps = 0,
                num_training_steps = total_steps
            )

In [None]:
# Fine-tuning loop: every epoch runs one optimisation pass over train_loader,
# then one evaluation pass over valid_loader, and records the metrics of both.

seed_value = 99

random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)

training_history = []

for e in range(epochs):

    print("")
    print(f"------------- Epoch {e+1} / {epochs} ----------------")
    print("Training........")

    total_train_loss = 0

    # Training mode
    model.train()

    for batch in train_loader:

        # Unpacking data from loader and moving to gpu.
        # Batch-local names are used on purpose: reusing `input_ids`, `attention_masks`
        # and `labels` would overwrite the full-dataset tensors that the DataLoader
        # cell reads, so re-running that cell afterwards would silently use one batch.
        batch_input_ids = batch[0].cuda()
        batch_attention_masks = batch[1].cuda()
        batch_labels = batch[2].cuda()

        # Clearing previous gradients
        model.zero_grad()

        # Forward pass without labels; the first element of the output is the logits
        logits = model(
                     batch_input_ids,
                     token_type_ids = None,
                     attention_mask = batch_attention_masks
                 )[0]

        # The MSE loss is computed here on flattened tensors instead of by the model.
        # Logits are (batch, 1) while labels are (batch,); a loss that compares them
        # without reshaping broadcasts to (batch, batch) and drives every prediction
        # towards the label mean.
        # NOTE(review): some transformers releases compute the built-in regression loss
        # that way, which would explain a validation RMSE that never improves -- confirm.
        loss = torch.nn.functional.mse_loss(logits.view(-1), batch_labels.view(-1))

        # Accumulating training loss over all batches to calculate average later
        total_train_loss += loss.item()

        # Calculating gradients
        loss.backward()

        # Gradient clipping to deal with exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Updating the weights
        optimizer.step()

        # Updating learning rate with scheduler
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_loader)

    print("")
    print(f"..... Average training loss: {avg_train_loss:.2f} ......")

    print("\nValidation........")

    # Evaluation Phase
    model.eval()

    total_eval_loss = 0

    # Per-batch predictions and targets, concatenated once after the loop
    batch_preds = []
    batch_targets = []

    for batch in valid_loader:

        batch_input_ids = batch[0].cuda()
        batch_attention_masks = batch[1].cuda()
        batch_labels = batch[2].cuda()

        # No gradients to be calculated
        with torch.no_grad():

            logits = model(
                         batch_input_ids,
                         token_type_ids = None,
                         attention_mask = batch_attention_masks
                     )[0]

            # Same explicit, shape-safe loss as in the training pass
            loss = torch.nn.functional.mse_loss(logits.view(-1), batch_labels.view(-1))

        total_eval_loss += loss.item()

        # Moving flattened predictions and labels to cpu
        batch_preds.append(logits.view(-1).cpu().numpy())
        batch_targets.append(batch_labels.view(-1).cpu().numpy())

    y_pred = np.concatenate(batch_preds)
    y_true = np.concatenate(batch_targets)

    # Using RMSE as our evaluation metric, since it is a regression task.
    # The square root is taken explicitly because the `squared=False` argument of
    # mean_squared_error is deprecated in newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))
    print(f"---- RMSE: {rmse:.4f} ----")

    # Calculate average loss over all the batches
    avg_val_loss = total_eval_loss / len(valid_loader)

    print(f"..... Average validation loss: {avg_val_loss:.2f} ......")

    training_history.append(
        {
            "epoch": e+1,
            "Training Loss": avg_train_loss,
            "Validation Loss": avg_val_loss,
            "RMSE": rmse,
        }
    )

print("\n Training Finished!!")


------------- Epoch 1 / 10 ----------------
Training........

..... Average training loss: 0.36 ......

Validation........
---- RMSE: 0.5791 ----
..... Average validation loss: 0.34 ......

------------- Epoch 2 / 10 ----------------
Training........

..... Average training loss: 0.34 ......

Validation........
---- RMSE: 0.5788 ----
..... Average validation loss: 0.34 ......

------------- Epoch 3 / 10 ----------------
Training........

..... Average training loss: 0.34 ......

Validation........
---- RMSE: 0.5781 ----
..... Average validation loss: 0.34 ......

------------- Epoch 4 / 10 ----------------
Training........

..... Average training loss: 0.34 ......

Validation........
---- RMSE: 0.5786 ----
..... Average validation loss: 0.34 ......

------------- Epoch 5 / 10 ----------------
Training........

..... Average training loss: 0.34 ......

Validation........
---- RMSE: 0.5786 ----
..... Average validation loss: 0.34 ......

------------- Epoch 6 / 10 ----------------
Train

Training for 10 epochs took more than 2 hours

In [None]:
# Number of passes over the training data for this run
epochs = 7

# Number of training batches summed over all epochs
total_steps = epochs * len(train_loader)

# AdamW optimizer over all model parameters; no weight_decay argument is
# passed here, so the library's default value applies
optimizer = AdamW(model.parameters(), lr=9e-4, eps=1e-6)

# Linear learning-rate schedule without a warmup phase, spanning total_steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
# Fine-tuning loop: every epoch makes one optimisation pass over train_loader and
# then scores the model on valid_loader (average loss and RMSE). Per-epoch metrics
# are collected in `training_history`.
#
# NOTE(review): `model`, `optimizer` and `scheduler` are not created in this cell.
# If `model` was already trained by an earlier cell, this run continues from those
# weights rather than from a fresh checkpoint -- confirm that this is intended.

seed_value = 99

# Seeding every random number generator used by the run
random.seed(seed_value)
np.random.seed(seed_value)
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)

# Batches are moved to whichever device the model lives on instead of assuming a GPU
device = next(model.parameters()).device

training_history = []

for e in range(epochs):

    print("")
    print(f"------------- Epoch {e+1} / {epochs} ----------------")
    print("Training........")

    total_train_loss = 0

    # Training mode
    model.train()

    for batch in train_loader:

        # Unpacking data from loader and moving it to the model's device
        input_ids = batch[0].to(device)
        attention_masks = batch[1].to(device)
        labels = batch[2].to(device)

        # Clearing previous gradients
        model.zero_grad()

        # Doing a forward pass
        outputs = model(
            input_ids,
            token_type_ids = None,
            attention_mask = attention_masks,
            labels = labels
        )

        # Positional indexing works whether the model returns a plain tuple or a
        # transformers ModelOutput; because labels are passed, the loss comes
        # first and the logits second
        loss = outputs[0]
        logits = outputs[1]

        # Accumulating training loss over all batches to calculate average later
        total_train_loss += loss.item()

        # Calculating gradients
        loss.backward()

        # Gradient clipping to deal with exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Updating model weights from the clipped gradients
        optimizer.step()

        # Updating learning rate with scheduler (one step per batch)
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_loader)

    print("")
    print(f"..... Average training loss: {avg_train_loss:.2f} ......")

    print("\nValidation........")

    # Evaluation Phase
    model.eval()

    total_eval_loss = 0

    # Per-batch predictions / targets; joined once after the loop so the arrays
    # are not re-copied on every batch
    pred_chunks = []
    true_chunks = []

    for batch in valid_loader:

        input_ids = batch[0].to(device)
        attention_masks = batch[1].to(device)
        labels = batch[2].to(device)

        # No gradients to be calculated
        with torch.no_grad():

            outputs = model(
                input_ids,
                token_type_ids = None,
                attention_mask = attention_masks,
                labels = labels
            )

        loss = outputs[0]
        logits = outputs[1]

        total_eval_loss += loss.item()

        # Moving logits and labels to cpu
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        pred_chunks.append(logits.ravel())
        true_chunks.append(label_ids.ravel())

    # Flat float64 arrays covering the whole validation set
    y_pred = np.concatenate(pred_chunks).astype(np.float64)
    y_true = np.concatenate(true_chunks).astype(np.float64)

    # Using RMSE as our evaluation metric, since it is a regression task.
    # The square root is taken explicitly because the `squared=False` keyword of
    # mean_squared_error is not available in newer scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))
    print(f"---- RMSE: {rmse:.4f} ----")

    # Calculate average loss over all the batches
    avg_val_loss = total_eval_loss / len(valid_loader)

    print(f"..... Average validation loss: {avg_val_loss:.2f} ......")

    training_history.append(
        {
            "epoch": e+1,
            "Training Loss": avg_train_loss,
            "Validation Loss": avg_val_loss,
            "RMSE": rmse,
        }
    )

print("\n Training Finished!!")


------------- Epoch 1 / 7 ----------------
Training........

..... Average training loss: 0.43 ......

Validation........
---- RMSE: 0.5817 ----
..... Average validation loss: 0.34 ......

------------- Epoch 2 / 7 ----------------
Training........

..... Average training loss: 0.39 ......

Validation........
---- RMSE: 0.6045 ----
..... Average validation loss: 0.37 ......

------------- Epoch 3 / 7 ----------------
Training........

..... Average training loss: 0.37 ......

Validation........
---- RMSE: 0.5788 ----
..... Average validation loss: 0.34 ......

------------- Epoch 4 / 7 ----------------
Training........

..... Average training loss: 0.36 ......

Validation........
---- RMSE: 0.5834 ----
..... Average validation loss: 0.34 ......

------------- Epoch 5 / 7 ----------------
Training........

..... Average training loss: 0.36 ......

Validation........
---- RMSE: 0.5789 ----
..... Average validation loss: 0.34 ......

------------- Epoch 6 / 7 ----------------
Training...

Even after tuning the hyperparameters multiple times and retraining the model with each new set of parameters, the RMSE doesn't seem to decrease.