In [None]:
from google.colab import drive
drive.mount("/content/drive/")

Mounted at /content/drive/


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4


In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import shutil
import sys

## Build training data

In [None]:
train_df = pd.read_csv('output-1.csv')

In [None]:
train_df.head()

Unnamed: 0,uuid,postText,targetParagraphs,spoiler,tags
0,0af11f6b-c889-4520-9372-66ba25cb7657,"Wes Welker Wanted Dinner With Tom Brady, But P...",It’ll be just like old times this weekend for ...,how about that morning we go throw?,passage
1,b1a1f63d-8853-4a11-89e8-6b2952a393ec,NASA sets date for full recovery of ozone hole,2070 is shaping up to be a great year for Moth...,2070,phrase
2,008b7b19-0445-4e16-8f9e-075b73f80ca4,This is what makes employees happy -- and it's...,"Despite common belief, money isn't the key to ...",intellectual stimulation,phrase
3,31ecf93c-3e21-4c80-949b-aa549a046b93,Passion is overrated — 7 work habits you need ...,"It’s common wisdom. Near gospel really, and no...",Purpose connects us to something bigger and in...,multi
4,31b108a3-c828-421a-a4b9-cf651e9ac859,The perfect way to cook rice so that it's perf...,"Boiling rice may seem simple, but there is a v...",in a rice cooker,phrase


In [None]:
# Combine the 'postText' and 'targetParagraphs' columns (joined by ". ") to get more context
train_df['context'] = train_df['postText'] + ". " + train_df['targetParagraphs']

In [None]:
# dropping useless features/columns
train_df.drop(labels=['uuid', 'postText', 'targetParagraphs', 'spoiler'], axis=1, inplace=True)

In [None]:
train_df = train_df[['context', 'tags']]

In [None]:
train_df.head()

Unnamed: 0,context,tags
0,"Wes Welker Wanted Dinner With Tom Brady, But P...",passage
1,NASA sets date for full recovery of ozone hole...,phrase
2,This is what makes employees happy -- and it's...,phrase
3,Passion is overrated — 7 work habits you need ...,multi
4,The perfect way to cook rice so that it's perf...,phrase


In [None]:
X = pd.get_dummies(train_df.tags, prefix='')

In [None]:
train_df['phrase'] = X['_phrase']
train_df['passage'] = X['_passage']
train_df['multi'] = X['_multi']

In [None]:
train_df.head()

Unnamed: 0,context,tags,phrase,passage,multi
0,"Wes Welker Wanted Dinner With Tom Brady, But P...",passage,0,1,0
1,NASA sets date for full recovery of ozone hole...,phrase,1,0,0
2,This is what makes employees happy -- and it's...,phrase,1,0,0
3,Passion is overrated — 7 work habits you need ...,multi,0,0,1
4,The perfect way to cook rice so that it's perf...,phrase,1,0,0


In [None]:
train_df.drop(labels=['tags'], axis=1, inplace=True)

In [None]:
train_df.columns

Index(['context', 'phrase', 'passage', 'multi'], dtype='object')

## Build validation data (used below as the held-out test set)

In [None]:
test_df = pd.read_csv('valid_output.csv')

In [None]:
test_df['context'] = test_df['postText'] + ". " + test_df['targetParagraphs']
# train_df.drop(labels=['uuid', 'postText', 'targetParagraphs', 'spoiler'], axis=1, inplace=True)

In [None]:
target_list = ['phrase', 'passage', 'multi']

In [None]:
# hyperparameters
MAX_LEN = 256  # max_length given to the tokenizer; inputs are padded/truncated to this
TRAIN_BATCH_SIZE = 32  # batch size of the training DataLoader
VALID_BATCH_SIZE = 32  # batch size of the validation DataLoader
EPOCHS = 2  # intended number of training epochs
LEARNING_RATE = 1e-05  # learning rate passed to the Adam optimizer

In [None]:
from transformers import BertTokenizer, BertModel

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
class PreprocessDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized ``context`` text with its target vector.

    Each item is a dict with 1-D ``input_ids``, ``attention_mask`` and
    ``token_type_ids`` tensors (as returned by ``tokenizer.encode_plus`` and then
    flattened) plus a float32 ``targets`` tensor holding the row's target columns.

    Args:
        df: DataFrame with a ``context`` column and the target columns.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: ``max_length`` forwarded to the tokenizer (padding/truncation length).
        target_cols: Names of the target columns. Defaults to the module-level
            ``target_list`` when omitted, which is the original behavior.
    """

    def __init__(self, df, tokenizer, max_len, target_cols=None):
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['context']
        columns = target_list if target_cols is None else list(target_cols)
        # Convert once to float32 so the dtype does not depend on how the
        # indicator columns were encoded (uint8, bool or int).
        self.targets = self.df[columns].to_numpy(dtype='float32')
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        # DataLoader indices are positions, not labels: use .iloc so a frame
        # whose index is not 0..n-1 (e.g. after sampling) still works.
        title = str(self.title.iloc[index])
        # Collapse runs of whitespace (including newlines) into single spaces.
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            # torch.tensor copies, so callers cannot mutate the stored targets.
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
train_size = 0.8
# Sample the training rows WITHOUT rebinding `train_df` first: the validation
# rows are "everything not sampled", so they must be dropped from the full
# frame by the sampled rows' original index labels. (Previously `train_df`
# was overwritten before the drop, which always produced an empty `val_df`.)
full_df = train_df
train_part = full_df.sample(frac=train_size, random_state=200)
val_df = full_df.drop(train_part.index).reset_index(drop=True)
train_df = train_part.reset_index(drop=True)

In [None]:
train_dataset = PreprocessDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = PreprocessDataset(val_df, tokenizer, MAX_LEN)

In [None]:
train_data_loader = torch.utils.data.DataLoader(train_dataset, 
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0
)

val_data_loader = torch.utils.data.DataLoader(valid_dataset, 
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        checkpoint_fpath: Path of a checkpoint written with ``torch.save`` that
            contains the keys ``state_dict``, ``optimizer``, ``epoch`` and
            ``valid_loss_min``.
        model: Module whose parameters are overwritten from ``state_dict``.
        optimizer: Optimizer whose state is overwritten from ``optimizer``.

    Returns:
        Tuple ``(model, optimizer, epoch, valid_loss_min)`` where
        ``valid_loss_min`` is a plain Python number.
    """
    # Load onto the CPU first so a checkpoint written on a GPU runtime can be
    # read on a CPU-only one; load_state_dict copies into the existing
    # parameters, wherever they live.
    checkpoint = torch.load(checkpoint_fpath, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    # The stored value may be a tensor/NumPy scalar or already a plain float.
    if hasattr(valid_loss_min, 'item'):
        valid_loss_min = valid_loss_min.item()
    return model, optimizer, checkpoint['epoch'], valid_loss_min

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write ``state`` to ``checkpoint_path`` and optionally copy it as the best model.

    Args:
        state: Object serialized with ``torch.save``.
        is_best: When truthy, the file just written is also copied to
            ``best_model_path``.
        checkpoint_path: Destination file of the checkpoint.
        best_model_path: Destination file of the copy made when ``is_best``.
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    shutil.copyfile(checkpoint_path, best_model_path)

In [None]:
class BERTTextClassification(torch.nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        n_classes: Number of output logits (default 3).
        dropout: Dropout probability applied to the pooled output (default 0.3).
        pretrained_name: Checkpoint name passed to ``BertModel.from_pretrained``
            (default ``'bert-base-uncased'``).
    """

    def __init__(self, n_classes=3, dropout=0.3, pretrained_name='bert-base-uncased'):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(pretrained_name, return_dict=True)
        self.dropout = torch.nn.Dropout(dropout)
        # Take the head's input width from the loaded config instead of
        # hard-coding 768, so other BERT sizes work unchanged.
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, n_classes)

    def forward(self, input_ids, attn_mask, token_type_ids):
        """Return the raw (pre-sigmoid) logits produced by the linear head."""
        encoded = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        pooled = self.dropout(encoded.pooler_output)
        return self.linear(pooled)

model = BERTTextClassification()
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [None]:
def loss_fn(outputs, targets):
    """Return the ``BCEWithLogitsLoss`` (default settings) of ``outputs`` against ``targets``."""
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
val_targets=[]
val_outputs=[]

In [None]:
def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path):
    """Fine-tune ``model`` for ``n_epochs`` epochs, validating after each one.

    Relies on names defined at module level elsewhere in the notebook:
    ``device``, ``loss_fn`` and the ``val_targets`` / ``val_outputs`` lists.

    Args:
        n_epochs: Number of passes over ``training_loader``.
        training_loader: Iterable of batches; each batch is a dict with the keys
            ``input_ids``, ``attention_mask``, ``token_type_ids`` and ``targets``.
        validation_loader: Iterable of batches with the same keys; may be empty.
        model: Module called as ``model(ids, mask, token_type_ids)``.
        optimizer: Optimizer used to update the model's parameters.
        checkpoint_path: Accepted for interface compatibility; not used because
            checkpoint saving is disabled in this function.
        best_model_path: Accepted for interface compatibility; not used because
            checkpoint saving is disabled in this function.

    Returns:
        The same ``model`` object after training.

    Side effects:
        Prints progress, and refills ``val_targets`` / ``val_outputs`` with the
        targets and sigmoid outputs of the most recent validation pass.
    """
    # initialize tracker for minimum validation loss
    valid_loss_min = float('inf')

    for epoch in range(1, n_epochs + 1):
        train_loss = 0.0
        valid_loss = 0.0

        model.train()
        print('############# Epoch {}: Training Start   #############'.format(epoch))
        for batch_idx, data in enumerate(training_loader):
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            optimizer.zero_grad()
            outputs = model(ids, mask, token_type_ids)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            # Incremental mean: after this line train_loss is already the
            # average loss over the batches seen so far.
            train_loss += (loss.item() - train_loss) / (batch_idx + 1)

        print('############# Epoch {}: Training End     #############'.format(epoch))

        print('############# Epoch {}: Validation Start   #############'.format(epoch))
        ######################
        # validate the model #
        ######################

        model.eval()

        # Keep only the current epoch's predictions; otherwise the lists would
        # mix the outputs of every epoch.
        val_targets.clear()
        val_outputs.clear()
        n_val_batches = 0

        with torch.no_grad():
            for batch_idx, data in enumerate(validation_loader):
                ids = data['input_ids'].to(device, dtype=torch.long)
                mask = data['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
                targets = data['targets'].to(device, dtype=torch.float)
                outputs = model(ids, mask, token_type_ids)

                loss = loss_fn(outputs, targets)
                valid_loss += (loss.item() - valid_loss) / (batch_idx + 1)
                val_targets.extend(targets.cpu().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().numpy().tolist())
                n_val_batches = batch_idx + 1

        print('############# Epoch {}: Validation End     #############'.format(epoch))

        if n_val_batches == 0:
            # No validation data: report the loss as undefined instead of a
            # misleading 0.0 that would always look like the "best" epoch.
            valid_loss = float('nan')
            print('Warning: validation loader yielded no batches; validation loss is undefined.')

        # The losses are running means already, so they are NOT divided by the
        # number of batches a second time.
        # print training/validation statistics
        print('Epoch: {} \tAverage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
        ))

        # A NaN validation loss compares False here, so it never becomes the minimum.
        if valid_loss <= valid_loss_min:
            # NOTE: checkpoint persistence is disabled; nothing is written to
            # checkpoint_path / best_model_path, so the message does not claim a save.
            print('Validation loss decreased ({:.6f} --> {:.6f}).'.format(valid_loss_min, valid_loss))
            valid_loss_min = valid_loss

        print('############# Epoch {}  Done   #############\n'.format(epoch))

    return model

In [None]:
# Both values must name files (torch.save / shutil.copyfile cannot write to a
# directory). On Colab the user's writable Drive folder is "MyDrive" inside the
# mount point rather than the mount root itself.
ckpt_path = "/content/drive/MyDrive/current_checkpoint.pt"
best_model_path = "/content/drive/MyDrive/best_model.pt"

In [None]:
import torch
torch.cuda.empty_cache()

In [None]:
# Use the EPOCHS hyperparameter rather than a duplicated literal.
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path)

############# Epoch 1: Training Start   #############
############# Epoch 1: Training End     #############
############# Epoch 1: Validation Start   #############
############# Epoch 1: Validation End     #############
Epoch: 1 	Avgerage Training Loss: 0.000162 	Average Validation Loss: 0.000000
Validation loss decreased (inf --> 0.000000).  Saving model ...
############# Epoch 1  Done   #############

############# Epoch 2: Training Start   #############
############# Epoch 2: Training End     #############
############# Epoch 2: Validation Start   #############
############# Epoch 2: Validation End     #############
Epoch: 2 	Avgerage Training Loss: 0.000132 	Average Validation Loss: 0.000000
Validation loss decreased (0.000000 --> 0.000000).  Saving model ...
############# Epoch 2  Done   #############



In [None]:
#testing the model
bert_predicted = []
# Make inference mode explicit (disables dropout) instead of relying on the
# state the training loop happened to leave the model in.
trained_model.eval()
with torch.no_grad():
    for example in test_df['context']:
        # Same text normalization as PreprocessDataset.__getitem__: coerce to
        # str (covers missing values) and collapse whitespace.
        example = " ".join(str(example).split())
        encodings = tokenizer.encode_plus(
            example,
            None,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids = encodings['input_ids'].to(device, dtype=torch.long)
        attention_mask = encodings['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = encodings['token_type_ids'].to(device, dtype=torch.long)
        output = trained_model(input_ids, attention_mask, token_type_ids)
        final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
        # The batch holds one example, so take element 0 of the per-row argmax
        # rather than converting a 1-element array with int().
        predicted_idx = int(np.argmax(final_output, axis=1)[0])
        # Map the index through target_list, the column order the model's
        # targets were built from, instead of train_df's current column layout.
        output_tag = target_list[predicted_idx]
        bert_predicted.append(output_tag)

In [None]:
test_df['tags']

0      passage
1        multi
2       phrase
3        multi
4      passage
        ...   
795    passage
796     phrase
797      multi
798    passage
799     phrase
Name: tags, Length: 800, dtype: object

In [None]:
# NOTE(review): `.iloc` is called here rather than indexed (e.g. `.iloc[:]`), so this
# presumably binds an indexer object instead of the label values — confirm the intent.
# `test_input` is not referenced in the cells visible here.
test_input = test_df['tags'].iloc()

In [None]:
bert_predicted

['passage',
 'passage',
 'multi',
 'passage',
 'passage',
 'phrase',
 'phrase',
 'passage',
 'phrase',
 'passage',
 'multi',
 'phrase',
 'passage',
 'phrase',
 'phrase',
 'passage',
 'passage',
 'phrase',
 'passage',
 'multi',
 'phrase',
 'passage',
 'passage',
 'passage',
 'passage',
 'phrase',
 'multi',
 'multi',
 'passage',
 'phrase',
 'multi',
 'phrase',
 'phrase',
 'phrase',
 'multi',
 'phrase',
 'phrase',
 'multi',
 'passage',
 'phrase',
 'multi',
 'passage',
 'phrase',
 'passage',
 'phrase',
 'phrase',
 'multi',
 'multi',
 'phrase',
 'multi',
 'phrase',
 'passage',
 'phrase',
 'phrase',
 'phrase',
 'passage',
 'phrase',
 'passage',
 'passage',
 'phrase',
 'phrase',
 'multi',
 'passage',
 'phrase',
 'passage',
 'phrase',
 'passage',
 'phrase',
 'phrase',
 'phrase',
 'multi',
 'passage',
 'phrase',
 'passage',
 'passage',
 'phrase',
 'multi',
 'passage',
 'multi',
 'phrase',
 'phrase',
 'passage',
 'passage',
 'phrase',
 'multi',
 'passage',
 'passage',
 'passage',
 'phrase',
 'pa

In [None]:
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(test_df['tags'], bert_predicted)

In [None]:
conf_mat

array([[ 85,  28,  30],
       [ 32, 204,  86],
       [ 26,  93, 216]])

In [None]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(test_df['tags'], bert_predicted))

Accuracy: 0.63125


In [None]:
print(metrics.f1_score(test_df['tags'], bert_predicted, average='macro'))

0.624228179465244
