In [1]:
import torch.nn as nn
import torch
import transformers
import pandas as pd
import numpy as np
from sklearn import model_selection
from transformers import AdamW, get_linear_schedule_with_warmup
from scipy import stats
import time 

import warnings 
warnings.filterwarnings('ignore')

In [2]:
class BERTClass(nn.Module):
    def __init__(self, model_path):
        super(BERTClass, self).__init__()
        self.bert = transformers.BertModel.from_pretrained(model_path)
        self.out = nn.Linear(768, 3)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, ids, mask, token_type_ids):
        _, output = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        output = self.out(output)
        return self.softmax(output)

class BERTDatasetTraining:
    def __init__(self, question, context, targets, tokenizer, max_len):
        self.question = question
        self.context = context
        self.targets = targets
        
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.question)
    
    def __getitem__(self, item):
        question= str(self.question[item])
        context = str(self.context[item])

        inputs = self.tokenizer.encode_plus(
            question,
            context,
            add_special_tokens=True
        )

        ids = inputs['input_ids']
        token_type_ids = inputs['token_type_ids']
        mask = inputs['attention_mask']
        
        padding_len = self.max_len - len(ids)
        
        ids = ids[:self.max_len] + ([0] * padding_len) 
        token_type_ids = token_type_ids[:self.max_len] + ([0] * padding_len)
        mask = mask[:self.max_len] + ([0] * padding_len)
        

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[item], dtype=torch.long)
        }


def loss_fn(outputs, targets):
    return nn.CrossEntropyLoss()(outputs, targets)


def train_loop_fn(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    for bi, d in enumerate(data_loader):
        ids = d['ids']
        mask = d['mask']
        token_type_ids = d['token_type_ids']
        targets = d['targets']
        
        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.long)
        
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        if bi % 50 == 0:
            print(f'bi={bi}, loss={loss}')

            
def eval_loop_fn(data_loader, model, device):
    model.eval()
    fin_targets = []
    fin_outputs = []
    for bi, d in enumerate(data_loader):
        with torch.no_grad():
            ids = d['ids'].to(device, dtype=torch.long)
            mask = d['mask'].to(device, dtype=torch.long)
            token_type_ids = d['token_type_ids'].to(device, dtype=torch.long)
            targets = d['targets'].to(device, dtype=torch.long)
          
            outputs = model(ids, mask, token_type_ids)
            #loss = loss_fn(outputs, targets)
          
            fin_targets.append(targets.cpu().detach().numpy())
            fin_outputs.append(outputs.cpu().detach().numpy())

    return np.vstack(fin_outputs), np.hstack(fin_targets)

In [3]:
# model_paths = ['bert-base-uncased',
# '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/4GB-checkpoints/model-trained-0-3531-4GB/',
# '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/4GB-checkpoints/model-trained-18-67089-4GB/',
# '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/4GB-checkpoints/model-trained-36-130647-4GB/',
# '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/8GB-checkpoints/run_8GB_model-trained-0-7063/',
# '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/8GB-checkpoints/run_8GB_model-trained-8-63567/',
model_paths = [
'/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/8GB-checkpoints/run_8GB_model-trained-22-162449/',
'/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/12GB-checkpoints/model-trained-0-10596-12GB/',
'/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/12GB-checkpoints/model-trained-3-42384-12GB/',
'/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/12GB-checkpoints/model-trained-5-63576-12GB/'
              ]


In [4]:
import json
from sklearn.metrics import accuracy_score, f1_score

def run():
    MAX_LEN = 512
    TRAIN_BATCH_SIZE = 8
    EPOCHS = 12
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    start = time.perf_counter()

    for model_path in model_paths:
        
        tokenizer_path = ('bert-base-uncased' if model_path == 'bert-base-uncased'\
                      else '../../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt')
        model_name = model_path if model_path == 'bert-base-uncased' else model_path.split('/')[-2].split('.')[0]
        scores = []
        model_stats = {'model_name':model_name,
                       'seeds':[],
                       'batch_size':TRAIN_BATCH_SIZE,
                       'epochs':EPOCHS,
                       'metric':'accuracy',
                       'scores': [],
                       'mean_score':0
                        }
        for num, seed in enumerate([42,43,44,45,46], 1):

            tokenizer = transformers.BertTokenizer.from_pretrained(tokenizer_path)
            model = BERTClass(model_path)
            #model = DataParallel(model)
            print(f'Using model {model_name}, with tokenizer {tokenizer_path}')
            model.to(device)

            df_train = pd.read_json('./data/pqal_fold0/train_set.json', orient='index')

            target_cols = 'final_decision'
            train_targets_str = df_train[target_cols].values

            # Transformer the target col to numeric values
            target_dict = {'yes': 0, 'maybe': 1, 'no': 2}
            train_targets = [target_dict[key] for key in train_targets_str]

            train_dataset = BERTDatasetTraining(
                question=df_train.QUESTION.values,
                context=df_train.CONTEXTS.values,
                targets=train_targets,
                tokenizer=tokenizer,
                max_len=MAX_LEN
            )
            train_data_loader = torch.utils.data.DataLoader(
                train_dataset,
                batch_size=TRAIN_BATCH_SIZE,
                shuffle=True
            )

            optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
            num_training_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE * EPOCHS)
            scheduler = get_linear_schedule_with_warmup(
                optimizer, 
                num_warmup_steps=0,
                num_training_steps=num_training_steps
            )

            for epoch in range(EPOCHS):
                train_loop_fn(train_data_loader, model, optimizer, device, scheduler)

            # Inference
            TEST_BATCH_SIZE = 4
            df_test = pd.read_json('./data/test_set.json', orient='index')
            test_targets_str = df_test[target_cols].values
            test_targets = [target_dict[key] for key in test_targets_str]
            ground_truth = pd.read_json('./data/test_ground_truth.json', orient='index')
            truth_list = ground_truth[0].tolist()
            target_dict = {'yes': 0, 'maybe': 1, 'no': 2}
            truth_list = [target_dict[key] for key in truth_list]
            
            test_dataset = BERTDatasetTraining(
                question=df_test.QUESTION.values,
                context=df_test.CONTEXTS.values,
                targets=test_targets, # This is not used
                tokenizer=tokenizer,
                max_len=MAX_LEN
            )
            test_data_loader = torch.utils.data.DataLoader(
                test_dataset,
                batch_size=TEST_BATCH_SIZE,
                shuffle=False
            )

            predictions = []
            model.eval()

            for bi, d in enumerate(test_data_loader):
                ids = d["ids"]
                mask = d["mask"]
                token_type_ids = d["token_type_ids"]

                ids = ids.to(device, dtype=torch.long)
                mask = mask.to(device, dtype=torch.long)
                token_type_ids = token_type_ids.to(device, dtype=torch.long)

                with torch.no_grad():
                    outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids).cpu().detach().numpy()
                    predictions.append(outputs.argmax(1))
            pmids = df_test.index
            predictions = np.hstack(predictions)
            
            acc = (truth_list == predictions).sum() / len(predictions)
            print(f"Accuracy = {round(acc,3)}")
            model_stats['scores'].append(round(acc,6))
            torch.cuda.empty_cache()
            time.sleep(3)
            print(f'Training run {num} completed.')

        print('Logging model stats....')
        print()
        final_score = np.round(np.mean(model_stats['scores']), 4)
        model_stats['mean_score'] = final_score
        with open('logs/QA_stats.txt', 'a') as f:
            f.write(json.dumps(model_stats))
            f.write('\n')

    end = time.perf_counter() - start
    print(f'Total Training/Eval time: {round(end, 2)} seconds')

In [None]:
run()

Some weights of the model checkpoint at /home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/8GB-checkpoints/run_8GB_model-trained-22-162449/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoi

Using model run_8GB_model-trained-22-162449, with tokenizer ../../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt
bi=0, loss=1.1133925914764404
bi=50, loss=0.8660159707069397
bi=0, loss=0.907325267791748
bi=50, loss=1.023573637008667
bi=0, loss=0.9340406060218811
bi=50, loss=0.832759439945221
bi=0, loss=0.9248750805854797
bi=50, loss=0.9368119239807129
bi=0, loss=0.7019635438919067
bi=50, loss=0.5753198862075806
bi=0, loss=0.6756697297096252
bi=50, loss=0.6779095530509949
bi=0, loss=0.5551101565361023
bi=50, loss=0.9090008735656738
bi=0, loss=0.8020688891410828
bi=50, loss=0.6772118210792542
bi=0, loss=0.555112361907959
bi=50, loss=0.5545124411582947
bi=0, loss=0.7952770590782166
bi=50, loss=0.6669432520866394
bi=0, loss=0.555243968963623
bi=50, loss=0.6708150506019592
bi=0, loss=0.5549679398536682
bi=50, loss=0.5564172863960266
Accuracy = 0.52
Training run 1 completed.


Some weights of the model checkpoint at /home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/8GB-checkpoints/run_8GB_model-trained-22-162449/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoi

Using model run_8GB_model-trained-22-162449, with tokenizer ../../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt
bi=0, loss=1.0981415510177612
bi=50, loss=1.041733980178833
bi=0, loss=1.06342613697052
bi=50, loss=0.9255462288856506
bi=0, loss=0.9133842587471008
bi=50, loss=0.7679619789123535
bi=0, loss=1.0605285167694092
bi=50, loss=0.8392605781555176
bi=0, loss=0.9063445925712585
bi=50, loss=0.7624925374984741
bi=0, loss=0.7119525074958801
bi=50, loss=0.6679531335830688
bi=0, loss=0.5713295340538025
bi=50, loss=0.5555200576782227
bi=0, loss=0.6817011833190918
bi=50, loss=0.5557990670204163
bi=0, loss=0.6795486211776733
bi=50, loss=0.6733416318893433
bi=0, loss=0.5552210807800293
bi=50, loss=0.6807407140731812
bi=0, loss=0.7884018421173096
bi=50, loss=0.5545552968978882
bi=0, loss=0.7944802045822144
bi=50, loss=0.554582417011261
Accuracy = 0.552
Training run 2 completed.


Some weights of the model checkpoint at /home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/8GB-checkpoints/run_8GB_model-trained-22-162449/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoi

Using model run_8GB_model-trained-22-162449, with tokenizer ../../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt
bi=0, loss=1.062232255935669
bi=50, loss=0.8680485486984253
bi=0, loss=0.93532395362854
bi=50, loss=1.0912299156188965
bi=0, loss=1.031855583190918
bi=50, loss=0.9560345411300659
bi=0, loss=0.8096864223480225
bi=50, loss=0.7155058979988098
bi=0, loss=0.7850749492645264
bi=50, loss=0.6781296730041504
bi=0, loss=0.6428642868995667
bi=50, loss=0.6800328493118286
bi=0, loss=0.6783838272094727
bi=50, loss=0.6785610914230347
bi=0, loss=0.6675446629524231
bi=50, loss=0.6770884990692139
bi=0, loss=0.5546119809150696
bi=50, loss=0.6655932664871216
bi=0, loss=0.6654759645462036
bi=50, loss=0.6685956120491028
bi=0, loss=0.6733148694038391
bi=50, loss=0.55409836769104
bi=0, loss=0.6717512607574463
bi=50, loss=0.5541045665740967
Accuracy = 0.544
Training run 3 completed.


Some weights of the model checkpoint at /home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/8GB-checkpoints/run_8GB_model-trained-22-162449/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoi

Using model run_8GB_model-trained-22-162449, with tokenizer ../../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt
bi=0, loss=1.1269338130950928
bi=50, loss=0.818053662776947
bi=0, loss=1.0434587001800537
bi=50, loss=1.008122205734253
bi=0, loss=0.9021208882331848
bi=50, loss=1.0993622541427612
bi=0, loss=0.8338726758956909
bi=50, loss=0.9737529754638672
bi=0, loss=0.6792771220207214
bi=50, loss=0.7548789978027344
bi=0, loss=0.7999293804168701
bi=50, loss=0.8616222739219666
bi=0, loss=0.5555644631385803
bi=50, loss=0.7876006960868835
bi=0, loss=0.677280068397522
bi=50, loss=0.6769613027572632
bi=0, loss=0.6685870289802551
bi=50, loss=0.6923445463180542
bi=0, loss=0.6638994812965393
bi=50, loss=0.8963330984115601
bi=0, loss=0.6781090497970581
bi=50, loss=0.64639812707901
bi=0, loss=0.6561946272850037
bi=50, loss=0.6533516645431519
Accuracy = 0.536
Training run 4 completed.


Some weights of the model checkpoint at /home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/8GB-checkpoints/run_8GB_model-trained-22-162449/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoi

Using model run_8GB_model-trained-22-162449, with tokenizer ../../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt
bi=0, loss=1.1688302755355835
bi=50, loss=0.9872836470603943
bi=0, loss=0.8173239231109619
bi=50, loss=0.9192149639129639
bi=0, loss=1.0670415163040161
bi=50, loss=0.8715004324913025
bi=0, loss=0.7495954036712646
bi=50, loss=0.8047545552253723
bi=0, loss=0.8109761476516724
bi=50, loss=0.8134777545928955
bi=0, loss=0.782996416091919
bi=50, loss=0.6744764447212219
bi=0, loss=0.6783139705657959
bi=50, loss=0.6638122797012329
bi=0, loss=0.5542635321617126
bi=50, loss=0.5578639507293701
bi=0, loss=0.6737521290779114
bi=50, loss=0.7827971577644348
bi=0, loss=0.6819318532943726
bi=50, loss=0.7121126055717468
bi=0, loss=0.7413904666900635
bi=50, loss=0.6821694374084473
bi=0, loss=0.5968328714370728
bi=50, loss=0.5592142343521118
Accuracy = 0.348
Training run 5 completed.
Logging model stats....



Some weights of the model checkpoint at /home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/12GB-checkpoints/model-trained-0-10596-12GB/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint a

Using model model-trained-0-10596-12GB, with tokenizer ../../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt
bi=0, loss=1.0679243803024292
bi=50, loss=1.156117558479309
bi=0, loss=0.9337173104286194
bi=50, loss=1.1117448806762695
bi=0, loss=0.9980647563934326
bi=50, loss=0.9548804759979248
bi=0, loss=1.0811457633972168
bi=50, loss=0.9964059591293335
bi=0, loss=0.9739193320274353
bi=50, loss=1.1130692958831787
bi=0, loss=0.7422862648963928
bi=50, loss=0.7508374452590942
bi=0, loss=0.590019941329956
bi=50, loss=0.9291213750839233
bi=0, loss=0.7625771760940552
bi=50, loss=0.6775168180465698
bi=0, loss=0.585480809211731
bi=50, loss=0.5553942918777466
bi=0, loss=0.8016871213912964
bi=50, loss=0.5533900856971741
bi=0, loss=0.7862836718559265
bi=50, loss=0.8015093803405762
bi=0, loss=0.6783978939056396
bi=50, loss=0.5528660416603088
Accuracy = 0.504
Training run 1 completed.


Some weights of the model checkpoint at /home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/12GB-checkpoints/model-trained-0-10596-12GB/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint a

Using model model-trained-0-10596-12GB, with tokenizer ../../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt
bi=0, loss=1.1322025060653687
