In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import torch.nn as nn
from torch.nn.parallel import DataParallel
import torch
from sklearn import metrics
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup
import os, json, time
from transformers import BertTokenizer, BertModel

In [2]:
!rm -r DC_data/text/.ipynb_checkpoints/
!rm -r DC_data/labels/.ipynb_checkpoints/

rm: cannot remove 'DC_data/text/.ipynb_checkpoints/': No such file or directory
rm: cannot remove 'DC_data/labels/.ipynb_checkpoints/': No such file or directory


### Data Process

In [3]:
#len should be 1852
len(os.listdir('DC_data/text/')), len(os.listdir("DC_data/labels/"))

(1852, 1852)

In [4]:
for dirpath, dirnames, filenames in os.walk('./DC_data/text/'):
    idx2text = {}
    for filename in filenames:
        if filename.startswith('.'):
            continue
        idx = filename.split('.')[0]
        with open(dirpath + filename) as f:
            abstract = ''
            for sentence in f:
                abstract += sentence
            abstract = abstract.replace('\n', ' ')
            idx2text[idx] = abstract

for dirpath, dirnames, filenames in os.walk('./DC_data/labels/'):
    idx2label = [] 
    for filename in filenames:
        if filename.startswith('.'):
            continue
        idx = filename.split('.')[0]
        with open(dirpath + filename) as f:
            labels = f.readline()
            for label in labels.split('<'):
                if not label or label.isspace():
                    continue
                key_label = label.split('--')[0].strip()
                if key_label == 'NULL':
                    continue
                idx2label.append((idx, key_label))

In [5]:
idx2label = list(set(idx2label))
df = pd.DataFrame(idx2label, columns=['idx', 'label'])
df['text'] = df['idx'].map(idx2text)

In [6]:
label_vals = list(set(df['label'].tolist()))

In [7]:
df = df.groupby(['idx', 'text']).agg({'label': lambda x: list(x)}).reset_index()

In [8]:
mlb = MultiLabelBinarizer()
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('label')),
                          columns=mlb.classes_,
                          index=df.index))

In [9]:
df.head()

Unnamed: 0,idx,text,Activating invasion and metastasis,Avoiding immune destruction,Cellular energetics,Enabling replicative immortality,Evading growth suppressors,Genomic instability and mutation,Inducing angiogenesis,Resisting cell death,Sustaining proliferative signaling,Tumor promoting inflammation
0,11724768,Ghrelin was identified in the stomach as an en...,0,0,0,0,0,0,0,0,1,0
1,11773160,PURPOSE The epidermal growth factor receptor (...,0,0,0,0,1,0,0,0,1,0
2,11774243,Adoptive transfer of immunity against hepatiti...,0,1,0,0,0,0,0,0,0,0
3,11781072,The secretion of immunosuppressive factors lik...,0,1,0,0,0,0,0,1,1,0
4,11791181,To characterize the impact of increased produc...,1,0,0,0,0,0,0,0,1,0


In [10]:
del idx2label, idx2text

### Model Paths

In [11]:
model_paths = ['bert-base-uncased',
'/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/4GB-checkpoints/model-trained-0-3531-4GB',
'/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/4GB-checkpoints/model-trained-18-67089-4GB/',
'/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/4GB-checkpoints/model-trained-36-130647-4GB/',
'/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/8GB-checkpoints/run_8GB_model-trained-0-7063/',
'/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/8GB-checkpoints/run_8GB_model-trained-8-63567/',
'/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/8GB-checkpoints/run_8GB_model-trained-22-162449/',
'/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/12GB-checkpoints/model-trained-0-10596-12GB/',
'/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/12GB-checkpoints/model-trained-3-42384-12GB/',
'/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/12GB-checkpoints/model-trained-5-63576-12GB/'
              ]
model_paths = ['/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/4GB-checkpoints/run_4GB-wwm_model-trained-0-3531/',
               '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/4GB-checkpoints/run_4GB-wwm_model-trained-18-67089/',
               '/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/4GB-checkpoints/run_4GB-wwm_model-trained-36-130647/'
              ]
                  

In [13]:
model_paths = [model_paths[2]]
model_paths

['/home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/4GB-checkpoints/run_4GB-wwm_model-trained-36-130647/']

### Create Dataset/Model/Training/Eval loops

In [14]:
class HoCDataset:
    def __init__(self, tokenizer, sentences, labels, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.sentences)
        
    def __getitem__(self, item):
        sentence = str(self.sentences[item])
        inputs = self.tokenizer.encode_plus(
            sentence,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True
        )
        ids = inputs['input_ids']
        token_type_ids = inputs['token_type_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'label': torch.tensor(self.labels[item], dtype=torch.float)
        } 

    
class BERTClass(nn.Module):
    def __init__(self, model_path):
        super(BERTClass, self).__init__()
        self.bert = transformers.BertModel.from_pretrained(model_path)
        self.out = nn.Linear(768, 10)

    
    def forward(self, ids, mask, token_type_ids):
        _, output = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        
        return self.out(output)
    
    
def loss_fn(outputs, targets):
    return nn.BCEWithLogitsLoss()(outputs, targets)


def train_loop_fn(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    for bi, d in enumerate(data_loader):
        ids = d['ids']
        mask = d['mask']
        token_type_ids = d['token_type_ids']
        labels = d['label']
        
        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        labels = labels.to(device, dtype=torch.float)
        
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        if bi % 50 == 0:
            print(f'bi={bi}, loss={loss}')


def eval_loop_fn(data_loader, model, device):
    model.eval()
    fin_labels = []
    fin_outputs = []
    for bi, d in enumerate(data_loader):
        with torch.no_grad():
            ids = d['ids'].to(device, dtype=torch.long)
            mask = d['mask'].to(device, dtype=torch.long)
            token_type_ids = d['token_type_ids'].to(device, dtype=torch.long)
            labels = d['label'].to(device, dtype=torch.long)
          
            outputs = model(ids, mask, token_type_ids)
          
            fin_labels.append(labels.cpu().detach().numpy())
            fin_outputs.append(torch.sigmoid(outputs).cpu().detach().numpy())

    return np.vstack(fin_outputs), np.vstack(fin_labels)

In [15]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:1


In [16]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
EPOCHS = 4
LEARNING_RATE = 3e-5


start = time.perf_counter()

for model_path in model_paths:
    tokenizer_path = ('bert-base-uncased' if model_path == 'bert-base-uncased'\
                  else '../../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt')
    model_name = model_path if model_path == 'bert-base-uncased' else model_path.split('/')[-2].split('.')[0]
    scores = []
    model_stats = {'model_name':model_name,
                   'seeds':[],
                   'batch_size':TRAIN_BATCH_SIZE,
                   'epochs':EPOCHS,
                   'metric':'f1-score (micro)',
                   'scores': [],
                   'mean_score':0
                    }
    for num, seed in enumerate([42,43,44,45,46], 1):
        SEED = seed
        model_stats['seeds'].append(SEED)
        label_cols = list(df.drop(['idx', 'text'], axis=1).columns)
        labels = df[label_cols].values

        df_train, df_test, labels_train, labels_test = train_test_split(
            df, labels, test_size=0.2, random_state=SEED)


        tokenizer = transformers.BertTokenizer.from_pretrained(tokenizer_path)
        model = BERTClass(model_path)
        #model = DataParallel(model)
        print(f'Using model {model_name}, with tokenizer {tokenizer_path}')
        model.to(device)

        train_dataset = HoCDataset(
            sentences=df_train.text.values,
            labels=labels_train,
            tokenizer=tokenizer,
            max_len=MAX_LEN
        )
        train_data_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=TRAIN_BATCH_SIZE,
            shuffle=True
        )

        test_dataset = HoCDataset(
            sentences=df_test.text.values,
            labels=labels_test,
            tokenizer=tokenizer,
            max_len=MAX_LEN
        )
        test_data_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=4,
            shuffle=True,
            drop_last=True
        )

        optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
        num_training_steps = int(len(df_train) / TRAIN_BATCH_SIZE * EPOCHS)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, 
            num_warmup_steps=0,
            num_training_steps=num_training_steps
        )
        for epoch in range(EPOCHS):
            train_loop_fn(train_data_loader, model, optimizer, device, scheduler)
            
        output, target = eval_loop_fn(test_data_loader, model, device)
        preds = np.array(output) >= 0.5
        f1_score_micro = metrics.f1_score(target, preds, average='micro')
        print(f"F1 Score (Micro) = {round(f1_score_micro,4)}")
        model_stats['scores'].append(round(f1_score_micro,6))
        torch.cuda.empty_cache()
        time.sleep(3)
        print(f'Training run {num} completed.')
    
    print('Logging model stats....')
    print()
    final_score = np.round(np.mean(model_stats['scores']), 4)
    model_stats['mean_score'] = final_score
    with open('logs/doc_class_stats.txt', 'a') as f:
        f.write(json.dumps(model_stats))
        f.write('\n')

end = time.perf_counter() - start
print(f'Total Training/Eval time: {round(end, 2)} seconds')

Some weights of the model checkpoint at /home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/4GB-checkpoints/run_4GB-wwm_model-trained-36-130647/ were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model chec

Using model run_4GB-wwm_model-trained-36-130647, with tokenizer ../../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt
bi=0, loss=0.7011832594871521
bi=50, loss=0.42674341797828674
bi=100, loss=0.3139364421367645
bi=150, loss=0.24273283779621124
bi=0, loss=0.2029203623533249
bi=50, loss=0.16137097775936127
bi=100, loss=0.1476346105337143
bi=150, loss=0.19267944991588593
bi=0, loss=0.10692309588193893
bi=50, loss=0.26127123832702637
bi=100, loss=0.12819048762321472
bi=150, loss=0.09633461385965347
bi=0, loss=0.12276861816644669
bi=50, loss=0.11259844154119492
bi=100, loss=0.07996915280818939
bi=150, loss=0.08001072704792023
F1 Score (Micro) = 0.8206
Training run 1 completed.


Some weights of the model checkpoint at /home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/4GB-checkpoints/run_4GB-wwm_model-trained-36-130647/ were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model chec

Using model run_4GB-wwm_model-trained-36-130647, with tokenizer ../../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt
bi=0, loss=0.6794794201850891
bi=50, loss=0.33877822756767273
bi=100, loss=0.30027303099632263
bi=150, loss=0.14046049118041992
bi=0, loss=0.21547093987464905
bi=50, loss=0.1617261916399002
bi=100, loss=0.13932211697101593
bi=150, loss=0.15022070705890656
bi=0, loss=0.08777099847793579
bi=50, loss=0.128418430685997
bi=100, loss=0.07263275235891342
bi=150, loss=0.09574980288743973
bi=0, loss=0.08731510490179062
bi=50, loss=0.08761493861675262
bi=100, loss=0.08517761528491974
bi=150, loss=0.07056932896375656
F1 Score (Micro) = 0.8343
Training run 2 completed.


Some weights of the model checkpoint at /home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/4GB-checkpoints/run_4GB-wwm_model-trained-36-130647/ were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model chec

Using model run_4GB-wwm_model-trained-36-130647, with tokenizer ../../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt
bi=0, loss=0.7027550935745239
bi=50, loss=0.39727896451950073
bi=100, loss=0.2541312873363495
bi=150, loss=0.21545885503292084
bi=0, loss=0.20047719776630402
bi=50, loss=0.20243337750434875
bi=100, loss=0.1566799134016037
bi=150, loss=0.13840077817440033
bi=0, loss=0.09719154983758926
bi=50, loss=0.09782590717077255
bi=100, loss=0.0820310041308403
bi=150, loss=0.10493476688861847
bi=0, loss=0.0840710923075676
bi=50, loss=0.05158926174044609
bi=100, loss=0.06167454645037651
bi=150, loss=0.056629396975040436
F1 Score (Micro) = 0.8121
Training run 3 completed.


Some weights of the model checkpoint at /home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/4GB-checkpoints/run_4GB-wwm_model-trained-36-130647/ were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model chec

Using model run_4GB-wwm_model-trained-36-130647, with tokenizer ../../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt
bi=0, loss=0.7014748454093933
bi=50, loss=0.33687806129455566
bi=100, loss=0.263121634721756
bi=150, loss=0.22524254024028778
bi=0, loss=0.21753326058387756
bi=50, loss=0.13807439804077148
bi=100, loss=0.12950682640075684
bi=150, loss=0.17360983788967133
bi=0, loss=0.1454714983701706
bi=50, loss=0.12453410774469376
bi=100, loss=0.07487290352582932
bi=150, loss=0.12451934814453125
bi=0, loss=0.11831802129745483
bi=50, loss=0.08225425332784653
bi=100, loss=0.06995587795972824
bi=150, loss=0.08223675191402435
F1 Score (Micro) = 0.8261
Training run 4 completed.


Some weights of the model checkpoint at /home/americanthinker/notebooks/pytorch/NationalSecurityBERT/Modeling/checkpoints/4GB-checkpoints/run_4GB-wwm_model-trained-36-130647/ were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model chec

Using model run_4GB-wwm_model-trained-36-130647, with tokenizer ../../Preprocessing/Tokenization/wp-vocab-30500-vocab.txt
bi=0, loss=0.7376038432121277
bi=50, loss=0.3538029193878174
bi=100, loss=0.3451709747314453
bi=150, loss=0.233695387840271
bi=0, loss=0.2343824952840805
bi=50, loss=0.14948777854442596
bi=100, loss=0.1623198240995407
bi=150, loss=0.13865043222904205
bi=0, loss=0.09316754341125488
bi=50, loss=0.0941113829612732
bi=100, loss=0.08734091371297836
bi=150, loss=0.13401448726654053
bi=0, loss=0.10819127410650253
bi=50, loss=0.13092494010925293
bi=100, loss=0.06515971571207047
bi=150, loss=0.0682067796587944
F1 Score (Micro) = 0.8093
Training run 5 completed.
Logging model stats....

Total Training/Eval time: 1061.04 seconds


In [22]:
torch.cuda.empty_cache()

In [19]:
2055/60

34.25