# Debug sentence classification


### Labels

In [55]:
import numpy as np
import pickle as pkl
from tqdm import tqdm, trange
from ftfy import fix_text
from collections import defaultdict
import torch, io, gzip, json, random, argparse, os
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import (BertTokenizer, BertConfig, AdamW, BertForSequenceClassification,
        WarmupLinearSchedule)
from ftfy import fix_text
from arxiv_public_data.config import DIR_BASE, DIR_OUTPUT, DIR_FULLTEXT
f_metadata = os.path.join(DIR_BASE, 'arxiv-metadata-oai-2019-03-01.json.gz')

#Got these from Matt
cat_map = {
  "astro-ph": "astro-ph",
  "cond-mat": "cond-mat",
  "cs": "cs",
  "gr-qc": "gr-qc",
  "hep-ex": "hep-ex",
  "hep-lat": "hep-lat",
  "hep-ph": "hep-ph",
  "hep-th": "hep-th",
  "math-ph": "math-ph",
  "nlin": "nlin",
  "nucl-ex": "nucl-ex",
  "nucl-th": "nucl-th",
  "physics": "physics",
  "quant-ph": "quant-ph",
  "math": "math",
  "q-bio": "q-bio",
  "q-fin": "q-fin",
  "stat": "stat",
  "eess": "eess",
  "econ": "econ",
  "acc-phys": "physics.acc-ph",
  "adap-org": "nlin.AO",
  "alg-geom": "math.AG",
  "ao-sci": "physics.ao-ph",
  "atom-ph": "physics.atom-ph",
  "bayes-an": "physics.data-an",
  "chao-dyn": "nlin.CD",
  "chem-ph": "physics.chem-ph",
  "cmp-lg": "cs.CL",
  "comp-gas": "nlin.CG",
  "dg-ga": "math.DG",
  "funct-an": "math.FA",
  "mtrl-th": "cond-mat.mtrl-sci",
  "patt-sol": "nlin.PS",
  "plasm-ph": "physics.plasm-ph",
  "q-alg": "math.QA",
  "solv-int": "nlin.SI",
  "supr-con": "cond-mat.supr-con"
}


# I should experiment with and without this
def clean_doc(x):
    return fix_text(x)


def load_data(N, fname):
    #fname ='/home/khev/research/arxiv-public-datasets/arxiv-data/arxiv-metadata-oai-2019-03-01.json.gz'
    metadata = []
    ctr = 0
    with gzip.open(fname, 'rt', encoding='utf-8') as fin:
        for row in fin.readlines():
            metadata.append(json.loads(row))
            ctr += 1
            if ctr > N:
                break
    return metadata


def process_data(metadata, data_type='title'):
    """
    data_type \element ['title', 'abstract']
    """

    sentences, labels, label_dict = [], [], {}
    for m in metadata:

        #sentences / titles
        sentence = clean_doc(m[data_type])
        
        # We need to add special tokens at the beginning and end of each sentence for BERT to work properly
        sentence = "[CLS] " + sentence + " [SEP]" 
        sentences.append(sentence)

        #category
        category = m['categories'][0].split(' ')[0]

        #Take only primary index: 'math.CO' --> 'math'
        primaryCategories = False
        if primaryCategories:
            cutoff = len(category)
            try:
                cutoff = category.index('.')
            except ValueError:
                    pass
            category = category[:cutoff]
        
        if category not in label_dict:
            index = len(label_dict)
            label_dict[category] = index  # e.g. {'hep-ph':2}
        else:
            index = label_dict[category]
        labels.append(index)

    return sentences, labels, label_dict


def process_data_sub(metadata, data_type='title'):
    """
    Same as above, except I merge categories that are the same
    (origianl data in buggy: category names changed over times so have to be fixed)
    
    data_type='title' or 'abstract' or 'fulltext'
   
    """

    sentences, labels, label_dict = [], [], {}
    for i, m in enumerate(metadata):

        #sentences / titles
        if data_type != 'fulltext':
            sentence = clean_doc(m[data_type])
        else:
            sentence = load_ith_fulltext(i)  ###needs to be filled in
            sentence = clean_doc(sentence)
        
        # We need to add special tokens at the beginning and end of each sentence for BERT to work properly
        sentence = "[CLS] " + sentence + " [SEP]" 
        sentences.append(sentence)

        #category
        category = m['categories'][0].split(' ')[0]
        
        #update cateogies -- apply matt's map
        if category in cat_map:
            category = cat_map[category]
        
        #Then add to the dics
        if category not in label_dict:
            index = len(label_dict)
            label_dict[category] = index  # ex: {'hep-ph':0, 'math.CO:1',,}
        else:
            index = label_dict[category]
        labels.append(index)

    return sentences, labels, label_dict


#N, data_type = 10**7, 'title'
#metadata = load_data(N,f_metadata)
#sentences, labels, label_dict_new = process_data_sub(metadata, data_type=data_type)
#len(label_dict_new)

In [25]:
N, data_type = 10**7, 'title'
metadata = load_data(N,f_metadata)
sentences, labels, label_dict_old = process_data(metadata, data_type=data_type)
len(label_dict_old)

171

In [22]:
cat_map

{'astro-ph': 'astro-ph',
 'cond-mat': 'cond-mat',
 'cs': 'cs',
 'gr-qc': 'gr-qc',
 'hep-ex': 'hep-ex',
 'hep-lat': 'hep-lat',
 'hep-ph': 'hep-ph',
 'hep-th': 'hep-th',
 'math-ph': 'math-ph',
 'nlin': 'nlin',
 'nucl-ex': 'nucl-ex',
 'nucl-th': 'nucl-th',
 'physics': 'physics',
 'quant-ph': 'quant-ph',
 'math': 'math',
 'q-bio': 'q-bio',
 'q-fin': 'q-fin',
 'stat': 'stat',
 'eess': 'eess',
 'econ': 'econ',
 'acc-phys': 'physics.acc-ph',
 'adap-org': 'nlin.AO',
 'alg-geom': 'math.AG',
 'ao-sci': 'physics.ao-ph',
 'atom-ph': 'physics.atom-ph',
 'bayes-an': 'physics.data-an',
 'chao-dyn': 'nlin.CD',
 'chem-ph': 'physics.chem-ph',
 'cmp-lg': 'cs.CL',
 'comp-gas': 'nlin.CG',
 'dg-ga': 'math.DG',
 'funct-an': 'math.FA',
 'mtrl-th': 'cond-mat.mtrl-sci',
 'patt-sol': 'nlin.PS',
 'plasm-ph': 'physics.plasm-ph',
 'q-alg': 'math.QA',
 'solv-int': 'nlin.SI',
 'supr-con': 'cond-mat.supr-con'}

In [28]:
'adap-org' in label_dict_old, 'adap-org' in label_dict_new

(True, False)

Looks good.

### Truncation

When we truncate the titles, do we chop off the special " [SEP] " token?

In [5]:
N, data_type = 10**2, 'title'
metadata = load_data(N,f_metadata)
sentences, labels, label_dict = process_data_sub(metadata, data_type=data_type)
print('Num classes = {}'.format(len(label_dict)))
print('Tokenizing')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

I1205 16:47:50.308459 139851126974272 tokenization_utils.py:374] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/khev/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


Num classes = 43
Tokenizing


In [10]:
MAX_LEN = 512  # BERT pretrained model width

input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts] #bert tokenizer
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") #pad

In [12]:
input_ids[:,0]

array([101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
       101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
       101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
       101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
       101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
       101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
       101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
       101, 101, 101, 101, 101, 101, 101, 101, 101, 101])

In [14]:
tokenizer.convert_ids_to_tokens([101])

['[CLS]']

Conclusion: the pretrained BERT used MAX_LEN = 512, so we're stuck with this

### Optimized

In [71]:
fname ='/home/khev/research/arxiv-public-datasets/arxiv-data/arxiv-metadata-oai-2019-03-01.json.gz'


def load_data_better(N, fname, data_type):
    
    
    #MAX_LENS = [50, 250, 500]  #truncate all titles, abstracts, fulltext to this level
    #N, data_type = args.N, args.data_type
    #if data_type == 'title':
    #     MAX_LEN = MAX_LENS[0]
    #elif data_type == 'abstract':
    #     MAX_LEN = MAX_LENS[1]
    #elif data_type == 'fulltext':
    #     MAX_LEN = MAX_LENS[2]
    
    MAX_LEN = 512  #BERT default
    input_ids = []
    labels, label_dict, ctr = [], {}, 0
    attention_masks = []
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    with gzip.open(fname, 'rt', encoding='utf-8') as fin:
        for row in fin.readlines():

            #Load metadata
            m = json.loads(row)

            #Build label list
            if data_type != 'fulltext':
                sentence = clean_doc(m[data_type])
            else:
                sentence = load_ith_fulltext(i)  ###needs to be filled in
                sentence = clean_doc(sentence)

            # We need to add special tokens at the beginning and end of each sentence for BERT to work properly
            sentence = "[CLS] " + sentence + " [SEP]" 

            #category
            category = m['categories'][0].split(' ')[0]

            #update cateogies -- apply matt's map
            if category in cat_map: category = cat_map[category]

            #Then add to the dics
            if category not in label_dict:
                index = len(label_dict)
                label_dict[category] = index  # ex: {'hep-ph':0, 'math.CO:1',,}
            else:
                index = label_dict[category]
            labels.append(index)


            #Tokenize
            tokenized_sentence = tokenizer.tokenize(sentence)  #Ex: ['the', 'cat', 'ate']

            #Convert to IDs + pad
            input_id = tokenizer.convert_tokens_to_ids(tokenized_sentence)  #Ex: [1,10,3]
            input_id = pad_sequences([input_id], maxlen=MAX_LEN, dtype="long",truncating="post",padding="post")
            input_ids.append(input_id[0])
            
            #Attention mask
            seq_mask = [float(i>0) for i in input_id[0]]
            attention_masks.append(seq_mask)
            
            #Ctr
            ctr += 1
            if ctr >= N: break
                
    return np.array(input_ids), attention_masks, labels, label_dict 

            
data_type, N = 'title', 100
fname = '/home/khev/research/arxiv-public-datasets/arxiv-data/arxiv-metadata-oai-2019-03-01.json.gz'
input_ids, attention_masks, labels, label_dict = load_data_better(N,fname,data_type)
input_ids

I1205 17:57:26.602200 139851126974272 tokenization_utils.py:374] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/khev/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


array([[  101, 17208,  1997, ...,     0,     0,     0],
       [  101, 12403,  2869, ...,     0,     0,     0],
       [  101,  1996,  6622, ...,     0,     0,     0],
       ...,
       [  101, 24961,  1011, ...,     0,     0,     0],
       [  101,  2006,  1998, ...,     0,     0,     0],
       [  101, 19587,  2689, ...,     0,     0,     0]])

In [72]:
if gpu == True: device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
else: device = torch.device("cpu")

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)



# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)


# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size, gpu = 2, True

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for
# loop with an iterator the entire dataset does not need to be loaded into memory
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

print('Loading & prepping data')

#Model
print("Loading model")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_dict))
if gpu:
    model.cuda()

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]




#Test and train
# Store our loss and accuracy for plotting
train_loss_set = []
epochs = 2 

num_training_steps = epochs * len(train_dataloader)

# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)
scheduler = WarmupLinearSchedule(
    optimizer, warmup_steps=0.1 * num_training_steps,
    t_total=num_training_steps
)

print("Beginning training")
# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    #Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        train_loss_set.append(loss)    
        #Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        scheduler.step()

    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    model.eval()

    #Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

  #Evaluate data for one epoch
    acc1, acc3, acc5, logliklihood = 0,0,0,0
    for batch in validation_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        #evaluate
        acc1_temp, acc3_temp, acc5_temp, logliklihood_temp = evaluate(logits,label_ids)

        acc1 += acc1_temp
        acc3 += acc3_temp
        acc5 += acc5_temp
        logliklihood += logliklihood_temp
        nb_eval_steps += 1

acc1 /= nb_eval_steps
acc3 /= nb_eval_steps
acc5 /= nb_eval_steps

num_examples = len(batch)*nb_eval_steps
perplexity = 2** (-logliklihood / num_examples / np.log(2))
line = '{}: top1, top3, top5, perplexity = {:.2f}, {:.2f}, {:.2f}, {:.2f}'.format(data_type, acc1,acc3,acc5,perplexity)
print(line)

#Save data
DIR_NAME = './output'
if not os.path.exists(DIR_NAME):
    os.makedirs(DIR_NAME)
fname = os.path.join(DIR_NAME[2:],'classification-results-N-{}-{}.txt'.format(N,data_type))
np.savetxt(fname,[line],fmt='%s')

I1205 17:57:37.939976 139851126974272 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/khev/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
I1205 17:57:37.941224 139851126974272 configuration_utils.py:168] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 43,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

I1205 17:57:38.02938

Loading & prepping data
Loading model


I1205 17:57:39.796591 139851126974272 modeling_utils.py:405] Weights of BertForSequenceClassification not initialized from pretrained model: ['classifier.weight', 'classifier.bias']
I1205 17:57:39.797329 139851126974272 modeling_utils.py:408] Weights from pretrained model not used in BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Beginning training
Train loss: 4.254676818847656





NameError: name 'evaluate' is not defined

### Old

In [64]:
b_input_mask

tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.]], device='cuda:0')

In [65]:
b_labels

tensor([32,  0], device='cuda:0')

In [67]:
b_input_ids

tensor([[  101.,  8790.,  2389.,  ...,     0.,     0.,     0.],
        [  101., 17208.,  1997.,  ...,     0.,     0.,     0.]],
       device='cuda:0', dtype=torch.float64)