# Debug sentence classification


### Labels

In [1]:
import numpy as np
import pickle as pkl
from tqdm import tqdm, trange
from ftfy import fix_text
from collections import defaultdict
import torch, io, gzip, json, random, argparse, os
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import (BertTokenizer, BertConfig, AdamW, BertForSequenceClassification,
        WarmupLinearSchedule)
from ftfy import fix_text
from arxiv_public_data.config import DIR_BASE, DIR_OUTPUT, DIR_FULLTEXT
f_metadata = os.path.join(DIR_BASE, 'arxiv-metadata-oai-2019-03-01.json.gz')

#Got these from Matt
cat_map = {
  "astro-ph": "astro-ph",
  "cond-mat": "cond-mat",
  "cs": "cs",
  "gr-qc": "gr-qc",
  "hep-ex": "hep-ex",
  "hep-lat": "hep-lat",
  "hep-ph": "hep-ph",
  "hep-th": "hep-th",
  "math-ph": "math-ph",
  "nlin": "nlin",
  "nucl-ex": "nucl-ex",
  "nucl-th": "nucl-th",
  "physics": "physics",
  "quant-ph": "quant-ph",
  "math": "math",
  "q-bio": "q-bio",
  "q-fin": "q-fin",
  "stat": "stat",
  "eess": "eess",
  "econ": "econ",
  "acc-phys": "physics.acc-ph",
  "adap-org": "nlin.AO",
  "alg-geom": "math.AG",
  "ao-sci": "physics.ao-ph",
  "atom-ph": "physics.atom-ph",
  "bayes-an": "physics.data-an",
  "chao-dyn": "nlin.CD",
  "chem-ph": "physics.chem-ph",
  "cmp-lg": "cs.CL",
  "comp-gas": "nlin.CG",
  "dg-ga": "math.DG",
  "funct-an": "math.FA",
  "mtrl-th": "cond-mat.mtrl-sci",
  "patt-sol": "nlin.PS",
  "plasm-ph": "physics.plasm-ph",
  "q-alg": "math.QA",
  "solv-int": "nlin.SI",
  "supr-con": "cond-mat.supr-con"
}


# I should experiment with and without this
def clean_doc(x):
    return fix_text(x)


def load_data(N, fname, data_type):
    
    
    #MAX_LENS = [50, 250, 500]  #truncate all titles, abstracts, fulltext to this level
    #N, data_type = args.N, args.data_type
    #if data_type == 'title':
    #     MAX_LEN = MAX_LENS[0]
    #elif data_type == 'abstract':
    #     MAX_LEN = MAX_LENS[1]
    #elif data_type == 'fulltext':
    #     MAX_LEN = MAX_LENS[2]
    
    MAX_LEN = 512  #BERT default
    input_ids = []
    labels, label_dict, ctr = [], {}, 0
    attention_masks = []
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    with gzip.open(fname, 'rt', encoding='utf-8') as fin:
        for row in fin.readlines():

            #Load metadata
            m = json.loads(row)

            #Build label list
            if data_type != 'fulltext':
                sentence = clean_doc(m[data_type])
            else:
                sentence = load_ith_fulltext(i)  ###needs to be filled in
                sentence = clean_doc(sentence)

            # We need to add special tokens at the beginning and end of each sentence for BERT to work properly
            sentence = "[CLS] " + sentence + " [SEP]" 

            #category
            category = m['categories'][0].split(' ')[0]

            #update cateogies -- apply matt's map
            if category in cat_map: category = cat_map[category]

            #Then add to the dics
            if category not in label_dict:
                index = len(label_dict)
                label_dict[category] = index  # ex: {'hep-ph':0, 'math.CO:1',,}
            else:
                index = label_dict[category]
            labels.append(index)


            #Tokenize
            tokenized_sentence = tokenizer.tokenize(sentence)  #Ex: ['the', 'cat', 'ate']

            #Convert to IDs + pad
            input_id = tokenizer.convert_tokens_to_ids(tokenized_sentence)  #Ex: [1,10,3]
            input_id = pad_sequences([input_id], maxlen=MAX_LEN, dtype="long",truncating="post",padding="post")
            input_ids.append(input_id[0])
            
            #Attention mask
            seq_mask = [float(i>0) for i in input_id[0]]
            attention_masks.append(seq_mask)
            
            #Ctr
            ctr += 1
            if ctr >= N: break
                
    return np.array(input_ids), attention_masks, labels, label_dict  



#N, data_type = 10**7, 'title'
#metadata = load_data(N,f_metadata)
#sentences, labels, label_dict_new = process_data_sub(metadata, data_type=data_type)
#len(label_dict_new)

I1210 15:47:19.839966 140225670215488 file_utils.py:39] PyTorch version 1.3.1 available.
I1210 15:47:19.862981 140225670215488 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [25]:
N, data_type = 10**7, 'title'
metadata = load_data(N,f_metadata)
sentences, labels, label_dict_old = process_data(metadata, data_type=data_type)
len(label_dict_old)

171

In [22]:
cat_map

{'astro-ph': 'astro-ph',
 'cond-mat': 'cond-mat',
 'cs': 'cs',
 'gr-qc': 'gr-qc',
 'hep-ex': 'hep-ex',
 'hep-lat': 'hep-lat',
 'hep-ph': 'hep-ph',
 'hep-th': 'hep-th',
 'math-ph': 'math-ph',
 'nlin': 'nlin',
 'nucl-ex': 'nucl-ex',
 'nucl-th': 'nucl-th',
 'physics': 'physics',
 'quant-ph': 'quant-ph',
 'math': 'math',
 'q-bio': 'q-bio',
 'q-fin': 'q-fin',
 'stat': 'stat',
 'eess': 'eess',
 'econ': 'econ',
 'acc-phys': 'physics.acc-ph',
 'adap-org': 'nlin.AO',
 'alg-geom': 'math.AG',
 'ao-sci': 'physics.ao-ph',
 'atom-ph': 'physics.atom-ph',
 'bayes-an': 'physics.data-an',
 'chao-dyn': 'nlin.CD',
 'chem-ph': 'physics.chem-ph',
 'cmp-lg': 'cs.CL',
 'comp-gas': 'nlin.CG',
 'dg-ga': 'math.DG',
 'funct-an': 'math.FA',
 'mtrl-th': 'cond-mat.mtrl-sci',
 'patt-sol': 'nlin.PS',
 'plasm-ph': 'physics.plasm-ph',
 'q-alg': 'math.QA',
 'solv-int': 'nlin.SI',
 'supr-con': 'cond-mat.supr-con'}

In [28]:
'adap-org' in label_dict_old, 'adap-org' in label_dict_new

(True, False)

Looks good.

### Truncation

When we truncate the titles, do we chop off the special " [SEP] " token?

In [5]:
N, data_type = 10**2, 'title'
metadata = load_data(N,f_metadata)
sentences, labels, label_dict = process_data_sub(metadata, data_type=data_type)
print('Num classes = {}'.format(len(label_dict)))
print('Tokenizing')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

I1205 16:47:50.308459 139851126974272 tokenization_utils.py:374] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/khev/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


Num classes = 43
Tokenizing


In [10]:
MAX_LEN = 512  # BERT pretrained model width

input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts] #bert tokenizer
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") #pad

In [12]:
input_ids[:,0]

array([101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
       101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
       101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
       101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
       101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
       101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
       101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, 101,
       101, 101, 101, 101, 101, 101, 101, 101, 101, 101])

In [14]:
tokenizer.convert_ids_to_tokens([101])

['[CLS]']

Conclusion: the pretrained BERT used MAX_LEN = 512, so we're stuck with this

### Figure out loss

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#
batch_size, epochs = 12, 2
N, data_type = 100, 'title'
f_metadata = os.path.join(DIR_BASE, 'arxiv-metadata-oai-2019-03-01.json.gz')
input_ids, attention_masks, labels, label_dict = load_data(N,f_metadata,data_type)  

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)


# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)


# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for
# loop with an iterator the entire dataset does not need to be loaded into memory
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

I1210 15:46:19.362936 139953952757568 tokenization_utils.py:374] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/khev/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [5]:
#Model
print("Loading model")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_dict))

#Optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}]

# This variable contains all of the hyperparemeter information our training loop needs
num_training_steps = epochs * len(train_dataloader)
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)
scheduler = WarmupLinearSchedule(
    optimizer, warmup_steps=0.1 * num_training_steps,
    t_total=num_training_steps)


#train
epochs = 2
train_loss_set = []
#device = torch.device('cpu')
for _ in range(epochs):
    
    #train per batch
    for step, batch in enumerate(train_dataloader):
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        train_loss_set.append(loss)  
        
        #Backward pass
        loss.backward()
        
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        scheduler.step()

I1210 15:46:37.575002 139953952757568 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/khev/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
I1210 15:46:37.576409 139953952757568 configuration_utils.py:168] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 43,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}



Loading model


I1210 15:46:37.672723 139953952757568 modeling_utils.py:337] loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin from cache at /home/khev/.cache/torch/transformers/aa1ef1aede4482d0dbcd4d52baad8ae300e60902e88fcb0bebdec09afd232066.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157
I1210 15:46:39.547042 139953952757568 modeling_utils.py:405] Weights of BertForSequenceClassification not initialized from pretrained model: ['classifier.weight', 'classifier.bias']
I1210 15:46:39.547733 139953952757568 modeling_utils.py:408] Weights from pretrained model not used in BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']


RuntimeError: Expected object of device type cuda but got device type cpu for argument #1 'self' in call to _th_index_select

### Roughworks

In [15]:
outs = []
with open('output/language-model-title/test-docs-for-prompts.txt') as f:
    for row in f.readlines():
        prompt = row  #take first 2 words as prompt
        generated_text = row
        print('\n\n Generated text: {} \n \n'.format(generated_text))
        outs.append(generated_text)



 Generated text: Daemons and DAMA: Their Celestial-Mechanics Interrelations
 
 



 Generated text: Millimeter-Thick Single-Walled Carbon Nanotube Forests: Hidden Role of
 
 



 Generated text:   Catalyst Support
 
 



 Generated text: The Use of Weighting in Periodicity Searches in All-Sky Monitor Data:
 
 



 Generated text:   Applications to the GLAST LAT
 
 



 Generated text: Composite Structure and Causality
 
 



 Generated text: Analytic solutions for marginal deformations in open superstring field
 
 



 Generated text:   theory
 
 



In [19]:
fname = 'output/language-model-title/test-docs-for-prompts.txt'
with open(fname) as f: data = f.readlines()
data

['Daemons and DAMA: Their Celestial-Mechanics Interrelations\n',
 'Millimeter-Thick Single-Walled Carbon Nanotube Forests: Hidden Role of\n',
 '  Catalyst Support\n',
 'The Use of Weighting in Periodicity Searches in All-Sky Monitor Data:\n',
 '  Applications to the GLAST LAT\n',
 'Composite Structure and Causality\n',
 'Analytic solutions for marginal deformations in open superstring field\n',
 '  theory\n']

In [27]:
import gzip, json, random

def load_data(N):
    fname ='/home/khev/research/arxiv-public-datasets/arxiv-data/arxiv-metadata-oai-2019-03-01.json.gz'
    metadata = []
    ctr = 0
    with gzip.open(fname, 'rt', encoding='utf-8') as fin:
        for row in fin.readlines():
            metadata.append(json.loads(row))
            ctr += 1
            if ctr > N:
                break
    return metadata



#Load the training data
N = 100
data_type = 'title'  #'title' or 'abstract'
metadata = load_data(N)
docs = [m[data_type] for m in metadata]
del metadata
cutoff = int(0.9*len(docs))
docs_train, docs_test = docs[:cutoff], docs[cutoff:]

In [51]:
n = 5
prompts = random.sample(docs_test,n)
prompts = [p.replace('\n','') for p in prompts]
np.savetxt('temp1.txt',prompts,fmt='%s',delimiter='\n')

In [63]:
with open('output/language-model-title/test-docs-for-prompts.txt') as f:
    for row in f.readlines():
        prompt = " ".join(row.split()[:2])  #take first 2 words as prompt
        generated_text = row
        #generated_text = prompt + main(prompt)
        print('\n\n Generated text: {} \n \n'.format(generated_text))



 Generated text: Daemons and DAMA: Their Celestial-Mechanics Interrelations
 
 



 Generated text: Millimeter-Thick Single-Walled Carbon Nanotube Forests: Hidden Role of  Catalyst Support
 
 



 Generated text: The Use of Weighting in Periodicity Searches in All-Sky Monitor Data:  Applications to the GLAST LAT
 
 



 Generated text: Composite Structure and Causality
 
 



 Generated text: Analytic solutions for marginal deformations in open superstring field  theory
 
 



In [59]:
" ".join(row.split()[:2])

'Analytic solutions'