# Debug sentence classification


In [1]:
import numpy as np
import pickle as pkl
from tqdm import tqdm, trange
from ftfy import fix_text
from collections import defaultdict
import torch, io, gzip, json, random, argparse, os
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import (BertTokenizer, BertConfig, AdamW, BertForSequenceClassification,
        WarmupLinearSchedule)
from ftfy import fix_text
from arxiv_public_data.config import DIR_BASE, DIR_OUTPUT, DIR_FULLTEXT
f_metadata = os.path.join(DIR_BASE, 'arxiv-metadata-oai-2019-03-01.json.gz')

#Got these from Matt
cat_map = {
  "astro-ph": "astro-ph",
  "cond-mat": "cond-mat",
  "cs": "cs",
  "gr-qc": "gr-qc",
  "hep-ex": "hep-ex",
  "hep-lat": "hep-lat",
  "hep-ph": "hep-ph",
  "hep-th": "hep-th",
  "math-ph": "math-ph",
  "nlin": "nlin",
  "nucl-ex": "nucl-ex",
  "nucl-th": "nucl-th",
  "physics": "physics",
  "quant-ph": "quant-ph",
  "math": "math",
  "q-bio": "q-bio",
  "q-fin": "q-fin",
  "stat": "stat",
  "eess": "eess",
  "econ": "econ",
  "acc-phys": "physics.acc-ph",
  "adap-org": "nlin.AO",
  "alg-geom": "math.AG",
  "ao-sci": "physics.ao-ph",
  "atom-ph": "physics.atom-ph",
  "bayes-an": "physics.data-an",
  "chao-dyn": "nlin.CD",
  "chem-ph": "physics.chem-ph",
  "cmp-lg": "cs.CL",
  "comp-gas": "nlin.CG",
  "dg-ga": "math.DG",
  "funct-an": "math.FA",
  "mtrl-th": "cond-mat.mtrl-sci",
  "patt-sol": "nlin.PS",
  "plasm-ph": "physics.plasm-ph",
  "q-alg": "math.QA",
  "solv-int": "nlin.SI",
  "supr-con": "cond-mat.supr-con"
}


# I should experiment with and without this
def clean_doc(x):
    return fix_text(x)


def load_data(N, fname, data_type):
    
    
    #MAX_LENS = [50, 250, 500]  #truncate all titles, abstracts, fulltext to this level
    #N, data_type = args.N, args.data_type
    #if data_type == 'title':
    #     MAX_LEN = MAX_LENS[0]
    #elif data_type == 'abstract':
    #     MAX_LEN = MAX_LENS[1]
    #elif data_type == 'fulltext':
    #     MAX_LEN = MAX_LENS[2]
    
    MAX_LEN = 512  #BERT default
    input_ids = []
    labels, label_dict, ctr = [], {}, 0
    attention_masks = []
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    with gzip.open(fname, 'rt', encoding='utf-8') as fin:
        for row in fin.readlines():

            #Load metadata
            m = json.loads(row)

            #Build label list
            if data_type != 'fulltext':
                sentence = clean_doc(m[data_type])
            else:
                sentence = load_ith_fulltext(i)  ###needs to be filled in
                sentence = clean_doc(sentence)

            # We need to add special tokens at the beginning and end of each sentence for BERT to work properly
            sentence = "[CLS] " + sentence + " [SEP]" 

            #category
            category = m['categories'][0].split(' ')[0]

            #update cateogies -- apply matt's map
            if category in cat_map: category = cat_map[category]

            #Then add to the dics
            if category not in label_dict:
                index = len(label_dict)
                label_dict[category] = index  # ex: {'hep-ph':0, 'math.CO:1',,}
            else:
                index = label_dict[category]
            labels.append(index)


            #Tokenize
            tokenized_sentence = tokenizer.tokenize(sentence)  #Ex: ['the', 'cat', 'ate']

            #Convert to IDs + pad
            input_id = tokenizer.convert_tokens_to_ids(tokenized_sentence)  #Ex: [1,10,3]
            input_id = pad_sequences([input_id], maxlen=MAX_LEN, dtype="long",truncating="post",padding="post")
            input_ids.append(input_id[0])
            
            #Attention mask
            seq_mask = [float(i>0) for i in input_id[0]]
            attention_masks.append(seq_mask)
            
            #Ctr
            ctr += 1
            if ctr >= N: break
                
    return np.array(input_ids), attention_masks, labels, label_dict  

I1210 16:35:47.883074 139983411234624 file_utils.py:39] PyTorch version 1.3.1 available.
I1210 16:35:47.906023 139983411234624 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


### Train model

In [2]:
gpu = True
if gpu == True: device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
else: device = torch.device("cpu")

#Load and process data
print('Loading & prepping data')
epochs = 10
N, data_type = 1000, 'title'
input_ids, attention_masks, labels, label_dict = load_data(N,f_metadata,data_type)
print('# classes = {}'.format(len(label_dict)))

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)



# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)


# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 4

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for
# loop with an iterator the entire dataset does not need to be loaded into memory
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

print('Finished loading & prepping data')

#Model
print("Loading model")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_dict))
if gpu:
    model.cuda()

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]


num_training_steps = epochs * len(train_dataloader)

# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)
scheduler = WarmupLinearSchedule(
    optimizer, warmup_steps=0.1 * num_training_steps,
    t_total=num_training_steps)

print("Beginning training")
# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
    model.train()
    tr_loss, total = 0, 0

    #Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss, logits = outputs[:2]
        tr_loss += loss.item()
        #Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        scheduler.step()
        
        total += logits.shape[0]

    tr_loss /= 1.0*(total)
    print("Train loss: {}".format(tr_loss))
    
    
    #Add the end of every epoch, find val_loss
    val_loss, total = 0, 0
    model.eval()
    for step, batch in enumerate(validation_dataloader):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss, logits = outputs[:2]
        val_loss += loss.item()
        #Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        scheduler.step()
        
        total += logits.shape[0]
        
    val_loss /= 1.0*total
    print('Val loss: {}'.format(val_loss))
        
        
print('Training done: evaluating')

I1210 16:35:48.762283 139983411234624 tokenization_utils.py:374] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/khev/.cache/torch/transformers/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


Loading & prepping data


I1210 16:36:00.801364 139983411234624 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/khev/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
I1210 16:36:00.802296 139983411234624 configuration_utils.py:168] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 87,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}



# classes = 87
Finished loading & prepping data
Loading model


I1210 16:36:00.900025 139983411234624 modeling_utils.py:337] loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin from cache at /home/khev/.cache/torch/transformers/aa1ef1aede4482d0dbcd4d52baad8ae300e60902e88fcb0bebdec09afd232066.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157
I1210 16:36:02.637931 139983411234624 modeling_utils.py:405] Weights of BertForSequenceClassification not initialized from pretrained model: ['classifier.weight', 'classifier.bias']
I1210 16:36:02.638724 139983411234624 modeling_utils.py:408] Weights from pretrained model not used in BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
Epoch:   0%|          | 0/10 [00:00<?, ?it

Beginning training
Train loss: 1.0233550996250576


Epoch:  10%|█         | 1/10 [01:13<11:01, 73.51s/it]

Val loss: 0.8930472266674042
Train loss: 0.8056669653786553


Epoch:  20%|██        | 2/10 [02:27<09:50, 73.79s/it]

Val loss: 0.7068917477130889
Train loss: 0.6907255503204134


Epoch:  30%|███       | 3/10 [03:43<08:39, 74.19s/it]

Val loss: 0.6221694755554199
Train loss: 0.596981921394666


Epoch:  40%|████      | 4/10 [04:57<07:24, 74.12s/it]

Val loss: 0.5321426087617874
Train loss: 0.5228252749972874


Epoch:  50%|█████     | 5/10 [06:12<06:11, 74.38s/it]

Val loss: 0.45845855951309206
Train loss: 0.4673547583818436


Epoch:  60%|██████    | 6/10 [07:29<05:00, 75.21s/it]

Val loss: 0.4049462515115738
Train loss: 0.4223775040441089


Epoch:  70%|███████   | 7/10 [08:43<03:44, 74.86s/it]

Val loss: 0.3710770732164383
Train loss: 0.3870085896386041


Epoch:  80%|████████  | 8/10 [09:56<02:28, 74.27s/it]

Val loss: 0.3411976957321167
Train loss: 0.368580976261033


Epoch:  90%|█████████ | 9/10 [11:09<01:13, 73.94s/it]

Val loss: 0.32941774487495423
Train loss: 0.36382306390338476


Epoch: 100%|██████████| 10/10 [12:21<00:00, 73.47s/it]

Val loss: 0.3289579349756241
Training done: evaluating





### Prepare examples for summarization

In [18]:
import numpy as np
import os, gzip, json
from arxiv_public_data.config import DIR_BASE, DIR_OUTPUT, DIR_FULLTEXT
f_metadata = os.path.join(DIR_BASE, 'arxiv-metadata-oai-2019-03-01.json.gz')
fname = f_metadata

N = 100
cutoff = int(0.9*N)
OUTPUT_DIR = '/home/khev/2TB-harddrive/data/hugging-face-summarization/arxiv-abstracts-titles/small/'
with gzip.open(fname, 'rt', encoding='utf-8') as fin:
    for i,row in enumerate(fin.readlines()):

        #Load metadata
        m = json.loads(row)
        title, abstract = m['title'], m['abstract']
        title, abstract = title.replace('\n',''), abstract.replace('\n','')
        temp = abstract + '\n \n @ highlight \n \n ' + title  #form for hugging-face
        
        #Save as train
        fname_sav = os.path.join(OUTPUT_DIR,'{:0>3}.txt'.format(i))
        fout = open(fname_sav,'w')
        fout.write(temp)
        fout.close()
        
        if i >= N: break

### ArXiv

In [27]:
def spiralMatrix(matrix):
    """
    
    IDEAS:
    1. Peel off
    """
    
    
    results = []
    while matrix and matrix[0]:
        
        #Go right
        if matrix[0]:
            for i in matrix.pop(0):
                results.append(i)
            
        #Go down
        if matrix and matrix[0]:
            for row in matrix:
                results.append(row.pop(-1))
        
        #Go left
        if matrix and matrix[0]:
            for i in reversed(matrix.pop(-1)):
                results.append(i)
        
        #Go up
        if matrix and matrix[0]:
            for row in reversed(matrix):
                results.append(row.pop(0))
                
    return results


M = [
 [ 1, 2, 3 ],
 [ 4, 5, 6 ],
 [ 7, 8, 9 ]
]

spiralMatrix(M)

[1, 2, 3, 6, 9, 8, 7, 4, 5]