# Debug sentence classification


### Labels

In [24]:
import numpy as np
import pickle as pkl
from tqdm import tqdm, trange
import matplotlib.pyplot as plt
from collections import defaultdict
import torch, io, gzip, json, random, argparse, os
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import (BertTokenizer, BertConfig, AdamW, BertForSequenceClassification,
        WarmupLinearSchedule)

from arxiv_public_data.config import DIR_BASE, DIR_OUTPUT, DIR_FULLTEXT
# Path (under DIR_BASE) of the gzipped arXiv OAI metadata snapshot that
# load_data() reads line by line as JSON.
f_metadata = os.path.join(DIR_BASE, 'arxiv-metadata-oai-2019-03-01.json.gz')

# Category-name map (provided by Matt). process_data_sub() looks up each
# paper's primary category here and replaces it with the mapped value so that
# categories that were renamed over time end up under a single label.
cat_map = {
    # Names that map to themselves.
    **{archive: archive for archive in (
        "astro-ph", "cond-mat", "cs", "gr-qc", "hep-ex",
        "hep-lat", "hep-ph", "hep-th", "math-ph", "nlin",
        "nucl-ex", "nucl-th", "physics", "quant-ph", "math",
        "q-bio", "q-fin", "stat", "eess", "econ",
    )},
    # Older names and the category each one is merged into.
    "acc-phys": "physics.acc-ph",
    "adap-org": "nlin.AO",
    "alg-geom": "math.AG",
    "ao-sci": "physics.ao-ph",
    "atom-ph": "physics.atom-ph",
    "bayes-an": "physics.data-an",
    "chao-dyn": "nlin.CD",
    "chem-ph": "physics.chem-ph",
    "cmp-lg": "cs.CL",
    "comp-gas": "nlin.CG",
    "dg-ga": "math.DG",
    "funct-an": "math.FA",
    "mtrl-th": "cond-mat.mtrl-sci",
    "patt-sol": "nlin.PS",
    "plasm-ph": "physics.plasm-ph",
    "q-alg": "math.QA",
    "solv-int": "nlin.SI",
    "supr-con": "cond-mat.supr-con",
}


# TODO: compare classification results with and without this cleaning step.
def clean_doc(x):
    """Lower-case a string and strip or space out basic punctuation.

    Newlines, colons, question marks, hyphens, exclamation marks and
    parentheses become a single space; quotes, commas and periods are
    removed; each dollar sign is padded with a space on both sides.

    Args:
        x: Text to normalise.

    Returns:
        The normalised text.
    """
    # Applied strictly in this order: the spaced-quote rule must run before
    # the bare-quote rule, otherwise ' " ' would leave two spaces behind.
    substitutions = (
        ('\n', ' '),
        (' " ', ' '),
        ('"', ''),
        ("'", ""),
        (':', ' '),
        ('?', ' '),
        ('-', ' '),
        (',', ''),
        ('$', ' $ '),
        ('.', ''),
        ('!', ' '),
        ('(', ' '),
        (')', ' '),
    )
    cleaned = x.lower()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned


def load_data(N, fname):
    """Load at most ``N`` records from a gzipped JSON-lines file.

    Args:
        N: Maximum number of records to return.
        fname: Path to a gzip-compressed UTF-8 text file holding one JSON
            document per line.

    Returns:
        A list with the decoded JSON objects, in file order. It holds at
        most ``N`` entries (fewer if the file is shorter).
    """
    metadata = []
    with gzip.open(fname, 'rt', encoding='utf-8') as fin:
        # Iterate lazily: readlines() would decompress the whole file into
        # memory even when only a few records are requested.
        for row in fin:
            # Check the cap before appending so exactly N (not N + 1)
            # records are returned.
            if len(metadata) >= N:
                break
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing inside json.loads.
            if not row.strip():
                continue
            metadata.append(json.loads(row))
    return metadata


def process_data(metadata, data_type='title', primary_categories=False):
    """Build BERT-ready sentences and integer labels from metadata records.

    Category names are used exactly as found in the records (no renaming
    map is applied).

    Args:
        metadata: Iterable of records; each record is indexed with
            ``data_type`` and with ``'categories'``.
        data_type: Record field to use as text, one of 'title' or
            'abstract'.
        primary_categories: If True, keep only the part of the category
            before the first '.', e.g. 'math.CO' --> 'math'. Defaults to
            False, which keeps the full category name.

    Returns:
        A tuple ``(sentences, labels, label_dict)``: the cleaned texts
        wrapped in "[CLS] ... [SEP]", the integer label of each text, and
        the mapping from category name to label, e.g. {'hep-ph': 2}.
    """
    sentences, labels, label_dict = [], [], {}
    for m in metadata:
        # BERT needs special tokens at the beginning and end of each sentence.
        sentences.append("[CLS] " + clean_doc(m[data_type]) + " [SEP]")

        # The first space-separated token of the first 'categories' entry
        # is used as the paper's category.
        category = m['categories'][0].split(' ')[0]
        if primary_categories:
            # partition() returns the whole string when there is no '.'.
            category = category.partition('.')[0]

        # Labels are assigned in order of first appearance.
        labels.append(label_dict.setdefault(category, len(label_dict)))

    return sentences, labels, label_dict


def process_data_sub(metadata, data_type='title'):
    """Build sentences and labels, merging renamed categories via ``cat_map``.

    Same as ``process_data`` except that each category found in ``cat_map``
    is replaced by its mapped value before a label is assigned (the original
    data is inconsistent: category names changed over time and have to be
    fixed).

    Args:
        metadata: Iterable of records; each record is indexed with
            ``'categories'`` and, unless ``data_type`` is 'fulltext', with
            ``data_type``.
        data_type: 'title', 'abstract' or 'fulltext'. For 'fulltext' the
            text of the i-th record comes from ``load_ith_fulltext(i)``,
            which still needs to be filled in.

    Returns:
        A tuple ``(sentences, labels, label_dict)``; ``label_dict`` maps a
        category name to its integer label, e.g. {'hep-ph': 0, 'math.CO': 1}.
    """
    sentences, labels, label_dict = [], [], {}
    for idx, record in enumerate(metadata):
        # Pick the raw text: a metadata field, or the stored full text.
        if data_type == 'fulltext':
            raw_text = load_ith_fulltext(idx)  ###needs to be filled in
        else:
            raw_text = record[data_type]

        # BERT needs special tokens at the beginning and end of each sentence.
        sentences.append("[CLS] " + clean_doc(raw_text) + " [SEP]")

        # First space-separated token of the first 'categories' entry,
        # renamed through Matt's map when it has an entry there.
        primary = record['categories'][0].split(' ')[0]
        primary = cat_map.get(primary, primary)

        # Labels are assigned in order of first appearance.
        labels.append(label_dict.setdefault(primary, len(label_dict)))

    return sentences, labels, label_dict


# Record cap passed to load_data and the metadata field used as text.
N = 10**7
data_type = 'title'
metadata = load_data(N, f_metadata)
# Labels built with the category map applied (renamed categories merged).
sentences, labels, label_dict_new = process_data_sub(metadata, data_type=data_type)
len(label_dict_new)

153

In [25]:
# Record cap passed to load_data and the metadata field used as text.
N = 10**7
data_type = 'title'
metadata = load_data(N, f_metadata)
# Labels built from the raw category names (no category map applied).
sentences, labels, label_dict_old = process_data(metadata, data_type=data_type)
len(label_dict_old)

171

In [22]:
# Show the category map for reference.
cat_map

{'astro-ph': 'astro-ph',
 'cond-mat': 'cond-mat',
 'cs': 'cs',
 'gr-qc': 'gr-qc',
 'hep-ex': 'hep-ex',
 'hep-lat': 'hep-lat',
 'hep-ph': 'hep-ph',
 'hep-th': 'hep-th',
 'math-ph': 'math-ph',
 'nlin': 'nlin',
 'nucl-ex': 'nucl-ex',
 'nucl-th': 'nucl-th',
 'physics': 'physics',
 'quant-ph': 'quant-ph',
 'math': 'math',
 'q-bio': 'q-bio',
 'q-fin': 'q-fin',
 'stat': 'stat',
 'eess': 'eess',
 'econ': 'econ',
 'acc-phys': 'physics.acc-ph',
 'adap-org': 'nlin.AO',
 'alg-geom': 'math.AG',
 'ao-sci': 'physics.ao-ph',
 'atom-ph': 'physics.atom-ph',
 'bayes-an': 'physics.data-an',
 'chao-dyn': 'nlin.CD',
 'chem-ph': 'physics.chem-ph',
 'cmp-lg': 'cs.CL',
 'comp-gas': 'nlin.CG',
 'dg-ga': 'math.DG',
 'funct-an': 'math.FA',
 'mtrl-th': 'cond-mat.mtrl-sci',
 'patt-sol': 'nlin.PS',
 'plasm-ph': 'physics.plasm-ph',
 'q-alg': 'math.QA',
 'solv-int': 'nlin.SI',
 'supr-con': 'cond-mat.supr-con'}

In [28]:
# 'adap-org' is a key of cat_map (-> 'nlin.AO'), so it should show up as a
# label only in the dictionary built without the map.
'adap-org' in label_dict_old, 'adap-org' in label_dict_new

(True, False)

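Looks good: the legacy name `adap-org` appears only in the unmapped labels, and applying the map reduces the number of labels from 171 to 153.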

### Truncation